Added LessThan() implementation

author nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>

Tue, 10 Mar 2009 19:07:14 +0000 (19:07 +0000)

committer nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>

Tue, 10 Mar 2009 19:07:14 +0000 (19:07 +0000)
author nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Tue, 10 Mar 2009 19:07:14 +0000 (19:07 +0000)
committer nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Tue, 10 Mar 2009 19:07:14 +0000 (19:07 +0000)
diff --git a/CSA.cpp b/CSA.cpp

index a190b48..eaa5cc0 100644 (file)
--- a/CSA.cpp
+++ b/CSA.cpp
@@ -205,6 +205,12 @@ void CSA::MakeStatic()
          throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
  #endif
  
+    if (texts.size() == 0) // Empty collection
+    {
+         ++n;
+         texts.append(1, '\0');
+    }
+
      // Bitvector of empty texts, FIXME Remove?
      {
          //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
@@ -217,6 +223,12 @@ void CSA::MakeStatic()
          emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
      }
  
+    texts.reserve(0); // Release extra capacity
+
+/*    FILE *fp = fopen("texts.txt", "wb");
+    std::cout << "Wrote " << fwrite(texts.c_str(), 1, n, fp) << " bytes into texts.txt." << std::endl;
+    fclose(fp);*/
+
      uchar *bwt = new uchar[n];
      {
          // More succinct solution via StringIterator, see below.
@@ -260,6 +272,12 @@ void CSA::MakeStatic()
          texts.reserve(0); // Release capacity
          delete br; // bitvector of endmarkers
      } // End of bw transform
+//  std::cerr << "Time after BWT: " << Tools::GetTime() << std::endl;
+
+/*    fp = fopen("bwt.txt", "wb");
+    std::cout << "Wrote " << fwrite(bwt, 1, n, fp) << " bytes into bwt.txt." << std::endl;
+    fclose(fp);*/
+
  
  #ifdef CSA_TEST_BWT
      { 
@@ -431,10 +449,10 @@ bool CSA::IsContains(uchar const * pattern) const
      return false;
  }
  
-bool CSA::IsLessThan(uchar const*) const
+bool CSA::IsLessThan(uchar const * pattern) const
  {
-    // TODO
-    assert(0);
+    if (CountLessThan(pattern) > 0)
+        return true;
      return false;
  }
  
@@ -504,11 +522,18 @@ unsigned CSA::CountContains(uchar const * pattern) const
      return result.size();
  }
  
+// Less than or equal
  unsigned CSA::CountLessThan(uchar const * pattern) const
  {
-    // TODO
-    assert(0);
-    return 0;
+    TextPosition m = strlen((char const *)pattern);
+    if (m == 0)
+        return numberOfAllTexts - numberOfTexts; // Empty texts. 
+
+    TextPosition sp = 0, ep = 0;
+    SearchLessThan(pattern, m, &sp, &ep);
+
+    // Count end-markers in result interval
+    return CountEndmarkers(sp, ep);
  }
  
  /**
@@ -690,9 +715,39 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const
  
  TextCollection::document_result CSA::LessThan(uchar const * pattern) const
  {
-    // TODO
-    assert(0);
-    return document_result();
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return TextCollection::document_result(); // empty result set
+
+    TextPosition sp = 0, ep = 0;
+    SearchLessThan(pattern, m, &sp, &ep);
+
+    TextCollection::document_result result;
+    
+    // Report end-markers in result interval
+    unsigned resultSize = CountEndmarkers(sp, ep);
+    if (resultSize == 0)
+        return result;
+
+    result.reserve(resultSize); // Try to avoid reallocation.
+
+    // Iterate through end-markers in [sp,ep]:
+    unsigned i = 0;
+    if (sp > 0)
+        i = alphabetrank->rank(0, sp - 1);
+    while (resultSize)
+    {
+        // End-marker we found belongs to the "preceeding" doc in the collection
+        DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+        // Map to doc ID:
+        docId = emptyTextRank->select0(docId+1);
+        result.push_back(docId);
+
+        -- resultSize;
+        ++ i;
+    }
+    
+    return result;
  }
  
  /**
@@ -904,6 +959,33 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult,
          return 0;
  }
  
+
+ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const 
+{
+    // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
+    uint c = (int)pattern[m-1]; 
+    TextPosition i=m-1;
+    TextPosition sp = 1;
+    TextPosition ep = C[c+1]-1;
+    while (sp<=ep && i>=1) 
+    {
+//         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+        c = (int)pattern[--i];
+        uint result = alphabetrank->rank(c,ep);
+        if (result == ~0u)
+            ep = 0;
+        else
+            ep = C[c]+result-1;
+    }
+    *spResult = sp;
+    *epResult = ep;
+    if (sp<=ep)
+        return ep - sp + 1;
+    else
+        return 0;
+}
+
+
  CSA::~CSA() {
  #ifdef CSA_TEST_BWT
      delete dynFMI;
@@ -942,13 +1024,21 @@ void CSA::makewavelet(uchar *bwt)
  //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);   
  //    delete [] bwt;
      //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
+  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+    std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+
+    HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
  
      alphabet_mapper * am = new alphabet_mapper_none();
-    static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
-    wt_coder * wtc = new wt_coder_huff(bwt,n,am);
+    static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
+//    static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
+    wt_coder * wtc = new wt_coder_binary(bwt,n,am);
      alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
      delete bmb;
-    delete [] bwt;
+    bwt = 0; // already deleted
+   
+    std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+    std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
  }
  
  void CSA::maketables()
@@ -957,17 +1047,24 @@ void CSA::maketables()
      // Note: bwtEndPos already set!
      // and the length of the first text (counter l):
  #ifdef CSA_TEST_BWT
-    ulong i = 0;
-    ulong l = 1;
-
-    uchar c  = alphabetrank->access(i);
-    while (c != '\0')
      {
-        i = C[c]+alphabetrank->rank(c, i)-1;
-        ++l;
-        c = alphabetrank->access(i);
+        ulong i = 0;
+        ulong l = 1;
+        uint alphabetrank_i_tmp = 0;
+        uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
+        while (c != '\0')
+        {
+            i = C[c]+alphabetrank_i_tmp-1;
+            ++l;
+            c = alphabetrank->access(i, alphabetrank_i_tmp);
+        }
+
+        if (i != bwtEndPos) // compare result
+        {
+            std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl;
+            exit(0);
+        }
      }
-    assert(i == bwtEndPos); // compare result
  #endif
  
      // Build up arrays for text length and starting positions
@@ -997,6 +1094,7 @@ void CSA::maketables()
      DocId textId = numberOfTexts;
      ulong ulongmax = 0;
      ulongmax--;
+    uint alphabetrank_i_tmp =0;
  
      //positions:
      for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
@@ -1010,13 +1108,13 @@ void CSA::maketables()
              sampleCount ++;
          }
  
-        uchar c = alphabetrank->access(p);
+        uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
          if (c == '\0')
          {
              --textId;
              
              // Record the order of end-markers in BWT:
-            ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
+            ulong endmarkerRank = alphabetrank_i_tmp - 1;
              (*endmarkerDocId)[endmarkerRank] = textId;
  
              // Store text length and text start position:
@@ -1031,7 +1129,7 @@ void CSA::maketables()
              p = textId; // Correct LF-mapping to the last char of the previous text.
          }
          else
-            p = C[c]+alphabetrank->rank(c, p)-1;
+            p = C[c]+alphabetrank_i_tmp-1;
      }
      assert(textId == 0);
author	nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
	Tue, 10 Mar 2009 19:07:14 +0000 (19:07 +0000)
committer	nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
	Tue, 10 Mar 2009 19:07:14 +0000 (19:07 +0000)