X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.cpp;h=3f75433f32586eaf4dfef2e87f24522ca7a9a634;hb=af8938dbee21244687184dd0502a84ce1af70c50;hp=5b2b190d92c7a2e8defe2028bf7690951b7ff06b;hpb=35c8f7ba17cd67f73c3ecb933fa9ea0becaf16f1;p=SXSI%2FTextCollection.git

diff --git a/CSA.cpp b/CSA.cpp
index 5b2b190..3f75433 100644
--- a/CSA.cpp
+++ b/CSA.cpp
@@ -25,7 +25,7 @@
 #undef max
 #include "dcover/bwt.hpp"
 
-#include "HeapProfiler.h" // FIXME remove
+//#include "HeapProfiler.h" // FIXME remove
 
 #include <iostream>
 #include <queue>
@@ -40,7 +40,8 @@ using std::pair;
 using std::make_pair;
 using std::map;
 
-using SXSI::TextCollection;
+namespace SXSI
+{
 
 // Save file version info
 const uchar CSA::versionFlag = 2;
@@ -204,7 +205,14 @@ void CSA::MakeStatic()
         throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
 #endif
 
-    // Bitvector of empty texts
+    if (texts.size() == 0) // Empty collection
+    {
+         ++n;
+         ++numberOfTexts;
+         texts.append(1, '\0');
+    }
+
+    // Bitvector of empty texts, FIXME Remove?
     {
         //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
         ulong *tempB = new ulong[numberOfAllTexts/W + 1];
@@ -216,17 +224,16 @@ void CSA::MakeStatic()
         emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
     }
 
-/*    std::cout << "texts: ";
-    for (ulong i = 0; i < n; i++)
-        if (texts[i] > 50)
-            std::cout << texts[i];
-        else
-            std::cout << (int)texts[i];
-            std::cout << std::endl;*/
+    texts.reserve(0); // Release extra capacity
+
+/*    FILE *fp = fopen("texts.txt", "wb");
+    std::cout << "Wrote " << fwrite(texts.c_str(), 1, n, fp) << " bytes into texts.txt." << std::endl;
+    fclose(fp);*/
+
     uchar *bwt = new uchar[n];
     {
-        // FIXME More succinct solution needed
-        unsigned* maptexts = new unsigned[n+1];
+        // More succinct solution via StringIterator, see below.
+/*        unsigned* maptexts = new unsigned[n+1];
         // Map text chars to [0..255+numberOfTexts]
         unsigned count = 0;
         for (ulong i = 0; i < n; ++i)
@@ -234,25 +241,49 @@ void CSA::MakeStatic()
                 maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts]
             else {
                 uchar c = (uchar)texts[i];
-                maptexts[i] = (unsigned)c + numberOfTexts + 1;
+                maptexts[i] = (unsigned)c + numberOfTexts;
             }
-
         maptexts[n] = '\0';
-        texts.erase(); 
-        texts.reserve(0); // Release capacity
         assert(count == numberOfTexts);
 
-        bwtEndPos = (ulong)compute_bwt(&maptexts[0], &maptexts[n],
-                                       &bwt[0], numberOfTexts + 1);
+        std::cout << "maptext: ";
+        for (ulong i = 0; i <= n; ++i)
+            std::cout << maptexts[i] << ", ";
+            std::cout << std::endl;*/
+
+        // Mark endmarker positions into bitvector
+        ulong * endmarkers = new ulong[n/W+1];
+        for (ulong i = 0; i < n/W+1; ++i)
+            endmarkers[i] = 0;
+        for (ulong i = 0; i < n; ++i)
+            if (texts[i] == 0)
+                Tools::SetField(endmarkers, 1, i, 1);
+        BitRank* br = new BitRank(endmarkers, n, true);
+        assert(numberOfTexts == br->rank(n-1));
+
+        // Build iterators, FIXME clean up iterator construction.
+        StringIterator itBegin((uchar const *)texts.c_str(), (uchar const *)texts.c_str(), n, numberOfTexts, br);
+        StringIterator itEnd((uchar const *)texts.c_str() + n,(uchar const *)texts.c_str(), n, numberOfTexts, br);
+    
+        bwtEndPos = (ulong)compute_bwt(itBegin, itEnd, //&maptexts[0], &maptexts[n],
+                                       &bwt[0], numberOfTexts);
 
         bwt[--bwtEndPos] = '\0';
-        delete [] maptexts;
+        texts.erase(); 
+        texts.reserve(0); // Release capacity
+        delete br; // bitvector of endmarkers
     } // End of bw transform
+//  std::cerr << "Time after BWT: " << Tools::GetTime() << std::endl;
+
+/*    fp = fopen("bwt.txt", "wb");
+    std::cout << "Wrote " << fwrite(bwt, 1, n, fp) << " bytes into bwt.txt." << std::endl;
+    fclose(fp);*/
+
 
 #ifdef CSA_TEST_BWT
     { 
         uchar *bwtTest = dynFMI->getBWT();
-/*        printf("123456789012345678901234567890123456789\n");
+        printf("123456789012345678901234567890123456789\n");
         for (TextPosition i = 0; i < n && i < 100; i ++)
             if (bwt[i] < 50)
                 printf("%d", (int)bwt[i]);
@@ -264,7 +295,7 @@ void CSA::MakeStatic()
                 printf("%d", (int)bwtTest[i]);
             else
                 printf("%c", bwtTest[i]);
-                printf("\n");*/
+                printf("\n");
         
         // Sanity check
         assert(numberOfTexts == dynFMI->getCollectionSize());    
@@ -376,6 +407,22 @@ bool CSA::IsPrefix(uchar const * pattern) const
     return false;
 }
 
+bool CSA::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return true;
+
+    TextPosition sp = 0, ep = 0;
+    Search(pattern, m, &sp, &ep);
+
+    // Check for end-marker(s) in result interval
+    if (CountEndmarkers(sp, ep, begin, end))
+        return true;
+    return false;
+}
+
+
 bool CSA::IsSuffix(uchar const *pattern) const
 {
     // Here counting is as fast as IsSuffix():
@@ -384,6 +431,14 @@ bool CSA::IsSuffix(uchar const *pattern) const
     return false;
 }
 
+bool CSA::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
+{
+    // Here counting is as fast as IsSuffix():
+    if (CountSuffix(pattern, begin, end) > 0)
+        return true;
+    return false;
+}
+
 bool CSA::IsEqual(uchar const *pattern) const
 {
     TextPosition m = std::strlen((char *)pattern);
@@ -404,6 +459,26 @@ bool CSA::IsEqual(uchar const *pattern) const
     return false;
 }
 
+bool CSA::IsEqual(uchar const *pattern, DocId begin, DocId end) const
+{
+    TextPosition m = std::strlen((char *)pattern);
+    if (m == 0)
+    {
+        if (numberOfAllTexts - numberOfTexts > 0u)
+            return true; // Empty texts exists
+        return false; // No empty texts exists
+    }
+
+    TextPosition sp = 0, ep = 0;
+    // Match including end-marker
+    Search(pattern, m+1, &sp, &ep, begin, end);
+
+    // Check for end-marker(s) in result interval
+    if (CountEndmarkers(sp, ep))
+        return true;
+    return false;
+}
+
 bool CSA::IsContains(uchar const * pattern) const
 {
     TextPosition m = strlen((char *)pattern);
@@ -419,10 +494,25 @@ bool CSA::IsContains(uchar const * pattern) const
     return false;
 }
 
-bool CSA::IsLessThan(uchar const*) const
+bool CSA::IsContains(uchar const * pattern, DocId begin, DocId end) const
 {
-    // TODO
-    assert(0);
+    // Here counting is as fast as existential querying
+    if (CountContains(pattern, begin, end) > 0)
+        return true; // FIXME No need to filter result set
+    return false;
+}
+
+bool CSA::IsLessThan(uchar const * pattern) const
+{
+    if (CountLessThan(pattern) > 0)
+        return true;
+    return false;
+}
+
+bool CSA::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
+{
+    if (CountLessThan(pattern, begin, end) > 0)
+        return true;
     return false;
 }
 
@@ -453,6 +543,19 @@ unsigned CSA::CountPrefix(uchar const * pattern) const
     return CountEndmarkers(sp, ep);
 }
 
+unsigned CSA::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return numberOfAllTexts;
+
+    TextPosition sp = 0, ep = 0;
+    Search(pattern, m, &sp, &ep);
+
+    // Count end-markers in result interval
+    return CountEndmarkers(sp, ep, begin, end);
+}
+
 unsigned CSA::CountSuffix(uchar const * pattern) const
 {
     TextPosition m = strlen((char *)pattern);
@@ -466,6 +569,19 @@ unsigned CSA::CountSuffix(uchar const * pattern) const
     return count;
 }
 
+unsigned CSA::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return numberOfAllTexts;
+
+    TextPosition sp = 0, ep = 0;
+    // Search with end-marker
+    unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
+ 
+    return count;
+}
+
 unsigned CSA::CountEqual(uchar const *pattern) const
 {
     TextPosition m = strlen((char const *)pattern);
@@ -480,6 +596,20 @@ unsigned CSA::CountEqual(uchar const *pattern) const
     return CountEndmarkers(sp, ep);
 }
 
+unsigned CSA::CountEqual(uchar const *pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char const *)pattern);
+    if (m == 0)
+        return numberOfAllTexts - numberOfTexts; // Empty texts. 
+
+    TextPosition sp = 0, ep = 0;
+    // Match including end-marker
+    Search(pattern, m+1, &sp, &ep, begin, end);
+
+    // Count end-markers in result interval
+    return CountEndmarkers(sp, ep); // Already within [begin, end]
+}
+
 unsigned CSA::CountContains(uchar const * pattern) const
 {
     TextPosition m = strlen((char const *)pattern);
@@ -487,16 +617,48 @@ unsigned CSA::CountContains(uchar const * pattern) const
         return numberOfAllTexts; // Total number of texts. 
 
     // Here counting is as slow as fetching the result set
-    // because we would have to filter out occ's that fall within same document.
+    // because we have to filter out occ's that fall within same document.
     TextCollection::document_result result = Contains(pattern);
     return result.size();
 }
 
+unsigned CSA::CountContains(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char const *)pattern);
+    if (m == 0)
+        return numberOfAllTexts; // Total number of texts. 
+
+    // Here counting is as slow as fetching the result set
+    // because we have to filter out occ's that fall within same document.
+    TextCollection::document_result result = Contains(pattern, begin, end);
+    return result.size();
+}
+
+// Less than or equal
 unsigned CSA::CountLessThan(uchar const * pattern) const
 {
-    // TODO
-    assert(0);
-    return 0;
+    TextPosition m = strlen((char const *)pattern);
+    if (m == 0)
+        return numberOfAllTexts - numberOfTexts; // Empty texts. 
+
+    TextPosition sp = 0, ep = 0;
+    SearchLessThan(pattern, m, &sp, &ep);
+
+    // Count end-markers in result interval
+    return CountEndmarkers(sp, ep);
+}
+
+unsigned CSA::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char const *)pattern);
+    if (m == 0)
+        return numberOfAllTexts - numberOfTexts; // Empty texts. 
+
+    TextPosition sp = 0, ep = 0;
+    SearchLessThan(pattern, m, &sp, &ep);
+
+    // Count end-markers in result interval
+    return CountEndmarkers(sp, ep, begin, end);
 }
 
 /**
@@ -539,6 +701,44 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const
     return result;
 }
 
+TextCollection::document_result CSA::Prefix(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return TextCollection::document_result(); // FIXME Should return all 1...k
+
+    TextPosition sp = 0, ep = 0;
+    Search(pattern, m, &sp, &ep);
+
+    TextCollection::document_result result;
+    
+    // Report end-markers in result interval
+    unsigned resultSize = CountEndmarkers(sp, ep);
+    if (resultSize == 0)
+        return result;
+
+    result.reserve(resultSize); // Try to avoid reallocation.
+
+    // Iterate through end-markers in [sp,ep] and [begin, end]:
+    unsigned i = 0;
+    if (sp > 0)
+        i = alphabetrank->rank(0, sp - 1);
+    while (resultSize)
+    {
+        // End-marker we found belongs to the "preceeding" doc in the collection
+        DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+        // Map to doc ID:
+        docId = emptyTextRank->select0(docId+1);
+        if (docId >= begin && docId <= end)
+            result.push_back(docId);
+
+        -- resultSize;
+        ++ i;
+    }
+    
+    return result;
+}
+
 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 {
     TextPosition m = strlen((char *)pattern);
@@ -589,6 +789,57 @@ TextCollection::document_result CSA::Suffix(uchar const * pattern) const
     return result;
 }
 
+TextCollection::document_result CSA::Suffix(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return TextCollection::document_result(); // FIXME Should return all 1...k
+
+    TextPosition sp = 0, ep = 0;
+    // Search with end-marker
+    Search(pattern, m+1, &sp, &ep, begin, end);
+ 
+    TextCollection::document_result result;
+    result.reserve(ep-sp+1); // Try to avoid reallocation.
+
+    ulong sampled_rank_i = 0;
+    // Check each occurrence, already within [begin, end]
+    for (; sp <= ep; ++sp)
+    {
+        TextPosition i = sp;
+
+        uchar c = alphabetrank->access(i);
+        while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+        {
+            i = C[c]+alphabetrank->rank(c,i)-1;
+            c = alphabetrank->access(i);
+        }
+        // Assert: c == '\0'  OR  sampled->IsBitSet(i)
+
+        if (c == '\0')
+        {
+            // Rank among the end-markers in BWT
+            unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+            // End-marker that we found belongs to the "preceeding" doc in collection:
+            DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;        
+            // Map to doc ID:
+            docId = emptyTextRank->select0(docId+1);
+            result.push_back(docId);
+        }
+        else // Sampled position
+        {
+            DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+            // Map to doc ID:
+            docId = emptyTextRank->select0(docId+1);
+            result.push_back(docId);
+        }
+    }
+    
+    return result;
+}
+
+
 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 {
     TextPosition m = strlen((char const *)pattern);
@@ -626,6 +877,43 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const
     return result;
 }
 
+TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char const *)pattern);
+    if (m == 0)
+        return TextCollection::document_result(); // FIXME Should return all empty texts
+
+    TextPosition sp = 0, ep = 0;
+    // Match including end-marker
+    Search(pattern, m+1, &sp, &ep, begin, end);
+
+    TextCollection::document_result result;
+
+    // Report end-markers in result interval
+    unsigned resultSize = CountEndmarkers(sp, ep);
+    if (resultSize == 0)
+        return result;
+
+    result.reserve(resultSize); // Try to avoid reallocation.
+
+    unsigned i = 0;
+    if (sp > 0)
+        i = alphabetrank->rank(0, sp - 1);
+    while (resultSize)
+    {
+        // End-marker we found belongs to the "previous" doc in the collection
+        DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+        // Map to doc ID:
+        docId = emptyTextRank->select0(docId+1);
+        result.push_back(docId); // already within [begin, end]
+
+        -- resultSize;
+        ++ i;
+    }
+    
+    return result;
+}
+
 
 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 {
@@ -676,11 +964,130 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const
     return result;
 }
 
+TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return TextCollection::document_result();
+
+    TextPosition sp = 0, ep = 0;
+    // Search all occurrences
+    Search(pattern, m, &sp, &ep);
+
+    // We want unique document indentifiers, using std::set to collect them
+    std::set<DocId> resultSet;
+
+    ulong sampled_rank_i = 0;
+    // Check each occurrence
+    for (; sp <= ep; ++sp)
+    {
+        TextPosition i = sp;
+        uchar c = alphabetrank->access(i);
+        while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+        {
+            i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+            c = alphabetrank->access(i);
+        }
+        if (c == '\0')
+        {
+            // Rank among the end-markers in BWT
+            unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+            
+            // End-marker that we found belongs to the "preceeding" doc in collection:
+            DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+            if (docId >= begin && docId <= end)
+                resultSet.insert(docId);
+        }
+        else
+        {
+            DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+            assert((unsigned)docId < numberOfTexts);
+            if (docId >= begin && docId <= end)
+                resultSet.insert(docId);
+        }
+    }
+    
+    // Convert std::set to std::vector
+    TextCollection::document_result result(resultSet.begin(), resultSet.end());
+    // Map to doc ID's
+    for (document_result::iterator it = result.begin(); it != result.end(); ++it)
+        *it = emptyTextRank->select0(*it+1);
+    return result;
+}
+
 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
 {
-    // TODO
-    assert(0);
-    return document_result();
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return TextCollection::document_result(); // empty result set
+
+    TextPosition sp = 0, ep = 0;
+    SearchLessThan(pattern, m, &sp, &ep);
+
+    TextCollection::document_result result;
+    
+    // Report end-markers in result interval
+    unsigned resultSize = CountEndmarkers(sp, ep);
+    if (resultSize == 0)
+        return result;
+
+    result.reserve(resultSize); // Try to avoid reallocation.
+
+    // Iterate through end-markers in [sp,ep]:
+    unsigned i = 0;
+    if (sp > 0)
+        i = alphabetrank->rank(0, sp - 1);
+    while (resultSize)
+    {
+        // End-marker we found belongs to the "preceeding" doc in the collection
+        DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+        // Map to doc ID:
+        docId = emptyTextRank->select0(docId+1);
+        result.push_back(docId);
+
+        -- resultSize;
+        ++ i;
+    }
+    
+    return result;
+}
+
+TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return TextCollection::document_result(); // empty result set
+
+    TextPosition sp = 0, ep = 0;
+    SearchLessThan(pattern, m, &sp, &ep);
+
+    TextCollection::document_result result;
+    
+    // Report end-markers in result interval
+    unsigned resultSize = CountEndmarkers(sp, ep);
+    if (resultSize == 0)
+        return result;
+
+    result.reserve(resultSize); // Try to avoid reallocation.
+
+    // Iterate through end-markers in [sp,ep] and [begin, end]:
+    unsigned i = 0;
+    if (sp > 0)
+        i = alphabetrank->rank(0, sp - 1);
+    while (resultSize)
+    {
+        // End-marker we found belongs to the "preceeding" doc in the collection
+        DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+        // Map to doc ID:
+        docId = emptyTextRank->select0(docId+1);
+        if (docId >= begin && docId <= end)
+            result.push_back(docId);
+
+        -- resultSize;
+        ++ i;
+    }
+    
+    return result;
 }
 
 /**
@@ -738,6 +1145,60 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const
     return result;
 }
 
+TextCollection::full_result CSA::FullContains(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return full_result(); // FIXME Throw exception?
+
+    TextPosition sp = 0, ep = 0;
+    // Search all occurrences
+    Search(pattern, m, &sp, &ep);
+ 
+    full_result result;
+    result.reserve(ep-sp+1); // Try to avoid reallocation.
+
+    ulong sampled_rank_i = 0;    
+    // Report each occurrence
+    for (; sp <= ep; ++sp)
+    {
+        TextPosition i = sp;
+        TextPosition dist = 0;
+        uchar c = alphabetrank->access(i);
+        while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+        {
+            i = C[c]+alphabetrank->rank(c,i)-1;
+            c = alphabetrank->access(i);
+            ++ dist;
+        }
+        if (c == '\0')
+        {
+            // Rank among the end-markers in BWT
+            unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+            
+            // End-marker that we found belongs to the "preceeding" doc in collection:
+            DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+            // Map to doc ID:
+            docId = emptyTextRank->select0(docId+1);
+            if (docId >= begin && docId <= end)
+                result.push_back(make_pair(docId, dist)); 
+        }
+        else
+        {
+            TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
+            DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+//            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
+
+            // Map to doc ID:
+            docId = emptyTextRank->select0(docId+1);
+            if (docId >= begin && docId <= end)
+                result.push_back(make_pair(docId, textPos));
+        }
+    }
+    
+    return result;
+}
+
 
 /**
  * Save index to a file handle
@@ -869,6 +1330,38 @@ void CSA::Load(FILE *file, unsigned samplerate)
  * Rest of the functions follow...
  */
 
+
+// FIXME Use 2D-search structure
+unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+{
+    if (sp > ep)
+        return 0;
+    
+    ulong ranksp = 0;
+    if (sp != 0)
+        ranksp = alphabetrank->rank(0, sp - 1);
+    
+    unsigned resultSize = alphabetrank->rank(0, ep) - ranksp;
+    if (resultSize == 0)
+        return 0;
+
+    // Count end-markers in result interval and within [begin, end]
+    unsigned count = 0;
+    unsigned i = ranksp;
+    while (resultSize)
+    {
+        // End-marker we found belongs to the "previous" doc in the collection
+        DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+        // Map to doc ID:
+        docId = emptyTextRank->select0(docId+1);
+        if (docId >= begin && docId <= end)
+            ++ count;
+
+        -- resultSize;
+        ++ i;
+    }
+    return count;
+}
  
 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const 
 {
@@ -892,6 +1385,56 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult,
         return 0;
 }
 
+ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const 
+{
+    // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
+    int c = (int)pattern[m-1]; 
+    assert(c == 0); // Start from endmarkers
+    TextPosition i=m-1;
+    TextPosition sp = begin;
+    TextPosition ep = end;
+    while (sp<=ep && i>=1) 
+    {
+//         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+        c = (int)pattern[--i];
+        sp = C[c]+alphabetrank->rank(c,sp-1);
+        ep = C[c]+alphabetrank->rank(c,ep)-1;
+    }
+    *spResult = sp;
+    *epResult = ep;
+    if (sp<=ep)
+        return ep - sp + 1;
+    else
+        return 0;
+}
+
+
+ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const 
+{
+    // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
+    uint c = (int)pattern[m-1]; 
+    TextPosition i=m-1;
+    TextPosition sp = 1;
+    TextPosition ep = C[c+1]-1;
+    while (sp<=ep && i>=1) 
+    {
+//         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+        c = (int)pattern[--i];
+        uint result = alphabetrank->rank(c,ep);
+        if (result == ~0u)
+            ep = 0;
+        else
+            ep = C[c]+result-1;
+    }
+    *spResult = sp;
+    *epResult = ep;
+    if (sp<=ep)
+        return ep - sp + 1;
+    else
+        return 0;
+}
+
+
 CSA::~CSA() {
 #ifdef CSA_TEST_BWT
     delete dynFMI;
@@ -930,21 +1473,21 @@ void CSA::makewavelet(uchar *bwt)
 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);   
 //    delete [] bwt;
     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
+//  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+//    std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
 
-    // FIXME: static_sequence_wvtree accepts only uint arrays
-    uint *text = new uint[n];
-    for (i = 0; i < n; ++i) // Silly
-        text[i] = bwt[i];
-    delete [] bwt;
-    bwt = 0;
+//    HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
 
     alphabet_mapper * am = new alphabet_mapper_none();
-    static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
-    wt_coder * wtc = new wt_coder_huff(text,n,am);
-    alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am);
+    static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
+//    static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
+    wt_coder * wtc = new wt_coder_binary(bwt,n,am);
+    alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
     delete bmb;
-    delete [] text;
-    text = 0;
+    bwt = 0; // already deleted
+   
+//    std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+//    std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
 }
 
 void CSA::maketables()
@@ -953,17 +1496,24 @@ void CSA::maketables()
     // Note: bwtEndPos already set!
     // and the length of the first text (counter l):
 #ifdef CSA_TEST_BWT
-    ulong i = 0;
-    ulong l = 1;
-
-    uchar c  = alphabetrank->access(i);
-    while (c != '\0')
     {
-        i = C[c]+alphabetrank->rank(c, i)-1;
-        ++l;
-        c = alphabetrank->access(i);
+        ulong i = 0;
+        ulong l = 1;
+        uint alphabetrank_i_tmp = 0;
+        uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
+        while (c != '\0')
+        {
+            i = C[c]+alphabetrank_i_tmp-1;
+            ++l;
+            c = alphabetrank->access(i, alphabetrank_i_tmp);
+        }
+
+        if (i != bwtEndPos) // compare result
+        {
+            std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl;
+            exit(0);
+        }
     }
-    assert(i == bwtEndPos); // compare result
 #endif
 
     // Build up arrays for text length and starting positions
@@ -993,6 +1543,7 @@ void CSA::maketables()
     DocId textId = numberOfTexts;
     ulong ulongmax = 0;
     ulongmax--;
+    uint alphabetrank_i_tmp =0;
 
     //positions:
     for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
@@ -1006,13 +1557,13 @@ void CSA::maketables()
             sampleCount ++;
         }
 
-        uchar c = alphabetrank->access(p);
+        uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
         if (c == '\0')
         {
             --textId;
             
             // Record the order of end-markers in BWT:
-            ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
+            ulong endmarkerRank = alphabetrank_i_tmp - 1;
             (*endmarkerDocId)[endmarkerRank] = textId;
 
             // Store text length and text start position:
@@ -1027,7 +1578,7 @@ void CSA::maketables()
             p = textId; // Correct LF-mapping to the last char of the previous text.
         }
         else
-            p = C[c]+alphabetrank->rank(c, p)-1;
+            p = C[c]+alphabetrank_i_tmp-1;
     }
     assert(textId == 0);
     
@@ -1154,3 +1705,5 @@ unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
       return x | (bit << pos);
 }
 
+} // namespace SXSI
+