Added deltavector for non-indexed texts

[SXSI/TextCollection.git] / TCImplementation.h
diff --git a/TCImplementation.h b/TCImplementation.h

index b74f705..f0a21cc 100644 (file)
--- a/TCImplementation.h
+++ b/TCImplementation.h
@@ -23,13 +23,13 @@
  #include "BitRank.h"
  #include "TextCollection.h"
  #include "BlockArray.h"
  #include "BitRank.h"
  #include "TextCollection.h"
  #include "BlockArray.h"
-#include "BSGAP.h"
  
  // Include  from XMLTree/libcds
  #include <basics.h> // Defines W == 32
  #include <static_bitsequence.h>
  #include <alphabet_mapper.h>
  #include <static_sequence.h>
  
  // Include  from XMLTree/libcds
  #include <basics.h> // Defines W == 32
  #include <static_bitsequence.h>
  #include <alphabet_mapper.h>
  #include <static_sequence.h>
+#include <static_sequence_wvtree_noptrs.h>
  
  // Re-define word size to ulong:
  #undef W
  
  // Re-define word size to ulong:
  #undef W
@@ -39,6 +39,11 @@
  #   define W 32
  #endif
  #undef bitset
  #   define W 32
  #endif
  #undef bitset
+#undef bitget
+
+#include "TextStorage.h"
+#include "ArrayDoc.h"
+#include <set>
  
  namespace SXSI 
  {
  
  namespace SXSI 
  {
@@ -50,15 +55,35 @@ namespace SXSI
   */
  class TCImplementation : public SXSI::TextCollection {
  public:
   */
  class TCImplementation : public SXSI::TextCollection {
  public:
-    TCImplementation(uchar *, ulong, unsigned, unsigned, ulong);
+    TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong, char);
      ~TCImplementation();
  
      bool EmptyText(DocId) const;
      ~TCImplementation();
  
      bool EmptyText(DocId) const;
-    uchar* GetText(DocId) const;
+
+    /**
+     * Extracting one text.
+     *
+     * Call DeleteText() for each pointer returned by GetText()
+     * to avoid possible memory leaks.
+     */
+    uchar * GetText(DocId) const;
+    void DeleteText(uchar *text) const
+    { textStorage->DeleteText(text); }
+
      /**
      /**
-     * Next method is not supported:
-     * Supporting GetText for some substring [i,j]
-     * would require more space.
+     * Returns a pointer to the beginning of texts i, i+1, ..., j.
+     * Texts are separated by a '\0' byte.
+     *
+     * Call DeleteText() for each pointer returned by GetText()
+     * to avoid possible memory leaks.
+     */
+    uchar * GetText(DocId i, DocId j) const
+    { return textStorage->GetText(i, j); }
+
+    /**
+     * Returns a substring of given text ID.
+     * 
+     * FIXME This may be reimplemented via TextStorage.
       */
  //    uchar* GetText(DocId, TextPosition, TextPosition) const;
  
       */
  //    uchar* GetText(DocId, TextPosition, TextPosition) const;
  
@@ -93,6 +118,8 @@ public:
      document_result Equal(uchar const *) const;
      document_result Contains(uchar const *) const;
      document_result LessThan(uchar const *) const;
      document_result Equal(uchar const *) const;
      document_result Contains(uchar const *) const;
      document_result LessThan(uchar const *) const;
+    document_result KMismaches(uchar const *, unsigned) const;
+    document_result KErrors(uchar const *, unsigned) const;
  
      document_result Prefix(uchar const *, DocId, DocId) const;
      document_result Suffix(uchar const *, DocId, DocId) const;
  
      document_result Prefix(uchar const *, DocId, DocId) const;
      document_result Suffix(uchar const *, DocId, DocId) const;
@@ -103,12 +130,16 @@ public:
      // Definition of full_result is inherited from SXSI::TextCollection.
      full_result FullContains(uchar const *) const;
      full_result FullContains(uchar const *, DocId, DocId) const;
      // Definition of full_result is inherited from SXSI::TextCollection.
      full_result FullContains(uchar const *) const;
      full_result FullContains(uchar const *, DocId, DocId) const;
+    full_result FullKMismatches(uchar const *, unsigned) const;
+    full_result FullKErrors(uchar const *, unsigned) const;
  
      // Index from/to disk
      TCImplementation(FILE *, unsigned);
      void Save(FILE *) const;
  
  private:
  
      // Index from/to disk
      TCImplementation(FILE *, unsigned);
      void Save(FILE *) const;
  
  private:
+    typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
+
      static const uchar versionFlag;
      TextPosition n;
      unsigned samplerate;
      static const uchar versionFlag;
      TextPosition n;
      unsigned samplerate;
@@ -117,7 +148,7 @@ private:
      static_sequence * alphabetrank;
  
      // Sample structures for texts longer than samplerate
      static_sequence * alphabetrank;
  
      // Sample structures for texts longer than samplerate
-    BSGAP *sampled;  // FIXME Replace with RRR02
+    static_bitsequence * sampled;
      BlockArray *suffixes;
      BlockArray *suffixDocId;
  
      BlockArray *suffixes;
      BlockArray *suffixDocId;
  
@@ -127,18 +158,23 @@ private:
      ulong maxTextLength;
  
      // Array of document id's in the order of end-markers in BWT
      ulong maxTextLength;
  
      // Array of document id's in the order of end-markers in BWT
-    // Access by endmarkerDocId[rank_$(L, p) - 1].
-    static_sequence *Doc;
+//    static_sequence *Doc;
+    ArrayDoc *Doc;
  
  
-    // Following are not part of the public API
+    // Text storage for fast extraction
+    TextStorage * textStorage;
+
+    // Following methods are not part of the public API
      uchar * BWT(uchar *);
      void makewavelet(uchar *);
      uchar * BWT(uchar *);
      void makewavelet(uchar *);
-    void maketables();
+    void maketables(ulong, char);
      DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
      ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
      ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
      ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
      DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
      ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
      ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
      ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
-
+    ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
+    ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
+    ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const; 
      /**
       * Count end-markers in given interval
       */
      /**
       * Count end-markers in given interval
       */
@@ -197,6 +233,146 @@ private:
          
          return Doc->access(sp, ep-1, min, max);
      }
          
          return Doc->access(sp, ep-1, min, max);
      }
+
+    /**
+     * Enumerate documents in given interval [sp, ep]
+     */
+    inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
+    {
+        // We want unique document indentifiers, using std::set to collect them
+        // FIXME use unordered_set?
+        uint tmp_rank_c = 0; // Cache rank value of c.
+        for (; sp <= ep; ++sp)
+        {
+            TextPosition i = sp;
+            uchar c = alphabetrank->access(i, tmp_rank_c);
+            while (c != '\0' && !sampled->access(i))
+            {
+                i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+                c = alphabetrank->access(i, tmp_rank_c);
+            }
+            if (c == '\0')
+            {
+                // Rank among the end-markers in BWT
+                unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+                resultSet.insert(Doc->access(endmarkerRank));
+            }
+            else
+            {
+                DocId di = (*suffixDocId)[sampled->rank1(i)-1];
+                assert((unsigned)di < numberOfTexts);
+                resultSet.insert(di);
+            }
+        }
+    }
+
+    /**
+     * Enumerate documents in given interval [sp, ep]
+     * and within [begin, end]
+     */
+    inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+    {
+        // We want unique document indentifiers, using std::set to collect them
+        uint tmp_rank_c = 0; // Cache rank value of c.
+        for (; sp <= ep; ++sp)
+        {
+            TextPosition i = sp;
+            uchar c = alphabetrank->access(i, tmp_rank_c);
+            while (c != '\0' && !sampled->access(i))
+            {
+                i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+                c = alphabetrank->access(i, tmp_rank_c);
+            }
+            if (c == '\0')
+            {
+                // Rank among the end-markers in BWT
+                unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+                DocId docId = Doc->access(endmarkerRank);
+                if (docId >= begin && docId <= end)
+                    resultSet.insert(docId);
+            }
+            else
+            {
+                DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+                assert((unsigned)docId < numberOfTexts);
+                if (docId >= begin && docId <= end)
+                    resultSet.insert(docId);
+            }
+        }
+    }
+
+    /**
+     * Enumerate document+position pairs (full_result) of
+     * each suffix in given interval.
+     */
+    inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
+    {
+        uint tmp_rank_c = 0; // Cache rank value of c.
+        for (; sp <= ep; ++sp)
+        {
+            TextPosition i = sp;
+            TextPosition dist = 0;
+            uchar c = alphabetrank->access(i, tmp_rank_c);
+            while (c != '\0' && !sampled->access(i))
+            {
+                i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+                c = alphabetrank->access(i, tmp_rank_c);
+                ++ dist;
+            }
+            if (c == '\0')
+            {
+                // Rank among the end-markers in BWT
+                unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+                DocId docId = Doc->access(endmarkerRank);
+                result.push_back(make_pair(docId, dist)); 
+            }
+            else
+            {
+                TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
+                DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+
+                result.push_back(make_pair(docId, textPos));
+            }
+        }
+    }
+
+    /**
+     * Enumerate document+position pairs (full_result) of
+     * each suffix in given interval and within [begin, end].
+     */
+    inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+    {
+        uint tmp_rank_c = 0; // Cache rank value of c.
+        for (; sp <= ep; ++sp)
+        {
+            TextPosition i = sp;
+            TextPosition dist = 0;
+            uchar c = alphabetrank->access(i, tmp_rank_c);
+            while (c != '\0' && !sampled->access(i))
+            {
+                i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+                c = alphabetrank->access(i, tmp_rank_c);
+                ++ dist;
+            }
+            if (c == '\0')
+            {
+                // Rank among the end-markers in BWT
+                unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+                DocId docId = Doc->access(endmarkerRank);
+                if (docId >= begin && docId <= end)
+                    result.push_back(make_pair(docId, dist)); 
+            }
+            else
+            {
+                TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
+                DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+
+                if (docId >= begin && docId <= end)
+                    result.push_back(make_pair(docId, textPos));
+            }
+        }
+    }
+
  }; // class TCImplementation
  
  } // namespace SXSI
  }; // class TCImplementation
  
  } // namespace SXSI