X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TCImplementation.h;h=d9ac0637584beeb90a289af071ce1a2fb2850785;hb=0b35fca408fd60a0f4dc82c1e26c06f05b1661f6;hp=0439978d3d0e4a4572169b6a7d929222b61149e1;hpb=bcbed10c547780b6e5b2028d936eae337ecebac5;p=SXSI%2FTextCollection.git

diff --git a/TCImplementation.h b/TCImplementation.h
index 0439978..d9ac063 100644
--- a/TCImplementation.h
+++ b/TCImplementation.h
@@ -41,7 +41,7 @@
 #undef bitset
 
 #include "TextStorage.h"
-
+#include <set>
 
 namespace SXSI 
 {
@@ -117,6 +117,8 @@ public:
     document_result Equal(uchar const *) const;
     document_result Contains(uchar const *) const;
     document_result LessThan(uchar const *) const;
+    document_result Kmismaches(uchar const *, unsigned) const;
+    document_result Kerrors(uchar const *, unsigned) const;
 
     document_result Prefix(uchar const *, DocId, DocId) const;
     document_result Suffix(uchar const *, DocId, DocId) const;
@@ -127,12 +129,16 @@ public:
     // Definition of full_result is inherited from SXSI::TextCollection.
     full_result FullContains(uchar const *) const;
     full_result FullContains(uchar const *, DocId, DocId) const;
+    full_result FullKmismatches(uchar const *, unsigned) const;
+    full_result FullKerrors(uchar const *, unsigned) const;
 
     // Index from/to disk
     TCImplementation(FILE *, unsigned);
     void Save(FILE *) const;
 
 private:
+    typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
+
     static const uchar versionFlag;
     TextPosition n;
     unsigned samplerate;
@@ -164,7 +170,9 @@ private:
     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
     ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
-
+    ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
+    ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
+    ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const; 
     /**
      * Count end-markers in given interval
      */
@@ -223,6 +231,146 @@ private:
         
         return Doc->access(sp, ep-1, min, max);
     }
+
+    /** Insert into resultSet the document id of each suffix in the inclusive interval [sp, ep].
+     *  sp is passed by value and doubles as the loop cursor; nothing is inserted when sp > ep.
+     *  Ids already present in resultSet are left untouched. */
+    inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
+    {
+        // We want unique document identifiers, using std::set to collect them
+        // FIXME use unordered_set?
+        uint tmp_rank_c = 0; // Cache rank value of c; written by every access() call below (presumably rank(c, i) -- confirm against alphabetrank).
+        for (; sp <= ep; ++sp)
+        {
+            TextPosition i = sp;
+            uchar c = alphabetrank->access(i, tmp_rank_c);
+            while (c != '\0' && !sampled->access(i)) // walk until an end-marker symbol or a sampled position is reached
+            {
+                i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; (the cached rank replaces this explicit query)
+                c = alphabetrank->access(i, tmp_rank_c);
+            }
+            if (c == '\0')
+            {
+                // Rank among the end-markers in BWT
+                unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+                resultSet.insert(Doc->access(endmarkerRank)); // Doc turns the end-marker rank into a document id
+            }
+            else
+            {
+                DocId di = (*suffixDocId)[sampled->rank1(i)-1]; // loop ended on a sampled i: read the id stored for that sample
+                assert((unsigned)di < numberOfTexts);
+                resultSet.insert(di);
+            }
+        }
+    }
+
+    /** Insert into resultSet the document id of each suffix in the inclusive interval [sp, ep],
+     *  keeping only ids that satisfy begin <= id <= end (both bounds inclusive).
+     *  sp is passed by value and doubles as the loop cursor; nothing is inserted when sp > ep.
+     *  Ids already present in resultSet are left untouched. */
+    inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+    {
+        // We want unique document identifiers, using std::set to collect them
+        uint tmp_rank_c = 0; // Cache rank value of c; written by every access() call below (presumably rank(c, i) -- confirm against alphabetrank).
+        for (; sp <= ep; ++sp)
+        {
+            TextPosition i = sp;
+            uchar c = alphabetrank->access(i, tmp_rank_c);
+            while (c != '\0' && !sampled->access(i)) // walk until an end-marker symbol or a sampled position is reached
+            {
+                i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; (the cached rank replaces this explicit query)
+                c = alphabetrank->access(i, tmp_rank_c);
+            }
+            if (c == '\0')
+            {
+                // Rank among the end-markers in BWT
+                unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+                DocId docId = Doc->access(endmarkerRank); // Doc turns the end-marker rank into a document id
+                if (docId >= begin && docId <= end) // keep only ids inside the requested document range
+                    resultSet.insert(docId);
+            }
+            else
+            {
+                DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; // loop ended on a sampled i: read the id stored for that sample
+                assert((unsigned)docId < numberOfTexts);
+                if (docId >= begin && docId <= end) // keep only ids inside the requested document range
+                    resultSet.insert(docId);
+            }
+        }
+    }
+
+    /** Append to result one (document id, position) pair for each suffix in the inclusive
+     *  interval [sp, ep], in increasing order of suffix index; nothing is appended when sp > ep.
+     *  No de-duplication is done: exactly one pair is pushed per suffix, and pairs already in
+     *  result are kept. sp is passed by value and doubles as the loop cursor. */
+    inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
+    {
+        uint tmp_rank_c = 0; // Cache rank value of c; written by every access() call below (presumably rank(c, i) -- confirm against alphabetrank).
+        for (; sp <= ep; ++sp)
+        {
+            TextPosition i = sp;
+            TextPosition dist = 0; // number of steps walked away from sp so far
+            uchar c = alphabetrank->access(i, tmp_rank_c);
+            while (c != '\0' && !sampled->access(i)) // walk until an end-marker symbol or a sampled position is reached
+            {
+                i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; (the cached rank replaces this explicit query)
+                c = alphabetrank->access(i, tmp_rank_c);
+                ++ dist;
+            }
+            if (c == '\0')
+            {
+                // Rank among the end-markers in BWT
+                unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+                DocId docId = Doc->access(endmarkerRank); // Doc turns the end-marker rank into a document id
+                result.push_back(make_pair(docId, dist)); // reported position is just the number of steps walked
+            }
+            else
+            {
+                TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; // value stored for the sample, advanced by the steps walked
+                DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; // same sample index as above, read from the document-id table
+
+                result.push_back(make_pair(docId, textPos));
+            }
+        }
+    }
+
+    /** Append to result one (document id, position) pair for each suffix in the inclusive
+     *  interval [sp, ep] whose document id satisfies begin <= id <= end (both bounds inclusive).
+     *  Pairs are pushed in increasing order of suffix index without de-duplication; pairs already
+     *  in result are kept. sp is passed by value and doubles as the loop cursor. */
+    inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+    {
+        uint tmp_rank_c = 0; // Cache rank value of c; written by every access() call below (presumably rank(c, i) -- confirm against alphabetrank).
+        for (; sp <= ep; ++sp)
+        {
+            TextPosition i = sp;
+            TextPosition dist = 0; // number of steps walked away from sp so far
+            uchar c = alphabetrank->access(i, tmp_rank_c);
+            while (c != '\0' && !sampled->access(i)) // walk until an end-marker symbol or a sampled position is reached
+            {
+                i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; (the cached rank replaces this explicit query)
+                c = alphabetrank->access(i, tmp_rank_c);
+                ++ dist;
+            }
+            if (c == '\0')
+            {
+                // Rank among the end-markers in BWT
+                unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+                DocId docId = Doc->access(endmarkerRank); // Doc turns the end-marker rank into a document id
+                if (docId >= begin && docId <= end) // keep only ids inside the requested document range
+                    result.push_back(make_pair(docId, dist)); // reported position is just the number of steps walked
+            }
+            else
+            {
+                TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; // value stored for the sample, advanced by the steps walked
+                DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; // same sample index as above, read from the document-id table
+
+                if (docId >= begin && docId <= end) // keep only ids inside the requested document range
+                    result.push_back(make_pair(docId, textPos));
+            }
+        }
+    }
+
 }; // class TCImplementation
 
 } // namespace SXSI