X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TCImplementation.h;h=75f1d40366f35191de41642cd984c5480705074b;hb=7cdaf25b1e5f1890e359b3ad37ab7ec2c9e30d5a;hp=5e488db94da8a623ace9f6dad8df4ead78643732;hpb=40ddf9aca842bdc081b6350a4ebfe36b066c94c9;p=SXSI%2FTextCollection.git diff --git a/TCImplementation.h b/TCImplementation.h index 5e488db..75f1d40 100644 --- a/TCImplementation.h +++ b/TCImplementation.h @@ -23,13 +23,13 @@ #include "BitRank.h" #include "TextCollection.h" #include "BlockArray.h" -#include "BSGAP.h" // Include from XMLTree/libcds #include // Defines W == 32 #include #include #include +#include // Re-define word size to ulong: #undef W @@ -39,12 +39,14 @@ # define W 32 #endif #undef bitset +#undef bitget + +#include "TextStorage.h" +#include namespace SXSI { -// Un-comment to compare BWT against a BWT generated from class dynFMI: -//#define CSA_TEST_BWT /** * Implementation of the TextCollection interface @@ -52,15 +54,35 @@ namespace SXSI */ class TCImplementation : public SXSI::TextCollection { public: - TCImplementation(uchar *, ulong, unsigned, unsigned, ulong); + TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong, char); ~TCImplementation(); bool EmptyText(DocId) const; - uchar* GetText(DocId) const; + /** - * Next method is not supported: - * Supporting GetText for some substring [i,j] - * would require more space. + * Extracting one text. + * + * Call DeleteText() for each pointer returned by GetText() + * to avoid possible memory leaks. + */ + uchar * GetText(DocId) const; + void DeleteText(uchar *text) const + { textStorage->DeleteText(text); } + + /** + * Returns a pointer to the beginning of texts i, i+1, ..., j. + * Texts are separated by a '\0' byte. + * + * Call DeleteText() for each pointer returned by GetText() + * to avoid possible memory leaks. + */ + uchar * GetText(DocId i, DocId j) const + { return textStorage->GetText(i, j); } + + /** + * Returns a substring of given text ID. + * + * FIXME This may be reimplemented via TextStorage. */ // uchar* GetText(DocId, TextPosition, TextPosition) const; @@ -95,6 +117,8 @@ public: document_result Equal(uchar const *) const; document_result Contains(uchar const *) const; document_result LessThan(uchar const *) const; + document_result KMismaches(uchar const *, unsigned) const; + document_result KErrors(uchar const *, unsigned) const; document_result Prefix(uchar const *, DocId, DocId) const; document_result Suffix(uchar const *, DocId, DocId) const; @@ -105,12 +129,16 @@ public: // Definition of full_result is inherited from SXSI::TextCollection. full_result FullContains(uchar const *) const; full_result FullContains(uchar const *, DocId, DocId) const; + full_result FullKMismatches(uchar const *, unsigned) const; + full_result FullKErrors(uchar const *, unsigned) const; // Index from/to disk TCImplementation(FILE *, unsigned); void Save(FILE *) const; private: + typedef std::vector > suffix_range_vector; + static const uchar versionFlag; TextPosition n; unsigned samplerate; @@ -119,34 +147,32 @@ private: static_sequence * alphabetrank; // Sample structures for texts longer than samplerate - BSGAP *sampled; // FIXME Replace with RRR02 + static_bitsequence * sampled; BlockArray *suffixes; BlockArray *suffixDocId; // Total number of texts in the collection unsigned numberOfTexts; - // Total number of texts including empty texts - unsigned numberOfAllTexts; // Length of the longest text ulong maxTextLength; // Array of document id's in the order of end-markers in BWT - // Access by endmarkerDocId[rank_$(L, p) - 1]. - BlockArray *endmarkerDocId; + static_sequence *Doc; - // FIXME Replace with a more succinct data structure - // Note: Empty texts are already handled inside XMLTree class. - BSGAP *emptyTextRank; // FIXME Remove + // Text storage for fast extraction + TextStorage * textStorage; - // Following are not part of the public API + // Following methods are not part of the public API uchar * BWT(uchar *); void makewavelet(uchar *); - void maketables(); + void maketables(ulong, char); DocId DocIdAtTextPos(BlockArray*, TextPosition) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const; ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - + ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const; + ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const; + ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const; /** * Count end-markers in given interval */ @@ -161,7 +187,190 @@ private: return alphabetrank->rank(0, ep) - ranksp; } - unsigned CountEndmarkers(TextPosition, TextPosition, DocId, DocId) const; + + /** + * Count end-markers in given interval and + * within docIds [min,max] + */ + inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const + { + if (sp != 0) + sp = alphabetrank->rank(0, sp - 1); + ep = alphabetrank->rank(0, ep); + if (ep == 0) + return 0; + + return Doc->count(sp, ep-1, min, max); + } + + /** + * Enumerate all end-markers in given interval + */ + inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const + { + if (sp != 0) + sp = alphabetrank->rank(0, sp - 1); + ep = alphabetrank->rank(0, ep); + if (ep == 0) + return document_result(); + + return Doc->accessAll(sp, ep-1); + } + + /** + * Enumerate end-markers in given interval and + * within docIds [min,max] + */ + inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const + { + if (sp != 0) + sp = alphabetrank->rank(0, sp - 1); + ep = alphabetrank->rank(0, ep); + if (ep == 0) + return document_result(); + + return Doc->access(sp, ep-1, min, max); + } + + /** + * Enumerate documents in given interval [sp, ep] + */ + inline void EnumerateDocuments(std::set &resultSet, TextPosition sp, TextPosition ep) const + { + // We want unique document indentifiers, using std::set to collect them + // FIXME use unordered_set? + uint tmp_rank_c = 0; // Cache rank value of c. + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + resultSet.insert(Doc->access(endmarkerRank)); + } + else + { + DocId di = (*suffixDocId)[sampled->rank1(i)-1]; + assert((unsigned)di < numberOfTexts); + resultSet.insert(di); + } + } + } + + /** + * Enumerate documents in given interval [sp, ep] + * and within [begin, end] + */ + inline void EnumerateDocuments(std::set &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const + { + // We want unique document indentifiers, using std::set to collect them + uint tmp_rank_c = 0; // Cache rank value of c. + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + DocId docId = Doc->access(endmarkerRank); + if (docId >= begin && docId <= end) + resultSet.insert(docId); + } + else + { + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; + assert((unsigned)docId < numberOfTexts); + if (docId >= begin && docId <= end) + resultSet.insert(docId); + } + } + } + + /** + * Enumerate document+position pairs (full_result) of + * each suffix in given interval. + */ + inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const + { + uint tmp_rank_c = 0; // Cache rank value of c. + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + TextPosition dist = 0; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + ++ dist; + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + DocId docId = Doc->access(endmarkerRank); + result.push_back(make_pair(docId, dist)); + } + else + { + TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; + + result.push_back(make_pair(docId, textPos)); + } + } + } + + /** + * Enumerate document+position pairs (full_result) of + * each suffix in given interval and within [begin, end]. + */ + inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const + { + uint tmp_rank_c = 0; // Cache rank value of c. + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + TextPosition dist = 0; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + ++ dist; + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + DocId docId = Doc->access(endmarkerRank); + if (docId >= begin && docId <= end) + result.push_back(make_pair(docId, dist)); + } + else + { + TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; + + if (docId >= begin && docId <= end) + result.push_back(make_pair(docId, textPos)); + } + } + } + }; // class TCImplementation } // namespace SXSI