X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.h;h=be9a2f9246f120bcae166441e01d068fe65df34b;hb=1ea86bc7fdc05347b46919374f071920fec8adb5;hp=ec6e890be2bffd93e39452af1bd4d33013ed46e0;hpb=9df50e6e5e58f746e83b9ad1d539024f82fa8b64;p=SXSI%2FTextCollection.git diff --git a/CSA.h b/CSA.h index ec6e890..be9a2f9 100644 --- a/CSA.h +++ b/CSA.h @@ -25,11 +25,12 @@ #include "TextCollection.h" #include "BlockArray.h" #include "RLWaveletTree.h" +#include "StringIterator.h" #include #include // Include from XMLTree/libcds -#include +#include // Defines W == 32 #include #include #include @@ -41,22 +42,17 @@ #else # define W 32 #endif +#undef bitset +namespace SXSI +{ + +// Un-comment to compare BWT against a BWT generated from class dynFMI: +//#define CSA_TEST_BWT /** * Implementation of the TextCollection interface * - * Implementation notes: - * - * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree - * which requires O(n log \sigma) bits of memory. If we know the distribution - * of characters beforehand, space can easily be made O(nH_0). - * - * The method MakeStatic() constructs a Succinct Suffix Array with a huffman - * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME) - * - * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used - * with any FM-index variant supporting LF-mapping. */ class CSA : public SXSI::TextCollection { public: @@ -73,6 +69,11 @@ public: void MakeStatic(); bool EmptyText(DocId) const; uchar* GetText(DocId) const; + /** + * Next method is not supported: + * Supporting GetText for some substring [i,j] + * would require more space. + */ // uchar* GetText(DocId, TextPosition, TextPosition) const; bool IsPrefix(uchar const *) const; @@ -81,6 +82,7 @@ public: bool IsContains(uchar const *) const; bool IsLessThan(uchar const *) const; + ulong Count(uchar const *) const; unsigned CountPrefix(uchar const *) const; unsigned CountSuffix(uchar const *) const; unsigned CountEqual(uchar const *) const; @@ -102,7 +104,7 @@ public: void Save(FILE *) const; - // Debug FIXME Remove + // FIXME Remove void deleteWT() { delete alphabetrank; @@ -116,8 +118,6 @@ public: sampled =0; delete suffixes; suffixes = 0; - delete positions; - positions = 0; delete suffixDocId; suffixDocId = 0; } @@ -128,6 +128,7 @@ public: } private: + // FIXME Unused code class TCodeEntry { public: unsigned count; @@ -137,6 +138,7 @@ private: }; + // FIXME Unused code class THuffAlphabetRank { // using fixed 0...255 alphabet private: @@ -223,6 +225,7 @@ private: } }; + // FIXME Unused code class node { private: unsigned weight; @@ -256,8 +259,6 @@ private: static TCodeEntry *makecodetable(uchar *, TextPosition); }; - static const unsigned char print = 1; - static const unsigned char report = 1; static const uchar versionFlag; TextPosition n; unsigned samplerate; @@ -266,12 +267,16 @@ private: // THuffAlphabetRank *alphabetrank; // RLWaveletTree *alphabetrank; static_sequence * alphabetrank; + +#ifdef CSA_TEST_BWT + DynFMI *dynFMI; +#endif + TCodeEntry *codetable; + + // Sample structures for texts longer than samplerate BSGAP *sampled; BlockArray *suffixes; BlockArray *suffixDocId; - BlockArray *positions; - TCodeEntry *codetable; - DynFMI *dynFMI; // Total number of texts in the collection unsigned numberOfTexts; @@ -283,30 +288,27 @@ private: // Array of document id's in the order of end-markers in BWT // Access by endmarkerDocId[rank_$(L, p) - 1]. BlockArray *endmarkerDocId; - // Array of text lengths (in the inserted order) - BlockArray *textLength; - // Array of text starting positions (in the inserted order) - BlockArray *textStartPos; // FIXME Replace with a more succinct data structure + // Note: Empty texts are already handled inside XMLTree class. std::set emptyTextId; BSGAP *emptyTextRank; - // Private methods + // FIXME A better solution? + std::string texts; + + // Following are not part of the public API uchar * BWT(uchar *); - uchar * LoadFromFile(const char *); - void SaveToFile(const char *, uchar *); void makewavelet(uchar *); void maketables(); - - // Following are not part of the public API - DocId DocIdAtTextPos(TextPosition) const; + DocId DocIdAtTextPos(BlockArray*, TextPosition) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - TextPosition Inverse(TextPosition) const; - TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; - TextPosition Psi(TextPosition) const; - uchar * Substring(TextPosition, TextPosition) const; - TextPosition Lookup(TextPosition) const; + ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const; +// TextPosition Inverse(TextPosition) const; +// TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; +// TextPosition Psi(TextPosition) const; +// uchar * Substring(TextPosition, TextPosition) const; +// TextPosition Lookup(TextPosition) const; /** * Count end-markers in given interval @@ -322,6 +324,8 @@ private: return alphabetrank->rank(0, ep) - ranksp; } -}; +}; // class CSA + +} // namespace SXSI #endif