X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.h;h=c0751cf406b4254b625c1c7f062fbd7509151781;hb=6149332e97e23dbcfb42434f88be4a59143e73ab;hp=bd7b199507476130e1933126e14af006ba74f4ae;hpb=8c8e860bacea7245fb06b7b7e12ca073f45e1a42;p=SXSI%2FTextCollection.git diff --git a/CSA.h b/CSA.h index bd7b199..c0751cf 100644 --- a/CSA.h +++ b/CSA.h @@ -20,28 +20,35 @@ #ifndef _CSA_H_ #define _CSA_H_ - -#include -#include -#include "BitRank.h" #include "dynFMI.h" +#include "BitRank.h" #include "TextCollection.h" #include "BlockArray.h" +#include "RLWaveletTree.h" +#include +#include + +// Include from XMLTree/libcds +#include // Defines W == 32 +#include +#include +#include + +// Re-define word size to ulong: +#undef W +#if __WORDSIZE == 64 +# define W 64 +#else +# define W 32 +#endif +#undef bitset + +// Un-comment to compare BWT against a BWT generated from class dynFMI: +//#define CSA_TEST_BWT /** * Implementation of the TextCollection interface * - * Implementation notes: - * - * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree - * which requires O(n log \sigma) bits of memory. If we know the distribution - * of characters beforehand, space can easily be made O(nH_0). - * - * The method MakeStatic() constructs a Succinct Suffix Array with a huffman - * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME) - * - * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used - * with any FM-index variant supporting LF-mapping. */ class CSA : public SXSI::TextCollection { public: @@ -58,7 +65,12 @@ public: void MakeStatic(); bool EmptyText(DocId) const; uchar* GetText(DocId) const; - uchar* GetText(DocId, TextPosition, TextPosition) const; + /** + * Next method is not supported: + * Supporting GetText for some substring [i,j] + * would require more space. + */ +// uchar* GetText(DocId, TextPosition, TextPosition) const; bool IsPrefix(uchar const *) const; bool IsSuffix(uchar const *) const; @@ -66,6 +78,7 @@ public: bool IsContains(uchar const *) const; bool IsLessThan(uchar const *) const; + ulong Count(uchar const *) const; unsigned CountPrefix(uchar const *) const; unsigned CountSuffix(uchar const *) const; unsigned CountEqual(uchar const *) const; @@ -86,10 +99,32 @@ public: void Load(FILE *, unsigned); void Save(FILE *) const; - // debug - TextPosition Lookup(TextPosition) const; + + // FIXME Remove + void deleteWT() + { + delete alphabetrank; + alphabetrank = 0; + delete [] codetable; + codetable = 0; + } + void deleteSamples() + { + delete sampled; + sampled =0; + delete suffixes; + suffixes = 0; + delete suffixDocId; + suffixDocId = 0; + } + void deleteEndmarker() + { + delete endmarkerDocId; + endmarkerDocId = 0; + } private: + // FIXME Unused code class TCodeEntry { public: unsigned count; @@ -99,6 +134,7 @@ private: }; + // FIXME Unused code class THuffAlphabetRank { // using fixed 0...255 alphabet private: @@ -110,7 +146,10 @@ private: bool leaf; public: THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned); + THuffAlphabetRank(FILE *); ~THuffAlphabetRank(); + + void Save(FILE *); bool Test(uchar *, TextPosition); inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i @@ -151,7 +190,7 @@ private: } return true; } - inline int charAtPos(TextPosition i) const { + inline uchar access(TextPosition i) const { THuffAlphabetRank const * temp=this; while (!temp->leaf) { if (temp->bitrank->IsBitSet(i)) { @@ -163,10 +202,26 @@ private: temp = temp->left; } } - return (int)temp->ch; + return temp->ch; + } + + inline uchar charAtPos(ulong i, TextPosition *rank) const { + THuffAlphabetRank const * temp=this; + while (!temp->leaf) { + if (temp->bitrank->IsBitSet(i)) { + i = temp->bitrank->rank(i)-1; + temp = temp->right; + } else { + i = i-temp->bitrank->rank(i); + temp = temp->left; + } + } + (*rank)=i; + return temp->ch; } }; + // FIXME Unused code class node { private: unsigned weight; @@ -200,48 +255,70 @@ private: static TCodeEntry *makecodetable(uchar *, TextPosition); }; - static const unsigned char print = 1; - static const unsigned char report = 1; + static const uchar versionFlag; TextPosition n; unsigned samplerate; unsigned C[256]; TextPosition bwtEndPos; - THuffAlphabetRank *alphabetrank; - BitRank *sampled; - BlockArray *suffixes; - BlockArray *positions; - TCodeEntry *codetable; +// THuffAlphabetRank *alphabetrank; + // RLWaveletTree *alphabetrank; + static_sequence * alphabetrank; + +#ifdef CSA_TEST_BWT DynFMI *dynFMI; +#endif + TCodeEntry *codetable; + + // Sample structures for texts longer than samplerate + BSGAP *sampled; + BlockArray *suffixes; + BlockArray *suffixDocId; // Total number of texts in the collection unsigned numberOfTexts; + // Total number of texts including empty texts + unsigned numberOfAllTexts; // Length of the longest text ulong maxTextLength; - - + // Array of document id's in the order of end-markers in BWT // Access by endmarkerDocId[rank_$(L, p) - 1]. BlockArray *endmarkerDocId; - // Array of text lengths (in the inserted order) - BlockArray *textLength; - // Array of text starting positions (in the inserted order) - BlockArray *textStartPos; - - // Private methods + + // FIXME Replace with a more succinct data structure + // Note: Empty texts are already handled inside XMLTree class. + std::set emptyTextId; + BSGAP *emptyTextRank; + + // FIXME A better solution? + std::string texts; + + // Following are not part of the public API uchar * BWT(uchar *); - uchar * LoadFromFile(const char *); - void SaveToFile(const char *, uchar *); void makewavelet(uchar *); void maketables(); - - // Following are not part of the public API - DocId DocIdAtTextPos(TextPosition) const; - unsigned CountEndmarkers(TextPosition, TextPosition) const; + DocId DocIdAtTextPos(BlockArray*, TextPosition) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - TextPosition Inverse(TextPosition) const; - TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; - TextPosition Psi(TextPosition) const; - uchar * Substring(TextPosition, TextPosition) const; +// TextPosition Inverse(TextPosition) const; +// TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; +// TextPosition Psi(TextPosition) const; +// uchar * Substring(TextPosition, TextPosition) const; +// TextPosition Lookup(TextPosition) const; + + /** + * Count end-markers in given interval + */ + inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const + { + if (sp > ep) + return 0; + + ulong ranksp = 0; + if (sp != 0) + ranksp = alphabetrank->rank(0, sp - 1); + + return alphabetrank->rank(0, ep) - ranksp; + } }; #endif