X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.h;h=ec6e890be2bffd93e39452af1bd4d33013ed46e0;hb=a03a69215d4bf9cee1d42ecbc9adcf057b2229dc;hp=1ecddc66723245554a450d77614345a52bf6630c;hpb=7d27a4450ed429e3b63e9d3ba7217a28cbbf9a31;p=SXSI%2FTextCollection.git diff --git a/CSA.h b/CSA.h index 1ecddc6..ec6e890 100644 --- a/CSA.h +++ b/CSA.h @@ -20,12 +20,28 @@ #ifndef _CSA_H_ #define _CSA_H_ - -#include -#include -#include "BitRank.h" #include "dynFMI.h" +#include "BitRank.h" #include "TextCollection.h" +#include "BlockArray.h" +#include "RLWaveletTree.h" +#include +#include + +// Include from XMLTree/libcds +#include +#include +#include +#include + +// Re-define word size to ulong: +#undef W +#if __WORDSIZE == 64 +# define W 64 +#else +# define W 32 +#endif + /** * Implementation of the TextCollection interface @@ -55,8 +71,9 @@ public: */ void InsertText(uchar const *); void MakeStatic(); + bool EmptyText(DocId) const; uchar* GetText(DocId) const; - uchar* GetText(DocId, TextPosition, TextPosition) const; +// uchar* GetText(DocId, TextPosition, TextPosition) const; bool IsPrefix(uchar const *) const; bool IsSuffix(uchar const *) const; @@ -80,9 +97,35 @@ public: // full_result is inherited from SXSI::TextCollection. full_result FullContains(uchar const *) const; - // TODO implement: + // Index from/to disk void Load(FILE *, unsigned); - void Save(FILE *); + void Save(FILE *) const; + + + // Debug FIXME Remove + void deleteWT() + { + delete alphabetrank; + alphabetrank = 0; + delete [] codetable; + codetable = 0; + } + void deleteSamples() + { + delete sampled; + sampled =0; + delete suffixes; + suffixes = 0; + delete positions; + positions = 0; + delete suffixDocId; + suffixDocId = 0; + } + void deleteEndmarker() + { + delete endmarkerDocId; + endmarkerDocId = 0; + } private: class TCodeEntry { @@ -105,10 +148,13 @@ private: bool leaf; public: THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned); + THuffAlphabetRank(FILE *); ~THuffAlphabetRank(); + + void Save(FILE *); bool Test(uchar *, TextPosition); - inline ulong rank(int c, TextPosition i) const { // returns the number of characters c before and including position i + inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i THuffAlphabetRank const * temp=this; if (codetable[c].count == 0) return 0; unsigned level = 0; @@ -146,7 +192,7 @@ private: } return true; } - inline int charAtPos(TextPosition i) const { + inline uchar access(TextPosition i) const { THuffAlphabetRank const * temp=this; while (!temp->leaf) { if (temp->bitrank->IsBitSet(i)) { @@ -158,7 +204,22 @@ private: temp = temp->left; } } - return (int)temp->ch; + return temp->ch; + } + + inline uchar charAtPos(ulong i, TextPosition *rank) const { + THuffAlphabetRank const * temp=this; + while (!temp->leaf) { + if (temp->bitrank->IsBitSet(i)) { + i = temp->bitrank->rank(i)-1; + temp = temp->right; + } else { + i = i-temp->bitrank->rank(i); + temp = temp->left; + } + } + (*rank)=i; + return temp->ch; } }; @@ -197,35 +258,70 @@ private: static const unsigned char print = 1; static const unsigned char report = 1; + static const uchar versionFlag; TextPosition n; unsigned samplerate; - ulong C[256]; + unsigned C[256]; TextPosition bwtEndPos; - THuffAlphabetRank *alphabetrank; - BitRank *sampled; - ulong *suffixes; - ulong *positions; +// THuffAlphabetRank *alphabetrank; + // RLWaveletTree *alphabetrank; + static_sequence * alphabetrank; + BSGAP *sampled; + BlockArray *suffixes; + BlockArray *suffixDocId; + BlockArray *positions; TCodeEntry *codetable; DynFMI *dynFMI; - // Map from end-marker in BWT to pair (textId, sampled text position) - std::map > endmarkers; - // Vector of pairs of - std::vector > textLength; - + + // Total number of texts in the collection + unsigned numberOfTexts; + // Total number of texts including empty texts + unsigned numberOfAllTexts; + // Length of the longest text + ulong maxTextLength; + + // Array of document id's in the order of end-markers in BWT + // Access by endmarkerDocId[rank_$(L, p) - 1]. + BlockArray *endmarkerDocId; + // Array of text lengths (in the inserted order) + BlockArray *textLength; + // Array of text starting positions (in the inserted order) + BlockArray *textStartPos; + + // FIXME Replace with a more succinct data structure + std::set emptyTextId; + BSGAP *emptyTextRank; + // Private methods uchar * BWT(uchar *); uchar * LoadFromFile(const char *); void SaveToFile(const char *, uchar *); + void makewavelet(uchar *); void maketables(); // Following are not part of the public API DocId DocIdAtTextPos(TextPosition) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - TextPosition Lookup(TextPosition) const; TextPosition Inverse(TextPosition) const; TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; TextPosition Psi(TextPosition) const; uchar * Substring(TextPosition, TextPosition) const; + TextPosition Lookup(TextPosition) const; + + /** + * Count end-markers in given interval + */ + inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const + { + if (sp > ep) + return 0; + + ulong ranksp = 0; + if (sp != 0) + ranksp = alphabetrank->rank(0, sp - 1); + + return alphabetrank->rank(0, ep) - ranksp; + } }; #endif