X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.h;h=bd7b199507476130e1933126e14af006ba74f4ae;hb=dee0161e4f31f06e9389db9986766395c1b1d2b8;hp=1ecddc66723245554a450d77614345a52bf6630c;hpb=7d27a4450ed429e3b63e9d3ba7217a28cbbf9a31;p=SXSI%2FTextCollection.git diff --git a/CSA.h b/CSA.h index 1ecddc6..bd7b199 100644 --- a/CSA.h +++ b/CSA.h @@ -26,6 +26,7 @@ #include "BitRank.h" #include "dynFMI.h" #include "TextCollection.h" +#include "BlockArray.h" /** * Implementation of the TextCollection interface @@ -55,6 +56,7 @@ public: */ void InsertText(uchar const *); void MakeStatic(); + bool EmptyText(DocId) const; uchar* GetText(DocId) const; uchar* GetText(DocId, TextPosition, TextPosition) const; @@ -80,9 +82,12 @@ public: // full_result is inherited from SXSI::TextCollection. full_result FullContains(uchar const *) const; - // TODO implement: + // Index from/to disk void Load(FILE *, unsigned); - void Save(FILE *); + void Save(FILE *) const; + + // debug + TextPosition Lookup(TextPosition) const; private: class TCodeEntry { @@ -108,7 +113,7 @@ private: ~THuffAlphabetRank(); bool Test(uchar *, TextPosition); - inline ulong rank(int c, TextPosition i) const { // returns the number of characters c before and including position i + inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i THuffAlphabetRank const * temp=this; if (codetable[c].count == 0) return 0; unsigned level = 0; @@ -199,29 +204,40 @@ private: static const unsigned char report = 1; TextPosition n; unsigned samplerate; - ulong C[256]; + unsigned C[256]; TextPosition bwtEndPos; THuffAlphabetRank *alphabetrank; BitRank *sampled; - ulong *suffixes; - ulong *positions; + BlockArray *suffixes; + BlockArray *positions; TCodeEntry *codetable; DynFMI *dynFMI; - // Map from end-marker in BWT to pair (textId, sampled text position) - std::map > endmarkers; - // Vector of pairs of - std::vector > textLength; + + // Total number of texts in the collection + unsigned numberOfTexts; + // Length of the longest text + ulong maxTextLength; + + // Array of document id's in the order of end-markers in BWT + // Access by endmarkerDocId[rank_$(L, p) - 1]. + BlockArray *endmarkerDocId; + // Array of text lengths (in the inserted order) + BlockArray *textLength; + // Array of text starting positions (in the inserted order) + BlockArray *textStartPos; + // Private methods uchar * BWT(uchar *); uchar * LoadFromFile(const char *); void SaveToFile(const char *, uchar *); + void makewavelet(uchar *); void maketables(); // Following are not part of the public API DocId DocIdAtTextPos(TextPosition) const; + unsigned CountEndmarkers(TextPosition, TextPosition) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - TextPosition Lookup(TextPosition) const; TextPosition Inverse(TextPosition) const; TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; TextPosition Psi(TextPosition) const;