From 8c8e860bacea7245fb06b7b7e12ca073f45e1a42 Mon Sep 17 00:00:00 2001 From: nvalimak Date: Fri, 9 Jan 2009 13:34:54 +0000 Subject: [PATCH] Rewrote some data structures git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@47 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- CSA.h | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/CSA.h b/CSA.h index 00cf1c6..bd7b199 100644 --- a/CSA.h +++ b/CSA.h @@ -26,6 +26,7 @@ #include "BitRank.h" #include "dynFMI.h" #include "TextCollection.h" +#include "BlockArray.h" /** * Implementation of the TextCollection interface @@ -85,6 +86,9 @@ public: void Load(FILE *, unsigned); void Save(FILE *) const; + // debug + TextPosition Lookup(TextPosition) const; + private: class TCodeEntry { public: @@ -109,7 +113,7 @@ private: ~THuffAlphabetRank(); bool Test(uchar *, TextPosition); - inline ulong rank(int c, TextPosition i) const { // returns the number of characters c before and including position i + inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i THuffAlphabetRank const * temp=this; if (codetable[c].count == 0) return 0; unsigned level = 0; @@ -200,19 +204,29 @@ private: static const unsigned char report = 1; TextPosition n; unsigned samplerate; - ulong C[256]; + unsigned C[256]; TextPosition bwtEndPos; THuffAlphabetRank *alphabetrank; BitRank *sampled; - ulong *suffixes; - ulong *positions; + BlockArray *suffixes; + BlockArray *positions; TCodeEntry *codetable; DynFMI *dynFMI; - // Map from end-marker in BWT to pair (textId, sampled text position) - std::map > endmarkers; - // Vector of pairs of - std::vector > textLength; + + // Total number of texts in the collection + unsigned numberOfTexts; + // Length of the longest text + ulong maxTextLength; + + // Array of document id's in the order of end-markers in BWT + // Access by endmarkerDocId[rank_$(L, p) - 1]. + BlockArray *endmarkerDocId; + // Array of text lengths (in the inserted order) + BlockArray *textLength; + // Array of text starting positions (in the inserted order) + BlockArray *textStartPos; + // Private methods uchar * BWT(uchar *); uchar * LoadFromFile(const char *); @@ -222,8 +236,8 @@ private: // Following are not part of the public API DocId DocIdAtTextPos(TextPosition) const; + unsigned CountEndmarkers(TextPosition, TextPosition) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - TextPosition Lookup(TextPosition) const; TextPosition Inverse(TextPosition) const; TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; TextPosition Psi(TextPosition) const; -- 2.17.1