From bcbed10c547780b6e5b2028d936eae337ecebac5 Mon Sep 17 00:00:00 2001 From: nvalimak Date: Fri, 8 May 2009 12:03:31 +0000 Subject: [PATCH] Added TextStorage class git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@375 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- TCImplementation.cpp | 21 ++++++++++++++++----- TCImplementation.h | 39 +++++++++++++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/TCImplementation.cpp b/TCImplementation.cpp index a8303d8..5959e6a 100644 --- a/TCImplementation.cpp +++ b/TCImplementation.cpp @@ -19,6 +19,7 @@ *****************************************************************************/ #include "TCImplementation.h" +//#define DEBUG_MEMUSAGE #ifdef DEBUG_MEMUSAGE #include "HeapProfiler.h" // FIXME remove #endif @@ -40,7 +41,7 @@ namespace SXSI { // Save file version info -const uchar TCImplementation::versionFlag = 4; +const uchar TCImplementation::versionFlag = 6; /** * Constructor inits an empty dynamic FM-index. 
@@ -63,10 +64,12 @@ bool TCImplementation::EmptyText(DocId k) const return false; // Empty texts are not indexed } -uchar* TCImplementation::GetText(DocId k) const +uchar * TCImplementation::GetText(DocId k) const { assert(k < (DocId)numberOfTexts); - TextPosition i = k; + + return textStorage->GetText(k); +/* TextPosition i = k; string result; // Reserve average string length to avoid reallocs @@ -87,7 +90,7 @@ uchar* TCImplementation::GetText(DocId k) const res[i] = '\0'; for (ulong j = 0; j < i; ++j) res[i-j-1] = result[j]; - return res; + return res;*/ } /* * Not supported @@ -769,6 +772,7 @@ void TCImplementation::Save(FILE *file) const throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength)."); Doc->save(file); + textStorage->Save(file); fflush(file); } @@ -815,6 +819,7 @@ TCImplementation::TCImplementation(FILE *file, unsigned samplerate_) throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength)."); Doc = static_sequence::load(file); + textStorage = new TextStorage(file); // FIXME Construct data structures with new samplerate //maketables(); @@ -906,6 +911,7 @@ TCImplementation::~TCImplementation() { delete suffixes; delete suffixDocId; delete Doc; + delete textStorage; } void TCImplementation::makewavelet(uchar *bwt) @@ -1036,6 +1042,8 @@ void TCImplementation::maketables() p=bwtEndPos; textId = numberOfTexts; + TextStorageBuilder tsbuilder(n); + /** * Second pass: populate tables suffixes and suffixDocId. 
*/ @@ -1052,6 +1060,8 @@ void TCImplementation::maketables() } uchar c = alphabetrank->access(p, alphabetrank_i_tmp); + tsbuilder[i] = c; + if (c == '\0') { --textId; @@ -1062,9 +1072,10 @@ void TCImplementation::maketables() p = C[c]+alphabetrank_i_tmp-1; } assert(textId == 0); - delete textStartPos; + textStorage = tsbuilder.InitTextStorage(); + #ifdef DEBUG_MEMUSAGE std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; HeapProfiler::ResetMaxHeapConsumption(); diff --git a/TCImplementation.h b/TCImplementation.h index 5430f21..0439978 100644 --- a/TCImplementation.h +++ b/TCImplementation.h @@ -40,6 +40,9 @@ #endif #undef bitset +#include "TextStorage.h" + + namespace SXSI { @@ -54,11 +57,32 @@ public: ~TCImplementation(); bool EmptyText(DocId) const; - uchar* GetText(DocId) const; + + /** + * Returns a pointer to the original text. + * + * Do *not* try to free the array. + * (However, this implementation is subject to change.) + * + * See TextStorage.h for details. + */ + uchar * GetText(DocId) const; + /** - * Next method is not supported: - * Supporting GetText for some substring [i,j] - * would require more space. + * Returns a pointer to the beginning of texts i, i+1, ..., j. + * Texts are separated by a '\0' byte. + * + * Do *not* try to free the array. + * (However, this implementation is subject to change.) + * + * See TextStorage.h for details. + */ + uchar * GetText(DocId i, DocId j) const; + + /** + * Returns a substring of given text ID. + * + * FIXME This may be reimplemented via TextStorage. 
*/ // uchar* GetText(DocId, TextPosition, TextPosition) const; @@ -117,7 +141,7 @@ private: static_sequence * alphabetrank; // Sample structures for texts longer than samplerate - static_bitsequence * sampled; // FIXME Replace with RRR02 + static_bitsequence * sampled; BlockArray *suffixes; BlockArray *suffixDocId; @@ -129,7 +153,10 @@ private: // Array of document id's in the order of end-markers in BWT static_sequence *Doc; - // Following are not part of the public API + // Text storage for fast extraction + TextStorage * textStorage; + + // Following methods are not part of the public API uchar * BWT(uchar *); void makewavelet(uchar *); void maketables(); -- 2.17.1