X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TCImplementation.cpp;h=5959e6af991c9a7d3e5d803e77c20e9ee8f7bd85;hb=131f6c58241d797b0c8aaf1e691eb10cd19467aa;hp=211849e312f177c5acfa15387e0daf30b57c237d;hpb=2ff99d618e7f031bcfd95895c982e0958ae58d8c;p=SXSI%2FTextCollection.git diff --git a/TCImplementation.cpp b/TCImplementation.cpp index 211849e..5959e6a 100644 --- a/TCImplementation.cpp +++ b/TCImplementation.cpp @@ -19,6 +19,7 @@ *****************************************************************************/ #include "TCImplementation.h" +//#define DEBUG_MEMUSAGE #ifdef DEBUG_MEMUSAGE #include "HeapProfiler.h" // FIXME remove #endif @@ -40,7 +41,7 @@ namespace SXSI { // Save file version info -const uchar TCImplementation::versionFlag = 4; +const uchar TCImplementation::versionFlag = 6; /** * Constructor inits an empty dynamic FM-index. @@ -63,10 +64,12 @@ bool TCImplementation::EmptyText(DocId k) const return false; // Empty texts are not indexed } -uchar* TCImplementation::GetText(DocId k) const +uchar * TCImplementation::GetText(DocId k) const { assert(k < (DocId)numberOfTexts); - TextPosition i = k; + + return textStorage->GetText(k); +/* TextPosition i = k; string result; // Reserve average string length to avoid reallocs @@ -87,7 +90,7 @@ uchar* TCImplementation::GetText(DocId k) const res[i] = '\0'; for (ulong j = 0; j < i; ++j) res[i-j-1] = result[j]; - return res; + return res;*/ } /* * Not supported @@ -769,6 +772,7 @@ void TCImplementation::Save(FILE *file) const throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength)."); Doc->save(file); + textStorage->Save(file); fflush(file); } @@ -815,6 +819,7 @@ TCImplementation::TCImplementation(FILE *file, unsigned samplerate_) throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength)."); Doc = static_sequence::load(file); + textStorage = new TextStorage(file); // FIXME Construct data structures with new samplerate //maketables(); @@ -906,6 +911,7 @@ TCImplementation::~TCImplementation() { delete suffixes; delete suffixDocId; delete Doc; + delete textStorage; } void TCImplementation::makewavelet(uchar *bwt) @@ -968,19 +974,12 @@ void TCImplementation::maketables() this->bwtEndPos = i; } - // Build up arrays for text length and starting positions - // FIXME Temp, remove + // Build up array for text starting positions BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); (*textStartPos)[0] = 0; - // Construct samples - ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1; - unsigned ceilLog2n = Tools::CeilLog2(n); - BlockArray* positions = new BlockArray(sampleLength, ceilLog2n); - BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n); - // Mapping from end-markers to doc ID's: - BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts)); + uint *endmarkerDocId = new uint[numberOfTexts]; // FIXME Use BlockArray with static_sequence_wvtree_noptrs. uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1]; for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++) @@ -995,14 +994,15 @@ void TCImplementation::maketables() ulongmax--; uint alphabetrank_i_tmp =0; + /** + * First pass: populate tables textStartPos and sampledpositions. + */ for (ulong i=n-1;iGetPos(i) x=(i==n-1)?0:i+1; if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) { set_field(sampledpositions,1,p,1); - (*positions)[sampleCount] = p; - (*tmpSuffix)[sampleCount] = x; // FIXME remove sampleCount ++; } @@ -1013,7 +1013,7 @@ void TCImplementation::maketables() // Record the order of end-markers in BWT: ulong endmarkerRank = alphabetrank_i_tmp - 1; - (*endmarkerDocId)[endmarkerRank] = textId; + endmarkerDocId[endmarkerRank] = (textId + 1) % numberOfTexts; // Store text length and text start position: if (textId < (DocId)numberOfTexts - 1) @@ -1032,61 +1032,64 @@ void TCImplementation::maketables() sampled = new static_bitsequence_rrr02(sampledpositions, n, 16); delete [] sampledpositions; - sampleLength = sampled->rank1(n-1); + ulong sampleLength = sampled->rank1(n-1); assert(sampleCount == sampleLength); // Suffixes store an offset from the text start position suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); - for(ulong i=0; irank1((*positions)[i]); + p=bwtEndPos; + textId = numberOfTexts; + + TextStorageBuilder tsbuilder(n); + + /** + * Second pass: populate tables suffixes and suffixDocId. + */ + for (ulong i=n-1;iaccess(p)) { + ulong j = sampled->rank1(p)-1; - if (j==0) j=sampleLength; - TextPosition textPos = (*tmpSuffix)[i]; - (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos); + (*suffixDocId)[j] = DocIdAtTextPos(textStartPos, x); - assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts); - assert((*suffixDocId)[j-1] < numberOfTexts); - // calculate offset from text start: - (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]]; + // calculate offset from text start: + (*suffixes)[j] = x - (*textStartPos)[(*suffixDocId)[j]]; + } + + uchar c = alphabetrank->access(p, alphabetrank_i_tmp); + tsbuilder[i] = c; + + if (c == '\0') + { + --textId; + // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis). + p = textId; // Correct LF-mapping to the last char of the previous text. + } + else // Now c != '\0', do LF-mapping: + p = C[c]+alphabetrank_i_tmp-1; } - // FIXME Temp, remove - delete tmpSuffix; - delete positions; + assert(textId == 0); delete textStartPos; + textStorage = tsbuilder.InitTextStorage(); + #ifdef DEBUG_MEMUSAGE std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; HeapProfiler::ResetMaxHeapConsumption(); #endif - uint *tmp = new uint[numberOfTexts]; // FIXME Silly... - for (unsigned i = 0; i < numberOfTexts; ++i) - { - tmp[i] = ((*endmarkerDocId)[i] + 1) % numberOfTexts; -// cout << tmp[i] << ","; - } -// cout << endl; - delete endmarkerDocId; - alphabet_mapper * am = new alphabet_mapper_none(); static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? - Doc = new static_sequence_wvtree_noptrs(tmp, numberOfTexts, bmb, am); + Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, bmb, am, true); delete bmb; - // delete [] tmp; // already deleted! + // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs! #ifdef DEBUG_MEMUSAGE std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; #endif - - /*document_result res = Doc->access(3, 3, 0, 3); - cout << "Access result: "; - for (document_result::iterator it = res.begin(); it != res.end(); ++it) - cout << *it << ", "; - cout << endl; - exit(0);*/ }