X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TCImplementation.cpp;h=a8303d8be74b7251ba2171dda41cda9f27c4499a;hb=b472b8a9abdb1695994b4b23963de98c361dc66b;hp=211849e312f177c5acfa15387e0daf30b57c237d;hpb=2ff99d618e7f031bcfd95895c982e0958ae58d8c;p=SXSI%2FTextCollection.git diff --git a/TCImplementation.cpp b/TCImplementation.cpp index 211849e..a8303d8 100644 --- a/TCImplementation.cpp +++ b/TCImplementation.cpp @@ -968,19 +968,12 @@ void TCImplementation::maketables() this->bwtEndPos = i; } - // Build up arrays for text length and starting positions - // FIXME Temp, remove + // Build up array for text starting positions BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); (*textStartPos)[0] = 0; - // Construct samples - ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1; - unsigned ceilLog2n = Tools::CeilLog2(n); - BlockArray* positions = new BlockArray(sampleLength, ceilLog2n); - BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n); - // Mapping from end-markers to doc ID's: - BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts)); + uint *endmarkerDocId = new uint[numberOfTexts]; // FIXME Use BlockArray with static_sequence_wvtree_noptrs. uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1]; for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++) @@ -995,14 +988,15 @@ void TCImplementation::maketables() ulongmax--; uint alphabetrank_i_tmp =0; + /** + * First pass: populate tables textStartPos and sampledpositions. + */ for (ulong i=n-1;iGetPos(i) x=(i==n-1)?0:i+1; if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) { set_field(sampledpositions,1,p,1); - (*positions)[sampleCount] = p; - (*tmpSuffix)[sampleCount] = x; // FIXME remove sampleCount ++; } @@ -1013,7 +1007,7 @@ void TCImplementation::maketables() // Record the order of end-markers in BWT: ulong endmarkerRank = alphabetrank_i_tmp - 1; - (*endmarkerDocId)[endmarkerRank] = textId; + endmarkerDocId[endmarkerRank] = (textId + 1) % numberOfTexts; // Store text length and text start position: if (textId < (DocId)numberOfTexts - 1) @@ -1032,29 +1026,43 @@ void TCImplementation::maketables() sampled = new static_bitsequence_rrr02(sampledpositions, n, 16); delete [] sampledpositions; - sampleLength = sampled->rank1(n-1); + ulong sampleLength = sampled->rank1(n-1); assert(sampleCount == sampleLength); // Suffixes store an offset from the text start position suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); - for(ulong i=0; irank1((*positions)[i]); + p=bwtEndPos; + textId = numberOfTexts; + + /** + * Second pass: populate tables suffixes and suffixDocId. + */ + for (ulong i=n-1;iaccess(p)) { + ulong j = sampled->rank1(p)-1; - if (j==0) j=sampleLength; - TextPosition textPos = (*tmpSuffix)[i]; - (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos); + (*suffixDocId)[j] = DocIdAtTextPos(textStartPos, x); - assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts); - assert((*suffixDocId)[j-1] < numberOfTexts); - // calculate offset from text start: - (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]]; + // calculate offset from text start: + (*suffixes)[j] = x - (*textStartPos)[(*suffixDocId)[j]]; + } + + uchar c = alphabetrank->access(p, alphabetrank_i_tmp); + if (c == '\0') + { + --textId; + // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis). + p = textId; // Correct LF-mapping to the last char of the previous text. + } + else // Now c != '\0', do LF-mapping: + p = C[c]+alphabetrank_i_tmp-1; } - // FIXME Temp, remove - delete tmpSuffix; - delete positions; + assert(textId == 0); + delete textStartPos; #ifdef DEBUG_MEMUSAGE @@ -1062,31 +1070,15 @@ void TCImplementation::maketables() HeapProfiler::ResetMaxHeapConsumption(); #endif - uint *tmp = new uint[numberOfTexts]; // FIXME Silly... - for (unsigned i = 0; i < numberOfTexts; ++i) - { - tmp[i] = ((*endmarkerDocId)[i] + 1) % numberOfTexts; -// cout << tmp[i] << ","; - } -// cout << endl; - delete endmarkerDocId; - alphabet_mapper * am = new alphabet_mapper_none(); static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? - Doc = new static_sequence_wvtree_noptrs(tmp, numberOfTexts, bmb, am); + Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, bmb, am, true); delete bmb; - // delete [] tmp; // already deleted! + // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs! #ifdef DEBUG_MEMUSAGE std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; #endif - - /*document_result res = Doc->access(3, 3, 0, 3); - cout << "Access result: "; - for (document_result::iterator it = res.begin(); it != res.end(); ++it) - cout << *it << ", "; - cout << endl; - exit(0);*/ }