From: nvalimak Date: Tue, 19 May 2009 12:31:14 +0000 (+0000) Subject: Construction time&space fix X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;ds=sidebyside;h=e59bd57d53c2d9e5db4c22f2501a9789f62f46c1;p=SXSI%2FTextCollection.git Construction time&space fix git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@402 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/TCImplementation.cpp b/TCImplementation.cpp index c3b430a..2aec680 100644 --- a/TCImplementation.cpp +++ b/TCImplementation.cpp @@ -47,7 +47,8 @@ const uchar TCImplementation::versionFlag = 6; * Constructor inits an empty dynamic FM-index. * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. */ -TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_) +TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, + unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_) : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0) { @@ -55,7 +56,7 @@ TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerat bwt = 0; // Make sampling tables - maketables(); + maketables(numberOfSamples_); } bool TCImplementation::EmptyText(DocId k) const @@ -858,11 +859,6 @@ ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pa //first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const { - cout << "j = " << j << ", k = " << k << ", d:"; - for (unsigned i = 0; i < m+1; ++i) - cout << " " << d[i]; - cout << endl; - if (d[m]<=k) // range of suffixes with at most k-errors found { if (sp<=ep) @@ -1010,7 +1006,7 @@ void TCImplementation::makewavelet(uchar *bwt) #ifdef DEBUG_MEMUSAGE std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove + HeapProfiler::ResetMaxHeapConsumption(); #endif alphabet_mapper * am = new alphabet_mapper_none(); @@ -1026,7 +1022,7 @@ void TCImplementation::makewavelet(uchar *bwt) #endif } -void TCImplementation::maketables() +void TCImplementation::maketables(ulong sampleLength) { // Calculate BWT end-marker position (of last inserted text) { @@ -1042,13 +1038,20 @@ void TCImplementation::maketables() this->bwtEndPos = i; } +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage before BWT traverse: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + // Build up array for text starting positions - BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); - (*textStartPos)[0] = 0; +// BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); +// (*textStartPos)[0] = 0; // Mapping from end-markers to doc ID's: - uint *endmarkerDocId = new uint[numberOfTexts]; // FIXME Use BlockArray with static_sequence_wvtree_noptrs. - + unsigned logNumberOfTexts = Tools::CeilLog2(numberOfTexts); + uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1]; + + BlockArray* positions = new BlockArray(sampleLength, Tools::CeilLog2(this->n)); uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1]; for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++) sampledpositions[i] = 0; @@ -1056,38 +1059,42 @@ void TCImplementation::maketables() ulong x,p=bwtEndPos; ulong sampleCount = 0; // Keeping track of text position of prev. end-marker seen - ulong posOfSuccEndmarker = n; + ulong posOfSuccEndmarker = n-1; DocId textId = numberOfTexts; ulong ulongmax = 0; ulongmax--; uint alphabetrank_i_tmp =0; - /** - * First pass: populate tables textStartPos and sampledpositions. - */ + TextStorageBuilder tsbuilder(n); + Tools::StartTimer(); + for (ulong i=n-1;iGetPos(i) x=(i==n-1)?0:i+1; - if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) { + uchar c = alphabetrank->access(p, alphabetrank_i_tmp); + tsbuilder[i] = c; + + if ((posOfSuccEndmarker - i) % samplerate == 0 && c != '\0') + { set_field(sampledpositions,1,p,1); + (*positions)[sampleCount] = p; sampleCount ++; } - uchar c = alphabetrank->access(p, alphabetrank_i_tmp); if (c == '\0') { --textId; // Record the order of end-markers in BWT: ulong endmarkerRank = alphabetrank_i_tmp - 1; - endmarkerDocId[endmarkerRank] = (textId + 1) % numberOfTexts; + set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts); // Store text length and text start position: if (textId < (DocId)numberOfTexts - 1) { - (*textStartPos)[textId + 1] = x; // x is the position of end-marker. - posOfSuccEndmarker = x; +// (*textStartPos)[textId + 1] = x; // x-1 is text position of end-marker. + posOfSuccEndmarker = i; } // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis). @@ -1097,38 +1104,83 @@ void TCImplementation::maketables() p = C[c]+alphabetrank_i_tmp-1; } assert(textId == 0); - + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage before tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + + textStorage = tsbuilder.InitTextStorage(); + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + sampled = new static_bitsequence_rrr02(sampledpositions, n, 16); delete [] sampledpositions; - ulong sampleLength = sampled->rank1(n-1); assert(sampleCount == sampleLength); + assert(sampled->rank1(n-1) == sampleLength); + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after sampled bit vector: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif // Suffixes store an offset from the text start position suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); - p=bwtEndPos; - textId = numberOfTexts; +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif - TextStorageBuilder tsbuilder(n); + x = n - 2; + posOfSuccEndmarker = n-1; + for(ulong i=0; irank1((*positions)[i]); + + assert(j != 0); // if (j==0) j=sampleLength; + + TextPosition textPos = (x==n-1)?0:x+1; + (*suffixDocId)[j-1] = textStorage->DocIdAtTextPos(textPos); + + assert((*suffixDocId)[j-1] < numberOfTexts); + // calculate offset from text start: + (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]); + --x; + if (tsbuilder[x] == '\0') + posOfSuccEndmarker = x--; + } + + delete positions; /** - * Second pass: populate tables suffixes and suffixDocId. + * Second pass: check tables */ +/* p=bwtEndPos; + textId = numberOfTexts; for (ulong i=n-1;iaccess(p)) { ulong j = sampled->rank1(p)-1; - - (*suffixDocId)[j] = DocIdAtTextPos(textStartPos, x); + assert((*suffixDocId)[j] == DocIdAtTextPos(textStartPos, x)); // calculate offset from text start: - (*suffixes)[j] = x - (*textStartPos)[(*suffixDocId)[j]]; + assert((*suffixes)[j] == x - (*textStartPos)[(*suffixDocId)[j]]); } uchar c = alphabetrank->access(p, alphabetrank_i_tmp); - tsbuilder[i] = c; if (c == '\0') { @@ -1140,9 +1192,8 @@ void TCImplementation::maketables() p = C[c]+alphabetrank_i_tmp-1; } assert(textId == 0); - delete textStartPos; - - textStorage = tsbuilder.InitTextStorage(); + delete textStartPos +*/ #ifdef DEBUG_MEMUSAGE std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; @@ -1150,8 +1201,8 @@ void TCImplementation::maketables() #endif alphabet_mapper * am = new alphabet_mapper_none(); - static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? - Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, bmb, am, true); + static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); // FIXME samplerate? + Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, logNumberOfTexts, bmb, am, true); delete bmb; // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs! diff --git a/TCImplementation.h b/TCImplementation.h index d9ac063..65bf22c 100644 --- a/TCImplementation.h +++ b/TCImplementation.h @@ -53,7 +53,7 @@ namespace SXSI */ class TCImplementation : public SXSI::TextCollection { public: - TCImplementation(uchar *, ulong, unsigned, unsigned, ulong); + TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong); ~TCImplementation(); bool EmptyText(DocId) const; @@ -165,7 +165,7 @@ private: // Following methods are not part of the public API uchar * BWT(uchar *); void makewavelet(uchar *); - void maketables(); + void maketables(ulong); DocId DocIdAtTextPos(BlockArray*, TextPosition) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;