X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TextCollectionBuilder.cpp;h=467bf949677e7ae05d6ee279828b2fb0853fbdd4;hb=349abfa386a1ad59eddd4ec98024e1d1a92ba139;hp=67657bf4927e957c07d78f0709f0fb11f0b231f4;hpb=40ddf9aca842bdc081b6350a4ebfe36b066c94c9;p=SXSI%2FTextCollection.git diff --git a/TextCollectionBuilder.cpp b/TextCollectionBuilder.cpp index 67657bf..467bf94 100644 --- a/TextCollectionBuilder.cpp +++ b/TextCollectionBuilder.cpp @@ -23,6 +23,7 @@ struct TCBuilderRep unsigned numberOfTexts; // Length of the longest text ulong maxTextLength; + ulong numberOfSamples; #ifdef TCB_TEST_BWT DynFMI *dynFMI; @@ -32,17 +33,20 @@ struct TCBuilderRep /** * Init text collection * - * See CSA.h for more details. */ -TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate) +TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate, ulong estimatedInputLength) : p_(new struct TCBuilderRep()) { p_->n = 0; p_->samplerate = samplerate; p_->numberOfTexts = 0; + p_->numberOfSamples = 0; - // Current params: 8 bytes, 15 MB, no samples - p_->sa = new CSA::RLCSABuilder(8, 0, 15 * 1024 * 1024); + // Current params: 8 bytes, no samples, buffer size n/10 bytes. + // Buffer size is always at least 15MB: + if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH) + estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH; + p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10); assert(p_->sa->isOk()); #ifdef TCB_TEST_BWT @@ -74,11 +78,11 @@ void TextCollectionBuilder::InsertText(uchar const * text) { p_->n += m; p_->numberOfTexts ++; + p_->numberOfSamples += (m-1)/p_->samplerate; #ifdef TCB_TEST_BWT p_->dynFMI->addText(text, m); #endif - p_->sa->insertSequence((char*)text, m-1, 0); assert(p_->sa->isOk()); } @@ -91,7 +95,7 @@ void TextCollectionBuilder::InsertText(uchar const * text) } -TextCollection * TextCollectionBuilder::InitTextCollection() +TextCollection * TextCollectionBuilder::InitTextCollection(char type) { uchar * bwt = 0; CSA::usint length = 0; @@ -137,7 +141,8 @@ TextCollection * TextCollectionBuilder::InitTextCollection() for (ulong i = 0; i < p_->n; ++i) if (bwt[i] != bwtTest[i]) { - std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl; + std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " + << (unsigned)bwtTest[i] << std::endl; assert(0); } delete [] bwtTest; @@ -145,7 +150,8 @@ TextCollection * TextCollectionBuilder::InitTextCollection() #endif // TCB_TEST_BWT } - TextCollection *result = new TCImplementation(bwt, (ulong)length, p_->samplerate, p_->numberOfTexts, p_->maxTextLength); + TextCollection *result = new TCImplementation(bwt, (ulong)length, + p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples, type); return result; }