X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TextCollectionBuilder.cpp;h=552abb17eb7ab758bf177f45b7ebf467f83c5ca4;hb=1c0ccf5923b31eceae88afc64f1b4727cd488643;hp=67657bf4927e957c07d78f0709f0fb11f0b231f4;hpb=40ddf9aca842bdc081b6350a4ebfe36b066c94c9;p=SXSI%2FTextCollection.git diff --git a/TextCollectionBuilder.cpp b/TextCollectionBuilder.cpp index 67657bf..552abb1 100644 --- a/TextCollectionBuilder.cpp +++ b/TextCollectionBuilder.cpp @@ -1,15 +1,11 @@ #include "incbwt/rlcsa_builder.h" -#include "TextCollectionBuilder.h" - -// Un-comment next line to run a comparison of resulting BWT -//#define TCB_TEST_BWT - -#ifdef TCB_TEST_BWT -#include "dynFMI.h" -#endif +#include "incbwt/bits/deltavector.h" +#include "TextCollectionBuilder.h" #include "TCImplementation.h" +using std::string; + namespace SXSI { @@ -23,6 +19,10 @@ struct TCBuilderRep unsigned numberOfTexts; // Length of the longest text ulong maxTextLength; + ulong numberOfSamples; + + CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index. + string niText; // Texts that are not indexed. #ifdef TCB_TEST_BWT DynFMI *dynFMI; @@ -32,17 +32,23 @@ struct TCBuilderRep /** * Init text collection * - * See CSA.h for more details. */ -TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate) +TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate, ulong estimatedInputLength) : p_(new struct TCBuilderRep()) { p_->n = 0; p_->samplerate = samplerate; p_->numberOfTexts = 0; + p_->numberOfSamples = 0; + p_->maxTextLength = 0; + p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32 + p_->niText = ""; - // Current params: 8 bytes, 15 MB, no samples - p_->sa = new CSA::RLCSABuilder(8, 0, 15 * 1024 * 1024); + // Current params: 8 bytes, no samples, buffer size n/10 bytes. + // Buffer size is always at least 15MB: + if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH) + estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH; + p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10); assert(p_->sa->isOk()); #ifdef TCB_TEST_BWT @@ -61,37 +67,48 @@ TextCollectionBuilder::~TextCollectionBuilder() #endif delete p_->sa; + delete p_->notIndexed; delete p_; } -void TextCollectionBuilder::InsertText(uchar const * text) +void TextCollectionBuilder::InsertText(uchar const * text, bool index) { TextCollection::TextPosition m = std::strlen((char *)text) + 1; - if (m > p_->maxTextLength) - p_->maxTextLength = m; // Store length of the longest text seen so far. - - if (m > 1) + if (m <= 1) { + // FIXME indexing empty texts + std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl; + exit(1); + } + + p_->numberOfTexts ++; + + if (index) + { + /** + * Insert text into the index + */ p_->n += m; - p_->numberOfTexts ++; - -#ifdef TCB_TEST_BWT - p_->dynFMI->addText(text, m); -#endif + p_->numberOfSamples += (m-1)/p_->samplerate; + + if (m > p_->maxTextLength) + p_->maxTextLength = m; // Store length of the longest text seen so far. p_->sa->insertSequence((char*)text, m-1, 0); assert(p_->sa->isOk()); } else { - // FIXME indexing empty texts - std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl; - exit(1); + /** + * Insert text only to TextStorage + */ + p_->notIndexed->setBit(p_->numberOfTexts - 1); + p_->niText.append((const char *)text, m); } } -TextCollection * TextCollectionBuilder::InitTextCollection() +TextCollection * TextCollectionBuilder::InitTextCollection(char type) { uchar * bwt = 0; CSA::usint length = 0; @@ -111,41 +128,17 @@ TextCollection * TextCollectionBuilder::InitTextCollection() p_->sa = 0; assert(length == p_->n); - -#ifdef TCB_TEST_BWT - { - uchar *bwtTest = p_->dynFMI->getBWT(); - printf("123456789012345678901234567890123456789\n"); - for (ulong i = 0; i < p_->n && i < 100; i ++) - if (bwt[i] < 50) - printf("%d", (int)bwt[i]); - else - printf("%c", bwt[i]); - printf("\n"); - for (ulong i = 0; i < p_->n && i < 100; i ++) - if (bwtTest[i] < 50) - printf("%d", (int)bwtTest[i]); - else - printf("%c", bwtTest[i]); - printf("\n"); - - // Sanity check - assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize()); - - delete p_->dynFMI; - p_->dynFMI = 0; - for (ulong i = 0; i < p_->n; ++i) - if (bwt[i] != bwtTest[i]) - { - std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl; - assert(0); - } - delete [] bwtTest; - } -#endif // TCB_TEST_BWT } - TextCollection *result = new TCImplementation(bwt, (ulong)length, p_->samplerate, p_->numberOfTexts, p_->maxTextLength); + p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's + CSA::DeltaVector deltav = CSA::DeltaVector(*p_->notIndexed, p_->numberOfTexts+1); + delete p_->notIndexed; + p_->notIndexed = 0; + + TextCollection *result = new TCImplementation(bwt, (ulong)length, + p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples, + deltav, p_->niText, type); + return result; }