X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TextCollectionBuilder.cpp;h=552abb17eb7ab758bf177f45b7ebf467f83c5ca4;hb=1c0ccf5923b31eceae88afc64f1b4727cd488643;hp=ff10f7b7fc44d03e8bfd59d8f6d4cadfccfd1a38;hpb=0b35fca408fd60a0f4dc82c1e26c06f05b1661f6;p=SXSI%2FTextCollection.git diff --git a/TextCollectionBuilder.cpp b/TextCollectionBuilder.cpp index ff10f7b..552abb1 100644 --- a/TextCollectionBuilder.cpp +++ b/TextCollectionBuilder.cpp @@ -1,15 +1,11 @@ #include "incbwt/rlcsa_builder.h" -#include "TextCollectionBuilder.h" - -// Un-comment next line to run a comparison of resulting BWT -//#define TCB_TEST_BWT - -#ifdef TCB_TEST_BWT -#include "dynFMI.h" -#endif +#include "incbwt/bits/deltavector.h" +#include "TextCollectionBuilder.h" #include "TCImplementation.h" +using std::string; + namespace SXSI { @@ -25,6 +21,9 @@ struct TCBuilderRep ulong maxTextLength; ulong numberOfSamples; + CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index. + string niText; // Texts that are not indexed. + #ifdef TCB_TEST_BWT DynFMI *dynFMI; #endif @@ -33,18 +32,23 @@ struct TCBuilderRep /** * Init text collection * - * See CSA.h for more details. */ -TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate) +TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate, ulong estimatedInputLength) : p_(new struct TCBuilderRep()) { p_->n = 0; p_->samplerate = samplerate; p_->numberOfTexts = 0; p_->numberOfSamples = 0; + p_->maxTextLength = 0; + p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32 + p_->niText = ""; - // Current params: 8 bytes, 15 MB, no samples - p_->sa = new CSA::RLCSABuilder(8, 0, 15 * 1024 * 1024); + // Current params: 8 bytes, no samples, buffer size n/10 bytes. + // Buffer size is always at least 15MB: + if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH) + estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH; + p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10); assert(p_->sa->isOk()); #ifdef TCB_TEST_BWT @@ -63,37 +67,48 @@ TextCollectionBuilder::~TextCollectionBuilder() #endif delete p_->sa; + delete p_->notIndexed; delete p_; } -void TextCollectionBuilder::InsertText(uchar const * text) +void TextCollectionBuilder::InsertText(uchar const * text, bool index) { TextCollection::TextPosition m = std::strlen((char *)text) + 1; - if (m > p_->maxTextLength) - p_->maxTextLength = m; // Store length of the longest text seen so far. - - if (m > 1) + if (m <= 1) + { + // FIXME indexing empty texts + std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl; + exit(1); + } + + p_->numberOfTexts ++; + + if (index) { + /** + * Insert text into the index + */ p_->n += m; - p_->numberOfTexts ++; p_->numberOfSamples += (m-1)/p_->samplerate; + + if (m > p_->maxTextLength) + p_->maxTextLength = m; // Store length of the longest text seen so far. -#ifdef TCB_TEST_BWT - p_->dynFMI->addText(text, m); -#endif p_->sa->insertSequence((char*)text, m-1, 0); assert(p_->sa->isOk()); } else { - // FIXME indexing empty texts - std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl; - exit(1); + /** + * Insert text only to TextStorage + */ + p_->notIndexed->setBit(p_->numberOfTexts - 1); + p_->niText.append((const char *)text, m); } } -TextCollection * TextCollectionBuilder::InitTextCollection() +TextCollection * TextCollectionBuilder::InitTextCollection(char type) { uchar * bwt = 0; CSA::usint length = 0; @@ -113,43 +128,17 @@ TextCollection * TextCollectionBuilder::InitTextCollection() p_->sa = 0; assert(length == p_->n); - -#ifdef TCB_TEST_BWT - { - uchar *bwtTest = p_->dynFMI->getBWT(); - printf("123456789012345678901234567890123456789\n"); - for (ulong i = 0; i < p_->n && i < 100; i ++) - if (bwt[i] < 50) - printf("%d", (int)bwt[i]); - else - printf("%c", bwt[i]); - printf("\n"); - for (ulong i = 0; i < p_->n && i < 100; i ++) - if (bwtTest[i] < 50) - printf("%d", (int)bwtTest[i]); - else - printf("%c", bwtTest[i]); - printf("\n"); - - // Sanity check - assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize()); - - delete p_->dynFMI; - p_->dynFMI = 0; - for (ulong i = 0; i < p_->n; ++i) - if (bwt[i] != bwtTest[i]) - { - std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " - << (unsigned)bwtTest[i] << std::endl; - assert(0); - } - delete [] bwtTest; - } -#endif // TCB_TEST_BWT } + p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's + CSA::DeltaVector deltav = CSA::DeltaVector(*p_->notIndexed, p_->numberOfTexts+1); + delete p_->notIndexed; + p_->notIndexed = 0; + TextCollection *result = new TCImplementation(bwt, (ulong)length, - p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples); + p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples, + deltav, p_->niText, type); + return result; }