X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=FMIndexBuilder.cpp;fp=FMIndexBuilder.cpp;h=ea522240d31df52f0d3a1461010b27eca68e7035;hb=89dc22aee980ba16f757cd9a7f77478c2da50051;hp=0000000000000000000000000000000000000000;hpb=443151511a86083b21c1c06eb610f86b3aed35be;p=SXSI%2FTextCollection.git diff --git a/FMIndexBuilder.cpp b/FMIndexBuilder.cpp new file mode 100644 index 0000000..ea52224 --- /dev/null +++ b/FMIndexBuilder.cpp @@ -0,0 +1,146 @@ +#include "incbwt/rlcsa_builder.h" +#include "incbwt/bits/deltavector.h" + +#include "FMIndexBuilder.h" +#include "FMIndex.h" + +using std::string; + +namespace SXSI +{ + +struct TCBuilderRep +{ + unsigned samplerate; + CSA::RLCSABuilder * sa; + + ulong n; + // Total number of texts in the collection + unsigned numberOfTexts; + // Length of the longest text + ulong maxTextLength; + ulong numberOfSamples; + + CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index. + string niText; // Texts that are not indexed. + +#ifdef TCB_TEST_BWT + DynFMI *dynFMI; +#endif +}; + +/** + * Init text collection + * + */ +FMIndexBuilder::FMIndexBuilder(unsigned samplerate, ulong estimatedInputLength) + : p_(new struct TCBuilderRep()) +{ + p_->n = 0; + p_->samplerate = samplerate; + p_->numberOfTexts = 0; + p_->numberOfSamples = 0; + p_->maxTextLength = 0; + p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32 + p_->niText = ""; + + // Current params: 8 bytes, no samples, buffer size n/10 bytes. + // Buffer size is always at least 15MB: + if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH) + estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH; + p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10); + assert(p_->sa->isOk()); + +#ifdef TCB_TEST_BWT + uchar temp[256]; + for (unsigned i = 0; i < 255; ++i) + temp[i] = i+1; + temp[255] = 0; + p_->dynFMI = new DynFMI(temp, 1, 255, false); +#endif +} + +FMIndexBuilder::~FMIndexBuilder() +{ +#ifdef TCB_TEST_BWT + delete p_->dynFMI; +#endif + + delete p_->sa; + delete p_->notIndexed; + delete p_; +} + +void FMIndexBuilder::InsertText(uchar const * text, bool index) +{ + TextCollection::TextPosition m = std::strlen((char *)text) + 1; + if (m <= 1) + { + // FIXME indexing empty texts + std::cerr << "FMIndexBuilder::InsertText() error: can not index empty texts!" << std::endl; + exit(1); + } + + p_->numberOfTexts ++; + + if (index) + { + /** + * Insert text into the index + */ + p_->n += m; + p_->numberOfSamples += (m-1)/p_->samplerate; + + if (m > p_->maxTextLength) + p_->maxTextLength = m; // Store length of the longest text seen so far. + + p_->sa->insertSequence((char*)text, m-1, 0); + assert(p_->sa->isOk()); + } + else + { + /** + * Insert text only to TextStorage + */ + p_->notIndexed->setBit(p_->numberOfTexts - 1); + p_->niText.append((const char *)text, m); + } +} + + +TextCollection * FMIndexBuilder::InitTextCollection(char type) +{ + uchar * bwt = 0; + CSA::usint length = 0; + if (p_->numberOfTexts == 0) + { + p_->numberOfTexts ++; // Add one empty text + bwt = new uchar[2]; + bwt[0] = '\0'; + bwt[1] = '\0'; + length = 1; + p_->maxTextLength = 1; + } + else + { + bwt = (uchar *)p_->sa->getBWT(length); + delete p_->sa; + p_->sa = 0; + + assert(length == p_->n); + } + + p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's + CSA::DeltaVector deltav = CSA::DeltaVector(*p_->notIndexed, p_->numberOfTexts+1); + delete p_->notIndexed; + p_->notIndexed = 0; + + TextCollection *result = new FMIndex(bwt, (ulong)length, + p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples, + deltav, p_->niText, type); + + return result; +} + + +} // namespace SXSI