X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;ds=sidebyside;f=TextCollectionBuilder.cpp;fp=TextCollectionBuilder.cpp;h=67657bf4927e957c07d78f0709f0fb11f0b231f4;hb=40ddf9aca842bdc081b6350a4ebfe36b066c94c9;hp=0000000000000000000000000000000000000000;hpb=af8938dbee21244687184dd0502a84ce1af70c50;p=SXSI%2FTextCollection.git diff --git a/TextCollectionBuilder.cpp b/TextCollectionBuilder.cpp new file mode 100644 index 0000000..67657bf --- /dev/null +++ b/TextCollectionBuilder.cpp @@ -0,0 +1,153 @@ +#include "incbwt/rlcsa_builder.h" +#include "TextCollectionBuilder.h" + +// Un-comment next line to run a comparison of resulting BWT +//#define TCB_TEST_BWT + +#ifdef TCB_TEST_BWT +#include "dynFMI.h" +#endif + +#include "TCImplementation.h" + +namespace SXSI +{ + +struct TCBuilderRep +{ + unsigned samplerate; + CSA::RLCSABuilder * sa; + + ulong n; + // Total number of texts in the collection + unsigned numberOfTexts; + // Length of the longest text + ulong maxTextLength; + +#ifdef TCB_TEST_BWT + DynFMI *dynFMI; +#endif +}; + +/** + * Init text collection + * + * See CSA.h for more details. + */ +TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate) + : p_(new struct TCBuilderRep()) +{ + p_->n = 0; + p_->samplerate = samplerate; + p_->numberOfTexts = 0; + + // Current params: 8 bytes, 15 MB, no samples + p_->sa = new CSA::RLCSABuilder(8, 0, 15 * 1024 * 1024); + assert(p_->sa->isOk()); + +#ifdef TCB_TEST_BWT + uchar temp[256]; + for (unsigned i = 0; i < 255; ++i) + temp[i] = i+1; + temp[255] = 0; + p_->dynFMI = new DynFMI(temp, 1, 255, false); +#endif +} + +TextCollectionBuilder::~TextCollectionBuilder() +{ +#ifdef TCB_TEST_BWT + delete p_->dynFMI; +#endif + + delete p_->sa; + delete p_; +} + +void TextCollectionBuilder::InsertText(uchar const * text) +{ + TextCollection::TextPosition m = std::strlen((char *)text) + 1; + if (m > p_->maxTextLength) + p_->maxTextLength = m; // Store length of the longest text seen so far. + + if (m > 1) + { + p_->n += m; + p_->numberOfTexts ++; + +#ifdef TCB_TEST_BWT + p_->dynFMI->addText(text, m); +#endif + + p_->sa->insertSequence((char*)text, m-1, 0); + assert(p_->sa->isOk()); + } + else + { + // FIXME indexing empty texts + std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl; + exit(1); + } +} + + +TextCollection * TextCollectionBuilder::InitTextCollection() +{ + uchar * bwt = 0; + CSA::usint length = 0; + if (p_->numberOfTexts == 0) + { + p_->numberOfTexts ++; // Add one empty text + bwt = new uchar[2]; + bwt[0] = '\0'; + bwt[1] = '\0'; + length = 1; + p_->maxTextLength = 1; + } + else + { + bwt = (uchar *)p_->sa->getBWT(length); + delete p_->sa; + p_->sa = 0; + + assert(length == p_->n); + +#ifdef TCB_TEST_BWT + { + uchar *bwtTest = p_->dynFMI->getBWT(); + printf("123456789012345678901234567890123456789\n"); + for (ulong i = 0; i < p_->n && i < 100; i ++) + if (bwt[i] < 50) + printf("%d", (int)bwt[i]); + else + printf("%c", bwt[i]); + printf("\n"); + for (ulong i = 0; i < p_->n && i < 100; i ++) + if (bwtTest[i] < 50) + printf("%d", (int)bwtTest[i]); + else + printf("%c", bwtTest[i]); + printf("\n"); + + // Sanity check + assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize()); + + delete p_->dynFMI; + p_->dynFMI = 0; + for (ulong i = 0; i < p_->n; ++i) + if (bwt[i] != bwtTest[i]) + { + std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl; + assert(0); + } + delete [] bwtTest; + } +#endif // TCB_TEST_BWT + } + + TextCollection *result = new TCImplementation(bwt, (ulong)length, p_->samplerate, p_->numberOfTexts, p_->maxTextLength); + return result; +} + + +} // namespace SXSI