X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TextCollectionBuilder.cpp;h=334e0b09055a139683afd8b5ebe435a7f87926b7;hb=37830ca9a16449ed145da79c80e7a37b465e4816;hp=1d49f397ec4e52b74d6c32b9ae304daddd711cc2;hpb=6e35318fa5b3d5630aa8e5c8ac019d62a47b8948;p=SXSI%2FTextCollection.git diff --git a/TextCollectionBuilder.cpp b/TextCollectionBuilder.cpp index 1d49f39..334e0b0 100644 --- a/TextCollectionBuilder.cpp +++ b/TextCollectionBuilder.cpp @@ -1,156 +1,28 @@ -#include "incbwt/rlcsa_builder.h" #include "TextCollectionBuilder.h" - -// Un-comment next line to run a comparison of resulting BWT -//#define TCB_TEST_BWT - -#ifdef TCB_TEST_BWT -#include "dynFMI.h" -#endif - -#include "TCImplementation.h" +#include "FMIndexBuilder.h" +#include "SWCSABuilder.h" +#include "RLCSABuilder.h" namespace SXSI { - -struct TCBuilderRep -{ - unsigned samplerate; - CSA::RLCSABuilder * sa; - - ulong n; - // Total number of texts in the collection - unsigned numberOfTexts; - // Length of the longest text - ulong maxTextLength; - ulong numberOfSamples; - -#ifdef TCB_TEST_BWT - DynFMI *dynFMI; -#endif -}; - -/** - * Init text collection - * - */ -TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate, ulong estimatedInputLength) - : p_(new struct TCBuilderRep()) -{ - p_->n = 0; - p_->samplerate = samplerate; - p_->numberOfTexts = 0; - p_->numberOfSamples = 0; - - // Current params: 8 bytes, no samples, buffer size n/10 bytes. - p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10); - assert(p_->sa->isOk()); - -#ifdef TCB_TEST_BWT - uchar temp[256]; - for (unsigned i = 0; i < 255; ++i) - temp[i] = i+1; - temp[255] = 0; - p_->dynFMI = new DynFMI(temp, 1, 255, false); -#endif -} - -TextCollectionBuilder::~TextCollectionBuilder() -{ -#ifdef TCB_TEST_BWT - delete p_->dynFMI; -#endif - - delete p_->sa; - delete p_; -} - -void TextCollectionBuilder::InsertText(uchar const * text) -{ - TextCollection::TextPosition m = std::strlen((char *)text) + 1; - if (m > p_->maxTextLength) - p_->maxTextLength = m; // Store length of the longest text seen so far. - - if (m > 1) - { - p_->n += m; - p_->numberOfTexts ++; - p_->numberOfSamples += (m-1)/p_->samplerate; - -#ifdef TCB_TEST_BWT - p_->dynFMI->addText(text, m); -#endif - p_->sa->insertSequence((char*)text, m-1, 0); - assert(p_->sa->isOk()); - } - else - { - // FIXME indexing empty texts - std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl; - exit(1); - } -} - - -TextCollection * TextCollectionBuilder::InitTextCollection() +TextCollectionBuilder* TextCollectionBuilder::create(unsigned samplerate, + index_type_t type, + ulong estimatedInputLength) { - uchar * bwt = 0; - CSA::usint length = 0; - if (p_->numberOfTexts == 0) - { - p_->numberOfTexts ++; // Add one empty text - bwt = new uchar[2]; - bwt[0] = '\0'; - bwt[1] = '\0'; - length = 1; - p_->maxTextLength = 1; - } - else + switch (type) { - bwt = (uchar *)p_->sa->getBWT(length); - delete p_->sa; - p_->sa = 0; - - assert(length == p_->n); - -#ifdef TCB_TEST_BWT - { - uchar *bwtTest = p_->dynFMI->getBWT(); - printf("123456789012345678901234567890123456789\n"); - for (ulong i = 0; i < p_->n && i < 100; i ++) - if (bwt[i] < 50) - printf("%d", (int)bwt[i]); - else - printf("%c", bwt[i]); - printf("\n"); - for (ulong i = 0; i < p_->n && i < 100; i ++) - if (bwtTest[i] < 50) - printf("%d", (int)bwtTest[i]); - else - printf("%c", bwtTest[i]); - printf("\n"); - - // Sanity check - assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize()); - - delete p_->dynFMI; - p_->dynFMI = 0; - for (ulong i = 0; i < p_->n; ++i) - if (bwt[i] != bwtTest[i]) - { - std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " - << (unsigned)bwtTest[i] << std::endl; - assert(0); - } - delete [] bwtTest; - } -#endif // TCB_TEST_BWT + case index_type_default: + return new FMIndexBuilder(samplerate, estimatedInputLength); + break; + case index_type_swcsa: + return new SWCSABuilder(samplerate); + break; + case index_type_rlcsa: + return new RLCSABuilder(samplerate, estimatedInputLength); + break; } - - TextCollection *result = new TCImplementation(bwt, (ulong)length, - p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples); - return result; + std::cerr << "TextCollectionBuilder::create(): unknown type given: expecting enum value, type = " << type << std::endl; + std::exit(2); } - -} // namespace SXSI +} // Namespace SXSI