From 7499c996b739914abd5651dc8b4bf8f234306f56 Mon Sep 17 00:00:00 2001 From: nvalimak Date: Sat, 12 Dec 2009 21:11:34 +0000 Subject: [PATCH] Added deltavector for non-indexed texts git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@620 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- TextCollectionBuilder.cpp | 95 +++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 54 deletions(-) diff --git a/TextCollectionBuilder.cpp b/TextCollectionBuilder.cpp index 467bf94..552abb1 100644 --- a/TextCollectionBuilder.cpp +++ b/TextCollectionBuilder.cpp @@ -1,15 +1,11 @@ #include "incbwt/rlcsa_builder.h" -#include "TextCollectionBuilder.h" - -// Un-comment next line to run a comparison of resulting BWT -//#define TCB_TEST_BWT - -#ifdef TCB_TEST_BWT -#include "dynFMI.h" -#endif +#include "incbwt/bits/deltavector.h" +#include "TextCollectionBuilder.h" #include "TCImplementation.h" +using std::string; + namespace SXSI { @@ -25,6 +21,9 @@ struct TCBuilderRep ulong maxTextLength; ulong numberOfSamples; + CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index. + string niText; // Texts that are not indexed. + #ifdef TCB_TEST_BWT DynFMI *dynFMI; #endif @@ -41,6 +40,9 @@ TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate, ulong estimate p_->samplerate = samplerate; p_->numberOfTexts = 0; p_->numberOfSamples = 0; + p_->maxTextLength = 0; + p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32 + p_->niText = ""; // Current params: 8 bytes, no samples, buffer size n/10 bytes. // Buffer size is always at least 15MB: @@ -65,32 +67,43 @@ TextCollectionBuilder::~TextCollectionBuilder() #endif delete p_->sa; + delete p_->notIndexed; delete p_; } -void TextCollectionBuilder::InsertText(uchar const * text) +void TextCollectionBuilder::InsertText(uchar const * text, bool index) { TextCollection::TextPosition m = std::strlen((char *)text) + 1; - if (m > p_->maxTextLength) - p_->maxTextLength = m; // Store length of the longest text seen so far. - - if (m > 1) + if (m <= 1) + { + // FIXME indexing empty texts + std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl; + exit(1); + } + + p_->numberOfTexts ++; + + if (index) { + /** + * Insert text into the index + */ p_->n += m; - p_->numberOfTexts ++; p_->numberOfSamples += (m-1)/p_->samplerate; + + if (m > p_->maxTextLength) + p_->maxTextLength = m; // Store length of the longest text seen so far. -#ifdef TCB_TEST_BWT - p_->dynFMI->addText(text, m); -#endif p_->sa->insertSequence((char*)text, m-1, 0); assert(p_->sa->isOk()); } else { - // FIXME indexing empty texts - std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl; - exit(1); + /** + * Insert text only to TextStorage + */ + p_->notIndexed->setBit(p_->numberOfTexts - 1); + p_->niText.append((const char *)text, m); } } @@ -115,43 +128,17 @@ TextCollection * TextCollectionBuilder::InitTextCollection(char type) p_->sa = 0; assert(length == p_->n); - -#ifdef TCB_TEST_BWT - { - uchar *bwtTest = p_->dynFMI->getBWT(); - printf("123456789012345678901234567890123456789\n"); - for (ulong i = 0; i < p_->n && i < 100; i ++) - if (bwt[i] < 50) - printf("%d", (int)bwt[i]); - else - printf("%c", bwt[i]); - printf("\n"); - for (ulong i = 0; i < p_->n && i < 100; i ++) - if (bwtTest[i] < 50) - printf("%d", (int)bwtTest[i]); - else - printf("%c", bwtTest[i]); - printf("\n"); - - // Sanity check - assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize()); - - delete p_->dynFMI; - p_->dynFMI = 0; - for (ulong i = 0; i < p_->n; ++i) - if (bwt[i] != bwtTest[i]) - { - std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " - << (unsigned)bwtTest[i] << std::endl; - assert(0); - } - delete [] bwtTest; - } -#endif // TCB_TEST_BWT } + p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's + CSA::DeltaVector deltav = CSA::DeltaVector(*p_->notIndexed, p_->numberOfTexts+1); + delete p_->notIndexed; + p_->notIndexed = 0; + TextCollection *result = new TCImplementation(bwt, (ulong)length, - p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples, type); + p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples, + deltav, p_->niText, type); + return result; } -- 2.17.1