1 #include "incbwt/rlcsa_builder.h"
2 #include "incbwt/bits/deltavector.h"
4 #include "FMIndexBuilder.h"
// Builder that receives the indexed texts via insertSequence() and later
// supplies the BWT via getBWT().
15 CSA::RLCSABuilder * sa;
18 // Total number of texts in the collection
19 unsigned numberOfTexts;
20 // Length of the longest text (maxTextLength)
// Running total of (m-1)/samplerate, accumulated in InsertText(), where m is
// the text length including its terminating zero byte.
22 ulong numberOfSamples;
24 CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index.
25 string niText; // Texts that are not indexed.
33 * Init text collection
// Construct a builder with empty state.
//
// samplerate           - stored in the private state; later used as the divisor
//                        when counting samples in InsertText() and forwarded to
//                        the FMIndex constructor.
// estimatedInputLength - expected total input size; raised to
//                        TEXTCOLLECTION_DEFAULT_INPUT_LENGTH when smaller, then
//                        divided by 10 to size the RLCSA builder buffer.
36 FMIndexBuilder::FMIndexBuilder(unsigned samplerate, ulong estimatedInputLength)
37 : p_(new struct TCBuilderRep())
40 p_->samplerate = samplerate;
// All counters start from zero; they are updated as texts are inserted.
41 p_->numberOfTexts = 0;
42 p_->numberOfSamples = 0;
43 p_->maxTextLength = 0;
44 p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32
47 // Current params: 8 bytes, no samples, buffer size n/10 bytes.
48 // Buffer size is always at least 15MB:
49 if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH)
50 estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH;
51 p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10);
// NOTE(review): the builder state is verified only through assert(), so the
// check disappears in builds where assertions are disabled.
52 assert(p_->sa->isOk());
56 for (unsigned i = 0; i < 255; ++i)
59 p_->dynFMI = new DynFMI(temp, 1, 255, false);
// Destructor: releases resources held in the private state.
63 FMIndexBuilder::~FMIndexBuilder()
// NOTE(review): InitTextCollection() also deletes p_->notIndexed. Confirm the
// pointer is reset there, otherwise this is a second delete of the same object.
70 delete p_->notIndexed;
// Add one text to the collection.
//
// text  - zero-terminated byte string; its length is measured with strlen().
// index - presumably selects between the indexing path and the storage-only
//         path below (verify against the branch condition).
74 void FMIndexBuilder::InsertText(uchar const * text, bool index)
// m is the text length including the terminating zero byte.
76 TextCollection::TextPosition m = std::strlen((char *)text) + 1;
79 // FIXME indexing empty texts
80 std::cerr << "FMIndexBuilder::InsertText() error: can not index empty texts!" << std::endl;
89 * Insert text into the index
// Sample count for this text: length without the terminator divided by the
// sample rate (integer division).
// NOTE(review): a samplerate of 0 would divide by zero here; confirm that the
// value is validated before it reaches this point.
92 p_->numberOfSamples += (m-1)/p_->samplerate;
94 if (m > p_->maxTextLength)
95 p_->maxTextLength = m; // Store length of the longest text seen so far.
// The builder is given m-1 characters, i.e. the text without its terminator.
97 p_->sa->insertSequence((char*)text, m-1, 0);
98 assert(p_->sa->isOk());
103 * Insert text only to TextStorage
// Flag position numberOfTexts - 1 as not indexed; presumably this is the id of
// the text being inserted (verify where numberOfTexts is incremented).
105 p_->notIndexed->setBit(p_->numberOfTexts - 1);
// Appends m bytes, so the terminating zero byte is stored together with the text.
106 p_->niText.append((const char *)text, m);
// Build the final text collection from the state accumulated by InsertText().
//
// type - passed through unchanged as the last argument of the FMIndex
//        constructor.
//
// The FMIndex is allocated with new; presumably the pointer is handed to the
// caller (verify against the return statement).
111 TextCollection * FMIndexBuilder::InitTextCollection(char type)
// Passed to getBWT() below and afterwards used as the BWT length.
114 CSA::usint length = 0;
115 if (p_->numberOfTexts == 0)
117 p_->numberOfTexts ++; // Add one empty text
122 p_->maxTextLength = 1;
126 bwt = (uchar *)p_->sa->getBWT(length);
// NOTE(review): the length is verified only through assert(), so the check
// disappears in builds where assertions are disabled.
130 assert(length == p_->n);
// One extra bit is set at position numberOfTexts so that the encoder is never
// empty; the vector below is built with numberOfTexts+1 to account for it.
133 p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's
134 CSA::DeltaVector deltav(*p_->notIndexed, p_->numberOfTexts+1);
// NOTE(review): the destructor deletes p_->notIndexed as well; confirm the
// pointer is reset after this delete to avoid releasing it twice.
135 delete p_->notIndexed;
138 TextCollection *result = new FMIndex(bwt, (ulong)length,
139 p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples,
140 deltav, p_->niText, type);