--- /dev/null
+#include "incbwt/rlcsa_builder.h"
+#include "incbwt/bits/deltavector.h"
+
+#include "FMIndexBuilder.h"
+#include "FMIndex.h"
+
+using std::string;
+
+namespace SXSI
+{
+
+struct TCBuilderRep
+{
+ unsigned samplerate;
+ CSA::RLCSABuilder * sa;
+
+ ulong n;
+ // Total number of texts in the collection
+ unsigned numberOfTexts;
+ // Length of the longest text
+ ulong maxTextLength;
+ ulong numberOfSamples;
+
+ CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index.
+ string niText; // Texts that are not indexed.
+
+#ifdef TCB_TEST_BWT
+ DynFMI *dynFMI;
+#endif
+};
+
+/**
+ * Init text collection
+ *
+ */
+FMIndexBuilder::FMIndexBuilder(unsigned samplerate, ulong estimatedInputLength)
+ : p_(new struct TCBuilderRep())
+{
+ p_->n = 0;
+ p_->samplerate = samplerate;
+ p_->numberOfTexts = 0;
+ p_->numberOfSamples = 0;
+ p_->maxTextLength = 0;
+ p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32
+ p_->niText = "";
+
+ // Current params: 8 bytes, no samples, buffer size n/10 bytes.
+ // Buffer size is always at least 15MB:
+ if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH)
+ estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH;
+ p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10);
+ assert(p_->sa->isOk());
+
+#ifdef TCB_TEST_BWT
+ uchar temp[256];
+ for (unsigned i = 0; i < 255; ++i)
+ temp[i] = i+1;
+ temp[255] = 0;
+ p_->dynFMI = new DynFMI(temp, 1, 255, false);
+#endif
+}
+
+FMIndexBuilder::~FMIndexBuilder()
+{
+#ifdef TCB_TEST_BWT
+ delete p_->dynFMI;
+#endif
+
+ delete p_->sa;
+ delete p_->notIndexed;
+ delete p_;
+}
+
+void FMIndexBuilder::InsertText(uchar const * text, bool index)
+{
+ TextCollection::TextPosition m = std::strlen((char *)text) + 1;
+ if (m <= 1)
+ {
+ // FIXME indexing empty texts
+ std::cerr << "FMIndexBuilder::InsertText() error: can not index empty texts!" << std::endl;
+ exit(1);
+ }
+
+ p_->numberOfTexts ++;
+
+ if (index)
+ {
+ /**
+ * Insert text into the index
+ */
+ p_->n += m;
+ p_->numberOfSamples += (m-1)/p_->samplerate;
+
+ if (m > p_->maxTextLength)
+ p_->maxTextLength = m; // Store length of the longest text seen so far.
+
+ p_->sa->insertSequence((char*)text, m-1, 0);
+ assert(p_->sa->isOk());
+ }
+ else
+ {
+ /**
+ * Insert text only to TextStorage
+ */
+ p_->notIndexed->setBit(p_->numberOfTexts - 1);
+ p_->niText.append((const char *)text, m);
+ }
+}
+
+
+TextCollection * FMIndexBuilder::InitTextCollection(char type)
+{
+ uchar * bwt = 0;
+ CSA::usint length = 0;
+ if (p_->numberOfTexts == 0)
+ {
+ p_->numberOfTexts ++; // Add one empty text
+ bwt = new uchar[2];
+ bwt[0] = '\0';
+ bwt[1] = '\0';
+ length = 1;
+ p_->maxTextLength = 1;
+ }
+ else
+ {
+ bwt = (uchar *)p_->sa->getBWT(length);
+ delete p_->sa;
+ p_->sa = 0;
+
+ assert(length == p_->n);
+ }
+
+ p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's
+ CSA::DeltaVector deltav = CSA::DeltaVector(*p_->notIndexed, p_->numberOfTexts+1);
+ delete p_->notIndexed;
+ p_->notIndexed = 0;
+
+ TextCollection *result = new FMIndex(bwt, (ulong)length,
+ p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples,
+ deltav, p_->niText, type);
+
+ return result;
+}
+
+
+} // namespace SXSI