Added SWCSA
[SXSI/TextCollection.git] / FMIndexBuilder.cpp
diff --git a/FMIndexBuilder.cpp b/FMIndexBuilder.cpp
new file mode 100644 (file)
index 0000000..ea52224
--- /dev/null
@@ -0,0 +1,146 @@
+#include "incbwt/rlcsa_builder.h"
+#include "incbwt/bits/deltavector.h"
+
+#include "FMIndexBuilder.h"
+#include "FMIndex.h"
+
+using std::string;
+
+namespace SXSI
+{
+
+struct TCBuilderRep
+{
+    unsigned samplerate;
+    CSA::RLCSABuilder * sa;
+
+    ulong n;
+    // Total number of texts in the collection
+    unsigned numberOfTexts;
+    // Length of the longest text
+    ulong maxTextLength;
+    ulong numberOfSamples;
+
+    CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index.
+    string niText; // Texts that are not indexed.
+
+#ifdef TCB_TEST_BWT
+    DynFMI *dynFMI;
+#endif
+};
+
+/**
+ * Init text collection
+ *
+ */
+FMIndexBuilder::FMIndexBuilder(unsigned samplerate, ulong estimatedInputLength)
+    : p_(new struct TCBuilderRep())
+{
+    p_->n = 0;
+    p_->samplerate = samplerate;
+    p_->numberOfTexts = 0;
+    p_->numberOfSamples = 0;
+    p_->maxTextLength = 0;
+    p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32
+    p_->niText = "";
+    
+    // Current params: 8 bytes, no samples, buffer size n/10 bytes.
+    // Buffer size is always at least 15MB:
+    if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH)
+        estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH;
+    p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10);
+    assert(p_->sa->isOk());
+
+#ifdef TCB_TEST_BWT
+    uchar temp[256];
+    for (unsigned i = 0; i < 255; ++i)
+        temp[i] = i+1;
+    temp[255] = 0;
+    p_->dynFMI = new DynFMI(temp, 1, 255, false);
+#endif
+}
+
+FMIndexBuilder::~FMIndexBuilder()
+{
+#ifdef TCB_TEST_BWT
+    delete p_->dynFMI;
+#endif
+
+    delete p_->sa;
+    delete p_->notIndexed;
+    delete p_;
+}
+
+void FMIndexBuilder::InsertText(uchar const * text, bool index)
+{
+    TextCollection::TextPosition m = std::strlen((char *)text) + 1;
+    if (m <= 1)
+    {
+        // FIXME indexing empty texts
+        std::cerr << "FMIndexBuilder::InsertText() error: can not index empty texts!" << std::endl;
+        exit(1);
+    }
+    
+    p_->numberOfTexts ++;
+    
+    if (index)
+    {
+        /** 
+         * Insert text into the index
+         */
+        p_->n += m;
+        p_->numberOfSamples += (m-1)/p_->samplerate;
+            
+        if (m > p_->maxTextLength)
+            p_->maxTextLength = m; // Store length of the longest text seen so far.
+
+        p_->sa->insertSequence((char*)text, m-1, 0);
+        assert(p_->sa->isOk());
+    }
+    else
+    {
+        /** 
+         * Insert text only to TextStorage
+         */
+        p_->notIndexed->setBit(p_->numberOfTexts - 1);
+        p_->niText.append((const char *)text, m);
+    }
+}
+
+
+TextCollection * FMIndexBuilder::InitTextCollection(char type)
+{
+    uchar * bwt = 0;
+    CSA::usint length = 0;
+    if (p_->numberOfTexts == 0)
+    {
+        p_->numberOfTexts ++; // Add one empty text
+        bwt = new uchar[2];
+        bwt[0] = '\0';
+        bwt[1] = '\0';
+        length = 1;
+        p_->maxTextLength = 1;
+    }
+    else
+    {
+        bwt = (uchar *)p_->sa->getBWT(length);
+        delete p_->sa;
+        p_->sa = 0;
+
+        assert(length == p_->n);
+    }
+
+    p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's
+    CSA::DeltaVector deltav = CSA::DeltaVector(*p_->notIndexed, p_->numberOfTexts+1);
+    delete p_->notIndexed;
+    p_->notIndexed = 0;
+
+    TextCollection *result = new FMIndex(bwt, (ulong)length, 
+                   p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples,
+                   deltav, p_->niText, type);
+
+    return result;
+}
+
+
+} // namespace SXSI