Jouni's Incremental BWT integrated into TextCollection
[SXSI/TextCollection.git] / TextCollectionBuilder.cpp
diff --git a/TextCollectionBuilder.cpp b/TextCollectionBuilder.cpp
new file mode 100644 (file)
index 0000000..67657bf
--- /dev/null
@@ -0,0 +1,153 @@
+#include "incbwt/rlcsa_builder.h"
+#include "TextCollectionBuilder.h"
+
+// Un-comment next line to run a comparison of resulting BWT
+//#define TCB_TEST_BWT
+
+#ifdef TCB_TEST_BWT
+#include "dynFMI.h"
+#endif
+
+#include "TCImplementation.h"
+
+namespace SXSI
+{
+
+struct TCBuilderRep
+{
+    unsigned samplerate;
+    CSA::RLCSABuilder * sa;
+
+    ulong n;
+    // Total number of texts in the collection
+    unsigned numberOfTexts;
+    // Length of the longest text
+    ulong maxTextLength;
+
+#ifdef TCB_TEST_BWT
+    DynFMI *dynFMI;
+#endif
+};
+
+/**
+ * Init text collection
+ *
+ * See CSA.h for more details.
+ */
+TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate)
+    : p_(new struct TCBuilderRep())
+{
+    p_->n = 0;
+    p_->samplerate = samplerate;
+    p_->numberOfTexts = 0;
+    
+    // Current params: 8 bytes, 15 MB, no samples
+    p_->sa = new CSA::RLCSABuilder(8, 0, 15 * 1024 * 1024);
+    assert(p_->sa->isOk());
+
+#ifdef TCB_TEST_BWT
+    uchar temp[256];
+    for (unsigned i = 0; i < 255; ++i)
+        temp[i] = i+1;
+    temp[255] = 0;
+    p_->dynFMI = new DynFMI(temp, 1, 255, false);
+#endif
+}
+
+TextCollectionBuilder::~TextCollectionBuilder()
+{
+#ifdef TCB_TEST_BWT
+    delete p_->dynFMI;
+#endif
+
+    delete p_->sa;
+    delete p_;
+}
+
+void TextCollectionBuilder::InsertText(uchar const * text)
+{
+    TextCollection::TextPosition m = std::strlen((char *)text) + 1;
+    if (m > p_->maxTextLength)
+        p_->maxTextLength = m; // Store length of the longest text seen so far.
+
+    if (m > 1)
+    {
+        p_->n += m;
+        p_->numberOfTexts ++;
+
+#ifdef TCB_TEST_BWT
+        p_->dynFMI->addText(text, m);
+#endif
+
+        p_->sa->insertSequence((char*)text, m-1, 0);
+        assert(p_->sa->isOk());
+    }
+    else
+    {
+        // FIXME indexing empty texts
+        std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl;
+        exit(1);
+    }
+}
+
+
+TextCollection * TextCollectionBuilder::InitTextCollection()
+{
+    uchar * bwt = 0;
+    CSA::usint length = 0;
+    if (p_->numberOfTexts == 0)
+    {
+        p_->numberOfTexts ++; // Add one empty text
+        bwt = new uchar[2];
+        bwt[0] = '\0';
+        bwt[1] = '\0';
+        length = 1;
+        p_->maxTextLength = 1;
+    }
+    else
+    {
+        bwt = (uchar *)p_->sa->getBWT(length);
+        delete p_->sa;
+        p_->sa = 0;
+
+        assert(length == p_->n);
+
+#ifdef TCB_TEST_BWT
+        { 
+            uchar *bwtTest = p_->dynFMI->getBWT();
+            printf("123456789012345678901234567890123456789\n");
+            for (ulong i = 0; i < p_->n && i < 100; i ++)
+                if (bwt[i] < 50)
+                    printf("%d", (int)bwt[i]);
+                else
+                    printf("%c", bwt[i]);
+            printf("\n");
+            for (ulong i = 0; i < p_->n && i < 100; i ++)
+                if (bwtTest[i] < 50)
+                    printf("%d", (int)bwtTest[i]);
+                else
+                    printf("%c", bwtTest[i]);
+            printf("\n");
+            
+            // Sanity check
+            assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize());    
+            
+            delete p_->dynFMI;
+            p_->dynFMI = 0;
+            for (ulong i = 0; i < p_->n; ++i)
+                if (bwt[i] != bwtTest[i])
+                {
+                    std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
+                    assert(0);
+                }
+            delete [] bwtTest;
+        }
+#endif // TCB_TEST_BWT
+    }
+
+    TextCollection *result = new TCImplementation(bwt, (ulong)length, p_->samplerate, p_->numberOfTexts, p_->maxTextLength);
+    return result;
+}
+
+
+} // namespace SXSI