Added RLCSA index option
[SXSI/TextCollection.git] / RLCSABuilder.cpp
diff --git a/RLCSABuilder.cpp b/RLCSABuilder.cpp
new file mode 100644 (file)
index 0000000..3c17351
--- /dev/null
@@ -0,0 +1,99 @@
+#include "incbwt/rlcsa_builder.h"
+#include "RLCSABuilder.h"
+#include "RLCSAWrapper.h"
+
+namespace SXSI
+{
+
+/**
+ * Private state of SXSI::RLCSABuilder.
+ * Uses the pimpl idiom to hide the CSA::RLCSABuilder pointer.
+ */
+struct TCBuilderRep
+{
+    // Underlying CSA builder; allocated in the constructor, deleted and
+    // reset to 0 by InitTextCollection().
+    CSA::RLCSABuilder * sa;
+
+    // Running sum of inserted text lengths, each counted together with
+    // its terminating zero byte.
+    ulong n;
+
+    // Sample rate forwarded to the CSA builder.
+    unsigned samplerate;
+
+    // Number of texts accepted by InsertText() so far.
+    unsigned numberOfTexts;
+
+    // True until InitTextCollection() has been called.
+    bool insertAllowed;
+};
+
+/**
+ * Init text collection
+ */
+RLCSABuilder::RLCSABuilder(unsigned samplerate, ulong estimatedInputLength)
+    : p_(new struct TCBuilderRep())
+{
+    p_->n = 0;
+    p_->samplerate = samplerate;
+    if (samplerate == 0)
+        p_->samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE;
+
+    p_->numberOfTexts = 0;
+    p_->insertAllowed = true;
+
+    CSA::usint rlcsa_block_size = CSA::RLCSA_BLOCK_SIZE.second;
+    CSA::usint rlcsa_sample_rate = p_->samplerate;
+    // Parameters for RLCSA: 32 bytes, samples, buffer size n/10 bytes.
+    // Buffer size is always at least 15MB:
+    if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH)
+        estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH;
+    p_->sa = new CSA::RLCSABuilder(rlcsa_block_size, rlcsa_sample_rate, estimatedInputLength/10);
+    assert(p_->sa->isOk());
+}
+
+/**
+ * Free the CSA builder (if it is still held) and the private state.
+ */
+RLCSABuilder::~RLCSABuilder()
+{
+    // sa is 0 once InitTextCollection() has run; deleting 0 is a no-op.
+    CSA::RLCSABuilder * builder = p_->sa;
+    delete builder;
+    delete p_;
+}
+
+void RLCSABuilder::InsertText(uchar const * text, bool index)
+{
+    assert(index);
+    if (!index)
+    {
+        std::cerr << "SWCSABuilder::InsertText(): The implementation of SWCSA does not support non-indexed texts" 
+                  << std::endl << "Use the default (FMIndex) text collection instead." << std::endl;
+        std::exit(1);                
+    }
+    
+    if (!p_->insertAllowed)
+    {
+        std::cerr << "RLCSABuilder::InsertText() error: new text can not be inserted after InitTextCollection() call!" << std::endl;
+        std::exit(1);
+    }
+
+    TextCollection::TextPosition m = std::strlen((char *)text) + 1;
+    if (m > 1)
+    {
+        p_->n += m;
+        p_->numberOfTexts ++;
+
+        p_->sa->insertSequence((char*)text, m-1, 0);
+        assert(p_->sa->isOk());
+    }
+    else
+    {
+        // FIXME indexing empty texts
+        std::cerr << "RLCSABuilder::InsertText() error: can not index empty texts!" << std::endl;
+        exit(1);
+    }
+}
+
+TextCollection * RLCSABuilder::InitTextCollection(char type)
+{
+    p_->insertAllowed = false; // Disable future insertions
+    assert(type == TextStorage::TYPE_PLAIN_TEXT);
+    if (type != TextStorage::TYPE_PLAIN_TEXT)
+    {
+        std::cerr << "RLCSABuilder::InitTextCollection(): The implementation of RLCSA supports only TextStorage::TYPE_PLAIN_TEXT" 
+                  << std::endl << "Use the default (FMIndex) text collection instead." << std::endl;
+        std::exit(1);
+    }
+    
+    TextCollection *result = new RLCSAWrapper(p_->sa->getRLCSA());
+    delete p_->sa;
+    p_->sa = 0;
+    return result;
+}
+}