Debug swcsa
[SXSI/TextCollection.git] / TextCollectionBuilder.cpp
index 7228f72..334e0b0 100644 (file)
-#include "incbwt/rlcsa_builder.h"
 #include "TextCollectionBuilder.h"
-
-// Un-comment next line to run a comparison of resulting BWT
-//#define TCB_TEST_BWT
-
-#ifdef TCB_TEST_BWT
-#include "dynFMI.h"
-#endif
-
-#include "TCImplementation.h"
+#include "FMIndexBuilder.h"
+#include "SWCSABuilder.h"
+#include "RLCSABuilder.h"
 
 namespace SXSI
 {
-
-struct TCBuilderRep
-{
-    unsigned samplerate;
-    CSA::RLCSABuilder * sa;
-
-    ulong n;
-    // Total number of texts in the collection
-    unsigned numberOfTexts;
-    // Length of the longest text
-    ulong maxTextLength;
-    ulong numberOfSamples;
-
-#ifdef TCB_TEST_BWT
-    DynFMI *dynFMI;
-#endif
-};
-
-/**
- * Init text collection
- *
- */
-TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate, ulong estimatedInputLength)
-    : p_(new struct TCBuilderRep())
-{
-    p_->n = 0;
-    p_->samplerate = samplerate;
-    p_->numberOfTexts = 0;
-    p_->numberOfSamples = 0;
-    
-    // Current params: 8 bytes, no samples, buffer size n/10 bytes.
-    // Buffer size is always at least 15MB:
-    if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH)
-        estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH;
-    p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10);
-    assert(p_->sa->isOk());
-
-#ifdef TCB_TEST_BWT
-    uchar temp[256];
-    for (unsigned i = 0; i < 255; ++i)
-        temp[i] = i+1;
-    temp[255] = 0;
-    p_->dynFMI = new DynFMI(temp, 1, 255, false);
-#endif
-}
-
-TextCollectionBuilder::~TextCollectionBuilder()
-{
-#ifdef TCB_TEST_BWT
-    delete p_->dynFMI;
-#endif
-
-    delete p_->sa;
-    delete p_;
-}
-
-void TextCollectionBuilder::InsertText(uchar const * text)
-{
-    TextCollection::TextPosition m = std::strlen((char *)text) + 1;
-    if (m > p_->maxTextLength)
-        p_->maxTextLength = m; // Store length of the longest text seen so far.
-
-    if (m > 1)
-    {
-        p_->n += m;
-        p_->numberOfTexts ++;
-        p_->numberOfSamples += (m-1)/p_->samplerate;
-
-#ifdef TCB_TEST_BWT
-        p_->dynFMI->addText(text, m);
-#endif
-        p_->sa->insertSequence((char*)text, m-1, 0);
-        assert(p_->sa->isOk());
-    }
-    else
-    {
-        // FIXME indexing empty texts
-        std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl;
-        exit(1);
-    }
-}
-
-
-TextCollection * TextCollectionBuilder::InitTextCollection()
+TextCollectionBuilder* TextCollectionBuilder::create(unsigned samplerate, 
+                                     index_type_t type,
+                                     ulong estimatedInputLength)
 {
-    uchar * bwt = 0;
-    CSA::usint length = 0;
-    if (p_->numberOfTexts == 0)
-    {
-        p_->numberOfTexts ++; // Add one empty text
-        bwt = new uchar[2];
-        bwt[0] = '\0';
-        bwt[1] = '\0';
-        length = 1;
-        p_->maxTextLength = 1;
-    }
-    else
+    switch (type)
     {
-        bwt = (uchar *)p_->sa->getBWT(length);
-        delete p_->sa;
-        p_->sa = 0;
-
-        assert(length == p_->n);
-
-#ifdef TCB_TEST_BWT
-        { 
-            uchar *bwtTest = p_->dynFMI->getBWT();
-            printf("123456789012345678901234567890123456789\n");
-            for (ulong i = 0; i < p_->n && i < 100; i ++)
-                if (bwt[i] < 50)
-                    printf("%d", (int)bwt[i]);
-                else
-                    printf("%c", bwt[i]);
-            printf("\n");
-            for (ulong i = 0; i < p_->n && i < 100; i ++)
-                if (bwtTest[i] < 50)
-                    printf("%d", (int)bwtTest[i]);
-                else
-                    printf("%c", bwtTest[i]);
-            printf("\n");
-            
-            // Sanity check
-            assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize());    
-            
-            delete p_->dynFMI;
-            p_->dynFMI = 0;
-            for (ulong i = 0; i < p_->n; ++i)
-                if (bwt[i] != bwtTest[i])
-                {
-                    std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " 
-                              << (unsigned)bwtTest[i] << std::endl;
-                    assert(0);
-                }
-            delete [] bwtTest;
-        }
-#endif // TCB_TEST_BWT
+    case index_type_default:
+        return new FMIndexBuilder(samplerate, estimatedInputLength);
+        break;
+    case index_type_swcsa:
+        return new SWCSABuilder(samplerate);
+        break;
+    case index_type_rlcsa:
+        return new RLCSABuilder(samplerate, estimatedInputLength);
+        break;
     }
-
-    TextCollection *result = new TCImplementation(bwt, (ulong)length, 
-                      p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples);
-    return result;
+    std::cerr << "TextCollectionBuilder::create(): unknown type given: expecting enum value, type = " << type << std::endl;
+    std::exit(2);
 }
 
-
-} // namespace SXSI
+} // Namespace SXSI