Added deltavector for non-indexed texts
author nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Sat, 12 Dec 2009 21:11:34 +0000 (21:11 +0000)
committer nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Sat, 12 Dec 2009 21:11:34 +0000 (21:11 +0000)
git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@620 3cdefd35-fc62-479d-8e8d-bae585ffb9ca

TextCollectionBuilder.cpp

index 467bf94..552abb1 100644 (file)
@@ -1,15 +1,11 @@
 #include "incbwt/rlcsa_builder.h"
-#include "TextCollectionBuilder.h"
-
-// Un-comment next line to run a comparison of resulting BWT
-//#define TCB_TEST_BWT
-
-#ifdef TCB_TEST_BWT
-#include "dynFMI.h"
-#endif
+#include "incbwt/bits/deltavector.h"
 
+#include "TextCollectionBuilder.h"
 #include "TCImplementation.h"
 
+using std::string;
+
 namespace SXSI
 {
 
@@ -25,6 +21,9 @@ struct TCBuilderRep
     ulong maxTextLength;
     ulong numberOfSamples;
 
+    CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index.
+    string niText; // Texts that are not indexed.
+
 #ifdef TCB_TEST_BWT
     DynFMI *dynFMI;
 #endif
@@ -41,6 +40,9 @@ TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate, ulong estimate
     p_->samplerate = samplerate;
     p_->numberOfTexts = 0;
     p_->numberOfSamples = 0;
+    p_->maxTextLength = 0;
+    p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32
+    p_->niText = "";
     
     // Current params: 8 bytes, no samples, buffer size n/10 bytes.
     // Buffer size is always at least 15MB:
@@ -65,32 +67,43 @@ TextCollectionBuilder::~TextCollectionBuilder()
 #endif
 
     delete p_->sa;
+    delete p_->notIndexed;
     delete p_;
 }
 
-void TextCollectionBuilder::InsertText(uchar const * text)
+void TextCollectionBuilder::InsertText(uchar const * text, bool index)
 {
     TextCollection::TextPosition m = std::strlen((char *)text) + 1;
-    if (m > p_->maxTextLength)
-        p_->maxTextLength = m; // Store length of the longest text seen so far.
-
-    if (m > 1)
+    if (m <= 1)
+    {
+        // FIXME indexing empty texts
+        std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl;
+        exit(1);
+    }
+    
+    p_->numberOfTexts ++;
+    
+    if (index)
     {
+        /** 
+         * Insert text into the index
+         */
         p_->n += m;
-        p_->numberOfTexts ++;
         p_->numberOfSamples += (m-1)/p_->samplerate;
+            
+        if (m > p_->maxTextLength)
+            p_->maxTextLength = m; // Store length of the longest text seen so far.
 
-#ifdef TCB_TEST_BWT
-        p_->dynFMI->addText(text, m);
-#endif
         p_->sa->insertSequence((char*)text, m-1, 0);
         assert(p_->sa->isOk());
     }
     else
     {
-        // FIXME indexing empty texts
-        std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl;
-        exit(1);
+        /** 
+         * Insert text only to TextStorage
+         */
+        p_->notIndexed->setBit(p_->numberOfTexts - 1);
+        p_->niText.append((const char *)text, m);
     }
 }
 
@@ -115,43 +128,17 @@ TextCollection * TextCollectionBuilder::InitTextCollection(char type)
         p_->sa = 0;
 
         assert(length == p_->n);
-
-#ifdef TCB_TEST_BWT
-        { 
-            uchar *bwtTest = p_->dynFMI->getBWT();
-            printf("123456789012345678901234567890123456789\n");
-            for (ulong i = 0; i < p_->n && i < 100; i ++)
-                if (bwt[i] < 50)
-                    printf("%d", (int)bwt[i]);
-                else
-                    printf("%c", bwt[i]);
-            printf("\n");
-            for (ulong i = 0; i < p_->n && i < 100; i ++)
-                if (bwtTest[i] < 50)
-                    printf("%d", (int)bwtTest[i]);
-                else
-                    printf("%c", bwtTest[i]);
-            printf("\n");
-            
-            // Sanity check
-            assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize());    
-            
-            delete p_->dynFMI;
-            p_->dynFMI = 0;
-            for (ulong i = 0; i < p_->n; ++i)
-                if (bwt[i] != bwtTest[i])
-                {
-                    std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " 
-                              << (unsigned)bwtTest[i] << std::endl;
-                    assert(0);
-                }
-            delete [] bwtTest;
-        }
-#endif // TCB_TEST_BWT
     }
 
+    p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's
+    CSA::DeltaVector deltav = CSA::DeltaVector(*p_->notIndexed, p_->numberOfTexts+1);
+    delete p_->notIndexed;
+    p_->notIndexed = 0;
+
     TextCollection *result = new TCImplementation(bwt, (ulong)length, 
-                   p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples, type);
+                   p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples,
+                   deltav, p_->niText, type);
+
     return result;
 }