#include "incbwt/rlcsa_builder.h"
-#include "TextCollectionBuilder.h"
-
-// Un-comment next line to run a comparison of resulting BWT
-//#define TCB_TEST_BWT
-
-#ifdef TCB_TEST_BWT
-#include "dynFMI.h"
-#endif
+#include "incbwt/bits/deltavector.h"
+#include "TextCollectionBuilder.h"
#include "TCImplementation.h"
+using std::string;
+
namespace SXSI
{
ulong maxTextLength;
ulong numberOfSamples;
+ CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index.
+ string niText; // Texts that are not indexed.
+
#ifdef TCB_TEST_BWT
DynFMI *dynFMI;
#endif
p_->samplerate = samplerate;
p_->numberOfTexts = 0;
p_->numberOfSamples = 0;
+ p_->maxTextLength = 0;
+ p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32
+ p_->niText = "";
// Current params: 8 bytes, no samples, buffer size n/10 bytes.
// Buffer size is always at least 15MB:
#endif
delete p_->sa;
+ delete p_->notIndexed;
delete p_;
}
-void TextCollectionBuilder::InsertText(uchar const * text)
+void TextCollectionBuilder::InsertText(uchar const * text, bool index) // 'index' selects between indexing the text and only storing it
{
TextCollection::TextPosition m = std::strlen((char *)text) + 1;
- if (m > p_->maxTextLength)
- p_->maxTextLength = m; // Store length of the longest text seen so far.
-
- if (m > 1)
+ if (m <= 1) // m is strlen(text) + 1, so this holds exactly when the text is empty
+ {
+ // FIXME indexing empty texts (currently unsupported: report the error and terminate)
+ std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl;
+ exit(1);
+ }
+
+ p_->numberOfTexts ++; // counts every non-empty text, whether it is indexed or not
+
+ if (index)
{
+ /**
+ * Insert text into the index
+ *
+ * Adds m to the total length n, adds (m-1)/samplerate to the sample
+ * count, updates maxTextLength, and hands the first m-1 characters
+ * of the text to p_->sa->insertSequence().
+ */
p_->n += m;
- p_->numberOfTexts ++;
p_->numberOfSamples += (m-1)/p_->samplerate;
+
+ if (m > p_->maxTextLength) // only texts inserted into the index are considered here
+ p_->maxTextLength = m; // Store length of the longest text seen so far.
-#ifdef TCB_TEST_BWT
- p_->dynFMI->addText(text, m);
-#endif
p_->sa->insertSequence((char*)text, m-1, 0);
assert(p_->sa->isOk());
}
else
{
- // FIXME indexing empty texts
- std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl;
- exit(1);
+ /**
+ * Insert text only to TextStorage
+ *
+ * Sets the bit for this text in notIndexed; numberOfTexts was already
+ * incremented above, hence the "- 1". Then appends m characters of
+ * the text to niText, i.e. the text together with its terminating '\0'.
+ */
+ p_->notIndexed->setBit(p_->numberOfTexts - 1);
+ p_->niText.append((const char *)text, m);
}
}
p_->sa = 0;
assert(length == p_->n);
-
-#ifdef TCB_TEST_BWT
- {
- uchar *bwtTest = p_->dynFMI->getBWT();
- printf("123456789012345678901234567890123456789\n");
- for (ulong i = 0; i < p_->n && i < 100; i ++)
- if (bwt[i] < 50)
- printf("%d", (int)bwt[i]);
- else
- printf("%c", bwt[i]);
- printf("\n");
- for (ulong i = 0; i < p_->n && i < 100; i ++)
- if (bwtTest[i] < 50)
- printf("%d", (int)bwtTest[i]);
- else
- printf("%c", bwtTest[i]);
- printf("\n");
-
- // Sanity check
- assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize());
-
- delete p_->dynFMI;
- p_->dynFMI = 0;
- for (ulong i = 0; i < p_->n; ++i)
- if (bwt[i] != bwtTest[i])
- {
- std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", "
- << (unsigned)bwtTest[i] << std::endl;
- assert(0);
- }
- delete [] bwtTest;
- }
-#endif // TCB_TEST_BWT
}
+ p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's
+ CSA::DeltaVector deltav = CSA::DeltaVector(*p_->notIndexed, p_->numberOfTexts+1);
+ delete p_->notIndexed;
+ p_->notIndexed = 0;
+
TextCollection *result = new TCImplementation(bwt, (ulong)length,
- p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples, type);
+ p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples,
+ deltav, p_->niText, type);
+
return result;
}