-#include "incbwt/rlcsa_builder.h"
-#include "incbwt/bits/deltavector.h"
-
#include "TextCollectionBuilder.h"
-#include "TCImplementation.h"
-
-using std::string;
+#include "FMIndexBuilder.h"
+#include "SWCSABuilder.h"
+#include "RLCSABuilder.h"
namespace SXSI
{
-
-struct TCBuilderRep
-{
- unsigned samplerate;
- CSA::RLCSABuilder * sa;
-
- ulong n;
- // Total number of texts in the collection
- unsigned numberOfTexts;
- // Length of the longest text
- ulong maxTextLength;
- ulong numberOfSamples;
-
- CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index.
- string niText; // Texts that are not indexed.
-
-#ifdef TCB_TEST_BWT
- DynFMI *dynFMI;
-#endif
-};
-
-/**
- * Init text collection
- *
- */
-TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate, ulong estimatedInputLength)
- : p_(new struct TCBuilderRep())
-{
- p_->n = 0;
- p_->samplerate = samplerate;
- p_->numberOfTexts = 0;
- p_->numberOfSamples = 0;
- p_->maxTextLength = 0;
- p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32
- p_->niText = "";
-
- // Current params: 8 bytes, no samples, buffer size n/10 bytes.
- // Buffer size is always at least 15MB:
- if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH)
- estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH;
- p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10);
- assert(p_->sa->isOk());
-
-#ifdef TCB_TEST_BWT
- uchar temp[256];
- for (unsigned i = 0; i < 255; ++i)
- temp[i] = i+1;
- temp[255] = 0;
- p_->dynFMI = new DynFMI(temp, 1, 255, false);
-#endif
-}
-
-TextCollectionBuilder::~TextCollectionBuilder()
-{
-#ifdef TCB_TEST_BWT
- delete p_->dynFMI;
-#endif
-
- delete p_->sa;
- delete p_->notIndexed;
- delete p_;
-}
-
-void TextCollectionBuilder::InsertText(uchar const * text, bool index)
-{
- TextCollection::TextPosition m = std::strlen((char *)text) + 1;
- if (m <= 1)
- {
- // FIXME indexing empty texts
- std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl;
- exit(1);
- }
-
- p_->numberOfTexts ++;
-
- if (index)
- {
- /**
- * Insert text into the index
- */
- p_->n += m;
- p_->numberOfSamples += (m-1)/p_->samplerate;
-
- if (m > p_->maxTextLength)
- p_->maxTextLength = m; // Store length of the longest text seen so far.
-
- p_->sa->insertSequence((char*)text, m-1, 0);
- assert(p_->sa->isOk());
- }
- else
- {
- /**
- * Insert text only to TextStorage
- */
- p_->notIndexed->setBit(p_->numberOfTexts - 1);
- p_->niText.append((const char *)text, m);
- }
-}
-
-
-TextCollection * TextCollectionBuilder::InitTextCollection(char type)
+TextCollectionBuilder* TextCollectionBuilder::create(unsigned samplerate,
+ index_type_t type,
+ ulong estimatedInputLength)
{
- uchar * bwt = 0;
- CSA::usint length = 0;
- if (p_->numberOfTexts == 0)
- {
- p_->numberOfTexts ++; // Add one empty text
- bwt = new uchar[2];
- bwt[0] = '\0';
- bwt[1] = '\0';
- length = 1;
- p_->maxTextLength = 1;
- }
- else
+ switch (type)
{
- bwt = (uchar *)p_->sa->getBWT(length);
- delete p_->sa;
- p_->sa = 0;
-
- assert(length == p_->n);
+ case index_type_default:
+ return new FMIndexBuilder(samplerate, estimatedInputLength);
+ break;
+ case index_type_swcsa:
+ return new SWCSABuilder(samplerate);
+ break;
+ case index_type_rlcsa:
+ return new RLCSABuilder(samplerate, estimatedInputLength);
+ break;
}
-
- p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's
- CSA::DeltaVector deltav = CSA::DeltaVector(*p_->notIndexed, p_->numberOfTexts+1);
- delete p_->notIndexed;
- p_->notIndexed = 0;
-
- TextCollection *result = new TCImplementation(bwt, (ulong)length,
- p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples,
- deltav, p_->niText, type);
-
- return result;
+ std::cerr << "TextCollectionBuilder::create(): unknown type given: expecting enum value, type = " << type << std::endl;
+ std::exit(2);
}
-
-} // namespace SXSI
+} // Namespace SXSI