1 #include "incbwt/rlcsa_builder.h"
2 #include "TextCollectionBuilder.h"
4 // Uncomment the next line to run a comparison of the resulting BWT
11 #include "TCImplementation.h"
// RLCSA builder: allocated in the constructor, fed by InsertText() and
// queried for the BWT in InitTextCollection().
19 CSA::RLCSABuilder * sa;
22 // Total number of texts in the collection
23 unsigned numberOfTexts;
24 // Length of the longest text
// NOTE(review): the comment above appears to describe a member declared on a
// line not shown here (maxTextLength is used elsewhere in this file), not
// numberOfSamples -- confirm.
// Running sample count; InsertText() adds (m-1)/samplerate to it.
26 ulong numberOfSamples;
34 * Init text collection
// Constructs a builder: allocates the private TCBuilderRep state (p_),
// stores the sample rate, zeroes the text and sample counters, and creates
// the RLCSA builder plus a DynFMI instance.
//
// samplerate:           stored in p_->samplerate.
// estimatedInputLength: used only to size the RLCSA builder buffer; values
//                       below TEXTCOLLECTION_DEFAULT_INPUT_LENGTH are raised
//                       to that default.
37 TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate, ulong estimatedInputLength)
38 : p_(new struct TCBuilderRep())
41 p_->samplerate = samplerate;
42 p_->numberOfTexts = 0;
43 p_->numberOfSamples = 0;
45 // Current params: 8 bytes, no samples, buffer size n/10 bytes.
46 // Buffer size is always at least 15MB:
// Raise small estimates to the default so the buffer size passed below
// (estimate / 10) has a lower bound.
47 if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH)
48 estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH;
49 p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10);
// Sanity check (active only when assertions are enabled) that the builder
// reports an OK state right after construction.
50 assert(p_->sa->isOk());
// NOTE(review): the loop body and the definition of `temp` are not visible
// here; presumably the loop fills a 255-entry table that is then handed to
// DynFMI -- confirm.
54 for (unsigned i = 0; i < 255; ++i)
57 p_->dynFMI = new DynFMI(temp, 1, 255, false);
// Destructor.
// NOTE(review): the constructor allocates p_, p_->sa and p_->dynFMI with
// `new`; presumably they are released here -- confirm against the body.
61 TextCollectionBuilder::~TextCollectionBuilder()
// Adds one text to the collection being built.
//
// text: zero-terminated byte string; its length is measured with strlen().
//
// Updates the longest-text and sample-count statistics and passes the text
// to both the DynFMI instance and the RLCSA builder.
71 void TextCollectionBuilder::InsertText(uchar const * text)
// m = number of characters in the text plus one (strlen + 1).
73 TextCollection::TextPosition m = std::strlen((char *)text) + 1;
74 if (m > p_->maxTextLength)
75 p_->maxTextLength = m; // Store length of the longest text seen so far.
// Add this text's contribution, (m-1)/samplerate, to the running sample count.
81 p_->numberOfSamples += (m-1)/p_->samplerate;
// DynFMI is given length m (strlen + 1) ...
84 p_->dynFMI->addText(text, m);
// ... while the RLCSA builder is given length m-1 (strlen).
86 p_->sa->insertSequence((char*)text, m-1, 0);
// Check (when assertions are enabled) that the builder is still in an OK state.
87 assert(p_->sa->isOk());
// NOTE(review): the condition guarding this error path is not visible here;
// presumably it is taken when the text is empty -- confirm.
91 // FIXME indexing empty texts
92 std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl;
// Finalizes the build: obtains the BWT from the RLCSA builder and constructs
// a TCImplementation from it together with the statistics collected so far
// (sample rate, number of texts, longest text length, number of samples).
//
// type: forwarded unchanged as the last TCImplementation constructor argument.
//
// NOTE(review): the return statement is not visible here; presumably `result`
// is returned to the caller -- confirm.
98 TextCollection * TextCollectionBuilder::InitTextCollection(char type)
101 CSA::usint length = 0;
// No texts were inserted: count a single empty text.
102 if (p_->numberOfTexts == 0)
104 p_->numberOfTexts ++; // Add one empty text
109 p_->maxTextLength = 1;
// `length` is passed to getBWT(); the assert below expects it to equal p_->n
// afterwards.
113 bwt = (uchar *)p_->sa->getBWT(length);
117 assert(length == p_->n);
// The code from here to `#endif // TCB_TEST_BWT` compares the RLCSA BWT with
// the one produced by DynFMI.
// NOTE(review): the opening preprocessor guard is not visible here;
// presumably this section is compiled only when TCB_TEST_BWT is defined.
121 uchar *bwtTest = p_->dynFMI->getBWT();
122 printf("123456789012345678901234567890123456789\n");
// Print at most the first 100 entries of the RLCSA BWT. The condition that
// selects between the "%d" and "%c" output is not visible here.
123 for (ulong i = 0; i < p_->n && i < 100; i ++)
125 printf("%d", (int)bwt[i]);
127 printf("%c", bwt[i]);
// Same dump for the DynFMI BWT.
129 for (ulong i = 0; i < p_->n && i < 100; i ++)
131 printf("%d", (int)bwtTest[i]);
133 printf("%c", bwtTest[i]);
// Both builders must agree on the number of texts.
137 assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize());
// Element-wise comparison of the two BWTs; each mismatch is reported with its
// index and both byte values.
141 for (ulong i = 0; i < p_->n; ++i)
142 if (bwt[i] != bwtTest[i])
144 std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", "
145 << (unsigned)bwtTest[i] << std::endl;
150 #endif // TCB_TEST_BWT
// Build the final collection from the BWT and the gathered statistics.
153 TextCollection *result = new TCImplementation(bwt, (ulong)length,
154 p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples, type);