6 #include "rlcsa_builder.h"
7 #include "misc/utils.h"
9 #ifdef MULTITHREAD_SUPPORT
// Constructor.
//   _block_size, _sample_rate, _buffer_size  copied into the members of the same name
//   _threads                                 NOTE(review): its initializer is not visible in this excerpt
// The remainder of the initializer list and part of the body are not visible here.
18 RLCSABuilder::RLCSABuilder(usint _block_size, usint _sample_rate, usint _buffer_size, usint _threads) :
19 block_size(_block_size), sample_rate(_sample_rate), buffer_size(_buffer_size),
// All four timing accumulators start at zero; they are increased in
// insertSequence(), flush() and addRLCSA().
24 this->build_time = this->search_time = this->sort_time = this->merge_time = 0.0;
// Destructor: releases the character buffer (allocated elsewhere with new uchar[]).
// NOTE(review): further cleanup may happen on lines not visible in this excerpt.
27 RLCSABuilder::~RLCSABuilder()
30 delete[] this->buffer;
33 //--------------------------------------------------------------------------
// Inserts one sequence into the index under construction.
//   sequence         characters to insert (cast to uchar* when passed on)
//   length           number of characters in `sequence`
//   delete_sequence  if true, `sequence` is released with delete[] once it is no longer needed
// The sequence is either indexed on its own and merged through addRLCSA(), or copied
// into the internal buffer. NOTE(review): several branch lines are missing from this
// excerpt, so the exact conditions choosing between the paths cannot be confirmed here.
36 RLCSABuilder::insertSequence(char* sequence, usint length, bool delete_sequence)
// Reject null or empty input, or a builder that is no longer ok.
38 if(sequence == 0 || length == 0 || !this->ok)
// The rejected sequence is still freed when the caller asked for that.
40 if(delete_sequence) { delete[] sequence; }
// Direct path: build a separate RLCSA for this sequence and time the construction.
46 double start = readTimer();
47 RLCSA* temp = new RLCSA((uchar*)sequence, length, this->block_size, this->sample_rate, false, false);
48 this->build_time += readTimer() - start;
// length + 1 is passed on - presumably to cover the terminating 0 byte; confirm with callers.
49 this->addRLCSA(temp, (uchar*)sequence, length + 1, delete_sequence);
// Buffered path: taken when strictly more than `length` bytes are free in the buffer,
// which leaves room for the 0 byte written below.
53 if(this->buffer_size - this->chars > length)
55 memcpy(this->buffer + this->chars, sequence, length);
56 if(delete_sequence) { delete[] sequence; }
57 this->chars += length;
// Write a 0 byte after the copied characters; addRLCSA() treats 0 bytes as end markers.
58 this->buffer[this->chars] = 0;
// A fresh buffer is allocated here.
// NOTE(review): the enclosing condition is not visible in this excerpt.
66 this->buffer = new uchar[this->buffer_size];
// A sequence that cannot fit in the buffer is indexed directly, as in the direct path above.
68 if(length >= this->buffer_size - 1)
70 double start = readTimer();
71 RLCSA* temp = new RLCSA((uchar*)sequence, length, this->block_size, this->sample_rate, false, false);
72 this->build_time += readTimer() - start;
73 this->addRLCSA(temp, (uchar*)sequence, length + 1, delete_sequence);
// Presumably the alternative branch: append the sequence to the buffer and terminate it with 0.
77 memcpy(this->buffer + this->chars, sequence, length);
78 if(delete_sequence) { delete[] sequence; }
79 this->chars += length;
80 this->buffer[this->chars] = 0;
// Loads an existing index and its raw data and merges them into the current index.
//   base_name  used both as the path opened with std::ifstream and as the argument
//              of the RLCSA constructor
87 RLCSABuilder::insertFromFile(const std::string& base_name)
// Nothing is done when the builder is no longer ok.
89 if(!this->ok) { return; }
// Check for buffered characters.
// NOTE(review): the action taken inside this branch is only partly visible in this excerpt.
91 if(this->buffer != 0 && this->chars > 0)
94 this->buffer = new uchar[this->buffer_size];
// Open the file in binary mode; return silently if that fails.
97 std::ifstream input(base_name.c_str(), std::ios_base::binary);
98 if(!input) { return; }
99 RLCSA* increment = new RLCSA(base_name);
// Bytes to read: the index size plus the number of sequences.
100 usint data_size = increment->getSize() + increment->getNumberOfSequences();
101 uchar* data = new uchar[data_size];
// NOTE(review): the result of this read is not checked on any visible line.
102 input.read((char*)data, data_size);
// delete_sequence is true, so addRLCSA() releases `data` with delete[].
105 this->addRLCSA(increment, data, data_size, true);
// Hands out the index built so far.
// NOTE(review): the return statement and any later state changes are not visible in this excerpt.
109 RLCSABuilder::getRLCSA()
// Characters still waiting in the buffer are flushed into the index first.
111 if(this->chars > 0) { this->flush(); }
// Keep the current index pointer in a local - presumably the value returned; confirm.
113 RLCSA* temp = this->index;
// Returns the result of the index's readBWT(), cast to char*.
//   length  output: set to getSize() + getNumberOfSequences() of the index
// NOTE(review): statements between the visible lines are missing from this excerpt.
120 RLCSABuilder::getBWT(usint& length)
// Allocate a buffer when a non-zero buffer size is configured.
125 if(this->buffer_size > 0) { this->buffer = new uchar[this->buffer_size]; }
// No index, or a builder that is no longer ok (the handling is not visible in this excerpt).
128 if(this->index == 0 || !(this->ok))
134 length = this->index->getSize() + this->index->getNumberOfSequences();
135 return (char*)(this->index->readBWT());
// Returns the accumulated readTimer() differences measured around RLCSA construction
// in insertSequence() and flush().
145 RLCSABuilder::getBuildTime()
147 return this->build_time;
// Returns the accumulated readTimer() differences measured in addRLCSA() from its start
// until the position reporting has finished.
151 RLCSABuilder::getSearchTime()
153 return this->search_time;
// Returns the accumulated readTimer() differences measured in addRLCSA() around sorting
// and adjusting the positions.
157 RLCSABuilder::getSortTime()
159 return this->sort_time;
// Returns the accumulated readTimer() differences measured in addRLCSA() around
// constructing the merged index.
163 RLCSABuilder::getMergeTime()
165 return this->merge_time;
168 //--------------------------------------------------------------------------
// Builds an RLCSA from the buffered characters and hands it to addRLCSA().
171 RLCSABuilder::flush()
173 double start = readTimer();
// The last two constructor arguments are `true` and "no index exists yet".
// NOTE(review): their meaning is defined by the RLCSA constructor, which is not visible here.
174 RLCSA* temp = new RLCSA(this->buffer, this->chars, this->block_size, this->sample_rate, true, (this->index == 0));
175 this->build_time += readTimer() - start;
// addRLCSA() is asked to delete the buffer only when an index already exists.
// Presumably the RLCSA constructor takes ownership in the other case; confirm.
176 this->addRLCSA(temp, this->buffer, this->chars, (this->index != 0));
// Drop the buffer pointer and reset the character count; the buffer is not freed here.
177 this->buffer = 0; this->chars = 0;
// Merges `increment` into the current index, or adopts it as the index.
//   increment        index covering the new sequences
//   sequence         the bytes of the new sequences; 0 bytes are treated as end markers
//   length           number of bytes in `sequence`; the last end marker is placed at length - 1
//   delete_sequence  if true, `sequence` is released with delete[]
// NOTE(review): brace, else and some preprocessor lines are missing from this excerpt, so
// the exact branch structure cannot be confirmed here.
181 RLCSABuilder::addRLCSA(RLCSA* increment, uchar* sequence, usint length, bool delete_sequence)
185 double start = readTimer();
187 usint sequences = increment->getNumberOfSequences();
188 usint* end_markers = new usint[sequences];
// Record the offset of every 0 byte before the last position.
190 for(usint i = 0; i < length - 1; i++)
192 if(sequence[i] == 0) { end_markers[curr++] = i; }
// The final marker is set explicitly to the last position.
// NOTE(review): no guard for sequences == 0 is visible before this write.
194 end_markers[sequences - 1] = length - 1;
// One position slot per input byte.
196 usint* positions = new usint[length]; usint begin;
197 #ifdef MULTITHREAD_SUPPORT
// Chunk size for dynamic scheduling: sequences / (8 * threads), but at least 1.
198 usint chunk = std::max((usint)1, sequences / (8 * this->threads));
199 omp_set_num_threads(this->threads);
// `begin` is private so that every thread works with its own copy.
200 #pragma omp parallel private(begin)
202 #pragma omp for schedule(dynamic, chunk)
203 for(sint i = 0; i < (sint)sequences; i++)
// Each sequence starts right after the previous end marker; the first starts at 0.
205 if(i > 0) { begin = end_markers[i - 1] + 1; } else { begin = 0; }
// Results go to the slice of `positions` that starts at the sequence's own offset.
206 this->index->reportPositions(sequence + begin, end_markers[i] - begin, positions + begin);
// The same loop again - presumably the variant compiled without multithread support,
// selected by a preprocessor line not visible in this excerpt.
210 for(sint i = 0; i < (sint)sequences; i++)
212 if(i > 0) { begin = end_markers[i - 1] + 1; } else { begin = 0; }
213 this->index->reportPositions(sequence + begin, end_markers[i] - begin, positions + begin);
216 delete[] end_markers;
217 if(delete_sequence) { delete[] sequence; }
// Everything up to this point is counted as search time.
219 double mark = readTimer();
220 this->search_time += mark - start;
222 #ifdef MULTITHREAD_SUPPORT
223 omp_set_num_threads(this->threads);
// Sort the positions, then shift the i-th one by i + 1.
225 std::sort(positions, positions + length);
226 for(usint i = 0; i < length; i++)
228 positions[i] += i + 1; // +1 because the insertion will be after positions[i]
231 double sort = readTimer();
232 this->sort_time += sort - mark;
// Build the merged index from the current index, the increment and the adjusted positions.
234 RLCSA* merged = new RLCSA(*(this->index), *increment, positions, this->block_size, this->threads);
238 this->index = merged;
240 this->merge_time += readTimer() - sort;
// Presumably the branch taken when there is no existing index: the increment simply
// becomes the index.
244 if(delete_sequence) { delete[] sequence; }
245 this->index = increment;
// The builder stays ok only while the resulting index reports ok.
248 this->ok &= this->index->isOk();
// Re-initializes the builder.
// NOTE(review): only the buffer allocation is visible in this excerpt; the function may
// set further members on lines not shown.
252 RLCSABuilder::reset()
// Allocate the character buffer only when a non-zero buffer size is configured.
256 if(this->buffer_size != 0)
258 this->buffer = new uchar[this->buffer_size];