Debug swcsa
[SXSI/TextCollection.git] / incbwt / rlcsa_builder.cpp
1 #include <algorithm>
2 #include <cstdlib>
3 #include <cstring>
4 #include <iostream>
5
6 #include "rlcsa_builder.h"
7 #include "misc/utils.h"
8
9 #ifdef MULTITHREAD_SUPPORT
10 #include <omp.h>
11 #endif
12
13
14 namespace CSA
15 {
16
17
18 RLCSABuilder::RLCSABuilder(usint _block_size, usint _sample_rate, usint _buffer_size, usint _threads) :
19   block_size(_block_size), sample_rate(_sample_rate), buffer_size(_buffer_size),
20   threads(_threads),
21   buffer(0)
22 {
23   this->reset();
24   this->build_time = this->search_time = this->sort_time = this->merge_time = 0.0;
25 }
26
27 RLCSABuilder::~RLCSABuilder()
28 {
29   delete this->index;
30   delete[] this->buffer;
31 }
32
33 //--------------------------------------------------------------------------
34
35 void
36 RLCSABuilder::insertSequence(char* sequence, usint length, bool delete_sequence)
37 {
38   if(sequence == 0 || length == 0 || !this->ok)
39   {
40     if(delete_sequence) { delete[] sequence; }
41     return;
42   }
43
44   if(this->buffer == 0)
45   {
46     double start = readTimer();
47     RLCSA* temp = new RLCSA((uchar*)sequence, length, this->block_size, this->sample_rate, false, false);
48     this->build_time += readTimer() - start;
49     this->addRLCSA(temp, (uchar*)sequence, length + 1, delete_sequence);
50     return;
51   }
52
53   if(this->buffer_size - this->chars > length)
54   {
55     memcpy(this->buffer + this->chars, sequence, length);
56     if(delete_sequence) { delete[] sequence; }
57     this->chars += length;
58     this->buffer[this->chars] = 0;
59     this->chars++;
60   }
61   else
62   {
63     if(this->chars > 0)
64     {
65       this->flush();
66       this->buffer = new uchar[this->buffer_size];
67     }
68     if(length >= this->buffer_size - 1)
69     {
70       double start = readTimer();
71       RLCSA* temp = new RLCSA((uchar*)sequence, length, this->block_size, this->sample_rate, false, false);
72       this->build_time += readTimer() - start;
73       this->addRLCSA(temp, (uchar*)sequence, length + 1, delete_sequence);
74     }
75     else
76     {
77       memcpy(this->buffer + this->chars, sequence, length);
78       if(delete_sequence) { delete[] sequence; }
79       this->chars += length;
80       this->buffer[this->chars] = 0;
81       this->chars++;
82     }
83   }
84 }
85
86 void
87 RLCSABuilder::insertFromFile(const std::string& base_name)
88 {
89   if(!this->ok) { return; }
90
91   if(this->buffer != 0 && this->chars > 0)
92   {
93     this->flush();
94     this->buffer = new uchar[this->buffer_size];
95   }
96
97   std::ifstream input(base_name.c_str(), std::ios_base::binary);
98   if(!input) { return; }
99   RLCSA* increment = new RLCSA(base_name);
100   usint data_size = increment->getSize() + increment->getNumberOfSequences();
101   uchar* data = new uchar[data_size];
102   input.read((char*)data, data_size);
103   input.close();
104
105   this->addRLCSA(increment, data, data_size, true);
106 }
107
108 RLCSA*
109 RLCSABuilder::getRLCSA()
110 {
111   if(this->chars > 0) { this->flush(); }
112
113   RLCSA* temp = this->index;
114   this->reset();
115
116   return temp;
117 }
118
119 char*
120 RLCSABuilder::getBWT(usint& length)
121 {
122   if(this->chars > 0)
123   {
124     this->flush();
125     if(this->buffer_size > 0) { this->buffer = new uchar[this->buffer_size]; }
126   }
127
128   if(this->index == 0 || !(this->ok))
129   {
130     length = 0;
131     return 0;
132   }
133
134   length = this->index->getSize() + this->index->getNumberOfSequences();
135   return (char*)(this->index->readBWT());
136 }
137
138 bool
139 RLCSABuilder::isOk()
140 {
141   return this->ok;
142 }
143
144 double
145 RLCSABuilder::getBuildTime()
146 {
147   return this->build_time;
148 }
149
150 double
151 RLCSABuilder::getSearchTime()
152 {
153   return this->search_time;
154 }
155
156 double
157 RLCSABuilder::getSortTime()
158 {
159   return this->sort_time;
160 }
161
162 double
163 RLCSABuilder::getMergeTime()
164 {
165   return this->merge_time;
166 }
167
168 //--------------------------------------------------------------------------
169
170 void
171 RLCSABuilder::flush()
172 {
173   double start = readTimer();
174   RLCSA* temp = new RLCSA(this->buffer, this->chars, this->block_size, this->sample_rate, true, (this->index == 0));
175   this->build_time += readTimer() - start;
176   this->addRLCSA(temp, this->buffer, this->chars, (this->index != 0));
177   this->buffer = 0; this->chars = 0;
178 }
179
180 void
181 RLCSABuilder::addRLCSA(RLCSA* increment, uchar* sequence, usint length, bool delete_sequence)
182 {
183   if(this->index != 0)
184   {
185     double start = readTimer();
186
187     usint sequences = increment->getNumberOfSequences();
188     usint* end_markers = new usint[sequences];
189     usint curr = 0;
190     for(usint i = 0; i < length - 1; i++)
191     {
192       if(sequence[i] == 0) { end_markers[curr++] = i; }
193     }
194     end_markers[sequences - 1] = length - 1;
195
196     usint* positions = new usint[length]; usint begin;
197     #ifdef MULTITHREAD_SUPPORT
198     usint chunk = std::max((usint)1, sequences / (8 * this->threads));
199     omp_set_num_threads(this->threads);
200     #pragma omp parallel private(begin)
201     {
202       #pragma omp for schedule(dynamic, chunk)
203       for(sint i = 0; i < (sint)sequences; i++)
204       {
205         if(i > 0) { begin = end_markers[i - 1] + 1; } else { begin = 0; }
206         this->index->reportPositions(sequence + begin, end_markers[i] - begin, positions + begin);
207       }
208     }
209     #else
210     for(sint i = 0; i < (sint)sequences; i++)
211     {
212       if(i > 0) { begin = end_markers[i - 1] + 1; } else { begin = 0; }
213       this->index->reportPositions(sequence + begin, end_markers[i] - begin, positions + begin);
214     }
215     #endif
216     delete[] end_markers;
217     if(delete_sequence) { delete[] sequence; }
218
219     double mark = readTimer();
220     this->search_time += mark - start;
221
222     #ifdef MULTITHREAD_SUPPORT
223     omp_set_num_threads(this->threads);
224     #endif
225     std::sort(positions, positions + length);
226     for(usint i = 0; i < length; i++)
227     {
228       positions[i] += i + 1;  // +1 because the insertion will be after positions[i]
229     }
230
231     double sort = readTimer();
232     this->sort_time += sort - mark;
233
234     RLCSA* merged = new RLCSA(*(this->index), *increment, positions, this->block_size, this->threads);
235     delete[] positions;
236     delete this->index;
237     delete increment;
238     this->index = merged;
239
240     this->merge_time += readTimer() - sort;  
241   }
242   else
243   {
244     if(delete_sequence) { delete[] sequence; }
245     this->index = increment;
246   }
247
248   this->ok &= this->index->isOk();
249 }
250
251 void
252 RLCSABuilder::reset()
253 {
254   this->index = 0;
255
256   if(this->buffer_size != 0)
257   {
258     this->buffer = new uchar[this->buffer_size];
259   }
260   this->chars = 0;
261
262   this->ok = true;
263 }
264
265
266 } // namespace CSA