43d5a7d981fd379d17e44caaeb4505518fc50422
[SXSI/TextCollection.git] / incbwt / rlcsa_builder.cpp
1 #include <iostream>
2
3 #include "rlcsa_builder.h"
4
5
6 namespace CSA
7 {
8
9
10 RLCSABuilder::RLCSABuilder(usint _block_size, usint _sample_rate, usint _buffer_size) :
11   block_size(_block_size), sample_rate(_sample_rate), buffer_size(_buffer_size),
12   buffer(0)
13 {
14   this->reset();
15 }
16
17 RLCSABuilder::~RLCSABuilder()
18 {
19   delete this->index;
20   delete[] this->buffer;
21 }
22
23 //--------------------------------------------------------------------------
24
25 void
26 RLCSABuilder::insertSequence(char* sequence, usint length, bool delete_sequence)
27 {
28   if(sequence == 0 || length == 0 || !this->ok)
29   {
30     if(delete_sequence) { delete[] sequence; }
31     return;
32   }
33
34   if(this->buffer == 0)
35   {
36     clock_t start = clock();
37     RLCSA* temp = new RLCSA((uchar*)sequence, length, this->block_size, this->sample_rate, false, false);
38     this->build_time += clock() - start;
39     this->addRLCSA(temp, (uchar*)sequence, length + 1, delete_sequence);
40     return;
41   }
42
43   if(this->buffer_size - this->chars > length)
44   {
45     memcpy(this->buffer + this->chars, sequence, length);
46     if(delete_sequence) { delete[] sequence; }
47     this->chars += length;
48     this->buffer[this->chars] = 0;
49     this->chars++;
50   }
51   else
52   {
53     this->flush();
54     this->buffer = new uchar[this->buffer_size];
55     if(length >= this->buffer_size - 1)
56     {
57       clock_t start = clock();
58       RLCSA* temp = new RLCSA((uchar*)sequence, length, this->block_size, this->sample_rate, false, false);
59       this->build_time += clock() - start;
60       this->addRLCSA(temp, (uchar*)sequence, length + 1, delete_sequence);
61     }
62     else
63     {
64       memcpy(this->buffer + this->chars, sequence, length);
65       if(delete_sequence) { delete[] sequence; }
66       this->chars += length;
67       this->buffer[this->chars] = 0;
68       this->chars++;
69     }
70   }
71 }
72
73 RLCSA*
74 RLCSABuilder::getRLCSA()
75 {
76   if(this->chars > 0) { this->flush(); }
77
78   RLCSA* temp = this->index;
79   this->reset();
80
81   return temp;
82 }
83
84 char*
85 RLCSABuilder::getBWT(usint& length)
86 {
87   if(this->chars > 0)
88   {
89     this->flush();
90     if(this->buffer_size > 0) { this->buffer = new uchar[this->buffer_size]; }
91   }
92
93   if(this->index == 0 || !(this->ok))
94   {
95     length = 0;
96     return 0;
97   }
98
99   length = this->index->getSize() + this->index->getNumberOfSequences();
100   return (char*)(this->index->readBWT());
101 }
102
103 bool
104 RLCSABuilder::isOk()
105 {
106   return this->ok;
107 }
108
109 double
110 RLCSABuilder::getBuildTime()
111 {
112   return this->build_time / (double)CLOCKS_PER_SEC;
113 }
114
115 double
116 RLCSABuilder::getSearchTime()
117 {
118   return this->search_time / (double)CLOCKS_PER_SEC;
119 }
120
121 double
122 RLCSABuilder::getMergeTime()
123 {
124   return this->merge_time / (double)CLOCKS_PER_SEC;
125 }
126
127 //--------------------------------------------------------------------------
128
129 void
130 RLCSABuilder::flush()
131 {
132   clock_t start = clock();
133   RLCSA* temp = new RLCSA(this->buffer, this->chars, this->block_size, this->sample_rate, true, (this->index == 0));
134   this->build_time += clock() - start;
135   this->addRLCSA(temp, this->buffer, this->chars, true);
136   this->buffer = 0; this->chars = 0;
137 }
138
139 void
140 RLCSABuilder::addRLCSA(RLCSA* increment, uchar* sequence, usint length, bool delete_sequence)
141 {
142   if(this->index != 0)
143   {
144     clock_t start = clock();
145
146     usint* positions = new usint[length];
147     usint begin = 0;
148     for(usint i = 0; i < length - 1; i++)
149     {
150       if(sequence[i] == 0)
151       {
152         this->index->reportPositions(&(sequence[begin]), i - begin, &(positions[begin]));
153         begin = i + 1;
154       }
155     }
156     this->index->reportPositions(&(sequence[begin]), length - 1 - begin, &(positions[begin]));
157
158     std::sort(positions, positions + length);
159     for(usint i = 0; i < length; i++)
160     {
161       positions[i] += i + 1;  // +1 because the insertion will be after positions[i]
162     }
163     if(delete_sequence) { delete[] sequence; }
164
165     clock_t mark = clock();
166     this->search_time += mark - start;
167
168     RLCSA* merged = new RLCSA(*(this->index), *increment, positions, this->block_size);
169     delete[] positions;
170     delete this->index;
171     delete increment;
172     this->index = merged;
173
174     this->merge_time += clock() - mark;  
175   }
176   else
177   {
178     this->index = increment;
179   }
180
181   this->ok &= this->index->isOk();
182 }
183
184 void
185 RLCSABuilder::reset()
186 {
187   this->index = 0;
188
189   if(this->buffer_size != 0)
190   {
191     this->buffer = new uchar[this->buffer_size];
192   }
193   this->chars = 0;
194
195   this->ok = true;
196 }
197
198
199 } // namespace CSA