Added RLCSA index option
[SXSI/TextCollection.git] / incbwt / sasamples.cpp
1 #include <algorithm>
2 #include <fstream>
3 #include <iostream>
4 #include <vector>
5
6 #include "sasamples.h"
7 #include "misc/utils.h"
8
9 #ifdef MULTITHREAD_SUPPORT
10 #ifndef _GLIBCXX_PARALLEL
11 #include <mcstl.h>
12 #endif
13 #endif
14
15
16 namespace CSA
17 {
18
19
20 SASamples::SASamples(std::ifstream& sample_file, usint sample_rate) :
21   rate(sample_rate)
22 {
23   this->indexes = new DeltaVector(sample_file);
24
25   this->size = indexes->getSize();
26   this->items = indexes->getNumberOfItems();
27   this->integer_bits = length(this->items - 1);
28
29   this->samples = new ReadBuffer(sample_file, this->items, this->integer_bits);
30   this->buildInverseSamples();
31 }
32
33 SASamples::SASamples(uint* array, usint data_size, usint sample_rate) :
34   rate(sample_rate),
35   size(data_size)
36 {
37   this->items = (this->size - 1) / this->rate + 1;
38   this->integer_bits = length(this->items - 1);
39
40   WriteBuffer sample_buffer(this->items, this->integer_bits);
41   DeltaEncoder encoder(INDEX_BLOCK_SIZE);
42   for(usint i = 0; i < this->size; i++)
43   {
44     if(array[i] % sample_rate == 0)
45     {
46       encoder.setBit(i);
47       sample_buffer.writeItem(array[i] / sample_rate);
48     }
49   }
50
51   this->indexes = new DeltaVector(encoder, this->size);
52   this->samples = sample_buffer.getReadBuffer();
53   this->buildInverseSamples();
54 }
55
56 SASamples::SASamples(uint* inverse, DeltaVector* end_points, usint data_size, usint sample_rate) :
57   rate(sample_rate),
58   items(0)
59 {
60   usint sequences = end_points->getNumberOfItems();
61
62   DeltaVector::Iterator iter(*(end_points));
63
64   // Determine the samples, insert them into a vector, and sort them.
65   usint start = 0, end = iter.select(0);  // Closed range in padded collection.
66   usint seq_start = 0, seq_end = end;            // Closed range in inverse SA.
67   std::vector<pair_type>* vec = new std::vector<pair_type>();
68   for(usint i = 0; i < sequences; i++)
69   {
70     for(usint j = seq_start; j <= seq_end; j += this->rate)
71     {
72       vec->push_back(pair_type(inverse[j] - sequences - 1, this->items));
73       this->items++;
74     }
75     start = nextMultipleOf(this->rate, end);
76     end = iter.selectNext();
77     seq_start = seq_end + 2;
78     seq_end = seq_start + end - start;
79   }
80   #ifdef MULTITHREAD_SUPPORT
81     #ifdef _GLIBCXX_PARALLEL
82       std::sort(vec->begin(), vec->end(), __gnu_parallel::sequential_tag());
83     #else
84       std::sort(vec->begin(), vec->end(), mcstl::sequential_tag());
85     #endif
86   #else
87     std::sort(vec->begin(), vec->end());
88   #endif
89
90   // Compress the samples.
91   this->integer_bits = length(this->items - 1);
92   this->size = end + 1;
93   WriteBuffer sample_buffer(this->items, this->integer_bits);
94   DeltaEncoder encoder(INDEX_BLOCK_SIZE);
95   for(usint i = 0; i < this->items; i++)
96   {
97     pair_type sample = (*vec)[i];
98     encoder.setBit(sample.first);
99     sample_buffer.writeItem(sample.second);
100   }
101   delete vec;
102
103   this->indexes = new DeltaVector(encoder, this->size);
104   this->samples = sample_buffer.getReadBuffer();
105   this->buildInverseSamples();
106 }
107
108 SASamples::SASamples(SASamples& index, SASamples& increment, usint* positions, usint number_of_positions, usint number_of_sequences) :
109   rate(index.rate),
110   size(index.size + increment.size),
111   items(index.items + increment.items)
112 {
113   this->mergeSamples(index, increment, positions, number_of_positions, number_of_sequences);
114
115   index.indexes = 0;
116   index.samples = 0;
117   index.inverse_samples = 0;
118   increment.indexes = 0;
119   increment.samples = 0;
120   increment.inverse_samples = 0;
121
122   this->buildInverseSamples();
123 }
124
125 SASamples::~SASamples()
126 {
127   delete this->indexes;
128   delete this->samples;
129   delete this->inverse_samples;
130 }
131
132 //--------------------------------------------------------------------------
133
134 void
135 SASamples::writeTo(std::ofstream& sample_file) const
136 {
137   this->indexes->writeTo(sample_file);
138   this->samples->writeBuffer(sample_file);
139 }
140
141 usint
142 SASamples::reportSize() const
143 {
144   usint bytes = sizeof(*this);
145   bytes += this->indexes->reportSize();
146   bytes += this->samples->reportSize();
147   bytes += this->inverse_samples->reportSize();
148   return bytes;
149 }
150
151 //--------------------------------------------------------------------------
152
153 void
154 SASamples::buildInverseSamples()
155 {
156   WriteBuffer inverse_buffer(this->items, this->integer_bits);
157   this->samples->goToItem(0);
158   for(usint i = 0; i < this->items; i++)
159   {
160     inverse_buffer.goToItem(this->samples->readItem());
161     inverse_buffer.writeItem(i);
162   }
163
164   this->inverse_samples = inverse_buffer.getReadBuffer();
165 }
166
167
168 //--------------------------------------------------------------------------
169
170 void
171 SASamples::mergeSamples(SASamples& index, SASamples& increment, usint* positions, usint n, usint skip)
172 {
173   DeltaVector::Iterator first(*(index.indexes));
174   DeltaVector::Iterator second(*(increment.indexes));
175   ReadBuffer* first_samples = index.samples;
176   ReadBuffer* second_samples = increment.samples;
177
178   usint first_bit = first.select(0);
179   bool first_finished = false;
180   usint second_bit = second.select(0);
181   usint sum = index.items;
182   first_samples->goToItem(0);
183   second_samples->goToItem(0);
184
185   DeltaEncoder encoder(INDEX_BLOCK_SIZE);
186   this->integer_bits = length(this->items - 1);
187   WriteBuffer sample_buffer(this->items, this->integer_bits);
188   for(usint i = 0; i < n; i++, first_bit++)
189   {
190     while(!first_finished && first_bit < positions[i] - skip)
191     {
192       encoder.setBit(first_bit);
193       sample_buffer.writeItem(first_samples->readItem());
194       if(first.hasNext())
195       {
196         first_bit = first.selectNext() + i;
197       }
198       else
199       {
200         first_finished = true;
201       }
202     }
203
204     if(i == second_bit) // positions[i] is one
205     {
206       encoder.setBit(positions[i] - skip);
207       sample_buffer.writeItem(second_samples->readItem() + sum);
208       second_bit = second.selectNext();
209     }
210   }
211
212   while(!first_finished)
213   {
214     encoder.setBit(first_bit);
215     sample_buffer.writeItem(first_samples->readItem());
216     if(!first.hasNext()) { break; }
217     first_bit = first.selectNext() + n;
218   }
219
220   delete index.indexes;
221   delete index.samples;
222   delete index.inverse_samples;
223   delete increment.indexes;
224   delete increment.samples;
225   delete increment.inverse_samples;
226
227   this->indexes = new DeltaVector(encoder, size);
228   this->samples = sample_buffer.getReadBuffer();
229 }
230
231
232 } // namespace CSA