Added RLCSA index option
[SXSI/TextCollection.git] / incbwt / bits / array.cpp
1 #include <cstdlib>
2
3 #include "array.h"
4
5
6 namespace CSA
7 {
8
9
10 Array::Array(std::ifstream& file) :
11   delete_array(true), item_index(0)
12 {
13   file.read((char*)&(this->items), sizeof(this->items));
14   file.read((char*)&(this->number_of_blocks), sizeof(this->number_of_blocks));
15   file.read((char*)&(this->block_size), sizeof(this->block_size));
16
17   this->array = new usint[this->block_size * this->number_of_blocks];
18   file.read((char*)(this->array), this->block_size * this->number_of_blocks * sizeof(usint));
19   this->buffer = new ReadBuffer(this->array, this->block_size);
20
21   this->integer_bits = length(this->items);
22   this->samples = new ReadBuffer(file, this->number_of_blocks + 1, this->integer_bits);
23
24   this->buildIndex();
25 }
26
27 Array::Array(ArrayEncoder& encoder) :
28   items(encoder.items),
29   delete_array(true),
30   block_size(encoder.block_size),
31   number_of_blocks(encoder.blocks),
32   item_index(0)
33 {
34   usint* array_buffer = new usint[this->block_size * this->number_of_blocks];
35   this->array = array_buffer;
36   this->buffer = new ReadBuffer(this->array, this->block_size);
37
38   this->integer_bits = length(this->items);
39   WriteBuffer sample_buffer(this->number_of_blocks + 1, this->integer_bits);
40
41   // Copy & linearize the array.
42   usint pos = 0;
43   for(std::list<usint*>::iterator iter = encoder.array_blocks.begin(); iter != encoder.array_blocks.end(); iter++)
44   {
45     memcpy(array_buffer + pos, *iter, encoder.superblock_bytes);
46     pos += encoder.block_size * encoder.blocks_in_superblock;
47   }
48   memcpy(array_buffer + pos, encoder.array, encoder.current_blocks * encoder.block_size * sizeof(usint));
49
50   // Compress the samples.
51   for(std::list<usint*>::iterator iter = encoder.sample_blocks.begin(); iter != encoder.sample_blocks.end(); iter++)
52   {
53     usint* buf = *iter;
54     for(usint i = 0; i < encoder.samples_in_superblock; i++)
55     {
56       sample_buffer.writeItem(buf[i]);
57     }
58   }
59   for(usint i = 0; i < encoder.current_samples; i++)
60   {
61     sample_buffer.writeItem(encoder.samples[i]);
62   }
63   sample_buffer.writeItem(this->items);
64
65   this->samples = sample_buffer.getReadBuffer();
66
67   this->buildIndex();
68 }
69
70 Array::~Array()
71 {
72   if(this->delete_array) { delete[] this->array; }
73   delete   this->buffer;
74   delete   this->samples;
75   delete   this->item_index;
76 }
77
78 //--------------------------------------------------------------------------
79
80 void
81 Array::writeTo(std::ofstream& file) const
82 {
83   file.write((char*)&(this->items), sizeof(this->items));
84   file.write((char*)&(this->number_of_blocks), sizeof(this->number_of_blocks));
85   file.write((char*)&(this->block_size), sizeof(this->block_size));
86   file.write((char*)(this->array), this->block_size * this->number_of_blocks * sizeof(usint));
87   this->samples->writeBuffer(file);
88 }
89
90 usint
91 Array::reportSize() const
92 {
93   usint bytes = sizeof(*this);
94   bytes += this->buffer->reportSize();
95   bytes += this->block_size * this->number_of_blocks * sizeof(usint);
96   bytes += this->samples->reportSize();
97   bytes += this->item_index->reportSize();
98   return bytes;
99 }
100
101 //--------------------------------------------------------------------------
102
103 void
104 Array::buildIndex()
105 {
106   delete this->item_index;
107
108   usint index_samples = (this->number_of_blocks + Array::INDEX_RATE - 1) / Array::INDEX_RATE;
109   this->index_rate = (this->items + index_samples - 1) / index_samples;
110   index_samples = (this->items + this->index_rate - 1) / this->index_rate + 1;
111   WriteBuffer index_buffer(index_samples, length(this->number_of_blocks - 1));
112
113   usint current = 0, pointer = 0;
114   this->samples->goToItem(1);
115   while(this->samples->hasNextItem())
116   {
117     usint limit = this->samples->readItem();
118     while(current < limit)
119     {
120       index_buffer.writeItem(pointer);
121       current += this->index_rate;
122     }
123     pointer++;
124   }
125   index_buffer.writeItem(this->number_of_blocks - 1);
126
127   this->item_index = index_buffer.getReadBuffer();
128 }
129
130 //--------------------------------------------------------------------------
131
132 Array::Iterator::Iterator(const Array& par) :
133   parent(par),
134   buffer(*(par.buffer)),
135   samples(*(par.samples))
136 {
137 }
138
139 Array::Iterator::~Iterator()
140 {
141 }
142
143 usint
144 Array::Iterator::getItem(usint index)
145 {
146   if(index >= this->parent.items) { return 0; }
147   this->getSample(this->sampleForIndex(index));
148
149   usint value = 0;
150   while(this->sample + this->cur <= index)
151   {
152     value = this->buffer.readDeltaCode();
153     this->cur++;
154   }
155
156   return value;
157 }
158
159 usint
160 Array::Iterator::nextItem()
161 {
162   if(this->cur >= this->block_items)
163   {
164     this->getSample(this->block + 1);
165   }
166
167   this->cur++;
168   return this->buffer.readDeltaCode();
169 }
170
171 usint
172 Array::Iterator::sampleForIndex(usint index)
173 {
174   usint low = this->parent.item_index->readItemConst(index / this->parent.index_rate);
175   usint high = this->parent.number_of_blocks - 1;
176
177   this->samples.goToItem(low + 1);
178   for(; low < high; low++)
179   {
180     if(this->samples.readItem() > index) { return low; }
181   }
182
183   return low;
184 }
185
186 //--------------------------------------------------------------------------
187
188 ArrayEncoder::ArrayEncoder(usint block_bytes, usint superblock_size) :
189   items(0), blocks(1),
190   block_size(BYTES_TO_WORDS(block_bytes)),
191   superblock_bytes(superblock_size)
192 {
193   this->array = new usint[this->superblock_bytes / sizeof(usint)];
194   memset(this->array, 0, this->superblock_bytes);
195   this->blocks_in_superblock = this->superblock_bytes / (sizeof(usint) * this->block_size);
196   this->current_blocks = 1;
197
198   this->samples = new usint[this->superblock_bytes / sizeof(usint)];
199   this->samples_in_superblock = this->superblock_bytes / sizeof(usint);
200   this->current_samples = 1;
201   this->samples[0] = 0;
202
203   this->buffer = new WriteBuffer(this->array, this->block_size);
204 }
205
206 ArrayEncoder::~ArrayEncoder()
207 {
208   delete[] this->array;
209
210   delete this->buffer;
211   for(std::list<usint*>::iterator iter = this->array_blocks.begin(); iter != this->array_blocks.end(); iter++)
212   {
213     delete[] *iter;
214   }
215
216   delete[] this->samples;
217   for(std::list<usint*>::iterator iter = this->sample_blocks.begin(); iter != this->sample_blocks.end(); iter++)
218   {
219     delete[] *iter;
220   }
221 }
222
223 void
224 ArrayEncoder::addItem(usint value)
225 {
226   this->items++;
227   if(this->buffer->writeDeltaCode(value)) { return; }
228
229   // Didn't fit into the block. A new block & sample required.
230   this->blocks++;
231   this->current_blocks++;
232   this->current_samples++;
233
234   // Do we need a new superblock for the block?
235   if(this->current_blocks > this->blocks_in_superblock)
236   {
237     this->array_blocks.push_back(this->array);
238     this->array = new usint[this->superblock_bytes / sizeof(usint)];
239     memset(this->array, 0, this->superblock_bytes);
240     this->current_blocks = 1;
241   }
242   this->buffer->moveBuffer(this->array + (this->block_size * (this->current_blocks - 1)));
243
244   // Do we need a new superblock for the sample?
245   if(this->current_samples > this->samples_in_superblock)
246   {
247     this->sample_blocks.push_back(this->samples);
248     this->samples = new usint[this->superblock_bytes / sizeof(usint)];
249     this->current_samples = 1;
250   }
251   this->samples[this->current_samples - 1] = this->items - 1;
252
253   // Finally, write the item.
254   this->buffer->writeDeltaCode(value);
255 }
256
257
258 } // namespace CSA