8 #include "misc/utils.h"
9 #include "qsufsort/qsufsort.h"
16 RLCSA::RLCSA(const std::string& base_name, bool print) :
21 for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; }
23 std::string array_name = base_name + ARRAY_EXTENSION;
24 std::ifstream array_file(array_name.c_str(), std::ios_base::binary);
27 std::cerr << "RLCSA: Error opening Psi array file!" << std::endl;
31 usint distribution[CHARS];
32 array_file.read((char*)distribution, CHARS * sizeof(usint));
33 this->buildCharIndexes(distribution);
35 Parameters parameters;
36 parameters.read(base_name + PARAMETERS_EXTENSION);
37 for(usint c = 0; c < CHARS; c++)
39 if(!isEmpty(this->index_ranges[c])) { this->array[c] = new RLEVector(array_file); }
42 this->end_points = new DeltaVector(array_file);
43 this->number_of_sequences = this->end_points->getNumberOfItems();
45 array_file.read((char*)&(this->sample_rate), sizeof(this->sample_rate));
48 this->support_locate = parameters.get(SUPPORT_LOCATE);
49 this->support_display = parameters.get(SUPPORT_DISPLAY);
51 if(this->support_locate || this->support_display)
53 std::string sa_sample_name = base_name + SA_SAMPLES_EXTENSION;
54 std::ifstream sa_sample_file(sa_sample_name.c_str(), std::ios_base::binary);
57 std::cerr << "RLCSA: Error opening suffix array sample file!" << std::endl;
60 this->sa_samples = new SASamples(sa_sample_file, this->sample_rate);
61 sa_sample_file.close();
64 if(print) { parameters.print(); }
69 RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, bool multiple_sequences, bool delete_data) :
72 sample_rate(sa_sample_rate),
75 for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; }
77 if(!data || bytes == 0)
79 std::cerr << "RLCSA: No input data given!" << std::endl;
80 if(delete_data) { delete[] data; }
83 if(block_size < 2 * sizeof(usint) || block_size % sizeof(usint) != 0)
85 std::cerr << "RLCSA: Block size must be a multiple of " << sizeof(usint) << " bytes!" << std::endl;
86 if(delete_data) { delete[] data; }
91 // Do we store SA samples?
92 if(this->sample_rate > 0)
94 this->support_locate = this->support_display = true;
98 this->support_locate = this->support_display = false;
99 this->sample_rate = 1;
103 // Determine the number of sequences and mark their end points.
104 if(multiple_sequences)
106 DeltaEncoder endings(RLCSA::ENDPOINT_BLOCK_SIZE);
107 this->number_of_sequences = 0;
109 usint padding = 0, chars_encountered = 0;
111 for(usint i = 0; i < bytes; i++)
115 if(i == marker) { break; } // Empty sequence.
116 this->number_of_sequences++;
119 usint pos = chars_encountered + padding - 1;
121 padding = ((pos + this->sample_rate) / this->sample_rate) * this->sample_rate - chars_encountered;
129 if(this->number_of_sequences == 0 || marker != bytes)
131 std::cerr << "RLCSA: Collection must consist of 0-terminated nonempty sequences!" << std::endl;
132 if(delete_data) { delete[] data; }
135 this->end_points = new DeltaVector(endings, chars_encountered + padding);
139 this->number_of_sequences = 1;
140 DeltaEncoder endings(RLCSA::ENDPOINT_BLOCK_SIZE, RLCSA::ENDPOINT_BLOCK_SIZE);
141 usint pos = bytes - 1;
143 usint padding = ((pos + this->sample_rate) / this->sample_rate) * this->sample_rate - bytes;
144 this->end_points = new DeltaVector(endings, bytes + padding);
148 // Build character tables etc.
149 usint distribution[CHARS];
150 sint low = CHARS, high = 0;
151 for(usint c = 0; c < CHARS; c++) { distribution[c] = 0; }
152 for(usint i = 0; i < bytes; i++)
154 if(data[i] < low) { low = data[i]; }
155 if(data[i] > high) { high = data[i]; }
156 distribution[(usint)data[i]]++;
158 if(multiple_sequences)
162 high = high + this->number_of_sequences;
164 this->buildCharIndexes(distribution);
167 // Build suffix array.
168 sint* inverse = new sint[bytes + 1];
169 if(multiple_sequences)
172 for(usint i = 0; i < bytes; i++)
181 inverse[i] = (sint)data[i] + this->number_of_sequences;
187 for(usint i = 0; i < bytes; i++) { inverse[i] = (sint)data[i]; }
189 if(delete_data) { delete[] data; }
190 sint* sa = new sint[bytes + 1];
191 suffixsort(inverse, sa, bytes, high + 1, low);
195 usint incr = (multiple_sequences ? this->number_of_sequences + 1 : 1); // No e_of_s markers in SA.
196 if(this->support_locate || this->support_display)
198 if(multiple_sequences)
200 std::cout << "We shouldn't be here!" << std::endl;
201 // Real SA starts at sa + incr.
203 // find starting position
204 // sample relative to starting position
205 // sort samples to SA order
209 this->sa_samples = new SASamples((usint*)(sa + incr), this->data_size, this->sample_rate);
215 for(usint i = 0; i <= bytes; i++)
217 sa[i] = inverse[(sa[i] + 1) % (bytes + 1)];
223 usint decr = (multiple_sequences ? 1 : 0); // Ignore the last e_of_s marker if multiple sequences.
224 for(usint c = 0; c < CHARS; c++)
226 if(distribution[c] == 0) { this->array[c] = 0; continue; }
228 usint* curr = (usint*)(sa + index_ranges[c].first + incr);
229 usint* limit = (usint*)(sa + index_ranges[c].second + incr + 1);
230 RLEEncoder encoder(block_size);
231 pair_type run(*curr, 1);
234 for(; curr < limit; curr++)
236 if(*curr == run.first + run.second) { run.second++; }
239 encoder.setRun(run.first - decr, run.second);
240 run = pair_type(*curr, 1);
243 encoder.setRun(run.first - decr, run.second);
245 this->array[c] = new RLEVector(encoder, this->data_size + incr - decr);
253 RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size) :
258 for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; }
260 if(!index.isOk() || !increment.isOk())
262 return; // Fail silently. Actual error has already been reported.
266 std::cerr << "RLCSA: Positions for insertions not available!" << std::endl;
269 if(index.sample_rate != increment.sample_rate)
271 std::cerr << "RLCSA: Cannot combine indexes with different sample rates!" << std::endl;
272 std::cout << "Index: " << index.sample_rate << ", increment: " << increment.sample_rate << std::endl;
276 if(index.sa_samples == 0 || increment.sa_samples == 0)
278 this->support_locate = this->support_display = false;
282 this->support_locate = this->support_display = true;
286 // Build character tables etc.
287 usint distribution[CHARS];
288 for(usint c = 0; c < CHARS; c++)
290 distribution[c] = length(index.index_ranges[c]) + length(increment.index_ranges[c]);
292 this->buildCharIndexes(distribution);
293 this->sample_rate = index.sample_rate;
296 // Combine end points of sequences.
297 this->number_of_sequences = index.number_of_sequences + increment.number_of_sequences;
298 DeltaEncoder* endings = new DeltaEncoder(RLCSA::ENDPOINT_BLOCK_SIZE);
300 endings->setBit(index.end_points->select(0));
301 for(usint i = 1; i < index.number_of_sequences; i++)
303 endings->setBit(index.end_points->selectNext());
305 usint sum = index.end_points->getSize();
306 delete index.end_points; index.end_points = 0;
308 endings->setBit(sum + increment.end_points->select(0));
309 for(usint i = 1; i < increment.number_of_sequences; i++)
311 endings->setBit(sum + increment.end_points->selectNext());
313 sum += increment.end_points->getSize();
314 delete increment.end_points; increment.end_points = 0;
316 this->end_points = new DeltaVector(*endings, sum);
321 usint psi_size = this->data_size + this->number_of_sequences;
322 for(usint c = 0; c < CHARS; c++)
324 if(distribution[c] == 0) { this->array[c] = 0; continue; }
325 this->array[c] = mergeVectors(index.array[c], increment.array[c], positions, increment.data_size + increment.number_of_sequences, psi_size, block_size);
327 increment.array[c] = 0;
329 if(this->array[c] == 0)
331 std::cerr << "RLCSA: Merge failed for vectors " << c << "!" << std::endl;
337 // Combine suffix array samples.
338 if(this->support_locate || this->support_display)
340 positions += increment.number_of_sequences;
341 for(usint i = 0; i < increment.data_size; i++)
343 positions[i] -= this->number_of_sequences;
345 this->sa_samples = new SASamples(*(index.sa_samples), *(increment.sa_samples), positions, increment.data_size);
353 for(usint c = 0; c < CHARS; c++) { delete this->array[c]; }
354 delete this->sa_samples;
355 delete this->end_points;
359 RLCSA::writeTo(const std::string& base_name)
361 std::string array_name = base_name + ARRAY_EXTENSION;
362 std::ofstream array_file(array_name.c_str(), std::ios_base::binary);
365 std::cerr << "RLCSA: Error creating Psi array file!" << std::endl;
369 usint distribution[CHARS];
370 for(usint c = 0; c < CHARS; c++)
372 distribution[c] = length(this->index_ranges[c]);
374 array_file.write((char*)distribution, CHARS * sizeof(usint));
375 for(usint c = 0; c < CHARS; c++)
377 if(this->array[c] != 0)
379 this->array[c]->writeTo(array_file);
383 this->end_points->writeTo(array_file);
384 array_file.write((char*)&(this->sample_rate), sizeof(this->sample_rate));
387 if(this->support_locate || this->support_display)
389 std::string sa_sample_name = base_name + SA_SAMPLES_EXTENSION;
390 std::ofstream sa_sample_file(sa_sample_name.c_str(), std::ios_base::binary);
393 std::cerr << "RLCSA: Error creating suffix array sample file!" << std::endl;
397 this->sa_samples->writeTo(sa_sample_file);
398 sa_sample_file.close();
408 //--------------------------------------------------------------------------
411 RLCSA::count(const std::string& pattern)
413 if(pattern.length() == 0) { return pair_type(0, this->data_size - 1); }
415 pair_type index_range = this->index_ranges[(usint)*(pattern.rbegin())];
416 if(isEmpty(index_range)) { return index_range; }
417 index_range.first += this->number_of_sequences;
418 index_range.second += this->number_of_sequences;
420 for(std::string::const_reverse_iterator iter = ++pattern.rbegin(); iter != pattern.rend(); iter++)
422 RLEVector* vector = this->array[(usint)*iter];
423 usint start = this->index_ranges[(usint)*iter].first;
425 index_range.first = start + vector->rank(index_range.first, true) - 1 + this->number_of_sequences;
426 index_range.second = start + vector->rank(index_range.second) - 1 + this->number_of_sequences;
428 if(isEmpty(index_range)) { return index_range; }
431 // Suffix array indexes are 0-based.
432 index_range.first -= this->number_of_sequences;
433 index_range.second -= this->number_of_sequences;
439 RLCSA::reportPositions(uchar* data, usint length, usint* positions)
441 if(data == 0 || length == 0 || positions == 0) { return; }
443 usint current = this->number_of_sequences - 1;
444 positions[length] = current; // "immediately after current"
445 for(sint i = (sint)(length - 1); i >= 0; i--)
447 // positions[i] = current; // "immediately after current"
448 usint c = (usint)data[i];
451 current = this->index_ranges[c].first + this->array[c]->rank(current) - 1 + this->number_of_sequences;
455 if(c < this->text_chars[0]) // No previous characters either.
457 current = this->number_of_sequences - 1;
461 current = this->index_ranges[c].first - 1 + this->number_of_sequences;
464 positions[i] = current; // "immediately after current"
468 //--------------------------------------------------------------------------
471 RLCSA::locate(pair_type range)
473 if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size) { return 0; }
475 LocateItem* data = new LocateItem[range.second + 1 - range.first];
476 if(!data) { return 0; }
477 this->locateUnsafe(range, data);
483 RLCSA::locate(pair_type range, LocateItem* data)
485 if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size || data == 0) { return 0; }
486 this->locateUnsafe(range, data);
491 RLCSA::locateUnsafe(pair_type range, LocateItem* data)
493 usint items = range.second + 1 - range.first;
494 for(usint i = 0, j = range.first; i < items; i++, j++)
496 data[i].value = j + this->number_of_sequences;
498 data[i].found = false;
505 pair_type run = EMPTY_PAIR;
506 for(usint i = 0; i < items; i++)
510 continue; // The run might continue after this.
512 else if(isEmpty(run))
514 run = pair_type(i, i);
516 else if(data[i].value - data[run.first].value == i - run.first)
522 found &= this->processRun(run, data);
523 run = pair_type(i, i);
526 if(!isEmpty(run)) { found &= this->processRun(run, data); }
531 RLCSA::processRun(pair_type run, LocateItem* data)
534 usint run_start = 0, run_left = 0;
535 pair_type next_sample = pair_type(0, 0);
537 for(usint i = run.first; i <= run.second; i++)
541 if(run_left > 0) { run_left--; }
544 if(data[i].value < this->number_of_sequences) // Implicit sample here.
546 data[i].value = this->end_points->select(data[i].value) + 1 - data[i].offset;
547 data[i].found = true;
548 if(run_left > 0) { run_left--; }
551 if(next_sample.first < data[i].value) // Need another sample.
553 next_sample = this->sa_samples->getFirstSampleAfter(data[i].value - this->number_of_sequences);
554 next_sample.first += this->number_of_sequences;
556 if(data[i].value < next_sample.first) // No sample found for current position.
560 data[i].value = data[run_start].value + i - run_start;
565 pair_type value = this->psi(data[i].value - this->number_of_sequences, run.second - i);
566 data[i].value = value.first;
567 run_left = value.second;
573 else // Sampled position found.
575 data[i].value = this->sa_samples->getSample(next_sample.second) - data[i].offset;
576 data[i].found = true;
577 if(run_left > 0) { run_left--; }
584 RLCSA::locate(usint index)
586 if(!(this->support_locate) || index >= this->data_size) { return this->data_size; }
589 index += this->number_of_sequences;
592 if(index < this->number_of_sequences) // Implicit sample here
594 return this->end_points->select(index) + 1 - offset;
596 pair_type next_sample = this->sa_samples->getFirstSampleAfter(index - this->number_of_sequences);
597 next_sample.first += this->number_of_sequences;
598 if(next_sample.first == index)
600 return this->sa_samples->getSample(next_sample.second) - offset;
602 index = this->psi(index - this->number_of_sequences);
607 //--------------------------------------------------------------------------
610 RLCSA::display(usint sequence, pair_type range)
612 if(!(this->support_display) || isEmpty(range)) { return 0; }
614 pair_type seq_range = this->getSequence(sequence);
615 if(isEmpty(seq_range)) { return 0; }
617 range.first += seq_range.first; range.second += seq_range.first;
618 if(range.second > seq_range.second) { return 0; }
620 uchar* data = new uchar[range.second + 1 - range.first];
621 if(!data) { return 0; }
622 this->displayUnsafe(range, data);
628 RLCSA::display(usint sequence, pair_type range, uchar* data)
630 if(!(this->support_display) || isEmpty(range) || data == 0) { return 0; }
632 pair_type seq_range = this->getSequence(sequence);
633 if(isEmpty(seq_range)) { return 0; }
635 range.first += seq_range.first; range.second += seq_range.first;
636 if(range.second > seq_range.second) { return 0; }
638 this->displayUnsafe(range, data);
643 RLCSA::displayUnsafe(pair_type range, uchar* data)
645 usint i = range.first - range.first % this->sa_samples->getSampleRate();
647 usint pos = this->sa_samples->inverseSA(i);
649 for(; i < range.first; i++)
651 pos = this->psi(pos) - this->number_of_sequences;
653 for(; i <= range.second; i++)
655 data[i - range.first] = this->getCharacter(pos);
656 pos = this->psi(pos) - this->number_of_sequences;
660 //--------------------------------------------------------------------------
663 RLCSA::decompressInto(std::ofstream& psi_file)
665 for(usint c = 0; c < CHARS; c++)
667 if(!(this->array[c])) { continue; }
668 usint value = this->array[c]->select(0);
669 psi_file.write((char*)&(value), sizeof(value));
670 while(this->array[c]->hasNext())
672 value = this->array[c]->selectNext();
673 psi_file.write((char*)&value, sizeof(value));
681 usint n = this->data_size + this->number_of_sequences;
683 uchar* bwt = new uchar[n];
686 for(usint c = 0; c < CHARS; c++)
688 RLEVector* vector = this->array[c];
691 bwt[vector->select(0)] = c;
692 while(vector->hasNext()) { bwt[vector->selectNext()] = c; }
699 //--------------------------------------------------------------------------
702 RLCSA::psi(usint index)
704 if(index >= this->data_size)
706 return this->data_size + this->number_of_sequences;
709 usint c = this->getCharacter(index);
710 return this->array[c]->select(index - this->index_ranges[c].first);
714 RLCSA::psi(usint index, usint max_length)
716 if(index >= this->data_size)
718 return pair_type(this->data_size + this->number_of_sequences, 0);
721 usint c = this->getCharacter(index);
722 return this->array[c]->selectRun(index - this->index_ranges[c].first, max_length);
725 //--------------------------------------------------------------------------
728 RLCSA::getSequence(usint number)
730 if(number >= this->number_of_sequences) { return EMPTY_PAIR; }
736 result.second = this->end_points->select(number);
740 if(this->sa_samples == 0) { return EMPTY_PAIR; }
741 usint d = this->sa_samples->getSampleRate();
742 result.first = d * ((this->end_points->select(number - 1) / d) + 1);
743 result.second = this->end_points->selectNext();
750 RLCSA::reportSize(bool print)
752 usint bytes = 0, temp = 0;
754 for(usint c = 0; c < CHARS; c++)
756 if(this->array[c]) { temp += this->array[c]->reportSize(); }
758 if(print) { std::cout << "Psi array: " << temp << std::endl; }
761 if(this->support_locate || this->support_display)
763 temp = this->sa_samples->reportSize();
764 if(print) { std::cout << "SA samples: " << temp << std::endl; }
768 temp = sizeof(*this) + this->end_points->reportSize();
769 if(print) { std::cout << "RLCSA overhead: " << temp << std::endl; }
774 std::cout << "Total size: " << bytes << std::endl;
775 std::cout << std::endl;
781 //--------------------------------------------------------------------------
784 RLCSA::buildCharIndexes(usint* distribution)
786 this->data_size = buildRanges(distribution, this->index_ranges);
789 for(; c < CHARS; c++)
791 if(!isEmpty(this->index_ranges[c]))
793 this->text_chars[i] = c;
799 this->index_rate = std::max((this->data_size + CHARS - 1) / CHARS, (usint)1);
802 for(c = 0, i = 0; c < this->chars; c++)
804 pair_type range = this->index_ranges[this->text_chars[c]];
805 while(current <= range.second)
807 this->index_pointers[i] = c;
808 current += this->index_rate;
815 buildRanges(usint* distribution, pair_type* index_ranges)
817 if(distribution == 0 || index_ranges == 0) { return 0; }
820 for(usint c = 0; c < CHARS; c++)
822 if(distribution[c] == 0)
824 if(sum == 0) { index_ranges[c] = EMPTY_PAIR; }
825 else { index_ranges[c] = pair_type(sum, sum - 1); }
829 index_ranges[c].first = sum;
830 sum += distribution[c];
831 index_ranges[c].second = sum - 1;