9 #include "misc/utils.h"
10 #include "qsufsort/qsufsort.h"
11 #include "bits/vectors.h"
14 #ifdef MULTITHREAD_SUPPORT
24 RLCSA::RLCSA(const std::string& base_name, bool print) :
29 for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; }
31 std::string array_name = base_name + ARRAY_EXTENSION;
32 std::ifstream array_file(array_name.c_str(), std::ios_base::binary);
35 std::cerr << "RLCSA: Error opening Psi array file!" << std::endl;
39 usint distribution[CHARS];
40 array_file.read((char*)distribution, CHARS * sizeof(usint));
41 this->buildCharIndexes(distribution);
43 // Parameters parameters;
44 // parameters.read(base_name + PARAMETERS_EXTENSION);
45 for(usint c = 0; c < CHARS; c++)
47 if(!isEmpty(this->index_ranges[c])) { this->array[c] = new PsiVector(array_file); }
50 this->end_points = new DeltaVector(array_file);
51 this->number_of_sequences = this->end_points->getNumberOfItems();
53 array_file.read((char*)&(this->sample_rate), sizeof(this->sample_rate));
56 this->support_locate = true;// parameters.get(SUPPORT_LOCATE);
57 this->support_display = true; //parameters.get(SUPPORT_DISPLAY);
59 if(this->support_locate || this->support_display)
61 std::string sa_sample_name = base_name + SA_SAMPLES_EXTENSION;
62 std::ifstream sa_sample_file(sa_sample_name.c_str(), std::ios_base::binary);
65 std::cerr << "RLCSA: Error opening suffix array sample file!" << std::endl;
68 this->sa_samples = new SASamples(sa_sample_file, this->sample_rate);
69 sa_sample_file.close();
72 // if(print) { parameters.print(); }
77 RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, bool multiple_sequences, bool delete_data) :
80 sample_rate(sa_sample_rate),
83 for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; }
85 if(!data || bytes == 0)
87 std::cerr << "RLCSA: No input data given!" << std::endl;
88 if(delete_data) { delete[] data; }
91 if(block_size < 2 * sizeof(usint) || block_size % sizeof(usint) != 0)
93 std::cerr << "RLCSA: Block size must be a multiple of " << sizeof(usint) << " bytes!" << std::endl;
94 if(delete_data) { delete[] data; }
99 // Do we store SA samples?
100 if(this->sample_rate > 0)
102 this->support_locate = this->support_display = true;
106 this->support_locate = this->support_display = false;
107 this->sample_rate = 1;
111 // Determine the number of sequences and mark their end points.
112 if(multiple_sequences)
114 DeltaEncoder endings(RLCSA::ENDPOINT_BLOCK_SIZE);
115 this->number_of_sequences = 0;
117 usint padding = 0, chars_encountered = 0;
119 for(usint i = 0; i < bytes; i++)
123 if(i == marker) { break; } // Empty sequence.
124 this->number_of_sequences++;
127 usint pos = chars_encountered + padding - 1;
129 padding = ((pos + this->sample_rate) / this->sample_rate) * this->sample_rate - chars_encountered;
137 if(this->number_of_sequences == 0 || marker != bytes)
139 std::cerr << "RLCSA: Collection must consist of 0-terminated nonempty sequences!" << std::endl;
140 if(delete_data) { delete[] data; }
143 this->end_points = new DeltaVector(endings, chars_encountered + padding);
147 this->number_of_sequences = 1;
148 DeltaEncoder endings(RLCSA::ENDPOINT_BLOCK_SIZE, RLCSA::ENDPOINT_BLOCK_SIZE);
149 usint pos = bytes - 1;
151 usint padding = ((pos + this->sample_rate) / this->sample_rate) * this->sample_rate - bytes;
152 this->end_points = new DeltaVector(endings, bytes + padding);
156 // Build character tables etc.
157 usint distribution[CHARS];
158 sint low = CHARS, high = 0;
159 for(usint c = 0; c < CHARS; c++) { distribution[c] = 0; }
160 for(usint i = 0; i < bytes; i++)
162 if(data[i] < low) { low = data[i]; }
163 if(data[i] > high) { high = data[i]; }
164 distribution[(usint)data[i]]++;
166 if(multiple_sequences)
170 high = high + this->number_of_sequences;
172 this->buildCharIndexes(distribution);
175 // Build suffix array.
176 int* inverse = new int[bytes + 1];
177 if(multiple_sequences)
180 for(usint i = 0; i < bytes; i++)
189 inverse[i] = (int)(data[i] + this->number_of_sequences);
195 for(usint i = 0; i < bytes; i++) { inverse[i] = (int)data[i]; }
197 if(delete_data) { delete[] data; }
198 int* sa = new int[bytes + 1];
199 suffixsort(inverse, sa, bytes, high + 1, low);
203 if(this->support_locate || this->support_display)
205 if(multiple_sequences)
207 this->sa_samples = new SASamples((uint*)inverse, this->end_points, this->data_size, this->sample_rate);
211 this->sa_samples = new SASamples((uint*)(sa + 1), this->data_size, this->sample_rate);
217 for(usint i = 0; i <= bytes; i++)
219 sa[i] = inverse[(sa[i] + 1) % (bytes + 1)];
225 usint incr = (multiple_sequences ? this->number_of_sequences + 1 : 1); // No e_of_s markers in SA.
226 usint decr = (multiple_sequences ? 1 : 0); // Ignore the last e_of_s marker if multiple sequences.
227 for(usint c = 0; c < CHARS; c++)
229 if(distribution[c] == 0) { this->array[c] = 0; continue; }
231 uint* curr = (uint*)(sa + index_ranges[c].first + incr);
232 uint* limit = (uint*)(sa + index_ranges[c].second + incr + 1);
233 PsiVector::Encoder encoder(block_size);
234 pair_type run(*curr, 1);
237 for(; curr < limit; curr++)
239 if(*curr == run.first + run.second) { run.second++; }
242 encoder.setRun(run.first - decr, run.second);
243 run = pair_type(*curr, 1);
246 encoder.setRun(run.first - decr, run.second);
248 this->array[c] = new PsiVector(encoder, this->data_size + incr - decr);
256 RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size, usint threads) :
261 for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; }
263 if(!index.isOk() || !increment.isOk())
265 return; // Fail silently. Actual error has already been reported.
269 std::cerr << "RLCSA: Positions for insertions not available!" << std::endl;
272 if(index.sample_rate != increment.sample_rate)
274 std::cerr << "RLCSA: Cannot combine indexes with different sample rates!" << std::endl;
278 if(index.sa_samples == 0 || increment.sa_samples == 0)
280 this->support_locate = this->support_display = false;
284 this->support_locate = this->support_display = true;
288 // Build character tables etc.
289 usint distribution[CHARS];
290 for(usint c = 0; c < CHARS; c++)
292 distribution[c] = length(index.index_ranges[c]) + length(increment.index_ranges[c]);
294 this->buildCharIndexes(distribution);
295 this->sample_rate = index.sample_rate;
296 this->number_of_sequences = index.number_of_sequences + increment.number_of_sequences;
298 // Merge end points, SA samples, and Psi.
299 usint psi_size = this->data_size + this->number_of_sequences;
300 bool should_be_ok = true;
302 #ifdef MULTITHREAD_SUPPORT
303 omp_set_num_threads(threads);
306 #pragma omp for schedule(dynamic, 1)
308 for(int c = -2; c < (int)CHARS; c++)
310 if(c == -2) { this->mergeEndPoints(index, increment); }
311 else if(c == -1) { this->mergeSamples(index, increment, positions); }
312 else if(distribution[c] != 0)
314 this->array[c] = mergeVectors<PsiVector, PsiVector::Encoder, PsiVector::Iterator>(index.array[c], increment.array[c], positions, increment.data_size + increment.number_of_sequences, psi_size, block_size);
316 increment.array[c] = 0;
318 if(this->array[c] == 0)
320 std::cerr << "RLCSA: Merge failed for vectors " << c << "!" << std::endl;
321 should_be_ok = false;
325 #ifdef MULTITHREAD_SUPPORT
329 this->ok = should_be_ok;
334 for(usint c = 0; c < CHARS; c++) { delete this->array[c]; }
335 delete this->sa_samples;
336 delete this->end_points;
339 //--------------------------------------------------------------------------
342 RLCSA::writeTo(const std::string& base_name) const
344 std::string array_name = base_name + ARRAY_EXTENSION;
345 std::ofstream array_file(array_name.c_str(), std::ios_base::binary);
348 std::cerr << "RLCSA: Error creating Psi array file!" << std::endl;
352 usint distribution[CHARS];
353 for(usint c = 0; c < CHARS; c++)
355 distribution[c] = length(this->index_ranges[c]);
357 array_file.write((char*)distribution, CHARS * sizeof(usint));
358 for(usint c = 0; c < CHARS; c++)
360 if(this->array[c] != 0)
362 this->array[c]->writeTo(array_file);
366 this->end_points->writeTo(array_file);
367 array_file.write((char*)&(this->sample_rate), sizeof(this->sample_rate));
370 if(this->support_locate || this->support_display)
372 std::string sa_sample_name = base_name + SA_SAMPLES_EXTENSION;
373 std::ofstream sa_sample_file(sa_sample_name.c_str(), std::ios_base::binary);
376 std::cerr << "RLCSA: Error creating suffix array sample file!" << std::endl;
380 this->sa_samples->writeTo(sa_sample_file);
381 sa_sample_file.close();
391 //--------------------------------------------------------------------------
394 RLCSA::count(const std::string& pattern) const
396 if(pattern.length() == 0) { return pair_type(0, this->data_size - 1); }
398 std::string::const_reverse_iterator iter = pattern.rbegin();
400 pair_type index_range = this->index_ranges[(uchar)*iter];
401 if(isEmpty(index_range)) { return EMPTY_PAIR; }
402 index_range.first += this->number_of_sequences;
403 index_range.second += this->number_of_sequences;
405 for(++iter; iter != pattern.rend(); ++iter)
407 index_range = this->backwardSearchStep(index_range, (uchar)*iter);
408 if(isEmpty(index_range)) { return EMPTY_PAIR; }
411 // Suffix array indexes are 0-based.
412 index_range.first -= this->number_of_sequences;
413 index_range.second -= this->number_of_sequences;
418 //--------------------------------------------------------------------------
421 RLCSA::reportPositions(uchar* data, usint length, usint* positions) const
423 if(data == 0 || length == 0 || positions == 0) { return; }
425 PsiVector::Iterator** iters = this->getIterators();
427 usint current = this->number_of_sequences - 1;
428 positions[length] = current; // "immediately after current"
429 for(sint i = (sint)(length - 1); i >= 0; i--)
431 usint c = (usint)data[i];
432 if(this->array[c] != 0)
434 current = this->LF(current, c, *(iters[c]));
438 if(c < this->text_chars[0]) // No previous characters either.
440 current = this->number_of_sequences - 1;
444 current = this->index_ranges[c].first - 1 + this->number_of_sequences;
447 positions[i] = current; // "immediately after current"
450 this->deleteIterators(iters);
453 //--------------------------------------------------------------------------
456 RLCSA::locate(pair_type range) const
458 if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size) { return 0; }
460 usint* data = new usint[length(range)];
461 this->locateUnsafe(range, data);
467 RLCSA::locate(pair_type range, usint* data) const
469 if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size || data == 0) { return 0; }
470 this->locateUnsafe(range, data);
475 RLCSA::locateUnsafe(pair_type range, usint* data) const
477 usint items = length(range);
478 usint* offsets = new usint[items];
479 bool* finished = new bool[items]; // FIXME This could be more space efficient...
481 PsiVector::Iterator** iters = this->getIterators();
483 for(usint i = 0, j = range.first; i < items; i++, j++)
485 data[i] = j + this->number_of_sequences;
494 pair_type run = EMPTY_PAIR;
495 for(usint i = 0; i < items; i++)
499 continue; // The run might continue after this.
501 else if(isEmpty(run))
503 run = pair_type(i, i);
505 else if(data[i] - data[run.first] == i - run.first)
511 found &= this->processRun(run, data, offsets, finished, iters);
512 run = pair_type(i, i);
515 if(!isEmpty(run)) { found &= this->processRun(run, data, offsets, finished, iters); }
518 this->deleteIterators(iters);
524 RLCSA::processRun(pair_type run, usint* data, usint* offsets, bool* finished, PsiVector::Iterator** iters) const
527 usint run_start = 0, run_left = 0;
528 pair_type next_sample = pair_type(0, 0);
530 for(usint i = run.first; i <= run.second; i++)
534 if(run_left > 0) { run_left--; }
537 if(data[i] < this->number_of_sequences) // Implicit sample here.
539 DeltaVector::Iterator iter(*(this->end_points));
540 data[i] = iter.select(data[i]) + 1 - offsets[i];
542 if(run_left > 0) { run_left--; }
545 if(next_sample.first < data[i]) // Need another sample.
547 next_sample = this->sa_samples->getFirstSampleAfter(data[i] - this->number_of_sequences);
548 next_sample.first += this->number_of_sequences;
550 if(data[i] < next_sample.first) // No sample found for current position.
554 data[i] = data[run_start] + i - run_start;
559 pair_type value = this->psi(data[i] - this->number_of_sequences, run.second - i, iters);
560 data[i] = value.first;
561 run_left = value.second;
567 else // Sampled position found.
569 data[i] = this->sa_samples->getSample(next_sample.second) - offsets[i];
571 if(run_left > 0) { run_left--; }
578 RLCSA::locate(usint index) const
580 if(!(this->support_locate) || index >= this->data_size) { return this->data_size; }
583 index += this->number_of_sequences;
586 if(index < this->number_of_sequences) // Implicit sample here
588 DeltaVector::Iterator iter(*(this->end_points));
589 return iter.select(index) + 1 - offset;
591 pair_type next_sample = this->sa_samples->getFirstSampleAfter(index - this->number_of_sequences);
592 next_sample.first += this->number_of_sequences;
593 if(next_sample.first == index)
595 return this->sa_samples->getSample(next_sample.second) - offset;
597 index = this->psi(index - this->number_of_sequences);
602 //--------------------------------------------------------------------------
605 RLCSA::display(usint sequence) const
607 if(!(this->support_display)) { return 0; }
609 pair_type seq_range = this->getSequenceRange(sequence);
610 if(isEmpty(seq_range)) { return 0; }
612 uchar* data = new uchar[length(seq_range)+1];
613 this->displayUnsafe(seq_range, data);
614 data[length(seq_range)] = 0;
620 RLCSA::display(usint sequence, pair_type range) const
622 if(!(this->support_display) || isEmpty(range)) { return 0; }
624 pair_type seq_range = this->getSequenceRange(sequence);
625 if(isEmpty(seq_range)) { return 0; }
627 range.first += seq_range.first; range.second += seq_range.first;
628 if(range.second > seq_range.second) { return 0; }
630 uchar* data = new uchar[length(range)];
631 this->displayUnsafe(range, data);
637 RLCSA::display(usint sequence, pair_type range, uchar* data) const
639 if(!(this->support_display) || isEmpty(range) || data == 0) { return 0; }
641 pair_type seq_range = this->getSequenceRange(sequence);
642 if(isEmpty(seq_range)) { return 0; }
644 range.first += seq_range.first; range.second += seq_range.first;
645 if(range.second > seq_range.second) { return 0; }
647 this->displayUnsafe(range, data);
652 RLCSA::display(usint position, usint len, usint context, usint& result_length) const
654 if(!(this->support_display)) { return 0; }
656 pair_type range = this->getSequenceRangeForPosition(position);
657 if(isEmpty(range)) { return 0; }
659 range.first = position - std::min(context, position - range.first);
660 range.second = std::min(range.second, position + len + context - 1);
661 result_length = length(range);
662 if(isEmpty(range)) { return 0; }
664 uchar* data = new uchar[length(range)];
665 this->displayUnsafe(range, data);
671 RLCSA::displayPrefix(usint sequence, usint len, uchar* data) const
673 if(!(this->support_display) || len == 0 || data == 0) { return 0; }
675 pair_type seq_range = this->getSequenceRange(sequence);
676 if(isEmpty(seq_range)) { return 0; }
678 pair_type range(seq_range.first, std::min(seq_range.second, seq_range.first + len - 1));
680 this->displayUnsafe(range, data);
681 return length(range);
685 RLCSA::displayFromPosition(usint index, usint max_len, uchar* data) const
687 if(max_len == 0 || data == 0 || index >= this->data_size) { return 0; }
689 for(usint i = 0; i < max_len; i++)
691 data[i] = this->getCharacter(index);
692 index = this->psiUnsafe(index, data[i]);
693 if(index < this->number_of_sequences) { return i + 1; }
694 index -= this->number_of_sequences;
701 RLCSA::displayUnsafe(pair_type range, uchar* data) const
703 usint i = range.first - range.first % this->sa_samples->getSampleRate();
705 usint pos = this->sa_samples->inverseSA(i);
707 if(length(range) >= 1024)
709 PsiVector::Iterator** iters = this->getIterators();
710 for(; i < range.first; i++)
712 pos = this->psi(pos, iters) - this->number_of_sequences;
714 for(; i <= range.second; i++)
716 usint c = this->getCharacter(pos);
717 data[i - range.first] = c;
718 pos = this->psiUnsafe(pos, c, *(iters[c])) - this->number_of_sequences;
720 this->deleteIterators(iters);
724 for(; i < range.first; i++)
726 pos = this->psi(pos) - this->number_of_sequences;
728 for(; i <= range.second; i++)
730 usint c = this->getCharacter(pos);
731 data[i - range.first] = c;
732 pos = this->psiUnsafe(pos, c) - this->number_of_sequences;
737 //--------------------------------------------------------------------------
740 RLCSA::getSequenceRange(usint number) const
742 if(number >= this->number_of_sequences) { return EMPTY_PAIR; }
745 DeltaVector::Iterator iter(*(this->end_points));
749 result.second = iter.select(number);
753 result.first = nextMultipleOf(this->sample_rate, iter.select(number - 1));
754 result.second = iter.selectNext();
761 RLCSA::getSequenceForPosition(usint value) const
763 if(value == 0) { return 0; }
764 DeltaVector::Iterator iter(*(this->end_points));
765 return iter.rank(value - 1);
769 RLCSA::getSequenceRangeForPosition(usint value) const
771 return this->getSequenceRange(this->getSequenceForPosition(value));
775 RLCSA::getSequenceForPosition(usint* values, usint len) const
777 if(values == 0) { return 0; }
779 DeltaVector::Iterator iter(*(this->end_points));
780 for(usint i = 0; i < len; i++)
782 if(values[i] > 0) { values[i] = iter.rank(values[i] - 1); }
789 RLCSA::getRelativePosition(usint value) const
791 DeltaVector::Iterator iter(*(this->end_points));
792 pair_type result(0, value);
794 if(value > 0) { result.first = iter.rank(value - 1); }
797 result.second -= nextMultipleOf(this->sample_rate, iter.select(result.first - 1));
803 //--------------------------------------------------------------------------
806 RLCSA::readBWT() const
808 return this->readBWT(pair_type(0, this->data_size + this->number_of_sequences - 1));
812 RLCSA::readBWT(pair_type range) const
814 if(isEmpty(range) || range.second >= this->data_size + this->number_of_sequences) { return 0; }
816 usint n = length(range);
818 uchar* bwt = new uchar[n];
821 for(usint c = 0; c < CHARS; c++)
823 if(this->array[c] != 0)
825 PsiVector::Iterator iter(*(this->array[c]));
826 usint pos = iter.valueAfter(range.first).first;
827 while(pos <= range.second)
829 bwt[pos - range.first] = c;
830 pos = iter.selectNext();
839 RLCSA::countRuns() const
842 for(usint c = 0; c < CHARS; c++)
844 if(this->array[c] != 0)
846 PsiVector::Iterator iter(*(this->array[c]));
847 runs += iter.countRuns();
854 //--------------------------------------------------------------------------
856 PsiVector::Iterator**
857 RLCSA::getIterators() const
859 PsiVector::Iterator** iters = new PsiVector::Iterator*[CHARS];
860 for(usint i = 0; i < CHARS; i++)
862 if(this->array[i] == 0) { iters[i] = 0; }
863 else { iters[i] = new PsiVector::Iterator(*(this->array[i])); }
869 RLCSA::deleteIterators(PsiVector::Iterator** iters) const
871 if(iters == 0) { return; }
872 for(usint i = 0; i < CHARS; i++) { delete iters[i]; }
876 //--------------------------------------------------------------------------
879 RLCSA::reportSize(bool print) const
881 usint bytes = 0, temp = 0, bwt = 0;
883 for(usint c = 0; c < CHARS; c++)
887 bytes += this->array[c]->reportSize();
888 bwt += this->array[c]->getCompressedSize();
891 bytes += sizeof(*this) + this->end_points->reportSize();
894 std::cout << "RLCSA: " << (bytes / (double)MEGABYTE) << " MB" << std::endl;
895 std::cout << " BWT only: " << (bwt / (double)MEGABYTE) << " MB" << std::endl;
898 if(this->support_locate || this->support_display)
900 temp = this->sa_samples->reportSize();
901 if(print) { std::cout << "SA samples: " << (temp / (double)MEGABYTE) << " MB" << std::endl; }
907 std::cout << "Total size: " << (bytes / (double)MEGABYTE) << " MB" << std::endl;
908 std::cout << std::endl;
915 RLCSA::printInfo() const
917 double megabytes = this->data_size / (double)MEGABYTE;
919 std::cout << "Sequences: " << this->number_of_sequences << std::endl;
920 std::cout << "Original size: " << megabytes << " MB" << std::endl;
921 std::cout << "Block size: " << (this->getBlockSize() * sizeof(usint)) << " bytes" << std::endl;
922 if(this->support_locate || this->support_display)
924 std::cout << "Sample rate: " << this->sample_rate << std::endl;
926 std::cout << std::endl;
929 //--------------------------------------------------------------------------
932 RLCSA::buildPLCP(usint block_size) const
934 if(block_size < 2 * sizeof(usint) || block_size % sizeof(usint) != 0)
936 std::cerr << "PLCP: Block size must be a multiple of " << sizeof(usint) << " bytes!" << std::endl;
940 PsiVector::Iterator** iters = this->getIterators();
942 RLEEncoder plcp(block_size);
943 std::list<pair_type> matches;
944 pair_type prev_range = EMPTY_PAIR;
945 for(usint j = 0; j < this->number_of_sequences; j++)
947 // Encode the padding as a single run.
948 pair_type seq_range = this->getSequenceRange(j);
949 if(j > 0 && prev_range.second + 1 < seq_range.first)
951 this->encodePLCPRun(plcp, prev_range.second + 1, seq_range.first, seq_range.first - prev_range.second - 2);
953 prev_range = seq_range;
955 usint maximal = seq_range.first;
956 usint x = this->sa_samples->inverseSA(seq_range.first), next_x;
958 // Invariant: x == inverseSA(i)
959 for(usint i = seq_range.first; i <= seq_range.second; i++, x = next_x)
961 usint c = this->getCharacter(x);
963 // T[i,n] is lexicographically the first suffix beginning with c.
964 if(x == this->index_ranges[c].first)
968 maximal = (*(matches.begin())).first;
971 this->encodePLCPRun(plcp, maximal, i + 1, i - maximal);
973 next_x = this->psiUnsafe(x, c, *(iters[c])) - this->number_of_sequences;
977 // Process the previous left matches we are still following.
978 usint low = this->index_ranges[c].first + this->number_of_sequences;
979 usint start, stop; start = stop = maximal;
980 for(std::list<pair_type>::iterator iter = matches.begin(); iter != matches.end();)
982 pair_type match = *iter;
983 if(match.second < low) // These no longer match the current character.
985 if(match.first < start) { start = match.first; } // Form a single run from then.
986 iter = matches.erase(iter);
990 if(match.first < stop) { stop = match.first; } // End of the run to be encoded.
991 (*iter).second = this->psiUnsafe(match.second - this->number_of_sequences, c, *(iters[c]));
995 if(start < stop) { this->encodePLCPRun(plcp, start, stop, i - start); }
997 // If PLCP[i] is minimal, we add the left match to the list.
998 // We add pairs of type (j, inverseSA(j) + n_of_s) as inverseSA is < 0 for end markers.
999 usint next_y = this->psiUnsafe(x - 1, c, *(iters[c]));
1000 next_x = this->psiUnsafeNext(c, *(iters[c])) - this->number_of_sequences;
1001 if(next_y != next_x + this->number_of_sequences - 1)
1003 matches.push_back(pair_type(maximal, next_y));
1008 if(!matches.empty())
1010 maximal = (*(matches.begin())).first;
1013 if(maximal <= seq_range.second)
1015 this->encodePLCPRun(plcp, maximal, seq_range.second + 1, seq_range.second + 1 - maximal);
1019 this->deleteIterators(iters);
1021 return new RLEVector(plcp, 2 * (this->end_points->getSize() + 1));
1025 RLCSA::sampleLCP(usint sample_rate, pair_type*& sampled_values, bool report) const
1027 if(sample_rate == 0)
1029 sample_rate = this->data_size + 1;
1032 PsiVector::Iterator** iters = this->getIterators();
1034 usint runs = this->countRuns();
1035 usint samples = 0, minimal_sum = 0, nonstrict_minimal_sum = 0, minimal_samples = 0;
1036 usint max_samples = runs + (this->data_size - runs) / sample_rate;
1037 sampled_values = new pair_type[max_samples];
1039 std::list<Triple> matches;
1040 for(usint j = 0; j < this->number_of_sequences; j++)
1042 pair_type seq_range = this->getSequenceRange(j);
1043 usint first_sample = samples; // First minimal sample of the current sequence.
1044 usint minimal = seq_range.second + 1; // Last minimal position.
1045 usint i, x = this->sa_samples->inverseSA(seq_range.first), next_x, prev_x = 0;
1047 // Invariant: x == inverseSA(i)
1048 for(i = seq_range.first; i <= seq_range.second; i++, x = next_x)
1050 usint c = this->getCharacter(x);
1052 // T[i,n] is lexicographically the first suffix beginning with c.
1053 if(x == this->index_ranges[c].first)
1055 if(!matches.empty())
1057 for(std::list<Triple>::iterator iter = matches.begin(); iter != matches.end(); ++iter)
1059 nonstrict_minimal_sum += i - (*iter).first;
1064 // Minimal sample: SA[x] = i, LCP[x] = 0
1065 sampled_values[samples] = pair_type(x, 0); minimal = i;
1066 samples++; minimal_samples++;
1067 next_x = this->psiUnsafe(x, c, *(iters[c])) - this->number_of_sequences; prev_x = x;
1071 // Process the previous left matches we are still following.
1072 usint low = this->index_ranges[c].first + this->number_of_sequences;
1073 pair_type potential_sample(this->data_size, this->data_size);
1074 for(std::list<Triple>::iterator iter = matches.begin(); iter != matches.end();)
1076 Triple match = *iter;
1077 if(match.second < low) // These no longer match the current character.
1079 if(potential_sample.first != this->data_size) { nonstrict_minimal_sum += potential_sample.second; }
1081 // Potential minimal sample: SA[match.third] = match.first, LCP[match.third] = i - match.first
1082 potential_sample = pair_type(match.third, i - match.first);
1083 iter = matches.erase(iter);
1087 (*iter).second = this->psiUnsafe(match.second - this->number_of_sequences, c, *(iters[c]));
1091 if(potential_sample.first != this->data_size)
1093 // Last potential sample is minimal.
1094 sampled_values[samples] = potential_sample;
1095 samples++; minimal_sum += potential_sample.second; minimal_samples++;
1098 // If PLCP[i] is minimal, we add the left match to the list.
1099 // We add triples of type (j, inverseSA(j) + n_of_s, x) as inverseSA is < 0 for end markers.
1100 // x is the original minimal position.
1101 usint next_y = this->psiUnsafe(x - 1, c, *(iters[c]));
1102 next_x = this->psiUnsafeNext(c, *(iters[c]));
1103 if(next_y != next_x - 1 || next_x < this->number_of_sequences)
1105 matches.push_back(Triple(i, next_y, x)); minimal = i;
1107 next_x -= this->number_of_sequences; prev_x = x;
1110 // Are we still following something?
1111 if(!matches.empty())
1113 pair_type potential_sample(this->data_size, this->data_size);
1114 for(std::list<Triple>::iterator iter = matches.begin(); iter != matches.end(); ++iter)
1116 Triple match = *iter;
1117 if(potential_sample.first != this->data_size) { nonstrict_minimal_sum += potential_sample.second; }
1119 // Potential minimal sample: SA[match.third] = match.first, LCP[match.third] = i - match.first
1120 potential_sample = pair_type(match.third, i - match.first);
1123 if(potential_sample.first != this->data_size)
1125 // Last potential sample is minimal.
1126 sampled_values[samples] = potential_sample;
1127 samples++; minimal_sum += potential_sample.second; minimal_samples++;
1131 // Add the non-minimal samples.
1132 if(sample_rate <= this->data_size)
1134 usint last_sample = samples - 1;
1135 i = seq_range.first; x = this->sa_samples->inverseSA(seq_range.first);
1136 for(usint current_sample = first_sample; current_sample <= last_sample; current_sample++)
1138 // Find the next minimal sample and add nonminimal samples if needed.
1139 pair_type first_nonminimal(i + sample_rate - 1, samples);
1140 pair_type next_nonminimal = first_nonminimal;
1141 while(x != sampled_values[current_sample].first)
1143 if(i == next_nonminimal.first)
1145 sampled_values[samples] = pair_type(x, 0); samples++;
1146 next_nonminimal.first += sample_rate; next_nonminimal.second++;
1148 i++; x = this->psi(x, iters) - this->number_of_sequences;
1151 // Reduce the nonminimal samples to the current minimal sample.
1152 for(next_nonminimal = first_nonminimal; next_nonminimal.second < samples; next_nonminimal.first += sample_rate, next_nonminimal.second++)
1154 sampled_values[next_nonminimal.second].second = sampled_values[current_sample].second + i - next_nonminimal.first;
1156 i++; x = this->psi(x, iters) - this->number_of_sequences;
1161 std::sort(sampled_values, sampled_values + samples);
1165 std::cout << "Samples: " << samples << " (total) / " << minimal_samples << " (minimal)" << std::endl;
1166 std::cout << "Upper bounds: " << max_samples << " (total) / " << runs << " (minimal)" << std::endl;
1167 std::cout << "Sum of minimal samples: " << (minimal_sum + nonstrict_minimal_sum) << " (total) / " << minimal_sum << " (strict)" << std::endl;
1168 std::cout << std::endl;
1171 this->deleteIterators(iters);
1177 RLCSA::lcp(usint index, const LCPSamples& lcp_samples) const
1179 if(index >= this->data_size) { return this->data_size; }
1182 index += this->number_of_sequences;
1185 pair_type next_sample = lcp_samples.getFirstSampleAfter(index - this->number_of_sequences);
1186 next_sample.first += this->number_of_sequences;
1187 if(next_sample.first == index)
1189 return lcp_samples.getSample(next_sample.second) + offset;
1191 index = this->psi(index - this->number_of_sequences);
1197 RLCSA::lcp(usint index, const LCPSamples& lcp_samples, const RLEVector& plcp) const
1199 if(index >= this->data_size) { return this->data_size; }
1204 usint c = this->getCharacter(index);
1205 PsiVector::Iterator iter(*(this->array[c]));
1207 if(index == this->index_ranges[c].first)
1209 return lcp_samples.getSampleAtPosition(index) + offset;
1211 usint next_y = this->psiUnsafe(index - 1, c, iter);
1212 usint next_x = this->psiUnsafeNext(c, iter);
1213 if(next_y != next_x - 1 || next_x < this->number_of_sequences)
1215 pair_type sample = lcp_samples.getFirstSampleAfter(index);
1216 if(sample.first == index)
1218 return lcp_samples.getSample(sample.second) + offset;
1222 pair_type next_sample = this->sa_samples->getFirstSampleAfter(index);
1223 if(next_sample.first == index)
1225 index = this->sa_samples->getSample(next_sample.second) - offset;
1226 RLEVector::Iterator plcp_iter(plcp);
1227 return plcp_iter.select(index) - 2 * index;
1229 index = next_x - this->number_of_sequences;
1235 RLCSA::lcpDirect(usint index) const
1237 if(index >= this->data_size || index == 0)
1242 usint match = index - 1;
1245 usint c = this->getCharacter(index);
1246 usint low = this->index_ranges[c].first;
1249 PsiVector::Iterator iter(*(this->array[c]));
1250 match = this->psiUnsafe(match, c, iter);
1251 index = this->psiUnsafe(index, c, iter);
1254 if(match < this->number_of_sequences || index < this->number_of_sequences)
1258 match -= this->number_of_sequences;
1259 index -= this->number_of_sequences;
1261 c = this->getCharacter(index);
1262 low = this->index_ranges[c].first;
1268 //--------------------------------------------------------------------------
1271 RLCSA::mergeEndPoints(RLCSA& index, RLCSA& increment)
1273 DeltaEncoder* endings = new DeltaEncoder(RLCSA::ENDPOINT_BLOCK_SIZE);
1275 DeltaVector::Iterator index_iter(*(index.end_points));
1276 DeltaVector::Iterator increment_iter(*(increment.end_points));
1278 endings->setBit(index_iter.select(0));
1279 for(usint i = 1; i < index.number_of_sequences; i++)
1281 endings->setBit(index_iter.selectNext());
1283 usint sum = index.end_points->getSize();
1284 delete index.end_points; index.end_points = 0;
1286 endings->setBit(sum + increment_iter.select(0));
1287 for(usint i = 1; i < increment.number_of_sequences; i++)
1289 endings->setBit(sum + increment_iter.selectNext());
1291 sum += increment.end_points->getSize();
1292 delete increment.end_points; increment.end_points = 0;
1294 this->end_points = new DeltaVector(*endings, sum);
1300 RLCSA::mergeSamples(RLCSA& index, RLCSA& increment, usint* positions)
1302 if(this->support_locate || this->support_display)
1304 positions += increment.number_of_sequences;
1305 this->sa_samples = new SASamples(*(index.sa_samples), *(increment.sa_samples), positions, increment.data_size, this->number_of_sequences);
1309 //--------------------------------------------------------------------------
1312 RLCSA::buildCharIndexes(usint* distribution)
1314 this->data_size = buildRanges(distribution, this->index_ranges);
1317 for(; c < CHARS; c++)
1319 if(!isEmpty(this->index_ranges[c]))
1321 this->text_chars[i] = c;
1327 this->index_rate = std::max((this->data_size + CHARS - 1) / CHARS, (usint)1);
1330 for(c = 0, i = 0; c < this->chars; c++)
1332 pair_type range = this->index_ranges[this->text_chars[c]];
1333 while(current <= range.second)
1335 this->index_pointers[i] = c;
1336 current += this->index_rate;
1343 buildRanges(usint* distribution, pair_type* index_ranges)
1345 if(distribution == 0 || index_ranges == 0) { return 0; }
1348 for(usint c = 0; c < CHARS; c++)
1350 if(distribution[c] == 0)
1352 if(sum == 0) { index_ranges[c] = EMPTY_PAIR; }
1353 else { index_ranges[c] = pair_type(sum, sum - 1); }
1357 index_ranges[c].first = sum;
1358 sum += distribution[c];
1359 index_ranges[c].second = sum - 1;