#include <algorithm>
#include <cstdlib>
-#include <ctime>
+#include <cstring>
#include <fstream>
#include <iostream>
+#include <list>
#include "rlcsa.h"
#include "misc/utils.h"
#include "qsufsort/qsufsort.h"
+#include "bits/vectors.h"
+
+
+#ifdef MULTITHREAD_SUPPORT
+#include <omp.h>
+#endif
+
namespace CSA
array_file.read((char*)distribution, CHARS * sizeof(usint));
this->buildCharIndexes(distribution);
- Parameters parameters;
- parameters.read(base_name + PARAMETERS_EXTENSION);
+// Parameters parameters;
+// parameters.read(base_name + PARAMETERS_EXTENSION);
for(usint c = 0; c < CHARS; c++)
{
- if(!isEmpty(this->index_ranges[c])) { this->array[c] = new RLEVector(array_file); }
+ if(!isEmpty(this->index_ranges[c])) { this->array[c] = new PsiVector(array_file); }
}
this->end_points = new DeltaVector(array_file);
array_file.read((char*)&(this->sample_rate), sizeof(this->sample_rate));
array_file.close();
- this->support_locate = parameters.get(SUPPORT_LOCATE);
- this->support_display = parameters.get(SUPPORT_DISPLAY);
+ this->support_locate = true;// parameters.get(SUPPORT_LOCATE);
+ this->support_display = true; //parameters.get(SUPPORT_DISPLAY);
if(this->support_locate || this->support_display)
{
sa_sample_file.close();
}
- if(print) { parameters.print(); }
+// if(print) { parameters.print(); }
this->ok = true;
}
// Build suffix array.
- sint* inverse = new sint[bytes + 1];
+ int* inverse = new int[bytes + 1];
if(multiple_sequences)
{
- sint zeros = 0;
+ int zeros = 0;
for(usint i = 0; i < bytes; i++)
{
if(data[i] == 0)
}
else
{
- inverse[i] = (sint)data[i] + this->number_of_sequences;
+ inverse[i] = (int)(data[i] + this->number_of_sequences);
}
}
}
else
{
- for(usint i = 0; i < bytes; i++) { inverse[i] = (sint)data[i]; }
+ for(usint i = 0; i < bytes; i++) { inverse[i] = (int)data[i]; }
}
if(delete_data) { delete[] data; }
- sint* sa = new sint[bytes + 1];
+ int* sa = new int[bytes + 1];
suffixsort(inverse, sa, bytes, high + 1, low);
// Sample SA.
- usint incr = (multiple_sequences ? this->number_of_sequences + 1 : 1); // No e_of_s markers in SA.
if(this->support_locate || this->support_display)
{
if(multiple_sequences)
{
- std::cout << "We shouldn't be here!" << std::endl;
- // Real SA starts at sa + incr.
- // for each sequence
- // find starting position
- // sample relative to starting position
- // sort samples to SA order
+ this->sa_samples = new SASamples((uint*)inverse, this->end_points, this->data_size, this->sample_rate);
}
else
{
- this->sa_samples = new SASamples((usint*)(sa + incr), this->data_size, this->sample_rate);
+ this->sa_samples = new SASamples((uint*)(sa + 1), this->data_size, this->sample_rate);
}
}
// Build RLCSA.
+ usint incr = (multiple_sequences ? this->number_of_sequences + 1 : 1); // No e_of_s markers in SA.
usint decr = (multiple_sequences ? 1 : 0); // Ignore the last e_of_s marker if multiple sequences.
for(usint c = 0; c < CHARS; c++)
{
if(distribution[c] == 0) { this->array[c] = 0; continue; }
- usint* curr = (usint*)(sa + index_ranges[c].first + incr);
- usint* limit = (usint*)(sa + index_ranges[c].second + incr + 1);
- RLEEncoder encoder(block_size);
+ uint* curr = (uint*)(sa + index_ranges[c].first + incr);
+ uint* limit = (uint*)(sa + index_ranges[c].second + incr + 1);
+ PsiVector::Encoder encoder(block_size);
pair_type run(*curr, 1);
curr++;
}
encoder.setRun(run.first - decr, run.second);
- this->array[c] = new RLEVector(encoder, this->data_size + incr - decr);
+ this->array[c] = new PsiVector(encoder, this->data_size + incr - decr);
}
delete[] sa;
this->ok = true;
}
-RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size) :
+RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size, usint threads) :
ok(false),
sa_samples(0),
end_points(0)
if(index.sample_rate != increment.sample_rate)
{
std::cerr << "RLCSA: Cannot combine indexes with different sample rates!" << std::endl;
- std::cout << "Index: " << index.sample_rate << ", increment: " << increment.sample_rate << std::endl;
return;
}
}
this->buildCharIndexes(distribution);
this->sample_rate = index.sample_rate;
-
-
- // Combine end points of sequences.
this->number_of_sequences = index.number_of_sequences + increment.number_of_sequences;
- DeltaEncoder* endings = new DeltaEncoder(RLCSA::ENDPOINT_BLOCK_SIZE);
-
- endings->setBit(index.end_points->select(0));
- for(usint i = 1; i < index.number_of_sequences; i++)
- {
- endings->setBit(index.end_points->selectNext());
- }
- usint sum = index.end_points->getSize();
- delete index.end_points; index.end_points = 0;
-
- endings->setBit(sum + increment.end_points->select(0));
- for(usint i = 1; i < increment.number_of_sequences; i++)
- {
- endings->setBit(sum + increment.end_points->selectNext());
- }
- sum += increment.end_points->getSize();
- delete increment.end_points; increment.end_points = 0;
-
- this->end_points = new DeltaVector(*endings, sum);
- delete endings;
-
- // Combine Psi.
+ // Merge end points, SA samples, and Psi.
usint psi_size = this->data_size + this->number_of_sequences;
- for(usint c = 0; c < CHARS; c++)
- {
- if(distribution[c] == 0) { this->array[c] = 0; continue; }
- this->array[c] = mergeVectors(index.array[c], increment.array[c], positions, increment.data_size + increment.number_of_sequences, psi_size, block_size);
- index.array[c] = 0;
- increment.array[c] = 0;
-
- if(this->array[c] == 0)
- {
- std::cerr << "RLCSA: Merge failed for vectors " << c << "!" << std::endl;
- return;
- }
- }
-
+ bool should_be_ok = true;
- // Combine suffix array samples.
- if(this->support_locate || this->support_display)
+ #ifdef MULTITHREAD_SUPPORT
+ omp_set_num_threads(threads);
+ #pragma omp parallel
{
- positions += increment.number_of_sequences;
- for(usint i = 0; i < increment.data_size; i++)
+ #pragma omp for schedule(dynamic, 1)
+ #endif
+ for(int c = -2; c < (int)CHARS; c++)
{
- positions[i] -= this->number_of_sequences;
+ if(c == -2) { this->mergeEndPoints(index, increment); }
+ else if(c == -1) { this->mergeSamples(index, increment, positions); }
+ else if(distribution[c] != 0)
+ {
+ this->array[c] = mergeVectors<PsiVector, PsiVector::Encoder, PsiVector::Iterator>(index.array[c], increment.array[c], positions, increment.data_size + increment.number_of_sequences, psi_size, block_size);
+ index.array[c] = 0;
+ increment.array[c] = 0;
+
+ if(this->array[c] == 0)
+ {
+ std::cerr << "RLCSA: Merge failed for vectors " << c << "!" << std::endl;
+ should_be_ok = false;
+ }
+ }
}
- this->sa_samples = new SASamples(*(index.sa_samples), *(increment.sa_samples), positions, increment.data_size);
+ #ifdef MULTITHREAD_SUPPORT
}
+ #endif
- this->ok = true;
+ this->ok = should_be_ok;
}
RLCSA::~RLCSA()
delete this->end_points;
}
+//--------------------------------------------------------------------------
+
void
-RLCSA::writeTo(const std::string& base_name)
+RLCSA::writeTo(const std::string& base_name) const
{
std::string array_name = base_name + ARRAY_EXTENSION;
std::ofstream array_file(array_name.c_str(), std::ios_base::binary);
}
bool
-RLCSA::isOk()
+RLCSA::isOk() const
{
return this->ok;
}
//--------------------------------------------------------------------------
pair_type
-RLCSA::count(const std::string& pattern)
+RLCSA::count(const std::string& pattern) const
{
if(pattern.length() == 0) { return pair_type(0, this->data_size - 1); }
- pair_type index_range = this->index_ranges[(usint)*(pattern.rbegin())];
- if(isEmpty(index_range)) { return index_range; }
+ std::string::const_reverse_iterator iter = pattern.rbegin();
+
+ pair_type index_range = this->index_ranges[(uchar)*iter];
+ if(isEmpty(index_range)) { return EMPTY_PAIR; }
index_range.first += this->number_of_sequences;
index_range.second += this->number_of_sequences;
- for(std::string::const_reverse_iterator iter = ++pattern.rbegin(); iter != pattern.rend(); iter++)
+ for(++iter; iter != pattern.rend(); ++iter)
{
- RLEVector* vector = this->array[(usint)*iter];
- usint start = this->index_ranges[(usint)*iter].first;
-
- index_range.first = start + vector->rank(index_range.first, true) - 1 + this->number_of_sequences;
- index_range.second = start + vector->rank(index_range.second) - 1 + this->number_of_sequences;
-
- if(isEmpty(index_range)) { return index_range; }
+ index_range = this->backwardSearchStep(index_range, (uchar)*iter);
+ if(isEmpty(index_range)) { return EMPTY_PAIR; }
}
// Suffix array indexes are 0-based.
return index_range;
}
+//--------------------------------------------------------------------------
+
void
-RLCSA::reportPositions(uchar* data, usint length, usint* positions)
+RLCSA::reportPositions(uchar* data, usint length, usint* positions) const
{
if(data == 0 || length == 0 || positions == 0) { return; }
+ PsiVector::Iterator** iters = this->getIterators();
+
usint current = this->number_of_sequences - 1;
positions[length] = current; // "immediately after current"
for(sint i = (sint)(length - 1); i >= 0; i--)
{
-// positions[i] = current; // "immediately after current"
usint c = (usint)data[i];
- if(array[c])
+ if(this->array[c] != 0)
{
- current = this->index_ranges[c].first + this->array[c]->rank(current) - 1 + this->number_of_sequences;
+ current = this->LF(current, c, *(iters[c]));
}
else
{
}
positions[i] = current; // "immediately after current"
}
+
+ this->deleteIterators(iters);
}
//--------------------------------------------------------------------------
-LocateItem*
-RLCSA::locate(pair_type range)
+usint*
+RLCSA::locate(pair_type range) const
{
if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size) { return 0; }
- LocateItem* data = new LocateItem[range.second + 1 - range.first];
- if(!data) { return 0; }
+ usint* data = new usint[length(range)];
this->locateUnsafe(range, data);
return data;
}
-LocateItem*
-RLCSA::locate(pair_type range, LocateItem* data)
+usint*
+RLCSA::locate(pair_type range, usint* data) const
{
if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size || data == 0) { return 0; }
this->locateUnsafe(range, data);
}
void
-RLCSA::locateUnsafe(pair_type range, LocateItem* data)
+RLCSA::locateUnsafe(pair_type range, usint* data) const
{
- usint items = range.second + 1 - range.first;
+ usint items = length(range);
+ usint* offsets = new usint[items];
+ bool* finished = new bool[items]; // FIXME This could be more space efficient...
+
+ PsiVector::Iterator** iters = this->getIterators();
+
for(usint i = 0, j = range.first; i < items; i++, j++)
{
- data[i].value = j + this->number_of_sequences;
- data[i].offset = 0;
- data[i].found = false;
+ data[i] = j + this->number_of_sequences;
+ offsets[i] = 0;
+ finished[i] = false;
}
bool found = false;
pair_type run = EMPTY_PAIR;
for(usint i = 0; i < items; i++)
{
- if(data[i].found)
+ if(finished[i])
{
continue; // The run might continue after this.
}
{
run = pair_type(i, i);
}
- else if(data[i].value - data[run.first].value == i - run.first)
+ else if(data[i] - data[run.first] == i - run.first)
{
run.second = i;
}
else
{
- found &= this->processRun(run, data);
+ found &= this->processRun(run, data, offsets, finished, iters);
run = pair_type(i, i);
}
}
- if(!isEmpty(run)) { found &= this->processRun(run, data); }
+ if(!isEmpty(run)) { found &= this->processRun(run, data, offsets, finished, iters); }
}
+
+ this->deleteIterators(iters);
+ delete[] offsets;
+ delete[] finished;
}
bool
-RLCSA::processRun(pair_type run, LocateItem* data)
+RLCSA::processRun(pair_type run, usint* data, usint* offsets, bool* finished, PsiVector::Iterator** iters) const
{
bool found = true;
usint run_start = 0, run_left = 0;
for(usint i = run.first; i <= run.second; i++)
{
- if(data[i].found)
+ if(finished[i])
{
if(run_left > 0) { run_left--; }
continue;
}
- if(data[i].value < this->number_of_sequences) // Implicit sample here.
+ if(data[i] < this->number_of_sequences) // Implicit sample here.
{
- data[i].value = this->end_points->select(data[i].value) + 1 - data[i].offset;
- data[i].found = true;
+ DeltaVector::Iterator iter(*(this->end_points));
+ data[i] = iter.select(data[i]) + 1 - offsets[i];
+ finished[i] = true;
if(run_left > 0) { run_left--; }
continue;
}
- if(next_sample.first < data[i].value) // Need another sample.
+ if(next_sample.first < data[i]) // Need another sample.
{
- next_sample = this->sa_samples->getFirstSampleAfter(data[i].value - this->number_of_sequences);
+ next_sample = this->sa_samples->getFirstSampleAfter(data[i] - this->number_of_sequences);
next_sample.first += this->number_of_sequences;
}
- if(data[i].value < next_sample.first) // No sample found for current position.
+ if(data[i] < next_sample.first) // No sample found for current position.
{
if(run_left > 0)
{
- data[i].value = data[run_start].value + i - run_start;
+ data[i] = data[run_start] + i - run_start;
run_left--;
}
else
{
- pair_type value = this->psi(data[i].value - this->number_of_sequences, run.second - i);
- data[i].value = value.first;
+ pair_type value = this->psi(data[i] - this->number_of_sequences, run.second - i, iters);
+ data[i] = value.first;
run_left = value.second;
run_start = i;
}
- data[i].offset++;
+ offsets[i]++;
found = false;
}
else // Sampled position found.
{
- data[i].value = this->sa_samples->getSample(next_sample.second) - data[i].offset;
- data[i].found = true;
+ data[i] = this->sa_samples->getSample(next_sample.second) - offsets[i];
+ finished[i] = true;
if(run_left > 0) { run_left--; }
}
}
}
usint
-RLCSA::locate(usint index)
+RLCSA::locate(usint index) const
{
if(!(this->support_locate) || index >= this->data_size) { return this->data_size; }
{
if(index < this->number_of_sequences) // Implicit sample here
{
- return this->end_points->select(index) + 1 - offset;
+ DeltaVector::Iterator iter(*(this->end_points));
+ return iter.select(index) + 1 - offset;
}
pair_type next_sample = this->sa_samples->getFirstSampleAfter(index - this->number_of_sequences);
next_sample.first += this->number_of_sequences;
//--------------------------------------------------------------------------
uchar*
-RLCSA::display(usint sequence, pair_type range)
+RLCSA::display(usint sequence) const
+{
+ if(!(this->support_display)) { return 0; }
+
+ pair_type seq_range = this->getSequenceRange(sequence);
+ if(isEmpty(seq_range)) { return 0; }
+
+ uchar* data = new uchar[length(seq_range)+1];
+ this->displayUnsafe(seq_range, data);
+ data[length(seq_range)] = 0;
+
+ return data;
+}
+
+uchar*
+RLCSA::display(usint sequence, pair_type range) const
{
if(!(this->support_display) || isEmpty(range)) { return 0; }
- pair_type seq_range = this->getSequence(sequence);
+ pair_type seq_range = this->getSequenceRange(sequence);
if(isEmpty(seq_range)) { return 0; }
range.first += seq_range.first; range.second += seq_range.first;
if(range.second > seq_range.second) { return 0; }
- uchar* data = new uchar[range.second + 1 - range.first];
- if(!data) { return 0; }
+ uchar* data = new uchar[length(range)];
this->displayUnsafe(range, data);
return data;
}
uchar*
-RLCSA::display(usint sequence, pair_type range, uchar* data)
+RLCSA::display(usint sequence, pair_type range, uchar* data) const
{
if(!(this->support_display) || isEmpty(range) || data == 0) { return 0; }
- pair_type seq_range = this->getSequence(sequence);
+ pair_type seq_range = this->getSequenceRange(sequence);
if(isEmpty(seq_range)) { return 0; }
range.first += seq_range.first; range.second += seq_range.first;
return data;
}
+uchar*
+RLCSA::display(usint position, usint len, usint context, usint& result_length) const
+{
+ if(!(this->support_display)) { return 0; }
+
+ pair_type range = this->getSequenceRangeForPosition(position);
+ if(isEmpty(range)) { return 0; }
+
+ range.first = position - std::min(context, position - range.first);
+ range.second = std::min(range.second, position + len + context - 1);
+ result_length = length(range);
+ if(isEmpty(range)) { return 0; }
+
+ uchar* data = new uchar[length(range)];
+ this->displayUnsafe(range, data);
+
+ return data;
+}
+
+usint
+RLCSA::displayPrefix(usint sequence, usint len, uchar* data) const
+{
+ if(!(this->support_display) || len == 0 || data == 0) { return 0; }
+
+ pair_type seq_range = this->getSequenceRange(sequence);
+ if(isEmpty(seq_range)) { return 0; }
+
+ pair_type range(seq_range.first, std::min(seq_range.second, seq_range.first + len - 1));
+
+ this->displayUnsafe(range, data);
+ return length(range);
+}
+
+usint
+RLCSA::displayFromPosition(usint index, usint max_len, uchar* data) const
+{
+ if(max_len == 0 || data == 0 || index >= this->data_size) { return 0; }
+
+ for(usint i = 0; i < max_len; i++)
+ {
+ data[i] = this->getCharacter(index);
+ index = this->psiUnsafe(index, data[i]);
+ if(index < this->number_of_sequences) { return i + 1; }
+ index -= this->number_of_sequences;
+ }
+
+ return max_len;
+}
+
void
-RLCSA::displayUnsafe(pair_type range, uchar* data)
+RLCSA::displayUnsafe(pair_type range, uchar* data) const
{
usint i = range.first - range.first % this->sa_samples->getSampleRate();
usint pos = this->sa_samples->inverseSA(i);
- for(; i < range.first; i++)
+ if(length(range) >= 1024)
{
- pos = this->psi(pos) - this->number_of_sequences;
+ PsiVector::Iterator** iters = this->getIterators();
+ for(; i < range.first; i++)
+ {
+ pos = this->psi(pos, iters) - this->number_of_sequences;
+ }
+ for(; i <= range.second; i++)
+ {
+ usint c = this->getCharacter(pos);
+ data[i - range.first] = c;
+ pos = this->psiUnsafe(pos, c, *(iters[c])) - this->number_of_sequences;
+ }
+ this->deleteIterators(iters);
}
- for(; i <= range.second; i++)
+ else
{
- data[i - range.first] = this->getCharacter(pos);
- pos = this->psi(pos) - this->number_of_sequences;
+ for(; i < range.first; i++)
+ {
+ pos = this->psi(pos) - this->number_of_sequences;
+ }
+ for(; i <= range.second; i++)
+ {
+ usint c = this->getCharacter(pos);
+ data[i - range.first] = c;
+ pos = this->psiUnsafe(pos, c) - this->number_of_sequences;
+ }
}
}
//--------------------------------------------------------------------------
-void
-RLCSA::decompressInto(std::ofstream& psi_file)
+pair_type
+RLCSA::getSequenceRange(usint number) const
{
- for(usint c = 0; c < CHARS; c++)
+ if(number >= this->number_of_sequences) { return EMPTY_PAIR; }
+
+ pair_type result;
+ DeltaVector::Iterator iter(*(this->end_points));
+ if(number == 0)
{
- if(!(this->array[c])) { continue; }
- usint value = this->array[c]->select(0);
- psi_file.write((char*)&(value), sizeof(value));
- while(this->array[c]->hasNext())
- {
- value = this->array[c]->selectNext();
- psi_file.write((char*)&value, sizeof(value));
- }
+ result.first = 0;
+ result.second = iter.select(number);
+ }
+ else
+ {
+ result.first = nextMultipleOf(this->sample_rate, iter.select(number - 1));
+ result.second = iter.selectNext();
+ }
+
+ return result;
+}
+
+usint
+RLCSA::getSequenceForPosition(usint value) const
+{
+ if(value == 0) { return 0; }
+ DeltaVector::Iterator iter(*(this->end_points));
+ return iter.rank(value - 1);
+}
+
+pair_type
+RLCSA::getSequenceRangeForPosition(usint value) const
+{
+ return this->getSequenceRange(this->getSequenceForPosition(value));
+}
+
+usint*
+RLCSA::getSequenceForPosition(usint* values, usint len) const
+{
+ if(values == 0) { return 0; }
+
+ DeltaVector::Iterator iter(*(this->end_points));
+ for(usint i = 0; i < len; i++)
+ {
+ if(values[i] > 0) { values[i] = iter.rank(values[i] - 1); }
+ }
+
+ return values;
+}
+
+pair_type
+RLCSA::getRelativePosition(usint value) const
+{
+ DeltaVector::Iterator iter(*(this->end_points));
+ pair_type result(0, value);
+
+ if(value > 0) { result.first = iter.rank(value - 1); }
+ if(result.first > 0)
+ {
+ result.second -= nextMultipleOf(this->sample_rate, iter.select(result.first - 1));
}
+
+ return result;
+}
+
+//--------------------------------------------------------------------------
+
+uchar*
+RLCSA::readBWT() const
+{
+ return this->readBWT(pair_type(0, this->data_size + this->number_of_sequences - 1));
}
uchar*
-RLCSA::readBWT()
+RLCSA::readBWT(pair_type range) const
{
- usint n = this->data_size + this->number_of_sequences;
+ if(isEmpty(range) || range.second >= this->data_size + this->number_of_sequences) { return 0; }
+
+ usint n = length(range);
uchar* bwt = new uchar[n];
memset(bwt, 0, n);
for(usint c = 0; c < CHARS; c++)
{
- RLEVector* vector = this->array[c];
- if(vector != 0)
+ if(this->array[c] != 0)
{
- bwt[vector->select(0)] = c;
- while(vector->hasNext()) { bwt[vector->selectNext()] = c; }
+ PsiVector::Iterator iter(*(this->array[c]));
+ usint pos = iter.valueAfter(range.first).first;
+ while(pos <= range.second)
+ {
+ bwt[pos - range.first] = c;
+ pos = iter.selectNext();
+ }
}
}
return bwt;
}
+usint
+RLCSA::countRuns() const
+{
+ usint runs = 0;
+ for(usint c = 0; c < CHARS; c++)
+ {
+ if(this->array[c] != 0)
+ {
+ PsiVector::Iterator iter(*(this->array[c]));
+ runs += iter.countRuns();
+ }
+ }
+
+ return runs;
+}
+
//--------------------------------------------------------------------------
-usint
-RLCSA::psi(usint index)
+PsiVector::Iterator**
+RLCSA::getIterators() const
{
- if(index >= this->data_size)
+ PsiVector::Iterator** iters = new PsiVector::Iterator*[CHARS];
+ for(usint i = 0; i < CHARS; i++)
{
- return this->data_size + this->number_of_sequences;
+ if(this->array[i] == 0) { iters[i] = 0; }
+ else { iters[i] = new PsiVector::Iterator(*(this->array[i])); }
}
+ return iters;
+}
- usint c = this->getCharacter(index);
- return this->array[c]->select(index - this->index_ranges[c].first);
+void
+RLCSA::deleteIterators(PsiVector::Iterator** iters) const
+{
+ if(iters == 0) { return; }
+ for(usint i = 0; i < CHARS; i++) { delete iters[i]; }
+ delete[] iters;
}
-pair_type
-RLCSA::psi(usint index, usint max_length)
+//--------------------------------------------------------------------------
+
+usint
+RLCSA::reportSize(bool print) const
{
- if(index >= this->data_size)
+ usint bytes = 0, temp = 0, bwt = 0;
+
+ for(usint c = 0; c < CHARS; c++)
+ {
+ if(this->array[c])
+ {
+ bytes += this->array[c]->reportSize();
+ bwt += this->array[c]->getCompressedSize();
+ }
+ }
+ bytes += sizeof(*this) + this->end_points->reportSize();
+ if(print)
{
- return pair_type(this->data_size + this->number_of_sequences, 0);
+ std::cout << "RLCSA: " << (bytes / (double)MEGABYTE) << " MB" << std::endl;
+ std::cout << " BWT only: " << (bwt / (double)MEGABYTE) << " MB" << std::endl;
}
- usint c = this->getCharacter(index);
- return this->array[c]->selectRun(index - this->index_ranges[c].first, max_length);
+ if(this->support_locate || this->support_display)
+ {
+ temp = this->sa_samples->reportSize();
+ if(print) { std::cout << "SA samples: " << (temp / (double)MEGABYTE) << " MB" << std::endl; }
+ bytes += temp;
+ }
+
+ if(print)
+ {
+ std::cout << "Total size: " << (bytes / (double)MEGABYTE) << " MB" << std::endl;
+ std::cout << std::endl;
+ }
+
+ return bytes;
+}
+
+void
+RLCSA::printInfo() const
+{
+ double megabytes = this->data_size / (double)MEGABYTE;
+
+ std::cout << "Sequences: " << this->number_of_sequences << std::endl;
+ std::cout << "Original size: " << megabytes << " MB" << std::endl;
+ std::cout << "Block size: " << (this->getBlockSize() * sizeof(usint)) << " bytes" << std::endl;
+ if(this->support_locate || this->support_display)
+ {
+ std::cout << "Sample rate: " << this->sample_rate << std::endl;
+ }
+ std::cout << std::endl;
}
//--------------------------------------------------------------------------
-pair_type
-RLCSA::getSequence(usint number)
+RLEVector*
+RLCSA::buildPLCP(usint block_size) const
{
- if(number >= this->number_of_sequences) { return EMPTY_PAIR; }
+ if(block_size < 2 * sizeof(usint) || block_size % sizeof(usint) != 0)
+ {
+ std::cerr << "PLCP: Block size must be a multiple of " << sizeof(usint) << " bytes!" << std::endl;
+ return 0;
+ }
- pair_type result;
- if(number == 0)
+ PsiVector::Iterator** iters = this->getIterators();
+
+ RLEEncoder plcp(block_size);
+ std::list<pair_type> matches;
+ pair_type prev_range = EMPTY_PAIR;
+ for(usint j = 0; j < this->number_of_sequences; j++)
{
- result.first = 0;
- result.second = this->end_points->select(number);
+ // Encode the padding as a single run.
+ pair_type seq_range = this->getSequenceRange(j);
+ if(j > 0 && prev_range.second + 1 < seq_range.first)
+ {
+ this->encodePLCPRun(plcp, prev_range.second + 1, seq_range.first, seq_range.first - prev_range.second - 2);
+ }
+ prev_range = seq_range;
+
+ usint maximal = seq_range.first;
+ usint x = this->sa_samples->inverseSA(seq_range.first), next_x;
+
+ // Invariant: x == inverseSA(i)
+ for(usint i = seq_range.first; i <= seq_range.second; i++, x = next_x)
+ {
+ usint c = this->getCharacter(x);
+
+ // T[i,n] is lexicographically the first suffix beginning with c.
+ if(x == this->index_ranges[c].first)
+ {
+ if(!matches.empty())
+ {
+ maximal = (*(matches.begin())).first;
+ matches.clear();
+ }
+ this->encodePLCPRun(plcp, maximal, i + 1, i - maximal);
+ maximal = i + 1;
+ next_x = this->psiUnsafe(x, c, *(iters[c])) - this->number_of_sequences;
+ continue;
+ }
+
+ // Process the previous left matches we are still following.
+ usint low = this->index_ranges[c].first + this->number_of_sequences;
+ usint start, stop; start = stop = maximal;
+ for(std::list<pair_type>::iterator iter = matches.begin(); iter != matches.end();)
+ {
+ pair_type match = *iter;
+ if(match.second < low) // These no longer match the current character.
+ {
+ if(match.first < start) { start = match.first; } // Form a single run from then.
+ iter = matches.erase(iter);
+ }
+ else
+ {
+ if(match.first < stop) { stop = match.first; } // End of the run to be encoded.
+ (*iter).second = this->psiUnsafe(match.second - this->number_of_sequences, c, *(iters[c]));
+ ++iter;
+ }
+ }
+ if(start < stop) { this->encodePLCPRun(plcp, start, stop, i - start); }
+
+ // If PLCP[i] is minimal, we add the left match to the list.
+ // We add pairs of type (j, inverseSA(j) + n_of_s) as inverseSA is < 0 for end markers.
+ usint next_y = this->psiUnsafe(x - 1, c, *(iters[c]));
+ next_x = this->psiUnsafeNext(c, *(iters[c])) - this->number_of_sequences;
+ if(next_y != next_x + this->number_of_sequences - 1)
+ {
+ matches.push_back(pair_type(maximal, next_y));
+ maximal = i + 1;
+ }
+ }
+
+ if(!matches.empty())
+ {
+ maximal = (*(matches.begin())).first;
+ matches.clear();
+ }
+ if(maximal <= seq_range.second)
+ {
+ this->encodePLCPRun(plcp, maximal, seq_range.second + 1, seq_range.second + 1 - maximal);
+ }
}
- else
+
+ this->deleteIterators(iters);
+
+ return new RLEVector(plcp, 2 * (this->end_points->getSize() + 1));
+}
+
+usint
+RLCSA::sampleLCP(usint sample_rate, pair_type*& sampled_values, bool report) const
+{
+ if(sample_rate == 0)
{
- if(this->sa_samples == 0) { return EMPTY_PAIR; }
- usint d = this->sa_samples->getSampleRate();
- result.first = d * ((this->end_points->select(number - 1) / d) + 1);
- result.second = this->end_points->selectNext();
+ sample_rate = this->data_size + 1;
}
- return result;
+ PsiVector::Iterator** iters = this->getIterators();
+
+ usint runs = this->countRuns();
+ usint samples = 0, minimal_sum = 0, nonstrict_minimal_sum = 0, minimal_samples = 0;
+ usint max_samples = runs + (this->data_size - runs) / sample_rate;
+ sampled_values = new pair_type[max_samples];
+
+ std::list<Triple> matches;
+ for(usint j = 0; j < this->number_of_sequences; j++)
+ {
+ pair_type seq_range = this->getSequenceRange(j);
+ usint first_sample = samples; // First minimal sample of the current sequence.
+ usint minimal = seq_range.second + 1; // Last minimal position.
+ usint i, x = this->sa_samples->inverseSA(seq_range.first), next_x, prev_x = 0;
+
+ // Invariant: x == inverseSA(i)
+ for(i = seq_range.first; i <= seq_range.second; i++, x = next_x)
+ {
+ usint c = this->getCharacter(x);
+
+ // T[i,n] is lexicographically the first suffix beginning with c.
+ if(x == this->index_ranges[c].first)
+ {
+ if(!matches.empty())
+ {
+ for(std::list<Triple>::iterator iter = matches.begin(); iter != matches.end(); ++iter)
+ {
+ nonstrict_minimal_sum += i - (*iter).first;
+ }
+ matches.clear();
+ }
+
+ // Minimal sample: SA[x] = i, LCP[x] = 0
+ sampled_values[samples] = pair_type(x, 0); minimal = i;
+ samples++; minimal_samples++;
+ next_x = this->psiUnsafe(x, c, *(iters[c])) - this->number_of_sequences; prev_x = x;
+ continue;
+ }
+
+ // Process the previous left matches we are still following.
+ usint low = this->index_ranges[c].first + this->number_of_sequences;
+ pair_type potential_sample(this->data_size, this->data_size);
+ for(std::list<Triple>::iterator iter = matches.begin(); iter != matches.end();)
+ {
+ Triple match = *iter;
+ if(match.second < low) // These no longer match the current character.
+ {
+ if(potential_sample.first != this->data_size) { nonstrict_minimal_sum += potential_sample.second; }
+
+ // Potential minimal sample: SA[match.third] = match.first, LCP[match.third] = i - match.first
+ potential_sample = pair_type(match.third, i - match.first);
+ iter = matches.erase(iter);
+ }
+ else
+ {
+ (*iter).second = this->psiUnsafe(match.second - this->number_of_sequences, c, *(iters[c]));
+ ++iter;
+ }
+ }
+ if(potential_sample.first != this->data_size)
+ {
+ // Last potential sample is minimal.
+ sampled_values[samples] = potential_sample;
+ samples++; minimal_sum += potential_sample.second; minimal_samples++;
+ }
+
+ // If PLCP[i] is minimal, we add the left match to the list.
+ // We add triples of type (j, inverseSA(j) + n_of_s, x) as inverseSA is < 0 for end markers.
+ // x is the original minimal position.
+ usint next_y = this->psiUnsafe(x - 1, c, *(iters[c]));
+ next_x = this->psiUnsafeNext(c, *(iters[c]));
+ if(next_y != next_x - 1 || next_x < this->number_of_sequences)
+ {
+ matches.push_back(Triple(i, next_y, x)); minimal = i;
+ }
+ next_x -= this->number_of_sequences; prev_x = x;
+ }
+
+ // Are we still following something?
+ if(!matches.empty())
+ {
+ pair_type potential_sample(this->data_size, this->data_size);
+ for(std::list<Triple>::iterator iter = matches.begin(); iter != matches.end(); ++iter)
+ {
+ Triple match = *iter;
+ if(potential_sample.first != this->data_size) { nonstrict_minimal_sum += potential_sample.second; }
+
+ // Potential minimal sample: SA[match.third] = match.first, LCP[match.third] = i - match.first
+ potential_sample = pair_type(match.third, i - match.first);
+ }
+ matches.clear();
+ if(potential_sample.first != this->data_size)
+ {
+ // Last potential sample is minimal.
+ sampled_values[samples] = potential_sample;
+ samples++; minimal_sum += potential_sample.second; minimal_samples++;
+ }
+ }
+
+ // Add the non-minimal samples.
+ if(sample_rate <= this->data_size)
+ {
+ usint last_sample = samples - 1;
+ i = seq_range.first; x = this->sa_samples->inverseSA(seq_range.first);
+ for(usint current_sample = first_sample; current_sample <= last_sample; current_sample++)
+ {
+ // Find the next minimal sample and add nonminimal samples if needed.
+ pair_type first_nonminimal(i + sample_rate - 1, samples);
+ pair_type next_nonminimal = first_nonminimal;
+ while(x != sampled_values[current_sample].first)
+ {
+ if(i == next_nonminimal.first)
+ {
+ sampled_values[samples] = pair_type(x, 0); samples++;
+ next_nonminimal.first += sample_rate; next_nonminimal.second++;
+ }
+ i++; x = this->psi(x, iters) - this->number_of_sequences;
+ }
+
+ // Reduce the nonminimal samples to the current minimal sample.
+ for(next_nonminimal = first_nonminimal; next_nonminimal.second < samples; next_nonminimal.first += sample_rate, next_nonminimal.second++)
+ {
+ sampled_values[next_nonminimal.second].second = sampled_values[current_sample].second + i - next_nonminimal.first;
+ }
+ i++; x = this->psi(x, iters) - this->number_of_sequences;
+ }
+ }
+ }
+
+ std::sort(sampled_values, sampled_values + samples);
+
+ if(report)
+ {
+ std::cout << "Samples: " << samples << " (total) / " << minimal_samples << " (minimal)" << std::endl;
+ std::cout << "Upper bounds: " << max_samples << " (total) / " << runs << " (minimal)" << std::endl;
+ std::cout << "Sum of minimal samples: " << (minimal_sum + nonstrict_minimal_sum) << " (total) / " << minimal_sum << " (strict)" << std::endl;
+ std::cout << std::endl;
+ }
+
+ this->deleteIterators(iters);
+
+ return samples;
}
usint
-RLCSA::reportSize(bool print)
+RLCSA::lcp(usint index, const LCPSamples& lcp_samples) const
{
- usint bytes = 0, temp = 0;
+ if(index >= this->data_size) { return this->data_size; }
- for(usint c = 0; c < CHARS; c++)
+ usint offset = 0;
+ index += this->number_of_sequences;
+ while(true)
{
- if(this->array[c]) { temp += this->array[c]->reportSize(); }
+ pair_type next_sample = lcp_samples.getFirstSampleAfter(index - this->number_of_sequences);
+ next_sample.first += this->number_of_sequences;
+ if(next_sample.first == index)
+ {
+ return lcp_samples.getSample(next_sample.second) + offset;
+ }
+ index = this->psi(index - this->number_of_sequences);
+ offset++;
}
- if(print) { std::cout << "Psi array: " << temp << std::endl; }
- bytes += temp;
+}
- if(this->support_locate || this->support_display)
+usint
+RLCSA::lcp(usint index, const LCPSamples& lcp_samples, const RLEVector& plcp) const
+{
+ if(index >= this->data_size) { return this->data_size; }
+
+ usint offset = 0;
+ while(true)
{
- temp = this->sa_samples->reportSize();
- if(print) { std::cout << "SA samples: " << temp << std::endl; }
- bytes += temp;
+ usint c = this->getCharacter(index);
+ PsiVector::Iterator iter(*(this->array[c]));
+
+ if(index == this->index_ranges[c].first)
+ {
+ return lcp_samples.getSampleAtPosition(index) + offset;
+ }
+ usint next_y = this->psiUnsafe(index - 1, c, iter);
+ usint next_x = this->psiUnsafeNext(c, iter);
+ if(next_y != next_x - 1 || next_x < this->number_of_sequences)
+ {
+ pair_type sample = lcp_samples.getFirstSampleAfter(index);
+ if(sample.first == index)
+ {
+ return lcp_samples.getSample(sample.second) + offset;
+ }
+ }
+
+ pair_type next_sample = this->sa_samples->getFirstSampleAfter(index);
+ if(next_sample.first == index)
+ {
+ index = this->sa_samples->getSample(next_sample.second) - offset;
+ RLEVector::Iterator plcp_iter(plcp);
+ return plcp_iter.select(index) - 2 * index;
+ }
+ index = next_x - this->number_of_sequences;
+ offset++;
+ }
+}
+
+usint
+RLCSA::lcpDirect(usint index) const
+{
+ if(index >= this->data_size || index == 0)
+ {
+ return 0;
}
- temp = sizeof(*this) + this->end_points->reportSize();
- if(print) { std::cout << "RLCSA overhead: " << temp << std::endl; }
- bytes += temp;
+ usint match = index - 1;
+ usint value = 0;
- if(print)
+ usint c = this->getCharacter(index);
+ usint low = this->index_ranges[c].first;
+ while(match >= low)
{
- std::cout << "Total size: " << bytes << std::endl;
- std::cout << std::endl;
+ PsiVector::Iterator iter(*(this->array[c]));
+ match = this->psiUnsafe(match, c, iter);
+ index = this->psiUnsafe(index, c, iter);
+ value++;
+
+ if(match < this->number_of_sequences || index < this->number_of_sequences)
+ {
+ break;
+ }
+ match -= this->number_of_sequences;
+ index -= this->number_of_sequences;
+
+ c = this->getCharacter(index);
+ low = this->index_ranges[c].first;
}
- return bytes;
+ return value;
+}
+
+//--------------------------------------------------------------------------
+
+void
+RLCSA::mergeEndPoints(RLCSA& index, RLCSA& increment)
+{
+ DeltaEncoder* endings = new DeltaEncoder(RLCSA::ENDPOINT_BLOCK_SIZE);
+
+ DeltaVector::Iterator index_iter(*(index.end_points));
+ DeltaVector::Iterator increment_iter(*(increment.end_points));
+
+ endings->setBit(index_iter.select(0));
+ for(usint i = 1; i < index.number_of_sequences; i++)
+ {
+ endings->setBit(index_iter.selectNext());
+ }
+ usint sum = index.end_points->getSize();
+ delete index.end_points; index.end_points = 0;
+
+ endings->setBit(sum + increment_iter.select(0));
+ for(usint i = 1; i < increment.number_of_sequences; i++)
+ {
+ endings->setBit(sum + increment_iter.selectNext());
+ }
+ sum += increment.end_points->getSize();
+ delete increment.end_points; increment.end_points = 0;
+
+ this->end_points = new DeltaVector(*endings, sum);
+ delete endings;
+}
+
+
+void
+RLCSA::mergeSamples(RLCSA& index, RLCSA& increment, usint* positions)
+{
+ if(this->support_locate || this->support_display)
+ {
+ positions += increment.number_of_sequences;
+ this->sa_samples = new SASamples(*(index.sa_samples), *(increment.sa_samples), positions, increment.data_size, this->number_of_sequences);
+ }
}
//--------------------------------------------------------------------------