X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=incbwt%2Frlcsa.cpp;h=47d2661d6c9a07c4ac2b558867390dacb68c03ee;hb=13e254b7c0ee22dffbc7c3125cee0408f9b375da;hp=8e3d2d908c4e954907e406eb5d6484db8972cb1f;hpb=e4b6bdc7cc2a1372e4d4dae50acac55cebcc7e9b;p=SXSI%2FTextCollection.git diff --git a/incbwt/rlcsa.cpp b/incbwt/rlcsa.cpp index 8e3d2d9..47d2661 100644 --- a/incbwt/rlcsa.cpp +++ b/incbwt/rlcsa.cpp @@ -1,12 +1,20 @@ #include #include -#include +#include #include #include +#include #include "rlcsa.h" #include "misc/utils.h" #include "qsufsort/qsufsort.h" +#include "bits/vectors.h" + + +#ifdef MULTITHREAD_SUPPORT +#include +#endif + namespace CSA @@ -32,11 +40,11 @@ RLCSA::RLCSA(const std::string& base_name, bool print) : array_file.read((char*)distribution, CHARS * sizeof(usint)); this->buildCharIndexes(distribution); - Parameters parameters; - parameters.read(base_name + PARAMETERS_EXTENSION); +// Parameters parameters; +// parameters.read(base_name + PARAMETERS_EXTENSION); for(usint c = 0; c < CHARS; c++) { - if(!isEmpty(this->index_ranges[c])) { this->array[c] = new RLEVector(array_file); } + if(!isEmpty(this->index_ranges[c])) { this->array[c] = new PsiVector(array_file); } } this->end_points = new DeltaVector(array_file); @@ -45,8 +53,8 @@ RLCSA::RLCSA(const std::string& base_name, bool print) : array_file.read((char*)&(this->sample_rate), sizeof(this->sample_rate)); array_file.close(); - this->support_locate = parameters.get(SUPPORT_LOCATE); - this->support_display = parameters.get(SUPPORT_DISPLAY); + this->support_locate = true;// parameters.get(SUPPORT_LOCATE); + this->support_display = true; //parameters.get(SUPPORT_DISPLAY); if(this->support_locate || this->support_display) { @@ -61,7 +69,7 @@ RLCSA::RLCSA(const std::string& base_name, bool print) : sa_sample_file.close(); } - if(print) { parameters.print(); } +// if(print) { parameters.print(); } this->ok = true; } @@ -165,10 +173,10 @@ RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, b // Build suffix array. - sint* inverse = new sint[bytes + 1]; + int* inverse = new int[bytes + 1]; if(multiple_sequences) { - sint zeros = 0; + int zeros = 0; for(usint i = 0; i < bytes; i++) { if(data[i] == 0) @@ -178,35 +186,29 @@ RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, b } else { - inverse[i] = (sint)data[i] + this->number_of_sequences; + inverse[i] = (int)(data[i] + this->number_of_sequences); } } } else { - for(usint i = 0; i < bytes; i++) { inverse[i] = (sint)data[i]; } + for(usint i = 0; i < bytes; i++) { inverse[i] = (int)data[i]; } } if(delete_data) { delete[] data; } - sint* sa = new sint[bytes + 1]; + int* sa = new int[bytes + 1]; suffixsort(inverse, sa, bytes, high + 1, low); // Sample SA. - usint incr = (multiple_sequences ? this->number_of_sequences + 1 : 1); // No e_of_s markers in SA. if(this->support_locate || this->support_display) { if(multiple_sequences) { - std::cout << "We shouldn't be here!" << std::endl; - // Real SA starts at sa + incr. - // for each sequence - // find starting position - // sample relative to starting position - // sort samples to SA order + this->sa_samples = new SASamples((uint*)inverse, this->end_points, this->data_size, this->sample_rate); } else { - this->sa_samples = new SASamples((usint*)(sa + incr), this->data_size, this->sample_rate); + this->sa_samples = new SASamples((uint*)(sa + 1), this->data_size, this->sample_rate); } } @@ -220,14 +222,15 @@ RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, b // Build RLCSA. + usint incr = (multiple_sequences ? this->number_of_sequences + 1 : 1); // No e_of_s markers in SA. usint decr = (multiple_sequences ? 1 : 0); // Ignore the last e_of_s marker if multiple sequences. for(usint c = 0; c < CHARS; c++) { if(distribution[c] == 0) { this->array[c] = 0; continue; } - usint* curr = (usint*)(sa + index_ranges[c].first + incr); - usint* limit = (usint*)(sa + index_ranges[c].second + incr + 1); - RLEEncoder encoder(block_size); + uint* curr = (uint*)(sa + index_ranges[c].first + incr); + uint* limit = (uint*)(sa + index_ranges[c].second + incr + 1); + PsiVector::Encoder encoder(block_size); pair_type run(*curr, 1); curr++; @@ -242,7 +245,7 @@ RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, b } encoder.setRun(run.first - decr, run.second); - this->array[c] = new RLEVector(encoder, this->data_size + incr - decr); + this->array[c] = new PsiVector(encoder, this->data_size + incr - decr); } delete[] sa; @@ -250,7 +253,7 @@ RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, b this->ok = true; } -RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size) : +RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size, usint threads) : ok(false), sa_samples(0), end_points(0) @@ -269,7 +272,6 @@ RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size) if(index.sample_rate != increment.sample_rate) { std::cerr << "RLCSA: Cannot combine indexes with different sample rates!" << std::endl; - std::cout << "Index: " << index.sample_rate << ", increment: " << increment.sample_rate << std::endl; return; } @@ -291,61 +293,40 @@ RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size) } this->buildCharIndexes(distribution); this->sample_rate = index.sample_rate; - - - // Combine end points of sequences. this->number_of_sequences = index.number_of_sequences + increment.number_of_sequences; - DeltaEncoder* endings = new DeltaEncoder(RLCSA::ENDPOINT_BLOCK_SIZE); - - endings->setBit(index.end_points->select(0)); - for(usint i = 1; i < index.number_of_sequences; i++) - { - endings->setBit(index.end_points->selectNext()); - } - usint sum = index.end_points->getSize(); - delete index.end_points; index.end_points = 0; - - endings->setBit(sum + increment.end_points->select(0)); - for(usint i = 1; i < increment.number_of_sequences; i++) - { - endings->setBit(sum + increment.end_points->selectNext()); - } - sum += increment.end_points->getSize(); - delete increment.end_points; increment.end_points = 0; - - this->end_points = new DeltaVector(*endings, sum); - delete endings; - - // Combine Psi. + // Merge end points, SA samples, and Psi. usint psi_size = this->data_size + this->number_of_sequences; - for(usint c = 0; c < CHARS; c++) - { - if(distribution[c] == 0) { this->array[c] = 0; continue; } - this->array[c] = mergeVectors(index.array[c], increment.array[c], positions, increment.data_size + increment.number_of_sequences, psi_size, block_size); - index.array[c] = 0; - increment.array[c] = 0; - - if(this->array[c] == 0) - { - std::cerr << "RLCSA: Merge failed for vectors " << c << "!" << std::endl; - return; - } - } - + bool should_be_ok = true; - // Combine suffix array samples. - if(this->support_locate || this->support_display) + #ifdef MULTITHREAD_SUPPORT + omp_set_num_threads(threads); + #pragma omp parallel { - positions += increment.number_of_sequences; - for(usint i = 0; i < increment.data_size; i++) + #pragma omp for schedule(dynamic, 1) + #endif + for(int c = -2; c < (int)CHARS; c++) { - positions[i] -= this->number_of_sequences; + if(c == -2) { this->mergeEndPoints(index, increment); } + else if(c == -1) { this->mergeSamples(index, increment, positions); } + else if(distribution[c] != 0) + { + this->array[c] = mergeVectors(index.array[c], increment.array[c], positions, increment.data_size + increment.number_of_sequences, psi_size, block_size); + index.array[c] = 0; + increment.array[c] = 0; + + if(this->array[c] == 0) + { + std::cerr << "RLCSA: Merge failed for vectors " << c << "!" << std::endl; + should_be_ok = false; + } + } } - this->sa_samples = new SASamples(*(index.sa_samples), *(increment.sa_samples), positions, increment.data_size); + #ifdef MULTITHREAD_SUPPORT } + #endif - this->ok = true; + this->ok = should_be_ok; } RLCSA::~RLCSA() @@ -355,8 +336,10 @@ RLCSA::~RLCSA() delete this->end_points; } +//-------------------------------------------------------------------------- + void -RLCSA::writeTo(const std::string& base_name) +RLCSA::writeTo(const std::string& base_name) const { std::string array_name = base_name + ARRAY_EXTENSION; std::ofstream array_file(array_name.c_str(), std::ios_base::binary); @@ -400,7 +383,7 @@ RLCSA::writeTo(const std::string& base_name) } bool -RLCSA::isOk() +RLCSA::isOk() const { return this->ok; } @@ -408,24 +391,21 @@ RLCSA::isOk() //-------------------------------------------------------------------------- pair_type -RLCSA::count(const std::string& pattern) +RLCSA::count(const std::string& pattern) const { if(pattern.length() == 0) { return pair_type(0, this->data_size - 1); } - pair_type index_range = this->index_ranges[(usint)*(pattern.rbegin())]; - if(isEmpty(index_range)) { return index_range; } + std::string::const_reverse_iterator iter = pattern.rbegin(); + + pair_type index_range = this->index_ranges[(uchar)*iter]; + if(isEmpty(index_range)) { return EMPTY_PAIR; } index_range.first += this->number_of_sequences; index_range.second += this->number_of_sequences; - for(std::string::const_reverse_iterator iter = ++pattern.rbegin(); iter != pattern.rend(); iter++) + for(++iter; iter != pattern.rend(); ++iter) { - RLEVector* vector = this->array[(usint)*iter]; - usint start = this->index_ranges[(usint)*iter].first; - - index_range.first = start + vector->rank(index_range.first, true) - 1 + this->number_of_sequences; - index_range.second = start + vector->rank(index_range.second) - 1 + this->number_of_sequences; - - if(isEmpty(index_range)) { return index_range; } + index_range = this->backwardSearchStep(index_range, (uchar)*iter); + if(isEmpty(index_range)) { return EMPTY_PAIR; } } // Suffix array indexes are 0-based. @@ -435,20 +415,23 @@ RLCSA::count(const std::string& pattern) return index_range; } +//-------------------------------------------------------------------------- + void -RLCSA::reportPositions(uchar* data, usint length, usint* positions) +RLCSA::reportPositions(uchar* data, usint length, usint* positions) const { if(data == 0 || length == 0 || positions == 0) { return; } + PsiVector::Iterator** iters = this->getIterators(); + usint current = this->number_of_sequences - 1; positions[length] = current; // "immediately after current" for(sint i = (sint)(length - 1); i >= 0; i--) { -// positions[i] = current; // "immediately after current" usint c = (usint)data[i]; - if(array[c]) + if(this->array[c] != 0) { - current = this->index_ranges[c].first + this->array[c]->rank(current) - 1 + this->number_of_sequences; + current = this->LF(current, c, *(iters[c])); } else { @@ -463,24 +446,25 @@ RLCSA::reportPositions(uchar* data, usint length, usint* positions) } positions[i] = current; // "immediately after current" } + + this->deleteIterators(iters); } //-------------------------------------------------------------------------- -LocateItem* -RLCSA::locate(pair_type range) +usint* +RLCSA::locate(pair_type range) const { if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size) { return 0; } - LocateItem* data = new LocateItem[range.second + 1 - range.first]; - if(!data) { return 0; } + usint* data = new usint[length(range)]; this->locateUnsafe(range, data); return data; } -LocateItem* -RLCSA::locate(pair_type range, LocateItem* data) +usint* +RLCSA::locate(pair_type range, usint* data) const { if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size || data == 0) { return 0; } this->locateUnsafe(range, data); @@ -488,14 +472,19 @@ RLCSA::locate(pair_type range, LocateItem* data) } void -RLCSA::locateUnsafe(pair_type range, LocateItem* data) +RLCSA::locateUnsafe(pair_type range, usint* data) const { - usint items = range.second + 1 - range.first; + usint items = length(range); + usint* offsets = new usint[items]; + bool* finished = new bool[items]; // FIXME This could be more space efficient... + + PsiVector::Iterator** iters = this->getIterators(); + for(usint i = 0, j = range.first; i < items; i++, j++) { - data[i].value = j + this->number_of_sequences; - data[i].offset = 0; - data[i].found = false; + data[i] = j + this->number_of_sequences; + offsets[i] = 0; + finished[i] = false; } bool found = false; @@ -505,7 +494,7 @@ RLCSA::locateUnsafe(pair_type range, LocateItem* data) pair_type run = EMPTY_PAIR; for(usint i = 0; i < items; i++) { - if(data[i].found) + if(finished[i]) { continue; // The run might continue after this. } @@ -513,22 +502,26 @@ RLCSA::locateUnsafe(pair_type range, LocateItem* data) { run = pair_type(i, i); } - else if(data[i].value - data[run.first].value == i - run.first) + else if(data[i] - data[run.first] == i - run.first) { run.second = i; } else { - found &= this->processRun(run, data); + found &= this->processRun(run, data, offsets, finished, iters); run = pair_type(i, i); } } - if(!isEmpty(run)) { found &= this->processRun(run, data); } + if(!isEmpty(run)) { found &= this->processRun(run, data, offsets, finished, iters); } } + + this->deleteIterators(iters); + delete[] offsets; + delete[] finished; } bool -RLCSA::processRun(pair_type run, LocateItem* data) +RLCSA::processRun(pair_type run, usint* data, usint* offsets, bool* finished, PsiVector::Iterator** iters) const { bool found = true; usint run_start = 0, run_left = 0; @@ -536,44 +529,45 @@ RLCSA::processRun(pair_type run, LocateItem* data) for(usint i = run.first; i <= run.second; i++) { - if(data[i].found) + if(finished[i]) { if(run_left > 0) { run_left--; } continue; } - if(data[i].value < this->number_of_sequences) // Implicit sample here. + if(data[i] < this->number_of_sequences) // Implicit sample here. { - data[i].value = this->end_points->select(data[i].value) + 1 - data[i].offset; - data[i].found = true; + DeltaVector::Iterator iter(*(this->end_points)); + data[i] = iter.select(data[i]) + 1 - offsets[i]; + finished[i] = true; if(run_left > 0) { run_left--; } continue; } - if(next_sample.first < data[i].value) // Need another sample. + if(next_sample.first < data[i]) // Need another sample. { - next_sample = this->sa_samples->getFirstSampleAfter(data[i].value - this->number_of_sequences); + next_sample = this->sa_samples->getFirstSampleAfter(data[i] - this->number_of_sequences); next_sample.first += this->number_of_sequences; } - if(data[i].value < next_sample.first) // No sample found for current position. + if(data[i] < next_sample.first) // No sample found for current position. { if(run_left > 0) { - data[i].value = data[run_start].value + i - run_start; + data[i] = data[run_start] + i - run_start; run_left--; } else { - pair_type value = this->psi(data[i].value - this->number_of_sequences, run.second - i); - data[i].value = value.first; + pair_type value = this->psi(data[i] - this->number_of_sequences, run.second - i, iters); + data[i] = value.first; run_left = value.second; run_start = i; } - data[i].offset++; + offsets[i]++; found = false; } else // Sampled position found. { - data[i].value = this->sa_samples->getSample(next_sample.second) - data[i].offset; - data[i].found = true; + data[i] = this->sa_samples->getSample(next_sample.second) - offsets[i]; + finished[i] = true; if(run_left > 0) { run_left--; } } } @@ -581,7 +575,7 @@ RLCSA::processRun(pair_type run, LocateItem* data) } usint -RLCSA::locate(usint index) +RLCSA::locate(usint index) const { if(!(this->support_locate) || index >= this->data_size) { return this->data_size; } @@ -591,7 +585,8 @@ RLCSA::locate(usint index) { if(index < this->number_of_sequences) // Implicit sample here { - return this->end_points->select(index) + 1 - offset; + DeltaVector::Iterator iter(*(this->end_points)); + return iter.select(index) + 1 - offset; } pair_type next_sample = this->sa_samples->getFirstSampleAfter(index - this->number_of_sequences); next_sample.first += this->number_of_sequences; @@ -607,29 +602,43 @@ RLCSA::locate(usint index) //-------------------------------------------------------------------------- uchar* -RLCSA::display(usint sequence, pair_type range) +RLCSA::display(usint sequence) const +{ + if(!(this->support_display)) { return 0; } + + pair_type seq_range = this->getSequenceRange(sequence); + if(isEmpty(seq_range)) { return 0; } + + uchar* data = new uchar[length(seq_range)+1]; + this->displayUnsafe(seq_range, data); + data[length(seq_range)] = 0; + + return data; +} + +uchar* +RLCSA::display(usint sequence, pair_type range) const { if(!(this->support_display) || isEmpty(range)) { return 0; } - pair_type seq_range = this->getSequence(sequence); + pair_type seq_range = this->getSequenceRange(sequence); if(isEmpty(seq_range)) { return 0; } range.first += seq_range.first; range.second += seq_range.first; if(range.second > seq_range.second) { return 0; } - uchar* data = new uchar[range.second + 1 - range.first]; - if(!data) { return 0; } + uchar* data = new uchar[length(range)]; this->displayUnsafe(range, data); return data; } uchar* -RLCSA::display(usint sequence, pair_type range, uchar* data) +RLCSA::display(usint sequence, pair_type range, uchar* data) const { if(!(this->support_display) || isEmpty(range) || data == 0) { return 0; } - pair_type seq_range = this->getSequence(sequence); + pair_type seq_range = this->getSequenceRange(sequence); if(isEmpty(seq_range)) { return 0; } range.first += seq_range.first; range.second += seq_range.first; @@ -639,143 +648,662 @@ RLCSA::display(usint sequence, pair_type range, uchar* data) return data; } +uchar* +RLCSA::display(usint position, usint len, usint context, usint& result_length) const +{ + if(!(this->support_display)) { return 0; } + + pair_type range = this->getSequenceRangeForPosition(position); + if(isEmpty(range)) { return 0; } + + range.first = position - std::min(context, position - range.first); + range.second = std::min(range.second, position + len + context - 1); + result_length = length(range); + if(isEmpty(range)) { return 0; } + + uchar* data = new uchar[length(range)]; + this->displayUnsafe(range, data); + + return data; +} + +usint +RLCSA::displayPrefix(usint sequence, usint len, uchar* data) const +{ + if(!(this->support_display) || len == 0 || data == 0) { return 0; } + + pair_type seq_range = this->getSequenceRange(sequence); + if(isEmpty(seq_range)) { return 0; } + + pair_type range(seq_range.first, std::min(seq_range.second, seq_range.first + len - 1)); + + this->displayUnsafe(range, data); + return length(range); +} + +usint +RLCSA::displayFromPosition(usint index, usint max_len, uchar* data) const +{ + if(max_len == 0 || data == 0 || index >= this->data_size) { return 0; } + + for(usint i = 0; i < max_len; i++) + { + data[i] = this->getCharacter(index); + index = this->psiUnsafe(index, data[i]); + if(index < this->number_of_sequences) { return i + 1; } + index -= this->number_of_sequences; + } + + return max_len; +} + void -RLCSA::displayUnsafe(pair_type range, uchar* data) +RLCSA::displayUnsafe(pair_type range, uchar* data) const { usint i = range.first - range.first % this->sa_samples->getSampleRate(); usint pos = this->sa_samples->inverseSA(i); - for(; i < range.first; i++) + if(length(range) >= 1024) { - pos = this->psi(pos) - this->number_of_sequences; + PsiVector::Iterator** iters = this->getIterators(); + for(; i < range.first; i++) + { + pos = this->psi(pos, iters) - this->number_of_sequences; + } + for(; i <= range.second; i++) + { + usint c = this->getCharacter(pos); + data[i - range.first] = c; + pos = this->psiUnsafe(pos, c, *(iters[c])) - this->number_of_sequences; + } + this->deleteIterators(iters); } - for(; i <= range.second; i++) + else { - data[i - range.first] = this->getCharacter(pos); - pos = this->psi(pos) - this->number_of_sequences; + for(; i < range.first; i++) + { + pos = this->psi(pos) - this->number_of_sequences; + } + for(; i <= range.second; i++) + { + usint c = this->getCharacter(pos); + data[i - range.first] = c; + pos = this->psiUnsafe(pos, c) - this->number_of_sequences; + } } } //-------------------------------------------------------------------------- -void -RLCSA::decompressInto(std::ofstream& psi_file) +pair_type +RLCSA::getSequenceRange(usint number) const { - for(usint c = 0; c < CHARS; c++) + if(number >= this->number_of_sequences) { return EMPTY_PAIR; } + + pair_type result; + DeltaVector::Iterator iter(*(this->end_points)); + if(number == 0) { - if(!(this->array[c])) { continue; } - usint value = this->array[c]->select(0); - psi_file.write((char*)&(value), sizeof(value)); - while(this->array[c]->hasNext()) - { - value = this->array[c]->selectNext(); - psi_file.write((char*)&value, sizeof(value)); - } + result.first = 0; + result.second = iter.select(number); + } + else + { + result.first = nextMultipleOf(this->sample_rate, iter.select(number - 1)); + result.second = iter.selectNext(); + } + + return result; +} + +usint +RLCSA::getSequenceForPosition(usint value) const +{ + if(value == 0) { return 0; } + DeltaVector::Iterator iter(*(this->end_points)); + return iter.rank(value - 1); +} + +pair_type +RLCSA::getSequenceRangeForPosition(usint value) const +{ + return this->getSequenceRange(this->getSequenceForPosition(value)); +} + +usint* +RLCSA::getSequenceForPosition(usint* values, usint len) const +{ + if(values == 0) { return 0; } + + DeltaVector::Iterator iter(*(this->end_points)); + for(usint i = 0; i < len; i++) + { + if(values[i] > 0) { values[i] = iter.rank(values[i] - 1); } + } + + return values; +} + +pair_type +RLCSA::getRelativePosition(usint value) const +{ + DeltaVector::Iterator iter(*(this->end_points)); + pair_type result(0, value); + + if(value > 0) { result.first = iter.rank(value - 1); } + if(result.first > 0) + { + result.second -= nextMultipleOf(this->sample_rate, iter.select(result.first - 1)); } + + return result; +} + +//-------------------------------------------------------------------------- + +uchar* +RLCSA::readBWT() const +{ + return this->readBWT(pair_type(0, this->data_size + this->number_of_sequences - 1)); } uchar* -RLCSA::readBWT() +RLCSA::readBWT(pair_type range) const { - usint n = this->data_size + this->number_of_sequences; + if(isEmpty(range) || range.second >= this->data_size + this->number_of_sequences) { return 0; } + + usint n = length(range); uchar* bwt = new uchar[n]; memset(bwt, 0, n); for(usint c = 0; c < CHARS; c++) { - RLEVector* vector = this->array[c]; - if(vector != 0) + if(this->array[c] != 0) { - bwt[vector->select(0)] = c; - while(vector->hasNext()) { bwt[vector->selectNext()] = c; } + PsiVector::Iterator iter(*(this->array[c])); + usint pos = iter.valueAfter(range.first).first; + while(pos <= range.second) + { + bwt[pos - range.first] = c; + pos = iter.selectNext(); + } } } return bwt; } +usint +RLCSA::countRuns() const +{ + usint runs = 0; + for(usint c = 0; c < CHARS; c++) + { + if(this->array[c] != 0) + { + PsiVector::Iterator iter(*(this->array[c])); + runs += iter.countRuns(); + } + } + + return runs; +} + //-------------------------------------------------------------------------- -usint -RLCSA::psi(usint index) +PsiVector::Iterator** +RLCSA::getIterators() const { - if(index >= this->data_size) + PsiVector::Iterator** iters = new PsiVector::Iterator*[CHARS]; + for(usint i = 0; i < CHARS; i++) { - return this->data_size + this->number_of_sequences; + if(this->array[i] == 0) { iters[i] = 0; } + else { iters[i] = new PsiVector::Iterator(*(this->array[i])); } } + return iters; +} - usint c = this->getCharacter(index); - return this->array[c]->select(index - this->index_ranges[c].first); +void +RLCSA::deleteIterators(PsiVector::Iterator** iters) const +{ + if(iters == 0) { return; } + for(usint i = 0; i < CHARS; i++) { delete iters[i]; } + delete[] iters; } -pair_type -RLCSA::psi(usint index, usint max_length) +//-------------------------------------------------------------------------- + +usint +RLCSA::reportSize(bool print) const { - if(index >= this->data_size) + usint bytes = 0, temp = 0, bwt = 0; + + for(usint c = 0; c < CHARS; c++) + { + if(this->array[c]) + { + bytes += this->array[c]->reportSize(); + bwt += this->array[c]->getCompressedSize(); + } + } + bytes += sizeof(*this) + this->end_points->reportSize(); + if(print) { - return pair_type(this->data_size + this->number_of_sequences, 0); + std::cout << "RLCSA: " << (bytes / (double)MEGABYTE) << " MB" << std::endl; + std::cout << " BWT only: " << (bwt / (double)MEGABYTE) << " MB" << std::endl; } - usint c = this->getCharacter(index); - return this->array[c]->selectRun(index - this->index_ranges[c].first, max_length); + if(this->support_locate || this->support_display) + { + temp = this->sa_samples->reportSize(); + if(print) { std::cout << "SA samples: " << (temp / (double)MEGABYTE) << " MB" << std::endl; } + bytes += temp; + } + + if(print) + { + std::cout << "Total size: " << (bytes / (double)MEGABYTE) << " MB" << std::endl; + std::cout << std::endl; + } + + return bytes; +} + +void +RLCSA::printInfo() const +{ + double megabytes = this->data_size / (double)MEGABYTE; + + std::cout << "Sequences: " << this->number_of_sequences << std::endl; + std::cout << "Original size: " << megabytes << " MB" << std::endl; + std::cout << "Block size: " << (this->getBlockSize() * sizeof(usint)) << " bytes" << std::endl; + if(this->support_locate || this->support_display) + { + std::cout << "Sample rate: " << this->sample_rate << std::endl; + } + std::cout << std::endl; } //-------------------------------------------------------------------------- -pair_type -RLCSA::getSequence(usint number) +RLEVector* +RLCSA::buildPLCP(usint block_size) const { - if(number >= this->number_of_sequences) { return EMPTY_PAIR; } + if(block_size < 2 * sizeof(usint) || block_size % sizeof(usint) != 0) + { + std::cerr << "PLCP: Block size must be a multiple of " << sizeof(usint) << " bytes!" << std::endl; + return 0; + } - pair_type result; - if(number == 0) + PsiVector::Iterator** iters = this->getIterators(); + + RLEEncoder plcp(block_size); + std::list matches; + pair_type prev_range = EMPTY_PAIR; + for(usint j = 0; j < this->number_of_sequences; j++) { - result.first = 0; - result.second = this->end_points->select(number); + // Encode the padding as a single run. + pair_type seq_range = this->getSequenceRange(j); + if(j > 0 && prev_range.second + 1 < seq_range.first) + { + this->encodePLCPRun(plcp, prev_range.second + 1, seq_range.first, seq_range.first - prev_range.second - 2); + } + prev_range = seq_range; + + usint maximal = seq_range.first; + usint x = this->sa_samples->inverseSA(seq_range.first), next_x; + + // Invariant: x == inverseSA(i) + for(usint i = seq_range.first; i <= seq_range.second; i++, x = next_x) + { + usint c = this->getCharacter(x); + + // T[i,n] is lexicographically the first suffix beginning with c. + if(x == this->index_ranges[c].first) + { + if(!matches.empty()) + { + maximal = (*(matches.begin())).first; + matches.clear(); + } + this->encodePLCPRun(plcp, maximal, i + 1, i - maximal); + maximal = i + 1; + next_x = this->psiUnsafe(x, c, *(iters[c])) - this->number_of_sequences; + continue; + } + + // Process the previous left matches we are still following. + usint low = this->index_ranges[c].first + this->number_of_sequences; + usint start, stop; start = stop = maximal; + for(std::list::iterator iter = matches.begin(); iter != matches.end();) + { + pair_type match = *iter; + if(match.second < low) // These no longer match the current character. + { + if(match.first < start) { start = match.first; } // Form a single run from then. + iter = matches.erase(iter); + } + else + { + if(match.first < stop) { stop = match.first; } // End of the run to be encoded. + (*iter).second = this->psiUnsafe(match.second - this->number_of_sequences, c, *(iters[c])); + ++iter; + } + } + if(start < stop) { this->encodePLCPRun(plcp, start, stop, i - start); } + + // If PLCP[i] is minimal, we add the left match to the list. + // We add pairs of type (j, inverseSA(j) + n_of_s) as inverseSA is < 0 for end markers. + usint next_y = this->psiUnsafe(x - 1, c, *(iters[c])); + next_x = this->psiUnsafeNext(c, *(iters[c])) - this->number_of_sequences; + if(next_y != next_x + this->number_of_sequences - 1) + { + matches.push_back(pair_type(maximal, next_y)); + maximal = i + 1; + } + } + + if(!matches.empty()) + { + maximal = (*(matches.begin())).first; + matches.clear(); + } + if(maximal <= seq_range.second) + { + this->encodePLCPRun(plcp, maximal, seq_range.second + 1, seq_range.second + 1 - maximal); + } } - else + + this->deleteIterators(iters); + + return new RLEVector(plcp, 2 * (this->end_points->getSize() + 1)); +} + +usint +RLCSA::sampleLCP(usint sample_rate, pair_type*& sampled_values, bool report) const +{ + if(sample_rate == 0) { - if(this->sa_samples == 0) { return EMPTY_PAIR; } - usint d = this->sa_samples->getSampleRate(); - result.first = d * ((this->end_points->select(number - 1) / d) + 1); - result.second = this->end_points->selectNext(); + sample_rate = this->data_size + 1; } - return result; + PsiVector::Iterator** iters = this->getIterators(); + + usint runs = this->countRuns(); + usint samples = 0, minimal_sum = 0, nonstrict_minimal_sum = 0, minimal_samples = 0; + usint max_samples = runs + (this->data_size - runs) / sample_rate; + sampled_values = new pair_type[max_samples]; + + std::list matches; + for(usint j = 0; j < this->number_of_sequences; j++) + { + pair_type seq_range = this->getSequenceRange(j); + usint first_sample = samples; // First minimal sample of the current sequence. + usint minimal = seq_range.second + 1; // Last minimal position. + usint i, x = this->sa_samples->inverseSA(seq_range.first), next_x, prev_x = 0; + + // Invariant: x == inverseSA(i) + for(i = seq_range.first; i <= seq_range.second; i++, x = next_x) + { + usint c = this->getCharacter(x); + + // T[i,n] is lexicographically the first suffix beginning with c. + if(x == this->index_ranges[c].first) + { + if(!matches.empty()) + { + for(std::list::iterator iter = matches.begin(); iter != matches.end(); ++iter) + { + nonstrict_minimal_sum += i - (*iter).first; + } + matches.clear(); + } + + // Minimal sample: SA[x] = i, LCP[x] = 0 + sampled_values[samples] = pair_type(x, 0); minimal = i; + samples++; minimal_samples++; + next_x = this->psiUnsafe(x, c, *(iters[c])) - this->number_of_sequences; prev_x = x; + continue; + } + + // Process the previous left matches we are still following. + usint low = this->index_ranges[c].first + this->number_of_sequences; + pair_type potential_sample(this->data_size, this->data_size); + for(std::list::iterator iter = matches.begin(); iter != matches.end();) + { + Triple match = *iter; + if(match.second < low) // These no longer match the current character. + { + if(potential_sample.first != this->data_size) { nonstrict_minimal_sum += potential_sample.second; } + + // Potential minimal sample: SA[match.third] = match.first, LCP[match.third] = i - match.first + potential_sample = pair_type(match.third, i - match.first); + iter = matches.erase(iter); + } + else + { + (*iter).second = this->psiUnsafe(match.second - this->number_of_sequences, c, *(iters[c])); + ++iter; + } + } + if(potential_sample.first != this->data_size) + { + // Last potential sample is minimal. + sampled_values[samples] = potential_sample; + samples++; minimal_sum += potential_sample.second; minimal_samples++; + } + + // If PLCP[i] is minimal, we add the left match to the list. + // We add triples of type (j, inverseSA(j) + n_of_s, x) as inverseSA is < 0 for end markers. + // x is the original minimal position. + usint next_y = this->psiUnsafe(x - 1, c, *(iters[c])); + next_x = this->psiUnsafeNext(c, *(iters[c])); + if(next_y != next_x - 1 || next_x < this->number_of_sequences) + { + matches.push_back(Triple(i, next_y, x)); minimal = i; + } + next_x -= this->number_of_sequences; prev_x = x; + } + + // Are we still following something? + if(!matches.empty()) + { + pair_type potential_sample(this->data_size, this->data_size); + for(std::list::iterator iter = matches.begin(); iter != matches.end(); ++iter) + { + Triple match = *iter; + if(potential_sample.first != this->data_size) { nonstrict_minimal_sum += potential_sample.second; } + + // Potential minimal sample: SA[match.third] = match.first, LCP[match.third] = i - match.first + potential_sample = pair_type(match.third, i - match.first); + } + matches.clear(); + if(potential_sample.first != this->data_size) + { + // Last potential sample is minimal. + sampled_values[samples] = potential_sample; + samples++; minimal_sum += potential_sample.second; minimal_samples++; + } + } + + // Add the non-minimal samples. + if(sample_rate <= this->data_size) + { + usint last_sample = samples - 1; + i = seq_range.first; x = this->sa_samples->inverseSA(seq_range.first); + for(usint current_sample = first_sample; current_sample <= last_sample; current_sample++) + { + // Find the next minimal sample and add nonminimal samples if needed. + pair_type first_nonminimal(i + sample_rate - 1, samples); + pair_type next_nonminimal = first_nonminimal; + while(x != sampled_values[current_sample].first) + { + if(i == next_nonminimal.first) + { + sampled_values[samples] = pair_type(x, 0); samples++; + next_nonminimal.first += sample_rate; next_nonminimal.second++; + } + i++; x = this->psi(x, iters) - this->number_of_sequences; + } + + // Reduce the nonminimal samples to the current minimal sample. + for(next_nonminimal = first_nonminimal; next_nonminimal.second < samples; next_nonminimal.first += sample_rate, next_nonminimal.second++) + { + sampled_values[next_nonminimal.second].second = sampled_values[current_sample].second + i - next_nonminimal.first; + } + i++; x = this->psi(x, iters) - this->number_of_sequences; + } + } + } + + std::sort(sampled_values, sampled_values + samples); + + if(report) + { + std::cout << "Samples: " << samples << " (total) / " << minimal_samples << " (minimal)" << std::endl; + std::cout << "Upper bounds: " << max_samples << " (total) / " << runs << " (minimal)" << std::endl; + std::cout << "Sum of minimal samples: " << (minimal_sum + nonstrict_minimal_sum) << " (total) / " << minimal_sum << " (strict)" << std::endl; + std::cout << std::endl; + } + + this->deleteIterators(iters); + + return samples; } usint -RLCSA::reportSize(bool print) +RLCSA::lcp(usint index, const LCPSamples& lcp_samples) const { - usint bytes = 0, temp = 0; + if(index >= this->data_size) { return this->data_size; } - for(usint c = 0; c < CHARS; c++) + usint offset = 0; + index += this->number_of_sequences; + while(true) { - if(this->array[c]) { temp += this->array[c]->reportSize(); } + pair_type next_sample = lcp_samples.getFirstSampleAfter(index - this->number_of_sequences); + next_sample.first += this->number_of_sequences; + if(next_sample.first == index) + { + return lcp_samples.getSample(next_sample.second) + offset; + } + index = this->psi(index - this->number_of_sequences); + offset++; } - if(print) { std::cout << "Psi array: " << temp << std::endl; } - bytes += temp; +} - if(this->support_locate || this->support_display) +usint +RLCSA::lcp(usint index, const LCPSamples& lcp_samples, const RLEVector& plcp) const +{ + if(index >= this->data_size) { return this->data_size; } + + usint offset = 0; + while(true) { - temp = this->sa_samples->reportSize(); - if(print) { std::cout << "SA samples: " << temp << std::endl; } - bytes += temp; + usint c = this->getCharacter(index); + PsiVector::Iterator iter(*(this->array[c])); + + if(index == this->index_ranges[c].first) + { + return lcp_samples.getSampleAtPosition(index) + offset; + } + usint next_y = this->psiUnsafe(index - 1, c, iter); + usint next_x = this->psiUnsafeNext(c, iter); + if(next_y != next_x - 1 || next_x < this->number_of_sequences) + { + pair_type sample = lcp_samples.getFirstSampleAfter(index); + if(sample.first == index) + { + return lcp_samples.getSample(sample.second) + offset; + } + } + + pair_type next_sample = this->sa_samples->getFirstSampleAfter(index); + if(next_sample.first == index) + { + index = this->sa_samples->getSample(next_sample.second) - offset; + RLEVector::Iterator plcp_iter(plcp); + return plcp_iter.select(index) - 2 * index; + } + index = next_x - this->number_of_sequences; + offset++; + } +} + +usint +RLCSA::lcpDirect(usint index) const +{ + if(index >= this->data_size || index == 0) + { + return 0; } - temp = sizeof(*this) + this->end_points->reportSize(); - if(print) { std::cout << "RLCSA overhead: " << temp << std::endl; } - bytes += temp; + usint match = index - 1; + usint value = 0; - if(print) + usint c = this->getCharacter(index); + usint low = this->index_ranges[c].first; + while(match >= low) { - std::cout << "Total size: " << bytes << std::endl; - std::cout << std::endl; + PsiVector::Iterator iter(*(this->array[c])); + match = this->psiUnsafe(match, c, iter); + index = this->psiUnsafe(index, c, iter); + value++; + + if(match < this->number_of_sequences || index < this->number_of_sequences) + { + break; + } + match -= this->number_of_sequences; + index -= this->number_of_sequences; + + c = this->getCharacter(index); + low = this->index_ranges[c].first; } - return bytes; + return value; +} + +//-------------------------------------------------------------------------- + +void +RLCSA::mergeEndPoints(RLCSA& index, RLCSA& increment) +{ + DeltaEncoder* endings = new DeltaEncoder(RLCSA::ENDPOINT_BLOCK_SIZE); + + DeltaVector::Iterator index_iter(*(index.end_points)); + DeltaVector::Iterator increment_iter(*(increment.end_points)); + + endings->setBit(index_iter.select(0)); + for(usint i = 1; i < index.number_of_sequences; i++) + { + endings->setBit(index_iter.selectNext()); + } + usint sum = index.end_points->getSize(); + delete index.end_points; index.end_points = 0; + + endings->setBit(sum + increment_iter.select(0)); + for(usint i = 1; i < increment.number_of_sequences; i++) + { + endings->setBit(sum + increment_iter.selectNext()); + } + sum += increment.end_points->getSize(); + delete increment.end_points; increment.end_points = 0; + + this->end_points = new DeltaVector(*endings, sum); + delete endings; +} + + +void +RLCSA::mergeSamples(RLCSA& index, RLCSA& increment, usint* positions) +{ + if(this->support_locate || this->support_display) + { + positions += increment.number_of_sequences; + this->sa_samples = new SASamples(*(index.sa_samples), *(increment.sa_samples), positions, increment.data_size, this->number_of_sequences); + } } //--------------------------------------------------------------------------