X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=incbwt%2Frlcsa.cpp;fp=incbwt%2Frlcsa.cpp;h=8e3d2d908c4e954907e406eb5d6484db8972cb1f;hb=40ddf9aca842bdc081b6350a4ebfe36b066c94c9;hp=0000000000000000000000000000000000000000;hpb=af8938dbee21244687184dd0502a84ce1af70c50;p=SXSI%2FTextCollection.git diff --git a/incbwt/rlcsa.cpp b/incbwt/rlcsa.cpp new file mode 100644 index 0000000..8e3d2d9 --- /dev/null +++ b/incbwt/rlcsa.cpp @@ -0,0 +1,839 @@ +#include +#include +#include +#include +#include + +#include "rlcsa.h" +#include "misc/utils.h" +#include "qsufsort/qsufsort.h" + + +namespace CSA +{ + + +RLCSA::RLCSA(const std::string& base_name, bool print) : + ok(false), + sa_samples(0), + end_points(0) +{ + for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; } + + std::string array_name = base_name + ARRAY_EXTENSION; + std::ifstream array_file(array_name.c_str(), std::ios_base::binary); + if(!array_file) + { + std::cerr << "RLCSA: Error opening Psi array file!" << std::endl; + return; + } + + usint distribution[CHARS]; + array_file.read((char*)distribution, CHARS * sizeof(usint)); + this->buildCharIndexes(distribution); + + Parameters parameters; + parameters.read(base_name + PARAMETERS_EXTENSION); + for(usint c = 0; c < CHARS; c++) + { + if(!isEmpty(this->index_ranges[c])) { this->array[c] = new RLEVector(array_file); } + } + + this->end_points = new DeltaVector(array_file); + this->number_of_sequences = this->end_points->getNumberOfItems(); + + array_file.read((char*)&(this->sample_rate), sizeof(this->sample_rate)); + array_file.close(); + + this->support_locate = parameters.get(SUPPORT_LOCATE); + this->support_display = parameters.get(SUPPORT_DISPLAY); + + if(this->support_locate || this->support_display) + { + std::string sa_sample_name = base_name + SA_SAMPLES_EXTENSION; + std::ifstream sa_sample_file(sa_sample_name.c_str(), std::ios_base::binary); + if(!sa_sample_file) + { + std::cerr << "RLCSA: Error opening suffix array sample file!" << std::endl; + return; + } + this->sa_samples = new SASamples(sa_sample_file, this->sample_rate); + sa_sample_file.close(); + } + + if(print) { parameters.print(); } + + this->ok = true; +} + +RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, bool multiple_sequences, bool delete_data) : + ok(false), + sa_samples(0), + sample_rate(sa_sample_rate), + end_points(0) +{ + for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; } + + if(!data || bytes == 0) + { + std::cerr << "RLCSA: No input data given!" << std::endl; + if(delete_data) { delete[] data; } + return; + } + if(block_size < 2 * sizeof(usint) || block_size % sizeof(usint) != 0) + { + std::cerr << "RLCSA: Block size must be a multiple of " << sizeof(usint) << " bytes!" << std::endl; + if(delete_data) { delete[] data; } + return; + } + + + // Do we store SA samples? + if(this->sample_rate > 0) + { + this->support_locate = this->support_display = true; + } + else + { + this->support_locate = this->support_display = false; + this->sample_rate = 1; + } + + + // Determine the number of sequences and mark their end points. + if(multiple_sequences) + { + DeltaEncoder endings(RLCSA::ENDPOINT_BLOCK_SIZE); + this->number_of_sequences = 0; + usint marker = 0; + usint padding = 0, chars_encountered = 0; + + for(usint i = 0; i < bytes; i++) + { + if(data[i] == 0) + { + if(i == marker) { break; } // Empty sequence. + this->number_of_sequences++; + marker = i + 1; + + usint pos = chars_encountered + padding - 1; + endings.setBit(pos); + padding = ((pos + this->sample_rate) / this->sample_rate) * this->sample_rate - chars_encountered; + } + else + { + chars_encountered++; + } + } + + if(this->number_of_sequences == 0 || marker != bytes) + { + std::cerr << "RLCSA: Collection must consist of 0-terminated nonempty sequences!" << std::endl; + if(delete_data) { delete[] data; } + return; + } + this->end_points = new DeltaVector(endings, chars_encountered + padding); + } + else + { + this->number_of_sequences = 1; + DeltaEncoder endings(RLCSA::ENDPOINT_BLOCK_SIZE, RLCSA::ENDPOINT_BLOCK_SIZE); + usint pos = bytes - 1; + endings.setBit(pos); + usint padding = ((pos + this->sample_rate) / this->sample_rate) * this->sample_rate - bytes; + this->end_points = new DeltaVector(endings, bytes + padding); + } + + + // Build character tables etc. + usint distribution[CHARS]; + sint low = CHARS, high = 0; + for(usint c = 0; c < CHARS; c++) { distribution[c] = 0; } + for(usint i = 0; i < bytes; i++) + { + if(data[i] < low) { low = data[i]; } + if(data[i] > high) { high = data[i]; } + distribution[(usint)data[i]]++; + } + if(multiple_sequences) + { + distribution[0] = 0; + low = 0; + high = high + this->number_of_sequences; + } + this->buildCharIndexes(distribution); + + + // Build suffix array. + sint* inverse = new sint[bytes + 1]; + if(multiple_sequences) + { + sint zeros = 0; + for(usint i = 0; i < bytes; i++) + { + if(data[i] == 0) + { + inverse[i] = zeros; + zeros++; + } + else + { + inverse[i] = (sint)data[i] + this->number_of_sequences; + } + } + } + else + { + for(usint i = 0; i < bytes; i++) { inverse[i] = (sint)data[i]; } + } + if(delete_data) { delete[] data; } + sint* sa = new sint[bytes + 1]; + suffixsort(inverse, sa, bytes, high + 1, low); + + + // Sample SA. + usint incr = (multiple_sequences ? this->number_of_sequences + 1 : 1); // No e_of_s markers in SA. + if(this->support_locate || this->support_display) + { + if(multiple_sequences) + { + std::cout << "We shouldn't be here!" << std::endl; + // Real SA starts at sa + incr. + // for each sequence + // find starting position + // sample relative to starting position + // sort samples to SA order + } + else + { + this->sa_samples = new SASamples((usint*)(sa + incr), this->data_size, this->sample_rate); + } + } + + + // Build Psi. + for(usint i = 0; i <= bytes; i++) + { + sa[i] = inverse[(sa[i] + 1) % (bytes + 1)]; + } + delete[] inverse; + + + // Build RLCSA. + usint decr = (multiple_sequences ? 1 : 0); // Ignore the last e_of_s marker if multiple sequences. + for(usint c = 0; c < CHARS; c++) + { + if(distribution[c] == 0) { this->array[c] = 0; continue; } + + usint* curr = (usint*)(sa + index_ranges[c].first + incr); + usint* limit = (usint*)(sa + index_ranges[c].second + incr + 1); + RLEEncoder encoder(block_size); + pair_type run(*curr, 1); + curr++; + + for(; curr < limit; curr++) + { + if(*curr == run.first + run.second) { run.second++; } + else + { + encoder.setRun(run.first - decr, run.second); + run = pair_type(*curr, 1); + } + } + encoder.setRun(run.first - decr, run.second); + + this->array[c] = new RLEVector(encoder, this->data_size + incr - decr); + } + delete[] sa; + + + this->ok = true; +} + +RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size) : + ok(false), + sa_samples(0), + end_points(0) +{ + for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; } + + if(!index.isOk() || !increment.isOk()) + { + return; // Fail silently. Actual error has already been reported. + } + if(positions == 0) + { + std::cerr << "RLCSA: Positions for insertions not available!" << std::endl; + return; + } + if(index.sample_rate != increment.sample_rate) + { + std::cerr << "RLCSA: Cannot combine indexes with different sample rates!" << std::endl; + std::cout << "Index: " << index.sample_rate << ", increment: " << increment.sample_rate << std::endl; + return; + } + + if(index.sa_samples == 0 || increment.sa_samples == 0) + { + this->support_locate = this->support_display = false; + } + else + { + this->support_locate = this->support_display = true; + } + + + // Build character tables etc. + usint distribution[CHARS]; + for(usint c = 0; c < CHARS; c++) + { + distribution[c] = length(index.index_ranges[c]) + length(increment.index_ranges[c]); + } + this->buildCharIndexes(distribution); + this->sample_rate = index.sample_rate; + + + // Combine end points of sequences. + this->number_of_sequences = index.number_of_sequences + increment.number_of_sequences; + DeltaEncoder* endings = new DeltaEncoder(RLCSA::ENDPOINT_BLOCK_SIZE); + + endings->setBit(index.end_points->select(0)); + for(usint i = 1; i < index.number_of_sequences; i++) + { + endings->setBit(index.end_points->selectNext()); + } + usint sum = index.end_points->getSize(); + delete index.end_points; index.end_points = 0; + + endings->setBit(sum + increment.end_points->select(0)); + for(usint i = 1; i < increment.number_of_sequences; i++) + { + endings->setBit(sum + increment.end_points->selectNext()); + } + sum += increment.end_points->getSize(); + delete increment.end_points; increment.end_points = 0; + + this->end_points = new DeltaVector(*endings, sum); + delete endings; + + + // Combine Psi. + usint psi_size = this->data_size + this->number_of_sequences; + for(usint c = 0; c < CHARS; c++) + { + if(distribution[c] == 0) { this->array[c] = 0; continue; } + this->array[c] = mergeVectors(index.array[c], increment.array[c], positions, increment.data_size + increment.number_of_sequences, psi_size, block_size); + index.array[c] = 0; + increment.array[c] = 0; + + if(this->array[c] == 0) + { + std::cerr << "RLCSA: Merge failed for vectors " << c << "!" << std::endl; + return; + } + } + + + // Combine suffix array samples. + if(this->support_locate || this->support_display) + { + positions += increment.number_of_sequences; + for(usint i = 0; i < increment.data_size; i++) + { + positions[i] -= this->number_of_sequences; + } + this->sa_samples = new SASamples(*(index.sa_samples), *(increment.sa_samples), positions, increment.data_size); + } + + this->ok = true; +} + +RLCSA::~RLCSA() +{ + for(usint c = 0; c < CHARS; c++) { delete this->array[c]; } + delete this->sa_samples; + delete this->end_points; +} + +void +RLCSA::writeTo(const std::string& base_name) +{ + std::string array_name = base_name + ARRAY_EXTENSION; + std::ofstream array_file(array_name.c_str(), std::ios_base::binary); + if(!array_file) + { + std::cerr << "RLCSA: Error creating Psi array file!" << std::endl; + return; + } + + usint distribution[CHARS]; + for(usint c = 0; c < CHARS; c++) + { + distribution[c] = length(this->index_ranges[c]); + } + array_file.write((char*)distribution, CHARS * sizeof(usint)); + for(usint c = 0; c < CHARS; c++) + { + if(this->array[c] != 0) + { + this->array[c]->writeTo(array_file); + } + } + + this->end_points->writeTo(array_file); + array_file.write((char*)&(this->sample_rate), sizeof(this->sample_rate)); + array_file.close(); + + if(this->support_locate || this->support_display) + { + std::string sa_sample_name = base_name + SA_SAMPLES_EXTENSION; + std::ofstream sa_sample_file(sa_sample_name.c_str(), std::ios_base::binary); + if(!sa_sample_file) + { + std::cerr << "RLCSA: Error creating suffix array sample file!" << std::endl; + return; + } + + this->sa_samples->writeTo(sa_sample_file); + sa_sample_file.close(); + } +} + +bool +RLCSA::isOk() +{ + return this->ok; +} + +//-------------------------------------------------------------------------- + +pair_type +RLCSA::count(const std::string& pattern) +{ + if(pattern.length() == 0) { return pair_type(0, this->data_size - 1); } + + pair_type index_range = this->index_ranges[(usint)*(pattern.rbegin())]; + if(isEmpty(index_range)) { return index_range; } + index_range.first += this->number_of_sequences; + index_range.second += this->number_of_sequences; + + for(std::string::const_reverse_iterator iter = ++pattern.rbegin(); iter != pattern.rend(); iter++) + { + RLEVector* vector = this->array[(usint)*iter]; + usint start = this->index_ranges[(usint)*iter].first; + + index_range.first = start + vector->rank(index_range.first, true) - 1 + this->number_of_sequences; + index_range.second = start + vector->rank(index_range.second) - 1 + this->number_of_sequences; + + if(isEmpty(index_range)) { return index_range; } + } + + // Suffix array indexes are 0-based. + index_range.first -= this->number_of_sequences; + index_range.second -= this->number_of_sequences; + + return index_range; +} + +void +RLCSA::reportPositions(uchar* data, usint length, usint* positions) +{ + if(data == 0 || length == 0 || positions == 0) { return; } + + usint current = this->number_of_sequences - 1; + positions[length] = current; // "immediately after current" + for(sint i = (sint)(length - 1); i >= 0; i--) + { +// positions[i] = current; // "immediately after current" + usint c = (usint)data[i]; + if(array[c]) + { + current = this->index_ranges[c].first + this->array[c]->rank(current) - 1 + this->number_of_sequences; + } + else + { + if(c < this->text_chars[0]) // No previous characters either. + { + current = this->number_of_sequences - 1; + } + else + { + current = this->index_ranges[c].first - 1 + this->number_of_sequences; + } + } + positions[i] = current; // "immediately after current" + } +} + +//-------------------------------------------------------------------------- + +LocateItem* +RLCSA::locate(pair_type range) +{ + if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size) { return 0; } + + LocateItem* data = new LocateItem[range.second + 1 - range.first]; + if(!data) { return 0; } + this->locateUnsafe(range, data); + + return data; +} + +LocateItem* +RLCSA::locate(pair_type range, LocateItem* data) +{ + if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size || data == 0) { return 0; } + this->locateUnsafe(range, data); + return data; +} + +void +RLCSA::locateUnsafe(pair_type range, LocateItem* data) +{ + usint items = range.second + 1 - range.first; + for(usint i = 0, j = range.first; i < items; i++, j++) + { + data[i].value = j + this->number_of_sequences; + data[i].offset = 0; + data[i].found = false; + } + + bool found = false; + while(!found) + { + found = true; + pair_type run = EMPTY_PAIR; + for(usint i = 0; i < items; i++) + { + if(data[i].found) + { + continue; // The run might continue after this. + } + else if(isEmpty(run)) + { + run = pair_type(i, i); + } + else if(data[i].value - data[run.first].value == i - run.first) + { + run.second = i; + } + else + { + found &= this->processRun(run, data); + run = pair_type(i, i); + } + } + if(!isEmpty(run)) { found &= this->processRun(run, data); } + } +} + +bool +RLCSA::processRun(pair_type run, LocateItem* data) +{ + bool found = true; + usint run_start = 0, run_left = 0; + pair_type next_sample = pair_type(0, 0); + + for(usint i = run.first; i <= run.second; i++) + { + if(data[i].found) + { + if(run_left > 0) { run_left--; } + continue; + } + if(data[i].value < this->number_of_sequences) // Implicit sample here. + { + data[i].value = this->end_points->select(data[i].value) + 1 - data[i].offset; + data[i].found = true; + if(run_left > 0) { run_left--; } + continue; + } + if(next_sample.first < data[i].value) // Need another sample. + { + next_sample = this->sa_samples->getFirstSampleAfter(data[i].value - this->number_of_sequences); + next_sample.first += this->number_of_sequences; + } + if(data[i].value < next_sample.first) // No sample found for current position. + { + if(run_left > 0) + { + data[i].value = data[run_start].value + i - run_start; + run_left--; + } + else + { + pair_type value = this->psi(data[i].value - this->number_of_sequences, run.second - i); + data[i].value = value.first; + run_left = value.second; + run_start = i; + } + data[i].offset++; + found = false; + } + else // Sampled position found. + { + data[i].value = this->sa_samples->getSample(next_sample.second) - data[i].offset; + data[i].found = true; + if(run_left > 0) { run_left--; } + } + } + return found; +} + +usint +RLCSA::locate(usint index) +{ + if(!(this->support_locate) || index >= this->data_size) { return this->data_size; } + + usint offset = 0; + index += this->number_of_sequences; + while(true) + { + if(index < this->number_of_sequences) // Implicit sample here + { + return this->end_points->select(index) + 1 - offset; + } + pair_type next_sample = this->sa_samples->getFirstSampleAfter(index - this->number_of_sequences); + next_sample.first += this->number_of_sequences; + if(next_sample.first == index) + { + return this->sa_samples->getSample(next_sample.second) - offset; + } + index = this->psi(index - this->number_of_sequences); + offset++; + } +} + +//-------------------------------------------------------------------------- + +uchar* +RLCSA::display(usint sequence, pair_type range) +{ + if(!(this->support_display) || isEmpty(range)) { return 0; } + + pair_type seq_range = this->getSequence(sequence); + if(isEmpty(seq_range)) { return 0; } + + range.first += seq_range.first; range.second += seq_range.first; + if(range.second > seq_range.second) { return 0; } + + uchar* data = new uchar[range.second + 1 - range.first]; + if(!data) { return 0; } + this->displayUnsafe(range, data); + + return data; +} + +uchar* +RLCSA::display(usint sequence, pair_type range, uchar* data) +{ + if(!(this->support_display) || isEmpty(range) || data == 0) { return 0; } + + pair_type seq_range = this->getSequence(sequence); + if(isEmpty(seq_range)) { return 0; } + + range.first += seq_range.first; range.second += seq_range.first; + if(range.second > seq_range.second) { return 0; } + + this->displayUnsafe(range, data); + return data; +} + +void +RLCSA::displayUnsafe(pair_type range, uchar* data) +{ + usint i = range.first - range.first % this->sa_samples->getSampleRate(); + + usint pos = this->sa_samples->inverseSA(i); + + for(; i < range.first; i++) + { + pos = this->psi(pos) - this->number_of_sequences; + } + for(; i <= range.second; i++) + { + data[i - range.first] = this->getCharacter(pos); + pos = this->psi(pos) - this->number_of_sequences; + } +} + +//-------------------------------------------------------------------------- + +void +RLCSA::decompressInto(std::ofstream& psi_file) +{ + for(usint c = 0; c < CHARS; c++) + { + if(!(this->array[c])) { continue; } + usint value = this->array[c]->select(0); + psi_file.write((char*)&(value), sizeof(value)); + while(this->array[c]->hasNext()) + { + value = this->array[c]->selectNext(); + psi_file.write((char*)&value, sizeof(value)); + } + } +} + +uchar* +RLCSA::readBWT() +{ + usint n = this->data_size + this->number_of_sequences; + + uchar* bwt = new uchar[n]; + memset(bwt, 0, n); + + for(usint c = 0; c < CHARS; c++) + { + RLEVector* vector = this->array[c]; + if(vector != 0) + { + bwt[vector->select(0)] = c; + while(vector->hasNext()) { bwt[vector->selectNext()] = c; } + } + } + + return bwt; +} + +//-------------------------------------------------------------------------- + +usint +RLCSA::psi(usint index) +{ + if(index >= this->data_size) + { + return this->data_size + this->number_of_sequences; + } + + usint c = this->getCharacter(index); + return this->array[c]->select(index - this->index_ranges[c].first); +} + +pair_type +RLCSA::psi(usint index, usint max_length) +{ + if(index >= this->data_size) + { + return pair_type(this->data_size + this->number_of_sequences, 0); + } + + usint c = this->getCharacter(index); + return this->array[c]->selectRun(index - this->index_ranges[c].first, max_length); +} + +//-------------------------------------------------------------------------- + +pair_type +RLCSA::getSequence(usint number) +{ + if(number >= this->number_of_sequences) { return EMPTY_PAIR; } + + pair_type result; + if(number == 0) + { + result.first = 0; + result.second = this->end_points->select(number); + } + else + { + if(this->sa_samples == 0) { return EMPTY_PAIR; } + usint d = this->sa_samples->getSampleRate(); + result.first = d * ((this->end_points->select(number - 1) / d) + 1); + result.second = this->end_points->selectNext(); + } + + return result; +} + +usint +RLCSA::reportSize(bool print) +{ + usint bytes = 0, temp = 0; + + for(usint c = 0; c < CHARS; c++) + { + if(this->array[c]) { temp += this->array[c]->reportSize(); } + } + if(print) { std::cout << "Psi array: " << temp << std::endl; } + bytes += temp; + + if(this->support_locate || this->support_display) + { + temp = this->sa_samples->reportSize(); + if(print) { std::cout << "SA samples: " << temp << std::endl; } + bytes += temp; + } + + temp = sizeof(*this) + this->end_points->reportSize(); + if(print) { std::cout << "RLCSA overhead: " << temp << std::endl; } + bytes += temp; + + if(print) + { + std::cout << "Total size: " << bytes << std::endl; + std::cout << std::endl; + } + + return bytes; +} + +//-------------------------------------------------------------------------- + +void +RLCSA::buildCharIndexes(usint* distribution) +{ + this->data_size = buildRanges(distribution, this->index_ranges); + + usint i = 0, c = 0; + for(; c < CHARS; c++) + { + if(!isEmpty(this->index_ranges[c])) + { + this->text_chars[i] = c; + i++; + } + } + this->chars = i; + + this->index_rate = std::max((this->data_size + CHARS - 1) / CHARS, (usint)1); + usint current = 0; + + for(c = 0, i = 0; c < this->chars; c++) + { + pair_type range = this->index_ranges[this->text_chars[c]]; + while(current <= range.second) + { + this->index_pointers[i] = c; + current += this->index_rate; + i++; + } + } +} + +usint +buildRanges(usint* distribution, pair_type* index_ranges) +{ + if(distribution == 0 || index_ranges == 0) { return 0; } + + usint sum = 0; + for(usint c = 0; c < CHARS; c++) + { + if(distribution[c] == 0) + { + if(sum == 0) { index_ranges[c] = EMPTY_PAIR; } + else { index_ranges[c] = pair_type(sum, sum - 1); } + } + else + { + index_ranges[c].first = sum; + sum += distribution[c]; + index_ranges[c].second = sum - 1; + } + } + + return sum; +} + + +} // namespace CSA