X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=incbwt%2Frlcsa.h;fp=incbwt%2Frlcsa.h;h=79ea062ccf1d3e979638946a2421403fe4b1d28b;hb=13e254b7c0ee22dffbc7c3125cee0408f9b375da;hp=ff2505a7925aa6fee7d4b1bbd694c0d88a75b79c;hpb=e4b6bdc7cc2a1372e4d4dae50acac55cebcc7e9b;p=SXSI%2FTextCollection.git diff --git a/incbwt/rlcsa.h b/incbwt/rlcsa.h index ff2505a..79ea062 100644 --- a/incbwt/rlcsa.h +++ b/incbwt/rlcsa.h @@ -2,10 +2,14 @@ #define RLCSA_H #include -#include // defines std::memset, added by Kim +#include + +#include "bits/deltavector.h" +#include "bits/rlevector.h" +#include "bits/nibblevector.h" -#include "bits/vectors.h" #include "sasamples.h" +#include "lcpsamples.h" #include "misc/parameters.h" @@ -18,26 +22,31 @@ const std::string PSI_EXTENSION = ".psi"; const std::string ARRAY_EXTENSION = ".rlcsa.array"; const std::string SA_SAMPLES_EXTENSION = ".rlcsa.sa_samples"; const std::string PARAMETERS_EXTENSION = ".rlcsa.parameters"; +const std::string LCP_SAMPLES_EXTENSION = ".lcp_samples"; +const std::string PLCP_EXTENSION = ".plcp"; const parameter_type RLCSA_BLOCK_SIZE = parameter_type("RLCSA_BLOCK_SIZE", 32); const parameter_type SAMPLE_RATE = parameter_type("SAMPLE_RATE", 512); -const parameter_type SUPPORT_LOCATE = parameter_type("SUPPORT_LOCATE", 0); -const parameter_type SUPPORT_DISPLAY = parameter_type("SUPPORT_DISPLAY", 0); +const parameter_type SUPPORT_LOCATE = parameter_type("SUPPORT_LOCATE", 1); +const parameter_type SUPPORT_DISPLAY = parameter_type("SUPPORT_DISPLAY", 1); -struct LocateItem -{ - usint value; - usint offset; - bool found; -}; - +#ifdef USE_NIBBLE_VECTORS +typedef NibbleVector PsiVector; +#else +typedef RLEVector PsiVector; +#endif class RLCSA { public: + +//-------------------------------------------------------------------------- +// CONSTRUCTION +//-------------------------------------------------------------------------- + const static usint ENDPOINT_BLOCK_SIZE = 16; RLCSA(const std::string& base_name, bool print = false); @@ -45,58 +54,179 @@ class RLCSA /* If multiple_sequences is true, each 0 is treated as a end of sequence marker. There must be nonzero characters between the 0s. data[bytes - 1] must also be 0. + FIXME Crashes if bytes >= 2 GB. */ RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, bool multiple_sequences, bool delete_data); // Destroys contents of index and increment. - RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size); + // threads has no effect unless MULTITHREAD_SUPPORT is defined + RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size, usint threads = 1); ~RLCSA(); - void writeTo(const std::string& base_name); + void writeTo(const std::string& base_name) const; + + bool isOk() const; - bool isOk(); +//-------------------------------------------------------------------------- +// QUERIES +//-------------------------------------------------------------------------- // Returns the closed interval of suffix array containing the matches. - pair_type count(const std::string& pattern); + pair_type count(const std::string& pattern) const; - void reportPositions(uchar* data, usint length, usint* positions); + // Used when merging CSAs. + void reportPositions(uchar* data, usint length, usint* positions) const; // Returns SA[range]. User must free the buffer. Latter version uses buffer provided by the user. - LocateItem* locate(pair_type range); - LocateItem* locate(pair_type range, LocateItem* data); + usint* locate(pair_type range) const; + usint* locate(pair_type range, usint* data) const; // Returns SA[index]. - usint locate(usint index); + usint locate(usint index) const; + + // Returns T^{sequence}[range]. User must free the buffer. + // Third version uses buffer provided by the user. + uchar* display(usint sequence) const; + uchar* display(usint sequence, pair_type range) const; + uchar* display(usint sequence, pair_type range, uchar* data) const; + + // Displays the intersection of T[position - context, position + len + context - 1] + // and T^{getSequenceForPosition(position)}. + // This is intended for displaying an occurrence of a pattern of length 'len' + // with 'context' extra characters on both sides. + // The actual length of the returned string is written into result_length. + uchar* display(usint position, usint len, usint context, usint& result_length) const; - // Returns Ti[range]. User must free the buffer. Latter version uses buffer provided by the user. - uchar* display(usint sequence, pair_type range); - uchar* display(usint sequence, pair_type range, uchar* data); + // Returns the actual length of the prefix. User must provide the buffer. + usint displayPrefix(usint sequence, usint len, uchar* data) const; - // Writes the Psi array into given file. End of sequence markers are not written. - void decompressInto(std::ofstream& psi_file); + // Returns at most max_len characters starting from T[SA[index]]. + // User must provide the buffer. Returns the number of characters in buffer. + usint displayFromPosition(usint index, usint max_len, uchar* data) const; + + // Get the range of SA values for the sequence identified by + // a sequence number or a SA value. + pair_type getSequenceRange(usint number) const; + pair_type getSequenceRangeForPosition(usint value) const; + + // Get the sequence number for given SA value(s). + // The returned array is the same as the parameter. + usint getSequenceForPosition(usint value) const; + usint* getSequenceForPosition(usint* value, usint length) const; + + // Changes SA value to (sequence, offset). + pair_type getRelativePosition(usint value) const; // Returns the BWT of the collection including end of sequence markers. - uchar* readBWT(); + uchar* readBWT() const; + uchar* readBWT(pair_type range) const; + + // Returns the number of equal letter runs in the BWT. Runs consisting of end markers are ignored. + usint countRuns() const; - // Return value includes the implicit end of sequence markers. To get suffix array indexes, - // subtract getNumberOfSequences() from the value. - usint psi(usint index); - pair_type psi(usint index, usint max_length); // This version returns a run. +//-------------------------------------------------------------------------- +// BASIC OPERATIONS +//-------------------------------------------------------------------------- - inline bool supportsLocate() { return this->support_locate; } - inline bool supportsDisplay() { return this->support_display; } - inline usint getSize() { return this->data_size; } - inline usint getNumberOfSequences() { return this->number_of_sequences; } + // The return values of these functions include the implicit end markers. + // To get SA indexes, subtract getNumberOfSequences() from the value. - pair_type getSequence(usint number); + inline usint psi(usint index) const + { + if(index >= this->data_size) + { + return this->data_size + this->number_of_sequences; + } + + usint c = this->getCharacter(index); + return this->psiUnsafe(index, c); + } + + // This version returns a run. + inline pair_type psi(usint index, usint max_length) const + { + if(index >= this->data_size) + { + return pair_type(this->data_size + this->number_of_sequences, 0); + } + + usint c = this->getCharacter(index); + PsiVector::Iterator iter(*(this->array[c])); + return iter.selectRun(index - this->index_ranges[c].first, max_length); + } + + inline usint C(usint c) const + { + if(c >= CHARS) + return this->data_size + this->number_of_sequences; + return this->index_ranges[c].first + this->number_of_sequences - 1; + } + + inline usint LF(usint index, usint c) const + { + if(c >= CHARS) + { + return this->data_size + this->number_of_sequences; + } + if(this->array[c] == 0) + { + if(c < this->text_chars[0]) { return this->number_of_sequences - 1; } + return this->index_ranges[c].first + this->number_of_sequences - 1; + } + index += this->number_of_sequences; + + PsiVector::Iterator iter(*(this->array[c])); + return this->LF(index, c, iter); + } + +//-------------------------------------------------------------------------- +// REPORTING +//-------------------------------------------------------------------------- + + inline bool supportsLocate() const { return this->support_locate; } + inline bool supportsDisplay() const { return this->support_display; } + inline usint getSize() const { return this->data_size; } + inline usint getNumberOfSequences() const { return this->number_of_sequences; } + inline usint getBlockSize() const { return this->array[this->text_chars[0]]->getBlockSize(); } // Returns the size of the data structure. - usint reportSize(bool print = false); + usint reportSize(bool print = false) const; + + void printInfo() const; + +//-------------------------------------------------------------------------- +// LCP EXPERIMENTS +//-------------------------------------------------------------------------- + + // Optimized version: + // - Interleaves main loop with computing irreducible values. + // - Encodes maximal runs from a true local maximum to a true local minimum. + RLEVector* buildPLCP(usint block_size) const; + + // Returns the number of samples. sampled_values will be a pointer to the samples. + usint sampleLCP(usint sample_rate, pair_type*& sampled_values, bool report = false) const; + + usint lcp(usint index, const LCPSamples& lcp_samples) const; + usint lcp(usint index, const LCPSamples& lcp_samples, const RLEVector& plcp) const; + + // Calculate LCP[index] directly. + usint lcpDirect(usint index) const; + + // Writes PLCP[start] to PLCP[stop - 1]. + inline void encodePLCPRun(RLEEncoder& plcp, usint start, usint stop, usint first_val) const + { + plcp.setRun(2 * start + first_val, stop - start); +// std::cerr << "(" << start << ", " << stop << ", " << first_val << ")" << std::endl; + } + +//-------------------------------------------------------------------------- +// INTERNAL VARIABLES +//-------------------------------------------------------------------------- private: bool ok; - RLEVector* array[CHARS]; + PsiVector* array[CHARS]; SASamples* sa_samples; pair_type index_ranges[CHARS]; @@ -114,18 +244,90 @@ class RLCSA usint number_of_sequences; DeltaVector* end_points; - void locateUnsafe(pair_type range, LocateItem* data); - bool processRun(pair_type run, LocateItem* data); - void displayUnsafe(pair_type range, uchar* data); +//-------------------------------------------------------------------------- +// INTERNAL VERSIONS OF QUERIES +//-------------------------------------------------------------------------- + + void locateUnsafe(pair_type range, usint* data) const; + bool processRun(pair_type run, usint* data, usint* offsets, bool* finished, PsiVector::Iterator** iters) const; + void displayUnsafe(pair_type range, uchar* data) const; + +//-------------------------------------------------------------------------- +// INTERNAL VERSIONS OF BASIC OPERATIONS +//-------------------------------------------------------------------------- + + inline usint psi(usint index, PsiVector::Iterator** iters) const + { + usint c = this->getCharacter(index); + return iters[c]->select(index - this->index_ranges[c].first); + } - inline usint getCharacter(usint pos) + inline pair_type psi(usint index, usint max_length, PsiVector::Iterator** iters) const { - usint* curr = &(this->text_chars[this->index_pointers[pos / this->index_rate]]); + usint c = this->getCharacter(index); + return iters[c]->selectRun(index - this->index_ranges[c].first, max_length); + } + + // Returns psi(index), assuming the suffix of rank index begins with c. + inline usint psiUnsafe(usint index, usint c) const + { + PsiVector::Iterator iter(*(this->array[c])); + return this->psiUnsafe(index, c, iter); + } + + // As above, but with a given iterator. + inline usint psiUnsafe(usint index, usint c, PsiVector::Iterator& iter) const + { + return iter.select(index - this->index_ranges[c].first); + } + + // As above, but returns the next value of psi. + inline usint psiUnsafeNext(usint c, PsiVector::Iterator& iter) const + { + return iter.selectNext(); + } + + inline pair_type backwardSearchStep(pair_type range, usint c) const + { + if(this->array[c] == 0) { return EMPTY_PAIR; } + PsiVector::Iterator iter(*(this->array[c])); + + usint start = this->index_ranges[c].first + this->number_of_sequences - 1; + range.first = start + iter.rank(range.first, true); + range.second = start + iter.rank(range.second); + + return range; + } + + inline usint LF(usint index, usint c, PsiVector::Iterator& iter) const + { + return this->index_ranges[c].first + this->number_of_sequences + iter.rank(index) - 1; + } + +//-------------------------------------------------------------------------- +// INTERNAL STUFF +//-------------------------------------------------------------------------- + + // Creates an array of iterators for every vector in this->array. + PsiVector::Iterator** getIterators() const; + void deleteIterators(PsiVector::Iterator** iters) const; + + inline usint getCharacter(usint pos) const + { + const usint* curr = &(this->text_chars[this->index_pointers[pos / this->index_rate]]); while(pos > this->index_ranges[*curr].second) { curr++; } return *curr; } + void mergeEndPoints(RLCSA& index, RLCSA& increment); + void mergeSamples(RLCSA& index, RLCSA& increment, usint* positions); + void buildCharIndexes(usint* distribution); + + // These are not allowed. + RLCSA(); + RLCSA(const RLCSA&); + RLCSA& operator = (const RLCSA&); };