7 #include "bits/deltavector.h"
8 #include "bits/rlevector.h"
9 #include "bits/nibblevector.h"
11 #include "sasamples.h"
12 #include "lcpsamples.h"
13 #include "misc/parameters.h"
20 const std::string SA_EXTENSION = ".sa";
21 const std::string PSI_EXTENSION = ".psi";
22 const std::string ARRAY_EXTENSION = ".rlcsa.array";
23 const std::string SA_SAMPLES_EXTENSION = ".rlcsa.sa_samples";
24 const std::string PARAMETERS_EXTENSION = ".rlcsa.parameters";
25 const std::string LCP_SAMPLES_EXTENSION = ".lcp_samples";
26 const std::string PLCP_EXTENSION = ".plcp";
29 const parameter_type RLCSA_BLOCK_SIZE = parameter_type("RLCSA_BLOCK_SIZE", 32);
30 const parameter_type SAMPLE_RATE = parameter_type("SAMPLE_RATE", 512);
31 const parameter_type SUPPORT_LOCATE = parameter_type("SUPPORT_LOCATE", 1);
32 const parameter_type SUPPORT_DISPLAY = parameter_type("SUPPORT_DISPLAY", 1);
35 #ifdef USE_NIBBLE_VECTORS
36 typedef NibbleVector PsiVector;
38 typedef RLEVector PsiVector;
46 //--------------------------------------------------------------------------
48 //--------------------------------------------------------------------------
50 const static usint ENDPOINT_BLOCK_SIZE = 16;
52 RLCSA(const std::string& base_name, bool print = false);
55 If multiple_sequences is true, each 0 is treated as a end of sequence marker.
56 There must be nonzero characters between the 0s. data[bytes - 1] must also be 0.
57 FIXME Crashes if bytes >= 2 GB.
59 RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, bool multiple_sequences, bool delete_data);
61 // Destroys contents of index and increment.
62 // threads has no effect unless MULTITHREAD_SUPPORT is defined
63 RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size, usint threads = 1);
66 void writeTo(const std::string& base_name) const;
70 //--------------------------------------------------------------------------
72 //--------------------------------------------------------------------------
74 // Returns the closed interval of suffix array containing the matches.
75 pair_type count(const std::string& pattern) const;
77 // Used when merging CSAs.
78 void reportPositions(uchar* data, usint length, usint* positions) const;
80 // Returns SA[range]. User must free the buffer. Latter version uses buffer provided by the user.
81 usint* locate(pair_type range) const;
82 usint* locate(pair_type range, usint* data) const;
85 usint locate(usint index) const;
87 // Returns T^{sequence}[range]. User must free the buffer.
88 // Third version uses buffer provided by the user.
89 uchar* display(usint sequence) const;
90 uchar* display(usint sequence, pair_type range) const;
91 uchar* display(usint sequence, pair_type range, uchar* data) const;
93 // Displays the intersection of T[position - context, position + len + context - 1]
94 // and T^{getSequenceForPosition(position)}.
95 // This is intended for displaying an occurrence of a pattern of length 'len'
96 // with 'context' extra characters on both sides.
97 // The actual length of the returned string is written into result_length.
98 uchar* display(usint position, usint len, usint context, usint& result_length) const;
100 // Returns the actual length of the prefix. User must provide the buffer.
101 usint displayPrefix(usint sequence, usint len, uchar* data) const;
103 // Returns at most max_len characters starting from T[SA[index]].
104 // User must provide the buffer. Returns the number of characters in buffer.
105 usint displayFromPosition(usint index, usint max_len, uchar* data) const;
107 // Get the range of SA values for the sequence identified by
108 // a sequence number or a SA value.
109 pair_type getSequenceRange(usint number) const;
110 pair_type getSequenceRangeForPosition(usint value) const;
112 // Get the sequence number for given SA value(s).
113 // The returned array is the same as the parameter.
114 usint getSequenceForPosition(usint value) const;
115 usint* getSequenceForPosition(usint* value, usint length) const;
117 // Changes SA value to (sequence, offset).
118 pair_type getRelativePosition(usint value) const;
120 // Returns the BWT of the collection including end of sequence markers.
121 uchar* readBWT() const;
122 uchar* readBWT(pair_type range) const;
124 // Returns the number of equal letter runs in the BWT. Runs consisting of end markers are ignored.
125 usint countRuns() const;
127 //--------------------------------------------------------------------------
129 //--------------------------------------------------------------------------
131 // The return values of these functions include the implicit end markers.
132 // To get SA indexes, subtract getNumberOfSequences() from the value.
134 inline usint psi(usint index) const
136 if(index >= this->data_size)
138 return this->data_size + this->number_of_sequences;
141 usint c = this->getCharacter(index);
142 return this->psiUnsafe(index, c);
145 // This version returns a run.
146 inline pair_type psi(usint index, usint max_length) const
148 if(index >= this->data_size)
150 return pair_type(this->data_size + this->number_of_sequences, 0);
153 usint c = this->getCharacter(index);
154 PsiVector::Iterator iter(*(this->array[c]));
155 return iter.selectRun(index - this->index_ranges[c].first, max_length);
158 inline usint C(usint c) const
161 return this->data_size + this->number_of_sequences;
162 return this->index_ranges[c].first + this->number_of_sequences - 1;
165 inline usint LF(usint index, usint c) const
169 return this->data_size + this->number_of_sequences;
171 if(this->array[c] == 0)
173 if(c < this->text_chars[0]) { return this->number_of_sequences - 1; }
174 return this->index_ranges[c].first + this->number_of_sequences - 1;
176 index += this->number_of_sequences;
178 PsiVector::Iterator iter(*(this->array[c]));
179 return this->LF(index, c, iter);
182 //--------------------------------------------------------------------------
184 //--------------------------------------------------------------------------
186 inline bool supportsLocate() const { return this->support_locate; }
187 inline bool supportsDisplay() const { return this->support_display; }
188 inline usint getSize() const { return this->data_size; }
189 inline usint getNumberOfSequences() const { return this->number_of_sequences; }
190 inline usint getBlockSize() const { return this->array[this->text_chars[0]]->getBlockSize(); }
192 // Returns the size of the data structure.
193 usint reportSize(bool print = false) const;
195 void printInfo() const;
197 //--------------------------------------------------------------------------
199 //--------------------------------------------------------------------------
201 // Optimized version:
202 // - Interleaves main loop with computing irreducible values.
203 // - Encodes maximal runs from a true local maximum to a true local minimum.
204 RLEVector* buildPLCP(usint block_size) const;
206 // Returns the number of samples. sampled_values will be a pointer to the samples.
207 usint sampleLCP(usint sample_rate, pair_type*& sampled_values, bool report = false) const;
209 usint lcp(usint index, const LCPSamples& lcp_samples) const;
210 usint lcp(usint index, const LCPSamples& lcp_samples, const RLEVector& plcp) const;
212 // Calculate LCP[index] directly.
213 usint lcpDirect(usint index) const;
215 // Writes PLCP[start] to PLCP[stop - 1].
216 inline void encodePLCPRun(RLEEncoder& plcp, usint start, usint stop, usint first_val) const
218 plcp.setRun(2 * start + first_val, stop - start);
219 // std::cerr << "(" << start << ", " << stop << ", " << first_val << ")" << std::endl;
222 //--------------------------------------------------------------------------
223 // INTERNAL VARIABLES
224 //--------------------------------------------------------------------------
229 PsiVector* array[CHARS];
230 SASamples* sa_samples;
232 pair_type index_ranges[CHARS];
235 usint text_chars[CHARS]; // which characters are present in the text
238 usint index_pointers[CHARS]; // which of the above is at i * index_rate
241 bool support_locate, support_display;
244 usint number_of_sequences;
245 DeltaVector* end_points;
247 //--------------------------------------------------------------------------
248 // INTERNAL VERSIONS OF QUERIES
249 //--------------------------------------------------------------------------
251 void locateUnsafe(pair_type range, usint* data) const;
252 bool processRun(pair_type run, usint* data, usint* offsets, bool* finished, PsiVector::Iterator** iters) const;
253 void displayUnsafe(pair_type range, uchar* data) const;
255 //--------------------------------------------------------------------------
256 // INTERNAL VERSIONS OF BASIC OPERATIONS
257 //--------------------------------------------------------------------------
259 inline usint psi(usint index, PsiVector::Iterator** iters) const
261 usint c = this->getCharacter(index);
262 return iters[c]->select(index - this->index_ranges[c].first);
265 inline pair_type psi(usint index, usint max_length, PsiVector::Iterator** iters) const
267 usint c = this->getCharacter(index);
268 return iters[c]->selectRun(index - this->index_ranges[c].first, max_length);
271 // Returns psi(index), assuming the suffix of rank index begins with c.
272 inline usint psiUnsafe(usint index, usint c) const
274 PsiVector::Iterator iter(*(this->array[c]));
275 return this->psiUnsafe(index, c, iter);
278 // As above, but with a given iterator.
279 inline usint psiUnsafe(usint index, usint c, PsiVector::Iterator& iter) const
281 return iter.select(index - this->index_ranges[c].first);
284 // As above, but returns the next value of psi.
285 inline usint psiUnsafeNext(usint c, PsiVector::Iterator& iter) const
287 return iter.selectNext();
290 inline pair_type backwardSearchStep(pair_type range, usint c) const
292 if(this->array[c] == 0) { return EMPTY_PAIR; }
293 PsiVector::Iterator iter(*(this->array[c]));
295 usint start = this->index_ranges[c].first + this->number_of_sequences - 1;
296 range.first = start + iter.rank(range.first, true);
297 range.second = start + iter.rank(range.second);
302 inline usint LF(usint index, usint c, PsiVector::Iterator& iter) const
304 return this->index_ranges[c].first + this->number_of_sequences + iter.rank(index) - 1;
307 //--------------------------------------------------------------------------
309 //--------------------------------------------------------------------------
311 // Creates an array of iterators for every vector in this->array.
312 PsiVector::Iterator** getIterators() const;
313 void deleteIterators(PsiVector::Iterator** iters) const;
315 inline usint getCharacter(usint pos) const
317 const usint* curr = &(this->text_chars[this->index_pointers[pos / this->index_rate]]);
318 while(pos > this->index_ranges[*curr].second) { curr++; }
322 void mergeEndPoints(RLCSA& index, RLCSA& increment);
323 void mergeSamples(RLCSA& index, RLCSA& increment, usint* positions);
325 void buildCharIndexes(usint* distribution);
327 // These are not allowed.
330 RLCSA& operator = (const RLCSA&);
334 // Returns the total number of characters.
335 usint buildRanges(usint* distribution, pair_type* index_ranges);