5 #include <cstring> // defines std::memset, added by Kim
7 #include "bits/vectors.h"
9 #include "misc/parameters.h"
16 const std::string SA_EXTENSION = ".sa";
17 const std::string PSI_EXTENSION = ".psi";
18 const std::string ARRAY_EXTENSION = ".rlcsa.array";
19 const std::string SA_SAMPLES_EXTENSION = ".rlcsa.sa_samples";
20 const std::string PARAMETERS_EXTENSION = ".rlcsa.parameters";
23 const parameter_type RLCSA_BLOCK_SIZE = parameter_type("RLCSA_BLOCK_SIZE", 32);
24 const parameter_type SAMPLE_RATE = parameter_type("SAMPLE_RATE", 512);
25 const parameter_type SUPPORT_LOCATE = parameter_type("SUPPORT_LOCATE", 0);
26 const parameter_type SUPPORT_DISPLAY = parameter_type("SUPPORT_DISPLAY", 0);
41 const static usint ENDPOINT_BLOCK_SIZE = 16;
43 RLCSA(const std::string& base_name, bool print = false);
46 If multiple_sequences is true, each 0 is treated as a end of sequence marker.
47 There must be nonzero characters between the 0s. data[bytes - 1] must also be 0.
49 RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, bool multiple_sequences, bool delete_data);
51 // Destroys contents of index and increment.
52 RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size);
55 void writeTo(const std::string& base_name);
59 // Returns the closed interval of suffix array containing the matches.
60 pair_type count(const std::string& pattern);
62 void reportPositions(uchar* data, usint length, usint* positions);
64 // Returns SA[range]. User must free the buffer. Latter version uses buffer provided by the user.
65 LocateItem* locate(pair_type range);
66 LocateItem* locate(pair_type range, LocateItem* data);
69 usint locate(usint index);
71 // Returns Ti[range]. User must free the buffer. Latter version uses buffer provided by the user.
72 uchar* display(usint sequence, pair_type range);
73 uchar* display(usint sequence, pair_type range, uchar* data);
75 // Writes the Psi array into given file. End of sequence markers are not written.
76 void decompressInto(std::ofstream& psi_file);
78 // Returns the BWT of the collection including end of sequence markers.
81 // Return value includes the implicit end of sequence markers. To get suffix array indexes,
82 // subtract getNumberOfSequences() from the value.
83 usint psi(usint index);
84 pair_type psi(usint index, usint max_length); // This version returns a run.
86 inline bool supportsLocate() { return this->support_locate; }
87 inline bool supportsDisplay() { return this->support_display; }
88 inline usint getSize() { return this->data_size; }
89 inline usint getNumberOfSequences() { return this->number_of_sequences; }
91 pair_type getSequence(usint number);
93 // Returns the size of the data structure.
94 usint reportSize(bool print = false);
99 RLEVector* array[CHARS];
100 SASamples* sa_samples;
102 pair_type index_ranges[CHARS];
105 usint text_chars[CHARS]; // which characters are present in the text
108 usint index_pointers[CHARS]; // which of the above is at i * index_rate
111 bool support_locate, support_display;
114 usint number_of_sequences;
115 DeltaVector* end_points;
117 void locateUnsafe(pair_type range, LocateItem* data);
118 bool processRun(pair_type run, LocateItem* data);
119 void displayUnsafe(pair_type range, uchar* data);
121 inline usint getCharacter(usint pos)
123 usint* curr = &(this->text_chars[this->index_pointers[pos / this->index_rate]]);
124 while(pos > this->index_ranges[*curr].second) { curr++; }
128 void buildCharIndexes(usint* distribution);
132 // Returns the total number of characters.
133 usint buildRanges(usint* distribution, pair_type* index_ranges);