1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
4 * FMIndex implementation for the TextCollection interface *
6 * This program is free software; you can redistribute it and/or modify *
7 * it under the terms of the GNU Lesser General Public License as published *
8 * by the Free Software Foundation; either version 2 of the License, or *
9 * (at your option) any later version. *
11 * This program is distributed in the hope that it will be useful, *
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 * GNU Lesser General Public License for more details. *
16 * You should have received a copy of the GNU Lesser General Public License *
17 * along with this program; if not, write to the *
18 * Free Software Foundation, Inc., *
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
20 *****************************************************************************/
25 #include "incbwt/bits/deltavector.h"
28 #include "TextCollection.h"
29 #include "BlockArray.h"
31 // Include from XMLTree/libcds
32 #include <basics.h> // Defines W == 32
33 #include <static_bitsequence.h>
34 #include <alphabet_mapper.h>
35 #include <static_sequence.h>
36 #include <static_sequence_wvtree_noptrs.h>
38 // Re-define word size to ulong:
49 #include "TextStorage.h"
59 * Implementation of the TextCollection interface
62 class FMIndex : public SXSI::TextCollection {
64 FMIndex(uchar *, ulong, unsigned, unsigned, ulong, ulong,
65 CSA::DeltaVector &, const std::string &, char);
68 bool EmptyText(DocId) const;
71 * Extracting one text.
73 * Call DeleteText() for each pointer returned by GetText()
74 * to avoid possible memory leaks.
76 uchar * GetText(DocId) const;
77 void DeleteText(uchar *text) const
78 { textStorage->DeleteText(text); }
81 * Returns a pointer to the beginning of texts i, i+1, ..., j.
82 * Texts are separated by a '\0' byte.
84 * Call DeleteText() for each pointer returned by GetText()
85 * to avoid possible memory leaks.
87 uchar * GetText(DocId i, DocId j) const
88 { return textStorage->GetText(i, j); }
91 * Returns a substring of given text ID.
93 * FIXME This may be reimplemented via TextStorage.
95 // uchar* GetText(DocId, TextPosition, TextPosition) const;
97 bool IsPrefix(uchar const *) const;
98 bool IsSuffix(uchar const *) const;
99 bool IsEqual(uchar const *) const;
100 bool IsContains(uchar const *) const;
101 bool IsLessThan(uchar const *) const;
103 bool IsPrefix(uchar const *, DocId, DocId) const;
104 bool IsSuffix(uchar const *, DocId, DocId) const;
105 bool IsEqual(uchar const *, DocId, DocId) const;
106 bool IsContains(uchar const *, DocId, DocId) const;
107 bool IsLessThan(uchar const *, DocId, DocId) const;
109 ulong Count(uchar const *) const;
110 unsigned CountPrefix(uchar const *) const;
111 unsigned CountSuffix(uchar const *) const;
112 unsigned CountEqual(uchar const *) const;
113 unsigned CountContains(uchar const *) const;
114 unsigned CountLessThan(const unsigned char*) const;
116 unsigned CountPrefix(uchar const *, DocId, DocId) const;
117 unsigned CountSuffix(uchar const *, DocId, DocId) const;
118 unsigned CountEqual(uchar const *, DocId, DocId) const;
119 unsigned CountContains(uchar const *, DocId, DocId) const;
120 unsigned CountLessThan(uchar const *, DocId, DocId) const;
122 // Definition of document_result is inherited from SXSI::TextCollection.
123 document_result Prefix(uchar const *) const;
124 document_result Suffix(uchar const *) const;
125 document_result Equal(uchar const *) const;
126 document_result Contains(uchar const *) const;
127 document_result LessThan(uchar const *) const;
128 document_result KMismaches(uchar const *, unsigned) const;
129 document_result KErrors(uchar const *, unsigned) const;
131 document_result Prefix(uchar const *, DocId, DocId) const;
132 document_result Suffix(uchar const *, DocId, DocId) const;
133 document_result Equal(uchar const *, DocId, DocId) const;
134 document_result Contains(uchar const *, DocId, DocId) const;
135 document_result LessThan(uchar const *, DocId, DocId) const;
137 // Definition of full_result is inherited from SXSI::TextCollection.
138 full_result FullContains(uchar const *) const;
139 full_result FullContains(uchar const *, DocId, DocId) const;
140 full_result FullKMismatches(uchar const *, unsigned) const;
141 full_result FullKErrors(uchar const *, unsigned) const;
143 // Index from/to disk
144 FMIndex(FILE *, index_mode_t, unsigned);
145 void Save(FILE *, char const *) const;
148 typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
150 static const uchar versionFlag;
154 TextPosition bwtEndPos;
155 static_sequence * alphabetrank;
157 // Sample structures for texts longer than samplerate
158 static_bitsequence * sampled;
159 BlockArray *suffixes;
160 BlockArray *suffixDocId;
162 // Total number of texts in the collection
163 unsigned numberOfTexts;
164 // Length of the longest text
167 // Array of document id's in the order of end-markers in BWT
168 // static_sequence *Doc;
171 // Text storage for fast extraction
172 TextStorage * textStorage;
174 // Following methods are not part of the public API
175 uchar * BWT(uchar *);
176 void makewavelet(uchar *);
177 void maketables(ulong, char, CSA::DeltaVector::Iterator &, const std::string &);
178 DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
179 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
180 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
181 ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
182 ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
183 ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
184 ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const;
186 * Count end-markers in given interval
188 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
195 ranksp = alphabetrank->rank(0, sp - 1);
197 return alphabetrank->rank(0, ep) - ranksp;
201 * Count end-markers in given interval and
202 * within docIds [min,max]
204 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
207 sp = alphabetrank->rank(0, sp - 1);
208 ep = alphabetrank->rank(0, ep);
212 return Doc->count(sp, ep-1, min, max);
216 * Enumerate all end-markers in given interval
218 inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const
221 sp = alphabetrank->rank(0, sp - 1);
222 ep = alphabetrank->rank(0, ep);
224 return document_result();
226 return Doc->accessAll(sp, ep-1);
230 * Enumerate end-markers in given interval and
231 * within docIds [min,max]
233 inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
236 sp = alphabetrank->rank(0, sp - 1);
237 ep = alphabetrank->rank(0, ep);
239 return document_result();
241 return Doc->access(sp, ep-1, min, max);
245 * Enumerate documents in given interval [sp, ep]
247 inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
249 // We want unique document indentifiers, using std::set to collect them
250 // FIXME use unordered_set?
251 uint tmp_rank_c = 0; // Cache rank value of c.
252 for (; sp <= ep; ++sp)
255 uchar c = alphabetrank->access(i, tmp_rank_c);
256 while (c != '\0' && !sampled->access(i))
258 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
259 c = alphabetrank->access(i, tmp_rank_c);
263 // Rank among the end-markers in BWT
264 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
265 resultSet.insert(Doc->access(endmarkerRank));
269 DocId di = (*suffixDocId)[sampled->rank1(i)-1];
270 assert((unsigned)di < numberOfTexts);
271 resultSet.insert(di);
277 * Enumerate documents in given interval [sp, ep]
278 * and within [begin, end]
280 inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
282 // We want unique document indentifiers, using std::set to collect them
283 uint tmp_rank_c = 0; // Cache rank value of c.
284 for (; sp <= ep; ++sp)
287 uchar c = alphabetrank->access(i, tmp_rank_c);
288 while (c != '\0' && !sampled->access(i))
290 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
291 c = alphabetrank->access(i, tmp_rank_c);
295 // Rank among the end-markers in BWT
296 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
297 DocId docId = Doc->access(endmarkerRank);
298 if (docId >= begin && docId <= end)
299 resultSet.insert(docId);
303 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
304 assert((unsigned)docId < numberOfTexts);
305 if (docId >= begin && docId <= end)
306 resultSet.insert(docId);
312 * Enumerate document+position pairs (full_result) of
313 * each suffix in given interval.
315 inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
317 uint tmp_rank_c = 0; // Cache rank value of c.
318 for (; sp <= ep; ++sp)
321 TextPosition dist = 0;
322 uchar c = alphabetrank->access(i, tmp_rank_c);
323 while (c != '\0' && !sampled->access(i))
325 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
326 c = alphabetrank->access(i, tmp_rank_c);
331 // Rank among the end-markers in BWT
332 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
333 DocId docId = Doc->access(endmarkerRank);
334 result.push_back(std::make_pair(docId, dist));
338 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
339 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
341 result.push_back(std::make_pair(docId, textPos));
347 * Enumerate document+position pairs (full_result) of
348 * each suffix in given interval and within [begin, end].
350 inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
352 uint tmp_rank_c = 0; // Cache rank value of c.
353 for (; sp <= ep; ++sp)
356 TextPosition dist = 0;
357 uchar c = alphabetrank->access(i, tmp_rank_c);
358 while (c != '\0' && !sampled->access(i))
360 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
361 c = alphabetrank->access(i, tmp_rank_c);
366 // Rank among the end-markers in BWT
367 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
368 DocId docId = Doc->access(endmarkerRank);
369 if (docId >= begin && docId <= end)
370 result.push_back(std::make_pair(docId, dist));
374 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
375 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
377 if (docId >= begin && docId <= end)
378 result.push_back(std::make_pair(docId, textPos));