1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #ifndef _TCImplementation_H_
22 #define _TCImplementation_H_
24 #include "TextCollection.h"
25 #include "BlockArray.h"
27 // Include from XMLTree/libcds
28 #include <basics.h> // Defines W == 32
29 #include <static_bitsequence.h>
30 #include <alphabet_mapper.h>
31 #include <static_sequence.h>
32 #include <static_sequence_wvtree_noptrs.h>
34 // Re-define word size to ulong:
43 #include "TextStorage.h"
51 * Implementation of the TextCollection interface
54 class TCImplementation : public SXSI::TextCollection {
56 TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong);
59 bool EmptyText(DocId) const;
62 * Returns a pointer to the original text.
64 * Do *not* try to free the array.
65 * (However, this implementation is suspect to change.)
67 * See TextStorage.h for details.
69 uchar * GetText(DocId) const;
72 * Returns a pointer to the beginning of texts i, i+1, ..., j.
73 * Texts are separated by a '\0' byte.
75 * Do *not* try to free the array.
76 * (However, this implementation is suspect to change.)
78 * See TextStorage.h for details.
80 uchar * GetText(DocId i, DocId j) const;
83 * Returns a substring of given text ID.
85 * FIXME This may be reimplemented via TextStorage.
87 // uchar* GetText(DocId, TextPosition, TextPosition) const;
89 bool IsPrefix(uchar const *) const;
90 bool IsSuffix(uchar const *) const;
91 bool IsEqual(uchar const *) const;
92 bool IsContains(uchar const *) const;
93 bool IsLessThan(uchar const *) const;
95 bool IsPrefix(uchar const *, DocId, DocId) const;
96 bool IsSuffix(uchar const *, DocId, DocId) const;
97 bool IsEqual(uchar const *, DocId, DocId) const;
98 bool IsContains(uchar const *, DocId, DocId) const;
99 bool IsLessThan(uchar const *, DocId, DocId) const;
101 ulong Count(uchar const *) const;
102 unsigned CountPrefix(uchar const *) const;
103 unsigned CountSuffix(uchar const *) const;
104 unsigned CountEqual(uchar const *) const;
105 unsigned CountContains(uchar const *) const;
106 unsigned CountLessThan(const unsigned char*) const;
108 unsigned CountPrefix(uchar const *, DocId, DocId) const;
109 unsigned CountSuffix(uchar const *, DocId, DocId) const;
110 unsigned CountEqual(uchar const *, DocId, DocId) const;
111 unsigned CountContains(uchar const *, DocId, DocId) const;
112 unsigned CountLessThan(uchar const *, DocId, DocId) const;
114 // Definition of document_result is inherited from SXSI::TextCollection.
115 document_result Prefix(uchar const *) const;
116 document_result Suffix(uchar const *) const;
117 document_result Equal(uchar const *) const;
118 document_result Contains(uchar const *) const;
119 document_result LessThan(uchar const *) const;
120 document_result Kmismaches(uchar const *, unsigned) const;
121 document_result Kerrors(uchar const *, unsigned) const;
123 document_result Prefix(uchar const *, DocId, DocId) const;
124 document_result Suffix(uchar const *, DocId, DocId) const;
125 document_result Equal(uchar const *, DocId, DocId) const;
126 document_result Contains(uchar const *, DocId, DocId) const;
127 document_result LessThan(uchar const *, DocId, DocId) const;
129 // Definition of full_result is inherited from SXSI::TextCollection.
130 full_result FullContains(uchar const *) const;
131 full_result FullContains(uchar const *, DocId, DocId) const;
132 full_result FullKmismatches(uchar const *, unsigned) const;
133 full_result FullKerrors(uchar const *, unsigned) const;
135 // Index from/to disk
136 TCImplementation(FILE *, unsigned);
137 void Save(FILE *) const;
140 typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
142 static const uchar versionFlag;
146 TextPosition bwtEndPos;
147 static_sequence * alphabetrank;
149 // Sample structures for texts longer than samplerate
150 static_bitsequence * sampled;
151 BlockArray *suffixes;
152 BlockArray *suffixDocId;
154 // Total number of texts in the collection
155 unsigned numberOfTexts;
156 // Length of the longest text
159 // Array of document id's in the order of end-markers in BWT
160 static_sequence *Doc;
162 // Text storage for fast extraction
163 TextStorage * textStorage;
165 // Following methods are not part of the public API
166 uchar * BWT(uchar *);
167 void makewavelet(uchar *);
168 void maketables(ulong);
169 DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
170 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
171 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
172 ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
173 ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
174 ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
175 ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const;
177 * Count end-markers in given interval
179 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
186 ranksp = alphabetrank->rank(0, sp - 1);
188 return alphabetrank->rank(0, ep) - ranksp;
192 * Count end-markers in given interval and
193 * within docIds [min,max]
195 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
198 sp = alphabetrank->rank(0, sp - 1);
199 ep = alphabetrank->rank(0, ep);
203 return Doc->count(sp, ep-1, min, max);
207 * Enumerate all end-markers in given interval
209 inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const
212 sp = alphabetrank->rank(0, sp - 1);
213 ep = alphabetrank->rank(0, ep);
215 return document_result();
217 return Doc->accessAll(sp, ep-1);
221 * Enumerate end-markers in given interval and
222 * within docIds [min,max]
224 inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
227 sp = alphabetrank->rank(0, sp - 1);
228 ep = alphabetrank->rank(0, ep);
230 return document_result();
232 return Doc->access(sp, ep-1, min, max);
236 * Enumerate documents in given interval [sp, ep]
238 inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
240 // We want unique document indentifiers, using std::set to collect them
241 // FIXME use unordered_set?
242 uint tmp_rank_c = 0; // Cache rank value of c.
243 for (; sp <= ep; ++sp)
246 uchar c = alphabetrank->access(i, tmp_rank_c);
247 while (c != '\0' && !sampled->access(i))
249 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
250 c = alphabetrank->access(i, tmp_rank_c);
254 // Rank among the end-markers in BWT
255 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
256 resultSet.insert(Doc->access(endmarkerRank));
260 DocId di = (*suffixDocId)[sampled->rank1(i)-1];
261 assert((unsigned)di < numberOfTexts);
262 resultSet.insert(di);
268 * Enumerate documents in given interval [sp, ep]
269 * and within [begin, end]
271 inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
273 // We want unique document indentifiers, using std::set to collect them
274 uint tmp_rank_c = 0; // Cache rank value of c.
275 for (; sp <= ep; ++sp)
278 uchar c = alphabetrank->access(i, tmp_rank_c);
279 while (c != '\0' && !sampled->access(i))
281 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
282 c = alphabetrank->access(i, tmp_rank_c);
286 // Rank among the end-markers in BWT
287 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
288 DocId docId = Doc->access(endmarkerRank);
289 if (docId >= begin && docId <= end)
290 resultSet.insert(docId);
294 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
295 assert((unsigned)docId < numberOfTexts);
296 if (docId >= begin && docId <= end)
297 resultSet.insert(docId);
303 * Enumerate document+position pairs (full_result) of
304 * each suffix in given interval.
306 inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
308 uint tmp_rank_c = 0; // Cache rank value of c.
309 for (; sp <= ep; ++sp)
312 TextPosition dist = 0;
313 uchar c = alphabetrank->access(i, tmp_rank_c);
314 while (c != '\0' && !sampled->access(i))
316 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
317 c = alphabetrank->access(i, tmp_rank_c);
322 // Rank among the end-markers in BWT
323 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
324 DocId docId = Doc->access(endmarkerRank);
325 result.push_back(make_pair(docId, dist));
329 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
330 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
332 result.push_back(make_pair(docId, textPos));
338 * Enumerate document+position pairs (full_result) of
339 * each suffix in given interval and within [begin, end].
341 inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
343 uint tmp_rank_c = 0; // Cache rank value of c.
344 for (; sp <= ep; ++sp)
347 TextPosition dist = 0;
348 uchar c = alphabetrank->access(i, tmp_rank_c);
349 while (c != '\0' && !sampled->access(i))
351 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
352 c = alphabetrank->access(i, tmp_rank_c);
357 // Rank among the end-markers in BWT
358 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
359 DocId docId = Doc->access(endmarkerRank);
360 if (docId >= begin && docId <= end)
361 result.push_back(make_pair(docId, dist));
365 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
366 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
368 if (docId >= begin && docId <= end)
369 result.push_back(make_pair(docId, textPos));
374 }; // class TCImplementation