1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #ifndef _TCImplementation_H_
22 #define _TCImplementation_H_
24 #include "incbwt/bits/deltavector.h"
27 #include "TextCollection.h"
28 #include "BlockArray.h"
30 // Include from XMLTree/libcds
31 #include <basics.h> // Defines W == 32
32 #include <static_bitsequence.h>
33 #include <alphabet_mapper.h>
34 #include <static_sequence.h>
35 #include <static_sequence_wvtree_noptrs.h>
37 // Re-define word size to ulong:
48 #include "TextStorage.h"
58 * Implementation of the TextCollection interface
61 class TCImplementation : public SXSI::TextCollection {
63 TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong,
64 CSA::DeltaVector &, const std::string &, char);
67 bool EmptyText(DocId) const;
70 * Extracting one text.
72 * Call DeleteText() for each pointer returned by GetText()
73 * to avoid possible memory leaks.
75 uchar * GetText(DocId) const;
76 void DeleteText(uchar *text) const
77 { textStorage->DeleteText(text); }
80 * Returns a pointer to the beginning of texts i, i+1, ..., j.
81 * Texts are separated by a '\0' byte.
83 * Call DeleteText() for each pointer returned by GetText()
84 * to avoid possible memory leaks.
86 uchar * GetText(DocId i, DocId j) const
87 { return textStorage->GetText(i, j); }
90 * Returns a substring of given text ID.
92 * FIXME This may be reimplemented via TextStorage.
94 // uchar* GetText(DocId, TextPosition, TextPosition) const;
96 bool IsPrefix(uchar const *) const;
97 bool IsSuffix(uchar const *) const;
98 bool IsEqual(uchar const *) const;
99 bool IsContains(uchar const *) const;
100 bool IsLessThan(uchar const *) const;
102 bool IsPrefix(uchar const *, DocId, DocId) const;
103 bool IsSuffix(uchar const *, DocId, DocId) const;
104 bool IsEqual(uchar const *, DocId, DocId) const;
105 bool IsContains(uchar const *, DocId, DocId) const;
106 bool IsLessThan(uchar const *, DocId, DocId) const;
108 ulong Count(uchar const *) const;
109 unsigned CountPrefix(uchar const *) const;
110 unsigned CountSuffix(uchar const *) const;
111 unsigned CountEqual(uchar const *) const;
112 unsigned CountContains(uchar const *) const;
113 unsigned CountLessThan(const unsigned char*) const;
115 unsigned CountPrefix(uchar const *, DocId, DocId) const;
116 unsigned CountSuffix(uchar const *, DocId, DocId) const;
117 unsigned CountEqual(uchar const *, DocId, DocId) const;
118 unsigned CountContains(uchar const *, DocId, DocId) const;
119 unsigned CountLessThan(uchar const *, DocId, DocId) const;
121 // Definition of document_result is inherited from SXSI::TextCollection.
122 document_result Prefix(uchar const *) const;
123 document_result Suffix(uchar const *) const;
124 document_result Equal(uchar const *) const;
125 document_result Contains(uchar const *) const;
126 document_result LessThan(uchar const *) const;
127 document_result KMismaches(uchar const *, unsigned) const;
128 document_result KErrors(uchar const *, unsigned) const;
130 document_result Prefix(uchar const *, DocId, DocId) const;
131 document_result Suffix(uchar const *, DocId, DocId) const;
132 document_result Equal(uchar const *, DocId, DocId) const;
133 document_result Contains(uchar const *, DocId, DocId) const;
134 document_result LessThan(uchar const *, DocId, DocId) const;
136 // Definition of full_result is inherited from SXSI::TextCollection.
137 full_result FullContains(uchar const *) const;
138 full_result FullContains(uchar const *, DocId, DocId) const;
139 full_result FullKMismatches(uchar const *, unsigned) const;
140 full_result FullKErrors(uchar const *, unsigned) const;
142 // Index from/to disk
143 TCImplementation(FILE *, unsigned);
144 void Save(FILE *) const;
147 typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
149 static const uchar versionFlag;
153 TextPosition bwtEndPos;
154 static_sequence * alphabetrank;
156 // Sample structures for texts longer than samplerate
157 static_bitsequence * sampled;
158 BlockArray *suffixes;
159 BlockArray *suffixDocId;
161 // Total number of texts in the collection
162 unsigned numberOfTexts;
163 // Length of the longest text
166 // Array of document id's in the order of end-markers in BWT
167 // static_sequence *Doc;
170 // Text storage for fast extraction
171 TextStorage * textStorage;
173 // Following methods are not part of the public API
174 uchar * BWT(uchar *);
175 void makewavelet(uchar *);
176 void maketables(ulong, char, CSA::DeltaVector &, const std::string &);
177 DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
178 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
179 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
180 ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
181 ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
182 ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
183 ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const;
185 * Count end-markers in given interval
187 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
194 ranksp = alphabetrank->rank(0, sp - 1);
196 return alphabetrank->rank(0, ep) - ranksp;
200 * Count end-markers in given interval and
201 * within docIds [min,max]
203 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
206 sp = alphabetrank->rank(0, sp - 1);
207 ep = alphabetrank->rank(0, ep);
211 return Doc->count(sp, ep-1, min, max);
215 * Enumerate all end-markers in given interval
217 inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const
220 sp = alphabetrank->rank(0, sp - 1);
221 ep = alphabetrank->rank(0, ep);
223 return document_result();
225 return Doc->accessAll(sp, ep-1);
229 * Enumerate end-markers in given interval and
230 * within docIds [min,max]
232 inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
235 sp = alphabetrank->rank(0, sp - 1);
236 ep = alphabetrank->rank(0, ep);
238 return document_result();
240 return Doc->access(sp, ep-1, min, max);
244 * Enumerate documents in given interval [sp, ep]
246 inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
248 // We want unique document indentifiers, using std::set to collect them
249 // FIXME use unordered_set?
250 uint tmp_rank_c = 0; // Cache rank value of c.
251 for (; sp <= ep; ++sp)
254 uchar c = alphabetrank->access(i, tmp_rank_c);
255 while (c != '\0' && !sampled->access(i))
257 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
258 c = alphabetrank->access(i, tmp_rank_c);
262 // Rank among the end-markers in BWT
263 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
264 resultSet.insert(Doc->access(endmarkerRank));
268 DocId di = (*suffixDocId)[sampled->rank1(i)-1];
269 assert((unsigned)di < numberOfTexts);
270 resultSet.insert(di);
276 * Enumerate documents in given interval [sp, ep]
277 * and within [begin, end]
279 inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
281 // We want unique document indentifiers, using std::set to collect them
282 uint tmp_rank_c = 0; // Cache rank value of c.
283 for (; sp <= ep; ++sp)
286 uchar c = alphabetrank->access(i, tmp_rank_c);
287 while (c != '\0' && !sampled->access(i))
289 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
290 c = alphabetrank->access(i, tmp_rank_c);
294 // Rank among the end-markers in BWT
295 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
296 DocId docId = Doc->access(endmarkerRank);
297 if (docId >= begin && docId <= end)
298 resultSet.insert(docId);
302 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
303 assert((unsigned)docId < numberOfTexts);
304 if (docId >= begin && docId <= end)
305 resultSet.insert(docId);
311 * Enumerate document+position pairs (full_result) of
312 * each suffix in given interval.
314 inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
316 uint tmp_rank_c = 0; // Cache rank value of c.
317 for (; sp <= ep; ++sp)
320 TextPosition dist = 0;
321 uchar c = alphabetrank->access(i, tmp_rank_c);
322 while (c != '\0' && !sampled->access(i))
324 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
325 c = alphabetrank->access(i, tmp_rank_c);
330 // Rank among the end-markers in BWT
331 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
332 DocId docId = Doc->access(endmarkerRank);
333 result.push_back(std::make_pair(docId, dist));
337 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
338 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
340 result.push_back(std::make_pair(docId, textPos));
346 * Enumerate document+position pairs (full_result) of
347 * each suffix in given interval and within [begin, end].
349 inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
351 uint tmp_rank_c = 0; // Cache rank value of c.
352 for (; sp <= ep; ++sp)
355 TextPosition dist = 0;
356 uchar c = alphabetrank->access(i, tmp_rank_c);
357 while (c != '\0' && !sampled->access(i))
359 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
360 c = alphabetrank->access(i, tmp_rank_c);
365 // Rank among the end-markers in BWT
366 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
367 DocId docId = Doc->access(endmarkerRank);
368 if (docId >= begin && docId <= end)
369 result.push_back(std::make_pair(docId, dist));
373 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
374 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
376 if (docId >= begin && docId <= end)
377 result.push_back(std::make_pair(docId, textPos));
382 }; // class TCImplementation