1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
25 #include "TextCollection.h"
26 #include "BlockArray.h"
27 #include "RLWaveletTree.h"
28 #include "StringIterator.h"
32 // Include from XMLTree/libcds
33 #include <basics.h> // Defines W == 32
34 #include <static_bitsequence.h>
35 #include <alphabet_mapper.h>
36 #include <static_sequence.h>
38 // Re-define word size to ulong:
50 // Un-comment to compare BWT against a BWT generated from class dynFMI:
51 //#define CSA_TEST_BWT
54 * Implementation of the TextCollection interface
57 class CSA : public SXSI::TextCollection {
60 * Constructor with (default) samplerate
62 explicit CSA(unsigned);
65 * Following functions implement the interface described in TextCollection.h.
66 * For details/documentation, see TextCollection.h.
68 void InsertText(uchar const *);
70 bool EmptyText(DocId) const;
71 uchar* GetText(DocId) const;
73 * Next method is not supported:
74 * Supporting GetText for some substring [i,j]
75 * would require more space.
77 // uchar* GetText(DocId, TextPosition, TextPosition) const;
79 bool IsPrefix(uchar const *) const;
80 bool IsSuffix(uchar const *) const;
81 bool IsEqual(uchar const *) const;
82 bool IsContains(uchar const *) const;
83 bool IsLessThan(uchar const *) const;
85 bool IsPrefix(uchar const *, DocId, DocId) const;
86 bool IsSuffix(uchar const *, DocId, DocId) const;
87 bool IsEqual(uchar const *, DocId, DocId) const;
88 bool IsContains(uchar const *, DocId, DocId) const;
89 bool IsLessThan(uchar const *, DocId, DocId) const;
91 ulong Count(uchar const *) const;
92 unsigned CountPrefix(uchar const *) const;
93 unsigned CountSuffix(uchar const *) const;
94 unsigned CountEqual(uchar const *) const;
95 unsigned CountContains(uchar const *) const;
96 unsigned CountLessThan(const unsigned char*) const;
98 unsigned CountPrefix(uchar const *, DocId, DocId) const;
99 unsigned CountSuffix(uchar const *, DocId, DocId) const;
100 unsigned CountEqual(uchar const *, DocId, DocId) const;
101 unsigned CountContains(uchar const *, DocId, DocId) const;
102 unsigned CountLessThan(uchar const *, DocId, DocId) const;
104 // Definition of document_result is inherited from SXSI::TextCollection.
105 document_result Prefix(uchar const *) const;
106 document_result Suffix(uchar const *) const;
107 document_result Equal(uchar const *) const;
108 document_result Contains(uchar const *) const;
109 document_result LessThan(uchar const *) const;
111 document_result Prefix(uchar const *, DocId, DocId) const;
112 document_result Suffix(uchar const *, DocId, DocId) const;
113 document_result Equal(uchar const *, DocId, DocId) const;
114 document_result Contains(uchar const *, DocId, DocId) const;
115 document_result LessThan(uchar const *, DocId, DocId) const;
117 // Definition of full_result is inherited from SXSI::TextCollection.
118 full_result FullContains(uchar const *) const;
119 full_result FullContains(uchar const *, DocId, DocId) const;
121 // Index from/to disk
122 void Load(FILE *, unsigned);
123 void Save(FILE *) const;
143 void deleteEndmarker()
145 delete endmarkerDocId;
156 TCodeEntry() {count=0;bits=0;code=0u;};
161 class THuffAlphabetRank {
162 // using fixed 0...255 alphabet
165 THuffAlphabetRank *left;
166 THuffAlphabetRank *right;
167 TCodeEntry *codetable;
171 THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
172 THuffAlphabetRank(FILE *);
173 ~THuffAlphabetRank();
176 bool Test(uchar *, TextPosition);
178 inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
179 THuffAlphabetRank const * temp=this;
180 if (codetable[c].count == 0) return 0;
182 unsigned code = codetable[c].code;
183 while (!temp->leaf) {
184 if ((code & (1u<<level)) == 0) {
185 i = i-temp->bitrank->rank(i);
189 i = temp->bitrank->rank(i)-1;
196 inline bool IsCharAtPos(int c, TextPosition i) const {
197 THuffAlphabetRank const * temp=this;
198 if (codetable[c].count == 0) return false;
200 unsigned code = codetable[c].code;
201 while (!temp->leaf) {
202 if ((code & (1u<<level))==0) {
203 if (temp->bitrank->IsBitSet(i)) return false;
204 i = i-temp->bitrank->rank(i);
208 if (!temp->bitrank->IsBitSet(i)) return false;
209 i = temp->bitrank->rank(i)-1;
216 inline uchar access(TextPosition i) const {
217 THuffAlphabetRank const * temp=this;
218 while (!temp->leaf) {
219 if (temp->bitrank->IsBitSet(i)) {
220 i = temp->bitrank->rank(i)-1;
224 i = i-temp->bitrank->rank(i);
231 inline uchar charAtPos(ulong i, TextPosition *rank) const {
232 THuffAlphabetRank const * temp=this;
233 while (!temp->leaf) {
234 if (temp->bitrank->IsBitSet(i)) {
235 i = temp->bitrank->rank(i)-1;
238 i = i-temp->bitrank->rank(i);
255 void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
256 static void count_chars(uchar *, TextPosition , TCodeEntry *);
257 static unsigned SetBit(unsigned , unsigned , unsigned );
259 node( unsigned char c = 0, unsigned i = 0 ) {
266 node( node* c0, node *c1 ) {
268 weight = c0->weight + c1->weight;
274 bool operator>( const node &a ) const {
275 return weight > a.weight;
278 static TCodeEntry *makecodetable(uchar *, TextPosition);
281 static const uchar versionFlag;
285 TextPosition bwtEndPos;
286 // THuffAlphabetRank *alphabetrank;
287 // RLWaveletTree *alphabetrank;
288 static_sequence * alphabetrank;
293 TCodeEntry *codetable;
295 // Sample structures for texts longer than samplerate
297 BlockArray *suffixes;
298 BlockArray *suffixDocId;
300 // Total number of texts in the collection
301 unsigned numberOfTexts;
302 // Total number of texts including empty texts
303 unsigned numberOfAllTexts;
304 // Length of the longest text
307 // Array of document id's in the order of end-markers in BWT
308 // Access by endmarkerDocId[rank_$(L, p) - 1].
309 BlockArray *endmarkerDocId;
311 // FIXME Replace with a more succinct data structure
312 // Note: Empty texts are already handled inside XMLTree class.
313 std::set<unsigned> emptyTextId;
314 BSGAP *emptyTextRank;
316 // FIXME A better solution?
319 // Following are not part of the public API
320 uchar * BWT(uchar *);
321 void makewavelet(uchar *);
323 DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
324 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
325 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
326 ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
327 // TextPosition Inverse(TextPosition) const;
328 // TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
329 // TextPosition Psi(TextPosition) const;
330 // uchar * Substring(TextPosition, TextPosition) const;
331 // TextPosition Lookup(TextPosition) const;
334 * Count end-markers in given interval
336 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
343 ranksp = alphabetrank->rank(0, sp - 1);
345 return alphabetrank->rank(0, ep) - ranksp;
347 unsigned CountEndmarkers(TextPosition, TextPosition, DocId, DocId) const;