1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
25 #include "TextCollection.h"
26 #include "BlockArray.h"
27 #include "RLWaveletTree.h"
31 // Include from XMLTree/libcds
33 #include <static_bitsequence.h>
34 #include <alphabet_mapper.h>
35 #include <static_sequence.h>
37 // Re-define word size to ulong:
48 * Implementation of the TextCollection interface
50 * Implementation notes:
52 * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
53 * which requires O(n log \sigma) bits of memory. If we know the distribution
54 * of characters beforehand, space can easily be made O(nH_0).
56 * The method MakeStatic() constructs a Succinct Suffix Array with a huffman
57 * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
59 * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
60 * with any FM-index variant supporting LF-mapping.
62 class CSA : public SXSI::TextCollection {
65 * Constructor with (default) samplerate
67 explicit CSA(unsigned);
70 * Following functions implement the interface described in TextCollection.h.
71 * For details/documentation, see TextCollection.h.
73 void InsertText(uchar const *);
75 bool EmptyText(DocId) const;
76 uchar* GetText(DocId) const;
78 * Next method is not supported:
79 * Supporting GetText for some substring [i,j]
80 * would require more space.
82 // uchar* GetText(DocId, TextPosition, TextPosition) const;
84 bool IsPrefix(uchar const *) const;
85 bool IsSuffix(uchar const *) const;
86 bool IsEqual(uchar const *) const;
87 bool IsContains(uchar const *) const;
88 bool IsLessThan(uchar const *) const;
90 ulong Count(uchar const *) const;
91 unsigned CountPrefix(uchar const *) const;
92 unsigned CountSuffix(uchar const *) const;
93 unsigned CountEqual(uchar const *) const;
94 unsigned CountContains(uchar const *) const;
95 unsigned CountLessThan(const unsigned char*) const;
97 // document_result is inherited from SXSI::TextCollection.
98 document_result Prefix(uchar const *) const;
99 document_result Suffix(uchar const *) const;
100 document_result Equal(uchar const *) const;
101 document_result Contains(uchar const *) const;
102 document_result LessThan(uchar const *) const;
104 // full_result is inherited from SXSI::TextCollection.
105 full_result FullContains(uchar const *) const;
107 // Index from/to disk
108 void Load(FILE *, unsigned);
109 void Save(FILE *) const;
112 // Debug FIXME Remove
131 void deleteEndmarker()
133 delete endmarkerDocId;
144 TCodeEntry() {count=0;bits=0;code=0u;};
149 class THuffAlphabetRank {
150 // using fixed 0...255 alphabet
153 THuffAlphabetRank *left;
154 THuffAlphabetRank *right;
155 TCodeEntry *codetable;
159 THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
160 THuffAlphabetRank(FILE *);
161 ~THuffAlphabetRank();
164 bool Test(uchar *, TextPosition);
166 inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
167 THuffAlphabetRank const * temp=this;
168 if (codetable[c].count == 0) return 0;
170 unsigned code = codetable[c].code;
171 while (!temp->leaf) {
172 if ((code & (1u<<level)) == 0) {
173 i = i-temp->bitrank->rank(i);
177 i = temp->bitrank->rank(i)-1;
184 inline bool IsCharAtPos(int c, TextPosition i) const {
185 THuffAlphabetRank const * temp=this;
186 if (codetable[c].count == 0) return false;
188 unsigned code = codetable[c].code;
189 while (!temp->leaf) {
190 if ((code & (1u<<level))==0) {
191 if (temp->bitrank->IsBitSet(i)) return false;
192 i = i-temp->bitrank->rank(i);
196 if (!temp->bitrank->IsBitSet(i)) return false;
197 i = temp->bitrank->rank(i)-1;
204 inline uchar access(TextPosition i) const {
205 THuffAlphabetRank const * temp=this;
206 while (!temp->leaf) {
207 if (temp->bitrank->IsBitSet(i)) {
208 i = temp->bitrank->rank(i)-1;
212 i = i-temp->bitrank->rank(i);
219 inline uchar charAtPos(ulong i, TextPosition *rank) const {
220 THuffAlphabetRank const * temp=this;
221 while (!temp->leaf) {
222 if (temp->bitrank->IsBitSet(i)) {
223 i = temp->bitrank->rank(i)-1;
226 i = i-temp->bitrank->rank(i);
243 void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
244 static void count_chars(uchar *, TextPosition , TCodeEntry *);
245 static unsigned SetBit(unsigned , unsigned , unsigned );
247 node( unsigned char c = 0, unsigned i = 0 ) {
254 node( node* c0, node *c1 ) {
256 weight = c0->weight + c1->weight;
262 bool operator>( const node &a ) const {
263 return weight > a.weight;
266 static TCodeEntry *makecodetable(uchar *, TextPosition);
270 static const unsigned char print = 1;
271 static const unsigned char report = 1;
272 static const uchar versionFlag;
276 TextPosition bwtEndPos;
277 // THuffAlphabetRank *alphabetrank;
278 // RLWaveletTree *alphabetrank;
279 static_sequence * alphabetrank;
281 BlockArray *suffixes;
282 BlockArray *suffixDocId;
283 BlockArray *positions;
284 TCodeEntry *codetable;
287 // Total number of texts in the collection
288 unsigned numberOfTexts;
289 // Total number of texts including empty texts
290 unsigned numberOfAllTexts;
291 // Length of the longest text
294 // Array of document id's in the order of end-markers in BWT
295 // Access by endmarkerDocId[rank_$(L, p) - 1].
296 BlockArray *endmarkerDocId;
297 // Array of text lengths (in the inserted order)
298 BlockArray *textLength;
299 // Array of text starting positions (in the inserted order)
300 BlockArray *textStartPos;
302 // FIXME Replace with a more succinct data structure
303 std::set<unsigned> emptyTextId;
304 BSGAP *emptyTextRank;
307 uchar * BWT(uchar *);
308 uchar * LoadFromFile(const char *);
309 void SaveToFile(const char *, uchar *);
310 void makewavelet(uchar *);
313 // Following are not part of the public API
314 DocId DocIdAtTextPos(TextPosition) const;
315 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
316 TextPosition Inverse(TextPosition) const;
317 TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
318 TextPosition Psi(TextPosition) const;
319 uchar * Substring(TextPosition, TextPosition) const;
320 TextPosition Lookup(TextPosition) const;
323 * Count end-markers in given interval
325 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
332 ranksp = alphabetrank->rank(0, sp - 1);
334 return alphabetrank->rank(0, ep) - ranksp;