1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
28 #include "TextCollection.h"
29 #include "BlockArray.h"
32 * Implementation of the TextCollection interface
34 * Implementation notes:
36 * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
37 * which requires O(n log \sigma) bits of memory. If we know the distribution
38 * of characters beforehand, space can easily be made O(nH_0).
40 * The method MakeStatic() constructs a Succinct Suffix Array with a huffman
41 * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
43 * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
44 * with any FM-index variant supporting LF-mapping.
46 class CSA : public SXSI::TextCollection {
49 * Constructor with (default) samplerate
51 explicit CSA(unsigned);
54 * Following functions implement the interface described in TextCollection.h.
55 * For details/documentation, see TextCollection.h.
57 void InsertText(uchar const *);
59 bool EmptyText(DocId) const;
60 uchar* GetText(DocId) const;
61 uchar* GetText(DocId, TextPosition, TextPosition) const;
63 bool IsPrefix(uchar const *) const;
64 bool IsSuffix(uchar const *) const;
65 bool IsEqual(uchar const *) const;
66 bool IsContains(uchar const *) const;
67 bool IsLessThan(uchar const *) const;
69 unsigned CountPrefix(uchar const *) const;
70 unsigned CountSuffix(uchar const *) const;
71 unsigned CountEqual(uchar const *) const;
72 unsigned CountContains(uchar const *) const;
73 unsigned CountLessThan(const unsigned char*) const;
75 // document_result is inherited from SXSI::TextCollection.
76 document_result Prefix(uchar const *) const;
77 document_result Suffix(uchar const *) const;
78 document_result Equal(uchar const *) const;
79 document_result Contains(uchar const *) const;
80 document_result LessThan(uchar const *) const;
82 // full_result is inherited from SXSI::TextCollection.
83 full_result FullContains(uchar const *) const;
86 void Load(FILE *, unsigned);
87 void Save(FILE *) const;
90 TextPosition Lookup(TextPosition) const;
98 TCodeEntry() {count=0;bits=0;code=0u;};
102 class THuffAlphabetRank {
103 // using fixed 0...255 alphabet
106 THuffAlphabetRank *left;
107 THuffAlphabetRank *right;
108 TCodeEntry *codetable;
112 THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
113 ~THuffAlphabetRank();
114 bool Test(uchar *, TextPosition);
116 inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
117 THuffAlphabetRank const * temp=this;
118 if (codetable[c].count == 0) return 0;
120 unsigned code = codetable[c].code;
121 while (!temp->leaf) {
122 if ((code & (1u<<level)) == 0) {
123 i = i-temp->bitrank->rank(i);
127 i = temp->bitrank->rank(i)-1;
134 inline bool IsCharAtPos(int c, TextPosition i) const {
135 THuffAlphabetRank const * temp=this;
136 if (codetable[c].count == 0) return false;
138 unsigned code = codetable[c].code;
139 while (!temp->leaf) {
140 if ((code & (1u<<level))==0) {
141 if (temp->bitrank->IsBitSet(i)) return false;
142 i = i-temp->bitrank->rank(i);
146 if (!temp->bitrank->IsBitSet(i)) return false;
147 i = temp->bitrank->rank(i)-1;
154 inline int charAtPos(TextPosition i) const {
155 THuffAlphabetRank const * temp=this;
156 while (!temp->leaf) {
157 if (temp->bitrank->IsBitSet(i)) {
158 i = temp->bitrank->rank(i)-1;
162 i = i-temp->bitrank->rank(i);
166 return (int)temp->ch;
177 void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
178 static void count_chars(uchar *, TextPosition , TCodeEntry *);
179 static unsigned SetBit(unsigned , unsigned , unsigned );
181 node( unsigned char c = 0, unsigned i = 0 ) {
188 node( node* c0, node *c1 ) {
190 weight = c0->weight + c1->weight;
196 bool operator>( const node &a ) const {
197 return weight > a.weight;
200 static TCodeEntry *makecodetable(uchar *, TextPosition);
203 static const unsigned char print = 1;
204 static const unsigned char report = 1;
208 TextPosition bwtEndPos;
209 THuffAlphabetRank *alphabetrank;
211 BlockArray *suffixes;
212 BlockArray *positions;
213 TCodeEntry *codetable;
216 // Total number of texts in the collection
217 unsigned numberOfTexts;
218 // Length of the longest text
222 // Array of document id's in the order of end-markers in BWT
223 // Access by endmarkerDocId[rank_$(L, p) - 1].
224 BlockArray *endmarkerDocId;
225 // Array of text lengths (in the inserted order)
226 BlockArray *textLength;
227 // Array of text starting positions (in the inserted order)
228 BlockArray *textStartPos;
231 uchar * BWT(uchar *);
232 uchar * LoadFromFile(const char *);
233 void SaveToFile(const char *, uchar *);
234 void makewavelet(uchar *);
237 // Following are not part of the public API
238 DocId DocIdAtTextPos(TextPosition) const;
239 unsigned CountEndmarkers(TextPosition, TextPosition) const;
240 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
241 TextPosition Inverse(TextPosition) const;
242 TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
243 TextPosition Psi(TextPosition) const;
244 uchar * Substring(TextPosition, TextPosition) const;