1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
25 #include "TextCollection.h"
26 #include "BlockArray.h"
27 #include "RLWaveletTree.h"
31 // Include from XMLTree/libcds
33 #include <static_bitsequence.h>
34 #include <alphabet_mapper.h>
35 #include <static_sequence.h>
37 // Re-define word size to ulong:
47 * Implementation of the TextCollection interface
49 * Implementation notes:
51 * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
52 * which requires O(n log \sigma) bits of memory. If we know the distribution
53 * of characters beforehand, space can easily be made O(nH_0).
55 * The method MakeStatic() constructs a Succinct Suffix Array with a huffman
56 * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
58 * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
59 * with any FM-index variant supporting LF-mapping.
61 class CSA : public SXSI::TextCollection {
64 * Constructor with (default) samplerate
66 explicit CSA(unsigned);
69 * Following functions implement the interface described in TextCollection.h.
70 * For details/documentation, see TextCollection.h.
72 void InsertText(uchar const *);
74 bool EmptyText(DocId) const;
75 uchar* GetText(DocId) const;
76 // uchar* GetText(DocId, TextPosition, TextPosition) const;
78 bool IsPrefix(uchar const *) const;
79 bool IsSuffix(uchar const *) const;
80 bool IsEqual(uchar const *) const;
81 bool IsContains(uchar const *) const;
82 bool IsLessThan(uchar const *) const;
84 unsigned CountPrefix(uchar const *) const;
85 unsigned CountSuffix(uchar const *) const;
86 unsigned CountEqual(uchar const *) const;
87 unsigned CountContains(uchar const *) const;
88 unsigned CountLessThan(const unsigned char*) const;
90 // document_result is inherited from SXSI::TextCollection.
91 document_result Prefix(uchar const *) const;
92 document_result Suffix(uchar const *) const;
93 document_result Equal(uchar const *) const;
94 document_result Contains(uchar const *) const;
95 document_result LessThan(uchar const *) const;
97 // full_result is inherited from SXSI::TextCollection.
98 full_result FullContains(uchar const *) const;
100 // Index from/to disk
101 void Load(FILE *, unsigned);
102 void Save(FILE *) const;
105 // Debug FIXME Remove
124 void deleteEndmarker()
126 delete endmarkerDocId;
136 TCodeEntry() {count=0;bits=0;code=0u;};
140 class THuffAlphabetRank {
141 // using fixed 0...255 alphabet
144 THuffAlphabetRank *left;
145 THuffAlphabetRank *right;
146 TCodeEntry *codetable;
150 THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
151 THuffAlphabetRank(FILE *);
152 ~THuffAlphabetRank();
155 bool Test(uchar *, TextPosition);
157 inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
158 THuffAlphabetRank const * temp=this;
159 if (codetable[c].count == 0) return 0;
161 unsigned code = codetable[c].code;
162 while (!temp->leaf) {
163 if ((code & (1u<<level)) == 0) {
164 i = i-temp->bitrank->rank(i);
168 i = temp->bitrank->rank(i)-1;
175 inline bool IsCharAtPos(int c, TextPosition i) const {
176 THuffAlphabetRank const * temp=this;
177 if (codetable[c].count == 0) return false;
179 unsigned code = codetable[c].code;
180 while (!temp->leaf) {
181 if ((code & (1u<<level))==0) {
182 if (temp->bitrank->IsBitSet(i)) return false;
183 i = i-temp->bitrank->rank(i);
187 if (!temp->bitrank->IsBitSet(i)) return false;
188 i = temp->bitrank->rank(i)-1;
195 inline uchar access(TextPosition i) const {
196 THuffAlphabetRank const * temp=this;
197 while (!temp->leaf) {
198 if (temp->bitrank->IsBitSet(i)) {
199 i = temp->bitrank->rank(i)-1;
203 i = i-temp->bitrank->rank(i);
210 inline uchar charAtPos(ulong i, TextPosition *rank) const {
211 THuffAlphabetRank const * temp=this;
212 while (!temp->leaf) {
213 if (temp->bitrank->IsBitSet(i)) {
214 i = temp->bitrank->rank(i)-1;
217 i = i-temp->bitrank->rank(i);
233 void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
234 static void count_chars(uchar *, TextPosition , TCodeEntry *);
235 static unsigned SetBit(unsigned , unsigned , unsigned );
237 node( unsigned char c = 0, unsigned i = 0 ) {
244 node( node* c0, node *c1 ) {
246 weight = c0->weight + c1->weight;
252 bool operator>( const node &a ) const {
253 return weight > a.weight;
256 static TCodeEntry *makecodetable(uchar *, TextPosition);
259 static const unsigned char print = 1;
260 static const unsigned char report = 1;
261 static const uchar versionFlag;
265 TextPosition bwtEndPos;
266 // THuffAlphabetRank *alphabetrank;
267 // RLWaveletTree *alphabetrank;
268 static_sequence * alphabetrank;
270 BlockArray *suffixes;
271 BlockArray *suffixDocId;
272 BlockArray *positions;
273 TCodeEntry *codetable;
276 // Total number of texts in the collection
277 unsigned numberOfTexts;
278 // Total number of texts including empty texts
279 unsigned numberOfAllTexts;
280 // Length of the longest text
283 // Array of document id's in the order of end-markers in BWT
284 // Access by endmarkerDocId[rank_$(L, p) - 1].
285 BlockArray *endmarkerDocId;
286 // Array of text lengths (in the inserted order)
287 BlockArray *textLength;
288 // Array of text starting positions (in the inserted order)
289 BlockArray *textStartPos;
291 // FIXME Replace with a more succinct data structure
292 std::set<unsigned> emptyTextId;
293 BSGAP *emptyTextRank;
296 uchar * BWT(uchar *);
297 uchar * LoadFromFile(const char *);
298 void SaveToFile(const char *, uchar *);
299 void makewavelet(uchar *);
302 // Following are not part of the public API
303 DocId DocIdAtTextPos(TextPosition) const;
304 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
305 TextPosition Inverse(TextPosition) const;
306 TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
307 TextPosition Psi(TextPosition) const;
308 uchar * Substring(TextPosition, TextPosition) const;
309 TextPosition Lookup(TextPosition) const;
312 * Count end-markers in given interval
314 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
321 ranksp = alphabetrank->rank(0, sp - 1);
323 return alphabetrank->rank(0, ep) - ranksp;