1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
23 // Include from XMLTree/libcds
25 #include <static_bitsequence.h>
26 #include <alphabet_mapper.h>
27 #include <static_sequence.h>
29 //clash between TextCollection/Tools.h and libcds/includes/basics.h
30 // Number of bits in ulong vs uint.
37 #include "TextCollection.h"
38 #include "BlockArray.h"
39 #include "RLWaveletTree.h"
44 * Implementation of the TextCollection interface
46 * Implementation notes:
48 * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
49 * which requires O(n log \sigma) bits of memory. If we know the distribution
50 * of characters beforehand, space can easily be made O(nH_0).
52 * The method MakeStatic() constructs a Succinct Suffix Array with a huffman
53 * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
55 * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
56 * with any FM-index variant supporting LF-mapping.
58 class CSA : public SXSI::TextCollection {
61 * Constructor with (default) samplerate
63 explicit CSA(unsigned);
66 * Following functions implement the interface described in TextCollection.h.
67 * For details/documentation, see TextCollection.h.
69 void InsertText(uchar const *);
71 bool EmptyText(DocId) const;
72 uchar* GetText(DocId) const;
73 // uchar* GetText(DocId, TextPosition, TextPosition) const;
75 bool IsPrefix(uchar const *) const;
76 bool IsSuffix(uchar const *) const;
77 bool IsEqual(uchar const *) const;
78 bool IsContains(uchar const *) const;
79 bool IsLessThan(uchar const *) const;
81 unsigned CountPrefix(uchar const *) const;
82 unsigned CountSuffix(uchar const *) const;
83 unsigned CountEqual(uchar const *) const;
84 unsigned CountContains(uchar const *) const;
85 unsigned CountLessThan(const unsigned char*) const;
87 // document_result is inherited from SXSI::TextCollection.
88 document_result Prefix(uchar const *) const;
89 document_result Suffix(uchar const *) const;
90 document_result Equal(uchar const *) const;
91 document_result Contains(uchar const *) const;
92 document_result LessThan(uchar const *) const;
94 // full_result is inherited from SXSI::TextCollection.
95 full_result FullContains(uchar const *) const;
98 void Load(FILE *, unsigned);
99 void Save(FILE *) const;
102 // Debug FIXME Remove
121 void deleteEndmarker()
123 delete endmarkerDocId;
133 TCodeEntry() {count=0;bits=0;code=0u;};
137 class THuffAlphabetRank {
138 // using fixed 0...255 alphabet
141 THuffAlphabetRank *left;
142 THuffAlphabetRank *right;
143 TCodeEntry *codetable;
147 THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
148 THuffAlphabetRank(FILE *);
149 ~THuffAlphabetRank();
152 bool Test(uchar *, TextPosition);
154 inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
155 THuffAlphabetRank const * temp=this;
156 if (codetable[c].count == 0) return 0;
158 unsigned code = codetable[c].code;
159 while (!temp->leaf) {
160 if ((code & (1u<<level)) == 0) {
161 i = i-temp->bitrank->rank(i);
165 i = temp->bitrank->rank(i)-1;
172 inline bool IsCharAtPos(int c, TextPosition i) const {
173 THuffAlphabetRank const * temp=this;
174 if (codetable[c].count == 0) return false;
176 unsigned code = codetable[c].code;
177 while (!temp->leaf) {
178 if ((code & (1u<<level))==0) {
179 if (temp->bitrank->IsBitSet(i)) return false;
180 i = i-temp->bitrank->rank(i);
184 if (!temp->bitrank->IsBitSet(i)) return false;
185 i = temp->bitrank->rank(i)-1;
192 inline uchar access(TextPosition i) const {
193 THuffAlphabetRank const * temp=this;
194 while (!temp->leaf) {
195 if (temp->bitrank->IsBitSet(i)) {
196 i = temp->bitrank->rank(i)-1;
200 i = i-temp->bitrank->rank(i);
207 inline uchar charAtPos(ulong i, TextPosition *rank) const {
208 THuffAlphabetRank const * temp=this;
209 while (!temp->leaf) {
210 if (temp->bitrank->IsBitSet(i)) {
211 i = temp->bitrank->rank(i)-1;
214 i = i-temp->bitrank->rank(i);
230 void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
231 static void count_chars(uchar *, TextPosition , TCodeEntry *);
232 static unsigned SetBit(unsigned , unsigned , unsigned );
234 node( unsigned char c = 0, unsigned i = 0 ) {
241 node( node* c0, node *c1 ) {
243 weight = c0->weight + c1->weight;
249 bool operator>( const node &a ) const {
250 return weight > a.weight;
253 static TCodeEntry *makecodetable(uchar *, TextPosition);
256 static const unsigned char print = 1;
257 static const unsigned char report = 1;
258 static const uchar versionFlag;
262 TextPosition bwtEndPos;
263 // THuffAlphabetRank *alphabetrank;
264 // RLWaveletTree *alphabetrank;
265 static_sequence * alphabetrank;
267 BlockArray *suffixes;
268 BlockArray *suffixDocId;
269 BlockArray *positions;
270 TCodeEntry *codetable;
273 // Total number of texts in the collection
274 unsigned numberOfTexts;
275 // Total number of texts including empty texts
276 unsigned numberOfAllTexts;
277 // Length of the longest text
280 // Array of document id's in the order of end-markers in BWT
281 // Access by endmarkerDocId[rank_$(L, p) - 1].
282 BlockArray *endmarkerDocId;
283 // Array of text lengths (in the inserted order)
284 BlockArray *textLength;
285 // Array of text starting positions (in the inserted order)
286 BlockArray *textStartPos;
288 // FIXME Replace with a more succinct data structure
289 std::set<unsigned> emptyTextId;
290 BSGAP *emptyTextRank;
293 uchar * BWT(uchar *);
294 uchar * LoadFromFile(const char *);
295 void SaveToFile(const char *, uchar *);
296 void makewavelet(uchar *);
299 // Following are not part of the public API
300 DocId DocIdAtTextPos(TextPosition) const;
301 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
302 TextPosition Inverse(TextPosition) const;
303 TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
304 TextPosition Psi(TextPosition) const;
305 uchar * Substring(TextPosition, TextPosition) const;
306 TextPosition Lookup(TextPosition) const;
309 * Count end-markers in given interval
311 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
318 ranksp = alphabetrank->rank(0, sp - 1);
320 return alphabetrank->rank(0, ep) - ranksp;