1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
28 #include "TextCollection.h"
31 * Implementation of the TextCollection interface
33 * Implementation notes:
35 * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
36 * which requires O(n log \sigma) bits of memory. If we know the distribution
37 * of characters beforehand, space can easily be made O(nH_0).
39 * The method MakeStatic() constructs a Succinct Suffix Array with a huffman
40 * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
42 * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
43 * with any FM-index variant supporting LF-mapping.
45 class CSA : public SXSI::TextCollection {
48 * Constructor with (default) samplerate
50 explicit CSA(unsigned);
53 * Following functions implement the interface described in TextCollection.h.
54 * For details/documentation, see TextCollection.h.
56 void InsertText(uchar const *);
58 bool EmptyText(DocId) const;
59 uchar* GetText(DocId) const;
60 uchar* GetText(DocId, TextPosition, TextPosition) const;
62 bool IsPrefix(uchar const *) const;
63 bool IsSuffix(uchar const *) const;
64 bool IsEqual(uchar const *) const;
65 bool IsContains(uchar const *) const;
66 bool IsLessThan(uchar const *) const;
68 unsigned CountPrefix(uchar const *) const;
69 unsigned CountSuffix(uchar const *) const;
70 unsigned CountEqual(uchar const *) const;
71 unsigned CountContains(uchar const *) const;
72 unsigned CountLessThan(const unsigned char*) const;
74 // document_result is inherited from SXSI::TextCollection.
75 document_result Prefix(uchar const *) const;
76 document_result Suffix(uchar const *) const;
77 document_result Equal(uchar const *) const;
78 document_result Contains(uchar const *) const;
79 document_result LessThan(uchar const *) const;
81 // full_result is inherited from SXSI::TextCollection.
82 full_result FullContains(uchar const *) const;
85 void Load(FILE *, unsigned);
86 void Save(FILE *) const;
94 TCodeEntry() {count=0;bits=0;code=0u;};
98 class THuffAlphabetRank {
99 // using fixed 0...255 alphabet
102 THuffAlphabetRank *left;
103 THuffAlphabetRank *right;
104 TCodeEntry *codetable;
108 THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
109 ~THuffAlphabetRank();
110 bool Test(uchar *, TextPosition);
112 inline ulong rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
113 THuffAlphabetRank const * temp=this;
114 if (codetable[c].count == 0) return 0;
116 unsigned code = codetable[c].code;
117 while (!temp->leaf) {
118 if ((code & (1u<<level)) == 0) {
119 i = i-temp->bitrank->rank(i);
123 i = temp->bitrank->rank(i)-1;
130 inline bool IsCharAtPos(int c, TextPosition i) const {
131 THuffAlphabetRank const * temp=this;
132 if (codetable[c].count == 0) return false;
134 unsigned code = codetable[c].code;
135 while (!temp->leaf) {
136 if ((code & (1u<<level))==0) {
137 if (temp->bitrank->IsBitSet(i)) return false;
138 i = i-temp->bitrank->rank(i);
142 if (!temp->bitrank->IsBitSet(i)) return false;
143 i = temp->bitrank->rank(i)-1;
150 inline int charAtPos(TextPosition i) const {
151 THuffAlphabetRank const * temp=this;
152 while (!temp->leaf) {
153 if (temp->bitrank->IsBitSet(i)) {
154 i = temp->bitrank->rank(i)-1;
158 i = i-temp->bitrank->rank(i);
162 return (int)temp->ch;
173 void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
174 static void count_chars(uchar *, TextPosition , TCodeEntry *);
175 static unsigned SetBit(unsigned , unsigned , unsigned );
177 node( unsigned char c = 0, unsigned i = 0 ) {
184 node( node* c0, node *c1 ) {
186 weight = c0->weight + c1->weight;
192 bool operator>( const node &a ) const {
193 return weight > a.weight;
196 static TCodeEntry *makecodetable(uchar *, TextPosition);
199 static const unsigned char print = 1;
200 static const unsigned char report = 1;
204 TextPosition bwtEndPos;
205 THuffAlphabetRank *alphabetrank;
209 TCodeEntry *codetable;
211 // Map from end-marker in BWT to pair (textId, sampled text position)
212 std::map<TextPosition, std::pair<DocId, TextPosition> > endmarkers;
213 // Vector of pairs of <text length, text start position>
214 std::vector<std::pair<TextPosition, TextPosition> > textLength;
217 uchar * BWT(uchar *);
218 uchar * LoadFromFile(const char *);
219 void SaveToFile(const char *, uchar *);
220 void makewavelet(uchar *);
223 // Following are not part of the public API
224 DocId DocIdAtTextPos(TextPosition) const;
225 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
226 TextPosition Lookup(TextPosition) const;
227 TextPosition Inverse(TextPosition) const;
228 TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
229 TextPosition Psi(TextPosition) const;
230 uchar * Substring(TextPosition, TextPosition) const;