1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
28 #include "TextCollection.h"
31 * Implementation of the TextCollection interface
33 * Implementation notes:
35 * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
36 * which requires O(n log \sigma) bits of memory. If we know the distribution
37 * of characters beforehand, space can easily be made O(nH_0).
39 * The method MakeStatic() constructs a Succinct Suffix Array with a huffman
40 * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
42 * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
43 * with any FM-index variant supporting LF-mapping.
45 class CSA : public SXSI::TextCollection {
48 * Constructor with (default) samplerate
50 explicit CSA(unsigned);
53 * Following functions implement the interface described in TextCollection.h.
54 * For details/documentation, see TextCollection.h.
56 void InsertText(uchar const *);
58 uchar* GetText(DocId) const;
59 uchar* GetText(DocId, TextPosition, TextPosition) const;
61 bool IsPrefix(uchar const *) const;
62 bool IsSuffix(uchar const *) const;
63 bool IsEqual(uchar const *) const;
64 bool IsContains(uchar const *) const;
65 bool IsLessThan(uchar const *) const;
67 unsigned CountPrefix(uchar const *) const;
68 unsigned CountSuffix(uchar const *) const;
69 unsigned CountEqual(uchar const *) const;
70 unsigned CountContains(uchar const *) const;
71 unsigned CountLessThan(const unsigned char*) const;
73 // document_result is inherited from SXSI::TextCollection.
74 document_result Prefix(uchar const *) const;
75 document_result Suffix(uchar const *) const;
76 document_result Equal(uchar const *) const;
77 document_result Contains(uchar const *) const;
78 document_result LessThan(uchar const *) const;
80 // full_result is inherited from SXSI::TextCollection.
81 full_result FullContains(uchar const *) const;
84 void Load(FILE *, unsigned);
85 void Save(FILE *) const;
93 TCodeEntry() {count=0;bits=0;code=0u;};
97 class THuffAlphabetRank {
98 // using fixed 0...255 alphabet
101 THuffAlphabetRank *left;
102 THuffAlphabetRank *right;
103 TCodeEntry *codetable;
107 THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
108 ~THuffAlphabetRank();
109 bool Test(uchar *, TextPosition);
111 inline ulong rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
112 THuffAlphabetRank const * temp=this;
113 if (codetable[c].count == 0) return 0;
115 unsigned code = codetable[c].code;
116 while (!temp->leaf) {
117 if ((code & (1u<<level)) == 0) {
118 i = i-temp->bitrank->rank(i);
122 i = temp->bitrank->rank(i)-1;
129 inline bool IsCharAtPos(int c, TextPosition i) const {
130 THuffAlphabetRank const * temp=this;
131 if (codetable[c].count == 0) return false;
133 unsigned code = codetable[c].code;
134 while (!temp->leaf) {
135 if ((code & (1u<<level))==0) {
136 if (temp->bitrank->IsBitSet(i)) return false;
137 i = i-temp->bitrank->rank(i);
141 if (!temp->bitrank->IsBitSet(i)) return false;
142 i = temp->bitrank->rank(i)-1;
149 inline int charAtPos(TextPosition i) const {
150 THuffAlphabetRank const * temp=this;
151 while (!temp->leaf) {
152 if (temp->bitrank->IsBitSet(i)) {
153 i = temp->bitrank->rank(i)-1;
157 i = i-temp->bitrank->rank(i);
161 return (int)temp->ch;
172 void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
173 static void count_chars(uchar *, TextPosition , TCodeEntry *);
174 static unsigned SetBit(unsigned , unsigned , unsigned );
176 node( unsigned char c = 0, unsigned i = 0 ) {
183 node( node* c0, node *c1 ) {
185 weight = c0->weight + c1->weight;
191 bool operator>( const node &a ) const {
192 return weight > a.weight;
195 static TCodeEntry *makecodetable(uchar *, TextPosition);
198 static const unsigned char print = 1;
199 static const unsigned char report = 1;
203 TextPosition bwtEndPos;
204 THuffAlphabetRank *alphabetrank;
208 TCodeEntry *codetable;
210 // Map from end-marker in BWT to pair (textId, sampled text position)
211 std::map<TextPosition, std::pair<DocId, TextPosition> > endmarkers;
212 // Vector of pairs of <text length, text start position>
213 std::vector<std::pair<TextPosition, TextPosition> > textLength;
216 uchar * BWT(uchar *);
217 uchar * LoadFromFile(const char *);
218 void SaveToFile(const char *, uchar *);
219 void makewavelet(uchar *);
222 // Following are not part of the public API
223 DocId DocIdAtTextPos(TextPosition) const;
224 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
225 TextPosition Lookup(TextPosition) const;
226 TextPosition Inverse(TextPosition) const;
227 TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
228 TextPosition Psi(TextPosition) const;
229 uchar * Substring(TextPosition, TextPosition) const;