1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #ifndef _TCImplementation_H_
22 #define _TCImplementation_H_
24 #include "TextCollection.h"
25 #include "BlockArray.h"
28 // Include from XMLTree/libcds
29 #include <basics.h> // Defines W == 32
30 #include <static_bitsequence.h>
31 #include <alphabet_mapper.h>
32 #include <static_sequence.h>
34 // Re-define word size to ulong:
48 * Implementation of the TextCollection interface
51 class TCImplementation : public SXSI::TextCollection {
53 TCImplementation(uchar *, ulong, unsigned, unsigned, ulong);
56 bool EmptyText(DocId) const;
57 uchar* GetText(DocId) const;
59 * Next method is not supported:
60 * Supporting GetText for some substring [i,j]
61 * would require more space.
63 // uchar* GetText(DocId, TextPosition, TextPosition) const;
65 bool IsPrefix(uchar const *) const;
66 bool IsSuffix(uchar const *) const;
67 bool IsEqual(uchar const *) const;
68 bool IsContains(uchar const *) const;
69 bool IsLessThan(uchar const *) const;
71 bool IsPrefix(uchar const *, DocId, DocId) const;
72 bool IsSuffix(uchar const *, DocId, DocId) const;
73 bool IsEqual(uchar const *, DocId, DocId) const;
74 bool IsContains(uchar const *, DocId, DocId) const;
75 bool IsLessThan(uchar const *, DocId, DocId) const;
77 ulong Count(uchar const *) const;
78 unsigned CountPrefix(uchar const *) const;
79 unsigned CountSuffix(uchar const *) const;
80 unsigned CountEqual(uchar const *) const;
81 unsigned CountContains(uchar const *) const;
82 unsigned CountLessThan(const unsigned char*) const;
84 unsigned CountPrefix(uchar const *, DocId, DocId) const;
85 unsigned CountSuffix(uchar const *, DocId, DocId) const;
86 unsigned CountEqual(uchar const *, DocId, DocId) const;
87 unsigned CountContains(uchar const *, DocId, DocId) const;
88 unsigned CountLessThan(uchar const *, DocId, DocId) const;
90 // Definition of document_result is inherited from SXSI::TextCollection.
91 document_result Prefix(uchar const *) const;
92 document_result Suffix(uchar const *) const;
93 document_result Equal(uchar const *) const;
94 document_result Contains(uchar const *) const;
95 document_result LessThan(uchar const *) const;
97 document_result Prefix(uchar const *, DocId, DocId) const;
98 document_result Suffix(uchar const *, DocId, DocId) const;
99 document_result Equal(uchar const *, DocId, DocId) const;
100 document_result Contains(uchar const *, DocId, DocId) const;
101 document_result LessThan(uchar const *, DocId, DocId) const;
103 // Definition of full_result is inherited from SXSI::TextCollection.
104 full_result FullContains(uchar const *) const;
105 full_result FullContains(uchar const *, DocId, DocId) const;
107 // Index from/to disk
108 TCImplementation(FILE *, unsigned);
109 void Save(FILE *) const;
112 static const uchar versionFlag;
116 TextPosition bwtEndPos;
117 static_sequence * alphabetrank;
119 // Sample structures for texts longer than samplerate
120 BSGAP *sampled; // FIXME Replace with RRR02
121 BlockArray *suffixes;
122 BlockArray *suffixDocId;
124 // Total number of texts in the collection
125 unsigned numberOfTexts;
126 // Length of the longest text
129 // Array of document id's in the order of end-markers in BWT
130 // Access by endmarkerDocId[rank_$(L, p) - 1].
131 static_sequence *Doc;
133 // Following are not part of the public API
134 uchar * BWT(uchar *);
135 void makewavelet(uchar *);
137 DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
138 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
139 ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
140 ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
143 * Count end-markers in given interval
145 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
152 ranksp = alphabetrank->rank(0, sp - 1);
154 return alphabetrank->rank(0, ep) - ranksp;
158 * Count end-markers in given interval and
159 * within docIds [min,max]
161 inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
164 sp = alphabetrank->rank(0, sp - 1);
165 ep = alphabetrank->rank(0, ep);
169 return Doc->count(sp, ep-1, min, max);
173 * Enumerate all end-markers in given interval
175 inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const
178 sp = alphabetrank->rank(0, sp - 1);
179 ep = alphabetrank->rank(0, ep);
181 return document_result();
183 return Doc->accessAll(sp, ep-1);
187 * Enumerate end-markers in given interval and
188 * within docIds [min,max]
190 inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
193 sp = alphabetrank->rank(0, sp - 1);
194 ep = alphabetrank->rank(0, ep);
196 return document_result();
198 return Doc->access(sp, ep-1, min, max);
200 }; // class TCImplementation