1 /******************************************************************************
2 * Copyright (C) 2008 by Niko Valimaki <nvalimak@cs.helsinki.fi> *
3 * Text collection interface for an in-memory XQuery/XPath engine *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ******************************************************************************/
21 #ifndef _SXSI_TextCollection_h_
22 #define _SXSI_TextCollection_h_
24 #include "Tools.h" // Defines ulong and uchar.
26 #include <utility> // Defines std::pair.
32 * General interface for a text collection
34 * The class is abstract; create objects with TextCollectionBuilder
35 * or load a saved collection with the static method Load().
40 // Type of document identifier
42 // Type for a text position (FIXME: should this be ulong or long?).
// ulong itself is expected to come from Tools.h, per the comment on that include.
43 typedef ulong TextPosition;
45 // Mode flag: the default mode includes both the index and the "naive" text.
// index_mode_text_only presumably keeps just the plain text without the index -- confirm in the implementation.
// An index-only mode is only mentioned in the trailing comment; it is not an enumerator, so it cannot be selected.
46 enum index_mode_t { index_mode_default, index_mode_text_only }; // index_mode_index_only
51 * The second parameter is a prefix to be used for multiple
52 * files. (SWCSAWrapper uses multiple save files!)
54 * A new sample rate can be given; otherwise the one specified in the save file is used.
56 * Throws an exception if std::fread() fails.
// Static factory: reads a collection from the given FILE* and returns it as a TextCollection pointer.
// Arguments, in order: the input stream, the file-name prefix for multi-file formats, the index mode
// (defaults to index_mode_default) and the sample rate (defaults to 0; presumably 0 means "keep the
// rate stored in the save file" -- confirm in the implementation).
// NOTE(review): ownership of the returned pointer is not stated here; presumably the caller deletes it.
59 static TextCollection* Load(FILE *, char const *, index_mode_t = index_mode_default, unsigned samplerate = 0);
62 * Save data structure into a file
64 * The second parameter is a prefix to be used for multiple
65 * files. (SWCSAWrapper uses multiple save files!)
67 * Throws an exception if std::fwrite() fails.
// Pure virtual: every concrete collection supplies its own serialization.
// Arguments, in order: the output stream and the file-name prefix used when several files are written.
// Declared const, so it can be called on a const collection.
69 virtual void Save(FILE *, char const *) const = 0;
// Virtual destructor with an empty body, so a derived collection can be destroyed through a
// TextCollection pointer. (The ';' after the body is redundant but harmless.)
74 virtual ~TextCollection() { };
77 * Tests if the text identified by the given DocId is empty
// Pure virtual, const query; takes the document identifier by value and returns true/false.
// NOTE(review): behaviour for an out-of-range DocId is not specified in this header.
79 virtual bool EmptyText(DocId) const = 0;
84 * Returns the i'th text in the collection.
85 * The numbering starts from 0.
87 * Call DeleteText() for each pointer returned by GetText()
88 * to avoid possible memory leaks.
// Both are pure virtual. GetText() hands back a raw uchar pointer for the requested document, and
// DeleteText() is the matching release call for such a pointer. Whether the buffer is a fresh copy
// or internal storage is left to the implementation, so always pair the two calls.
90 virtual uchar* GetText(DocId) const = 0;
91 virtual void DeleteText(uchar *text) const = 0;
94 * Returns a pointer to the beginning of texts i, i+1, ..., j.
95 * Texts are separated by a '\0' byte.
97 * Call DeleteText() for each pointer returned by GetText()
98 * to avoid possible memory leaks.
// Range overload: i and j are document identifiers naming the first and the last text of the run.
// The result is a single raw pointer, released with DeleteText() like the single-document overload.
100 virtual uchar * GetText(DocId i, DocId j) const = 0;
103 * Returns substring [i, j] of k'th text
105 * Note: Parameters i and j are text positions inside the k'th text.
107 // virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0;
109 * Returns backwards (reverse) iterator to the end of i'th text
111 * Note: Do we need this?
112 * Forward iterator would be really inefficient compared to
115 * TODO Define and implement const_reverse_iterator.
117 //const_reverse_iterator rend(DocId) const;
120 * Existential queries
// All five predicates are pure virtual and const, take the pattern as uchar const* and return bool.
// NOTE(review): the pattern is presumably '\0'-terminated (no length is passed) -- confirm.
122 // Is there a text that has the given string as a prefix?
123 virtual bool IsPrefix(uchar const *) const = 0;
124 // Is there a text that has the given string as a suffix?
125 virtual bool IsSuffix(uchar const *) const = 0;
126 // Is there a text that equals the given string?
127 virtual bool IsEqual(uchar const *) const = 0;
128 // Does some text contain the given string?
129 virtual bool IsContains(uchar const *) const = 0;
130 // Is there a text that is lexicographically less than the given string?
131 virtual bool IsLessThan(uchar const *) const = 0;
134 * Existential queries for given DocId interval.
// Interval overloads of the five existential predicates: the two DocId arguments give the range of
// documents to consider. Whether both bounds are inclusive is not stated here -- TODO confirm.
136 virtual bool IsPrefix(uchar const *, DocId, DocId) const = 0;
137 virtual bool IsSuffix(uchar const *, DocId, DocId) const = 0;
138 virtual bool IsEqual(uchar const *, DocId, DocId) const = 0;
139 virtual bool IsContains(uchar const *, DocId, DocId) const = 0;
140 virtual bool IsLessThan(uchar const *, DocId, DocId) const = 0;
144 * Result is the number of occurrences.
// Pure virtual occurrence count for the given pattern. Returns ulong, whereas the Count* functions
// that count documents return unsigned.
146 virtual ulong Count(uchar const *) const = 0;
148 * More counting queries
149 * Result is the number of documents.
// One counter per existential predicate (prefix, suffix, equal, contains, less-than).
// All are pure virtual and const, take the pattern as uchar const* and return unsigned.
151 virtual unsigned CountPrefix(uchar const *) const = 0;
152 virtual unsigned CountSuffix(uchar const *) const = 0;
153 virtual unsigned CountEqual(uchar const *) const = 0;
154 virtual unsigned CountContains(uchar const *) const = 0;
155 virtual unsigned CountLessThan(uchar const *) const = 0;
158 * Counting queries for given DocId interval
// Interval overloads of the document counters: the two DocId arguments give the range of documents
// to count in. Whether both bounds are inclusive is not stated here -- TODO confirm.
160 virtual unsigned CountPrefix(uchar const *, DocId, DocId) const = 0;
161 virtual unsigned CountSuffix(uchar const *, DocId, DocId) const = 0;
162 virtual unsigned CountEqual(uchar const *, DocId, DocId) const = 0;
163 virtual unsigned CountContains(uchar const *, DocId, DocId) const = 0;
164 virtual unsigned CountLessThan(uchar const *, DocId, DocId) const = 0;
167 * Document reporting queries
169 * Result is a vector of document ids in some undefined order.
171 // Result type of the document reporting queries: a std::vector of DocId, returned by value.
172 typedef std::vector<DocId> document_result;
// The first five mirror the existential predicates; all are pure virtual and const.
173 virtual document_result Prefix(uchar const *) const = 0;
174 virtual document_result Suffix(uchar const *) const = 0;
175 virtual document_result Equal(uchar const *) const = 0;
176 virtual document_result Contains(uchar const *) const = 0;
177 virtual document_result LessThan(uchar const *) const = 0;
// Approximate matching; the unsigned argument is presumably k, the allowed number of
// mismatches / errors -- confirm in the implementation.
// NOTE(review): spelled "KMismaches" here but "FullKMismatches" in the full reporting queries.
// The name is part of the interface, so fixing the spelling would break overriders and callers.
178 virtual document_result KMismaches(uchar const *, unsigned) const = 0;
179 virtual document_result KErrors(uchar const *, unsigned) const = 0;
182 * Document reporting queries for given DocId interval.
// Interval overloads of the five exact-match reporting queries; the two DocId arguments give the
// range of documents to report from (inclusiveness of the bounds is not stated here -- TODO confirm).
// There is no interval overload for the approximate queries.
184 virtual document_result Prefix(uchar const *, DocId, DocId) const = 0;
185 virtual document_result Suffix(uchar const *, DocId, DocId) const = 0;
186 virtual document_result Equal(uchar const *, DocId, DocId) const = 0;
187 virtual document_result Contains(uchar const *, DocId, DocId) const = 0;
188 virtual document_result LessThan(uchar const *, DocId, DocId) const = 0;
191 * Full reporting queries
193 * Result is a vector of pairs <doc id, offset> in some undefined order.
195 // Result type of the full reporting queries: a std::vector of (DocId, TextPosition) pairs, returned by value.
196 typedef std::vector<std::pair<DocId, TextPosition> > full_result;
// Reports the occurrences of the pattern over the whole collection.
197 virtual full_result FullContains(uchar const *) const = 0;
198 // Full reporting query restricted to the DocId interval given by the two DocId arguments.
199 virtual full_result FullContains(uchar const *, DocId, DocId) const = 0;
// Approximate variants; the unsigned argument is presumably k, the allowed number of
// mismatches / errors -- confirm in the implementation.
201 virtual full_result FullKMismatches(uchar const *, unsigned) const = 0;
202 virtual full_result FullKErrors(uchar const *, unsigned) const = 0;
// Non-pure virtual with a default body that writes an "unsupported" notice to std::cerr.
// The message points callers at RLCSA, so presumably only that implementation overrides it -- confirm.
205 virtual TextPosition getLength() const
207 std::cerr << "TextCollection::getLength() is unsupported! Use RLCSA instead." << std::endl;
// Non-pure virtual with a default body that writes an "unsupported" notice to std::cerr.
// NOTE(review): LF presumably is the last-to-first mapping of a BWT-style index for character c at
// position i -- confirm against the RLCSA implementation the message refers to.
212 virtual TextPosition LF(uchar c, TextPosition i) const
214 std::cerr << "TextCollection::LF() is unsupported! Use RLCSA instead." << std::endl;
// Non-pure virtual with a default body that writes an "unsupported" notice to std::cerr.
// NOTE(review): presumably returns l characters of the suffix starting at position pos -- confirm
// against the RLCSA implementation the message refers to.
219 virtual uchar* getSuffix(TextPosition pos, unsigned l) const
221 std::cerr << "TextCollection::getSuffix() is unsupported! Use RLCSA instead." << std::endl;
// Non-pure virtual with a default body that writes an "unsupported" notice to std::cerr.
// NOTE(review): presumably maps text position i to the document that contains it -- confirm
// against the RLCSA implementation the message refers to.
226 virtual DocId getDoc(TextPosition i) const
228 std::cerr << "TextCollection::getDoc() is unsupported! Use RLCSA instead." << std::endl;
235 // Protected constructor: objects are not created directly; use TextCollectionBuilder.
// The body is empty. (The ';' after the body is redundant but harmless.)
236 TextCollection() { };
238 // index_mode_t indexMode;
240 // No copy constructor or assignment: both are only declared here, with no inline body.
// NOTE(review): this looks like the pre-C++11 way of forbidding copies, which relies on the
// declarations being private and never defined -- confirm the access specifier above them.
241 TextCollection(TextCollection const&);
242 TextCollection& operator = (TextCollection const&);