1 /******************************************************************************
2 * Copyright (C) 2008 by Niko Valimaki <nvalimak@cs.helsinki.fi> *
3 * Text collection interface for an in-memory XQuery/XPath engine *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ******************************************************************************/
21 #ifndef _SXSI_TextCollection_h_
22 #define _SXSI_TextCollection_h_
24 #include "Tools.h" // Defines ulong and uchar.
26 #include <utility> // Defines std::pair.
28 // Default samplerate for suffix array samples
// This value is the default argument of the samplerate parameter of
// TextCollection::InitTextCollection().
29 #define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64
34 * General interface for a text collection
36 * The class is abstract (pure virtual); create objects by calling
37 * the static method InitTextCollection().
42 // Type of document identifier
44 // Type for text position (FIXME ulong or long?)
// Alias of ulong (which Tools.h provides). Used for the two substring bounds
// of GetText() and as the offset member of the full_result pairs.
45 typedef ulong TextPosition;
48 * Init an instance of a text collection object
50 * Returns a pointer to an object implementing this interface.
// Factory function: the constructor is not public, so this is the way to
// obtain an instance. When samplerate is omitted,
// TEXTCOLLECTION_DEFAULT_SAMPLERATE is used.
// NOTE(review): a raw pointer is returned; presumably the caller owns the
// object and must delete it -- confirm against the implementation.
52 static TextCollection * InitTextCollection(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE);
56 * New samplerate can be given, otherwise will use the one specified in the save file!
57 * Note: This is not a static method; call InitTextCollection() first to get the object handle.
59 * Throws an exception if something goes wrong (unlikely, since an already opened FILE stream is passed in).
// Restores the collection from the given stream.
// samplerate defaults to 0; presumably 0 means "keep the sample rate stored
// in the file" -- confirm against the implementation.
// NOTE(review): FILE is used here, but <cstdio> is not among the includes
// visible in this header; presumably it arrives via Tools.h -- confirm.
62 virtual void Load(FILE *, unsigned samplerate = 0) = 0;
64 * Save data structure into a file
// Pure virtual: the concrete collection provides the implementation.
// NOTE(review): presumably the written data is exactly what Load() reads
// back -- confirm the format contract with the implementation.
66 virtual void Save(FILE *) = 0;
// Virtual so that an implementation can be destroyed through a
// TextCollection pointer (InitTextCollection() hands out such a pointer).
// The ';' after the empty body is redundant but harmless.
70 virtual ~TextCollection() { };
74 * Must be a zero-terminated string from alphabet [1,255].
75 * Cannot be called after MakeStatic().
76 * The i'th text insertion gets an identifier value i-1.
77 * In other words, document identifiers start from 0.
// Adds one text to the collection; the argument is the text itself.
// NOTE(review): whether the implementation copies the buffer or keeps the
// pointer is not visible from this interface -- confirm before freeing it.
79 virtual void InsertText(uchar const *) = 0;
83 * Convert to a static collection; reduces space and time complexities.
84 * New texts cannot be inserted after this operation.
// After this call InsertText() must no longer be used.
// NOTE(review): no inverse operation is declared in the visible part of the
// interface, so the switch appears to be one-way -- confirm.
86 virtual void MakeStatic() = 0;
91 * Returns the i'th text in the collection.
92 * The numbering starts from 0.
// Whole-text accessor; the argument is the document identifier.
// NOTE(review): a raw uchar* is returned; presumably a zero-terminated buffer
// that the caller must release -- confirm ownership and how to free it.
94 virtual uchar* GetText(DocId) const = 0;
96 * Returns substring [i, j] of k'th text
98 * Note: Parameters i and j are text positions inside the k'th text.
// Substring accessor; arguments are (k, i, j): document identifier, then the
// two text positions delimiting the substring.
// NOTE(review): "[i, j]" reads as an inclusive range -- confirm. Ownership of
// the returned buffer is not specified in this header.
100 virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0;
102 * Returns backwards (reverse) iterator to the end of i'th text
104 * Note: Do we need this?
105 * Forward iterator would be really inefficient compared to
106 * GetText(k) and GetText(k, i, j).
108 * TODO Define and implement const_reverse_iterator.
110 //const_reverse_iterator rend(DocId) const;
113 * Existential queries
// All five predicates are const members that take the pattern as a
// uchar string and answer with a bool.
// NOTE(review): the pattern is presumably zero-terminated, like the input of
// InsertText() -- confirm.
115 // Is there a text prefixed by given string?
116 virtual bool IsPrefix(uchar const *) const = 0;
117 // Is there a text having given string as a suffix?
118 virtual bool IsSuffix(uchar const *) const = 0;
119 // Is there a text that equals given string?
120 virtual bool IsEqual(uchar const *) const = 0;
121 // Does a text contain given string?
122 virtual bool IsContains(uchar const *) const = 0;
123 // Is there a text that is lexicographically less than given string?
124 virtual bool IsLessThan(uchar const *) const = 0;
129 * Result is the number of documents.
// Counting counterparts of the Is*() predicates: same pattern argument, the
// result is a number of documents.
// NOTE(review): presumably a document is counted once even when the pattern
// occurs in it several times (relevant for CountContains) -- confirm.
131 virtual unsigned CountPrefix(uchar const *) const = 0;
132 virtual unsigned CountSuffix(uchar const *) const = 0;
133 virtual unsigned CountEqual(uchar const *) const = 0;
134 virtual unsigned CountContains(uchar const *) const = 0;
135 virtual unsigned CountLessThan(uchar const *) const = 0;
138 * Document reporting queries
140 * Result is a vector of document id's in some undefined order.
142 // Data type for results
// NOTE(review): std::vector is used here, but <vector> is not among the
// includes visible in this header; presumably it arrives via Tools.h -- confirm.
143 typedef std::vector<DocId> document_result;
// Reporting counterparts of the Count*() queries: same pattern argument, the
// matching document identifiers are returned by value.
144 virtual document_result Prefix(uchar const *) const = 0;
145 virtual document_result Suffix(uchar const *) const = 0;
146 virtual document_result Equal(uchar const *) const = 0;
147 virtual document_result Contains(uchar const *) const = 0;
148 virtual document_result LessThan(uchar const *) const = 0;
151 * Full reporting queries
153 * Result is a vector of pairs <doc id, offset> in some undefined order.
155 // Data type for results
// Each element pairs a document identifier (first) with a TextPosition
// offset (second).
// NOTE(review): presumably the offset is where the occurrence starts inside
// that document -- confirm.
156 typedef std::vector<std::pair<DocId, TextPosition> > full_result;
// Only the "contains" query has a full-reporting variant in the visible part
// of this interface.
157 virtual full_result FullContains(uchar const *) const = 0;
160 // Protected constructor; call the static function InitTextCollection().
// The body is empty. NOTE(review): the access label making this protected is
// not visible in this chunk -- confirm it precedes this declaration.
161 TextCollection() { };
163 // No copy constructor or assignment
// Both members are only declared, not defined, in this header (the
// pre-C++11 way of making a class non-copyable).
// NOTE(review): presumably they sit under a private: label that is not
// visible in this chunk -- confirm.
164 TextCollection(TextCollection const&);
165 TextCollection& operator = (TextCollection const&);