2 /******************************************************************************
\r
3 * Copyright (C) 2008 by Diego Arroyuelo *
\r
4 * Interface for the in-memory XQuery/XPath engine *
\r
6 * This program is free software; you can redistribute it and/or modify *
\r
7 * it under the terms of the GNU Lesser General Public License as published *
\r
8 * by the Free Software Foundation; either version 2 of the License, or *
\r
9 * (at your option) any later version. *
\r
11 * This program is distributed in the hope that it will be useful, *
\r
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
\r
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
\r
14 * GNU Lesser General Public License for more details. *
\r
16 * You should have received a copy of the GNU Lesser General Public License *
\r
17 * along with this program; if not, write to the *
\r
18 * Free Software Foundation, Inc., *
\r
19 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
\r
20 ******************************************************************************/
\r
24 #include "TextCollection/TextCollectionBuilder.h"
\r
30 //clash between TextCollection/Tools.h and libcds/includes/basics.h
\r
36 #include <static_bitsequence.h>
\r
37 #include <alphabet_mapper.h>
\r
38 #include <static_sequence.h>
\r
39 using SXSI::TextCollection;
\r
40 using SXSI::TextCollectionBuilder;
\r
43 // this constant is used to efficiently compute the child operation in the tree
\r
48 #define PERM_SAMPLE 10
\r
51 #define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W)))
\r
52 // cleans bit p in e
\r
53 #define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W)))
\r
56 typedef int treeNode;
\r
57 typedef int TagType;
\r
67 // I know this class implements the working draft that we have but the logics seem flawed here...
\r
68 // We should have two classes. One XMLTreeBuilder and one XMLTree.
\r
69 // XMLTreeBuilder would have OpenDocument, NewOpenTag,... and CloseDocument would return an XMLTree
\r
70 // XMLTree would have only an initialized structure. If find it really ugly to check (!finished) or (!initialized)
\r
71 // in every function (FirstChild....).
\r
74 /** Balanced parentheses representation of the tree */
\r
77 /** Mapping from tag identifer to tag name */
\r
78 unsigned char **TagName;
\r
80 /** boolean flag indicating whether we are indexing empty texts or not */
\r
81 bool indexing_empty_texts;
\r
83 /** Bit vector indicating with a 1 the positions of the non-empty texts. */
\r
84 static_bitsequence_rrr02 *EBVector;
\r
86 /** Tag sequence represented with a data structure for rank and select */
\r
87 static_sequence *Tags;
\r
89 uint tags_blen, tags_len;
\r
91 /** The texts in the XML document */
\r
92 TextCollectionBuilder *TextBuilder;
\r
93 TextCollection *Text;
\r
95 /** The texts in the XML document (cached for faster display) */
\r
96 vector<string> CachedText;
\r
98 /** Flag indicating whether the whole data structure has been constructed or
\r
99 * not. If the value is true, you cannot add more texts, nodes, etc. */
\r
102 /** Flag indicating whether the construction of the data structure has been
\r
103 * initialized or not (by calling method OpenDocument()). If this is true,
\r
104 * you cannot insert new tags or texts. */
\r
107 /* the following components are used for construction purposes */
\r
113 unsigned int *empty_texts_aux;
\r
116 // I added those two. The TagName array should always contains two special tags
\r
117 // <@> for attributes and <$> for PCDATA.
\r
118 // <$> can never be in a document (since we handle the text differently)
\r
119 // but <@> can be returned by the parser. This boolean is needed for the construction
\r
120 // of the Tag bitmap to know if <@> must be taken into account or not
\r
121 bool found_attributes;
\r
124 // Allows to disable the TextCollection for benchmarkin purposes
\r
128 void print_stats();
\r
130 /** Data structure constructor */
\r
131 XMLTree() {finished = false; initialized = false; Text = 0; TextBuilder = 0; };
\r
133 /** Data structure destructor */
\r
136 /** root(): returns the tree root. */
\r
139 /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of
\r
141 int SubtreeSize(treeNode x);
\r
143 /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree
\r
145 int SubtreeTags(treeNode x, TagType tag);
\r
147 /** IsLeaf(x): returns whether node x is leaf or not. In the succinct
\r
148 * representation this is just a bit inspection. */
\r
149 bool IsLeaf(treeNode x);
\r
151 /** IsAncestor(x,y): returns whether node x is ancestor of node y. */
\r
152 bool IsAncestor(treeNode x, treeNode y);
\r
154 /** IsChild(x,y): returns whether node x is parent of node y. */
\r
155 bool IsChild(treeNode x, treeNode y);
\r
157 /** NumChildren(x): number of children of node x. Constant time with the
\r
158 * data structure of Sadakane. */
\r
159 int NumChildren(treeNode x);
\r
161 /** ChildNumber(x): returns i if node x is the i-th children of its
\r
163 inline int ChildNumber(treeNode x);
\r
165 /** Depth(x): depth of node x, a simple binary rank on the parentheses
\r
167 int Depth(treeNode x);
\r
169 /** Preorder(x): returns the preorder number of node x, just regarding tree
\r
170 * nodes (and not texts). */
\r
171 int Preorder(treeNode x);
\r
173 /** Postorder(x): returns the postorder number of node x, just regarding
\r
174 * tree nodes (and not texts). */
\r
175 int Postorder(treeNode x);
\r
177 /** Tag(x): returns the tag identifier of node x. */
\r
178 TagType Tag(treeNode x);
\r
180 /** DocIds(x): returns the range (i.e., a pair of integers) of document
\r
181 * identifiers that descend from node x. */
\r
182 range DocIds(treeNode x);
\r
184 /** Parent(x): returns the parent node of node x. */
\r
185 treeNode Parent(treeNode x);
\r
187 /** Child(x,i): returns the i-th child of node x, assuming it exists. */
\r
188 treeNode Child(treeNode x, int i);
\r
190 /** FirstChild(x): returns the first child of node x, assuming it exists.
\r
191 * Very fast in BP. */
\r
192 treeNode FirstChild(treeNode x);
\r
194 /** NextSibling(x): returns the next sibling of node x, assuming it
\r
196 treeNode NextSibling(treeNode x);
\r
198 /** PrevSibling(x): returns the previous sibling of node x, assuming it
\r
200 treeNode PrevSibling(treeNode x);
\r
202 /** TaggedChild(x,i,tag): returns the i-th child of node x tagged tag, or
\r
203 * NULLT if there is none. Because of the balanced-parentheses representation
\r
204 * of the tree, this operation is not supported efficiently, just iterating
\r
205 * among the children of node x until finding the desired child. */
\r
206 treeNode TaggedChild(treeNode x, int i, TagType tag);
\r
208 /** TaggedDesc(x,tag): returns the first node tagged tag with larger
\r
209 * preorder than x and within the subtree of x. Returns NULT if there
\r
211 treeNode TaggedDesc(treeNode x, TagType tag);
\r
214 treeNode TaggedBelow(treeNode x, TagType *childtags, unsigned int ctlen,
\r
215 TagType *desctags, unsigned int dtlen);
\r
217 treeNode TaggedNext(treeNode x, TagType *childtags, unsigned int ctlen,
\r
218 TagType *folltags, unsigned int flen,treeNode root);
\r
220 treeNode TaggedDescOnly(treeNode x, TagType *desctags, unsigned int dtlen);
\r
222 treeNode TaggedDescOrFollOnly(treeNode x, TagType *folltags, unsigned int flen,
\r
225 treeNode TaggedFollOnly(treeNode x, TagType *folltags, unsigned int flen,
\r
229 /** TaggedPrec(x,tag): returns the first node tagged tag with smaller
\r
230 * preorder than x and not an ancestor of x. Returns NULLT if there
\r
232 treeNode TaggedPrec(treeNode x, TagType tag);
\r
234 /** TaggedFoll(x,tag): returns the first node tagged tag with larger
\r
235 * preorder than x and not in the subtree of x. Returns NULLT if there
\r
237 treeNode TaggedFoll(treeNode x, TagType tag);
\r
239 treeNode TaggedFollBelow(treeNode x, TagType tag,treeNode root);
\r
241 /** TaggedFollowingSibling(x,tag) */
\r
242 treeNode TaggedFollowingSibling(treeNode x, TagType tag);
\r
244 /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged
\r
245 * tag. Return NULLT is there is none. */
\r
246 treeNode TaggedAncestor(treeNode x, TagType tag);
\r
248 /** PrevText(x): returns the document identifier of the text to the left of
\r
249 * node x, or NULLT if x is the root node. */
\r
250 DocID PrevText(treeNode x);
\r
252 /** NextText(x): returns the document identifier of the text to the right of
\r
253 * node x, or NULLT if x is the root node. */
\r
254 DocID NextText(treeNode x);
\r
256 /** MyText(x): returns the document identifier of the text below node x, or
\r
257 * NULLT if x is not a leaf node. */
\r
258 DocID MyText(treeNode x);
\r
260 /** TextXMLId(d): returns the preorder of document with identifier d in the
\r
261 * tree consisting of all tree nodes and all text nodes. */
\r
262 int TextXMLId(DocID d);
\r
264 /** NodeXMLId(x): returns the preorder of node x in the tree consisting of
\r
265 * all tree nodes and all text nodes. */
\r
266 int NodeXMLId(treeNode x);
\r
268 /** ParentNode(d): returns the parent node of document identifier d. */
\r
269 treeNode ParentNode(DocID d);
\r
270 treeNode PrevNode(DocID d);
\r
272 /** OpenDocument(empty_texts,sample_rate_text,dtc): initilizes the construction
\r
273 * of the data structure for the XML document. Parameter empty_texts
\r
274 * indicates whether we index empty texts in document or not. Parameter
\r
275 * sample_rate_text indicates the sampling rate for the text searching data
\r
276 * structures (small values get faster searching but a bigger space
\r
277 * requirement). dtc disable the use of the TextCollection
\r
278 * (i.e. everything is considered an empty text *)
\r
279 * Returns a non-zero value upon success, NULLT in case of
\r
282 int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc);
\r
284 /** CloseDocument(): finishes the construction of the data structure for
\r
285 * the XML document. Tree and tags are represented in the final form,
\r
286 * dynamic data structures are made static, and the flag "finished" is set
\r
287 * to true. After that, the data structure can be queried. */
\r
288 int CloseDocument();
\r
290 /** NewOpenTag(tagname): indicates the event of finding a new opening tag
\r
291 * in the document. Tag name is given. Returns a non-zero value upon
\r
292 * success, and returns NULLT in case of error. */
\r
293 int NewOpenTag(unsigned char *tagname);
\r
295 /** NewClosingTag(tagname): indicates the event of finding a new closing tag
\r
296 * in the document. Tag name is given. Returns a non-zero value upon
\r
297 * success, and returns NULLT in case of error. */
\r
298 int NewClosingTag(unsigned char *tagname);
\r
300 /** NewText(s): indicates the event of finding a new (non-empty) text s in
\r
301 * the document. The new text is inserted within the text collection.
\r
302 * Returns a non-zero value upon success, NULLT in case of error. */
\r
303 int NewText(unsigned char *s);
\r
305 /** NewEmptyText(): indicates the event of finding a new empty text in the
\r
306 * document. In case of indexing empty and non-empty texts, we insert the
\r
307 * empty texts into the text collection. In case of indexing only non-empty
\r
308 * texts, it just indicates an empty text in the bit vector of empty texts.
\r
309 * Returns a non-zero value upon success, NULLT in case of error. */
\r
310 int NewEmptyText();
\r
312 /** GetTagId(tagname): returns the tag identifier corresponding to a given
\r
313 * tag name. Returns NULLT in case that the tag name does not exists. */
\r
314 TagType GetTagId(unsigned char *tagname);
\r
316 /** GetTagName(tagid): returns the tag name of a given tag identifier.
\r
317 * Returns NULL in case that the tag identifier is not valid.*/
\r
318 unsigned char *GetTagName(TagType tagid);
\r
322 /** GetTagName(tagid): returns the tag name of a given tag identifier.
\r
323 * The result is just a reference and should not be freed by the caller.
\r
325 const unsigned char *GetTagNameByRef(TagType tagid);
\r
328 /** RegisterTag adds a new tag to the tag collection this is needed
\r
329 * if the query contains a tag which is not in the document, we need
\r
330 * to give this new tag a fresh id and store it somewhere. A logical
\r
332 * We might want to use a hashtable instead of an array though.
\r
334 TagType RegisterTag(unsigned char *tagname);
\r
336 bool EmptyText(DocID i) {
\r
337 return Text->EmptyText(i);
\r
339 /** Prefix(s): search for texts prefixed by string s. */
\r
340 TextCollection::document_result Prefix(uchar const *s) {
\r
341 return Text->Prefix(s);
\r
344 /** Suffix(s): search for texts having string s as a suffix. */
\r
345 TextCollection::document_result Suffix(uchar const *s) {
\r
346 return Text->Suffix(s);
\r
349 /** Equal(s): search for texts equal to string s. */
\r
350 TextCollection::document_result Equal(uchar const *s) {
\r
351 return Text->Equal(s);
\r
354 /** Contains(s): search for texts containing string s. */
\r
355 TextCollection::document_result Contains(uchar const *s) {
\r
356 return Text->Contains(s);
\r
359 /** LessThan(s): returns document identifiers for the texts that
\r
360 * are lexicographically smaller than string s. */
\r
361 TextCollection::document_result LessThan(uchar const *s) {
\r
362 return Text->LessThan(s);
\r
365 /** IsPrefix(x): returns true if there is a text prefixed by string s. */
\r
366 bool IsPrefix(uchar const *s) {
\r
367 return Text->IsPrefix(s);
\r
370 /** IsSuffix(s): returns true if there is a text having string s as a
\r
372 bool IsSuffix(uchar const *s) {
\r
373 return Text->IsSuffix(s);
\r
376 /** IsEqual(s): returns true if there is a text that equals given
\r
378 bool IsEqual(uchar const *s) {
\r
379 return Text->IsEqual(s);
\r
382 /** IsContains(s): returns true if there is a text containing string s. */
\r
383 bool IsContains(uchar const *s) {
\r
384 return Text->IsContains(s);
\r
387 /** IsLessThan(s): returns true if there is at least a text that is
\r
388 * lexicographically smaller than string s. */
\r
389 bool IsLessThan(uchar const *s) {
\r
390 return Text->IsLessThan(s);
\r
393 /** Count(s): Global counting */
\r
394 unsigned Count(uchar const *s) {
\r
395 return Text->Count(s);
\r
398 /** CountPrefix(s): counting version of Prefix(s). */
\r
399 unsigned CountPrefix(uchar const *s) {
\r
400 return Text->CountPrefix(s);
\r
403 /** CountSuffix(s): counting version of Suffix(s). */
\r
404 unsigned CountSuffix(uchar const *s) {
\r
405 return Text->CountSuffix(s);
\r
408 /** CountEqual(s): counting version of Equal(s). */
\r
409 unsigned CountEqual(uchar const *s) {
\r
410 return Text->CountEqual(s);
\r
413 /** CountContains(s): counting version of Contains(s). */
\r
414 unsigned CountContains(uchar const *s) {
\r
415 return Text->CountContains(s);
\r
418 /** CountLessThan(s): counting version of LessThan(s). */
\r
419 unsigned CountLessThan(uchar const *s) {
\r
420 return CountLessThan(s);
\r
423 /** GetText(d): returns the text corresponding to document with
\r
425 uchar* GetText(DocID d) {
\r
426 return Text->GetText(d);
\r
429 uchar* GetCachedText(DocID d) {
\r
430 uchar * str = (uchar*) calloc(sizeof(char),(CachedText.at(d).size() + 1));
\r
431 strcpy((char*) str,(const char*) CachedText.at(d).c_str());
\r
432 return (uchar*) (str);
\r
435 TextCollection *getTextCollection() {
\r
438 /** Save: saves XML tree data structure to file. */
\r
439 void Save(unsigned char *filename);
\r
441 /** Load: loads XML tree data structure from file. sample_rate_text
\r
442 * indicates the sample rate for the text search data structure. */
\r
443 static XMLTree *Load(unsigned char *filename, int sample_rate_text);
\r