2 /******************************************************************************
\r
3 * Copyright (C) 2008 by Diego Arroyuelo *
\r
4 * Interface for the in-memory XQuery/XPath engine *
\r
6 * This program is free software; you can redistribute it and/or modify *
\r
7 * it under the terms of the GNU Lesser General Public License as published *
\r
8 * by the Free Software Foundation; either version 2 of the License, or *
\r
9 * (at your option) any later version. *
\r
11 * This program is distributed in the hope that it will be useful, *
\r
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
\r
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
\r
14 * GNU Lesser General Public License for more details. *
\r
16 * You should have received a copy of the GNU Lesser General Public License *
\r
17 * along with this program; if not, write to the *
\r
18 * Free Software Foundation, Inc., *
\r
19 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
\r
20 ******************************************************************************/
\r
25 #include "libcds/includes/static_bitsequence.h"
\r
26 #include "libcds/includes/alphabet_mapper.h"
\r
27 #include "libcds/includes/static_sequence.h"
\r
29 //#include "TextCollection/TextCollection.h"
\r
30 //using SXSI::TextCollection;
\r
32 // this constant is used to efficiently compute the child operation in the tree
\r
38 #define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W)))
\r
39 // cleans bit p in e
\r
40 #define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W)))
\r
43 typedef int treeNode;
\r
44 typedef int TagType;
\r
54 /** Balanced parentheses representation of the tree */
\r
57 /** Mapping from tag identifer to tag name */
\r
58 unsigned char **TagName;
\r
60 /** boolean flag indicating whether we are indexing empty texts or not */
\r
61 bool indexing_empty_texts;
\r
63 /** Bit vector indicating with a 1 the positions of the non-empty texts. */
\r
64 static_bitsequence_rrr02 *EBVector;
\r
66 /** Tag sequence represented with a data structure for rank and select */
\r
67 static_sequence_wvtree *Tags;
\r
69 /** The texts in the XML document */
\r
70 //TextCollection *Text;
\r
72 /** Flag indicating whether the whole data structure has been constructed or
\r
73 * not. If the value is true, you cannot add more texts, nodes, etc. */
\r
76 /** Flag indicating whether the construction of the data structure has been
\r
77 * initialized or not (by calling method OpenDocument()). If this is true,
\r
78 * you cannot insert new tags or texts. */
\r
81 /* the following components are used for construction purposes */
\r
86 unsigned int *empty_texts_aux;
\r
90 /** Data structure constructor */
\r
91 XMLTree() {finished = false; initialized = false;};
\r
93 /** Data structure destructor */
\r
96 /** root(): returns the tree root. */
\r
99 /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of
\r
101 int SubtreeSize(treeNode x);
\r
103 /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree
\r
105 int SubtreeTags(treeNode x, TagType tag);
\r
107 /** IsLeaf(x): returns whether node x is leaf or not. In the succinct
\r
108 * representation this is just a bit inspection. */
\r
109 bool IsLeaf(treeNode x);
\r
111 /** IsAncestor(x,y): returns whether node x is ancestor of node y. */
\r
112 bool IsAncestor(treeNode x, treeNode y);
\r
114 /** IsChild(x,y): returns whether node x is parent of node y. */
\r
115 bool IsChild(treeNode x, treeNode y);
\r
117 /** NumChildren(x): number of children of node x. Constant time with the
\r
118 * data structure of Sadakane. */
\r
119 int NumChildren(treeNode x);
\r
121 /** ChildNumber(x): returns i if node x is the i-th children of its
\r
123 inline int ChildNumber(treeNode x);
\r
125 /** Depth(x): depth of node x, a simple binary rank on the parentheses
\r
127 int Depth(treeNode x);
\r
129 /** Preorder(x): returns the preorder number of node x, just regarding tree
\r
130 * nodes (and not texts). */
\r
131 int Preorder(treeNode x);
\r
133 /** Postorder(x): returns the postorder number of node x, just regarding
\r
134 * tree nodes (and not texts). */
\r
135 int Postorder(treeNode x);
\r
137 /** Tag(x): returns the tag identifier of node x. */
\r
138 TagType Tag(treeNode x);
\r
140 /** DocIds(x): returns the range (i.e., a pair of integers) of document
\r
141 * identifiers that descend from node x. */
\r
142 range DocIds(treeNode x);
\r
144 /** Parent(x): returns the parent node of node x. */
\r
145 treeNode Parent(treeNode x);
\r
147 /** Child(x,i): returns the i-th child of node x, assuming it exists. */
\r
148 treeNode Child(treeNode x, int i);
\r
150 /** FirstChild(x): returns the first child of node x, assuming it exists.
\r
151 * Very fast in BP. */
\r
152 treeNode FirstChild(treeNode x);
\r
154 /** NextSibling(x): returns the next sibling of node x, assuming it
\r
156 treeNode NextSibling(treeNode x);
\r
158 /** PrevSibling(x): returns the previous sibling of node x, assuming it
\r
160 treeNode PrevSibling(treeNode x);
\r
162 /** TaggedChild(x,i,tag): returns the i-th child of node x tagged tag, or
\r
163 * NULLT if there is none. Because of the balanced-parentheses representation
\r
164 * of the tree, this operation is not supported efficiently, just iterating
\r
165 * among the children of node x until finding the desired child. */
\r
166 treeNode TaggedChild(treeNode x, int i, TagType tag);
\r
168 /** TaggedDesc(x,tag): returns the first node tagged tag with larger
\r
169 * preorder than x and within the subtree of x. Returns NULT if there
\r
171 treeNode TaggedDesc(treeNode x, TagType tag);
\r
173 /** TaggedPrec(x,tag): returns the first node tagged tag with smaller
\r
174 * preorder than x and not an ancestor of x. Returns NULLT if there
\r
176 treeNode TaggedPrec(treeNode x, TagType tag);
\r
178 /** TaggedFoll(x,tag): returns the first node tagged tag with larger
\r
179 * preorder than x and not in the subtree of x. Returns NULLT if there
\r
181 treeNode TaggedFoll(treeNode x, TagType tag);
\r
183 /** PrevText(x): returns the document identifier of the text to the left of
\r
184 * node x, or NULLT if x is the root node. */
\r
185 DocID PrevText(treeNode x);
\r
187 /** NextText(x): returns the document identifier of the text to the right of
\r
188 * node x, or NULLT if x is the root node. */
\r
189 DocID NextText(treeNode x);
\r
191 /** MyText(x): returns the document identifier of the text below node x, or
\r
192 * NULLT if x is not a leaf node. */
\r
193 DocID MyText(treeNode x);
\r
195 /** TextXMLId(d): returns the preorder of document with identifier d in the
\r
196 * tree consisting of all tree nodes and all text nodes. */
\r
197 int TextXMLId(DocID d);
\r
199 /** NodeXMLId(x): returns the preorder of node x in the tree consisting of
\r
200 * all tree nodes and all text nodes. */
\r
201 int NodeXMLId(treeNode x);
\r
203 /** ParentNode(d): returns the parent node of document identifier d. */
\r
204 treeNode ParentNode(DocID d);
\r
206 /** OpenDocument(empty_texts,sample_rate_text): initilizes the construction
\r
207 * of the data structure for the XML document. Parameter empty_texts
\r
208 * indicates whether we index empty texts in document or not. Parameter
\r
209 * sample_rate_text indicates the sampling rate for the text searching data
\r
210 * structures (small values get faster searching but a bigger space
\r
211 * requirement). Returns a non-zero value upon success, NULLT in case of
\r
213 int OpenDocument(bool empty_texts, int sample_rate_text);
\r
215 /** CloseDocument(): finishes the construction of the data structure for
\r
216 * the XML document. Tree and tags are represented in the final form,
\r
217 * dynamic data structures are made static, and the flag "finished" is set
\r
218 * to true. After that, the data structure can be queried. */
\r
219 int CloseDocument();
\r
221 /** NewOpenTag(tagname): indicates the event of finding a new opening tag
\r
222 * in the document. Tag name is given. Returns a non-zero value upon
\r
223 * success, and returns NULLT in case of error. */
\r
224 int NewOpenTag(unsigned char *tagname);
\r
226 /** NewClosingTag(tagname): indicates the event of finding a new closing tag
\r
227 * in the document. Tag name is given. Returns a non-zero value upon
\r
228 * success, and returns NULLT in case of error. */
\r
229 int NewClosingTag(unsigned char *tagname);
\r
231 /** NewText(s): indicates the event of finding a new (non-empty) text s in
\r
232 * the document. The new text is inserted within the text collection.
\r
233 * Returns a non-zero value upon success, NULLT in case of error. */
\r
234 int NewText(unsigned char *s);
\r
236 /** NewEmptyText(): indicates the event of finding a new empty text in the
\r
237 * document. In case of indexing empty and non-empty texts, we insert the
\r
238 * empty texts into the text collection. In case of indexing only non-empty
\r
239 * texts, it just indicates an empty text in the bit vector of empty texts.
\r
240 * Returns a non-zero value upon success, NULLT in case of error. */
\r
241 int NewEmptyText();
\r
243 /** GetTagId(tagname): returns the tag identifier corresponding to a given
\r
244 * tag name. Returns NULLT in case that the tag name does not exists. */
\r
245 TagType GetTagId(unsigned char *tagname);
\r
247 /** GetTagName(tagid): returns the tag name of a given tag identifier.
\r
248 * Returns NULL in case that the tag identifier is not valid.*/
\r
249 unsigned char *GetTagName(TagType tagid);
\r
251 /** saveXMLTree: saves XML tree data structure to file. Every component of
\r
252 * the collection is stored in different files (same name, different file
\r
254 void Save(unsigned char *filename);
\r
256 /** load: loads XML tree data structure from file. */
\r
257 static XMLTree *Load(unsigned char *filename, int sample_rate_text);
\r