6 #include <unordered_set>
7 #include <unordered_map>
9 #include <libbp/bp-darray.h>
10 #include <libcds/includes/basics.h>
11 #include <libcds/includes/static_bitsequence.h>
12 #include <libcds/includes/alphabet_mapper.h>
13 #include <libcds/includes/static_sequence.h>
14 #include "bit-vector.hpp"
20 #include <TextCollection/TextCollection.h>
21 #include <TextCollection/TextCollectionBuilder.h>
23 class xml_tree_builder;
27 typedef int32_t node_t;
28 typedef int32_t tag_t;
30 static const node_t NIL = -1;
31 static const node_t ROOT = 0;
33 static const char* NIL_TAG;
34 static const tag_t NIL_TAG_ID = -1;
35 static const char* DOCUMENT_OPEN_TAG;
36 static const tag_t DOCUMENT_OPEN_TAG_ID = 0;
37 static const char* ATTRIBUTE_OPEN_TAG;
38 static const tag_t ATTRIBUTE_OPEN_TAG_ID = 1;
39 static const char* PCDATA_OPEN_TAG;
40 static const tag_t PCDATA_OPEN_TAG_ID = 2;
41 static const char* ATTRIBUTE_DATA_OPEN_TAG;
42 static const tag_t ATTRIBUTE_DATA_OPEN_TAG_ID = 3;
43 static const char* CLOSE_TAG;
44 static const tag_t CLOSE_TAG_ID = 4;
51 * [size()] returns the size of the tree (number of nodes)
54 inline uint32_t size() const;
57 * [num_tags()] returns the number of distinct tags.
60 inline uint32_t num_tags() const;
63 * [subtree_size(n)] returns the size of the subtree (number of nodes)
67 inline uint32_t subtree_size(node_t) const;
70 * [subtree_tags(n, t)] returns the number of occurences of tag [t] in the
71 * subtree rooted at [n]
74 inline uint32_t subtree_tags(node_t, tag_t) const;
77 * [subtree_elements(n)] returns the number of element nodes below [n]
78 * Runs in O(attribute_ids->size()+3)
80 inline uint32_t subtree_elements(node_t) const;
83 * [num_children(n)] returns the number of child nodes of [n]
84 * (both text and elements, and including a fake <@> node if
88 uint32_t num_children(node_t) const;
91 * [child_pos(n)] returns the position of [n] amongst its siblings
94 uint32_t child_pos(node_t) const;
99 * [is_leaf(n)] returns true if [n] is a leaf (i.e. if [num_children(n)]
103 inline bool is_leaf(node_t) const;
106 * [is_ancestor(n, m)] returns true if [n] is an ancestor of [m], false
110 inline bool is_ancestor(node_t, node_t) const;
113 * [is_right_descendant(n, m)] returns true if [m] is a descendant-or-self of
114 * a following-sibling of [n], false otherwise
117 inline bool is_right_descendant(node_t, node_t) const;
120 * [is_child(n, m)] returns true if [m] is a child of [n]
123 bool is_child(node_t, node_t) const;
126 * [is_first_child(n, m)] returns true if [m] is the first child of [n]
129 inline bool is_first_child(node_t) const;
132 * [is_nil(n)] returns true if [n] is equal to xml_tree::NIL
135 inline bool is_nil(node_t) const;
138 * [is_open(n)] returns true if [n], seen as an index in the
139 * underlying BP representation corresponds to an opening parenthesis.
142 inline bool is_open(node_t) const;
144 //Numbering functions
146 * [depth(n)] returns the depths of node [n]. The root has depth 1.
149 uint32_t depth(node_t) const;
152 * [preorder(n)] returns the preorder of node [n]. Equivalent to calling
153 * rank on the underlying BP representation.
156 uint32_t preorder(node_t) const;
159 * [preorder(n)] returns the postorder of node [n].
162 uint32_t postorder(node_t) const;
166 * [tag(n)] returns the tag of node [n] which must be a valid node identifier
167 * (in particular not NIL)
170 inline tag_t tag(node_t) const;
173 * [get_tag_name_by_ref(t)] returns the string representation of tag [t]
174 * For elements, the string representation is the tag name itself
175 * Returns <!INVALID!> if [t] is not a proper tag identifier.
178 const char* get_tag_name_by_ref(tag_t) const;
181 * [register_tag(s)] returns the tag identifier for the tag represented
182 * by the string [s]. If no such tag exists in the document, return a
183 * fresh tag identifier [i]. Subsequent calls with the same [s] will return
184 * the same identifier.
187 tag_t register_tag(char *s);
189 //Navigation functions
192 * [root()] returns the root node of the tree.
195 inline node_t root () const;
198 * [child(n, i)] returns the [i]th child of [n]
201 node_t child(node_t, uint32_t) const;
204 * [parent(n)] returns the parent node of [n]. Returns
205 * xml_tree::NIL if [n] is the root of the tree.
208 inline node_t parent(node_t) const;
211 * [first_child(n)] returns the first child of [n]. Returns
212 * xml_tree::NIL if [n] is a leaf.
215 inline node_t first_child(node_t) const;
218 * [last_child(n)] returns the last child of [n]. Returns
219 * xml_tree::NIL if [n] is a leaf.
222 inline node_t last_child(node_t) const;
225 * [next_sibling(n)] returns the following sibling of [n].
226 * Returns xml_tree::NIL if [n] is the last of its siblings.
229 inline node_t next_sibling(node_t) const;
232 * [prev_sibling(n)] returns the preceding sibling of [n].
233 * Returns xml_tree::NIL if [n] is the first of its siblings.
236 inline node_t prev_sibling(node_t) const;
239 * [first_element(n)] returns the first child of [n] which
240 * corresponds to an element node. Returns xml_tree::NIL if
241 * no such node exists.
244 inline node_t first_element(node_t) const;
247 * [next_element(n)] returns the first following sibling of [n] which
248 * corresponds to an element node. Returns xml_tree::NIL if
249 * no such node exists.
252 inline node_t next_element(node_t) const;
255 * [tagged_next(n, t)] returns the first node following [n] in pre-order
256 * which has tag [t]. Returns xml_tree::NIL if no such node exists.
259 inline node_t tagged_next(node_t, tag_t) const;
262 * [tagged_next_close(n, t)], like [tagged_next(n, t)] but uses a euristic to
263 * return the result faster if the next tag is close to [n].
266 inline node_t tagged_next_close(node_t, tag_t) const;
269 * [tagged_descendant(n, t)] returns the first descendant of [n] in
270 * pre-order which has tag [t]. Returns xml_tree::NIL if no such node
274 inline node_t tagged_descendant(node_t, tag_t) const;
277 * [tagged_following_before(n, t, m)] returns the first following node
278 * of [n] in pre-order which has tag [t] and which occurs before node
279 * [m]. Returns xml_tree::NIL if no such node exists.
282 inline node_t tagged_following_before(node_t, tag_t, node_t) const;
285 * [tagged_child(n, t)] returns the first child of [n] in
286 * pre-order which has tag [t]. Returns xml_tree::NIL if no such node
290 inline node_t tagged_child(node_t, tag_t) const;
293 * [tagged_sibling(n, t)] returns the first following sibling of [n] in
294 * pre-order which has tag [t]. Returns xml_tree::NIL if no such node
298 inline node_t tagged_sibling(node_t, tag_t) const;
301 * [select_child(n, tags)] returns the first child of [n] in
302 * pre-order which has a tag in [tags]. [tags] is a xml_tree::NIL_TAG_ID
303 * terminated array of xml_tree::tag_t. Returns xml_tree::NIL if no such
305 * Runs in O(sizeof(tags))
307 node_t select_child(node_t, tag_t*) const;
310 * [select_descendant(n, tags)] returns the first descendant of [n] in
311 * pre-order which has a tag in [tags]. [tags] is a xml_tree::NIL_TAG_ID
312 * terminated array of xml_tree::tag_t. Returns xml_tree::NIL if no such
314 * Runs in O(sizeof(tags))
316 inline node_t select_descendant(node_t, tag_t*) const;
319 * [select_sibling(n, tags)] returns the first following sibling of
320 * of [n] in pre-order which has a tag in [tags].
321 * [tags] is a xml_tree::NIL_TAG_ID terminated array of xml_tree::tag_t.
322 * Returns xml_tree::NIL if no such node exists.
323 * Runs in O(sizeof(tags))
325 node_t select_sibling(node_t, tag_t*) const;
328 * [select_sibling(n, tags, m)] returns the first following node
329 * of [n] in pre-order which has a tag in [tags] and which occurs before
330 * node [m]. [tags] is a xml_tree::NIL_TAG_ID terminated array of
331 * xml_tree::tag_t. Returns xml_tree::NIL if no such node exists.
332 * Runs in O(sizeof(tags))
334 inline node_t select_following_before (node_t, tag_t*, node_t) const;
337 * [closing(n)] returns the node identifier corresponding to the closing
338 * parenthesis associated with [n] in the underlying BP representation.
339 * The result is undefined if [n] does not point to an opening parenthesis.
342 inline node_t closing(node_t) const;
346 * [parent_node(i)] returns the node identifier corresponding to the [i]th
347 * text of the text collection. The result is undefined if [i] does not
348 * denote a valid text identifier.
351 inline node_t parent_node(int32_t) const;
354 * [get_text_collection()] returns a pointer to the underlying text collection.
355 * The pointer may be 0 if text indexing was disabled during index creation.
358 inline SXSI::TextCollection *get_text_collection() const;
361 * [text_id_range(n)] returns the identifier of the first and last text fragment
362 * that occur below node [n]. Returns (-1, -1) if there are no text fragment
363 * below [n] or if [n] does not denote a valid node.
366 std::pair<int32_t, int32_t> text_id_range(node_t) const;
369 * [text_id(n)] returns the identifier of the text fragment associated with
370 * node [n]. The result is unspecified if [n] is not an attribute data node
374 int32_t text_id(node_t) const;
377 * [get_text(i)] returns the content of the [i]th text stored in the text
378 * collection, has a 0 terminated string.
381 const char* get_text(int32_t) const;
383 SXSI::TextCollection::document_result prefix(uchar const *s) const;
384 SXSI::TextCollection::document_result suffix(uchar const *s) const;
385 SXSI::TextCollection::document_result equals(uchar const *s) const;
386 SXSI::TextCollection::document_result contains(uchar const *s) const;
387 SXSI::TextCollection::document_result less_than(uchar const *s) const;
390 bool naive_contains(node_t, uchar const *s) const;
393 void save(int, char*);
394 static xml_tree* load(int, char*, bool, int);
395 void print(node_t, int, bool no_text=false);
399 friend class xml_tree_builder;
401 xml_tree(std::vector<int32_t>*,
402 std::unordered_map<std::string, int32_t>*,
405 SXSI::TextCollectionBuilder *,
406 SXSI::TextCollectionBuilder::index_type_t,
409 //Parenthesis sequence
412 std::vector<static_bitsequence_sdarray*> tags;
414 uint32_t tag_seq_len;
415 uint32_t bits_per_tag;
416 //Mapping from tag_t identifiers to/from tagnames
417 std::vector<std::string> *tag_names;
418 std::unordered_map<std::string, tag_t> *tag_ids;
419 //Set of tag ids that map to attribute nodes
420 std::unordered_set<tag_t> *attribute_ids;
422 SXSI::TextCollection *text_collection;
423 static_bitsequence *text_positions;
424 SXSI::TextCollectionBuilder::index_type_t text_index_type;
425 //auxiliary data structures for efficient printing.
426 std::vector<std::string> *print_stack;
427 std::string *print_buffer;
429 #define BUFFER_SIZE (8192*2)
432 void uflush_r(int, size_t);
433 void uput_str(std::string s, int fd);
434 void uputs(const char* s, int fd);
435 void uputc(const char c, int fd);
436 size_t uprintf(const char*s, int fd);
439 #define XML_TREE_INTERNAL__ 1
440 #include "xml-tree-inc.hpp"
442 #endif //XML_TREE_HPP_