X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=xml-tree.hpp;h=0795ca064c84f91d6dc6cf3c8f47cad103e81a33;hb=HEAD;hp=ff98f216a524ad4ee49bad521b0598edf625c313;hpb=6eb6b7bc5f5284f9ee647046b822e47cae62bccc;p=SXSI%2FXMLTree.git diff --git a/xml-tree.hpp b/xml-tree.hpp index ff98f21..0795ca0 100644 --- a/xml-tree.hpp +++ b/xml-tree.hpp @@ -3,9 +3,10 @@ #include +#include #include -#include -#include +#include +#include #include #include #include @@ -46,60 +47,338 @@ public: ~xml_tree(); //Counting functions + /** + * [size()] returns the size of the tree (number of nodes) + * Runs in O(1) + */ inline uint32_t size() const; + + /** + * [num_tags()] returns the number of distinct tags. + * Runs in O(1) + */ inline uint32_t num_tags() const; + + /** + * [subtree_size(n)] returns the size of the subtree (number of nodes) + * rooted at n. + * Runs in O(1) + */ inline uint32_t subtree_size(node_t) const; + + /** + * [subtree_tags(n, t)] returns the number of occurences of tag [t] in the + * subtree rooted at [n] + * Runs in O(1) + */ inline uint32_t subtree_tags(node_t, tag_t) const; - uint32_t subtree_elements(node_t) const; + + /** + * [subtree_elements(n)] returns the number of element nodes below [n] + * Runs in O(1) + */ + inline uint32_t subtree_elements(node_t) const; + + /** + * [num_children(n)] returns the number of child nodes of [n] + * (both text and elements, and including a fake <@> node if + * present). + * Runs in O(1) (?) + */ uint32_t num_children(node_t) const; + + /** + * [child_pos(n)] returns the position of [n] amongst its siblings + * Runs in O(1) (?) + */ uint32_t child_pos(node_t) const; //Node_T tests + /** + * [is_leaf(n)] returns true if [n] is a leaf (i.e. if [num_children(n)] + * returns 0 + * Runs in O(1) + */ inline bool is_leaf(node_t) const; + + /** + * [is_ancestor(n, m)] returns true if [n] is an ancestor of [m], false + * otherwise + * Runs in O(1) + */ inline bool is_ancestor(node_t, node_t) const; + + /** + * [is_right_descendant(n, m)] returns true if [m] is a descendant-or-self of + * a following-sibling of [n], false otherwise + * Runs in O(1) + */ inline bool is_right_descendant(node_t, node_t) const; + + /** + * [is_child(n, m)] returns true if [m] is a child of [n] + * Runs in O(1) + */ bool is_child(node_t, node_t) const; + + /** + * [is_first_child(n, m)] returns true if [m] is the first child of [n] + * Runs in O(1) + */ inline bool is_first_child(node_t) const; + + /** + * [is_nil(n)] returns true if [n] is equal to xml_tree::NIL + * Runs in O(1) + */ inline bool is_nil(node_t) const; + + /** + * [is_open(n)] returns true if [n], seen as an index in the + * underlying BP representation corresponds to an opening parenthesis. + * Runs in O(1) + */ inline bool is_open(node_t) const; + //Numbering functions + /** + * [depth(n)] returns the depths of node [n]. The root has depth 1. + * Runs in O(1) + */ uint32_t depth(node_t) const; + + /** + * [preorder(n)] returns the preorder of node [n]. Equivalent to calling + * rank on the underlying BP representation. + * Runs in O(1) + */ uint32_t preorder(node_t) const; + + /** + * [preorder(n)] returns the postorder of node [n]. + * Runs in O(1) + */ uint32_t postorder(node_t) const; //Tag functions + /** + * [tag(n)] returns the tag of node [n] which must be a valid node identifier + * (in particular not NIL) + * Runs in O(1) + */ inline tag_t tag(node_t) const; + + /** + * [get_tag_name_by_ref(t)] returns the string representation of tag [t] + * For elements, the string representation is the tag name itself + * Returns if [t] is not a proper tag identifier. + * Runs in O(1) + */ const char* get_tag_name_by_ref(tag_t) const; + + /** + * [register_tag(s)] returns the tag identifier for the tag represented + * by the string [s]. If no such tag exists in the document, return a + * fresh tag identifier [i]. Subsequent calls with the same [s] will return + * the same identifier. + * Runs in O(1) + */ tag_t register_tag(char *s); //Navigation functions + + /** + * [root()] returns the root node of the tree. + * Runs in O(1) + */ inline node_t root () const; + + /** + * [child(n, i)] returns the [i]th child of [n] + * Runs in O(i) (?) + */ node_t child(node_t, uint32_t) const; + + /** + * [parent(n)] returns the parent node of [n]. Returns + * xml_tree::NIL if [n] is the root of the tree. + * Runs in O(1) + */ inline node_t parent(node_t) const; + + /** + * [first_child(n)] returns the first child of [n]. Returns + * xml_tree::NIL if [n] is a leaf. + * Runs in O(1) + */ inline node_t first_child(node_t) const; + + /** + * [last_child(n)] returns the last child of [n]. Returns + * xml_tree::NIL if [n] is a leaf. + * Runs in O(1) + */ inline node_t last_child(node_t) const; + + /** + * [next_sibling(n)] returns the following sibling of [n]. + * Returns xml_tree::NIL if [n] is the last of its siblings. + * Runs in O(1) + */ inline node_t next_sibling(node_t) const; + + /** + * [prev_sibling(n)] returns the preceding sibling of [n]. + * Returns xml_tree::NIL if [n] is the first of its siblings. + * Runs in O(1) + */ inline node_t prev_sibling(node_t) const; + + /** + * [first_element(n)] returns the first child of [n] which + * corresponds to an element node. Returns xml_tree::NIL if + * no such node exists. + * Runs in O(1) + */ inline node_t first_element(node_t) const; + + /** + * [next_element(n)] returns the first following sibling of [n] which + * corresponds to an element node. Returns xml_tree::NIL if + * no such node exists. + * Runs in O(1) + */ inline node_t next_element(node_t) const; + + /** + * [tagged_next(n, t)] returns the first node following [n] in pre-order + * which has tag [t]. Returns xml_tree::NIL if no such node exists. + * Runs in O(1) + */ inline node_t tagged_next(node_t, tag_t) const; + + /** + * [tagged_next_close(n, t)], like [tagged_next(n, t)] but uses a euristic to + * return the result faster if the next tag is close to [n]. + * Runs in O(1) + */ + inline node_t tagged_next_close(node_t, tag_t) const; + + /** + * [tagged_descendant(n, t)] returns the first descendant of [n] in + * pre-order which has tag [t]. Returns xml_tree::NIL if no such node + * exists. + * Runs in O(1) + */ inline node_t tagged_descendant(node_t, tag_t) const; + + /** + * [tagged_following_before(n, t, m)] returns the first following node + * of [n] in pre-order which has tag [t] and which occurs before node + * [m]. Returns xml_tree::NIL if no such node exists. + * Runs in O(1) + */ inline node_t tagged_following_before(node_t, tag_t, node_t) const; + + /** + * [tagged_child(n, t)] returns the first child of [n] in + * pre-order which has tag [t]. Returns xml_tree::NIL if no such node + * exists. + * Runs in O(1) + */ inline node_t tagged_child(node_t, tag_t) const; + + /** + * [tagged_sibling(n, t)] returns the first following sibling of [n] in + * pre-order which has tag [t]. Returns xml_tree::NIL if no such node + * exists. + * Runs in O(1) + */ inline node_t tagged_sibling(node_t, tag_t) const; + + /** + * [select_child(n, tags)] returns the first child of [n] in + * pre-order which has a tag in [tags]. [tags] is a xml_tree::NIL_TAG_ID + * terminated array of xml_tree::tag_t. Returns xml_tree::NIL if no such + * node exists. + * Runs in O(sizeof(tags)) + */ node_t select_child(node_t, tag_t*) const; - node_t select_descendant(node_t, tag_t*) const; + + /** + * [select_descendant(n, tags)] returns the first descendant of [n] in + * pre-order which has a tag in [tags]. [tags] is a xml_tree::NIL_TAG_ID + * terminated array of xml_tree::tag_t. Returns xml_tree::NIL if no such + * node exists. + * Runs in O(sizeof(tags)) + */ + inline node_t select_descendant(node_t, tag_t*) const; + + /** + * [select_sibling(n, tags)] returns the first following sibling of + * of [n] in pre-order which has a tag in [tags]. + * [tags] is a xml_tree::NIL_TAG_ID terminated array of xml_tree::tag_t. + * Returns xml_tree::NIL if no such node exists. + * Runs in O(sizeof(tags)) + */ node_t select_sibling(node_t, tag_t*) const; - node_t select_following_before (node_t, tag_t*, node_t) const; + + /** + * [select_sibling(n, tags, m)] returns the first following node + * of [n] in pre-order which has a tag in [tags] and which occurs before + * node [m]. [tags] is a xml_tree::NIL_TAG_ID terminated array of + * xml_tree::tag_t. Returns xml_tree::NIL if no such node exists. + * Runs in O(sizeof(tags)) + */ + inline node_t select_following_before (node_t, tag_t*, node_t) const; + + /** + * [closing(n)] returns the node identifier corresponding to the closing + * parenthesis associated with [n] in the underlying BP representation. + * The result is undefined if [n] does not point to an opening parenthesis. + * Runs in O(1). + */ inline node_t closing(node_t) const; //Text functions + /** + * [parent_node(i)] returns the node identifier corresponding to the [i]th + * text of the text collection. The result is undefined if [i] does not + * denote a valid text identifier. + * Runs in O(1) ? + */ inline node_t parent_node(int32_t) const; + + /** + * [get_text_collection()] returns a pointer to the underlying text collection. + * The pointer may be 0 if text indexing was disabled during index creation. + * Runs in O(1) + */ inline SXSI::TextCollection *get_text_collection() const; + + /** + * [text_id_range(n)] returns the identifier of the first and last text fragment + * that occur below node [n]. Returns (-1, -1) if there are no text fragment + * below [n] or if [n] does not denote a valid node. + * Runs in O(1) ? + */ std::pair text_id_range(node_t) const; + + /** + * [text_id(n)] returns the identifier of the text fragment associated with + * node [n]. The result is unspecified if [n] is not an attribute data node + * or a text node. + * Runs in O(1) ? + */ int32_t text_id(node_t) const; - unsigned char* get_text(int32_t) const; + + /** + * [get_text(i)] returns the content of the [i]th text stored in the text + * collection, has a 0 terminated string. + * Runs in O(1) ? + */ + const char* get_text(int32_t) const; SXSI::TextCollection::document_result prefix(uchar const *s) const; SXSI::TextCollection::document_result suffix(uchar const *s) const; @@ -107,6 +386,9 @@ public: SXSI::TextCollection::document_result contains(uchar const *s) const; SXSI::TextCollection::document_result less_than(uchar const *s) const; + + bool naive_contains(node_t, uchar const *s) const; + //I/O functions void save(int, char*); static xml_tree* load(int, char*, bool, int); @@ -127,13 +409,15 @@ private: //Parenthesis sequence bp *par; //tag sequence - static_sequence *tags; + std::vector tags; uint32_t *tag_seq; uint32_t tag_seq_len; uint32_t bits_per_tag; //Mapping from tag_t identifiers to/from tagnames std::vector *tag_names; std::unordered_map *tag_ids; + //Set of tag ids that map to attribute nodes + std::unordered_set *attribute_ids; //Text index SXSI::TextCollection *text_collection; static_bitsequence *text_positions;