X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTree.h;h=1cab2aacf36da23962fd39ecc22591ce868d2266;hb=00d137d79d67d8a79c8ff7ab6623a00a3783df5f;hp=4673ccc989142c89cf7b45ef14822a4d6f6ba644;hpb=8b92ac7e539c796ee3160078b5ca30537f26ea51;p=SXSI%2FXMLTree.git diff --git a/XMLTree.h b/XMLTree.h index 4673ccc..1cab2aa 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -25,17 +25,18 @@ #include #include #include -#include "TextCollection/TextCollectionBuilder.h" +#include #undef W #undef WW #undef Wminusone -#include "bp.h" +#include #include -#include -#include -#include +#include +#include +#include + using SXSI::TextCollection; using SXSI::TextCollectionBuilder; @@ -49,8 +50,8 @@ using SXSI::TextCollectionBuilder; typedef int treeNode; -typedef int TagType; -typedef int DocID; +typedef int TagType; +typedef int DocID; typedef struct { int min; @@ -98,46 +99,14 @@ typedef TagIdMap::const_iterator TagIdMapIT; #define BUFFER_ALLOC (8192 * 2) #define BUFFER_SIZE (BUFFER_ALLOC / 2) -static inline int fast_find_close(bp *b,int s) -{ - return fwd_excess(b,s,-1); -} - -static inline int fast_inspect(bp* Par,treeNode i) -{ - int j,l; - j = i >> logD; - l = i & (D-1); - return (Par->B[j] >> (D-1-l)) & 1; -} - -static bool fast_isleaf(bp* Par,treeNode x){ - return (fast_inspect(Par, x+1) == CP); -} - -static treeNode fast_first_child(bp *Par, treeNode x) -{ - x = x+1; - return (fast_inspect(Par,x) == OP) ? x : NULLT; -} - -inline static treeNode fast_next_sibling(bp* Par,treeNode x) -{ - treeNode y = fast_find_close(Par,x)+1; - return (fast_inspect(Par, y) == OP) ? y : NULLT; -} - -inline static bool fast_is_ancestor(bp * Par,treeNode x,treeNode y){ - return (x <= y) && ((x==0) || (y <= fast_find_close(Par,x))); -} // tag position -> tree node -static treeNode tagpos2node(int t) +static treeNode tagpos2node(int t) { return (treeNode) t; } // tree node -> tag position -static int node2tagpos(treeNode x) +static int node2tagpos(treeNode x) { return (int)x; } @@ -153,13 +122,13 @@ class XMLTree { private: /** Balanced parentheses representation of the tree */ bp *Par; - - /** Mapping from tag identifer to tag name */ + + /** Mapping from tag identifer to tag name */ std::vector *TagName; TagIdMap * tIdMap; - + /** Bit vector indicating with a 1 the positions of the non-empty texts. */ - static_bitsequence *EBVector; + static_bitsequence *EBVector; /** Tag sequence represented with a data structure for rank and select */ static_sequence *Tags; @@ -172,21 +141,28 @@ class XMLTree { // Allows to disable the TextCollection for benchmarkin purposes bool disable_tc; SXSI::TextCollectionBuilder::index_type_t text_index_type; - + std::string *buffer; std::vector *print_stack; + + void _real_flush(int fd, size_t size) { + if (size == 0) return; + size_t written; + while (1) { + written = write(fd, buffer->data(), size); + if ((written < 0) && (errno == EAGAIN || errno == EINTR)) + continue; + break; + }; + buffer->clear(); + + } + void _flush(int fd){ size_t size = buffer->size(); if (size < BUFFER_SIZE) return; - size_t written; - while (1) { - written = write(fd, buffer->data(), size); - if ((written < 0) && (errno == EAGAIN || errno == EINTR)) - continue; - break; - }; - buffer->clear(); + _real_flush(fd, size); } void _dput_str(std::string s, int fd){ @@ -196,7 +172,7 @@ class XMLTree { void _dputs(const char* s, int fd){ buffer->append(s); - _flush(fd); + _flush(fd); } void _dputc(const char c, int fd){ @@ -245,10 +221,10 @@ class XMLTree { TextCollection * const TC, bool dis_tc, TextCollectionBuilder::index_type_t _index_type ); -public: +public: /** Data structure destructor */ ~XMLTree(); - + /** root(): returns the tree root. */ treeNode Root() { return 0; } @@ -268,16 +244,16 @@ public: unsigned int * TagStruct() { return tags_fix; }; - /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of + /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of * node x. */ - int SubtreeSize(treeNode x) { return subtree_size(Par, x); } - - /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree + int SubtreeSize(treeNode x) { return bp_subtree_size(Par, x); } + + /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree * of node x. */ int SubtreeTags(treeNode x, TagType tag){ //int s = x + 2*subtree_size(Par, x) - 1; - treeNode y = fast_find_close(Par, x); - + treeNode y = bp_find_close(Par, x); + if (y - x < 10) { int count = 0; for(int i = x; i < y; i++) @@ -287,12 +263,12 @@ public: else return (Tags->rank(tag, y) - Tags->rank(tag, x)); }; - + /** SubtreeElements(x) of element nodes in the subtree of x */ int SubtreeElements(treeNode x); - /** IsLeaf(x): returns whether node x is leaf or not. In the succinct + /** IsLeaf(x): returns whether node x is leaf or not. In the succinct * representation this is just a bit inspection. */ bool IsLeaf(treeNode x); @@ -300,92 +276,100 @@ public: /** IsAncestor(x,y): returns whether node x is ancestor of node y. */ bool IsAncestor(treeNode x, treeNode y); - + + + /** IsRigthDescendant returns true if y is a descendant of x and y is + not a descendant of the first child of x */ + bool IsRightDescendant(treeNode x, treeNode y) { + if (x <= Root()) return false; + treeNode z = bp_parent_close(Par, x); + treeNode c = bp_find_close(Par, x); + return (y > c && y < z ); + } + + /** IsChild(x,y): returns whether node x is parent of node y. */ bool IsChild(treeNode x, treeNode y); /** IsFirstChild(x): returns whether node x is the first child of its parent. */ /* OCAML */ - bool IsFirstChild(treeNode x) { - return ((x != NULLT)&&(x==Root() || prev_sibling(Par,x) == (treeNode)-1)); + bool IsFirstChild(treeNode x) { + return ((x != NULLT)&&(x==Root() || bp_prev_sibling(Par,x) == (treeNode)-1)); }; - - /** NumChildren(x): number of children of node x. Constant time with the + + /** NumChildren(x): number of children of node x. Constant time with the * data structure of Sadakane. */ int NumChildren(treeNode x); - /** ChildNumber(x): returns i if node x is the i-th children of its + /** ChildNumber(x): returns i if node x is the i-th children of its * parent. */ int ChildNumber(treeNode x); - /** Depth(x): depth of node x, a simple binary rank on the parentheses + /** Depth(x): depth of node x, a simple binary rank on the parentheses * sequence. */ int Depth(treeNode x); - - /** Preorder(x): returns the preorder number of node x, just regarding tree - * nodes (and not texts). */ + + /** Preorder(x): returns the preorder number of node x, just regarding tree + * nodes (and not texts). */ int Preorder(treeNode x); - - /** Postorder(x): returns the postorder number of node x, just regarding + + /** Postorder(x): returns the postorder number of node x, just regarding * tree nodes (and not texts). */ int Postorder(treeNode x); - - /** DocIds(x): returns the range (i.e., a pair of integers) of document + + /** DocIds(x): returns the range (i.e., a pair of integers) of document * identifiers that descend from node x. */ range DocIds(treeNode x); /** Parent(x): returns the parent node of node x. */ - treeNode Parent(treeNode x) { - if (x == Root()) - return NULLT; - else - return parent(Par, x); + treeNode Parent(treeNode x) { + return (x == Root()) ? NULLT : bp_parent(Par, x); }; treeNode BinaryParent(treeNode x){ - if (x <= Root()) - return NULLT; - else { - treeNode prev = x - 1; - return (fast_inspect(Par, prev) == OP) ? prev : find_open(Par, prev); - }; + if (x <= Root()) + return NULLT; + else { + treeNode prev = x - 1; + return (bp_inspect(Par, prev) == OP) ? prev : bp_find_open(Par, prev); + }; }; /* Assumes x is neither 0 nor -1 */ - - /** Child(x,i): returns the i-th child of node x, assuming it exists. */ + + /** Child(x,i): returns the i-th child of node x, assuming it exists. */ treeNode Child(treeNode x, int i); /** LastChild(x): returns the last child of node x. */ treeNode LastChild(treeNode x) { - NULLT_IF(x == NULLT || fast_isleaf(Par,x)); - return find_open(Par, fast_find_close(Par, x)-1); + NULLT_IF(x == NULLT || bp_isleaf(Par,x)); + return bp_find_open(Par, bp_find_close(Par, x)-1); } - - /** PrevSibling(x): returns the previous sibling of node x, assuming it + + /** PrevSibling(x): returns the previous sibling of node x, assuming it * exists. */ treeNode PrevSibling(treeNode x) { NULLT_IF(x==NULLT); - return prev_sibling(Par, x); + return bp_prev_sibling(Par, x); } - - /** TaggedChild(x,tag): returns the first child of node x tagged tag, or - * NULLT if there is none. Because of the balanced-parentheses representation - * of the tree, this operation is not supported efficiently, just iterating + + /** TaggedChild(x,tag): returns the first child of node x tagged tag, or + * NULLT if there is none. Because of the balanced-parentheses representation + * of the tree, this operation is not supported efficiently, just iterating * among the children of node x until finding the desired child. */ - + treeNode SelectChild(treeNode x, TagIdSet * tags); - /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or + /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or * NULLT if there is none. */ - + treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags); @@ -394,25 +378,25 @@ public: treeNode SelectDescendant(treeNode x, TagIdSet * tags) { NULLT_IF (x == NULLT); treeNode fc = x+1; - if (fast_inspect(Par, fc) == CP) return NULLT; + if (bp_inspect(Par, fc) == CP) return NULLT; treeNode min = NULLT; treeNode aux; TagIdSet::const_iterator tagit; for (tagit = tags->begin(); tagit != tags->end(); ++tagit) { - aux = TaggedDescendant(x, (TagType) *tagit); + aux = TaggedDescendant(x, (TagType) *tagit); if (((unsigned int) aux) < ((unsigned int) min)) min = aux; }; return min; } - /** TaggedPrec(x,tag): returns the first node tagged tag with smaller - * preorder than x and not an ancestor of x. Returns NULLT if there + /** TaggedPrec(x,tag): returns the first node tagged tag with smaller + * preorder than x and not an ancestor of x. Returns NULLT if there * is none. */ treeNode TaggedPreceding(treeNode x, TagType tag); - - /** TaggedFoll(x,tag): returns the first node tagged tag with larger - * preorder than x and not in the subtree of x. Returns NULLT if there + + /** TaggedFoll(x,tag): returns the first node tagged tag with larger + * preorder than x and not in the subtree of x. Returns NULLT if there * is none. */ treeNode TaggedFollowing(treeNode x, TagType tag); @@ -427,65 +411,65 @@ public: NULLT_IF(x <= 0); - treeNode close = fast_find_close(Par,x); + treeNode close = bp_find_close(Par,x); + - treeNode min = NULLT; treeNode aux; - + TagIdSet::const_iterator tagit; for (tagit = tags->begin(); tagit != tags->end(); ++tagit) { - aux = tagpos2node(Tags->select_next(*tagit, close)); + aux = tagpos2node(Tags->select_next(*tagit, close)); if (((unsigned int) aux) < ((unsigned int) min)) min = aux; }; - + return (min < ancestor_closing) ? min : NULLT; - + } - /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged + /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged * tag. Return NULLT is there is none. */ treeNode TaggedAncestor(treeNode x, TagType tag); - - /** PrevText(x): returns the document identifier of the text to the left of + + /** PrevText(x): returns the document identifier of the text to the left of * node x, or NULLT if x is the root node. */ DocID PrevText(treeNode x); - - /** NextText(x): returns the document identifier of the text to the right of + + /** NextText(x): returns the document identifier of the text to the right of * node x, or NULLT if x is the root node. */ DocID NextText(treeNode x); - - /** MyText(x): returns the document identifier of the text below node x, or + + /** MyText(x): returns the document identifier of the text below node x, or * NULLT if x is not a leaf node. */ DocID MyText(treeNode x); DocID MyTextUnsafe(treeNode x); - /** TextXMLId(d): returns the preorder of document with identifier d in the + /** TextXMLId(d): returns the preorder of document with identifier d in the * tree consisting of all tree nodes and all text nodes. */ int TextXMLId(DocID d); - - /** NodeXMLId(x): returns the preorder of node x in the tree consisting of + + /** NodeXMLId(x): returns the preorder of node x in the tree consisting of * all tree nodes and all text nodes. */ int NodeXMLId(treeNode x); - + /** ParentNode(d): returns the parent node of document identifier d. */ treeNode ParentNode(DocID d); - + treeNode PrevNode(DocID d); - /** GetTagId(tagname): returns the tag identifier corresponding to a given + /** GetTagId(tagname): returns the tag identifier corresponding to a given * tag name. Returns NULLT in case that the tag name does not exists. */ TagType GetTagId(unsigned char *tagname); - /** GetTagName(tagid): returns the tag name of a given tag identifier. + /** GetTagName(tagid): returns the tag name of a given tag identifier. * Returns NULL in case that the tag identifier is not valid.*/ unsigned char *GetTagName(TagType tagid); - /** GetTagName(tagid): returns the tag name of a given tag identifier. + /** GetTagName(tagid): returns the tag name of a given tag identifier. * The result is just a reference and should not be freed by the caller. */ const unsigned char *GetTagNameByRef(TagType tagid); @@ -527,35 +511,35 @@ public: TextCollection::document_result LessThan(uchar const *s) { return Text->LessThan(s); } - + /** IsPrefix(x): returns true if there is a text prefixed by string s. */ bool IsPrefix(uchar const *s) { return Text->IsPrefix(s); - } - - /** IsSuffix(s): returns true if there is a text having string s as a + } + + /** IsSuffix(s): returns true if there is a text having string s as a * suffix.*/ bool IsSuffix(uchar const *s) { return Text->IsSuffix(s); } - - /** IsEqual(s): returns true if there is a text that equals given + + /** IsEqual(s): returns true if there is a text that equals given * string s. */ bool IsEqual(uchar const *s) { return Text->IsEqual(s); } - + /** IsContains(s): returns true if there is a text containing string s. */ bool IsContains(uchar const *s) { return Text->IsContains(s); } - - /** IsLessThan(s): returns true if there is at least a text that is + + /** IsLessThan(s): returns true if there is at least a text that is * lexicographically smaller than string s. */ bool IsLessThan(uchar const *s) { return Text->IsLessThan(s); } - + /** Count(s): Global counting */ unsigned Count(uchar const *s) { return Text->Count(s); @@ -565,31 +549,31 @@ public: unsigned CountPrefix(uchar const *s) { return Text->CountPrefix(s); } - + /** CountSuffix(s): counting version of Suffix(s). */ unsigned CountSuffix(uchar const *s) { return Text->CountSuffix(s); } - + /** CountEqual(s): counting version of Equal(s). */ unsigned CountEqual(uchar const *s) { return Text->CountEqual(s); } - + /** CountContains(s): counting version of Contains(s). */ unsigned CountContains(uchar const *s) { return Text->CountContains(s); } - + /** CountLessThan(s): counting version of LessThan(s). */ unsigned CountLessThan(uchar const *s) { return Text->CountLessThan(s); } - + /** GetText(d): returns the text corresponding to document with * id d. */ uchar* GetText(DocID d) { - + uchar * s = Text->GetText(d); return (s[0] == 1 ? (s+1) : s); } @@ -604,19 +588,19 @@ public: TextCollection *getTextCollection() { return Text; } - + /** Save: saves XML tree data structure to file. */ - void Save(int fd ); - - /** Load: loads XML tree data structure from file. sample_rate_text + void Save(int fd, char* name ); + + /** Load: loads XML tree data structure from file. sample_rate_text * indicates the sample rate for the text search data structure. */ - static XMLTree *Load(int fd, bool load_tc, int sample_factor); + static XMLTree *Load(int fd, bool load_tc, int sample_factor, char * name); void insertTag(TagType tag, uint position); - + void print_stats(); - + /** Parenthesis functions */ treeNode Closing(treeNode x); @@ -626,7 +610,7 @@ public: /** Print procedure */ void Print(int fd,treeNode x, bool no_text); void Print(int fd,treeNode x) { Print(fd,x,false); } - void Flush(int fd){ _flush(fd); } + void Flush(int fd){ if (buffer) _real_flush(fd, buffer->size()); } // The following are inlined here for speed /** Tag(x): returns the tag identifier of node x. */ @@ -634,13 +618,13 @@ public: inline TagType Tag(treeNode x) const throw () { if (tags_blen == 8) return (TagType) (((uchar*)tags_fix)[(int) x]); - else + else return get_field(tags_fix, tags_blen, x); /* - { + { size_t idxlen = x * tags_blen; size_t j = idxlen % W; - size_t i = idxlen / W; + size_t i = idxlen / W; size_t offset = W - tags_blen; size_t offset2 = offset - j; size_t w = tags_fix[i]; @@ -655,7 +639,7 @@ public: */ treeNode FirstChild(treeNode x) { NULLT_IF(x==NULLT); - return fast_first_child(Par, x); + return bp_first_child(Par, x); }; @@ -665,98 +649,110 @@ public: treeNode FirstElement(treeNode node){ { NULLT_IF(node == NULLT); - treeNode x = fast_first_child(Par, node); + treeNode x = bp_first_child(Par, node); NULLT_IF(x == NULLT); switch (Tag(x)){ - case ATTRIBUTE_TAG_ID: - x = fast_next_sibling(Par,x); + case ATTRIBUTE_TAG_ID: + x = bp_next_sibling(Par,x); if (x == NULLT || Tag(x) != PCDATA_TAG_ID) return x; - + case PCDATA_TAG_ID: x = x+2; - return (fast_inspect(Par,x)==OP)? x : NULLT; - + return (bp_inspect(Par,x)==OP)? x : NULLT; + default: return x; } } }; - /** NextSibling(x): returns the next sibling of node x, or NULLT if none + /** NextSibling(x): returns the next sibling of node x, or NULLT if none * exists. */ - + treeNode NextSibling(treeNode x) { NULLT_IF (x <= 0); - return fast_next_sibling(Par, x); + return bp_next_sibling(Par, x); }; - + /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT * if none. */ treeNode NextElement(treeNode x) { NULLT_IF(x <= 0); - x = fast_next_sibling(Par, x); - NULLT_IF(x == NULLT); + x = bp_next_sibling(Par, x); + NULLT_IF(x == NULLT); if (Tag(x) == PCDATA_TAG_ID){ int y = x+2; - return (fast_inspect(Par, y) == OP) ? y : NULLT; + return (bp_inspect(Par, y) == OP) ? y : NULLT; } - else return x; + else return x; }; - /** TaggedDesc(x,tag): returns the first node tagged tag with larger - * preorder than x and within the subtree of x. Returns NULT if there + /** TaggedDesc(x,tag): returns the first node tagged tag with larger + * preorder than x and within the subtree of x. Returns NULT if there * is none. */ + inline treeNode TaggedNext(treeNode x, TagType tag) + { + return tagpos2node(Tags->select_next(tag,node2tagpos(x))); + } inline treeNode TaggedDescendant(treeNode x, TagType tag) { - + int s = (int) Tags->select_next(tag,node2tagpos(x)); NULLT_IF (s == -1); - + treeNode y = tagpos2node(s); // transforms the tag position into a node position - - return (fast_is_ancestor(Par,x,y) ? y : NULLT); + + return (bp_is_ancestor(Par,x,y) ? y : NULLT); }; - + inline treeNode TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor) { - treeNode close = fast_find_close(Par, x); + treeNode close = bp_find_close(Par, x); treeNode s = tagpos2node(Tags->select_next(tag, close)); - - if (ancestor == Root() || s == NULLT || s < fast_find_close(Par,ancestor)) return s; + + if (ancestor == Root() || s == NULLT || s < bp_find_close(Par,ancestor)) return s; else return NULLT; }; inline treeNode TaggedFollowingBefore(treeNode x, TagType tag, treeNode ancestor_closing) { - treeNode close = fast_find_close(Par, x); + treeNode close = bp_find_close(Par, x); treeNode s = tagpos2node(Tags->select_next(tag, close)); - + if (ancestor_closing == Root() || s == NULLT || s < ancestor_closing) return s; else return NULLT; }; - + + inline treeNode NextNodeBefore(treeNode x, treeNode ancestor_closing) + { + treeNode close = bp_find_close(Par, x); + int rank = bp_rank_open(Par, close); + treeNode y = bp_select_open(Par, rank+1); + return (y < ancestor_closing) ? y : NULLT; + }; + // TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or NULLT if there is none. treeNode TaggedFollowingSibling(treeNode x, TagType tag) { NULLT_IF(x==NULLT); treeNode sibling = x; TagType ctag; - while ((sibling = fast_next_sibling(Par, sibling)) != NULLT) { + while ((sibling = bp_next_sibling(Par, sibling)) != NULLT) { ctag = Tag(sibling); - if (ctag == tag) return sibling; + if (ctag == tag) return sibling; } - return NULLT; + return NULLT; }; -treeNode TaggedChild(treeNode x, TagType tag) +treeNode TaggedChild(treeNode x, TagType tag) { - - NULLT_IF(x==NULLT || fast_isleaf(Par,x)); - treeNode child; - child = fast_first_child(Par, x); - + + NULLT_IF(x==NULLT || bp_isleaf(Par,x)); + treeNode child; + child = bp_first_child(Par, x); + if (Tag(child) == tag) return child; else