From: kim Date: Thu, 20 Aug 2009 01:28:00 +0000 (+0000) Subject: Better naming, some inlining. X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FXMLTree.git;a=commitdiff_plain;h=a4311418afa11f8e4dbcb3d0a5cc8354ff530efc Better naming, some inlining. git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/XMLTree@556 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/XMLTree.cpp b/XMLTree.cpp index 51c495d..8124f9b 100644 --- a/XMLTree.cpp +++ b/XMLTree.cpp @@ -1,9 +1,11 @@ #include "basics.h" #include #include +#include #include "XMLTree.h" #include #include +#include #include #include @@ -78,13 +80,69 @@ inline int node2tagpos(treeNode x) return (int)x; } -// returns NULLT if the test is true -#define NULLT_IF(x) do { if (x) return NULLT; } while (0) +int fast_find_close(bp *b,int s) +{ + return fwd_excess(b,s,-1); +} + +inline int fast_inspect(bp* Par,treeNode i) +{ + int j,l; + j = i >> logD; + l = i & (D-1); + return (Par->B[j] >> (D-1-l)) & 1; +} + +inline treeNode fast_first_child(bp *Par, treeNode x) +{ + x = x+1; + return (fast_inspect(Par,x) == CP) ? NULLT : x; +} + +inline treeNode fast_next_sibling(bp* Par,treeNode x) +{ + x = fast_find_close(Par,x)+1; + return (fast_inspect(Par,x) == CP) ? NULLT : x; +} + + +inline treeNode fast_sibling(bp* Par,treeNode x,TagType tag){ + + if (tag == PCDATA_TAG_ID){ + x = x+2; + return fast_inspect(Par,x)==OP ? x : NULLT; + } else return fast_next_sibling(Par,x); +} + +inline bool fast_isleaf(bp* Par,treeNode x){ + return (fast_inspect(Par,x+1) == CP ? true : false); +} + +inline uint fast_get_field(uint* A,int len, int idx) +{ + switch(len){ + case 8: + return (uint) (((uchar*)A)[idx]); + // Other 8-alligned values need to take care of the endianess of the arch. + default: + return get_field (A,len,idx); + }; + +} + +inline bool fast_is_ancestor(bp * Par,treeNode x,treeNode y){ + + if (x > y) + return false; + else + return (y <= fast_find_close(Par,x)); +} -XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, - TextCollection * const TC, bool dis_tc) +XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdMap * const tim, + uint *empty_texts_bmp, TagType *tags, + TextCollection * const TC, bool dis_tc) { // creates the data structure for the tree topology Par = (bp *)umalloc(sizeof(bp)); @@ -96,6 +154,7 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM tIdMap = (TagIdMap *) tim; uint max_tag = TN->size() - 1; + static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray(); alphabet_mapper *am = new alphabet_mapper_none(); @@ -103,7 +162,10 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM cout << "Tags test: " << Tags->test((uint*)tags,npar) << endl; - tags_blen = bits(max_tag); + //Ensures that for small tag numbers, we are on an 8bit boundary. + //Makes tag access way faster with negligeable waste of space. + tags_blen = max(bits(max_tag),8); + std::cerr << "Tags blen is " << tags_blen << "\n"; tags_len = (uint)npar; tags_fix = new uint[uint_len(tags_blen,tags_len)]; for(uint i=0;i<(uint)npar;i++) @@ -117,11 +179,14 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM EBVector = new static_bitsequence_rrr02(empty_texts_bmp,npar,32); + //EBVector = new static_bitsequence_sdarray(empty_texts_bmp,npar); free(empty_texts_bmp); empty_texts_bmp = NULL; disable_tc = dis_tc; + stream = NULL; + stream_fd = 0; } @@ -147,7 +212,11 @@ XMLTree::~XMLTree() delete EBVector; EBVector = NULL; - + if (stream != NULL){ + fclose(stream); + stream = NULL; + stream_fd = 0; + }; } @@ -205,7 +274,7 @@ void XMLTree::Save(int fd) // Load: loads XML tree data structure from file. Returns // a pointer to the loaded data structure -XMLTree *XMLTree::Load(int fd) +XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor) { FILE *fp; char buffer[1024]; @@ -253,6 +322,7 @@ XMLTree *XMLTree::Load(int fd) // loads the tag structure XML_Tree->Tags = static_sequence::load(fp); ufread(&XML_Tree->tags_blen,sizeof(uint),1,fp); + std::cerr << "tags_blen is "<< XML_Tree->tags_blen <<"\n"; ufread(&XML_Tree->tags_len,sizeof(uint),1,fp); XML_Tree->tags_fix = new uint[uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)]; ufread(XML_Tree->tags_fix,sizeof(uint),uint_len(XML_Tree->tags_blen,XML_Tree->tags_len),fp); @@ -274,25 +344,33 @@ XMLTree *XMLTree::Load(int fd) // loads the flags ufread(&(XML_Tree->disable_tc), sizeof(bool), 1, fp); - + if (load_tc) { XML_Tree->EBVector = static_bitsequence_rrr02::load(fp); - + //XML_Tree->EBVector = static_bitsequence_sdarray::load(fp); STOPTIMER(Loading); PRINTTIME("Loading text bitvector struct", Loading); STARTTIMER(); // Not used - int sample_rate_text = 64; // loads the texts if (!XML_Tree->disable_tc){ - XML_Tree->Text = TextCollection::Load(fp,sample_rate_text); + XML_Tree->Text = TextCollection::Load(fp,sample_factor); } else XML_Tree->Text = NULL; STOPTIMER(Loading); PRINTTIME("Loading TextCollection", Loading); - STARTTIMER(); + STARTTIMER(); + } + else { + XML_Tree->EBVector = NULL; + XML_Tree->Text = NULL; + XML_Tree->disable_tc = true; + }; + XML_Tree->stream = NULL; + XML_Tree->stream_fd = 0; + return XML_Tree; } @@ -313,31 +391,52 @@ int XMLTree::SubtreeSize(treeNode x) int XMLTree::SubtreeTags(treeNode x, TagType tag) { if (x == Root()) - x = first_child(Par,x); + x = fast_first_child(Par,x); int s = x + 2*subtree_size(Par, x) - 1; return Tags->rank(tag, s) - Tags->rank(tag, node2tagpos(x)-1); } +int XMLTree::SubtreeElements(treeNode x) + { + + int size = subtree_size(Par,x); + if (x == Root()){ + x = fast_first_child(Par,x); + size = size - 1; + }; + + int s = x + 2*size - 1; + int ntext = Tags->rank(PCDATA_TAG_ID, s) - Tags->rank(PCDATA_TAG_ID, node2tagpos(x)-1); + size = size - ntext; + treeNode fin = fast_find_close(Par,x); + treeNode y = Tags->select_next(ATTRIBUTE_TAG_ID,node2tagpos(x)); + while (y != NULLT && y < fin){ + size -= SubtreeSize(y); + y = Tags->select_next(ATTRIBUTE_TAG_ID,node2tagpos(y)); + }; + return size; + } // IsLeaf(x): returns whether node x is leaf or not. In the succinct representation // this is just a bit inspection. bool XMLTree::IsLeaf(treeNode x) { - return isleaf(Par, x); + NULLT_IF(x==NULLT); + return fast_isleaf(Par, x); } // IsAncestor(x,y): returns whether node x is ancestor of node y. bool XMLTree::IsAncestor(treeNode x, treeNode y) { - return is_ancestor(Par, x, y); + return fast_is_ancestor(Par, x, y); } // IsChild(x,y): returns whether node x is parent of node y. bool XMLTree::IsChild(treeNode x, treeNode y) { - if (!is_ancestor(Par, x, y)) return false; + if (!fast_is_ancestor(Par, x, y)) return false; return depth(Par, x) == (depth(Par, y) + 1); } @@ -380,46 +479,44 @@ int XMLTree::Postorder(treeNode x) { return postorder_rank(Par, x); } - +/* // Tag(x): returns the tag identifier of node x. TagType XMLTree::Tag(treeNode x) { - return get_field(tags_fix,tags_blen,node2tagpos(x)); + return fast_get_field(tags_fix,tags_blen,node2tagpos(x)); } - +*/ // DocIds(x): returns the range of text identifiers that descend from node x. // returns {NULLT, NULLT} when there are no texts descending from x. range XMLTree::DocIds(treeNode x) { - range r; - if (x == NULLT) { - r.min = NULLT; - r.max = NULLT; - return r; - }; - - - int min = EBVector->rank1(x-1); - int max = EBVector->rank1(x+2*subtree_size(Par, x)-2); - if (min==max) { // range is empty, no texts within the subtree of x - r.min = NULLT; - r.max = NULLT; - } - else { // the range is non-empty, there are texts within the subtree of x - r.min = min+1; - r.max = max; - } - return r; - + range r; + if (x == NULLT) { + r.min = NULLT; + r.max = NULLT; + return r; + }; + int min = EBVector->rank1(x-1); + int max = EBVector->rank1(x+2*subtree_size(Par, x)-2); + if (min==max) { // range is empty, no texts within the subtree of x + r.min = NULLT; + r.max = NULLT; + } + else { // the range is non-empty, there are texts within the subtree of x + r.min = min+1; + r.max = max; + } + return r; } // Parent(x): returns the parent node of node x. + treeNode XMLTree::Parent(treeNode x) { if (x == Root()) return NULLT; else - return parent(Par, x); + return parent(Par, x);; } // Child(x,i): returns the i-th child of node x, assuming it exists. @@ -430,50 +527,61 @@ treeNode XMLTree::Child(treeNode x, int i) } // FirstChild(x): returns the first child of node x, assuming it exists. Very fast in BP. + treeNode XMLTree::FirstChild(treeNode x) { NULLT_IF(x==NULLT); - return first_child(Par, x); + return fast_first_child(Par, x); } treeNode XMLTree::FirstElement(treeNode x) { NULLT_IF(x==NULLT); - treeNode fc = first_child(Par, x); - NULLT_IF(fc == NULLT); - TagType tag = Tag(fc); - if (tag != PCDATA_TAG_ID && tag != ATTRIBUTE_TAG_ID) - return fc; - else return next_sibling(Par,fc); - + x = fast_first_child(Par, x); + NULLT_IF(x == NULLT); + TagType tag = Tag(x); + if (tag == PCDATA_TAG_ID){ + x = x+2; + return (fast_inspect(Par,x)==OP)? x : NULLT; + } + else if (tag == ATTRIBUTE_TAG_ID){ + x = fast_next_sibling(Par,x); + if (x != NULLT && Tag(x) == PCDATA_TAG_ID){ + x = x+2; + return (fast_inspect(Par,x)==OP)? x : NULLT; + } + else return x; + }else return x; } treeNode XMLTree::NextElement(treeNode x) { - NULLT_IF(x==NULLT); - treeNode ns = next_sibling(Par, x); - NULLT_IF(ns == NULLT); - TagType tag = Tag(ns); - if (tag != PCDATA_TAG_ID && tag != ATTRIBUTE_TAG_ID) - return ns; - else return next_sibling(Par,ns); + NULLT_IF(x==NULLT); + x = fast_next_sibling(Par, x); + NULLT_IF(x == NULLT); + if (Tag(x) == PCDATA_TAG_ID){ + x = x+2; + return (fast_inspect(Par,x)==OP)? x : NULLT; + } + else return x; } // LastChild(x): returns the last child of node x. treeNode XMLTree::LastChild(treeNode x) { - NULLT_IF(x==NULLT || x == Root() || isleaf(Par,x)); - return find_open(Par, find_close(Par, x)-1); + NULLT_IF(NULLT || x == Root() || fast_isleaf(Par,x)); + return find_open(Par, fast_find_close(Par, x)-1); } - // NextSibling(x): returns the next sibling of node x, assuming it exists. treeNode XMLTree::NextSibling(treeNode x) { NULLT_IF(x==NULLT || x == Root() ); - return next_sibling(Par, x); + x = fast_find_close(Par,x)+1; + return (fast_inspect(Par,x) == CP ? NULLT : x); } + // PrevSibling(x): returns the previous sibling of node x, assuming it exists. treeNode XMLTree::PrevSibling(treeNode x) { @@ -487,85 +595,87 @@ treeNode XMLTree::PrevSibling(treeNode x) treeNode XMLTree::TaggedChild(treeNode x, TagType tag) { - NULLT_IF(x==NULLT || isleaf(Par,x)); - + NULLT_IF(x==NULLT || fast_isleaf(Par,x)); treeNode child; - child = first_child(Par, x); // starts at first child of node x - if (get_field(tags_fix,tags_blen,node2tagpos(child)) == tag) + child = fast_first_child(Par, x); // starts at first child of node x + if (Tag(child) == tag) return child; else - return TaggedFollSibling(child,tag); + return TaggedFollowingSibling(child,tag); } // TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or NULLT if there is none. -treeNode XMLTree::TaggedFollSibling(treeNode x, TagType tag) +treeNode XMLTree::TaggedFollowingSibling(treeNode x, TagType tag) { NULLT_IF(x==NULLT); - treeNode sibling = next_sibling(Par, x); + treeNode sibling = fast_next_sibling(Par, x); + TagType ctag; while (sibling != NULLT) { - if (get_field(tags_fix,tags_blen,node2tagpos(sibling)) == tag) // current sibling is labeled with tag of interest + ctag = Tag(sibling); + if (ctag == tag) // current sibling is labeled with tag of interest return sibling; - sibling = next_sibling(Par, sibling); // OK, let's try with the next sibling + sibling = fast_sibling(Par, sibling, ctag); // OK, let's try with the next sibling } return NULLT; // no such sibling was found } -treeNode XMLTree::SelectChild(treeNode x, std::unordered_set *tags) +treeNode XMLTree::SelectChild(treeNode x, TagIdSet *tags) { - NULLT_IF(x==NULLT || isleaf(Par,x)); + NULLT_IF(x==NULLT || fast_isleaf(Par,x)); int i; - treeNode child = first_child(Par, x); - TagType t = get_field(tags_fix, tags_blen, node2tagpos(child)); - std::unordered_set::const_iterator tagit = tags->find(t); - if (tagit != tags->end()) return child; - return SelectFollSibling(child,tags); + treeNode child = fast_first_child(Par, x); + TagType t; + while (child != NULLT) { + t = Tag(child); + if (tags->find(t) != tags->end()) return child; + child = fast_sibling(Par, child,t); + } + return NULLT; } -treeNode XMLTree::SelectFollSibling(treeNode x, std::unordered_set *tags) +treeNode XMLTree::SelectFollowingSibling(treeNode x, TagIdSet *tags) { NULLT_IF(x==NULLT); int i; TagType t; - treeNode sibling = next_sibling(Par, x); - std::unordered_set::const_iterator tagit; + treeNode sibling = fast_next_sibling(Par, x); while (sibling != NULLT) { - t = get_field(tags_fix, tags_blen, node2tagpos(sibling)); - tagit = tags->find(t); - if (tagit != tags->end()) return sibling; - sibling = next_sibling(Par, sibling); + t = Tag(sibling); + if (tags->find(t) != tags->end()) return sibling; + sibling = fast_sibling(Par, sibling,t); } return NULLT; } -// TaggedDesc(x,tag): returns the first node tagged tag with larger preorder than x and within +// TaggedDescendant(x,tag): returns the first node tagged tag with larger preorder than x and within // the subtree of x. Returns NULLT if there is none. -treeNode XMLTree::TaggedDesc(treeNode x, TagType tag) +treeNode XMLTree::TaggedDescendant(treeNode x, TagType tag) { - NULLT_IF(x==NULLT || isleaf(Par,x)); + NULLT_IF(x==NULLT || fast_isleaf(Par,x)); int s = (int) Tags->select_next(tag,node2tagpos(x)); NULLT_IF (s == -1); treeNode y = tagpos2node(s); // transforms the tag position into a node position - return (is_ancestor(Par,x,y) ? y : NULLT); + return (fast_is_ancestor(Par,x,y) ? y : NULLT); } -treeNode XMLTree::SelectDesc(treeNode x, std::unordered_set *tags) +treeNode XMLTree::SelectDescendant(treeNode x, TagIdSet *tags) { - NULLT_IF (x ==NULLT || isleaf(Par,x)); + NULLT_IF (x ==NULLT || fast_isleaf(Par,x)); int i; treeNode min = NULLT; - treeNode fc = first_child(Par,x); + treeNode fc = fast_first_child(Par,x); treeNode aux; - std::unordered_set::const_iterator tagit; + TagIdSet::const_iterator tagit; for (tagit = tags->begin(); tagit != tags->end(); tagit++) { - aux = TaggedDesc(x, (TagType) *tagit); + aux = TaggedDescendant(x, (TagType) *tagit); if (aux == fc) return fc; if (aux == NULLT) continue; if ((min == NULLT) || (aux < min)) min = aux; @@ -577,7 +687,7 @@ treeNode XMLTree::SelectDesc(treeNode x, std::unordered_set *tags) // TaggedPrec(x,tag): returns the first node tagged tag with smaller preorder than x and not an // ancestor of x. Returns NULLT if there is none. -treeNode XMLTree::TaggedPrec(treeNode x, TagType tag) +treeNode XMLTree::TaggedPreceding(treeNode x, TagType tag) { int r, s; treeNode node_s, root; @@ -586,7 +696,7 @@ treeNode XMLTree::TaggedPrec(treeNode x, TagType tag) s = (int)Tags->select(tag, r); root = root_node(Par); node_s = tagpos2node(s); - while (is_ancestor(Par, node_s, x) && (node_s!=root)) { // the one that we found is an ancestor of x + while (fast_is_ancestor(Par, node_s, x) && (node_s!=root)) { // the one that we found is an ancestor of x r--; if (r==0) return NULLT; // there is no such node s = (int)Tags->select(tag, r); // we should use select_prev instead when provided @@ -598,43 +708,52 @@ treeNode XMLTree::TaggedPrec(treeNode x, TagType tag) // TaggedFoll(x,tag): returns the first node tagged tag with larger preorder than x and not in // the subtree of x. Returns NULLT if there is none. -treeNode XMLTree::TaggedFoll(treeNode x, TagType tag) +treeNode XMLTree::TaggedFollowing(treeNode x, TagType tag) { - NULLT_IF (x ==NULLT || x == Root()); - - return tagpos2node(Tags->select_next(tag,find_close(Par, x))); + NULLT_IF (x ==NULLT || x == Root()); + return tagpos2node(Tags->select_next(tag,fast_find_close(Par, x))); } // TaggedFollBelow(x,tag,root): returns the first node tagged tag with larger preorder than x // and not in the subtree of x. Returns NULLT if there is none. -treeNode XMLTree::TaggedFollBelow(treeNode x, TagType tag, treeNode root) +treeNode XMLTree::TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor) +{ + NULLT_IF (x == NULLT || x == Root() || x == ancestor); + treeNode s = tagpos2node(Tags->select_next(tag, fast_find_close(Par, x))); + + if (ancestor == Root()) return s; + NULLT_IF (s == NULLT || s >= fast_find_close(Par, ancestor)); + + return s; +} + +treeNode XMLTree::TaggedFollowingBefore(treeNode x, TagType tag, treeNode closing) { NULLT_IF (x == NULLT || x == Root()); - treeNode s = tagpos2node(Tags->select_next(tag, find_close(Par, x))); - - if (root == Root()) return s; - NULLT_IF (s == NULLT || s >= find_close(Par, root)); + treeNode s = tagpos2node(Tags->select_next(tag, fast_find_close(Par, x))); + NULLT_IF (s == NULLT || s >= closing); return s; } /* Here we inline TaggedFoll to find the min globally, and only at the end we check if the min is below the context node */ -treeNode XMLTree::SelectFollBelow(treeNode x, std::unordered_set *tags, treeNode root) +treeNode XMLTree::SelectFollowingBelow(treeNode x, TagIdSet *tags, treeNode ancestor) { NULLT_IF(x==NULLT || x==Root()); int i; treeNode min = NULLT; - treeNode ns = next_sibling(Par, x); + treeNode ns = fast_next_sibling(Par, x); + treeNode close = ns - 1; treeNode aux; - std::unordered_set::const_iterator tagit; + TagIdSet::const_iterator tagit; for (tagit = tags->begin(); tagit != tags->end(); tagit++) { - aux = tagpos2node(Tags->select_next(*tagit, find_close(Par, x))); + aux = tagpos2node(Tags->select_next(*tagit, close)); // The next sibling of x is guaranteed to be below ctx // and is the node with lowest preorder which is after ctx. @@ -648,10 +767,41 @@ treeNode XMLTree::SelectFollBelow(treeNode x, std::unordered_set *tags, tre // found the smallest node in preorder which is after x. // if ctx is the root node, just return what we found. - if (root == Root()) return min; + if (ancestor == Root()) return min; // else check whether if is in below the ctx node - NULLT_IF (min == NULLT || min >= find_close(Par, root)); + NULLT_IF (min == NULLT || min >= fast_find_close(Par, ancestor)); + + return min; + + } +treeNode XMLTree::SelectFollowingBefore(treeNode x, TagIdSet *tags, treeNode closing) + { + + NULLT_IF(x==NULLT || x==Root()); + int i; + treeNode min = NULLT; + treeNode ns = fast_next_sibling(Par, x); + treeNode close = ns - 1; + treeNode aux; + TagIdSet::const_iterator tagit; + for (tagit = tags->begin(); tagit != tags->end(); tagit++) { + + aux = tagpos2node(Tags->select_next(*tagit, close)); + + // The next sibling of x is guaranteed to be below ctx + // and is the node with lowest preorder which is after ctx. + // if we find it, we return early; + + if (aux == ns ) return ns; + if (aux == NULLT) continue; + if ((min == NULLT) || (aux < min)) min = aux; + }; + + // found the smallest node in preorder which is after x. + // if ctx is the root node, just return what we found. + + NULLT_IF (min == NULLT || min >= closing); return min; @@ -667,7 +817,7 @@ treeNode XMLTree::TaggedAncestor(treeNode x, TagType tag) treeNode s = parent(Par, x), r = Root(); while (s != r) { - if (get_field(tags_fix,tags_blen,node2tagpos(s)) /*Tags->access(node2tagpos(s))*/ == tag) return s; + if (Tag(s) == tag) return s; s = parent(Par, s); } return NULLT; @@ -684,12 +834,19 @@ DocID XMLTree::MyText(treeNode x) // seems faster than testing EBVector->access(x); if (tag == PCDATA_TAG_ID || tag == ATTRIBUTE_DATA_TAG_ID) + //if (EBVector->access(x)) return (DocID) (EBVector->rank1(x)-1); //-1 because document ids start from 0 else return (DocID) NULLT; } - +// MyText(x): returns the document identifier of the text below node x, +// or NULLT if x is not a leaf node or the text is empty. Assumes Doc +// ids start from 0. +DocID XMLTree::MyTextUnsafe(treeNode x) +{ + return (DocID) (EBVector->rank1(x)-1); //-1 because document ids start from 0 +} // TextXMLId(d): returns the preorder of document with identifier d in the tree consisting of // all tree nodes and all text nodes. Assumes that the tree root has preorder 1. int XMLTree::TextXMLId(DocID d) @@ -736,7 +893,7 @@ unsigned char *XMLTree::GetTagName(TagType tagid) unsigned char *s; if ( tagid < 0 || tagid >= TagName->size()) return (unsigned char *) ""; - strcpy((char *)s, TagName->at(tagid).c_str()); + strcpy((char *)s, (*TagName)[tagid].c_str()); return (s == NULL ? (unsigned char*) "" : s); } @@ -749,7 +906,7 @@ const unsigned char *XMLTree::GetTagNameByRef(TagType tagid) if ( tagid < 0 || tagid >= TagName->size()) return (unsigned char *) ""; - return (const unsigned char *) TagName->at(tagid).c_str(); + return (const unsigned char *) (*TagName)[tagid].c_str(); } @@ -760,8 +917,7 @@ TagType XMLTree::RegisterTag(unsigned char *tagname) TagType id = XMLTree::GetTagId(tagname); if (id == NULLT) { string s = (char *) tagname; - REGISTER_TAG(TagName,tIdMap,s); - + REGISTER_TAG(TagName,tIdMap,s); }; return id; @@ -769,6 +925,107 @@ TagType XMLTree::RegisterTag(unsigned char *tagname) treeNode XMLTree::Closing(treeNode x) { - return find_close(Par,x); + return fast_find_close(Par,x); +} +bool XMLTree::IsOpen(treeNode x) { return fast_inspect(Par,x); } + +//WARNING this uses directly the underlying implementation for plain text + +void XMLTree::Print(int fd,treeNode x){ + + int newfd = dup(fd); + stream = fdopen(newfd,"wa"); + /* if (stream_fd != fd){ + if (stream != NULL) + fclose(stream); + int newfd = dup(fd); + stream = fdopen(newfd,"wa"); + stream_fd = fd; + }; + */ + + FILE* fp = stream; + treeNode fin = fast_find_close(Par,x); + treeNode n = x; + TagType tag = Tag(n); + uchar * tagstr; + range r = DocIds(x); + treeNode first_idx; + treeNode first_text = (tag == PCDATA_TAG_ID ? x : TaggedDescendant(x,PCDATA_TAG_ID)); + treeNode first_att = NULLT;//TaggedDesc(x,ATTRIBUTE_DATA_TAG_ID); + + if (first_att == NULLT) + first_idx = first_text; + else if (first_text == NULLT) + first_idx = first_att; + else + first_idx = min(first_att,first_text); + + uchar * current_text=NULL; + if (first_idx != NULLT) + current_text = GetText(MyText(first_idx)); + int read = 0; + + std::stack st; + while (n <= fin){ + if (fast_inspect(Par,n)){ + if (tag == PCDATA_TAG_ID) { + // fputs((const char*) (GetText(MyTextUnsafe(n))),fp); + + read = fprintf(fp,"%s",(const char*) current_text); + current_text += (read + 1); + + n+=2; // skip closing $ + tag = Tag(n); + } + else { + + fputc('<',fp); + tagstr = (uchar*) GetTagNameByRef(tag); + fputs((const char*) tagstr ,fp); + n++; + if (fast_inspect(Par,n)) { + st.push(tagstr); + tag = Tag(n); + if (tag == ATTRIBUTE_TAG_ID){ + n++; + while (fast_inspect(Par,n)){ + fputc(' ',fp); + fputs((const char*) &(GetTagNameByRef(Tag(n))[3]),fp); + fputs("=\"",fp); + n++; + read = fprintf(fp,"%s",(const char*) current_text); + current_text += (read + 1); + //fputs((const char*) GetText(MyTextUnsafe(n)),fp); + fputc('"',fp); + n+=3; //close @$ @@ + }; + fputc('>',fp); + n++; + tag=Tag(n); + } + else { + fputc('>',fp); + }; + } + else {// tag + fputs("/>",fp); + n++; + tag=Tag(n); + }; + }; + } + else + do { + fputs("', fp); + st.pop(); + n++; + }while (!fast_inspect(Par,n) && !st.empty()); + tag=Tag(n); + }; + fputc('\n',fp); + fflush(fp); + fclose(fp); } -bool XMLTree::IsOpen(treeNode x) { return inspect(Par,x); } diff --git a/XMLTree.h b/XMLTree.h index a62abfe..a3ac2ed 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -1,4 +1,3 @@ - /****************************************************************************** * Copyright (C) 2008 by Diego Arroyuelo * * Interface for the in-memory XQuery/XPath engine * @@ -34,7 +33,7 @@ #undef Wminusone #include "bp.h" - +#include #include #include #include @@ -84,7 +83,7 @@ typedef struct { #define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>" - +typedef std::unordered_set TagIdSet; typedef std::unordered_map TagIdMap; typedef TagIdMap::const_iterator TagIdMapIT; @@ -92,6 +91,10 @@ typedef TagIdMap::const_iterator TagIdMapIT; (v)->push_back(t); } while (false) +// returns NULLT if the test is true +#define NULLT_IF(x) do { if (x) return NULLT; } while (0) + + class XMLTreeBuilder; class XMLTree { @@ -121,6 +124,8 @@ class XMLTree { // Allows to disable the TextCollection for benchmarkin purposes bool disable_tc; + FILE* stream; + int stream_fd; /** Data structure constructors */ XMLTree(){;}; @@ -135,35 +140,47 @@ public: /** root(): returns the tree root. */ treeNode Root(); - + + /** Size() : Number of parenthesis */ + unsigned int Size(){ + return tags_len/2; + } + /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of * node x. */ int SubtreeSize(treeNode x); - + /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree * of node x. */ int SubtreeTags(treeNode x, TagType tag); + /** SubtreeElements(x) of element nodes in the subtree of x + */ + int SubtreeElements(treeNode x); + /** IsLeaf(x): returns whether node x is leaf or not. In the succinct * representation this is just a bit inspection. */ + bool IsLeaf(treeNode x); - + /** IsAncestor(x,y): returns whether node x is ancestor of node y. */ + bool IsAncestor(treeNode x, treeNode y); /** IsChild(x,y): returns whether node x is parent of node y. */ bool IsChild(treeNode x, treeNode y); /** IsFirstChild(x): returns whether node x is the first child of its parent. */ + /* OCAML */ bool IsFirstChild(treeNode x); - + /** NumChildren(x): number of children of node x. Constant time with the * data structure of Sadakane. */ int NumChildren(treeNode x); - + /** ChildNumber(x): returns i if node x is the i-th children of its * parent. */ - inline int ChildNumber(treeNode x); + int ChildNumber(treeNode x); /** Depth(x): depth of node x, a simple binary rank on the parentheses * sequence. */ @@ -176,35 +193,51 @@ public: /** Postorder(x): returns the postorder number of node x, just regarding * tree nodes (and not texts). */ int Postorder(treeNode x); - + /** Tag(x): returns the tag identifier of node x. */ - TagType Tag(treeNode x); - + TagType Tag(treeNode x) { + if (tags_blen == 8) + return (TagType) (((uchar*)tags_fix)[(int) x]); + else + return (TagType) get_field(tags_fix,tags_blen, (int) x); + } + /** DocIds(x): returns the range (i.e., a pair of integers) of document * identifiers that descend from node x. */ range DocIds(treeNode x); - + /** Parent(x): returns the parent node of node x. */ treeNode Parent(treeNode x); + /* Assumes x is neither 0 nor -1 */ /** Child(x,i): returns the i-th child of node x, assuming it exists. */ treeNode Child(treeNode x, int i); - - /** FirstChild(x): returns the first child of node x, assuming it exists. - * Very fast in BP. */ + + /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf + */ treeNode FirstChild(treeNode x); + + /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT + * if none. + */ treeNode FirstElement(treeNode x); /** LastChild(x): returns the last child of node x. */ treeNode LastChild(treeNode x); - /** NextSibling(x): returns the next sibling of node x, assuming it + /** NextSibling(x): returns the next sibling of node x, or NULLT if none * exists. */ + treeNode NextSibling(treeNode x); + + /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT + * if none. + */ treeNode NextElement(treeNode x); /** PrevSibling(x): returns the previous sibling of node x, assuming it * exists. */ + treeNode PrevSibling(treeNode x); /** TaggedChild(x,tag): returns the first child of node x tagged tag, or @@ -213,38 +246,38 @@ public: * among the children of node x until finding the desired child. */ treeNode TaggedChild(treeNode x, TagType tag); - treeNode SelectChild(treeNode x, std::unordered_set * tags); + treeNode SelectChild(treeNode x, TagIdSet * tags); - /** TaggedFollSibling(x,tag): returns the first sibling of node x tagged tag, or + /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or * NULLT if there is none. */ - treeNode TaggedFollSibling(treeNode x, TagType tag); + treeNode TaggedFollowingSibling(treeNode x, TagType tag); - treeNode SelectFollSibling(treeNode x, std::unordered_set * tags); + treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags); /** TaggedDesc(x,tag): returns the first node tagged tag with larger * preorder than x and within the subtree of x. Returns NULT if there * is none. */ - treeNode TaggedDesc(treeNode x, TagType tag); - - treeNode SelectDesc(treeNode x, std::unordered_set * tags); + treeNode TaggedDescendant(treeNode x, TagType tag); + treeNode SelectDescendant(treeNode x, TagIdSet * tags); /** TaggedPrec(x,tag): returns the first node tagged tag with smaller * preorder than x and not an ancestor of x. Returns NULLT if there * is none. */ - treeNode TaggedPrec(treeNode x, TagType tag); + treeNode TaggedPreceding(treeNode x, TagType tag); /** TaggedFoll(x,tag): returns the first node tagged tag with larger * preorder than x and not in the subtree of x. Returns NULLT if there * is none. */ - treeNode TaggedFoll(treeNode x, TagType tag); + treeNode TaggedFollowing(treeNode x, TagType tag); - treeNode TaggedFollBelow(treeNode x, TagType tag,treeNode root); - - treeNode SelectFollBelow(treeNode x, std::unordered_set * tags, treeNode root); + treeNode TaggedFollowingBelow(treeNode x, TagType tag,treeNode ancestor); - /** TaggedFollowingSibling(x,tag) */ - treeNode TaggedFollowingSibling(treeNode x, TagType tag); + treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor); + + treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing); + + treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode closing); /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged * tag. Return NULLT is there is none. */ @@ -261,7 +294,8 @@ public: /** MyText(x): returns the document identifier of the text below node x, or * NULLT if x is not a leaf node. */ DocID MyText(treeNode x); - + DocID MyTextUnsafe(treeNode x); + /** TextXMLId(d): returns the preorder of document with identifier d in the * tree consisting of all tree nodes and all text nodes. */ int TextXMLId(DocID d); @@ -311,7 +345,7 @@ public: } /** Equal(s): search for texts equal to string s. */ - TextCollection::document_result Equal(uchar const *s) { + TextCollection::document_result Equals(uchar const *s) { return Text->Equal(s); } @@ -387,8 +421,9 @@ public: /** GetText(d): returns the text corresponding to document with * id d. */ uchar* GetText(DocID d) { - uchar * s = Text->GetText(d); - return (s[0] == 1 ? (uchar*)"" : s); + + uchar * s = Text->GetText(d); + return (s[0] == 1 ? (uchar*)"" : s); } /** GetText(i, j): returns the texts corresponding to documents with @@ -407,15 +442,22 @@ public: /** Load: loads XML tree data structure from file. sample_rate_text * indicates the sample rate for the text search data structure. */ - static XMLTree *Load(int fd); + static XMLTree *Load(int fd,bool load_tc, int sample_factor); void insertTag(TagType tag, uint position); void print_stats(); + + /** Parenthesis functions */ treeNode Closing(treeNode x); + bool IsOpen(treeNode x); + + /** Print procedure */ + void Print(int fd,treeNode x); + }; #endif diff --git a/bp.h b/bp.h index 41ed602..9bfc291 100644 --- a/bp.h +++ b/bp.h @@ -132,7 +132,18 @@ void saveTree(bp *b, FILE *fp); void loadTree(bp *b, FILE *fp); void destroyTree(bp *b); -int blog(int x); + +inline int blog(int x) +{ + int l; + l = 0; + while (x>0) { + x>>=1; + l++; + } + return l; +} + pb getpat_preorder(pb *b); pb getpat_leaf(pb *b); pb getpat_inorder(pb *b);