From 912ff50e1d38de484b503d8ef877a49a65765ab9 Mon Sep 17 00:00:00 2001 From: kim Date: Mon, 12 Apr 2010 01:12:00 +0000 Subject: [PATCH] . git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/XMLTree@778 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- XMLTree.cpp | 226 +++++++++++++++++++++++++++------------------------- 1 file changed, 118 insertions(+), 108 deletions(-) diff --git a/XMLTree.cpp b/XMLTree.cpp index 8124f9b..174c94c 100644 --- a/XMLTree.cpp +++ b/XMLTree.cpp @@ -1,66 +1,8 @@ #include "basics.h" -#include -#include +//#include #include #include "XMLTree.h" -#include -#include -#include -#include -#include - -static double tLoading = 0; - -static unsigned int cLoading = 0; -static struct timeval tmpv1; -static struct timeval tmpv2; -static string mem1; -static string mem2; - -void read_procmem(string& memstr) { - std::string buf; - pid_t pid = getpid(); - std::stringstream path; - path << "/proc/" << pid << "/status"; - std::ifstream infile; - infile.open (path.str().c_str(), std::ifstream::in); - while (infile.good()){ - getline(infile,buf); - if (infile.eof()) { - memstr = "Could not read memory"; - return; - }; - int idx = buf.find("VmRSS"); - if (idx == 0){ - memstr = buf; - return; - }; - }; - memstr = "Could not read memory"; - return; - -} - -#define STARTTIMER() do { \ - gettimeofday(&tmpv1,NULL); \ - read_procmem(mem1); \ - } while (0) \ - -#define STOPTIMER(x) do { \ - read_procmem(mem2); \ - gettimeofday(&tmpv2,NULL); \ - (t##x) = ((tmpv2.tv_sec - tmpv1.tv_sec) * 1000000.0 + \ - (tmpv2.tv_usec - tmpv1.tv_usec))/1000.0; \ - (c##x)= (c##x)+1; \ - } while (0) - -#define PRINTTIME(s,x) do { \ - std::cerr << (s) << " : " << (t##x) << "ms" << std::endl; \ - std::cerr << "Mem use before: " << mem1 << std::endl; \ - std::cerr << "Mem use after: " << mem2 << std::endl; \ - std::cerr.flush(); \ - } while (0) \ - +#include "timings.h" // functions to convert tag positions to the corresponding tree node and viceversa. // These are implemented in order to be able to change the tree and Tags representations, @@ -69,23 +11,33 @@ void read_procmem(string& memstr) { // the tree, and storing 2 tags per tree node (opening and closing tags). // tag position -> tree node -inline treeNode tagpos2node(int t) +static treeNode tagpos2node(int t) { return (treeNode) t; } +static int bits8 (int t ) { + int r = bits(t); + if (r <= 8) + return 8; + else if (r <= 16) + return 16; + else + return r; +} + // tree node -> tag position -inline int node2tagpos(treeNode x) +static int node2tagpos(treeNode x) { return (int)x; } -int fast_find_close(bp *b,int s) +static int fast_find_close(bp *b,int s) { return fwd_excess(b,s,-1); } -inline int fast_inspect(bp* Par,treeNode i) +static int fast_inspect(bp* Par,treeNode i) { int j,l; j = i >> logD; @@ -93,20 +45,20 @@ inline int fast_inspect(bp* Par,treeNode i) return (Par->B[j] >> (D-1-l)) & 1; } -inline treeNode fast_first_child(bp *Par, treeNode x) +static treeNode fast_first_child(bp *Par, treeNode x) { x = x+1; - return (fast_inspect(Par,x) == CP) ? NULLT : x; + return (fast_inspect(Par,x) == OP) ? x : NULLT; } -inline treeNode fast_next_sibling(bp* Par,treeNode x) +static treeNode fast_next_sibling(bp* Par,treeNode x) { - x = fast_find_close(Par,x)+1; - return (fast_inspect(Par,x) == CP) ? NULLT : x; + x = fwd_excess(Par,x,0); + return (fast_inspect(Par,x) == OP) ? x : NULLT; } -inline treeNode fast_sibling(bp* Par,treeNode x,TagType tag){ +static treeNode fast_sibling(bp* Par,treeNode x,TagType tag){ if (tag == PCDATA_TAG_ID){ x = x+2; @@ -115,28 +67,39 @@ inline treeNode fast_sibling(bp* Par,treeNode x,TagType tag){ } -inline bool fast_isleaf(bp* Par,treeNode x){ +static bool fast_isleaf(bp* Par,treeNode x){ return (fast_inspect(Par,x+1) == CP ? true : false); } -inline uint fast_get_field(uint* A,int len, int idx) + +inline uint get_field_no_power(uint *A, uint len, uint index) { + + register uint i=index*len/W, j=index*len-W*i; + return (j+len <= W) ? (A[i] << (W-j-len)) >> (W-len) : (A[i] >> j) | (A[i+1] << (WW-j-len)) >> (W-len); + +} + +static uint fast_get_field(uint* A,int len, int idx) { - switch(len){ + uint f1, f2; + switch (len) { case 8: - return (uint) (((uchar*)A)[idx]); - // Other 8-alligned values need to take care of the endianess of the arch. - default: - return get_field (A,len,idx); + return (uint) (((uchar*)A)[idx]); + case 16: + f2 = ((unsigned short*)A)[idx]; + f1 = ((unsigned short*)A)[idx+1]; + return (f1 << 16) + f2; + default: + return get_field_no_power (A,len,idx); }; } inline bool fast_is_ancestor(bp * Par,treeNode x,treeNode y){ - if (x > y) return false; else - return (y <= fast_find_close(Par,x)); + return (x==0) || (y <= fast_find_close(Par,x)); } @@ -146,7 +109,12 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM { // creates the data structure for the tree topology Par = (bp *)umalloc(sizeof(bp)); - bp_construct(Par, npar, (pb*) par, OPT_DEGREE|0); + STARTTIMER(); + bp_construct(Par, npar, (pb*) par, OPT_DEGREE|0); + STOPTIMER(Building); + PRINTTIME("Building parenthesis struct", Building); + STARTTIMER(); + // creates structure for tags @@ -160,20 +128,22 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM alphabet_mapper *am = new alphabet_mapper_none(); Tags = new static_sequence_bs((uint*)tags,npar,am,bmb); - cout << "Tags test: " << Tags->test((uint*)tags,npar) << endl; + //cout << "Tags test: " << Tags->test((uint*)tags,npar) << endl; //Ensures that for small tag numbers, we are on an 8bit boundary. //Makes tag access way faster with negligeable waste of space. - tags_blen = max(bits(max_tag),8); + tags_blen = bits8(max_tag); std::cerr << "Tags blen is " << tags_blen << "\n"; tags_len = (uint)npar; tags_fix = new uint[uint_len(tags_blen,tags_len)]; for(uint i=0;i<(uint)npar;i++) set_field(tags_fix,tags_blen,i,tags[i]); - delete bmb; free(tags); tags = NULL; + + STOPTIMER(Building); + PRINTTIME("Building Tag Structure", Building); Text = (TextCollection*) TC; @@ -187,6 +157,8 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM disable_tc = dis_tc; stream = NULL; stream_fd = 0; + std::cerr << "Number of distinct tags " << TagName->size() << "\n"; + //std::cerr.flush(); } @@ -338,6 +310,7 @@ XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor) /// End ugly tests STOPTIMER(Loading); + std::cerr << (uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)*sizeof(uint))/(1024*1024) << " MB for tag sequence" << std::endl; PRINTTIME("Loading tag struct", Loading); STARTTIMER(); @@ -375,11 +348,6 @@ XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor) } -// root(): returns the tree root. -inline treeNode XMLTree::Root() - { - return 0; //root_node(Par); - } // SubtreeSize(x): the number of nodes (and attributes) in the subtree of node x. int XMLTree::SubtreeSize(treeNode x) @@ -539,19 +507,22 @@ treeNode XMLTree::FirstElement(treeNode x) NULLT_IF(x==NULLT); x = fast_first_child(Par, x); NULLT_IF(x == NULLT); - TagType tag = Tag(x); - if (tag == PCDATA_TAG_ID){ + switch (Tag(x)){ + + case PCDATA_TAG_ID: x = x+2; return (fast_inspect(Par,x)==OP)? x : NULLT; - } - else if (tag == ATTRIBUTE_TAG_ID){ + + case ATTRIBUTE_TAG_ID: x = fast_next_sibling(Par,x); if (x != NULLT && Tag(x) == PCDATA_TAG_ID){ x = x+2; return (fast_inspect(Par,x)==OP)? x : NULLT; } - else return x; - }else return x; + else return x; + default: + return x; + } } treeNode XMLTree::NextElement(treeNode x) @@ -565,11 +536,27 @@ treeNode XMLTree::NextElement(treeNode x) } else return x; } +value XMLTree::CamlFirstElement(value x) +{ + return Val_int(FirstElement(Int_val(x))); +} +value XMLTree::CamlNextElement(value x) +{ + return Val_int(NextElement(Int_val(x))); +} + +extern "C" value caml_cpp_fast_first_element(value xmltree, value node){ + return XMLTREE(xmltree)->CamlFirstElement(node); +} + +extern "C" value caml_cpp_fast_next_element(value xmltree, value node){ + return XMLTREE(xmltree)->CamlNextElement(node); +} // LastChild(x): returns the last child of node x. treeNode XMLTree::LastChild(treeNode x) { - NULLT_IF(NULLT || x == Root() || fast_isleaf(Par,x)); + NULLT_IF(x == NULLT || fast_isleaf(Par,x)); return find_open(Par, fast_find_close(Par, x)-1); } @@ -585,7 +572,7 @@ treeNode XMLTree::NextSibling(treeNode x) // PrevSibling(x): returns the previous sibling of node x, assuming it exists. treeNode XMLTree::PrevSibling(treeNode x) { - NULLT_IF(x==NULLT || x == Root()); + NULLT_IF(x==NULLT); return prev_sibling(Par, x); } @@ -655,7 +642,7 @@ treeNode XMLTree::SelectFollowingSibling(treeNode x, TagIdSet *tags) // the subtree of x. Returns NULLT if there is none. treeNode XMLTree::TaggedDescendant(treeNode x, TagType tag) { - NULLT_IF(x==NULLT || fast_isleaf(Par,x)); + //NULLT_IF(x==NULLT || fast_isleaf(Par,x)); int s = (int) Tags->select_next(tag,node2tagpos(x)); NULLT_IF (s == -1); @@ -719,13 +706,32 @@ treeNode XMLTree::TaggedFollowing(treeNode x, TagType tag) // and not in the subtree of x. Returns NULLT if there is none. treeNode XMLTree::TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor) { - NULLT_IF (x == NULLT || x == Root() || x == ancestor); - treeNode s = tagpos2node(Tags->select_next(tag, fast_find_close(Par, x))); - - if (ancestor == Root()) return s; - NULLT_IF (s == NULLT || s >= fast_find_close(Par, ancestor)); + // NULLT_IF (x == NULLT || x == Root() || x == ancestor); + + //Special optimisation, test for the following sibling first + treeNode close = fast_find_close(Par, x); + /* + treeNode ns = close+1; + if (fast_inspect(Par,ns) == OP) { + TagType tagns = Tag(ns); + // cout << GetTagNameByRef(tagns) << endl; + //cout.flush(); + if (tagns == PCDATA_TAG_ID){ + close = ns+1; + ns = ns+2; + if (fast_inspect(Par,ns) != OP) + goto after; + tagns = Tag(ns); + }; + if (tagns == tag) + return ns; + }; + after: + */ + treeNode s = tagpos2node(Tags->select_next(tag, close)); - return s; + if (ancestor == Root() || s==NULLT || s < fast_find_close(Par,ancestor)) return s; + else return NULLT; } treeNode XMLTree::TaggedFollowingBefore(treeNode x, TagType tag, treeNode closing) @@ -745,11 +751,17 @@ treeNode XMLTree::SelectFollowingBelow(treeNode x, TagIdSet *tags, treeNode ance { NULLT_IF(x==NULLT || x==Root()); + + treeNode close = fast_find_close(Par,x); + treeNode ns = close+1; + if ( (fast_inspect(Par,ns) == OP) && (tags->find(Tag(ns)) != tags->end())) + return ns; + int i; treeNode min = NULLT; - treeNode ns = fast_next_sibling(Par, x); - treeNode close = ns - 1; treeNode aux; + + TagIdSet::const_iterator tagit; for (tagit = tags->begin(); tagit != tags->end(); tagit++) { @@ -757,9 +769,7 @@ treeNode XMLTree::SelectFollowingBelow(treeNode x, TagIdSet *tags, treeNode ance // The next sibling of x is guaranteed to be below ctx // and is the node with lowest preorder which is after ctx. - // if we find it, we return early; - - if (aux == ns ) return ns; + // if we find it, we return early; if (aux == NULLT) continue; if ((min == NULLT) || (aux < min)) min = aux; }; -- 2.17.1