X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTree.cpp;h=c87c6dc008443bb723a450f7ecef9aff560598a3;hb=f97501e008660c0363f0fe643be09de66efd3533;hp=8124f9b0082f84d39645831b795ffbc1b7cfc572;hpb=a4311418afa11f8e4dbcb3d0a5cc8354ff530efc;p=SXSI%2FXMLTree.git diff --git a/XMLTree.cpp b/XMLTree.cpp index 8124f9b..c87c6dc 100644 --- a/XMLTree.cpp +++ b/XMLTree.cpp @@ -1,66 +1,11 @@ #include "basics.h" -#include -#include -#include #include "XMLTree.h" -#include -#include -#include -#include -#include - -static double tLoading = 0; - -static unsigned int cLoading = 0; -static struct timeval tmpv1; -static struct timeval tmpv2; -static string mem1; -static string mem2; - -void read_procmem(string& memstr) { - std::string buf; - pid_t pid = getpid(); - std::stringstream path; - path << "/proc/" << pid << "/status"; - std::ifstream infile; - infile.open (path.str().c_str(), std::ifstream::in); - while (infile.good()){ - getline(infile,buf); - if (infile.eof()) { - memstr = "Could not read memory"; - return; - }; - int idx = buf.find("VmRSS"); - if (idx == 0){ - memstr = buf; - return; - }; - }; - memstr = "Could not read memory"; - return; - -} - -#define STARTTIMER() do { \ - gettimeofday(&tmpv1,NULL); \ - read_procmem(mem1); \ - } while (0) \ - -#define STOPTIMER(x) do { \ - read_procmem(mem2); \ - gettimeofday(&tmpv2,NULL); \ - (t##x) = ((tmpv2.tv_sec - tmpv1.tv_sec) * 1000000.0 + \ - (tmpv2.tv_usec - tmpv1.tv_usec))/1000.0; \ - (c##x)= (c##x)+1; \ - } while (0) - -#define PRINTTIME(s,x) do { \ - std::cerr << (s) << " : " << (t##x) << "ms" << std::endl; \ - std::cerr << "Mem use before: " << mem1 << std::endl; \ - std::cerr << "Mem use after: " << mem2 << std::endl; \ - std::cerr.flush(); \ - } while (0) \ - +#include "timings.h" +#include +using std::cout; +using std::endl; +using std::min; +using std::string; // functions to convert tag positions to the corresponding tree node and viceversa. // These are implemented in order to be able to change the tree and Tags representations, @@ -69,23 +14,33 @@ void read_procmem(string& memstr) { // the tree, and storing 2 tags per tree node (opening and closing tags). // tag position -> tree node -inline treeNode tagpos2node(int t) +static treeNode tagpos2node(int t) { return (treeNode) t; } +static int bits8 (int t ) { + int r = bits(t); + if (r <= 8) + return 8; + else if (r <= 16) + return 16; + else + return r; +} + // tree node -> tag position -inline int node2tagpos(treeNode x) +static int node2tagpos(treeNode x) { return (int)x; } -int fast_find_close(bp *b,int s) +static int fast_find_close(bp *b,int s) { return fwd_excess(b,s,-1); } -inline int fast_inspect(bp* Par,treeNode i) +static int fast_inspect(bp* Par,treeNode i) { int j,l; j = i >> logD; @@ -93,20 +48,20 @@ inline int fast_inspect(bp* Par,treeNode i) return (Par->B[j] >> (D-1-l)) & 1; } -inline treeNode fast_first_child(bp *Par, treeNode x) +static treeNode fast_first_child(bp *Par, treeNode x) { x = x+1; - return (fast_inspect(Par,x) == CP) ? NULLT : x; + return (fast_inspect(Par,x) == OP) ? x : NULLT; } -inline treeNode fast_next_sibling(bp* Par,treeNode x) +static treeNode fast_next_sibling(bp* Par,treeNode x) { x = fast_find_close(Par,x)+1; - return (fast_inspect(Par,x) == CP) ? NULLT : x; + return (fast_inspect(Par,x) == OP) ? x : NULLT; } -inline treeNode fast_sibling(bp* Par,treeNode x,TagType tag){ +static treeNode fast_sibling(bp* Par,treeNode x,TagType tag){ if (tag == PCDATA_TAG_ID){ x = x+2; @@ -115,28 +70,39 @@ inline treeNode fast_sibling(bp* Par,treeNode x,TagType tag){ } -inline bool fast_isleaf(bp* Par,treeNode x){ +static bool fast_isleaf(bp* Par,treeNode x){ return (fast_inspect(Par,x+1) == CP ? true : false); } -inline uint fast_get_field(uint* A,int len, int idx) + +inline uint get_field_no_power(uint *A, uint len, uint index) { + + register uint i=index*len/W, j=index*len-W*i; + return (j+len <= W) ? (A[i] << (W-j-len)) >> (W-len) : (A[i] >> j) | (A[i+1] << (WW-j-len)) >> (W-len); + +} + +static uint fast_get_field(uint* A,int len, int idx) { - switch(len){ + uint f1, f2; + switch (len) { case 8: - return (uint) (((uchar*)A)[idx]); - // Other 8-alligned values need to take care of the endianess of the arch. - default: - return get_field (A,len,idx); + return (uint) (((uchar*)A)[idx]); + case 16: + f2 = ((unsigned short*)A)[idx]; + f1 = ((unsigned short*)A)[idx+1]; + return (f1 << 16) + f2; + default: + return get_field_no_power (A,len,idx); }; } inline bool fast_is_ancestor(bp * Par,treeNode x,treeNode y){ - if (x > y) return false; else - return (y <= fast_find_close(Par,x)); + return (x==0) || (y <= fast_find_close(Par,x)); } @@ -144,9 +110,15 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM uint *empty_texts_bmp, TagType *tags, TextCollection * const TC, bool dis_tc) { + buffer = 0; // creates the data structure for the tree topology Par = (bp *)umalloc(sizeof(bp)); - bp_construct(Par, npar, (pb*) par, OPT_DEGREE|0); + STARTTIMER(); + bp_construct(Par, npar, (pb*) par, OPT_DEGREE|0); + STOPTIMER(Building); + PRINTTIME("Building parenthesis struct", Building); + STARTTIMER(); + // creates structure for tags @@ -160,20 +132,22 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM alphabet_mapper *am = new alphabet_mapper_none(); Tags = new static_sequence_bs((uint*)tags,npar,am,bmb); - cout << "Tags test: " << Tags->test((uint*)tags,npar) << endl; + //cout << "Tags test: " << Tags->test((uint*)tags,npar) << endl; //Ensures that for small tag numbers, we are on an 8bit boundary. //Makes tag access way faster with negligeable waste of space. - tags_blen = max(bits(max_tag),8); + tags_blen = bits8(max_tag); std::cerr << "Tags blen is " << tags_blen << "\n"; tags_len = (uint)npar; tags_fix = new uint[uint_len(tags_blen,tags_len)]; for(uint i=0;i<(uint)npar;i++) set_field(tags_fix,tags_blen,i,tags[i]); - delete bmb; free(tags); tags = NULL; + + STOPTIMER(Building); + PRINTTIME("Building Tag Structure", Building); Text = (TextCollection*) TC; @@ -187,6 +161,8 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM disable_tc = dis_tc; stream = NULL; stream_fd = 0; + std::cerr << "Number of distinct tags " << TagName->size() << "\n"; + //std::cerr.flush(); } @@ -276,12 +252,13 @@ void XMLTree::Save(int fd) // a pointer to the loaded data structure XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor) { + FILE *fp; char buffer[1024]; XMLTree *XML_Tree; int i; - + buffer[1023] = '\0'; fp = fdopen(fd, "r"); @@ -295,23 +272,21 @@ XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor) PRINTTIME("Loading parenthesis struct", Loading); STARTTIMER(); - XML_Tree->TagName = new vector(); - XML_Tree->tIdMap = new std::unordered_map(); - - string s; + XML_Tree->TagName = new std::vector(); + XML_Tree->tIdMap = new std::unordered_map(); + std::string s; int ntags; // Load the tag names ufread(&ntags, sizeof(int), 1, fp); for (i=0; iTagName->push_back(s); + XML_Tree->TagName->push_back(s); XML_Tree->tIdMap->insert(std::make_pair(s,i)); }; @@ -338,6 +313,7 @@ XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor) /// End ugly tests STOPTIMER(Loading); + std::cerr << (uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)*sizeof(uint))/(1024*1024) << " MB for tag sequence" << std::endl; PRINTTIME("Loading tag struct", Loading); STARTTIMER(); @@ -375,11 +351,6 @@ XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor) } -// root(): returns the tree root. -inline treeNode XMLTree::Root() - { - return 0; //root_node(Par); - } // SubtreeSize(x): the number of nodes (and attributes) in the subtree of node x. int XMLTree::SubtreeSize(treeNode x) @@ -396,7 +367,7 @@ int XMLTree::SubtreeTags(treeNode x, TagType tag) int s = x + 2*subtree_size(Par, x) - 1; - return Tags->rank(tag, s) - Tags->rank(tag, node2tagpos(x)-1); + return (Tags->rank(tag, s) - Tags->rank(tag, node2tagpos(x)-1))+1; } int XMLTree::SubtreeElements(treeNode x) { @@ -539,19 +510,22 @@ treeNode XMLTree::FirstElement(treeNode x) NULLT_IF(x==NULLT); x = fast_first_child(Par, x); NULLT_IF(x == NULLT); - TagType tag = Tag(x); - if (tag == PCDATA_TAG_ID){ + switch (Tag(x)){ + + case PCDATA_TAG_ID: x = x+2; return (fast_inspect(Par,x)==OP)? x : NULLT; - } - else if (tag == ATTRIBUTE_TAG_ID){ + + case ATTRIBUTE_TAG_ID: x = fast_next_sibling(Par,x); if (x != NULLT && Tag(x) == PCDATA_TAG_ID){ x = x+2; return (fast_inspect(Par,x)==OP)? x : NULLT; } - else return x; - }else return x; + else return x; + default: + return x; + } } treeNode XMLTree::NextElement(treeNode x) @@ -569,7 +543,7 @@ treeNode XMLTree::NextElement(treeNode x) // LastChild(x): returns the last child of node x. treeNode XMLTree::LastChild(treeNode x) { - NULLT_IF(NULLT || x == Root() || fast_isleaf(Par,x)); + NULLT_IF(x == NULLT || fast_isleaf(Par,x)); return find_open(Par, fast_find_close(Par, x)-1); } @@ -585,7 +559,7 @@ treeNode XMLTree::NextSibling(treeNode x) // PrevSibling(x): returns the previous sibling of node x, assuming it exists. treeNode XMLTree::PrevSibling(treeNode x) { - NULLT_IF(x==NULLT || x == Root()); + NULLT_IF(x==NULLT); return prev_sibling(Par, x); } @@ -655,7 +629,7 @@ treeNode XMLTree::SelectFollowingSibling(treeNode x, TagIdSet *tags) // the subtree of x. Returns NULLT if there is none. treeNode XMLTree::TaggedDescendant(treeNode x, TagType tag) { - NULLT_IF(x==NULLT || fast_isleaf(Par,x)); + //NULLT_IF(x==NULLT || fast_isleaf(Par,x)); int s = (int) Tags->select_next(tag,node2tagpos(x)); NULLT_IF (s == -1); @@ -719,13 +693,32 @@ treeNode XMLTree::TaggedFollowing(treeNode x, TagType tag) // and not in the subtree of x. Returns NULLT if there is none. treeNode XMLTree::TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor) { - NULLT_IF (x == NULLT || x == Root() || x == ancestor); - treeNode s = tagpos2node(Tags->select_next(tag, fast_find_close(Par, x))); - - if (ancestor == Root()) return s; - NULLT_IF (s == NULLT || s >= fast_find_close(Par, ancestor)); + // NULLT_IF (x == NULLT || x == Root() || x == ancestor); + + //Special optimisation, test for the following sibling first + treeNode close = fast_find_close(Par, x); + /* + treeNode ns = close+1; + if (fast_inspect(Par,ns) == OP) { + TagType tagns = Tag(ns); + // cout << GetTagNameByRef(tagns) << endl; + //cout.flush(); + if (tagns == PCDATA_TAG_ID){ + close = ns+1; + ns = ns+2; + if (fast_inspect(Par,ns) != OP) + goto after; + tagns = Tag(ns); + }; + if (tagns == tag) + return ns; + }; + after: + */ + treeNode s = tagpos2node(Tags->select_next(tag, close)); - return s; + if (ancestor == Root() || s==NULLT || s < fast_find_close(Par,ancestor)) return s; + else return NULLT; } treeNode XMLTree::TaggedFollowingBefore(treeNode x, TagType tag, treeNode closing) @@ -745,11 +738,17 @@ treeNode XMLTree::SelectFollowingBelow(treeNode x, TagIdSet *tags, treeNode ance { NULLT_IF(x==NULLT || x==Root()); + + treeNode close = fast_find_close(Par,x); + treeNode ns = close+1; + if ( (fast_inspect(Par,ns) == OP) && (tags->find(Tag(ns)) != tags->end())) + return ns; + int i; treeNode min = NULLT; - treeNode ns = fast_next_sibling(Par, x); - treeNode close = ns - 1; treeNode aux; + + TagIdSet::const_iterator tagit; for (tagit = tags->begin(); tagit != tags->end(); tagit++) { @@ -757,9 +756,7 @@ treeNode XMLTree::SelectFollowingBelow(treeNode x, TagIdSet *tags, treeNode ance // The next sibling of x is guaranteed to be below ctx // and is the node with lowest preorder which is after ctx. - // if we find it, we return early; - - if (aux == ns ) return ns; + // if we find it, we return early; if (aux == NULLT) continue; if ((min == NULLT) || (aux < min)) min = aux; }; @@ -931,18 +928,18 @@ bool XMLTree::IsOpen(treeNode x) { return fast_inspect(Par,x); } //WARNING this uses directly the underlying implementation for plain text -void XMLTree::Print(int fd,treeNode x){ + +void XMLTree::Print(int fd,treeNode x, bool no_text){ int newfd = dup(fd); stream = fdopen(newfd,"wa"); - /* if (stream_fd != fd){ - if (stream != NULL) - fclose(stream); - int newfd = dup(fd); - stream = fdopen(newfd,"wa"); - stream_fd = fd; - }; - */ + if (stream == 0){ + perror(NULL); + return; + }; + + if (buffer == 0) + buffer = new string(); FILE* fp = stream; treeNode fin = fast_find_close(Par,x); @@ -951,8 +948,8 @@ void XMLTree::Print(int fd,treeNode x){ uchar * tagstr; range r = DocIds(x); treeNode first_idx; - treeNode first_text = (tag == PCDATA_TAG_ID ? x : TaggedDescendant(x,PCDATA_TAG_ID)); - treeNode first_att = NULLT;//TaggedDesc(x,ATTRIBUTE_DATA_TAG_ID); + treeNode first_text = (tag == PCDATA_TAG_ID ? x : ParentNode(r.min-1)); + treeNode first_att = NULLT; if (first_att == NULLT) first_idx = first_text; @@ -964,52 +961,66 @@ void XMLTree::Print(int fd,treeNode x){ uchar * current_text=NULL; if (first_idx != NULLT) current_text = GetText(MyText(first_idx)); - int read = 0; - - std::stack st; + size_t read = 0; + std::vector st; while (n <= fin){ if (fast_inspect(Par,n)){ - if (tag == PCDATA_TAG_ID) { - // fputs((const char*) (GetText(MyTextUnsafe(n))),fp); - - read = fprintf(fp,"%s",(const char*) current_text); - current_text += (read + 1); - + if (tag == PCDATA_TAG_ID ) { + + if (no_text) + myfputs("<$/>",fp); + else{ + read = myfprintf((const char*) current_text, fp); + current_text += (read + 1); + }; n+=2; // skip closing $ tag = Tag(n); + } else { - - fputc('<',fp); + myfputc('<',fp); tagstr = (uchar*) GetTagNameByRef(tag); - fputs((const char*) tagstr ,fp); + myfputs((const char*) tagstr ,fp); n++; if (fast_inspect(Par,n)) { - st.push(tagstr); + st.push_back(tagstr); tag = Tag(n); if (tag == ATTRIBUTE_TAG_ID){ n++; + if (no_text) myfputs("><@@>",fp); while (fast_inspect(Par,n)){ - fputc(' ',fp); - fputs((const char*) &(GetTagNameByRef(Tag(n))[3]),fp); - fputs("=\"",fp); - n++; - read = fprintf(fp,"%s",(const char*) current_text); - current_text += (read + 1); - //fputs((const char*) GetText(MyTextUnsafe(n)),fp); - fputc('"',fp); - n+=3; //close @$ @@ + if (no_text) { + myfputc('<',fp); + myfputs((const char*) &(GetTagNameByRef(Tag(n))[3]),fp); + myfputc('>',fp); + myfputs("<$@/>',fp); + n+= 4; + } + else { + myfputc(' ',fp); + myfputs((const char*) &(GetTagNameByRef(Tag(n))[3]),fp); + n++; + myfputs("=\"",fp); + read = myfprintf((const char*) current_text,fp); + current_text += (read + 1); + myfputc('"',fp); + n+=3; + } }; - fputc('>',fp); + if (no_text) + myfputs("",fp); + else myfputc('>',fp); n++; tag=Tag(n); } else { - fputc('>',fp); + myfputc('>',fp); }; } else {// tag - fputs("/>",fp); + myfputs("/>",fp); n++; tag=Tag(n); }; @@ -1017,15 +1028,16 @@ void XMLTree::Print(int fd,treeNode x){ } else do { - fputs("', fp); - st.pop(); + myfputs("', fp); + st.pop_back(); n++; }while (!fast_inspect(Par,n) && !st.empty()); tag=Tag(n); }; - fputc('\n',fp); - fflush(fp); + myfputc('\n',fp); + mybufferflush(fp); + //fflush(fp); fclose(fp); }