From: kim Date: Tue, 28 Apr 2009 01:02:46 +0000 (+0000) Subject: Initial merge of Diego's cleaned up XMLTree class. X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FXMLTree.git;a=commitdiff_plain;h=aa6692a9fd2badf8e8e686b92075f041dc03bbef Initial merge of Diego's cleaned up XMLTree class. git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/XMLTree@363 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/XMLTree.cpp b/XMLTree.cpp index bd728af..ffdf5d0 100644 --- a/XMLTree.cpp +++ b/XMLTree.cpp @@ -1,4 +1,5 @@ #include "XMLTree.h" +#include "basics.h" #include // functions to convert tag positions to the corresponding tree node and viceversa. @@ -8,79 +9,117 @@ // the tree, and storing 2 tags per tree node (opening and closing tags). // tag position -> tree node -inline treeNode tagpos2node(int t) { - return (treeNode)t; -} +inline treeNode tagpos2node(int t) + { + return (treeNode)t; + } // tree node -> tag position -inline int node2tagpos(treeNode x) { - return (int)x; -} +inline int node2tagpos(treeNode x) + { + return (int)x; + } -//KIM OJO to prevent suprious "unused result" warnings +XMLTree::XMLTree(pb *par, uint npar, unsigned char **TN, uint ntagnm, uint *empty_texts_bmp, TagType *tags, + TextCollection *TC, vector CT, bool indexing_empty_t, bool dis_tc) + { + // creates the data structure for the tree topology + Par = (bp *)umalloc(sizeof(bp)); + bp_construct(Par, npar, par, OPT_DEGREE|0); -inline void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream){ - size_t res; - res = fread(ptr,size,nmemb,stream); - if (res < nmemb) - throw "ufread I/O error"; - return; -} + // creates structure for tags -inline void ufwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream){ - size_t res; - res = fwrite(ptr,size,nmemb,stream); - if (res < nmemb) - throw "ufwrite I/O error"; - return; -} + // If we found an attribute then "<@>" is present in the tree + // if we didn't then it is not. "<$>" is never present in the tree + TagName = TN; + ntagnames = ntagnm; + uint max_tag = 0; + for(uint i=0;i<(uint)npar-1;i++) + max_tag = max(max_tag,tags[i]); + int ntagsize = 2*ntagnames + 2; -// OJO to fail cleanly while doing a realloc -// if we can't realloc we are pretty much screwed anyway but -// it makes the code clearer to not have a bunch of if (!ptr) { printf("..."); exit(1); }; -inline void * urealloc(void *ptr, size_t size){ + static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray(); + alphabet_mapper *am = new alphabet_mapper_none(); + Tags = new static_sequence_bs((uint*)tags,npar,am,bmb); + + cout << "Tags test: " << Tags->test((uint*)tags,npar) << endl; - void * dest = realloc(ptr,size); - //don't fail if we requested size 0 - if (dest == NULL && size > 0 ) - throw std::bad_alloc(); - return dest; + tags_blen = bits(max_tag); + tags_len = (uint)npar; + tags_fix = new uint[uint_len(tags_blen,tags_len)]; + for(uint i=0;i<(uint)npar;i++) + set_field(tags_fix,tags_blen,i,tags[i]); -} + delete bmb; + free(tags); + tags = NULL; + + Text = TC; -inline void * ucalloc(size_t nmemb, size_t size){ + CachedText = CT; - void * dest = calloc(nmemb,size); - //don't fail if we requested size 0 - if (dest == NULL && nmemb > 0 && size > 0 ) - throw std::bad_alloc(); - return dest; + // creates the data structure marking the non-empty texts (just in the case it is necessary) + indexing_empty_texts = indexing_empty_t; + if (!indexing_empty_t) { + EBVector = new static_bitsequence_rrr02((uint *)empty_texts_bmp,(ulong)npar,(uint)32); + free(empty_texts_bmp); + empty_texts_bmp = NULL; + } -} + TagArray = new TagArrayEntry[ntagnames]; + for (uint i=0; i 0) - throw std::bad_alloc(); - return dest; -} -void XMLTree::print_stats() { - uint total_space = Tags->size()+sizeof(static_sequence*); - total_space += sizeof(uint*)+sizeof(uint)*(2+uint_len(tags_blen,tags_len)); - cout << "Space usage for XMLTree:" << endl - << " - tags static_sequence: " << Tags->size()+sizeof(static_sequence*) << endl - << " - tags access array: " << sizeof(uint*)+sizeof(uint)*(2+uint_len(tags_blen,tags_len)) << endl - << " ... add Diego structures ... " << endl - << " *total* " << total_space << endl; -} +// ~XMLTree: frees memory of XML tree. +XMLTree::~XMLTree() + { + int i; + + destroyTree(Par); + free(Par); // frees the memory of struct Par + + for (i=0; isize()+sizeof(static_sequence*); + total_space += sizeof(uint*)+sizeof(uint)*(2+uint_len(tags_blen,tags_len)); + cout << "Space usage for XMLTree:" << endl + << " - tags static_sequence: " << Tags->size()+sizeof(static_sequence*) << endl + << " - tags access array: " << sizeof(uint*)+sizeof(uint)*(2+uint_len(tags_blen,tags_len)) << endl + << " ... add Diego structures ... " << endl + << " *total* " << total_space << endl; + } // Save: saves XML tree data structure to file. void XMLTree::Save(unsigned char *filename) { - FILE *fp; char filenameaux[1024]; int i; @@ -103,17 +142,18 @@ void XMLTree::Save(unsigned char *filename) // stores the flags ufwrite(&indexing_empty_texts, sizeof(bool), 1, fp); - ufwrite(&initialized, sizeof(bool), 1, fp); - ufwrite(&finished, sizeof(bool), 1, fp); + bool ignore = true; + ufwrite(&ignore, sizeof(bool),1,fp); + ufwrite(&ignore, sizeof(bool),1,fp); ufwrite(&disable_tc, sizeof(bool),1,fp); if (!indexing_empty_texts) EBVector->save(fp); // stores the tags Tags->save(fp); - ufwrite(&tags_blen,sizeof(uint),1,fp); - ufwrite(&tags_len,sizeof(uint),1,fp); - ufwrite(tags_fix,sizeof(uint),uint_len(tags_blen,tags_len),fp); + ufwrite(&tags_blen,sizeof(uint),1,fp); + ufwrite(&tags_len,sizeof(uint),1,fp); + ufwrite(tags_fix,sizeof(uint),uint_len(tags_blen,tags_len),fp); // stores the texts if (!disable_tc) { @@ -135,7 +175,6 @@ void XMLTree::Save(unsigned char *filename) // a pointer to the loaded data structure XMLTree *XMLTree::Load(unsigned char *filename, int sample_rate_text) { - FILE *fp; char buffer[1024]; XMLTree *XML_Tree; @@ -154,31 +193,26 @@ XMLTree *XMLTree::Load(unsigned char *filename, int sample_rate_text) XML_Tree = new XMLTree(); + XML_Tree->Par = (bp *)umalloc(sizeof(bp)); loadTree(XML_Tree->Par, fp); s_tree += sizeof(bp); + // stores the table with tag names ufread(&XML_Tree->ntagnames, sizeof(int), 1, fp); - + s_tree += sizeof(int); XML_Tree->TagName = (unsigned char **)umalloc(XML_Tree->ntagnames*sizeof(unsigned char *)); - + s_tags += sizeof(unsigned char*)*XML_Tree->ntagnames; for (i=0; intagnames;i++) { - - // OJO Kim is it needed ? - int k = feof(fp); - - - // fscanf chokes on "\n" which is the case for the root element char * r = fgets(buffer,1023,fp); - // int r = fscanf(fp, "%s\n",buffer); if (r==NULL) throw "Cannot read tag list"; @@ -188,36 +222,39 @@ XMLTree *XMLTree::Load(unsigned char *filename, int sample_rate_text) strncpy((char *)XML_Tree->TagName[i], (const char *)buffer,len - 1); s_tags+= len*sizeof(char); } + // loads the flags ufread(&(XML_Tree->indexing_empty_texts), sizeof(bool), 1, fp); - ufread(&(XML_Tree->initialized), sizeof(bool), 1, fp); - ufread(&(XML_Tree->finished), sizeof(bool), 1, fp); + bool ignore; + ufread(&ignore, sizeof(bool), 1, fp); + ufread(&ignore, sizeof(bool), 1, fp); ufread(&(XML_Tree->disable_tc), sizeof(bool), 1, fp); - + s_tree+=sizeof(bool)*4; if (!(XML_Tree->indexing_empty_texts)) XML_Tree->EBVector = static_bitsequence_rrr02::load(fp); - + s_tree+= XML_Tree->EBVector->size(); - + // loads the tags XML_Tree->Tags = static_sequence::load(fp); - ufread(&XML_Tree->tags_blen,sizeof(uint),1,fp); - ufread(&XML_Tree->tags_len,sizeof(uint),1,fp); - XML_Tree->tags_fix = new uint[uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)]; - ufread(XML_Tree->tags_fix,sizeof(uint),uint_len(XML_Tree->tags_blen,XML_Tree->tags_len),fp); - s_tree+=2*sizeof(uint)+sizeof(uint)*uint_len(XML_Tree->tags_blen,XML_Tree->tags_len); + ufread(&XML_Tree->tags_blen,sizeof(uint),1,fp); + ufread(&XML_Tree->tags_len,sizeof(uint),1,fp); + XML_Tree->tags_fix = new uint[uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)]; + ufread(XML_Tree->tags_fix,sizeof(uint),uint_len(XML_Tree->tags_blen,XML_Tree->tags_len),fp); + s_tree+=2*sizeof(uint)+sizeof(uint)*uint_len(XML_Tree->tags_blen,XML_Tree->tags_len); s_tree+= XML_Tree->Tags->size(); - /// FIXME:UGLY tests! - uint * seq = new uint[XML_Tree->tags_len]; - for(uint i=0;itags_len;i++) - seq[i] = get_field(XML_Tree->tags_fix,XML_Tree->tags_blen,i); - cout << "Tags test: " << XML_Tree->Tags->test(seq,XML_Tree->tags_len) << endl; - delete [] seq; - /// End ugly tests + + /// FIXME:UGLY tests! + uint * seq = new uint[XML_Tree->tags_len]; + for(uint i=0;itags_len;i++) + seq[i] = get_field(XML_Tree->tags_fix,XML_Tree->tags_blen,i); + cout << "Tags test: " << XML_Tree->Tags->test(seq,XML_Tree->tags_len) << endl; + delete [] seq; + /// End ugly tests s_text = ftell(fp); @@ -237,88 +274,32 @@ XMLTree *XMLTree::Load(unsigned char *filename, int sample_rate_text) }; } - else { - XML_Tree->Text = NULL; - } - s_text = ftell(fp) - s_text; - - + else XML_Tree->Text = NULL; + s_text = ftell(fp) - s_text; fclose(fp); - /*std::cerr << "Tree part is " << s_tree/1024 << " Kbytes,\n" - << "with node->tagid part " << XML_Tree->Tags->size()/1024+(uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)*sizeof(uint))/1024 << "Kbytes \n" - << "size of Tag part : " << XML_Tree->Tags->length () << " elements\n" - << "sizof(unsigned int)* " << XML_Tree->Tags->length () << " = " << - sizeof(unsigned int) * XML_Tree->Tags->length () / 1024 << " Kbytes\n" - << "Tag part is " << s_tags/1024 << " Kbytes,\n" - << "Text collection is " << s_text/1024 << " Kbytes \n";*/ - XML_Tree->print_stats(); + XML_Tree->print_stats(); return XML_Tree; } -// ~XMLTree: frees memory of XML tree. -XMLTree::~XMLTree() - { - int i; - - destroyTree(Par); - free(Par); // frees the memory of struct Par - - for (i=0; i~static_bitsequence_rrr02(); - delete EBVector; - EBVector = NULL; - } - - //Tags->~static_sequence_wvtree(); - delete Tags; - Tags = NULL; - - //Text->~TextCollection(); - delete TextBuilder; - TextBuilder = NULL; - delete Text; - Text = NULL; - - initialized = false; - finished = false; - } - // root(): returns the tree root. treeNode XMLTree::Root() { - if (!finished) { - fprintf(stderr, "Root() : Error: data structure has not been constructed properly\n"); - exit(1); - } return root_node(Par); } // SubtreeSize(x): the number of nodes (and attributes) in the subtree of node x. int XMLTree::SubtreeSize(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } return subtree_size(Par, x); } // SubtreeTags(x,tag): the number of occurrences of tag within the subtree of node x. int XMLTree::SubtreeTags(treeNode x, TagType tag) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } if (x == Root()) x = first_child(Par,x); @@ -338,59 +319,39 @@ bool XMLTree::IsLeaf(treeNode x) // IsAncestor(x,y): returns whether node x is ancestor of node y. bool XMLTree::IsAncestor(treeNode x, treeNode y) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - return is_ancestor(Par, x, y); } // IsChild(x,y): returns whether node x is parent of node y. bool XMLTree::IsChild(treeNode x, treeNode y) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - if (!is_ancestor(Par, x, y)) return false; return depth(Par, x) == (depth(Par, y) + 1); } -bool XMLTree::IsFirstChild(treeNode x){ - return ((x != NULLT)&&(x==Root() || prev_sibling(Par,x) == NULLT)); -} + +// IsFirstChild(x): returns whether node x is the first child of its parent. +bool XMLTree::IsFirstChild(treeNode x) + { + return ((x != NULLT)&&(x==Root() || prev_sibling(Par,x) == (treeNode)-1)); + } + + // NumChildren(x): number of children of node x. Constant time with the data structure // of Sadakane. int XMLTree::NumChildren(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - return degree(Par, x); } // ChildNumber(x): returns i if node x is the i-th children of its parent. int XMLTree::ChildNumber(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - return child_rank(Par, x); } // Depth(x): depth of node x, a simple binary rank on the parentheses sequence. int XMLTree::Depth(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - return depth(Par, x); } @@ -398,11 +359,6 @@ int XMLTree::Depth(treeNode x) // nodes (i.e., tags, it disregards the texts in the tree). int XMLTree::Preorder(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - return preorder_rank(Par, x); } @@ -410,21 +366,12 @@ int XMLTree::Preorder(treeNode x) // nodes (i.e., tags, it disregards the texts in the tree). int XMLTree::Postorder(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } return postorder_rank(Par, x); } // Tag(x): returns the tag identifier of node x. TagType XMLTree::Tag(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - return get_field(tags_fix,tags_blen,node2tagpos(x)); //Tags->access(node2tagpos(x)); } @@ -432,20 +379,13 @@ TagType XMLTree::Tag(treeNode x) // returns {NULLT, NULLT} when there are no texts descending from x. range XMLTree::DocIds(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - range r; - if (x == NULLT) - { - r.min = NULLT; - r.max = NULLT; - return r; - }; + if (x == NULLT) { + r.min = NULLT; + r.max = NULLT; + return r; + }; - if (indexing_empty_texts) { // faster, no rank needed r.min = x; r.max = x+2*subtree_size(Par, x)-2; @@ -468,10 +408,6 @@ range XMLTree::DocIds(treeNode x) // Parent(x): returns the parent node of node x. treeNode XMLTree::Parent(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } if (x == Root()) return NULLT; else @@ -481,11 +417,6 @@ treeNode XMLTree::Parent(treeNode x) // Child(x,i): returns the i-th child of node x, assuming it exists. treeNode XMLTree::Child(treeNode x, int i) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - if (i <= OPTD) return naive_child(Par, x, i); else return child(Par, x, i); } @@ -493,29 +424,23 @@ treeNode XMLTree::Child(treeNode x, int i) // FirstChild(x): returns the first child of node x, assuming it exists. Very fast in BP. treeNode XMLTree::FirstChild(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - return first_child(Par, x); } -treeNode XMLTree::LastChild(treeNode x) -{ - if (x == Root() || isleaf(Par,x) || x == NULLT) - return x; - else - return find_open(Par,find_close(Par,parent(Par,x))-1); -} +// LastChild(x): returns the last child of node x. +treeNode XMLTree::LastChild(treeNode x) + { + if (x == Root() || isleaf(Par,x) || x == NULLT) + return x; + else +// return find_open(Par,find_close(Par,parent(Par,x))-1); + return find_open(Par, find_close(Par, x)-1); + } + // NextSibling(x): returns the next sibling of node x, assuming it exists. treeNode XMLTree::NextSibling(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } if (x == Root() || x==NULLT) return NULLT; @@ -525,163 +450,195 @@ treeNode XMLTree::NextSibling(treeNode x) // PrevSibling(x): returns the previous sibling of node x, assuming it exists. treeNode XMLTree::PrevSibling(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - return prev_sibling(Par, x); } -// TaggedChild(x,i,tag): returns the i-th child of node x tagged tag, or NULLT if there is none. +// TaggedChild(x,tag): returns the first child of node x tagged tag, or NULLT if there is none. // Because of the balanced-parentheses representation of the tree, this operation is not supported // efficiently, just iterating among the children of node x until finding the desired child. -treeNode XMLTree::TaggedChild(treeNode x, int i, TagType tag) +treeNode XMLTree::TaggedChild(treeNode x, TagType tag) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - treeNode child; child = first_child(Par, x); // starts at first child of node x if (child==(treeNode)-1) return NULLT; // node x is a leaf, there is no such child while (child!=(treeNode)-1) { - if (get_field(tags_fix,tags_blen,node2tagpos(child)) /*Tags->access(node2tagpos(child))*/ == tag) { // current child is labeled with tag of interest - i--; - if (i==0) return child; // we have seen i children of x tagged tag, this is the one we are looking for - } - child = next_sibling(Par, x); // OK, let's try with the next child + if (get_field(tags_fix,tags_blen,node2tagpos(child)) == tag) // current child is labeled with tag of interest + return child; + child = next_sibling(Par, child); // OK, let's try with the next child } return NULLT; // no such child was found } + +treeNode XMLTree::SelectChild(treeNode x, TagType *tags, int ntags) + { + int i; + treeNode child = first_child(Par, x); + + while (child!=(treeNode)-1) { + TagType t = get_field(tags_fix, tags_blen, node2tagpos(child)); + for (i=0; iselect_next(tag,node2tagpos(x)); - /*r = (int) Tags->rank(tag, node2tagpos(x)); - s = (int) Tags->select(tag, r+1);*/ - if (s == -1) return NULLT; // there is no such node + int s = (int) Tags->select_next(tag,node2tagpos(x)); + if (s==-1) return NULLT; // there is no such node y = tagpos2node(s); // transforms the tag position into a node position if (!is_ancestor(Par, x, y)) return NULLT; // the next node tagged tag (in preorder) is not within the subtree of x. else return y; } -treeNode XMLTree::TaggedDescOnly(treeNode x,TagType *desctags, unsigned int dtlen) -{ - treeNode res,y; - if (isleaf(Par,x)) - return NULLT; - - res=NULLT; - for (unsigned int i = 0; i < dtlen; i ++ ) - { - y = TaggedDesc(x,desctags[i]); - res = (res == NULLT) || (( res != NULLT) && (y =! NULLT) && y < res) ? y : res; - - }; +treeNode XMLTree::SelectDesc(treeNode x, TagType *tags, int ntags) + { + int i; + treeNode min = NULLT; + treeNode fc = first_child(Par,x); + + for (i=0; iselect_next(folltags[i],node2tagpos(x)); - /*r = (int) Tags->rank(folltags[i], node2tagpos(x)); - s = (int) Tags->select(folltags[i], r+1);*/ - if (s == -1) - y = NULLT; // there is no such node - else { - y = tagpos2node(s); - if (y >= lim) - y = NULLT; - }; - res = (res == NULLT) || (( res != NULLT) && (y =! NULLT) && y < res) ? y : res; - - }; - - return res < lim ? res : NULLT; +treeNode XMLTree::TaggedDescOrFollOnly(treeNode x,TagType *folltags, unsigned int ftlen,treeNode root) + { + treeNode res,y,lim; + lim = find_close(Par,root); + res=NULLT; + for (unsigned int i = 0; i < ftlen; i++) { + int s = (int) Tags->select_next(folltags[i],node2tagpos(x)); + if (s == -1) + y = NULLT; // there is no such node + else { + y = tagpos2node(s); + if (y >= lim) + y = NULLT; + } + res = (res == NULLT) || (( res != NULLT) && (y =! NULLT) && y < res) ? y : res; + } + + return res < lim ? res : NULLT; } @@ -695,7 +652,6 @@ treeNode XMLTree::TaggedNext(treeNode x, TagType *childtags, unsigned int ctlen, if (x == NULLT || x == Root()) return NULLT; - lim = find_close(Par,root); res = NULLT; @@ -724,20 +680,14 @@ treeNode XMLTree::TaggedNext(treeNode x, TagType *childtags, unsigned int ctlen, res = (y!= x && (res == NULLT || (y != NULLT && y < res)))? y : res; }; - return res < lim ? res : NULLT; - + return res < lim ? res : NULLT; } // TaggedPrec(x,tag): returns the first node tagged tag with smaller preorder than x and not an // ancestor of x. Returns NULLT if there is none. treeNode XMLTree::TaggedPrec(treeNode x, TagType tag) - { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - + { int r, s; treeNode node_s, root; r = (int)Tags->rank(tag, node2tagpos(x)-1); @@ -759,57 +709,58 @@ treeNode XMLTree::TaggedPrec(treeNode x, TagType tag) // the subtree of x. Returns NULLT if there is none. treeNode XMLTree::TaggedFoll(treeNode x, TagType tag) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - - //int r, s; if (x ==NULLT || x == Root()) return NULLT; - - int s = (int) Tags->select_next(tag,find_close(Par,x)); - /*r = (int) Tags->rank(tag, find_close(Par, x)); - s = (int) Tags->select(tag, r+1); // select returns -1 in case that there is no r+1-th tag. */ + + int s = (int) Tags->select_next(tag,find_close(Par, x)); if (s==-1) return NULLT; else return tagpos2node(s); } -// TaggedFoll(x,tag): returns the first node tagged tag with larger preorder than x and not in -// the subtree of x. Returns NULLT if there is none. +// TaggedFollBelow(x,tag,root): returns the first node tagged tag with larger preorder than x +// and not in the subtree of x. Returns NULLT if there is none. treeNode XMLTree::TaggedFollBelow(treeNode x, TagType tag, treeNode root) { - - if (x == NULLT || x == Root()) return NULLT; - treeNode s = (treeNode) Tags->select_next(tag,find_close(Par,x)); - /*int r = (int) Tags->rank(tag, find_close(Par, x)); - int s = (int) Tags->select(tag, r+1); */ - if (root == Root()) - return s; - - if (s == NULLT || s >= find_close(Par,root)) return NULLT; - return s; + if (x == NULLT || x == Root()) + return NULLT; + + treeNode s = (treeNode) Tags->select_next(tag, find_close(Par, x)); + + if (root == Root()) + return s; + if (s == NULLT || s >= find_close(Par, root)) return NULLT; + return s; } +treeNode XMLTree::SelectFollBelow(treeNode x, TagType *tags, int ntags, treeNode ctx) + { + int i; + treeNode min = NULLT; + treeNode fc = first_child(Par, x); + + for (i=0; iselect_next(tag,node2tagpos(ns)-1); - /*r = (int) Tags->rank(tag, node2tagpos(ns)-1); - s = (int) Tags->select(tag, r+1); // select returns -1 in case that there is no r+1-th tag.*/ + int s = (int) Tags->select_next(tag, node2tagpos(ns)-1); if (s==-1) return NULLT; else return tagpos2node(s); } @@ -818,12 +769,7 @@ treeNode XMLTree::TaggedFollowingSibling(treeNode x, TagType tag) // TaggedAncestor(x, tag): returns the closest ancestor of x tagged tag. Return // NULLT is there is none. treeNode XMLTree::TaggedAncestor(treeNode x, TagType tag) - { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - + { if (x == NULLT || x == Root()) return NULLT; @@ -841,11 +787,6 @@ treeNode XMLTree::TaggedAncestor(treeNode x, TagType tag) // Assumes Doc ids start from 0. DocID XMLTree::PrevText(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - if (x == Root()) return NULLT; if (indexing_empty_texts) // faster, no rank needed return (DocID)x-1; @@ -861,11 +802,6 @@ DocID XMLTree::PrevText(treeNode x) // of node x, or NULLT if x is the root node. Assumes Doc ids start from 0. DocID XMLTree::NextText(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - if (x == Root()) return NULLT; if (indexing_empty_texts) // faster, no rank needed return (DocID)x+2*subtree_size(Par, x)-1; @@ -883,11 +819,6 @@ DocID XMLTree::NextText(treeNode x) // ids start from 0. DocID XMLTree::MyText(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - if (!IsLeaf(x)) return NULLT; if (indexing_empty_texts) // faster, no rank needed return (DocID)x; @@ -903,11 +834,6 @@ DocID XMLTree::MyText(treeNode x) // all tree nodes and all text nodes. Assumes that the tree root has preorder 1. int XMLTree::TextXMLId(DocID d) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - if (indexing_empty_texts) return d + rank_open(Par, d)+1; // +1 because root has preorder 1 else { // slower, needs rank and select @@ -921,11 +847,6 @@ int XMLTree::TextXMLId(DocID d) // preorder 0; int XMLTree::NodeXMLId(treeNode x) { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - if (indexing_empty_texts) return x - 1 + rank_open(Par, x); else { @@ -937,12 +858,7 @@ int XMLTree::NodeXMLId(treeNode x) // ParentNode(d): returns the parent node of document identifier d. treeNode XMLTree::ParentNode(DocID d) - { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - + { if (d == NULLT) return NULLT; @@ -960,12 +876,7 @@ treeNode XMLTree::ParentNode(DocID d) } treeNode XMLTree::PrevNode(DocID d) - { - if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); - exit(1); - } - + { if (d == NULLT) return NULLT; @@ -984,288 +895,6 @@ treeNode XMLTree::PrevNode(DocID d) } -// OpenDocument(empty_texts): it starts the construction of the data structure for -// the XML document. Parameter empty_texts indicates whether we index empty texts -// in document or not. Returns a non-zero value upon success, NULLT in case of error. -int XMLTree::OpenDocument(bool empty_texts, int sample_rate_text,bool dtc) - { - initialized = true; - finished = false; - found_attributes = false; - npar = 0; - parArraySize = 1; - ntagnames = 4; - disable_tc = dtc; - - indexing_empty_texts = empty_texts; - - par_aux = (pb *)umalloc(sizeof(pb)*parArraySize); - - tags_aux = (TagType *) umalloc(sizeof(TagType)); - - TagName = (unsigned char **) umalloc(4*sizeof(unsigned char*)); - - TagName[0] = (unsigned char *) umalloc(4*sizeof(unsigned char)); - - strcpy((char *) TagName[0], "<@>"); - - TagName[1] = (unsigned char *) umalloc(4*sizeof(unsigned char)); - - strcpy((char *) TagName[1], "<$>"); - - //OJO need to put these in the table too. - TagName[2] = (unsigned char *) umalloc(5*sizeof(unsigned char)); - - strcpy((char *) TagName[2], "/<@>"); - - TagName[3] = (unsigned char *) umalloc(5*sizeof(unsigned char)); - - strcpy((char *) TagName[3], "/<$>"); - - - if (!indexing_empty_texts) - empty_texts_aux = (unsigned int *)umalloc(sizeof(unsigned int)); - - if (disable_tc) - TextBuilder = 0; - else - TextBuilder = new TextCollectionBuilder((unsigned)sample_rate_text); - Text = 0; - - return 1; // indicates success in the initialization of the data structure - } - -// CloseDocument(): it finishes the construction of the data structure for the XML -// document. Tree and tags are represented in the final form, dynamic data -// structures are made static, and the flag "finished" is set to true. After that, -// the data structure can be queried. -int XMLTree::CloseDocument() - { - if (!initialized) { // data structure has not been initialized properly - fprintf(stderr, "Error: data structure has not been initialized properly (by calling method OpenDocument)\n"); - return NULLT; - } - - // closing parenthesis for the tree root - par_aux = (pb *)urealloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb)))); - - // creates the data structure for the tree topology - Par = (bp *)umalloc(sizeof(bp)); - bp_construct(Par, npar, par_aux, OPT_DEGREE|0); - // creates structure for tags - - // If we found an attribute then "<@>" is present in the tree - // if we didn't then it is not. "<$>" is never present in the tree - uint max_tag = 0; - for(uint i=0;i<(uint)npar-1;i++) - max_tag = max(max_tag,tags_aux[i]); - //max_tag++; - //tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1)); - //tags_aux[npar++] = max_tag; - //int ntagsize = found_attributes ? 2*ntagnames-1 : 2*ntagnames - 2; - int ntagsize = 2*ntagnames + 2; - - //static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(20); - //static_permutation_builder * pmb = new static_permutation_builder_mrrr(PERM_SAMPLE, bmb); - //static_sequence_builder * ssb = new static_sequence_builder_gmr_chunk(bmb, pmb); - static_bitsequence_builder * bmb = new static_bitsequence_builder_sdarray(); - alphabet_mapper *am = new alphabet_mapper_none(); - //wt_coder * wc = new wt_coder_huff((uint*)tags_aux,npar,am); - //Tags = new static_sequence_wvtree((uint*)tags_aux,npar,wc ,bmb, am); - //Tags = new static_sequence_gmr((uint *) tags_aux, (uint) npar,ntagsize, bmb, ssb); - Tags = new static_sequence_bs((uint*)tags_aux,npar,am,bmb); - - cout << "Tags test: " << Tags->test((uint*)tags_aux,npar) << endl; - - tags_blen = bits(max_tag); - tags_len = (uint)npar; - tags_fix = new uint[uint_len(tags_blen,tags_len)]; - for(uint i=0;i<(uint)npar;i++) - set_field(tags_fix,tags_blen,i,tags_aux[i]); - - delete bmb; - //delete pmb; - //delete ssb; - - - // makes the text collection static - if (!disable_tc) - { - assert(Text == 0); - assert(TextBuilder != 0); - Text = TextBuilder->InitTextCollection(); - delete TextBuilder; - TextBuilder = 0; - } - - // creates the data structure marking the non-empty texts (just in the case it is necessary) - if (!indexing_empty_texts) { - EBVector = new static_bitsequence_rrr02((uint *)empty_texts_aux,(ulong)npar,(uint)32); - free (empty_texts_aux); - empty_texts_aux = NULL; - } - - // OJO was leaked before, found by valgrind - free(tags_aux); - - tags_aux = NULL; - - finished = true; - print_stats(); - - return 1; // indicates success in the inicialization - } - - -// NewOpenTag(tagname): indicates the event of finding a new opening tag in the document. -// Tag name is given. Returns a non-zero value upon success, and returns NULLT -// in case of failing when trying to insert the new tag. -int XMLTree::NewOpenTag(unsigned char *tagname) - { - int i; - - if (!initialized) { // data structure has not been initialized properly - fprintf(stderr, "Error: you cannot insert a new opening tag without first calling method OpenDocument first\n"); - return NULLT; - } - - // inserts a new opening parentheses in the bit sequence - if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis - par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize); - parArraySize *= 2; - } - - setbit(par_aux,npar,OP); // marks a new opening parenthesis - - // transforms the tagname into a tag identifier. If the tag is new, we insert - // it in the table. - for (i=0; i") was called - if (i==0) - found_attributes=true; - - if (i==ntagnames) { // the tag is a new one, then we insert it - TagName = (unsigned char **)urealloc(TagName, sizeof(char *)*(ntagnames+1)); - - if (!TagName) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } - - ntagnames++; - TagName[i] = (unsigned char *)umalloc(sizeof(unsigned char)*(strlen((const char *)tagname)+1)); - strcpy((char *)TagName[i], (const char *)tagname); - } - tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1)); - - tags_aux[npar] = i; // inserts the new tag id within the preorder sequence of tags - - npar++; - - return 1; - - } - - -// NewClosingTag(tagname): indicates the event of finding a new closing tag in the document. -// Tag name is given. Returns a non-zero value upon success, and returns NULLT -// in case of failing when trying to insert the new tag. -int XMLTree::NewClosingTag(unsigned char *tagname) - { - int i; - - if (!initialized) { // data structure has not been initialized properly - fprintf(stderr, "Error: you cannot insert a new closing tag without first calling method OpenDocument first\n"); - return NULLT; - } - - // inserts a new closing parentheses in the bit sequence - if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis - par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize); - parArraySize *= 2; - } - - setbit(par_aux,npar,CP); // marks a new closing parenthesis - - // transforms the tagname into a tag identifier. If the tag is new, we insert - // it in the table. - for (i=0; iInsertText(s); - string cpps = (char*) s; - CachedText.push_back(cpps); - - return 1; // success - } - -// NewEmptyText(): indicates the event of finding a new empty text in the document. -// In case of indexing empty and non-empty texts, we insert the empty texts into the -// text collection. In case of indexing only non-empty texts, it just indicates an -// empty text in the bit vector of empty texts. Returns a non-zero value upon -// success, NULLT in case of error. -int XMLTree::NewEmptyText() - { - unsigned char c = 0; - if (!initialized) { // data structure has not been initialized properly - fprintf(stderr, "Error: you cannot insert a new empty text without first calling method OpenDocument first\n"); - return NULLT; - } - - if (!indexing_empty_texts) { - empty_texts_aux = (unsigned int *)urealloc(empty_texts_aux, sizeof(pb)*(1+(npar-1)/(8*sizeof(pb)))); - - bitclean(empty_texts_aux, npar-1); // marks the empty text with a 0 in the bit vector - } - else TextBuilder->InsertText(&c); // we insert the empty text just in case we index all the texts - - return 1; // success - } - - // GetTagId: returns the tag identifier corresponding to a given tag name. // Returns NULLT in case that the tag name does not exists. TagType XMLTree::GetTagId(unsigned char *tagname) @@ -1292,11 +921,9 @@ unsigned char *XMLTree::GetTagName(TagType tagid) } -//KIM : OJO need the two following methods - const unsigned char *XMLTree::GetTagNameByRef(TagType tagid) { - if(tagid==(uint)-1) return NULL; + if(tagid==(uint)-1) return NULL; if (tagid >= ntagnames) return NULL; // invalid tag identifier return ((const unsigned char*) TagName[tagid]); } @@ -1304,18 +931,17 @@ const unsigned char *XMLTree::GetTagNameByRef(TagType tagid) TagType XMLTree::RegisterTag(unsigned char *tagname) -{ - if (!finished) - return NULLT; - - TagType id = XMLTree::GetTagId(tagname); - if (id == NULLT){ - id = ntagnames; - ntagnames = ntagnames + 1; - TagName = (unsigned char **) urealloc(TagName,ntagnames*(sizeof(unsigned char*))); - TagName[id] = (unsigned char *) umalloc(sizeof(unsigned char)*strlen( (const char*) tagname)+1); - strcpy((char*)TagName[id], (const char *)tagname); - }; - - return id; -} + { + TagType id = XMLTree::GetTagId(tagname); + if (id == NULLT) { + id = ntagnames; + ntagnames = ntagnames + 1; + TagName = (unsigned char **) urealloc(TagName,ntagnames*(sizeof(unsigned char*))); + TagName[id] = (unsigned char *) umalloc(sizeof(unsigned char)*strlen( (const char*) tagname)+1); + strcpy((char*)TagName[id], (const char *)tagname); + } + + return id; + } + + diff --git a/XMLTree.h b/XMLTree.h index 1268cb6..ea9b938 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -17,7 +17,7 @@ * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - ******************************************************************************/ + ******************************************************************************/ #ifndef XMLTREE_H_ #define XMLTREE_H_ @@ -26,13 +26,13 @@ #include #include -//KIM : OJO -//clash between TextCollection/Tools.h and libcds/includes/basics.h + #undef W #undef WW #undef Wminusone #include "bp.h" +//#include "basics.h" #include #include #include @@ -47,11 +47,6 @@ using SXSI::TextCollectionBuilder; #define PERM_SAMPLE 10 - // sets bit p in e -#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) - // cleans bit p in e -#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) - typedef int treeNode; typedef int TagType; @@ -62,13 +57,15 @@ typedef struct { int max; } range; +typedef struct nd { + uint position; + struct nd *next; +} ListNode; -//KIM : OJO -// I know this class implements the working draft that we have but the logics seem flawed here... -// We should have two classes. One XMLTreeBuilder and one XMLTree. -// XMLTreeBuilder would have OpenDocument, NewOpenTag,... and CloseDocument would return an XMLTree -// XMLTree would have only an initialized structure. If find it really ugly to check (!finished) or (!initialized) -// in every function (FirstChild....). +typedef struct { + ListNode *first; + ListNode *last; +} TagArrayEntry; class XMLTree { /** Balanced parentheses representation of the tree */ @@ -76,59 +73,36 @@ class XMLTree { /** Mapping from tag identifer to tag name */ unsigned char **TagName; + uint ntagnames; /** boolean flag indicating whether we are indexing empty texts or not */ bool indexing_empty_texts; /** Bit vector indicating with a 1 the positions of the non-empty texts. */ - static_bitsequence_rrr02 *EBVector; + static_bitsequence *EBVector; /** Tag sequence represented with a data structure for rank and select */ static_sequence *Tags; - uint * tags_fix; - uint tags_blen, tags_len; + uint * tags_fix; + uint tags_blen, tags_len; /** The texts in the XML document */ - TextCollectionBuilder *TextBuilder; TextCollection *Text; /** The texts in the XML document (cached for faster display) */ vector CachedText; - /** Flag indicating whether the whole data structure has been constructed or - * not. If the value is true, you cannot add more texts, nodes, etc. */ - bool finished; - - /** Flag indicating whether the construction of the data structure has been - * initialized or not (by calling method OpenDocument()). If this is true, - * you cannot insert new tags or texts. */ - bool initialized; - - /* the following components are used for construction purposes */ - pb *par_aux; - TagType *tags_aux; - int npar; - int parArraySize; - int ntagnames; - unsigned int *empty_texts_aux; - - // KIM : OJO - // I added those two. The TagName array should always contains two special tags - // <@> for attributes and <$> for PCDATA. - // <$> can never be in a document (since we handle the text differently) - // but <@> can be returned by the parser. This boolean is needed for the construction - // of the Tag bitmap to know if <@> must be taken into account or not - bool found_attributes; - - // KIM : OJO + TagArrayEntry *TagArray; + // Allows to disable the TextCollection for benchmarkin purposes bool disable_tc; public: - void print_stats(); + /** Data structure constructors */ + XMLTree() {;}; - /** Data structure constructor */ - XMLTree() {finished = false; initialized = false; Text = 0; TextBuilder = 0; }; + XMLTree(pb *par, uint npar, unsigned char **TN, uint ntagnames, uint *empty_texts_bmp, TagType *tags, + TextCollection *TC, vector CT, bool indexing_empty_t, bool dis_tc); /** Data structure destructor */ ~XMLTree(); @@ -154,9 +128,9 @@ public: /** IsChild(x,y): returns whether node x is parent of node y. */ bool IsChild(treeNode x, treeNode y); - /** IsChild(x,y): returns whether node x is the first child of its parent */ + /** IsFirstChild(x): returns whether node x is the first child of its parent. */ bool IsFirstChild(treeNode x); - + /** NumChildren(x): number of children of node x. Constant time with the * data structure of Sadakane. */ int NumChildren(treeNode x); @@ -194,30 +168,37 @@ public: * Very fast in BP. */ treeNode FirstChild(treeNode x); - /** LastChild(x): returns the last child of node x. - * Implemented by Kim naively. */ + /** LastChild(x): returns the last child of node x. */ treeNode LastChild(treeNode x); - + /** NextSibling(x): returns the next sibling of node x, assuming it * exists. */ - treeNode NextSibling(treeNode x); /** PrevSibling(x): returns the previous sibling of node x, assuming it * exists. */ treeNode PrevSibling(treeNode x); - /** TaggedChild(x,i,tag): returns the i-th child of node x tagged tag, or + /** TaggedChild(x,tag): returns the first child of node x tagged tag, or * NULLT if there is none. Because of the balanced-parentheses representation * of the tree, this operation is not supported efficiently, just iterating * among the children of node x until finding the desired child. */ - treeNode TaggedChild(treeNode x, int i, TagType tag); + treeNode TaggedChild(treeNode x, TagType tag); + treeNode SelectChild(treeNode x, TagType *tags, int ntags); + + /** TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or + * NULLT if there is none. */ + treeNode TaggedSibling(treeNode x, TagType tag); + + treeNode SelectSibling(treeNode x, TagType *tags, int ntags); + /** TaggedDesc(x,tag): returns the first node tagged tag with larger * preorder than x and within the subtree of x. Returns NULT if there * is none. */ treeNode TaggedDesc(treeNode x, TagType tag); + treeNode SelectDesc(treeNode x, TagType *tags, int ntags); treeNode TaggedBelow(treeNode x, TagType *childtags, unsigned int ctlen, TagType *desctags, unsigned int dtlen); @@ -245,7 +226,9 @@ public: treeNode TaggedFoll(treeNode x, TagType tag); treeNode TaggedFollBelow(treeNode x, TagType tag,treeNode root); - + + treeNode SelectFollBelow(treeNode x, TagType *tags, int ntags, treeNode ctx); + /** TaggedFollowingSibling(x,tag) */ treeNode TaggedFollowingSibling(treeNode x, TagType tag); @@ -275,48 +258,9 @@ public: /** ParentNode(d): returns the parent node of document identifier d. */ treeNode ParentNode(DocID d); + treeNode PrevNode(DocID d); - /** OpenDocument(empty_texts,sample_rate_text,dtc): initilizes the construction - * of the data structure for the XML document. Parameter empty_texts - * indicates whether we index empty texts in document or not. Parameter - * sample_rate_text indicates the sampling rate for the text searching data - * structures (small values get faster searching but a bigger space - * requirement). dtc disable the use of the TextCollection - * (i.e. everything is considered an empty text *) - * Returns a non-zero value upon success, NULLT in case of - * error. */ - - int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc); - - /** CloseDocument(): finishes the construction of the data structure for - * the XML document. Tree and tags are represented in the final form, - * dynamic data structures are made static, and the flag "finished" is set - * to true. After that, the data structure can be queried. */ - int CloseDocument(); - - /** NewOpenTag(tagname): indicates the event of finding a new opening tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewOpenTag(unsigned char *tagname); - - /** NewClosingTag(tagname): indicates the event of finding a new closing tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewClosingTag(unsigned char *tagname); - - /** NewText(s): indicates the event of finding a new (non-empty) text s in - * the document. The new text is inserted within the text collection. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewText(unsigned char *s); - - /** NewEmptyText(): indicates the event of finding a new empty text in the - * document. In case of indexing empty and non-empty texts, we insert the - * empty texts into the text collection. In case of indexing only non-empty - * texts, it just indicates an empty text in the bit vector of empty texts. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewEmptyText(); - /** GetTagId(tagname): returns the tag identifier corresponding to a given * tag name. Returns NULLT in case that the tag name does not exists. */ TagType GetTagId(unsigned char *tagname); @@ -325,14 +269,11 @@ public: * Returns NULL in case that the tag identifier is not valid.*/ unsigned char *GetTagName(TagType tagid); - - // OJO /** GetTagName(tagid): returns the tag name of a given tag identifier. * The result is just a reference and should not be freed by the caller. */ const unsigned char *GetTagNameByRef(TagType tagid); - //OJO /** RegisterTag adds a new tag to the tag collection this is needed * if the query contains a tag which is not in the document, we need * to give this new tag a fresh id and store it somewhere. A logical @@ -344,6 +285,7 @@ public: bool EmptyText(DocID i) { return Text->EmptyText(i); } + /** Prefix(s): search for texts prefixed by string s. */ TextCollection::document_result Prefix(uchar const *s) { return Text->Prefix(s); @@ -425,7 +367,7 @@ public: /** CountLessThan(s): counting version of LessThan(s). */ unsigned CountLessThan(uchar const *s) { - return CountLessThan(s); + return Text->CountLessThan(s); } /** GetText(d): returns the text corresponding to document with @@ -443,11 +385,17 @@ public: TextCollection *getTextCollection() { return Text; } + /** Save: saves XML tree data structure to file. */ void Save(unsigned char *filename); /** Load: loads XML tree data structure from file. sample_rate_text * indicates the sample rate for the text search data structure. */ static XMLTree *Load(unsigned char *filename, int sample_rate_text); + + void insertTag(TagType tag, uint position); + + void print_stats(); }; #endif + diff --git a/XMLTreeBuilder.cpp b/XMLTreeBuilder.cpp new file mode 100644 index 0000000..27bc862 --- /dev/null +++ b/XMLTreeBuilder.cpp @@ -0,0 +1,196 @@ + +#include "XMLTreeBuilder.h" +#include "basics.h" + +// OpenDocument(empty_texts): it starts the construction of the data structure for +// the XML document. Parameter empty_texts indicates whether we index empty texts +// in document or not. Returns a non-zero value upon success, NULLT in case of error. +int XMLTreeBuilder::OpenDocument(bool empty_texts, int sample_rate_text, bool dtc) + { + found_attributes = false; + npar = 0; + parArraySize = 1; + ntagnames = 4; + disable_tc = dtc; + + indexing_empty_texts = empty_texts; + + par_aux = (pb *)umalloc(sizeof(pb)*parArraySize); + + tags_aux = (TagType *) umalloc(sizeof(TagType)); + + TagName = (unsigned char **) umalloc(4*sizeof(unsigned char*)); + TagName[0] = (unsigned char *) umalloc(4*sizeof(unsigned char)); + strcpy((char *) TagName[0], "<@>"); + TagName[1] = (unsigned char *) umalloc(4*sizeof(unsigned char)); + strcpy((char *) TagName[1], "<$>"); + TagName[2] = (unsigned char *) umalloc(5*sizeof(unsigned char)); + strcpy((char *) TagName[2], "/<@>"); + TagName[3] = (unsigned char *) umalloc(5*sizeof(unsigned char)); + strcpy((char *) TagName[3], "/<$>"); + + if (!indexing_empty_texts) + empty_texts_aux = (unsigned int *)umalloc(sizeof(unsigned int)); + + if (disable_tc) + TextBuilder = 0; + else + TextBuilder = new TextCollectionBuilder((unsigned)sample_rate_text); + Text = 0; + + return 1; // indicates success in the initialization of the data structure + } + +// CloseDocument(): it finishes the construction of the data structure for the XML +// document. Tree and tags are represented in the final form, dynamic data +// structures are made static, and the flag "finished" is set to true. After that, +// the data structure can be queried. +XMLTree *XMLTreeBuilder::CloseDocument() + { + // closing parenthesis for the tree root + par_aux = (pb *)urealloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb)))); + setbit(par_aux, npar, CP); + npar++; + + // makes the text collection static + if (!disable_tc) { + assert(Text == 0); + assert(TextBuilder != 0); + Text = TextBuilder->InitTextCollection(); + delete TextBuilder; + TextBuilder = 0; + } + + XMLTree *T = new XMLTree(par_aux, npar, TagName, ntagnames, empty_texts_aux, tags_aux, + Text, CachedText, indexing_empty_texts, disable_tc); + return T; + } + + +// NewOpenTag(tagname): indicates the event of finding a new opening tag in the document. +// Tag name is given. Returns a non-zero value upon success, and returns NULLT +// in case of failing when trying to insert the new tag. +int XMLTreeBuilder::NewOpenTag(unsigned char *tagname) + { + int i; + + // inserts a new opening parentheses in the bit sequence + if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis + par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize); + parArraySize *= 2; + } + + setbit(par_aux,npar,OP); // marks a new opening parenthesis + + // transforms the tagname into a tag identifier. If the tag is new, we insert + // it in the table. + for (i=0; i") was called + if (i==0) + found_attributes=true; + + if (i==ntagnames) { // the tag is a new one, then we insert it + TagName = (unsigned char **)urealloc(TagName, sizeof(char *)*(ntagnames+1)); + + if (!TagName) { + fprintf(stderr, "Error: not enough memory\n"); + return NULLT; + } + + ntagnames++; + TagName[i] = (unsigned char *)umalloc(sizeof(unsigned char)*(strlen((const char *)tagname)+1)); + strcpy((char *)TagName[i], (const char *)tagname); + } + tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1)); + + tags_aux[npar] = i; // inserts the new tag id within the preorder sequence of tags + + npar++; + + return 1; // success + } + + +// NewClosingTag(tagname): indicates the event of finding a new closing tag in the document. +// Tag name is given. Returns a non-zero value upon success, and returns NULLT +// in case of failing when trying to insert the new tag. +int XMLTreeBuilder::NewClosingTag(unsigned char *tagname) + { + int i; + + // inserts a new closing parentheses in the bit sequence + if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis + par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize); + parArraySize *= 2; + } + + setbit(par_aux,npar,CP); // marks a new closing parenthesis + + // transforms the tagname into a tag identifier. If the tag is new, we insert + // it in the table. + for (i=0; iInsertText(s); + string cpps = (char*) s; + CachedText.push_back(cpps); + + return 1; // success + } + +// NewEmptyText(): indicates the event of finding a new empty text in the document. +// In case of indexing empty and non-empty texts, we insert the empty texts into the +// text collection. In case of indexing only non-empty texts, it just indicates an +// empty text in the bit vector of empty texts. Returns a non-zero value upon +// success, NULLT in case of error. +int XMLTreeBuilder::NewEmptyText() + { + unsigned char c = 0; + + if (!indexing_empty_texts) { + empty_texts_aux = (unsigned int *)urealloc(empty_texts_aux, sizeof(pb)*(1+(npar-1)/(8*sizeof(pb)))); + + bitclean(empty_texts_aux, npar-1); // marks the empty text with a 0 in the bit vector + } + else TextBuilder->InsertText(&c); // we insert the empty text just in case we index all the texts + + return 1; // success + } + diff --git a/XMLTreeBuilder.h b/XMLTreeBuilder.h new file mode 100644 index 0000000..38d81ba --- /dev/null +++ b/XMLTreeBuilder.h @@ -0,0 +1,131 @@ + +/****************************************************************************** + * Copyright (C) 2009 by Diego Arroyuelo * + * Builder class for the in-memory XQuery/XPath engine * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +#ifndef XMLTREEBUILDER_H_ +#define XMLTREEBUILDER_H_ +#include "TextCollection/TextCollectionBuilder.h" +#include +#include +#include + + +#undef W +#undef WW +#undef Wminusone + +#include "bp.h" +#include "XMLTree.h" +#include +#include +#include +using SXSI::TextCollection; +using SXSI::TextCollectionBuilder; + +#define NULLT -1 + + // sets bit p in e +#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) + // cleans bit p in e +#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) + + +class XMLTreeBuilder { + + /** Array containing the balanced parentheses sequence */ + pb *par_aux; + int parArraySize; + int npar; + + /** Mapping from tag identifer to tag name */ + unsigned char **TagName; + int ntagnames; + + /** Array containing the sequence of tags */ + TagType *tags_aux; + + /** The texts in the XML document */ + TextCollectionBuilder *TextBuilder; + TextCollection *Text; + + /** The texts in the XML document (cached for faster display) */ + vector CachedText; + + /** boolean flag indicating whether we are indexing empty texts or not */ + bool indexing_empty_texts; + unsigned int *empty_texts_aux; + + // The TagName array should always contains two special tags + // <@> for attributes and <$> for PCDATA. + // <$> can never be in a document (since we handle the text differently) + // but <@> can be returned by the parser. This boolean is needed for the construction + // of the Tag bitmap to know if <@> must be taken into account or not + bool found_attributes; + + // Allows to disable the TextCollection for benchmarkin purposes + bool disable_tc; + +public: + + XMLTreeBuilder() {;}; + + ~XMLTreeBuilder(); + + /** OpenDocument(empty_texts,sample_rate_text,dtc): initilizes the construction + * of the data structure for the XML document. Parameter empty_texts + * indicates whether we index empty texts in document or not. Parameter + * sample_rate_text indicates the sampling rate for the text searching data + * structures (small values get faster searching but a bigger space + * requirement). dtc disable the use of the TextCollection + * (i.e. everything is considered an empty text *) + * Returns a non-zero value upon success, NULLT in case of + * error. */ + int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc); + + /** CloseDocument(): finishes the construction of the data structure for + * the XML document. Tree and tags are represented in the final form, + * dynamic data structures are made static, returning the resulting + * XMLTree. After that, the XMLTree data structure can be queried. */ + XMLTree *CloseDocument(); + + /** NewOpenTag(tagname): indicates the event of finding a new opening tag + * in the document. Tag name is given. Returns a non-zero value upon + * success, and returns NULLT in case of error. */ + int NewOpenTag(unsigned char *tagname); + + /** NewClosingTag(tagname): indicates the event of finding a new closing tag + * in the document. Tag name is given. Returns a non-zero value upon + * success, and returns NULLT in case of error. */ + int NewClosingTag(unsigned char *tagname); + + /** NewText(s): indicates the event of finding a new (non-empty) text s in + * the document. The new text is inserted within the text collection. + * Returns a non-zero value upon success, NULLT in case of error. */ + int NewText(unsigned char *s); + + /** NewEmptyText(): indicates the event of finding a new empty text in the + * document. In case of indexing empty and non-empty texts, we insert the + * empty texts into the text collection. In case of indexing only non-empty + * texts, it just indicates an empty text in the bit vector of empty texts. + * Returns a non-zero value upon success, NULLT in case of error. */ + int NewEmptyText(); +}; +#endif + diff --git a/basics.h b/basics.h new file mode 100644 index 0000000..273aa1d --- /dev/null +++ b/basics.h @@ -0,0 +1,51 @@ +#ifndef BASICS_H +#define BASICS_H + +#include +#include + +inline void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream) + { + size_t res; + res = fread(ptr,size,nmemb,stream); + if (res < nmemb) + throw "ufread I/O error"; + return; + } + +inline void ufwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) + { + size_t res; + res = fwrite(ptr,size,nmemb,stream); + if (res < nmemb) + throw "ufwrite I/O error"; + return; + } + +inline void *urealloc(void *ptr, size_t size) + { + void *dest = realloc(ptr,size); + //don't fail if we requested size 0 + if (dest == NULL && size > 0 ) + throw std::bad_alloc(); + return dest; + } + +inline void *ucalloc(size_t nmemb, size_t size) + { + void * dest = calloc(nmemb,size); + //don't fail if we requested size 0 + if (dest == NULL && nmemb > 0 && size > 0 ) + throw std::bad_alloc(); + return dest; + } + +inline void *umalloc(size_t size) + { + void * dest = malloc(size); + if (dest == NULL && size > 0) + throw std::bad_alloc(); + return dest; + } + +#endif diff --git a/makefile b/makefile index bf21063..cd4b721 100644 --- a/makefile +++ b/makefile @@ -7,15 +7,16 @@ OBJECTS_TCA=TextCollection/incbwt/rlcsa.a all: libcds text_collection XMLTree -XMLTree: XMLTree.o bp.o darray.o bpcore.o $(LIBCDS_A) $(OBJECTS_TCO) $(OBJECTS_TCA) +XMLTree: XMLTree.o XMLTreeBuilder.o bp.o darray.o bpcore.o\ + $(LIBCDS_A) $(OBJECTS_TCO) $(OBJECTS_TCA) cp libcds/lib/libcds.a XMLTree.a rm -rf .objs mkdir .objs cd .objs; ar x ../$(OBJECTS_TCA) - ar rvcs XMLTree.a XMLTree.o bp.o darray.o bpcore.o $(OBJECTS_TCO) .objs/*.o + ar rvcs XMLTree.a XMLTree.o XMLTreeBuilder.o bp.o darray.o bpcore.o $(OBJECTS_TCO) .objs/*.o rm -rf .objs -XMLTree.o: bp.c bp.h bpcore.c XMLTree.cpp XMLTree.h +XMLTree.o: bp.c bp.h bpcore.c XMLTree.cpp XMLTree.h basics.h g++ $(FLAGS) -c XMLTree.cpp bp.o: bp.c bpcore.c darray.c bp.h darray.h @@ -27,6 +28,9 @@ darray.o: darray.c darray.h bpcore.c bpcore.o: bpcore.c g++ $(FLAGS) -c bpcore.c +XMLTreeBuilder.o: XMLTree.h XMLTreeBuilder.h XMLTreeBuilder.cpp basics.h + g++ $(FLAGS) -c XMLTreeBuilder.cpp + text_collection: make -C TextCollection/