X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTree.cpp;h=2a1da8446a816ab17117963aea34bc785b4945ca;hb=dc02a833a150dbef202bc14aca74c51360d4a631;hp=22ba4bb9a9e993dd7cdb4c48fbdb9987acd9f75a;hpb=184fd5131d257a334c29b0e55b1240fb29dc796b;p=SXSI%2FXMLTree.git diff --git a/XMLTree.cpp b/XMLTree.cpp index 22ba4bb..2a1da84 100644 --- a/XMLTree.cpp +++ b/XMLTree.cpp @@ -1,5 +1,6 @@ #include "XMLTree.h" #include + // functions to convert tag positions to the corresponding tree node and viceversa. // These are implemented in order to be able to change the tree and Tags representations, // without affecting the code so much. @@ -16,6 +17,66 @@ inline int node2tagpos(treeNode x) { return (int)x; } + +//KIM OJO to prevent suprious "unused result" warnings + +inline void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream){ + size_t res; + res = fread(ptr,size,nmemb,stream); + if (res < nmemb) + throw "ufread I/O error"; + + return; +} + +inline void ufwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream){ + size_t res; + res = fwrite(ptr,size,nmemb,stream); + if (res < nmemb) + throw "ufwrite I/O error"; + return; +} + +// OJO to fail cleanly while doing a realloc +// if we can't realloc we are pretty much screwed anyway but +// it makes the code clearer to not have a bunch of if (!ptr) { printf("..."); exit(1); }; +inline void * urealloc(void *ptr, size_t size){ + + void * dest = realloc(ptr,size); + //don't fail if we requested size 0 + if (dest == NULL && size > 0 ) + throw std::bad_alloc(); + return dest; + +} + +inline void * ucalloc(size_t nmemb, size_t size){ + + void * dest = calloc(nmemb,size); + //don't fail if we requested size 0 + if (dest == NULL && nmemb > 0 && size > 0 ) + throw std::bad_alloc(); + return dest; + +} + +inline void * umalloc(size_t size){ + void * dest = malloc(size); + if (dest == NULL && size > 0) + throw std::bad_alloc(); + return dest; +} + +void XMLTree::print_stats() { + uint total_space = Tags->size()+sizeof(static_sequence*); + total_space += sizeof(uint*)+sizeof(uint)*(2+uint_len(tags_blen,tags_len)); + cout << "Space usage for XMLTree:" << endl + << " - tags static_sequence: " << Tags->size()+sizeof(static_sequence*) << endl + << " - tags access array: " << sizeof(uint*)+sizeof(uint)*(2+uint_len(tags_blen,tags_len)) << endl + << " ... add Diego structures ... " << endl + << " *total* " << total_space << endl; +} + // Save: saves XML tree data structure to file. void XMLTree::Save(unsigned char *filename) { @@ -35,23 +96,36 @@ void XMLTree::Save(unsigned char *filename) saveTree(Par, fp); // stores the table with tag names - fwrite(&ntagnames, sizeof(int), 1, fp); + ufwrite(&ntagnames, sizeof(int), 1, fp); for (i=0; isave(fp); // stores the tags Tags->save(fp); + ufwrite(&tags_blen,sizeof(uint),1,fp); + ufwrite(&tags_len,sizeof(uint),1,fp); + ufwrite(tags_fix,sizeof(uint),uint_len(tags_blen,tags_len),fp); // stores the texts - Text->Save(fp); - + if (!disable_tc) { + Text->Save(fp); + int st = CachedText.size(); + ufwrite(&st, sizeof(int),1,fp); + for (int i = 0; i< CachedText.size(); i++){ + st = CachedText.at(i).size(); + ufwrite(&st, sizeof(int),1,fp); + ufwrite(CachedText.at(i).c_str(),sizeof(char),1+CachedText.at(i).size(),fp); + }; + }; fclose(fp); } @@ -63,51 +137,124 @@ XMLTree *XMLTree::Load(unsigned char *filename, int sample_rate_text) { FILE *fp; - char filenameaux[1024]; + char buffer[1024]; XMLTree *XML_Tree; int i; - + size_t s_tree = 0; + long s_text = 0; + size_t s_tags = 0; + // first load the tree topology - sprintf(filenameaux, "%s.srx", filename); - fp = fopen(filenameaux, "r"); + sprintf(buffer, "%s.srx", filename); + fp = fopen(buffer, "r"); if (fp == NULL) { - printf("Error: cannot open file %s to load the tree structure of XML collection\n", filenameaux); + printf("Error: cannot open file %s to load the tree structure of XML collection\n", buffer); exit(1); } XML_Tree = new XMLTree(); - XML_Tree->Par = (bp *)malloc(sizeof(bp)); + XML_Tree->Par = (bp *)umalloc(sizeof(bp)); loadTree(XML_Tree->Par, fp); - + + s_tree += sizeof(bp); + // stores the table with tag names - fread(&XML_Tree->ntagnames, sizeof(int), 1, fp); + ufread(&XML_Tree->ntagnames, sizeof(int), 1, fp); + + s_tree += sizeof(int); + + XML_Tree->TagName = (unsigned char **)umalloc(XML_Tree->ntagnames*sizeof(unsigned char *)); + + s_tags += sizeof(unsigned char*)*XML_Tree->ntagnames; - XML_Tree->TagName = (unsigned char **)malloc(XML_Tree->ntagnames*sizeof(unsigned char *)); for (i=0; intagnames;i++) { - int k = feof(fp); - fscanf(fp, "%s\n",filenameaux); - XML_Tree->TagName[i] = (unsigned char *)malloc(sizeof(unsigned char)*(strlen((const char *)filenameaux)+1)); - strcpy((char *)XML_Tree->TagName[i], (const char *)filenameaux); + + // OJO Kim is it needed ? + int k = feof(fp); + + + // fscanf chokes on "\n" which is the case for the root element + char * r = fgets(buffer,1023,fp); + // int r = fscanf(fp, "%s\n",buffer); + if (r==NULL) + throw "Cannot read tag list"; + + // strlen is actually the right size, since there is a trailing '\n' + int len = strlen((const char*)buffer); + XML_Tree->TagName[i] = (unsigned char *)ucalloc(len,sizeof(char)); + strncpy((char *)XML_Tree->TagName[i], (const char *)buffer,len - 1); + s_tags+= len*sizeof(char); } // loads the flags - fread(&(XML_Tree->indexing_empty_texts), sizeof(bool), 1, fp); - fread(&(XML_Tree->initialized), sizeof(bool), 1, fp); - fread(&(XML_Tree->finished), sizeof(bool), 1, fp); + + ufread(&(XML_Tree->indexing_empty_texts), sizeof(bool), 1, fp); + ufread(&(XML_Tree->initialized), sizeof(bool), 1, fp); + ufread(&(XML_Tree->finished), sizeof(bool), 1, fp); + ufread(&(XML_Tree->disable_tc), sizeof(bool), 1, fp); - if (!(XML_Tree->indexing_empty_texts)) XML_Tree->EBVector = static_bitsequence_rrr02::load(fp); + s_tree+=sizeof(bool)*4; + if (!(XML_Tree->indexing_empty_texts)) XML_Tree->EBVector = static_bitsequence_rrr02::load(fp); + + s_tree+= XML_Tree->EBVector->size(); + // loads the tags XML_Tree->Tags = static_sequence::load(fp); + ufread(&XML_Tree->tags_blen,sizeof(uint),1,fp); + ufread(&XML_Tree->tags_len,sizeof(uint),1,fp); + XML_Tree->tags_fix = new uint[uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)]; + ufread(XML_Tree->tags_fix,sizeof(uint),uint_len(XML_Tree->tags_blen,XML_Tree->tags_len),fp); + s_tree+=2*sizeof(uint)+sizeof(uint)*uint_len(XML_Tree->tags_blen,XML_Tree->tags_len); + s_tree+= XML_Tree->Tags->size(); + + /// FIXME:UGLY tests! + uint * seq = new uint[XML_Tree->tags_len]; + for(uint i=0;itags_len;i++) + seq[i] = get_field(XML_Tree->tags_fix,XML_Tree->tags_blen,i); + cout << "Tags test: " << XML_Tree->Tags->test(seq,XML_Tree->tags_len) << endl; + delete [] seq; + /// End ugly tests + + s_text = ftell(fp); + + // loads the texts + if (!XML_Tree->disable_tc){ + XML_Tree->Text = TextCollection::Load(fp,sample_rate_text); + int sst; + int st; + ufread(&sst, sizeof(int),1,fp); + for (int i=0;iCachedText.push_back(cppstr); + free(str); + }; - // loads the texts - XML_Tree->Text->Load(fp,sample_rate_text); + } + else { + XML_Tree->Text = NULL; + } + s_text = ftell(fp) - s_text; - fclose(fp); + + + fclose(fp); + + /*std::cerr << "Tree part is " << s_tree/1024 << " Kbytes,\n" + << "with node->tagid part " << XML_Tree->Tags->size()/1024+(uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)*sizeof(uint))/1024 << "Kbytes \n" + << "size of Tag part : " << XML_Tree->Tags->length () << " elements\n" + << "sizof(unsigned int)* " << XML_Tree->Tags->length () << " = " << + sizeof(unsigned int) * XML_Tree->Tags->length () / 1024 << " Kbytes\n" + << "Tag part is " << s_tags/1024 << " Kbytes,\n" + << "Text collection is " << s_text/1024 << " Kbytes \n";*/ + XML_Tree->print_stats(); return XML_Tree; } @@ -136,7 +283,9 @@ XMLTree::~XMLTree() Tags = NULL; //Text->~TextCollection(); - delete Text; + delete TextBuilder; + TextBuilder = NULL; + delete Text; Text = NULL; initialized = false; @@ -147,7 +296,7 @@ XMLTree::~XMLTree() treeNode XMLTree::Root() { if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); + fprintf(stderr, "Root() : Error: data structure has not been constructed properly\n"); exit(1); } return root_node(Par); @@ -170,6 +319,9 @@ int XMLTree::SubtreeTags(treeNode x, TagType tag) fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } + if (x == Root()) + x = first_child(Par,x); + int s = x + 2*subtree_size(Par, x) - 1; @@ -205,7 +357,9 @@ bool XMLTree::IsChild(treeNode x, treeNode y) if (!is_ancestor(Par, x, y)) return false; return depth(Par, x) == (depth(Par, y) + 1); } - +bool XMLTree::IsFirstChild(treeNode x){ + return ((x != NULLT)&&(x==Root() || prev_sibling(Par,x) == NULLT)); +} // NumChildren(x): number of children of node x. Constant time with the data structure // of Sadakane. int XMLTree::NumChildren(treeNode x) @@ -260,7 +414,6 @@ int XMLTree::Postorder(treeNode x) fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } - return postorder_rank(Par, x); } @@ -271,8 +424,8 @@ TagType XMLTree::Tag(treeNode x) fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } - - return Tags->access(node2tagpos(x)); + + return get_field(tags_fix,tags_blen,node2tagpos(x)); //Tags->access(node2tagpos(x)); } // DocIds(x): returns the range of text identifiers that descend from node x. @@ -285,6 +438,14 @@ range XMLTree::DocIds(treeNode x) } range r; + if (x == NULLT) + { + r.min = NULLT; + r.max = NULLT; + return r; + }; + + if (indexing_empty_texts) { // faster, no rank needed r.min = x; r.max = x+2*subtree_size(Par, x)-2; @@ -340,6 +501,14 @@ treeNode XMLTree::FirstChild(treeNode x) return first_child(Par, x); } +treeNode XMLTree::LastChild(treeNode x) +{ + if (x == Root() || isleaf(Par,x) || x == NULLT) + return x; + else + return find_open(Par,find_close(Par,parent(Par,x))-1); +} + // NextSibling(x): returns the next sibling of node x, assuming it exists. treeNode XMLTree::NextSibling(treeNode x) { @@ -347,9 +516,9 @@ treeNode XMLTree::NextSibling(treeNode x) fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } - if (x == Root()) + if (x == Root() || x==NULLT) return NULLT; - + return next_sibling(Par, x); } @@ -379,7 +548,7 @@ treeNode XMLTree::TaggedChild(treeNode x, int i, TagType tag) child = first_child(Par, x); // starts at first child of node x if (child==(treeNode)-1) return NULLT; // node x is a leaf, there is no such child while (child!=(treeNode)-1) { - if (Tags->access(node2tagpos(child)) == tag) { // current child is labeled with tag of interest + if (get_field(tags_fix,tags_blen,node2tagpos(child)) /*Tags->access(node2tagpos(child))*/ == tag) { // current child is labeled with tag of interest i--; if (i==0) return child; // we have seen i children of x tagged tag, this is the one we are looking for } @@ -397,16 +566,169 @@ treeNode XMLTree::TaggedDesc(treeNode x, TagType tag) exit(1); } - int r, s; + //int r, s; treeNode y; - r = (int) Tags->rank(tag, node2tagpos(x)); - s = (int) Tags->select(tag, r+1); + if (isleaf(Par,x)) + return NULLT; + + int s = (int) Tags->select_next(tag,node2tagpos(x)); + /*r = (int) Tags->rank(tag, node2tagpos(x)); + s = (int) Tags->select(tag, r+1);*/ if (s == -1) return NULLT; // there is no such node y = tagpos2node(s); // transforms the tag position into a node position if (!is_ancestor(Par, x, y)) return NULLT; // the next node tagged tag (in preorder) is not within the subtree of x. else return y; } +treeNode XMLTree::TaggedDescOnly(treeNode x,TagType *desctags, unsigned int dtlen) +{ + + treeNode res,y; + if (isleaf(Par,x)) + return NULLT; + + res=NULLT; + for (unsigned int i = 0; i < dtlen; i ++ ) + { + y = TaggedDesc(x,desctags[i]); + res = (res == NULLT) || (( res != NULLT) && (y =! NULLT) && y < res) ? y : res; + + }; + + return res; + +} + + +treeNode XMLTree::TaggedBelow(treeNode x, TagType *childtags, unsigned int ctlen, + TagType *desctags, unsigned int dtlen) +{ + treeNode fs,y,res; + TagType tag; + + if (isleaf(Par,x)) + return NULLT; + + res = NULLT; + fs = first_child(Par,x); + while (fs != NULLT) { + tag = get_field(tags_fix,tags_blen,node2tagpos(fs)); + + /* Check for first_child */ + for (unsigned int i = 0; i < ctlen; i++) { + if (childtags[i] == tag) + return fs; + }; + + for (unsigned int i = 0; i < dtlen; i++) + if (desctags[i] == tag) + return fs; + + /* check in the descendants */ + res = NULLT; + for (unsigned int i = 0; i < dtlen; i ++ ){ + /* maybe inline by hand */ + y = TaggedDesc(fs,desctags[i]); + res = (res==NULLT || (y != NULLT) &&(y < res)) ? y : res; + }; + if (res != NULLT) + return res; + + fs = next_sibling(Par,fs); + }; + return res; + +} +treeNode XMLTree::TaggedFollOnly(treeNode x,TagType *folltags, unsigned int ftlen,treeNode root) +{ + + treeNode res,y,lim; + lim = find_close(Par,root); + res=NULLT; + for (unsigned int i = 0; i < ftlen; i ++ ) + { + y = TaggedFoll(x,folltags[i]); + res = (res == NULLT) || (( res != NULLT) && (y =! NULLT) && y < res) ? y : res; + + }; + + return res < lim ? res : NULLT; + +} + +treeNode XMLTree::TaggedDescOrFollOnly(treeNode x,TagType *folltags, unsigned int ftlen,treeNode root) +{ + + treeNode res,y,lim; + //int r,s; + lim = find_close(Par,root); + res=NULLT; + for (unsigned int i = 0; i < ftlen; i ++ ) + { + + int s = (int) Tags->select_next(folltags[i],node2tagpos(x)); + /*r = (int) Tags->rank(folltags[i], node2tagpos(x)); + s = (int) Tags->select(folltags[i], r+1);*/ + if (s == -1) + y = NULLT; // there is no such node + else { + y = tagpos2node(s); + if (y >= lim) + y = NULLT; + }; + res = (res == NULLT) || (( res != NULLT) && (y =! NULLT) && y < res) ? y : res; + + }; + + return res < lim ? res : NULLT; + +} + + +// TaggedNext(x,tag): returns the first node tagged tag with larger preorder than x +// Returns NULLT if there is none. +treeNode XMLTree::TaggedNext(treeNode x, TagType *childtags, unsigned int ctlen, + TagType *folltags, unsigned int flen,treeNode root) + { + treeNode y,old_y,lim,res; + TagType tag; + if (x == NULLT || x == Root()) + return NULLT; + + + lim = find_close(Par,root); + + res = NULLT; + + y = next_sibling(Par,x); + while (y != NULLT) { + tag = get_field(tags_fix,tags_blen,node2tagpos(y)); + for(unsigned int i = 0; i < ctlen;i++) + if (childtags[i] == tag) + return y; + + for(unsigned int i = 0; i < flen;i++) + if (folltags[i] == tag) + return y; + + res = TaggedBelow(y,NULL,0,folltags,flen); + if (res != NULLT) + return res; + + y = next_sibling(Par,y); + }; + //Found nothing in the following sibling of x. + res = NULLT; + for(unsigned int i = 0; i < flen;i++){ + y = TaggedFoll(x,folltags[i]); + res = (y!= x && (res == NULLT || (y != NULLT && y < res)))? y : res; + }; + + return res < lim ? res : NULLT; + + } + + // TaggedPrec(x,tag): returns the first node tagged tag with smaller preorder than x and not an // ancestor of x. Returns NULLT if there is none. treeNode XMLTree::TaggedPrec(treeNode x, TagType tag) @@ -432,22 +754,91 @@ treeNode XMLTree::TaggedPrec(treeNode x, TagType tag) return NULLT; // there is no such node } + // TaggedFoll(x,tag): returns the first node tagged tag with larger preorder than x and not in // the subtree of x. Returns NULLT if there is none. -treeNode XMLTree::TaggedFoll(treeNode x, TagType tag) +treeNode XMLTree::TaggedFoll(treeNode x, TagType tag) { if (!finished) { fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } - int r, s; - r = (int) Tags->rank(tag, node2tagpos(next_sibling(Par, x))-1); - s = (int) Tags->select(tag, r+1); // select returns -1 in case that there is no r+1-th tag. + //int r, s; + if (x ==NULLT || x == Root()) + return NULLT; + + int s = (int) Tags->select_next(tag,find_close(Par,x)); + /*r = (int) Tags->rank(tag, find_close(Par, x)); + s = (int) Tags->select(tag, r+1); // select returns -1 in case that there is no r+1-th tag. */ + if (s==-1) return NULLT; + else return tagpos2node(s); + } + +// TaggedFoll(x,tag): returns the first node tagged tag with larger preorder than x and not in +// the subtree of x. Returns NULLT if there is none. +treeNode XMLTree::TaggedFollBelow(treeNode x, TagType tag, treeNode root) + { + + //int r, s; + int lim = node2tagpos(find_close(Par,root)); + if (x ==NULLT || x == Root()) + return NULLT; + + int s = (int) Tags->select_next(tag,find_close(Par,x)); + /*r = (int) Tags->rank(tag,find_close(Par,x)); + s = (int) Tags->select(tag, r+1); // select returns -1 in case that there is no r+1-th tag.*/ + if (s==-1 || s >= lim) + return NULLT; + else + return tagpos2node(s); + } + + +// TaggedFollowingSibling(x,tag): returns the first node tagged tag with larger preorder than x and not in +// the subtree of x. Returns NULLT if there is none. +treeNode XMLTree::TaggedFollowingSibling(treeNode x, TagType tag) + { + if (!finished) { + fprintf(stderr, "Error: data structure has not been constructed properly\n"); + exit(1); + } + + //int r, s; + treeNode ns = next_sibling(Par,x); + + if (x == NULLT || x == Root() || ns == -1) + return NULLT; + + int s = (int) Tags->select_next(tag,node2tagpos(ns)-1); + /*r = (int) Tags->rank(tag, node2tagpos(ns)-1); + s = (int) Tags->select(tag, r+1); // select returns -1 in case that there is no r+1-th tag.*/ if (s==-1) return NULLT; else return tagpos2node(s); } + +// TaggedAncestor(x, tag): returns the closest ancestor of x tagged tag. Return +// NULLT is there is none. +treeNode XMLTree::TaggedAncestor(treeNode x, TagType tag) + { + if (!finished) { + fprintf(stderr, "Error: data structure has not been constructed properly\n"); + exit(1); + } + + if (x == NULLT || x == Root()) + return NULLT; + + treeNode s = parent(Par, x), r = Root(); + while (s != r) { + if (get_field(tags_fix,tags_blen,node2tagpos(s)) /*Tags->access(node2tagpos(s))*/ == tag) return s; + s = parent(Par, s); + } + return NULLT; + } + + // PrevText(x): returns the document identifier of the text to the left // of node x, or NULLT if x is the root node or the text is empty. // Assumes Doc ids start from 0. @@ -554,10 +945,16 @@ treeNode XMLTree::ParentNode(DocID d) fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } - + + if (d == NULLT) + return NULLT; + int s; + // OJO : Kim : I added the d+1. before that, else branch was + // EBVector->select1(d) + // and gave wrong results (I'm really poking a bear with a stick here). if (indexing_empty_texts) s = d; - else s = EBVector->select1(d); + else s = EBVector->select1(d+1); if (inspect(Par,s) == CP) // is a closing parenthesis return parent(Par, find_open(Par, s)); @@ -565,44 +962,78 @@ treeNode XMLTree::ParentNode(DocID d) return (treeNode)s; } +treeNode XMLTree::PrevNode(DocID d) + { + if (!finished) { + fprintf(stderr, "Error: data structure has not been constructed properly\n"); + exit(1); + } + + if (d == NULLT) + return NULLT; + + int s; + + if (indexing_empty_texts) s = d; + else s = EBVector->select1(d+1); + if (s == -1) + return NULLT; + + if (inspect(Par,s) == CP) // is a closing parenthesis + return find_open(Par, s); + else // is an opening parenthesis + return NULLT; + + } // OpenDocument(empty_texts): it starts the construction of the data structure for // the XML document. Parameter empty_texts indicates whether we index empty texts // in document or not. Returns a non-zero value upon success, NULLT in case of error. -int XMLTree::OpenDocument(bool empty_texts, int sample_rate_text) +int XMLTree::OpenDocument(bool empty_texts, int sample_rate_text,bool dtc) { initialized = true; finished = false; + found_attributes = false; npar = 0; parArraySize = 1; - ntagnames = 0; - + ntagnames = 4; + disable_tc = dtc; + indexing_empty_texts = empty_texts; - par_aux = (pb *)malloc(sizeof(pb)*parArraySize); - if (!par_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } + par_aux = (pb *)umalloc(sizeof(pb)*parArraySize); - tags_aux = (TagType *) malloc(sizeof(TagType)); - if (!tags_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } + tags_aux = (TagType *) umalloc(sizeof(TagType)); - TagName = NULL; + TagName = (unsigned char **) umalloc(4*sizeof(unsigned char*)); - if (!indexing_empty_texts) { - empty_texts_aux = (unsigned int *)malloc(sizeof(unsigned int)); - if (!empty_texts_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } - } + TagName[0] = (unsigned char *) umalloc(4*sizeof(unsigned char)); + + strcpy((char *) TagName[0], "<@>"); + + TagName[1] = (unsigned char *) umalloc(4*sizeof(unsigned char)); + + strcpy((char *) TagName[1], "<$>"); - Text = TextCollection::InitTextCollection((unsigned)sample_rate_text); + //OJO need to put these in the table too. + TagName[2] = (unsigned char *) umalloc(5*sizeof(unsigned char)); + + strcpy((char *) TagName[2], "/<@>"); + + TagName[3] = (unsigned char *) umalloc(5*sizeof(unsigned char)); + + strcpy((char *) TagName[3], "/<$>"); + + + if (!indexing_empty_texts) + empty_texts_aux = (unsigned int *)umalloc(sizeof(unsigned int)); + + if (disable_tc) + TextBuilder = 0; + else + TextBuilder = new TextCollectionBuilder((unsigned)sample_rate_text); + Text = 0; return 1; // indicates success in the initialization of the data structure } @@ -619,33 +1050,71 @@ int XMLTree::CloseDocument() } // closing parenthesis for the tree root - par_aux = (pb *)realloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb)))); - if (!par_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } + par_aux = (pb *)urealloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb)))); // creates the data structure for the tree topology - Par = (bp *)malloc(sizeof(bp)); + Par = (bp *)umalloc(sizeof(bp)); bp_construct(Par, npar, par_aux, OPT_DEGREE|0); // creates structure for tags - static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(20); - static_permutation_builder * pmb = new static_permutation_builder_mrrr(PERM_SAMPLE, bmb); - static_sequence_builder * ssb = new static_sequence_builder_gmr_chunk(bmb, pmb); - Tags = new static_sequence_gmr((uint *) tags_aux, (uint) npar-1,2*ntagnames, bmb, ssb); + // If we found an attribute then "<@>" is present in the tree + // if we didn't then it is not. "<$>" is never present in the tree + uint max_tag = 0; + for(uint i=0;i<(uint)npar-1;i++) + max_tag = max(max_tag,tags_aux[i]); + //max_tag++; + //tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1)); + //tags_aux[npar++] = max_tag; + //int ntagsize = found_attributes ? 2*ntagnames-1 : 2*ntagnames - 2; + int ntagsize = 2*ntagnames + 2; + + //static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(20); + //static_permutation_builder * pmb = new static_permutation_builder_mrrr(PERM_SAMPLE, bmb); + //static_sequence_builder * ssb = new static_sequence_builder_gmr_chunk(bmb, pmb); + static_bitsequence_builder * bmb = new static_bitsequence_builder_sdarray(); + alphabet_mapper *am = new alphabet_mapper_none(); + //wt_coder * wc = new wt_coder_huff((uint*)tags_aux,npar,am); + //Tags = new static_sequence_wvtree((uint*)tags_aux,npar,wc ,bmb, am); + //Tags = new static_sequence_gmr((uint *) tags_aux, (uint) npar,ntagsize, bmb, ssb); + Tags = new static_sequence_bs((uint*)tags_aux,npar,am,bmb); + + cout << "Tags test: " << Tags->test((uint*)tags_aux,npar) << endl; + + tags_blen = bits(max_tag); + tags_len = (uint)npar; + tags_fix = new uint[uint_len(tags_blen,tags_len)]; + for(uint i=0;i<(uint)npar;i++) + set_field(tags_fix,tags_blen,i,tags_aux[i]); delete bmb; - delete pmb; - delete ssb; - // makes the text collection static - Text->MakeStatic(); + //delete pmb; + //delete ssb; + + // makes the text collection static + if (!disable_tc) + { + assert(Text == 0); + assert(TextBuilder != 0); + Text = TextBuilder->InitTextCollection(); + delete TextBuilder; + TextBuilder = 0; + } + // creates the data structure marking the non-empty texts (just in the case it is necessary) - if (!indexing_empty_texts) + if (!indexing_empty_texts) { EBVector = new static_bitsequence_rrr02((uint *)empty_texts_aux,(ulong)npar,(uint)32); + free (empty_texts_aux); + empty_texts_aux = NULL; + } + + // OJO was leaked before, found by valgrind + free(tags_aux); + + tags_aux = NULL; finished = true; + print_stats(); return 1; // indicates success in the inicialization } @@ -665,24 +1134,24 @@ int XMLTree::NewOpenTag(unsigned char *tagname) // inserts a new opening parentheses in the bit sequence if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis - par_aux = (pb *)realloc(par_aux, sizeof(pb)*2*parArraySize); + par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize); parArraySize *= 2; } - if (!par_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } - setbit(par_aux,npar,OP); // marks a new opening parenthesis // transforms the tagname into a tag identifier. If the tag is new, we insert // it in the table. for (i=0; i") was called + if (i==0) + found_attributes=true; + if (i==ntagnames) { // the tag is a new one, then we insert it - TagName = (unsigned char **)realloc(TagName, sizeof(char *)*(ntagnames+1)); + TagName = (unsigned char **)urealloc(TagName, sizeof(char *)*(ntagnames+1)); if (!TagName) { fprintf(stderr, "Error: not enough memory\n"); @@ -690,19 +1159,15 @@ int XMLTree::NewOpenTag(unsigned char *tagname) } ntagnames++; - TagName[i] = (unsigned char *)malloc(sizeof(unsigned char)*(strlen((const char *)tagname)+1)); + TagName[i] = (unsigned char *)umalloc(sizeof(unsigned char)*(strlen((const char *)tagname)+1)); strcpy((char *)TagName[i], (const char *)tagname); } - tags_aux = (TagType *) realloc(tags_aux, sizeof(TagType)*(npar + 1)); - if (!tags_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } + tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1)); tags_aux[npar] = i; // inserts the new tag id within the preorder sequence of tags npar++; - + return 1; } @@ -722,41 +1187,27 @@ int XMLTree::NewClosingTag(unsigned char *tagname) // inserts a new closing parentheses in the bit sequence if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis - par_aux = (pb *)realloc(par_aux, sizeof(pb)*2*parArraySize); + par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize); parArraySize *= 2; } - if (!par_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } - setbit(par_aux,npar,CP); // marks a new closing opening parenthesis + setbit(par_aux,npar,CP); // marks a new closing parenthesis // transforms the tagname into a tag identifier. If the tag is new, we insert // it in the table. for (i=0; iInsertText(s); + TextBuilder->InsertText(s); + string cpps = (char*) s; + CachedText.push_back(cpps); return 1; // success } @@ -806,15 +1259,11 @@ int XMLTree::NewEmptyText() } if (!indexing_empty_texts) { - empty_texts_aux = (unsigned int *)realloc(empty_texts_aux, sizeof(pb)*(1+(npar-1)/(8*sizeof(pb)))); - if (!empty_texts_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } + empty_texts_aux = (unsigned int *)urealloc(empty_texts_aux, sizeof(pb)*(1+(npar-1)/(8*sizeof(pb)))); bitclean(empty_texts_aux, npar-1); // marks the empty text with a 0 in the bit vector } - else Text->InsertText(&c); // we insert the empty text just in case we index all the texts + else TextBuilder->InsertText(&c); // we insert the empty text just in case we index all the texts return 1; // success } @@ -828,7 +1277,7 @@ TagType XMLTree::GetTagId(unsigned char *tagname) // this should be changed for more efficient processing for (i=0; i= ntagnames) return NULL; // invalid tag identifier - s = (unsigned char *)malloc((strlen((const char *)TagName[tagid])+1)*sizeof(unsigned char)); + s = (unsigned char *)umalloc((strlen((const char *)TagName[tagid])+1)*sizeof(unsigned char)); strcpy((char *)s, (const char *)TagName[tagid]); return s; } +//KIM : OJO need the two following methods + +const unsigned char *XMLTree::GetTagNameByRef(TagType tagid) + { + if(tagid==(uint)-1) return NULL; + if (tagid >= ntagnames) return NULL; // invalid tag identifier + return ((const unsigned char*) TagName[tagid]); + } + + + TagType XMLTree::RegisterTag(unsigned char *tagname) { if (!finished) return NULLT; - TagType id = XMLTree::GetTagId(tagname); if (id == NULLT){ id = ntagnames; ntagnames = ntagnames + 1; - TagName = (unsigned char **) realloc(TagName,ntagnames*(sizeof(unsigned char*))); + TagName = (unsigned char **) urealloc(TagName,ntagnames*(sizeof(unsigned char*))); + TagName[id] = (unsigned char *) umalloc(sizeof(unsigned char)*strlen( (const char*) tagname)+1); strcpy((char*)TagName[id], (const char *)tagname); };