X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTree.cpp;h=a37263a16c30577984fcbf5537d1ca382bed6a9f;hb=3c8f8af6704ca98b36b503878058aa0619806dad;hp=b5e53a78f0b2e4e8c4ce546c6251cbbcd267d21c;hpb=1c3300373a43062be9bb8d23faf3c8446f9aa6c5;p=SXSI%2FXMLTree.git diff --git a/XMLTree.cpp b/XMLTree.cpp index b5e53a7..a37263a 100644 --- a/XMLTree.cpp +++ b/XMLTree.cpp @@ -1,5 +1,6 @@ #include "XMLTree.h" #include + // functions to convert tag positions to the corresponding tree node and viceversa. // These are implemented in order to be able to change the tree and Tags representations, // without affecting the code so much. @@ -16,6 +17,56 @@ inline int node2tagpos(treeNode x) { return (int)x; } + +//KIM OJO to prevent suprious "unused result" warnings + +inline void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream){ + size_t res; + res = fread(ptr,size,nmemb,stream); + if (res < nmemb) + throw "ufread I/O error"; + + return; +} + +inline void ufwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream){ + size_t res; + res = fwrite(ptr,size,nmemb,stream); + if (res < nmemb) + throw "ufwrite I/O error"; + return; +} + +// OJO to fail cleanly while doing a realloc +// if we can't realloc we are pretty much screwed anyway but +// it makes the code clearer to not have a bunch of if (!ptr) { printf("..."); exit(1); }; +inline void * urealloc(void *ptr, size_t size){ + + void * dest = realloc(ptr,size); + //don't fail if we requested size 0 + if (dest == NULL && size > 0 ) + throw std::bad_alloc(); + return dest; + +} + +inline void * ucalloc(size_t nmemb, size_t size){ + + void * dest = calloc(nmemb,size); + //don't fail if we requested size 0 + if (dest == NULL && nmemb > 0 && size > 0 ) + throw std::bad_alloc(); + return dest; + +} + +inline void * umalloc(size_t size){ + void * dest = malloc(size); + if (dest == NULL && size > 0) + throw std::bad_alloc(); + return dest; +} + // Save: saves XML tree data structure to file. void XMLTree::Save(unsigned char *filename) { @@ -35,14 +86,16 @@ void XMLTree::Save(unsigned char *filename) saveTree(Par, fp); // stores the table with tag names - fwrite(&ntagnames, sizeof(int), 1, fp); + ufwrite(&ntagnames, sizeof(int), 1, fp); for (i=0; isave(fp); @@ -50,8 +103,17 @@ void XMLTree::Save(unsigned char *filename) Tags->save(fp); // stores the texts - Text->Save(fp); - + if (!disable_tc) + Text->Save(fp); + if (!disable_tc){ + int st = CachedText.size(); + ufwrite(&st, sizeof(int),1,fp); + for (int i = 0; i< CachedText.size(); ++i){ + st = CachedText.at(i).size(); + ufwrite(&st, sizeof(int),1,fp); + ufwrite(CachedText.at(i).c_str(),sizeof(char),(1+strlen(CachedText.at(i).c_str())),fp); + }; + }; fclose(fp); } @@ -63,51 +125,111 @@ XMLTree *XMLTree::Load(unsigned char *filename, int sample_rate_text) { FILE *fp; - char filenameaux[1024]; + char buffer[1024]; XMLTree *XML_Tree; int i; - + size_t s_tree = 0; + long s_text = 0; + size_t s_tags = 0; + // first load the tree topology - sprintf(filenameaux, "%s.srx", filename); - fp = fopen(filenameaux, "r"); + sprintf(buffer, "%s.srx", filename); + fp = fopen(buffer, "r"); if (fp == NULL) { - printf("Error: cannot open file %s to load the tree structure of XML collection\n", filenameaux); + printf("Error: cannot open file %s to load the tree structure of XML collection\n", buffer); exit(1); } XML_Tree = new XMLTree(); - XML_Tree->Par = (bp *)malloc(sizeof(bp)); + XML_Tree->Par = (bp *)umalloc(sizeof(bp)); loadTree(XML_Tree->Par, fp); - + + s_tree += sizeof(bp); + // stores the table with tag names - fread(&XML_Tree->ntagnames, sizeof(int), 1, fp); + ufread(&XML_Tree->ntagnames, sizeof(int), 1, fp); + + s_tree += sizeof(int); + + XML_Tree->TagName = (unsigned char **)umalloc(XML_Tree->ntagnames*sizeof(unsigned char *)); + + s_tags += sizeof(unsigned char*)*XML_Tree->ntagnames; - XML_Tree->TagName = (unsigned char **)malloc(XML_Tree->ntagnames*sizeof(unsigned char *)); for (i=0; intagnames;i++) { - int k = feof(fp); - fscanf(fp, "%s\n",filenameaux); - XML_Tree->TagName[i] = (unsigned char *)malloc(sizeof(unsigned char)*(strlen((const char *)filenameaux)+1)); - strcpy((char *)XML_Tree->TagName[i], (const char *)filenameaux); + + // OJO Kim is it needed ? + int k = feof(fp); + + + // fscanf chokes on "\n" which is the case for the root element + char * r = fgets(buffer,1023,fp); + // int r = fscanf(fp, "%s\n",buffer); + if (r==NULL) + throw "Cannot read tag list"; + + // strlen is actually the right size, since there is a trailing '\n' + int len = strlen((const char*)buffer); + XML_Tree->TagName[i] = (unsigned char *)ucalloc(len,sizeof(char)); + strncpy((char *)XML_Tree->TagName[i], (const char *)buffer,len - 1); + s_tags+= len*sizeof(char); } // loads the flags - fread(&(XML_Tree->indexing_empty_texts), sizeof(bool), 1, fp); - fread(&(XML_Tree->initialized), sizeof(bool), 1, fp); - fread(&(XML_Tree->finished), sizeof(bool), 1, fp); + + ufread(&(XML_Tree->indexing_empty_texts), sizeof(bool), 1, fp); + ufread(&(XML_Tree->initialized), sizeof(bool), 1, fp); + ufread(&(XML_Tree->finished), sizeof(bool), 1, fp); + ufread(&(XML_Tree->disable_tc), sizeof(bool), 1, fp); - if (!(XML_Tree->indexing_empty_texts)) XML_Tree->EBVector = static_bitsequence_rrr02::load(fp); + s_tree+=sizeof(bool)*4; + if (!(XML_Tree->indexing_empty_texts)) XML_Tree->EBVector = static_bitsequence_rrr02::load(fp); + + s_tree+= XML_Tree->EBVector->size(); + // loads the tags - XML_Tree->Tags = static_sequence_wvtree::load(fp); + XML_Tree->Tags = static_sequence::load(fp); + s_tree+= XML_Tree->Tags->size(); + + s_text = ftell(fp); + + // loads the texts + if (!XML_Tree->disable_tc){ + XML_Tree->Text = TextCollection::InitTextCollection(sample_rate_text); + XML_Tree->Text->Load(fp,sample_rate_text); + int sst; + int st; + ufread(&sst, sizeof(int),1,fp); + for (int i=0;iCachedText.push_back(cppstr); + free(str); + }; - // loads the texts - XML_Tree->Text->Load(fp,sample_rate_text); + } + else + XML_Tree->Text = NULL; + + s_text = ftell(fp) - s_text; - fclose(fp); + + + fclose(fp); + + std::cerr << "Tree part is " << s_tree/1024 << " Kbytes,\n" + << "with node->tagid part " << XML_Tree->Tags->size()/1024 << "Kbytes \n" + << "size of Tag part : " << XML_Tree->Tags->length () << " elements\n" + << "sizof(unsigned int)* " << XML_Tree->Tags->length () << " = " << + sizeof(unsigned int) * XML_Tree->Tags->length () / 1024 << " Kbytes\n" + << "Tag part is " << s_tags/1024 << " Kbytes,\n" + << "Text collection is " << s_text/1024 << " Kbytes \n"; return XML_Tree; } @@ -126,16 +248,16 @@ XMLTree::~XMLTree() free(TagName); if (!indexing_empty_texts) { - EBVector->~static_bitsequence_rrr02(); + //EBVector->~static_bitsequence_rrr02(); delete EBVector; EBVector = NULL; } - Tags->~static_sequence_wvtree(); + //Tags->~static_sequence_wvtree(); delete Tags; Tags = NULL; - Text->~TextCollection(); + //Text->~TextCollection(); delete Text; Text = NULL; @@ -147,7 +269,7 @@ XMLTree::~XMLTree() treeNode XMLTree::Root() { if (!finished) { - fprintf(stderr, "Error: data structure has not been constructed properly\n"); + fprintf(stderr, "Root() : Error: data structure has not been constructed properly\n"); exit(1); } return root_node(Par); @@ -170,6 +292,9 @@ int XMLTree::SubtreeTags(treeNode x, TagType tag) fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } + if (x == Root()) + x = first_child(Par,x); + int s = x + 2*subtree_size(Par, x) - 1; @@ -260,7 +385,6 @@ int XMLTree::Postorder(treeNode x) fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } - return postorder_rank(Par, x); } @@ -271,7 +395,7 @@ TagType XMLTree::Tag(treeNode x) fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } - + return Tags->access(node2tagpos(x)); } @@ -311,8 +435,10 @@ treeNode XMLTree::Parent(treeNode x) fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } - - return parent(Par, x); + if (x == Root()) + return NULLT; + else + return parent(Par, x); } // Child(x,i): returns the i-th child of node x, assuming it exists. @@ -345,7 +471,9 @@ treeNode XMLTree::NextSibling(treeNode x) fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } - + if (x == Root()) + return NULLT; + return next_sibling(Par, x); } @@ -395,6 +523,9 @@ treeNode XMLTree::TaggedDesc(treeNode x, TagType tag) int r, s; treeNode y; + if (isleaf(Par,x)) + return NULLT; + r = (int) Tags->rank(tag, node2tagpos(x)); s = (int) Tags->select(tag, r+1); if (s == -1) return NULLT; // there is no such node @@ -403,6 +534,28 @@ treeNode XMLTree::TaggedDesc(treeNode x, TagType tag) else return y; } +// TaggedNext(x,tag): returns the first node tagged tag with larger preorder than x +// Returns NULLT if there is none. +treeNode XMLTree::TaggedNext(treeNode x, TagType tag) + { + if (!finished) { + fprintf(stderr, "Error: data structure has not been constructed properly\n"); + exit(1); + } + + int r, s; + treeNode y; + if (x==NULLT) + return NULLT; + + r = (int) Tags->rank(tag, node2tagpos(x)); + s = (int) Tags->select(tag, r+1); + if (s == -1) return NULLT; // there is no such node + y = tagpos2node(s); // transforms the tag position into a node position + return (y<=x ? NULLT : y); + } + + // TaggedPrec(x,tag): returns the first node tagged tag with smaller preorder than x and not an // ancestor of x. Returns NULLT if there is none. treeNode XMLTree::TaggedPrec(treeNode x, TagType tag) @@ -428,9 +581,30 @@ treeNode XMLTree::TaggedPrec(treeNode x, TagType tag) return NULLT; // there is no such node } + // TaggedFoll(x,tag): returns the first node tagged tag with larger preorder than x and not in // the subtree of x. Returns NULLT if there is none. -treeNode XMLTree::TaggedFoll(treeNode x, TagType tag) +treeNode XMLTree::TaggedFoll(treeNode x, TagType tag) + { + if (!finished) { + fprintf(stderr, "Error: data structure has not been constructed properly\n"); + exit(1); + } + + int r, s; + if (x ==NULLT || x == Root()) + return NULLT; + + r = (int) Tags->rank(tag, find_close(Par, x)); + s = (int) Tags->select(tag, r+1); // select returns -1 in case that there is no r+1-th tag. + if (s==-1) return NULLT; + else return tagpos2node(s); + } + + +// TaggedFollowingSibling(x,tag): returns the first node tagged tag with larger preorder than x and not in +// the subtree of x. Returns NULLT if there is none. +treeNode XMLTree::TaggedFollowingSibling(treeNode x, TagType tag) { if (!finished) { fprintf(stderr, "Error: data structure has not been constructed properly\n"); @@ -438,12 +612,39 @@ treeNode XMLTree::TaggedFoll(treeNode x, TagType tag) } int r, s; - r = (int) Tags->rank(tag, node2tagpos(next_sibling(Par, x))-1); + treeNode ns = next_sibling(Par,x); + + if (x == NULLT || x == Root() || ns == -1) + return NULLT; + + r = (int) Tags->rank(tag, node2tagpos(ns)-1); s = (int) Tags->select(tag, r+1); // select returns -1 in case that there is no r+1-th tag. if (s==-1) return NULLT; else return tagpos2node(s); } + +// TaggedAncestor(x, tag): returns the closest ancestor of x tagged tag. Return +// NULLT is there is none. +treeNode XMLTree::TaggedAncestor(treeNode x, TagType tag) + { + if (!finished) { + fprintf(stderr, "Error: data structure has not been constructed properly\n"); + exit(1); + } + + if (x == NULLT || x == Root()) + return NULLT; + + treeNode s = parent(Par, x), r = Root(); + while (s != r) { + if (Tags->access(node2tagpos(s)) == tag) return s; + s = parent(Par, s); + } + return NULLT; + } + + // PrevText(x): returns the document identifier of the text to the left // of node x, or NULLT if x is the root node or the text is empty. // Assumes Doc ids start from 0. @@ -550,10 +751,16 @@ treeNode XMLTree::ParentNode(DocID d) fprintf(stderr, "Error: data structure has not been constructed properly\n"); exit(1); } - + + if (d == NULLT) + return NULLT; + int s; + // OJO : Kim : I added the d+1. before that, else branch was + // EBVector->select1(d) + // and gave wrong results (I'm really poking a bear with a stick here). if (indexing_empty_texts) s = d; - else s = EBVector->select1(d); + else s = EBVector->select1(d+1); if (inspect(Par,s) == CP) // is a closing parenthesis return parent(Par, find_open(Par, s)); @@ -566,38 +773,37 @@ treeNode XMLTree::ParentNode(DocID d) // OpenDocument(empty_texts): it starts the construction of the data structure for // the XML document. Parameter empty_texts indicates whether we index empty texts // in document or not. Returns a non-zero value upon success, NULLT in case of error. -int XMLTree::OpenDocument(bool empty_texts, int sample_rate_text) +int XMLTree::OpenDocument(bool empty_texts, int sample_rate_text,bool dtc) { initialized = true; finished = false; + found_attributes = false; npar = 0; - ntagnames = 0; + parArraySize = 1; + ntagnames = 2; + disable_tc = dtc; indexing_empty_texts = empty_texts; - par_aux = (pb *)malloc(sizeof(pb)); - if (!par_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } - setbit(par_aux,npar,OP); // marks a new opening parenthesis for the tree root - npar++; + par_aux = (pb *)umalloc(sizeof(pb)*parArraySize); - tags_aux = (TagType *) malloc(sizeof(TagType)); - if (!tags_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } + tags_aux = (TagType *) umalloc(sizeof(TagType)); - TagName = NULL; + TagName = (unsigned char **) umalloc(2*sizeof(unsigned char*)); - if (!indexing_empty_texts) { - empty_texts_aux = (unsigned int *)malloc(sizeof(unsigned int)); - if (!empty_texts_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } - } + TagName[0] = (unsigned char *) umalloc(4*sizeof(unsigned char)); + + strcpy((char *) TagName[0], "<@>"); + + TagName[1] = (unsigned char *) umalloc(4*sizeof(unsigned char)); + + strcpy((char *) TagName[1], "<$>"); + + + if (!indexing_empty_texts) + empty_texts_aux = (unsigned int *)umalloc(sizeof(unsigned int)); + + Text = TextCollection::InitTextCollection((unsigned)sample_rate_text); @@ -616,29 +822,41 @@ int XMLTree::CloseDocument() } // closing parenthesis for the tree root - par_aux = (pb *)realloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb)))); - if (!par_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } - setbit(par_aux,npar,CP); - npar++; + par_aux = (pb *)urealloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb)))); // creates the data structure for the tree topology - Par = (bp *)malloc(sizeof(bp)); + Par = (bp *)umalloc(sizeof(bp)); bp_construct(Par, npar, par_aux, OPT_DEGREE|0); // creates structure for tags - alphabet_mapper * am = new alphabet_mapper_none(); - static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); - wt_coder * wtc = new wt_coder_huff((uint *)tags_aux,npar-1,am); - Tags = new static_sequence_wvtree((uint *) tags_aux, (uint) npar-1, wtc, bmb, am); + static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(20); + static_permutation_builder * pmb = new static_permutation_builder_mrrr(PERM_SAMPLE, bmb); + static_sequence_builder * ssb = new static_sequence_builder_gmr_chunk(bmb, pmb); + + // If we found an attribute then "<@>" is present in the tree + // if we didn't then it is not. "<$>" is never present in the tree + int ntagsize = found_attributes ? 2*ntagnames-1 : 2*ntagnames - 2; + + Tags = new static_sequence_gmr((uint *) tags_aux, (uint) npar-1,ntagsize, bmb, ssb); + + delete bmb; + delete pmb; + delete ssb; // makes the text collection static - Text->MakeStatic(); + if (!disable_tc) + Text->MakeStatic(); // creates the data structure marking the non-empty texts (just in the case it is necessary) - if (!indexing_empty_texts) + if (!indexing_empty_texts) { EBVector = new static_bitsequence_rrr02((uint *)empty_texts_aux,(ulong)npar,(uint)32); + free (empty_texts_aux); + empty_texts_aux = NULL; + } + + // OJO was leaked before, found by valgrind + free(tags_aux); + + tags_aux = NULL; finished = true; @@ -659,21 +877,25 @@ int XMLTree::NewOpenTag(unsigned char *tagname) } // inserts a new opening parentheses in the bit sequence - par_aux = (pb *)realloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb)))); - if (!par_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; + if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis + par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize); + parArraySize *= 2; } - + setbit(par_aux,npar,OP); // marks a new opening parenthesis // transforms the tagname into a tag identifier. If the tag is new, we insert // it in the table. for (i=0; i") was called + if (i==0) + found_attributes=true; + if (i==ntagnames) { // the tag is a new one, then we insert it - TagName = (unsigned char **)realloc(TagName, sizeof(char *)*(ntagnames+1)); + TagName = (unsigned char **)urealloc(TagName, sizeof(char *)*(ntagnames+1)); if (!TagName) { fprintf(stderr, "Error: not enough memory\n"); @@ -681,19 +903,15 @@ int XMLTree::NewOpenTag(unsigned char *tagname) } ntagnames++; - TagName[i] = (unsigned char *)malloc(sizeof(unsigned char)*(strlen((const char *)tagname)+1)); + TagName[i] = (unsigned char *)umalloc(sizeof(unsigned char)*(strlen((const char *)tagname)+1)); strcpy((char *)TagName[i], (const char *)tagname); } - tags_aux = (TagType *) realloc(tags_aux, sizeof(TagType)*(npar + 1)); - if (!tags_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } + tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1)); tags_aux[npar] = i; // inserts the new tag id within the preorder sequence of tags npar++; - + return 1; } @@ -712,38 +930,28 @@ int XMLTree::NewClosingTag(unsigned char *tagname) } // inserts a new closing parentheses in the bit sequence - par_aux = (pb *)realloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb)))); - if (!par_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; + if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis + par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize); + parArraySize *= 2; } - setbit(par_aux,npar,CP); // marks a new closing opening parenthesis + + setbit(par_aux,npar,CP); // marks a new closing parenthesis // transforms the tagname into a tag identifier. If the tag is new, we insert // it in the table. for (i=0; iInsertText(s); + string cpps = (char*) s; + CachedText.push_back(cpps); return 1; // success } @@ -793,11 +1003,7 @@ int XMLTree::NewEmptyText() } if (!indexing_empty_texts) { - empty_texts_aux = (unsigned int *)realloc(empty_texts_aux, sizeof(pb)*(1+(npar-1)/(8*sizeof(pb)))); - if (!empty_texts_aux) { - fprintf(stderr, "Error: not enough memory\n"); - return NULLT; - } + empty_texts_aux = (unsigned int *)urealloc(empty_texts_aux, sizeof(pb)*(1+(npar-1)/(8*sizeof(pb)))); bitclean(empty_texts_aux, npar-1); // marks the empty text with a 0 in the bit vector } @@ -827,10 +1033,35 @@ unsigned char *XMLTree::GetTagName(TagType tagid) unsigned char *s; if (tagid >= ntagnames) return NULL; // invalid tag identifier - s = (unsigned char *)malloc((strlen((const char *)TagName[tagid])+1)*sizeof(unsigned char)); + s = (unsigned char *)umalloc((strlen((const char *)TagName[tagid])+1)*sizeof(unsigned char)); strcpy((char *)s, (const char *)TagName[tagid]); return s; } +//KIM : OJO need the two following methods + +const unsigned char *XMLTree::GetTagNameByRef(TagType tagid) + { + if (tagid >= ntagnames) return NULL; // invalid tag identifier + return ((const unsigned char*) TagName[tagid]); + } + + +TagType XMLTree::RegisterTag(unsigned char *tagname) +{ + if (!finished) + return NULLT; + + TagType id = XMLTree::GetTagId(tagname); + if (id == NULLT){ + id = ntagnames; + ntagnames = ntagnames + 1; + TagName = (unsigned char **) urealloc(TagName,ntagnames*(sizeof(unsigned char*))); + TagName[id] = (unsigned char *) umalloc(sizeof(unsigned char)*strlen( (const char*) tagname)+1); + strcpy((char*)TagName[id], (const char *)tagname); + }; + + return id; +}