From c6266d8fd1872fad45b18d3d554410d080b65099 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Kim=20Nguy=E1=BB=85n?= Date: Wed, 4 Apr 2012 18:59:58 +0200 Subject: [PATCH] Huge refactoring to remove diego' C/C++ chimera code. --- Makefile | 10 +- XMLTree.cpp | 778 ------------------------------------------- XMLTree.h | 774 ------------------------------------------ XMLTreeBuilder.cpp | 190 ----------- XMLTreeBuilder.h | 116 ------- bit-vector.cpp | 223 +++++++++++++ bit-vector.hpp | 148 ++++++++ common.h | 74 ---- timings.h | 73 ---- xml-tree-builder.cpp | 140 ++++++++ xml-tree-builder.hpp | 48 +++ xml-tree-inc.hpp | 225 +++++++++++++ xml-tree.cpp | 580 ++++++++++++++++++++++++++++++++ xml-tree.hpp | 159 +++++++++ 14 files changed, 1529 insertions(+), 2009 deletions(-) delete mode 100644 XMLTree.cpp delete mode 100644 XMLTree.h delete mode 100644 XMLTreeBuilder.cpp delete mode 100644 XMLTreeBuilder.h create mode 100644 bit-vector.cpp create mode 100644 bit-vector.hpp delete mode 100644 common.h delete mode 100644 timings.h create mode 100644 xml-tree-builder.cpp create mode 100644 xml-tree-builder.hpp create mode 100644 xml-tree-inc.hpp create mode 100644 xml-tree.cpp create mode 100644 xml-tree.hpp diff --git a/Makefile b/Makefile index 36637de..1e054aa 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -INC_FLAGS=-I.. -I../libcds/includes/ +INC_FLAGS=-I.. -I../libcds/includes/ -I. ifeq ($(VERBOSE), true) @@ -10,7 +10,7 @@ endif ifeq ($(DEBUG), true) OPT_FLAGS=-O0 -g $(POPCOUNT_FLAG) -static else - OPT_FLAGS=-O4 $(POPCOUNT_FLAG) -static -flto + OPT_FLAGS=-O3 $(POPCOUNT_FLAG) -static -flto endif ifeq ($(PROFILE), true) @@ -23,8 +23,10 @@ endif CXXFLAGS=-std=c++0x $(INC_FLAGS) $(OPT_FLAGS) $(PROF_FLAGS) CXX=g++ -OBJECTS_XMLTREE=XMLTree.o XMLTreeBuilder.o -XMLTREE_A=libXMLTree.a +#OBJECTS_XMLTREE=XMLTree.o XMLTreeBuilder.o +#XMLTREE_A=libXMLTree.a +OBJECTS_XMLTREE=bit-vector.o xml-tree.o xml-tree-builder.o +XMLTREE_A=libxml-tree.a diff --git a/XMLTree.cpp b/XMLTree.cpp deleted file mode 100644 index a7ee254..0000000 --- a/XMLTree.cpp +++ /dev/null @@ -1,778 +0,0 @@ -#include "common.h" -#include "XMLTree.h" -#include "timings.h" -#include -using std::cout; -using std::endl; -using std::min; -using std::string; - -// functions to convert tag positions to the corresponding tree node and viceversa. -// These are implemented in order to be able to change the tree and Tags representations, -// without affecting the code so much. -// Current implementation corresponds to balanced-parentheses representation for -// the tree, and storing 2 tags per tree node (opening and closing tags). - - -static int bits8 (int t ) { - int r = bits(t); - if (r <= 8) - return 8; - else if (r <= 16) - return 16; - else - return r; -} - - - -static treeNode fast_sibling(bp* Par,treeNode x,TagType tag){ - - if (tag == PCDATA_TAG_ID){ - x = x+2; - return bp_inspect(Par,x)==OP ? x : NULLT; - } else return bp_next_sibling(Par,x); - -} - - - - -inline uint get_field_no_power(uint *A, uint len, uint index) { - - register uint i=index*len/W, j=index*len-W*i; - return (j+len <= W) ? (A[i] << (W-j-len)) >> (W-len) : (A[i] >> j) | (A[i+1] << (WW-j-len)) >> (W-len); - -} - -static uint fast_get_field(uint* A,int len, int idx) -{ - uint f1, f2; - switch (len) { - case 8: - return (uint) (((uchar*)A)[idx]); - case 16: - f2 = ((unsigned short*)A)[idx]; - f1 = ((unsigned short*)A)[idx+1]; - return (f1 << 16) + f2; - default: - return get_field_no_power (A,len,idx); - }; - -} - - - - -XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdMap * const tim, - uint *empty_texts_bmp, TagType *tags, - TextCollectionBuilder * const TCB, bool dis_tc, - TextCollectionBuilder::index_type_t _index_type ) - { - buffer = 0; - print_stack = 0; - // creates the data structure for the tree topology - STARTTIMER(); - Par = bp_construct(npar, (pb*) par, OPT_DEGREE|0); - STOPTIMER(Building); - PRINTTIME("Building parenthesis struct", Building); - STARTTIMER(); - - - // creates structure for tags - - TagName = (vector*)TN; - tIdMap = (TagIdMap *) tim; - - uint max_tag = TN->size() - 1; - - static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray(); - alphabet_mapper *am = new alphabet_mapper_none(); - Tags = new static_sequence_bs((uint*)tags,npar,am,bmb); - - //cout << "Tags test: " << Tags->test((uint*)tags,npar) << endl; - - //Ensures that for small tag numbers, we are on an 8bit boundary. - //Makes tag access way faster with negligeable waste of space. - tags_blen = bits8(max_tag); - std::cerr << "Tags blen is " << tags_blen << "\n"; - tags_len = (uint)npar; - tags_fix = new uint[uint_len(tags_blen,tags_len)]; - for(uint i=0;i<(uint)npar;i++) - set_field(tags_fix,tags_blen,i,tags[i]); - delete bmb; - free(tags); - tags = NULL; - - STOPTIMER(Building); - PRINTTIME("Building Tag Structure", Building); - - EBVector = new static_bitsequence_rrr02(empty_texts_bmp,npar,32); - free(empty_texts_bmp); - empty_texts_bmp = NULL; - - - disable_tc = dis_tc; - text_index_type = _index_type; - if (!disable_tc) { - assert(TCB != 0); - STARTTIMER(); - Text = TCB->InitTextCollection(); - delete TCB; - STOPTIMER(Building); - PRINTTIME("Building TextCollection", Building); - - } else { - Text = NULL; - } - - std::cerr << "Number of distinct tags " << TagName->size() << "\n"; - //std::cerr.flush(); - } - - -// ~XMLTree: frees memory of XML tree. -XMLTree::~XMLTree() - { - int i; - - bp_delete(Par); - Par = NULL; - - delete tIdMap; - tIdMap = NULL; - - delete TagName; - TagName = NULL; - - delete Tags; - Tags = NULL; - - delete Text; - Text = NULL; - - delete EBVector; - EBVector = NULL; - - } - - -void XMLTree::print_stats() - { - uint total_space = Tags->size()+sizeof(static_sequence*); - total_space += sizeof(uint*)+sizeof(uint)*(2+uint_len(tags_blen,tags_len)); - cout << "Space usage for XMLTree:" << endl - << " - tags static_sequence: " << Tags->size()+sizeof(static_sequence*) << endl - << " - tags access array: " << sizeof(uint*)+sizeof(uint)*(2+uint_len(tags_blen,tags_len)) << endl - << " ... add Diego structures ... " << endl - << " *total* " << total_space << endl; - } - -// Save: saves XML tree data structure to file. -void XMLTree::Save(int fd, char * name) - { - FILE *fp; - int i; - off_t pos = lseek(fd, 0, SEEK_CUR); - int fd2 = dup(fd); - fp = fdopen(fd2, "w"); - fseek(fp, pos, SEEK_SET); - // first stores the tree topology - saveTree(Par, fp); - - // stores the table with tag names - int ntags = TagName->size(); - - ufwrite(&ntags, sizeof(int), 1, fp); - for (i = 0; iat(i).c_str()); - - - // stores the tags - Tags->save(fp); - ufwrite(&tags_blen,sizeof(uint),1,fp); - ufwrite(&tags_len,sizeof(uint),1,fp); - ufwrite(tags_fix,sizeof(uint),uint_len(tags_blen,tags_len),fp); - - // flags - ufwrite(&disable_tc, sizeof(bool),1,fp); - - //text positions - EBVector->save(fp); - std::cerr << "TC Index position: " << ftell(fp) << std::endl; - // stores the texts - if (!disable_tc) { - std::cerr << "Writing " << sizeof(TextCollectionBuilder::index_type_t) << " bytes\n" << std::endl; - ufwrite(&text_index_type, sizeof(TextCollectionBuilder::index_type_t), 1, fp); - - - string file(name); - switch (text_index_type){ - case TextCollectionBuilder::index_type_default: - file.append("_default"); - break; - case TextCollectionBuilder::index_type_swcsa: - file.append("_swcsa"); - break; - case TextCollectionBuilder::index_type_rlcsa: - file.append("_rlcsa"); - break; - }; - - Text->Save(fp, file.c_str()); - - - } - fflush(fp); - fclose(fp); - } - -// Load: loads XML tree data structure from file. Returns -// a pointer to the loaded data structure -XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor, char * name) - { - - FILE *fp; - char buffer[1024]; - XMLTree *XML_Tree; - int i; - - buffer[1023] = '\0'; - - fp = fdopen(fd, "r"); - - XML_Tree = new XMLTree(); - STARTTIMER(); - // Load the tree structure - XML_Tree->Par = loadTree(fp); - STOPTIMER(Loading); - PRINTTIME("Loading parenthesis struct", Loading); - STARTTIMER(); - - XML_Tree->TagName = new std::vector(); - XML_Tree->tIdMap = new std::unordered_map(); - std::string s; - int ntags; - - // Load the tag names - ufread(&ntags, sizeof(int), 1, fp); - - for (i=0; iTagName->push_back(s); - XML_Tree->tIdMap->insert(std::make_pair(s,i)); - - }; - STOPTIMER(Loading); - PRINTTIME("Loading tag names struct", Loading); - STARTTIMER(); - - // loads the tag structure - XML_Tree->Tags = static_sequence::load(fp); - ufread(&XML_Tree->tags_blen,sizeof(uint),1,fp); - std::cerr << "tags_blen is "<< XML_Tree->tags_blen <<"\n"; - ufread(&XML_Tree->tags_len,sizeof(uint),1,fp); - XML_Tree->tags_fix = new uint[uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)]; - ufread(XML_Tree->tags_fix,sizeof(uint),uint_len(XML_Tree->tags_blen,XML_Tree->tags_len),fp); - - // TODO ask francisco about this - /// FIXME:UGLY tests! - //uint * seq = new uint[XML_Tree->tags_len]; - //for(uint i=0;itags_len;i++) - // seq[i] = get_field(XML_Tree->tags_fix,XML_Tree->tags_blen,i); - //cout << "Tags test: " << XML_Tree->Tags->test(seq,XML_Tree->tags_len) << endl; - //XML_Tree->Tags->test(seq,XML_Tree->tags_len); - //delete [] seq; - /// End ugly tests - - STOPTIMER(Loading); - std::cerr << (uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)*sizeof(uint))/(1024*1024) << " MB for tag sequence" << std::endl; - PRINTTIME("Loading tag struct", Loading); - STARTTIMER(); - - // loads the flags - - ufread(&(XML_Tree->disable_tc), sizeof(bool), 1, fp); - if (load_tc) { - XML_Tree->EBVector = static_bitsequence_rrr02::load(fp); - - STOPTIMER(Loading); - PRINTTIME("Loading text bitvector struct", Loading); - STARTTIMER(); - std::cerr << "TC Load Index position: " << ftell(fp) << std::endl; - // Not used - // loads the texts - if (!XML_Tree->disable_tc){ - ufread(&(XML_Tree->text_index_type), - sizeof(TextCollectionBuilder::index_type_t), 1, fp); - string file(name); - switch (XML_Tree->text_index_type){ - case TextCollectionBuilder::index_type_default: - file.append("_default"); - break; - case TextCollectionBuilder::index_type_swcsa: - file.append("_swcsa"); - break; - case TextCollectionBuilder::index_type_rlcsa: - file.append("_rlcsa"); - break; - }; - - - XML_Tree->Text = TextCollection::Load(fp, file.c_str(), TextCollection::index_mode_default, sample_factor); - - } - else XML_Tree->Text = NULL; - STOPTIMER(Loading); - PRINTTIME("Loading TextCollection", Loading); - STARTTIMER(); - } - else { - XML_Tree->EBVector = NULL; - XML_Tree->Text = NULL; - XML_Tree->disable_tc = true; - }; - - - return XML_Tree; - } - - - - -int XMLTree::SubtreeElements(treeNode x) - { - - int size = bp_subtree_size(Par, x); - if (x == Root()){ - x = bp_first_child(Par,x); - size = size - 1; - }; - - int s = x + 2*size - 1; - int ntext = Tags->rank(PCDATA_TAG_ID, s) - Tags->rank(PCDATA_TAG_ID, node2tagpos(x)-1); - size = size - ntext; - treeNode fin = bp_find_close(Par,x); - treeNode y = Tags->select_next(ATTRIBUTE_TAG_ID,node2tagpos(x)); - while (y != NULLT && y < fin){ - size -= SubtreeSize(y); - y = Tags->select_next(ATTRIBUTE_TAG_ID, node2tagpos(y)); - }; - return size; - } - -// IsLeaf(x): returns whether node x is leaf or not. In the succinct representation -// this is just a bit inspection. -bool XMLTree::IsLeaf(treeNode x) - { - NULLT_IF(x==NULLT); - return bp_isleaf(Par, x); - } - -// IsAncestor(x,y): returns whether node x is ancestor of node y. -bool XMLTree::IsAncestor(treeNode x, treeNode y) - { - return bp_is_ancestor(Par, x, y); - } - -// IsChild(x,y): returns whether node x is parent of node y. -bool XMLTree::IsChild(treeNode x, treeNode y) - { - if (!bp_is_ancestor(Par, x, y)) return false; - return bp_depth(Par, x) == (bp_depth(Par, y) + 1); - } - - -// NumChildren(x): number of children of node x. Constant time with the data structure -// of Sadakane. -int XMLTree::NumChildren(treeNode x) - { - return bp_degree(Par, x); - } - -// ChildNumber(x): returns i if node x is the i-th children of its parent. -int XMLTree::ChildNumber(treeNode x) - { - return bp_child_rank(Par, x); - } - -// Depth(x): depth of node x, a simple binary rank on the parentheses sequence. -int XMLTree::Depth(treeNode x) - { - return bp_depth(Par, x); - } - -// Preorder(x): returns the preorder number of node x, just counting the tree -// nodes (i.e., tags, it disregards the texts in the tree). -int XMLTree::Preorder(treeNode x) - { - return bp_preorder_rank(Par, x); - } - -// Postorder(x): returns the postorder number of node x, just counting the tree -// nodes (i.e., tags, it disregards the texts in the tree). -int XMLTree::Postorder(treeNode x) - { - return bp_postorder_rank(Par, x); - } - -// DocIds(x): returns the range of text identifiers that descend from node x. -// returns {NULLT, NULLT} when there are no texts descending from x. -range XMLTree::DocIds(treeNode x) - { - range r; - if (x == NULLT) { - r.min = NULLT; - r.max = NULLT; - return r; - }; - int min = EBVector->rank1(x-1); - int max = EBVector->rank1(x+2*bp_subtree_size(Par, x)-2); - if (min==max) { // range is empty, no texts within the subtree of x - r.min = NULLT; - r.max = NULLT; - } - else { // the range is non-empty, there are texts within the subtree of x - r.min = min+1; - r.max = max; - } - return r; - } - - -// Child(x,i): returns the i-th child of node x, assuming it exists. -treeNode XMLTree::Child(treeNode x, int i) -{ - if (i <= OPTD) return bp_naive_child(Par, x, i); - else return bp_child(Par, x, i); -} - - -treeNode XMLTree::SelectChild(treeNode x, TagIdSet *tags) -{ - - NULLT_IF(x==NULLT || bp_isleaf(Par,x)); - int i; - treeNode child = bp_first_child(Par, x); - TagType t; - while (child != NULLT) { - t = Tag(child); - if (tags->find(t) != tags->end()) return child; - child = fast_sibling(Par, child,t); - } - return NULLT; -} - - -treeNode XMLTree::SelectFollowingSibling(treeNode x, TagIdSet *tags) -{ - - NULLT_IF(x==NULLT); - int i; - TagType t; - treeNode sibling = bp_next_sibling(Par, x); - while (sibling != NULLT) { - t = Tag(sibling); - if (tags->find(t) != tags->end()) return sibling; - sibling = fast_sibling(Par, sibling,t); - } - return NULLT; - } - - -// TaggedPrec(x,tag): returns the first node tagged tag with smaller preorder than x and not an -// ancestor of x. Returns NULLT if there is none. -treeNode XMLTree::TaggedPreceding(treeNode x, TagType tag) - { - int r, s; - treeNode node_s, root; - r = (int)Tags->rank(tag, node2tagpos(x)-1); - if (r==0) return NULLT; // there is no such node. - s = (int)Tags->select(tag, r); - root = bp_root_node(Par); - node_s = tagpos2node(s); - while (bp_is_ancestor(Par, node_s, x) && (node_s!=root)) { // the one that we found is an ancestor of x - r--; - if (r==0) return NULLT; // there is no such node - s = (int)Tags->select(tag, r); // we should use select_prev instead when provided - node_s = tagpos2node(s); - } - return NULLT; // there is no such node - } - - -// TaggedFoll(x,tag): returns the first node tagged tag with larger preorder than x and not in -// the subtree of x. Returns NULLT if there is none. -treeNode XMLTree::TaggedFollowing(treeNode x, TagType tag) - { - NULLT_IF (x ==NULLT || x == Root()); - return tagpos2node(Tags->select_next(tag, bp_find_close(Par, x))); - - } - - -/* Here we inline TaggedFoll to find the min globally, and only at the end - we check if the min is below the context node */ -treeNode XMLTree::SelectFollowingBelow(treeNode x, TagIdSet *tags, treeNode ancestor) - { - - NULLT_IF(x==NULLT || x==Root()); - - treeNode close = bp_find_close(Par,x); - treeNode ns = close+1; - if ( (bp_inspect(Par,ns) == OP) && (tags->find(Tag(ns)) != tags->end())) - return ns; - - int i; - treeNode min = NULLT; - treeNode aux; - - - TagIdSet::const_iterator tagit; - for (tagit = tags->begin(); tagit != tags->end(); ++tagit) { - - aux = tagpos2node(Tags->select_next(*tagit, close)); - if (aux == NULLT) continue; - if ((min == NULLT) || (aux < min)) min = aux; - }; - - // found the smallest node in preorder which is after x. - // if ctx is the root node, just return what we found. - - if (ancestor == Root() || min == NULLT || min < bp_find_close(Par, ancestor)) return min; - else return NULLT; - - } -// TaggedAncestor(x, tag): returns the closest ancestor of x tagged tag. Return -// NULLT is there is none. -treeNode XMLTree::TaggedAncestor(treeNode x, TagType tag) - { - if (x == NULLT || x == Root()) - return NULLT; - - treeNode s = bp_parent(Par, x), r = Root(); - while (s != r) { - if (Tag(s) == tag) return s; - s = bp_parent(Par, s); - } - return NULLT; - } - - - -// MyText(x): returns the document identifier of the text below node x, -// or NULLT if x is not a leaf node or the text is empty. Assumes Doc -// ids start from 0. -DocID XMLTree::MyText(treeNode x) -{ - TagType tag = Tag(x); - // seems faster than testing EBVector->access(x); - - if (tag == PCDATA_TAG_ID || tag == ATTRIBUTE_DATA_TAG_ID) - return (DocID) (EBVector->rank1(x)-1); - else - return (DocID) NULLT; - -} -// MyText(x): returns the document identifier of the text below node x, -// or NULLT if x is not a leaf node or the text is empty. Assumes Doc -// ids start from 0. -DocID XMLTree::MyTextUnsafe(treeNode x) -{ - return (DocID) (EBVector->rank1(x)-1); //-1 because document ids start from 0 - -} -// TextXMLId(d): returns the preorder of document with identifier d in the tree consisting of -// all tree nodes and all text nodes. Assumes that the tree root has preorder 1. -int XMLTree::TextXMLId(DocID d) - { - NULLT_IF(d == NULLT); - int s = EBVector->select1(d+1); - return bp_rank_open(Par, s) + d + 1; // +1 because root has preorder 1 - - } - -// NodeXMLId(x): returns the preorder of node x in the tree consisting -// of all tree nodes and all text nodes. Assumes that the tree root has -// preorder 0; -int XMLTree::NodeXMLId(treeNode x) - { - NULLT_IF(x == NULLT); - if (x == Root()) return 1; // root node has preorder 1 - return bp_rank_open(Par, x) + EBVector->rank1(x-1); - } - -// ParentNode(d): returns the parent node of document identifier d. -treeNode XMLTree::ParentNode(DocID d) - { - NULLT_IF (d == NULLT); - return (treeNode) EBVector->select1(d+1); - } - -// GetTagId: returns the tag identifier corresponding to a given tag name. -// Returns NULLT in case that the tag name does not exists. -TagType XMLTree::GetTagId(unsigned char *tagname) - { - - string s = (char *) tagname; - TagIdMapIT it = tIdMap->find(s); - return (TagType) ((it != tIdMap->end()) ? it->second : -1); - - } - - -// GetTagName(tagid): returns the tag name of a given tag identifier. -// Returns NULL in case that the tag identifier is not valid. -unsigned char *XMLTree::GetTagName(TagType tagid) - { - unsigned char *s; - if ( tagid < 0 || tagid >= TagName->size()) - return (unsigned char *) ""; - strcpy((char *)s, (*TagName)[tagid].c_str()); - - return (s == NULL ? (unsigned char*) "" : s); - } - - -const unsigned char *XMLTree::GetTagNameByRef(TagType tagid) - { - - unsigned char *s; - if ( tagid < 0 || tagid >= TagName->size()) - return (unsigned char *) ""; - - return (const unsigned char *) (*TagName)[tagid].c_str(); - - } - - - -TagType XMLTree::RegisterTag(unsigned char *tagname) - { - TagType id = XMLTree::GetTagId(tagname); - if (id == NULLT) { - string s = (char *) tagname; - REGISTER_TAG(TagName,tIdMap,s); - }; - - return id; - } - - -treeNode XMLTree::Closing(treeNode x) { - return bp_find_close(Par,x); -} -bool XMLTree::IsOpen(treeNode x) { return bp_inspect(Par,x); } - -//WARNING this uses directly the underlying implementation for plain text - - -void XMLTree::Print(int fd,treeNode x, bool no_text){ - - if (buffer == 0) { - buffer = new string(BUFFER_ALLOC, 0); - buffer->clear(); - print_stack = new std::vector(); - print_stack->reserve(256); - }; - - treeNode fin = bp_find_close(Par,x); - treeNode n = x; - TagType tag = Tag(n); - - range r = DocIds(x); - treeNode first_idx; - treeNode first_text = (tag == PCDATA_TAG_ID ? x : ParentNode(r.min-1)); - treeNode first_att = NULLT; - - if (first_att == NULLT) - first_idx = first_text; - else if (first_text == NULLT) - first_idx = first_att; - else - first_idx = min(first_att,first_text); - - uchar * current_text=NULL; - - if (first_idx != NULLT) - current_text = GetText(MyTextUnsafe(first_idx)); - uchar * orig_text = current_text; - size_t read = 0; - - while (n <= fin){ - if (bp_inspect(Par,n)){ - if (tag == PCDATA_TAG_ID) { - - if (no_text) - _dputs("<$/>", fd); - else { - read = _dprintf((const char*) current_text, fd); - current_text += (read + 1); - }; - n+=2; // skip closing $ - tag = Tag(n); - - } else { - - _dputc('<',fd); - _dput_str((*TagName)[tag], fd); - n++; - if (bp_inspect(Par,n)) { - print_stack->push_back(&((*TagName)[tag])); - tag = Tag(n); - if (tag == ATTRIBUTE_TAG_ID){ - n++; - if (no_text) _dputs("><@@>",fd); - - while (bp_inspect(Par,n)){ - if (no_text) { - _dputc('<', fd); - _dputs((const char*) &(GetTagNameByRef(Tag(n))[3]), fd); - _dputc('>', fd); - _dputs("<$@/>', fd); - n+= 4; - } else { - _dputc(' ', fd); - _dputs((const char*) &(GetTagNameByRef(Tag(n))[3]), fd); - n++; - _dputs("=\"", fd); - read = _dprintf((const char*) current_text, fd); - current_text += (read + 1); - _dputc('"', fd); - n+=3; - } - }; - if (no_text) _dputs("", fd); - else _dputc('>', fd); - n++; - tag=Tag(n); - - } else - _dputc('>', fd); - - } else {// tag - _dputs("/>", fd); - n++; - tag=Tag(n); - }; - }; - } else do { - _dputs("back()), fd); - _dputc('>', fd); - print_stack->pop_back(); - n++; - } while (!(bp_inspect(Par,n) || print_stack->empty())); - tag = Tag(n); - }; - _dputc('\n', fd); - if (orig_text && text_index_type != (TextCollectionBuilder::index_type_default)) - Text->DeleteText(orig_text); - //_flush(fd); -} diff --git a/XMLTree.h b/XMLTree.h deleted file mode 100644 index 74a71e6..0000000 --- a/XMLTree.h +++ /dev/null @@ -1,774 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2008 by Diego Arroyuelo * - * Interface for the in-memory XQuery/XPath engine * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU Lesser General Public License as published * - * by the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. * - * * - * You should have received a copy of the GNU Lesser General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - ******************************************************************************/ - -#ifndef XMLTREE_H_ -#define XMLTREE_H_ - - -#include -#include -#include -#include - -#undef W -#undef WW -#undef Wminusone - -#include -#include -#include -#include -#include -#include - -using SXSI::TextCollection; -using SXSI::TextCollectionBuilder; - - -// this constant is used to efficiently compute the child operation in the tree -#define OPTD 10 - -#define NULLT -1 - -#define PERM_SAMPLE 10 - - -typedef int treeNode; -typedef int TagType; -typedef int DocID; - -typedef struct { - int min; - int max; -} range; - -// Encoding of the XML Document : -// The following TAGs and IDs are fixed, "" is the tag of the root. -// a TextNode is represented by a leaf <<$>>> The DocId in the TextCollection -// of that leaf is kept in a bit sequence. -// a TextNode below an attribute is likewise represented by a leaf <<@$>><> -// An element ... the representation is: -// <<@>> <<@>a1> <<$@>>DocID(v1)>a1> ... > .... -// Hence the attributes (if any) are always below the first child of their element, -// as the children of a fake node <@>. - - -#define DOCUMENT_OPEN_TAG "" -#define DOCUMENT_TAG_ID 0 -#define ATTRIBUTE_OPEN_TAG "<@>" -#define ATTRIBUTE_TAG_ID 1 -#define PCDATA_OPEN_TAG "<$>" -#define PCDATA_TAG_ID 2 -#define ATTRIBUTE_DATA_OPEN_TAG "<@$>" -#define ATTRIBUTE_DATA_TAG_ID 3 -#define CLOSING_TAG "" -#define CLOSING_TAG_ID 4 -#define DOCUMENT_CLOSE_TAG "/" -#define ATTRIBUTE_CLOSE_TAG "/<@>" -#define PCDATA_CLOSE_TAG "/<$>" -#define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>" - - -typedef std::unordered_set TagIdSet; -typedef std::unordered_map TagIdMap; -typedef TagIdMap::const_iterator TagIdMapIT; - -#define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\ - (v)->push_back(t); } while (false) - -// returns NULLT if the test is true -#define NULLT_IF(x) do { if (x) return NULLT; } while (0) -#define IS_NIL(x) ((x) < 0) - -// Direct calls to sarray library - -#define BUFFER_ALLOC (8192 * 2) -#define BUFFER_SIZE (BUFFER_ALLOC / 2) - -// tag position -> tree node -static treeNode tagpos2node(int t) - { - return (treeNode) t; - } -// tree node -> tag position -static int node2tagpos(treeNode x) -{ - return (int)x; -} - - -class XMLTreeBuilder; - -class XMLTree { - - // Only the builder can access the constructor - friend class XMLTreeBuilder; - - private: - /** Balanced parentheses representation of the tree */ - bp *Par; - - /** Mapping from tag identifer to tag name */ - std::vector *TagName; - TagIdMap * tIdMap; - - /** Bit vector indicating with a 1 the positions of the non-empty texts. */ - static_bitsequence *EBVector; - - /** Tag sequence represented with a data structure for rank and select */ - static_sequence *Tags; - uint * tags_fix; - uint tags_blen, tags_len; - - /** The texts in the XML document */ - TextCollection *Text; - - // Allows to disable the TextCollection for benchmarkin purposes - bool disable_tc; - SXSI::TextCollectionBuilder::index_type_t text_index_type; - - std::string *buffer; - std::vector *print_stack; - - - void _real_flush(int fd, size_t size) { - if (size == 0) return; - size_t written; - while (1) { - written = write(fd, buffer->data(), size); - if ((written < 0) && (errno == EAGAIN || errno == EINTR)) - continue; - break; - }; - buffer->clear(); - - } - - void _flush(int fd){ - size_t size = buffer->size(); - if (size < BUFFER_SIZE) return; - _real_flush(fd, size); - } - - void _dput_str(std::string s, int fd){ - buffer->append(s); - _flush(fd); - } - - void _dputs(const char* s, int fd){ - buffer->append(s); - _flush(fd); - } - - void _dputc(const char c, int fd){ - buffer->push_back(c); - } - - size_t _dprintf(const char* s, int fd){ - if (s == NULL) return 0; - size_t i; - char c; - for (i = 0; (c = s[i]); i++) { - switch (c) { - case '"': - _dputs(""", fd); - break; - case '&': - _dputs("&", fd); - break; - case '\'': - _dputs("'", fd); - break; - case '<': - _dputs("<", fd); - break; - case '>': - _dputs(">", fd); - break; - default: - _dputc(c, fd); - - }; - }; - return i; - } - - void PrintNode(treeNode n, int fd); - /** Data structure constructors */ - XMLTree(){ buffer = 0;}; - - // non const pointer are freed by this method. - XMLTree( pb * const par, - uint npar, - std::vector * const TN, - TagIdMap * const tim, uint *empty_texts_bmp, - TagType *tags, - TextCollectionBuilder * const TCB, bool dis_tc, - TextCollectionBuilder::index_type_t _index_type ); - -public: - /** Data structure destructor */ - ~XMLTree(); - - /** root(): returns the tree root. */ - treeNode Root() { return 0; } - - /** Size() : Number of parenthesis */ - unsigned int Size(){ - return tags_len/2; - } - - - /** NumTags() : Number of distinct tags */ - unsigned int NumTags() { - return TagName->size(); - } - - int TagsBinaryLength(){ return tags_blen; }; - unsigned int TagStructLength(){ return uint_len(tags_blen,tags_len); }; - unsigned int * TagStruct() { return tags_fix; }; - - - /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of - * node x. */ - int SubtreeSize(treeNode x) { return bp_subtree_size(Par, x); } - - /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree - * of node x. */ - int SubtreeTags(treeNode x, TagType tag){ - //int s = x + 2*subtree_size(Par, x) - 1; - treeNode y = bp_find_close(Par, x); - - if (y - x < 10) { - int count = 0; - for(int i = x; i < y; i++) - count += (Tag(i) == tag); - return count; - } - else - return (Tags->rank(tag, y) - Tags->rank(tag, x)); - }; - - /** SubtreeElements(x) of element nodes in the subtree of x - */ - int SubtreeElements(treeNode x); - - /** IsLeaf(x): returns whether node x is leaf or not. In the succinct - * representation this is just a bit inspection. */ - - bool IsLeaf(treeNode x); - - /** IsAncestor(x,y): returns whether node x is ancestor of node y. */ - - bool IsAncestor(treeNode x, treeNode y); - - - /** IsRigthDescendant returns true if y is a descendant of x and y is - not a descendant of the first child of x */ - bool IsRightDescendant(treeNode x, treeNode y) { - if (x <= Root()) return false; - treeNode z = bp_parent_close(Par, x); - treeNode c = bp_find_close(Par, x); - return (y > c && y < z ); - } - - - /** IsChild(x,y): returns whether node x is parent of node y. */ - bool IsChild(treeNode x, treeNode y); - - /** IsFirstChild(x): returns whether node x is the first child of its parent. */ - /* SLOW - bool IsFirstChild(treeNode x) { - return ((x != NULLT)&&(x==Root() || bp_prev_sibling(Par,x) == (treeNode)-1)); - }; - */ - - bool IsFirstChild(treeNode x) { - return (x <= Root()) || (bp_inspect(Par,x-1) == OP); - } - - /** NumChildren(x): number of children of node x. Constant time with the - * data structure of Sadakane. */ - int NumChildren(treeNode x); - - /** ChildNumber(x): returns i if node x is the i-th children of its - * parent. */ - int ChildNumber(treeNode x); - - /** Depth(x): depth of node x, a simple binary rank on the parentheses - * sequence. */ - int Depth(treeNode x); - - /** Preorder(x): returns the preorder number of node x, just regarding tree - * nodes (and not texts). */ - int Preorder(treeNode x); - - /** Postorder(x): returns the postorder number of node x, just regarding - * tree nodes (and not texts). */ - int Postorder(treeNode x); - - - /** DocIds(x): returns the range (i.e., a pair of integers) of document - * identifiers that descend from node x. */ - range DocIds(treeNode x); - - /** Parent(x): returns the parent node of node x. */ - treeNode Parent(treeNode x) { - return (x == Root()) ? NULLT : bp_parent(Par, x); - }; - - treeNode BinaryParent(treeNode x){ - if (x <= Root()) - return NULLT; - else { - treeNode prev = x - 1; - return (bp_inspect(Par, prev) == OP) ? prev : bp_find_open(Par, prev); - }; - }; - - /* Assumes x is neither 0 nor -1 */ - - /** Child(x,i): returns the i-th child of node x, assuming it exists. */ - treeNode Child(treeNode x, int i); - - - - /** LastChild(x): returns the last child of node x. */ - treeNode LastChild(treeNode x) { - NULLT_IF(x == NULLT || bp_isleaf(Par,x)); - return bp_find_open(Par, bp_find_close(Par, x)-1); - } - - /** PrevSibling(x): returns the previous sibling of node x, assuming it - * exists. */ - - treeNode PrevSibling(treeNode x) - { - NULLT_IF(x==NULLT); - return bp_prev_sibling(Par, x); - } - - - /** TaggedChild(x,tag): returns the first child of node x tagged tag, or - * NULLT if there is none. Because of the balanced-parentheses representation - * of the tree, this operation is not supported efficiently, just iterating - * among the children of node x until finding the desired child. */ - - - treeNode SelectChild(treeNode x, TagIdSet * tags); - - /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or - * NULLT if there is none. */ - - treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags); - - - - - treeNode SelectDescendant(treeNode x, TagIdSet * tags) { - NULLT_IF (x == NULLT); - treeNode fc = x+1; - if (bp_inspect(Par, fc) == CP) return NULLT; - treeNode min = NULLT; - treeNode aux; - - TagIdSet::const_iterator tagit; - for (tagit = tags->begin(); tagit != tags->end(); ++tagit) { - aux = TaggedDescendant(x, (TagType) *tagit); - if (((unsigned int) aux) < ((unsigned int) min)) min = aux; - }; - return min; - } - - /** TaggedPrec(x,tag): returns the first node tagged tag with smaller - * preorder than x and not an ancestor of x. Returns NULLT if there - * is none. */ - treeNode TaggedPreceding(treeNode x, TagType tag); - - /** TaggedFoll(x,tag): returns the first node tagged tag with larger - * preorder than x and not in the subtree of x. Returns NULLT if there - * is none. */ - treeNode TaggedFollowing(treeNode x, TagType tag); - - - - treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor); - - // treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing); - - treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode ancestor_closing) - { - - NULLT_IF(x <= 0); - - treeNode close = bp_find_close(Par,x); - - - treeNode min = NULLT; - treeNode aux; - - - TagIdSet::const_iterator tagit; - for (tagit = tags->begin(); tagit != tags->end(); ++tagit) { - - aux = tagpos2node(Tags->select_next(*tagit, close)); - - if (((unsigned int) aux) < ((unsigned int) min)) min = aux; - }; - - - return (min < ancestor_closing) ? min : NULLT; - - } - - /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged - * tag. Return NULLT is there is none. */ - treeNode TaggedAncestor(treeNode x, TagType tag); - - /** PrevText(x): returns the document identifier of the text to the left of - * node x, or NULLT if x is the root node. */ - DocID PrevText(treeNode x); - - /** NextText(x): returns the document identifier of the text to the right of - * node x, or NULLT if x is the root node. */ - DocID NextText(treeNode x); - - /** MyText(x): returns the document identifier of the text below node x, or - * NULLT if x is not a leaf node. */ - DocID MyText(treeNode x); - DocID MyTextUnsafe(treeNode x); - - /** TextXMLId(d): returns the preorder of document with identifier d in the - * tree consisting of all tree nodes and all text nodes. */ - int TextXMLId(DocID d); - - /** NodeXMLId(x): returns the preorder of node x in the tree consisting of - * all tree nodes and all text nodes. */ - int NodeXMLId(treeNode x); - - /** ParentNode(d): returns the parent node of document identifier d. */ - treeNode ParentNode(DocID d); - - treeNode PrevNode(DocID d); - - /** GetTagId(tagname): returns the tag identifier corresponding to a given - * tag name. Returns NULLT in case that the tag name does not exists. */ - TagType GetTagId(unsigned char *tagname); - - /** GetTagName(tagid): returns the tag name of a given tag identifier. - * Returns NULL in case that the tag identifier is not valid.*/ - unsigned char *GetTagName(TagType tagid); - - /** GetTagName(tagid): returns the tag name of a given tag identifier. - * The result is just a reference and should not be freed by the caller. - */ - const unsigned char *GetTagNameByRef(TagType tagid); - - /** RegisterTag adds a new tag to the tag collection this is needed - * if the query contains a tag which is not in the document, we need - * to give this new tag a fresh id and store it somewhere. A logical - * choice is here. - * We might want to use a hashtable instead of an array though. - */ - TagType RegisterTag(unsigned char *tagname); - - bool EmptyText(DocID i) { - return Text->EmptyText(i); - } - - /** Prefix(s): search for texts prefixed by string s. */ - TextCollection::document_result Prefix(uchar const *s) { - return Text->Prefix(s); - } - - /** Suffix(s): search for texts having string s as a suffix. */ - TextCollection::document_result Suffix(uchar const *s) { - return Text->Suffix(s); - } - - /** Equal(s): search for texts equal to string s. */ - TextCollection::document_result Equals(uchar const *s) { - return Text->Equal(s); - } - - /** Contains(s): search for texts containing string s. */ - TextCollection::document_result Contains(uchar const *s) { - return Text->Contains(s); - } - - /** LessThan(s): returns document identifiers for the texts that - * are lexicographically smaller than string s. */ - TextCollection::document_result LessThan(uchar const *s) { - return Text->LessThan(s); - } - - /** IsPrefix(x): returns true if there is a text prefixed by string s. */ - bool IsPrefix(uchar const *s) { - return Text->IsPrefix(s); - } - - /** IsSuffix(s): returns true if there is a text having string s as a - * suffix.*/ - bool IsSuffix(uchar const *s) { - return Text->IsSuffix(s); - } - - /** IsEqual(s): returns true if there is a text that equals given - * string s. */ - bool IsEqual(uchar const *s) { - return Text->IsEqual(s); - } - - /** IsContains(s): returns true if there is a text containing string s. */ - bool IsContains(uchar const *s) { - return Text->IsContains(s); - } - - /** IsLessThan(s): returns true if there is at least a text that is - * lexicographically smaller than string s. */ - bool IsLessThan(uchar const *s) { - return Text->IsLessThan(s); - } - - /** Count(s): Global counting */ - unsigned Count(uchar const *s) { - return Text->Count(s); - } - - /** CountPrefix(s): counting version of Prefix(s). */ - unsigned CountPrefix(uchar const *s) { - return Text->CountPrefix(s); - } - - /** CountSuffix(s): counting version of Suffix(s). */ - unsigned CountSuffix(uchar const *s) { - return Text->CountSuffix(s); - } - - /** CountEqual(s): counting version of Equal(s). */ - unsigned CountEqual(uchar const *s) { - return Text->CountEqual(s); - } - - /** CountContains(s): counting version of Contains(s). */ - unsigned CountContains(uchar const *s) { - return Text->CountContains(s); - } - - /** CountLessThan(s): counting version of LessThan(s). */ - unsigned CountLessThan(uchar const *s) { - return Text->CountLessThan(s); - } - - /** GetText(d): returns the text corresponding to document with - * id d. */ - uchar* GetText(DocID d) { - - uchar * s = Text->GetText(d); - return (s[0] == 1 ? (s+1) : s); - } - - /** GetText(i, j): returns the texts corresponding to documents with - * ids i, i+1, ..., j. Texts are separated by '\0' character. */ - // uchar* GetText(DocID i, DocID j) { - // uchar * s = Text->GetText(i, j); - // return (s[0] == 1 ? (uchar*)"" : s); - //} - - TextCollection *getTextCollection() { - return Text; - } - - /** Save: saves XML tree data structure to file. */ - void Save(int fd, char* name ); - - /** Load: loads XML tree data structure from file. sample_rate_text - * indicates the sample rate for the text search data structure. */ - static XMLTree *Load(int fd, bool load_tc, int sample_factor, char * name); - - void insertTag(TagType tag, uint position); - - void print_stats(); - - - /** Parenthesis functions */ - treeNode Closing(treeNode x); - - bool IsOpen(treeNode x); - - - /** Print procedure */ - void Print(int fd,treeNode x, bool no_text); - void Print(int fd,treeNode x) { Print(fd,x,false); } - void Flush(int fd){ if (buffer) _real_flush(fd, buffer->size()); } - - // The following are inlined here for speed - /** Tag(x): returns the tag identifier of node x. */ - - inline TagType Tag(treeNode x) const throw () { - if (tags_blen == 8) - return (TagType) (((uchar*)tags_fix)[(int) x]); - else - return get_field(tags_fix, tags_blen, x); - /* - { - size_t idxlen = x * tags_blen; - size_t j = idxlen % W; - size_t i = idxlen / W; - size_t offset = W - tags_blen; - size_t offset2 = offset - j; - size_t w = tags_fix[i]; - return (offset2 >= 0) - ? ( w << offset2 ) >> offset - : ( w >> j) | (tags_fix[i+1] << (W+offset2)) >> offset; - }; */ - - } - - /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf - */ - treeNode FirstChild(treeNode x) { - NULLT_IF(x==NULLT); - return bp_first_child(Par, x); - }; - - - /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT - * if none. - */ - treeNode FirstElement(treeNode node){ - { - NULLT_IF(node == NULLT); - treeNode x = bp_first_child(Par, node); - NULLT_IF(x == NULLT); - switch (Tag(x)){ - - case ATTRIBUTE_TAG_ID: - x = bp_next_sibling(Par,x); - if (x == NULLT || Tag(x) != PCDATA_TAG_ID) return x; - - case PCDATA_TAG_ID: - x = x+2; - return (bp_inspect(Par,x)==OP)? x : NULLT; - - default: - return x; - } - } - }; - - /** NextSibling(x): returns the next sibling of node x, or NULLT if none - * exists. */ - - treeNode NextSibling(treeNode x) { - NULLT_IF (x <= 0); - return bp_next_sibling(Par, x); - }; - - /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT - * if none. - */ - treeNode NextElement(treeNode x) - { - NULLT_IF(x <= 0); - x = bp_next_sibling(Par, x); - NULLT_IF(x == NULLT); - if (Tag(x) == PCDATA_TAG_ID){ - int y = x+2; - return (bp_inspect(Par, y) == OP) ? y : NULLT; - } - else return x; - }; - /** TaggedDesc(x,tag): returns the first node tagged tag with larger - * preorder than x and within the subtree of x. Returns NULT if there - * is none. */ - inline treeNode TaggedNext(treeNode x, TagType tag) - { - return tagpos2node(Tags->select_next(tag,node2tagpos(x))); - } - inline treeNode TaggedDescendant(treeNode x, TagType tag) - { - - int s = (int) Tags->select_next(tag,node2tagpos(x)); - NULLT_IF (s == -1); - - treeNode y = tagpos2node(s); // transforms the tag position into a node position - - return (bp_is_ancestor(Par,x,y) ? y : NULLT); - }; - - inline treeNode TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor) - { - treeNode close = bp_find_close(Par, x); - treeNode s = tagpos2node(Tags->select_next(tag, close)); - - if (ancestor == Root() || s == NULLT || s < bp_find_close(Par,ancestor)) return s; - else return NULLT; - }; - - inline treeNode TaggedFollowingBefore(treeNode x, TagType tag, treeNode ancestor_closing) - { - treeNode close = bp_find_close(Par, x); - treeNode s = tagpos2node(Tags->select_next(tag, close)); - - if (ancestor_closing == Root() || s == NULLT || s < ancestor_closing) return s; - else return NULLT; - }; - - inline treeNode NextNodeBefore(treeNode x, treeNode ancestor_closing) - { - treeNode close = bp_find_close(Par, x); - int rank = bp_rank_open(Par, close); - treeNode y = bp_select_open(Par, rank+1); - return (y < ancestor_closing) ? y : NULLT; - }; - -// TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or NULLT if there is none. -treeNode TaggedFollowingSibling(treeNode x, TagType tag) -{ - NULLT_IF(x==NULLT); - treeNode sibling = x; - TagType ctag; - while ((sibling = bp_next_sibling(Par, sibling)) != NULLT) { - ctag = Tag(sibling); - if (ctag == tag) return sibling; - } - return NULLT; -}; - -treeNode TaggedChild(treeNode x, TagType tag) -{ - - NULLT_IF(x==NULLT || bp_isleaf(Par,x)); - treeNode child; - child = bp_first_child(Par, x); - -if (Tag(child) == tag) - return child; - else - return TaggedFollowingSibling(child, tag); -}; - - -}; - - -#endif - diff --git a/XMLTreeBuilder.cpp b/XMLTreeBuilder.cpp deleted file mode 100644 index d9b0832..0000000 --- a/XMLTreeBuilder.cpp +++ /dev/null @@ -1,190 +0,0 @@ -#include "common.h" -#include "XMLTreeBuilder.h" -#include "timings.h" -using std::string; - -XMLTreeBuilder::~XMLTreeBuilder(){ - //free(par_aux); - free(tags_aux); - //delete other stuff. - -} - -// OpenDocument(empty_texts): it starts the construction of the data structure for -// the XML document. Parameter empty_texts indicates whether we index empty texts -// in document or not. Returns a non-zero value upon success, NULLT in case of error. -int XMLTreeBuilder::OpenDocument(bool empty_texts, - int sample_rate_text, - bool dtc, - TextCollectionBuilder::index_type_t index_type) - { - npar = 0; - parArraySize = 1; - disable_tc = dtc; - text_index_type = index_type; - STARTTIMER(); - - par_aux = (pb *)umalloc(sizeof(pb)*parArraySize); - - tags_aux = (TagType *) umalloc(sizeof(TagType)); - - TagName = new vector(); - tIdMap = new std::unordered_map(); - - REGISTER_TAG(TagName,tIdMap,DOCUMENT_OPEN_TAG); - REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_OPEN_TAG); - REGISTER_TAG(TagName,tIdMap,PCDATA_OPEN_TAG); - REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_DATA_OPEN_TAG); - REGISTER_TAG(TagName,tIdMap,CLOSING_TAG); - REGISTER_TAG(TagName,tIdMap,DOCUMENT_CLOSE_TAG); - REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_CLOSE_TAG); - REGISTER_TAG(TagName,tIdMap,PCDATA_CLOSE_TAG); - REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_DATA_CLOSE_TAG); - - - if (disable_tc) - TextBuilder = 0; - else - TextBuilder = TextCollectionBuilder::create((unsigned)sample_rate_text, index_type); - - Text = 0; - empty_texts_aux = (unsigned int *)ucalloc(sizeof(unsigned int),1); - eta_size = sizeof(unsigned int); - return 1; // indicates success in the initialization of the data structure - } - -// CloseDocument(): it finishes the construction of the data structure for the XML -// document. Tree and tags are represented in the final form, dynamic data -// structures are made static, and the flag "finished" is set to true. After that, -// the data structure can be queried. -XMLTree *XMLTreeBuilder::CloseDocument() - { - //closing parenthesis for the tree root - //par_aux = (pb *)urealloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb)))); - //setbit(par_aux, npar, CP); - //npar++; - - // makes the text collection static - STOPTIMER(Parsing); - PRINTTIME("Parsing XML Document", Parsing); - - XMLTree *T = new XMLTree(par_aux, - npar, - TagName, - tIdMap, - empty_texts_aux, // freed by the constructor - tags_aux, // freed by the constructor - TextBuilder, // freed by the constructor - disable_tc, - text_index_type); - tags_aux = 0; - empty_texts_aux = 0; - return T; - } - - -// NewOpenTag(tagname): indicates the event of finding a new opening tag in the document. -// Tag name is given. Returns a non-zero value upon success, and returns NULLT -// in case of failing when trying to insert the new tag. -int XMLTreeBuilder::NewOpenTag(string tagname) - { - int i; - // inserts a new opening parentheses in the bit sequence - if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis - - // If array is already 1GB, be gentler when resizing: - if (sizeof(pb)*parArraySize >= 1024*1024*1024) - parArraySize += (128*1024*1024); - else - parArraySize *= 2; - par_aux = (pb *) urealloc(par_aux, sizeof(pb)*parArraySize); - }; - - bp_setbit(par_aux,npar,OP); // marks a new opening parenthesis - - TagIdMapIT tag_id = tIdMap->find(tagname); - - if (tag_id == tIdMap->end()){ - REGISTER_TAG(TagName,tIdMap,tagname); - i = TagName->size() - 1; - } - else - i = tag_id->second; - - if (tagname.compare(PCDATA_OPEN_TAG) == 0 || - tagname.compare(ATTRIBUTE_DATA_OPEN_TAG) == 0){ - }; - - - tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1)); - - tags_aux[npar] = i; // inserts the new tag id within the preorder sequence of tags - - npar++; - - return 1; // success - } - - -// NewClosingTag(tagname): indicates the event of finding a new closing tag in the document. -// Tag name is given. Returns a non-zero value upon success, and returns NULLT -// in case of failing when trying to insert the new tag. -int XMLTreeBuilder::NewClosingTag(string tagname) - { - int i; - // inserts a new closing parentheses in the bit sequence - if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis - // If array is already 1GB, be gentler when resizing: - if (sizeof(pb)*parArraySize >= 1024*1024*1024) - parArraySize += (128*1024*1024); - else - parArraySize *= 2; - par_aux = (pb *)urealloc(par_aux, sizeof(pb)*parArraySize); - }; - - bp_setbit(par_aux,npar,CP); // marks a new closing parenthesis - - //tagname.insert(0,"/"); - - //TagIdMapIT tag_id = tIdMap->find(tagname); - - // if (tag_id == tIdMap->end()){ - // REGISTER_TAG(TagName,tIdMap,tagname); - // i = TagName->size() - 1; - // } - // else - // i = tag_id->second; - - tags_aux = (TagType *)urealloc(tags_aux, sizeof(TagType)*(npar + 1)); - - tags_aux[npar] = CLOSING_TAG_ID; // inserts the new tag id within the preorder sequence of tags - - npar++; - - return 1; // success - } - - -// NewText(s): indicates the event of finding a new (non-empty) text s in the document. -// The new text is inserted within the text collection. Returns a non-zero value upon -// success, NULLT in case of error. -int XMLTreeBuilder::NewText(string text) - { - if (!disable_tc) { - if (text.empty()) - TextBuilder->InsertText((uchar *)"\001"); - else - TextBuilder->InsertText((uchar *) text.c_str()); - }; - - int n_eta_size = sizeof(uint)*(1+(npar-1)/(8*sizeof(uint))); - //see basics.h, recalloc resizes and sets the new area to 0. - - empty_texts_aux = (uint *)urecalloc(empty_texts_aux,eta_size,n_eta_size); - eta_size = n_eta_size; - bitset(empty_texts_aux, npar-1); // marks the non-empty text with a 1 in the bit vector - - return 1; // success - } - - diff --git a/XMLTreeBuilder.h b/XMLTreeBuilder.h deleted file mode 100644 index 5253dd0..0000000 --- a/XMLTreeBuilder.h +++ /dev/null @@ -1,116 +0,0 @@ - -/****************************************************************************** - * Copyright (C) 2009 by Diego Arroyuelo * - * Builder class for the in-memory XQuery/XPath engine * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU Lesser General Public License as published * - * by the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. * - * * - * You should have received a copy of the GNU Lesser General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - ******************************************************************************/ - -#ifndef XMLTREEBUILDER_H_ -#define XMLTREEBUILDER_H_ - -#include -#undef W -#undef WW -#undef Wminusone - - -#include "XMLTree.h" - -using SXSI::TextCollection; -using SXSI::TextCollectionBuilder; - -#define NULLT -1 - - // sets bit p in e -#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) - // cleans bit p in e -#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) - - - -class XMLTreeBuilder { - - /** Array containing the balanced parentheses sequence */ - pb *par_aux; - int parArraySize; - int npar; - - /** Mapping from tag identifer to tag name */ - std::vector *TagName; - TagIdMap * tIdMap; - /** Array containing the sequence of tags */ - TagType *tags_aux; - - /** The texts in the XML document */ - TextCollectionBuilder *TextBuilder; - TextCollection *Text; - - /** The texts in the XML document (cached for faster display) */ - - std::vector *CachedText; - - unsigned int *empty_texts_aux; - int eta_size; - // Allows to disable the TextCollection for benchmarkin purposes - bool disable_tc; - TextCollectionBuilder::index_type_t text_index_type; -public: - - XMLTreeBuilder() {;}; - - ~XMLTreeBuilder(); - - /** OpenDocument(sample_rate_text,dtc): initilizes the construction - * of the data structure for the XML document. Parameter - * sample_rate_text indicates the sampling rate for the text searching data - * structures (small values get faster searching but a bigger space - * requirement). dtc disable the use of the TextCollection - * (i.e. everything is considered an empty text *) - * Returns a non-zero value upon success, NULLT in case of - * error. */ - int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc, - TextCollectionBuilder::index_type_t index_type); - - /** CloseDocument(): finishes the construction of the data structure for - * the XML document. Tree and tags are represented in the final form, - * dynamic data structures are made static, returning the resulting - * XMLTree. After that, the XMLTree data structure can be queried. */ - XMLTree *CloseDocument(); - - /** NewOpenTag(tagname): indicates the event of finding a new opening tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewOpenTag(std::string tagname); - - /** NewClosingTag(tagname): indicates the event of finding a new closing tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewClosingTag(std::string tagname); - - /** NewText(s): indicates the event of finding a new text s in - * the document. The new text is inserted within the text collection. - * Returns a non-zero value upon success, NULLT in case of error. - * If the string is empty, which is legal in attributes, then - * the string the sequence '\0x01\0x00' is inserted in the TextCollection - * It is ok to do so since a non printable character cannot occur in an XML document - */ - int NewText(std::string text); - - -}; -#endif - diff --git a/bit-vector.cpp b/bit-vector.cpp new file mode 100644 index 0000000..d69c177 --- /dev/null +++ b/bit-vector.cpp @@ -0,0 +1,223 @@ +#include +#include "bit-vector.hpp" + +static inline uint8_t pow2_mult8(uint8_t x){ + return (~((x & (x-1))==0) + 1) & (x >> 3); + +} + +static inline size_t uint32bits(size_t bits){ + return ((bits-1) / W) + 1; +} + + + +void array_copy(const uint32_t * src, uint32_t * dst, size_t len) +{ + while (len--) *dst++ = *src++; +} + +size_t bit_vector::allocforsize(size_t bits) +{ + size_t i = _alloc; + while (uint32bits(bits) >= i) + i = 2*i+1; + return i; +} + +void bit_vector::grow(size_t alloc) +{ + if (alloc <= _alloc) + return; + + uint32_t* vector = new uint32_t[ alloc ]; + + array_copy(_vector,vector,_alloc); + + if (!_leaked) delete [] _vector; + _vector = vector; + _alloc = alloc; + _leaked = false; + return; +} + + +bit_vector::bit_vector (size_t len, size_t blen, uint32_t init) +{ + _size = len*blen; + _leaked = false; + if (_size == 0) { + _alloc = 0; + _vector = 0; + return; + }; + _alloc = uint32bits(_size); + _vector = new uint32_t[_alloc]; + + for(size_t i = 0; i < len; ++i) set_field(i,blen, init); + return; +} + +bit_vector::bit_vector(int fd) +{ + ssize_t toread = sizeof(size_t); + auto msg = std::runtime_error("I/O error in bit_vector::read"); + _leaked = false; + if (toread != read(fd, &_size, toread)) + throw msg; + + _alloc = uint32bits(_size); + _vector = new uint32_t[_alloc]; + + toread = sizeof(uint32_t); + for(size_t i = 0; i < _alloc; ++i) + if (toread != read(fd, &_vector[i], toread)){ + delete [] _vector; + _vector = 0; + throw msg; + }; + return; + +} + +bit_vector::bit_vector (const bit_vector& src) +{ + _leaked = false; + _size = src._size; + _vector = new uint32_t[ _alloc = src._alloc ]; + array_copy(src._vector,_vector,_alloc); +} + +bit_vector & bit_vector::operator=(const bit_vector& src) +{ + if (this != &src) { + _size = src._size; + delete [] _vector; + _vector = new uint32_t[ _alloc = src._alloc ]; + array_copy(src._vector,_vector,_alloc); + _leaked = false; + }; + return *this; +} + +bit_vector::~bit_vector() +{ + if (!_leaked) delete [] _vector; + _size = 0; + _vector = 0; + _alloc = 0; +} + + + +void bit_vector::save(int fd) const +{ + + ssize_t towrite = sizeof(size_t); + std::string msg = "I/O error in bit_vector::write"; + + if (towrite != write(fd,&_size,towrite)) + throw msg; + + towrite = sizeof(uint32_t); + + for (size_t i = 0; i < _alloc; ++i) + if (towrite != write(fd, &_vector[i], towrite)) + throw msg; + + return; + +} + + +void bit_vector::pack() +{ + + size_t nalloc = uint32bits(_size); + uint32_t * nvector = new uint32_t[nalloc]; + array_copy(_vector,nvector,nalloc); + if (!_leaked) delete [] _vector; + _alloc = nalloc; + _vector = nvector; + _leaked = false; + +} + +//sets the idxth bit to b +void bit_vector::set(size_t idx, bool b) +{ + size_t i = idx / sizeof(uint32_t); + size_t j = idx % sizeof(uint32_t); + grow(allocforsize(idx+1)); + if (idx >= _size) + _size = idx+1; + uint32_t mask = 1 << j; + if (b) + _vector[i] |= mask; + else + _vector[i] &= ~mask; +} + +void bit_vector::push_back(bool b) +{ + set(_size, b); +} + +bool bit_vector::pop_back() +{ + _size--; + return unsafe_get(_size); +} + +bool bit_vector::flip(size_t idx) +{ + bool b = get(idx); + set(idx, !b); + return b; +} + +void bit_vector::set_field(size_t index, size_t len, uint32_t v) +{ + size_t i=index*len/W, j=index*len-i*W; + + grow(allocforsize((index+2)*len)); + if (j+len > W){ + if ((index+1)*len >= _size) + _size = (index+2)*len; + } else if (index*len >= _size) + _size = (index+1)*len; + + uint32_t mask = ((j+len) < W ? ~0U << (j+len) : 0) + | ((W-j) < W ? ~0U >> (W-j) : 0); + _vector[i] = (_vector[i] & mask) | v << j; + + if (j+len>W) { + mask = ((~0U) << (len+j-W)); + _vector[i+1] = (_vector[i+1] & mask)| v >> (W-j); + } +} + +bool bit_vector::get(size_t index) const +{ + if (index >= _size) + throw (std::out_of_range("bit_vector::get")); + + return unsafe_get(index); +} + +uint32_t bit_vector::get_field(size_t index, size_t len) const +{ + if ((index+1)*len > _size) + throw (std::out_of_range("bit_vector::get_field: index out of bound")); + if (len > W) + throw (std::invalid_argument("bit_vector::get_field: len > 32" )); + + return unsafe_get_field(index, len); +} + +uint32_t * bit_vector::get_vector() const +{ + uint32_t * vector = new uint32_t[_alloc]; + array_copy(_vector,vector,_alloc); + return vector; +} diff --git a/bit-vector.hpp b/bit-vector.hpp new file mode 100644 index 0000000..cd6382e --- /dev/null +++ b/bit-vector.hpp @@ -0,0 +1,148 @@ +#ifndef __BIT_VECTOR_HPP__ +#define __BIT_VECTOR_HPP__ + +#include +#include +#include + + +#ifndef W +#define W (sizeof(uint32_t)*CHAR_BIT) +#define WW (W*2) +#endif + + +/** Simple class implementing a vector of bits as an uint32_t array. + */ + +class bit_vector { + + public: + /** Builds a bit vector capable of holding len elements of blen bits each, + initializes the vector with the copies len times the blen lowermost bits + of init. + Throws an exception if blen == 0 or blen > 32. + */ + bit_vector (size_t len=0, size_t blen=1, uint32_t init=0); + + /** Load the bit_vector from a file descriptor */ + bit_vector (int fd); + + /** copy constructor */ + bit_vector (const bit_vector& src); + + bit_vector & operator=(const bit_vector& src); + + /** Destructor + */ + ~bit_vector (); + + /** Writes the content of the the bit_vector to the file descriptor fd, in + binary format. + */ + void save(int fd) const; + + /** Returns the size of the bit_vector in bits */ + size_t size() const { return _size; }; + + /** Returns the maximum number of bits that can be added without + reallocation + */ + size_t max_size() const { return _alloc*W; }; + + /** Resizes the vector so that it can store size() bits and such that at most + sizeof(uint32_t) - 1 bits are wasted.*/ + void pack(); + + /** Sets the idxth bit to b. The bit_vector is resized if necessary */ + void set(size_t idx, bool b); + + /** Adds bit b at the end of the bit_vector. The bit_vector is resized if + necessary */ + void push_back(bool b); + + /** Returns the last bit of the bit vector, which is removed */ + bool pop_back(); + + /** Sets bits [idx*len - (idx+1)*len] of the bit vector to the len + lowermost bits of v. + Throws an exception if len == 0 or len > W + The bit_vector is resized if necessary. + */ + void set_field(size_t idx, size_t len, uint32_t v); + + /** Flips the idxth bit of the bit_vector. Returns the value of the bit + before flipping */ + bool flip(size_t idx); + + /** Returns the idxth bit. Performs bound checking */ + bool get(size_t idx) const; + + /** Returns the bits idx*len to (idx+1)*len, packed into an uint32_t. + Throws an exception if len == 0 or len > sizeof(uint32_t). + Performs bound checking. + */ + uint32_t get_field(size_t idx, size_t len) const; + + /** returns a copy of the internal vector + */ + uint32_t* get_vector() const; + /** returns a pointer to the internal vector + */ + uint32_t* get_vector_ptr() { _leaked = true; return _vector; } + + /** Efficient unsafe get/get_field */ + bool unsafe_get(size_t idx) const; + uint32_t unsafe_get_field(size_t idx, size_t len) const; + + +private: + // true if get_vector_ptr() was called + bool _leaked; + // Size in bits + size_t _size; + // Allocated space in sizeof(uint32_t) + size_t _alloc; + uint32_t * _vector; + /** Increase the size of _vector + */ + + size_t allocforsize(size_t bits); + void grow(size_t alloc); + + +}; + +/** Unsafe version of get. Does not perform bound checking + */ +inline bool bit_vector::unsafe_get(size_t idx) const +{ + return (_vector[idx / W] >> (idx % W)) & true; +}; + +/** Unsafe version of get_field. Does not perform bound checking. + Assumes 1<= len <= sizeof(uint32_t). +*/ +inline uint32_t bit_vector::unsafe_get_field(size_t idx, size_t len) const +{ + switch (len){ + case 8: + return ((uint8_t*)_vector)[idx]; + case 16: + return ((uint16_t*)_vector)[idx]; + case 32: + return _vector[idx]; + case 1: + return unsafe_get(idx); + default: + size_t idxlen = idx*len; + size_t j = idxlen % W; + size_t i = idxlen / W; + size_t offset = W - len; + return (offset >= j) + ? (_vector[i] << (offset-j)) >> offset + : (_vector[i] >> j) | (_vector [i+1] << (W+offset-j)) >> offset; + }; +}; + +#endif diff --git a/common.h b/common.h deleted file mode 100644 index da8fa61..0000000 --- a/common.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef COMMON_H_ -#define COMMON_H_ - - -#include -#include -//#include // for memset -#include -#include -#include -#define B_ERROR(msg) do { fprintf(stderr,"%s\n", msg); exit(1); } while (0) - -inline void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream) - { - size_t res; - res = fread(ptr,size,nmemb,stream); - if (res < nmemb) - B_ERROR ("ufread I/O error" ); - return; - } - -inline void ufwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) - { - size_t res; - res = fwrite(ptr,size,nmemb,stream); - if (res < nmemb) - B_ERROR("ufwrite I/O error"); - return; - } - -inline void *urealloc(void *ptr, size_t size) - { - void *dest = realloc(ptr,size); - //don't fail if we requested size 0 - if (dest == NULL && size > 0 ) - B_ERROR("urealoc error"); - return dest; - } -// realloc and set to 0 -inline void *urecalloc(void *ptr, size_t o_size, size_t n_size) - { - if (o_size == n_size) - return ptr; - - void *dest = realloc(ptr,n_size); - //don't fail if we requested size 0 - if (dest == NULL && n_size > 0 ) - B_ERROR("urecalloc error"); - // Initialize the new area with 0 - void * n_area_start = &(((char*) dest)[o_size]); - // memset(n_area_start,0, n_size - o_size); - for(size_t i = 0; i < n_size - o_size;i++) - ((char *) n_area_start)[i] = 0; - return dest; - } - -inline void *ucalloc(size_t nmemb, size_t size) - { - void * dest = calloc(nmemb,size); - //don't fail if we requested size 0 - if (dest == NULL && nmemb > 0 && size > 0 ) - B_ERROR("ucalloc error"); - return dest; - } - -inline void *umalloc(size_t size) - { - void * dest = malloc(size); - if (dest == NULL && size > 0) - B_ERROR("umaloc error"); - return dest; - } - -#endif diff --git a/timings.h b/timings.h deleted file mode 100644 index d4c4935..0000000 --- a/timings.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef TIMINGS_H_ -#define TIMINGS_H_ -//Timing Macro's -#include -#include -#include -#include -#include - -static double tParsing = 0; -static unsigned int cParsing = 0; - -static double tLoading = 0; -static unsigned int cLoading = 0; -static double tBuilding = 0; -static unsigned int cBuilding = 0; - -static struct timeval tmpv1; -static struct timeval tmpv2; -static std::string mem1; -static std::string mem2; - -static void read_procmem(std::string& memstr) { - std::string buf; - pid_t pid = getpid(); - std::stringstream path; - path << "/proc/" << pid << "/status"; - std::ifstream infile; - infile.open (path.str().c_str(), std::ifstream::in); - while (infile.good()){ - getline(infile,buf); - if (infile.eof()) { - memstr = "Could not read memory"; - return; - }; - int idx = buf.find("VmRSS"); - if (idx == 0){ - memstr = buf; - return; - }; - }; - memstr = "Could not read memory"; - return; - -} - -#define STARTTIMER() do { \ - read_procmem(mem1); \ - gettimeofday(&tmpv1,NULL); \ - } while (0) \ - -#define STOPTIMER(x) do { \ - gettimeofday(&tmpv2,NULL); \ - read_procmem(mem2); \ - (t##x) = ((tmpv2.tv_sec - tmpv1.tv_sec) * 1000000.0 + \ - (tmpv2.tv_usec - tmpv1.tv_usec))/1000.0; \ - (c##x)= (c##x)+1; \ - } while (0) - -#define PRINTTIME(s,x) do { \ - std::cerr << (s) << " : " << (t##x) << "ms" << std::endl; \ - std::cerr << "Mem use before: " << mem1 << std::endl; \ - std::cerr << "Mem use after: " << mem2 << std::endl; \ - } while (0) \ - - - - - - - - -#endif diff --git a/xml-tree-builder.cpp b/xml-tree-builder.cpp new file mode 100644 index 0000000..0b7effc --- /dev/null +++ b/xml-tree-builder.cpp @@ -0,0 +1,140 @@ +#include "xml-tree-builder.hpp" +#include +#include + +using namespace SXSI; + +xml_tree_builder::xml_tree_builder() +{ + opened = false; + par = 0; + tags = 0; + tag_ids = 0; + text_positions = 0; + disable_text_index = false; + tc_builder = 0; +} + +xml_tree_builder::~xml_tree_builder() +{ + if (opened) reset(); +} + +void xml_tree_builder::reset() +{ + delete par; + delete tags; + delete tag_ids; + if (!disable_text_index){ + delete tc_builder; + delete text_positions; + }; +} + + +int32_t xml_tree_builder::register_tag(std::string tag, int32_t id) +{ + auto found = tag_ids->find(tag); + + if (found == tag_ids->end()) { + if (id != current_tag) + throw std::runtime_error("xml-tree-builder: inconsistant tag id"); + + tag_ids->insert(std::make_pair(tag, id)); + current_tag++; + return id; + } else + return found->second; + +} + +int32_t xml_tree_builder::register_tag(std::string tag) +{ + return register_tag(tag, current_tag); +} + +void +xml_tree_builder::open_document(bool disable_text_index, + unsigned int sample_rate, + TextCollectionBuilder::index_type_t idx_type) +{ + if (opened) reset(); + opened = true; + par = new bit_vector(); + tags = new std::vector(); + current_tag = 0; + tag_ids = new std::unordered_map(); + + register_tag(xml_tree::DOCUMENT_OPEN_TAG, xml_tree::DOCUMENT_OPEN_TAG_ID); + register_tag(xml_tree::DOCUMENT_OPEN_TAG, xml_tree::DOCUMENT_OPEN_TAG_ID); + + register_tag(xml_tree::ATTRIBUTE_OPEN_TAG, xml_tree::ATTRIBUTE_OPEN_TAG_ID); + register_tag(xml_tree::ATTRIBUTE_OPEN_TAG, xml_tree::ATTRIBUTE_OPEN_TAG_ID); + + register_tag(xml_tree::PCDATA_OPEN_TAG, xml_tree::PCDATA_OPEN_TAG_ID); + register_tag(xml_tree::PCDATA_OPEN_TAG, xml_tree::PCDATA_OPEN_TAG_ID); + + register_tag(xml_tree::ATTRIBUTE_DATA_OPEN_TAG, + xml_tree::ATTRIBUTE_DATA_OPEN_TAG_ID); + + register_tag(xml_tree::ATTRIBUTE_DATA_OPEN_TAG, + xml_tree::ATTRIBUTE_DATA_OPEN_TAG_ID); + + + this->disable_text_index = disable_text_index; + if (!disable_text_index){ + tc_builder = TextCollectionBuilder::create(sample_rate, idx_type); + text_positions = new bit_vector(); + text_index_type = idx_type; + }; +} + +void xml_tree_builder::open_tag(std::string tag) +{ + int32_t id = register_tag(tag); + tags->push_back(id); + par->push_back(true); + if (!disable_text_index) text_positions->push_back(false); +} + +void xml_tree_builder::close_tag(std::string) +{ + xml_tree::tag_t t = xml_tree::CLOSE_TAG_ID; + tags->push_back(t); + par->push_back(false); + if (!disable_text_index) text_positions->push_back(false); +} + +void xml_tree_builder::text(std::string s) +{ + if (!disable_text_index){ + if (s.empty()) s = "\001"; + tc_builder->InsertText((const unsigned char *) s.c_str()); + text_positions->set(text_positions->size() - 1, true); + } +} + +xml_tree *xml_tree_builder::close_document() +{ + if (opened) { + opened = false; + auto tags_ = tags; + auto tag_ids_ = tag_ids; + auto par_ = par; + auto tc_builder_ = tc_builder; + auto text_positions_ = text_positions; + tc_builder = 0; + text_positions = 0; + tags = 0; + tag_ids = 0; + par = 0; + return new xml_tree(tags_, tag_ids_, par_, + disable_text_index, + tc_builder_, + text_index_type, + text_positions_); + }; + + throw std::runtime_error("xml_tree_builder: inconsistent parser state"); +} + diff --git a/xml-tree-builder.hpp b/xml-tree-builder.hpp new file mode 100644 index 0000000..7550824 --- /dev/null +++ b/xml-tree-builder.hpp @@ -0,0 +1,48 @@ +#ifndef XML_TREE_BUILDER_HPP_ +#define XML_TREE_BUILDER_HPP_ + +#include +#include +#include "xml-tree.hpp" +#include "bit-vector.hpp" +#undef W +#undef WW +#undef Wminusone +#include + +class xml_tree_builder { + +public: + xml_tree_builder(); + ~xml_tree_builder(); + void open_document(bool disable_text_index, + unsigned int sample_rate, + SXSI::TextCollectionBuilder::index_type_t idx_type); + + xml_tree *close_document(); + void open_tag(std::string); + void close_tag(std::string); + void text(std::string); + +private: + void reset(); + int32_t register_tag(std::string); + int32_t register_tag(std::string, int32_t); + + + bit_vector *par; + std::vector *tags; + int32_t current_tag; + std::unordered_map *tag_ids; + bool opened; + + + bit_vector *text_positions; + SXSI::TextCollectionBuilder *tc_builder; + bool disable_text_index; + SXSI::TextCollectionBuilder::index_type_t text_index_type; + +}; + + +#endif diff --git a/xml-tree-inc.hpp b/xml-tree-inc.hpp new file mode 100644 index 0000000..6ce1724 --- /dev/null +++ b/xml-tree-inc.hpp @@ -0,0 +1,225 @@ +#ifndef XML_TREE_INTERNAL__ +#error "xml-tree-inc.hpp should not be included directly" +#endif + +#ifndef XML_TREE_INC_HPP_ +#define XML_TREE_INC_HPP_ + +inline uint32_t xml_tree::size() const +{ + return tag_seq_len / 2; +} + +inline uint32_t xml_tree::num_tags() const +{ + return tag_names->size(); +} + +inline uint32_t xml_tree::subtree_size(xml_tree::node_t x) const +{ + return bp_subtree_size(this->par, x); +} + +inline uint32_t +xml_tree::subtree_tags(xml_tree::node_t x, xml_tree::tag_t label) const +{ + xml_tree::node_t y = bp_find_close(this->par, x); + if (y - x < 10) { + uint32_t count = 0; + for(xml_tree::node_t i = x; i < y; i++) + count += (tag(i) == label); + return count; + } else { + return tags->rank(label, y) - tags->rank(label, x); + }; +} + +inline bool xml_tree::is_leaf(xml_tree::node_t x) const +{ + return !bp_inspect(this->par, x+1); +} + +inline bool xml_tree::is_ancestor(xml_tree::node_t x, + xml_tree::node_t y) const +{ + return bp_is_ancestor(this->par, x, y); +} + +inline bool +xml_tree::is_right_descendant(xml_tree::node_t x, + xml_tree::node_t y) const +{ + return + (x > root()) + && (y <= bp_parent_close(this->par, x)) + && (y >= bp_find_close(this->par, x)); +} + +inline bool xml_tree::is_first_child(xml_tree::node_t x) const +{ + return (x <= this->root()) || (bp_inspect(this->par, x - 1)); +} + +inline bool xml_tree::is_nil(xml_tree::node_t x) const +{ + return (x == xml_tree::NIL); +} + +inline xml_tree::tag_t xml_tree::tag(xml_tree::node_t x) const +{ + if (bits_per_tag == 8) + return (xml_tree::tag_t) (((unsigned char*) tag_seq)[x]); + else + return get_field(tag_seq, bits_per_tag, x); +} + +inline xml_tree::node_t xml_tree::root() const +{ + return xml_tree::ROOT; +} + + +inline xml_tree::node_t xml_tree::parent(xml_tree::node_t x) const +{ + return bp_parent(this->par, x); +} + +inline xml_tree::node_t xml_tree::first_child(node_t x) const +{ + return bp_first_child(this->par, x); +} + +inline xml_tree::node_t xml_tree::last_child(xml_tree::node_t x) const +{ + if (is_leaf(x)) + return xml_tree::NIL; + else + return bp_find_open(this->par, bp_find_close(this->par, x) - 1); +} + +inline xml_tree::node_t xml_tree::next_sibling(xml_tree::node_t x) const +{ + return bp_next_sibling(this->par, x); +} + +inline xml_tree::node_t xml_tree::prev_sibling(xml_tree::node_t x) const +{ + return bp_prev_sibling(this->par, x); +} + +inline xml_tree::node_t xml_tree::first_element(xml_tree::node_t x) const +{ + xml_tree::node_t n = first_child(x); + if (is_nil(n)) return xml_tree::NIL; + switch (tag(n)) { + case xml_tree::ATTRIBUTE_OPEN_TAG_ID: + n = next_sibling(n); + if (is_nil(n) || tag(n) != xml_tree::PCDATA_OPEN_TAG_ID) return n; + //Fallthrough + case PCDATA_OPEN_TAG_ID: + n = n + 2; + return bp_inspect(this->par, n) ? n : xml_tree::NIL; + }; +} + +inline xml_tree::node_t xml_tree::next_element(xml_tree::node_t x) const +{ + x = next_sibling(x); + if (is_nil(x)) return x; + if (tag(x) == xml_tree::PCDATA_OPEN_TAG_ID){ + xml_tree::node_t y = x + 2; + return bp_inspect(this->par, y) ? y : xml_tree::NIL; + } else + return x; +} + +inline xml_tree::node_t xml_tree::tagged_next(node_t x, tag_t tag) const +{ + return this->tags->select_next(tag, x); +} + +inline xml_tree::node_t +xml_tree::tagged_descendant(xml_tree::node_t x, + xml_tree::tag_t tag) const +{ + xml_tree::node_t y = tagged_next(x, tag); + return is_nil(y) || !is_ancestor(x, y) ? xml_tree::NIL : y; +} + +inline xml_tree::node_t +xml_tree::tagged_following_before(xml_tree::node_t x, + xml_tree::tag_t tag, + xml_tree::node_t limit) const +{ + xml_tree::node_t close = bp_find_close(this->par, x); + xml_tree::node_t s = tagged_next(close, tag); + return (s < limit) ? s : xml_tree::NIL; +} + + +inline xml_tree::node_t xml_tree::tagged_child(xml_tree::node_t x, + xml_tree::tag_t t) const +{ + xml_tree::node_t c = first_child(x); + if (is_nil(c) || tag(c) == t) + return c; + else + tagged_sibling(c, t); +} + +inline xml_tree::node_t xml_tree::tagged_sibling(xml_tree::node_t x, + xml_tree::tag_t t) const +{ + xml_tree::node_t sibling = next_sibling(x); + while(!is_nil(sibling) && tag(sibling) != t) sibling = next_sibling(sibling); + return sibling; +} + +xml_tree::node_t xml_tree::closing(xml_tree::node_t x) const +{ + return bp_find_close(this->par, x); +} + + +inline SXSI::TextCollection *xml_tree::get_text_collection() const +{ + return text_collection; +} + +inline xml_tree::node_t xml_tree::parent_node(int32_t d) const +{ + return (xml_tree::node_t) text_positions->select1(d + 1); +} + +inline SXSI::TextCollection::document_result +xml_tree::prefix(uchar const *s) const +{ + return text_collection->Prefix(s); +} + +inline SXSI::TextCollection::document_result +xml_tree::suffix(uchar const *s) const +{ + return text_collection->Suffix(s); +} + +inline SXSI::TextCollection::document_result +xml_tree::equals(uchar const *s) const +{ + return text_collection->Equal(s); +} + +inline SXSI::TextCollection::document_result +xml_tree::contains(uchar const *s) const +{ + return text_collection->Contains(s); +} + +inline SXSI::TextCollection::document_result +xml_tree::less_than(uchar const *s) const +{ + return text_collection->LessThan(s); +} + + +#endif //XML_TREE_INC_HPP_ diff --git a/xml-tree.cpp b/xml-tree.cpp new file mode 100644 index 0000000..6d33878 --- /dev/null +++ b/xml-tree.cpp @@ -0,0 +1,580 @@ +extern "C" { +#include +#include +#include +#include +} + +#include +#include +#include +#include +#include "xml-tree.hpp" + +using namespace SXSI; + +const xml_tree::tag_t xml_tree::NIL_TAG_ID; +const char* xml_tree::NIL_TAG = ""; +const xml_tree::tag_t xml_tree::DOCUMENT_OPEN_TAG_ID; +const char* xml_tree::DOCUMENT_OPEN_TAG = ""; +const xml_tree::tag_t xml_tree::ATTRIBUTE_OPEN_TAG_ID; +const char* xml_tree::ATTRIBUTE_OPEN_TAG = "<@>"; +const xml_tree::tag_t xml_tree::PCDATA_OPEN_TAG_ID; +const char* xml_tree::PCDATA_OPEN_TAG = "<$>"; +const xml_tree::tag_t xml_tree::ATTRIBUTE_DATA_OPEN_TAG_ID; +const char* xml_tree::ATTRIBUTE_DATA_OPEN_TAG = "<@$>"; +const xml_tree::tag_t xml_tree::CLOSE_TAG_ID; +const char* xml_tree::CLOSE_TAG = ""; + +static int bits8 (int t ) { + int r = bits(t); + if (r <= 8) + return 8; + else if (r <= 16) + return 16; + else + return r; +} + + + + +static void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream) + { + size_t res; + res = fread(ptr,size,nmemb,stream); + if (res < nmemb) + throw std::runtime_error("ufread I/O error" ); + return; + } + +static void ufwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) + { + size_t res; + res = fwrite(ptr,size,nmemb,stream); + if (res < nmemb) + throw std::runtime_error("ufwrite I/O error"); + return; + } + + + +xml_tree::xml_tree() +{ + print_stack = 0; + print_buffer = 0; +} + +xml_tree::xml_tree(std::vector *tags, + std::unordered_map *tag_ids, + bit_vector *parbitmap, + bool disable_tc, + SXSI::TextCollectionBuilder *tc_builder, + SXSI::TextCollectionBuilder::index_type_t idx_type, + bit_vector *textbitmap) +{ + print_stack = 0; + print_buffer = 0; + + size_t npar = parbitmap->size(); + parbitmap->pack(); + par = bp_construct(npar, + parbitmap->get_vector_ptr(), + OPT_DEGREE); + delete parbitmap; + + this->tag_ids = tag_ids; + tag_names = new std::vector(); + tag_names->resize(tag_ids->size()); + for(auto val : *(this->tag_ids)) + (*this->tag_names)[val.second] = val.first; + + uint32_t max_tag = tag_names->size() - 1; + static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray(); + alphabet_mapper *am = new alphabet_mapper_none(); + this->tags = new static_sequence_bs((uint32_t *) &tags[0], npar, am, bmb); + bits_per_tag = bits8(max_tag); + tag_seq_len = npar; + tag_seq = new uint32_t[uint_len(bits_per_tag, tag_seq_len)]; + for(size_t i = 0; i < (size_t) tag_seq_len; i++) + set_field(tag_seq, bits_per_tag, i, (uint32_t) (*tags)[i]); + + delete tags; + + if (disable_tc) { + text_positions = 0; + text_collection = 0; + } else { + textbitmap->pack(); + uint32_t * textbm = textbitmap->get_vector_ptr(); + text_positions = new static_bitsequence_rrr02(textbm, + npar, + 32); + delete [] textbm; + + this->text_index_type = idx_type; + text_collection = tc_builder->InitTextCollection(); + delete tc_builder; + }; + + +} + +xml_tree::~xml_tree() +{ + bp_delete(par); + delete tags; + delete [] tag_seq; + delete tag_names; + delete tag_ids; + + if (text_collection) delete text_collection; + if (text_positions) delete text_positions; +} + +bool xml_tree::is_child(xml_tree::node_t x, xml_tree::node_t y) const +{ + return !is_leaf(x) && is_ancestor(y, x) && depth(x)+1 == depth(y); +} + +uint32_t xml_tree::depth(xml_tree::node_t x) const +{ + return bp_depth(this->par, x); +} + + +uint32_t xml_tree::preorder(xml_tree::node_t x) const +{ + return bp_preorder_rank(this->par, x); +} + +uint32_t xml_tree::postorder(xml_tree::node_t x) const +{ + return bp_postorder_rank(this->par, x); +} + +xml_tree::node_t +xml_tree::select_child(xml_tree::node_t x, + std::unordered_set *tags) const +{ + if (is_nil(x) || is_leaf(x)) return xml_tree::NIL; + xml_tree::node_t child = first_child(x); + if (tags->find(tag(child)) != tags->end()) return child; + return select_sibling(child, tags); +} + +xml_tree::node_t +xml_tree::select_descendant(xml_tree::node_t x, + std::unordered_set *tags) const +{ + if (is_leaf(x)) return xml_tree::NIL; + auto min = xml_tree::NIL; + auto aux = xml_tree::NIL; + for(auto tag = tags->begin(); tag != tags->end(); ++ tag){ + aux = tagged_descendant(x, *tag); + if ((unsigned int) aux < (unsigned int) min) min = aux; + }; + return min; +} + + +xml_tree::node_t +xml_tree::select_sibling(xml_tree::node_t x, + std::unordered_set *tags) const +{ + xml_tree::node_t sibling = next_sibling(x); + while(!is_nil(sibling) && tags->find(tag(sibling)) == tags->end()) + sibling = next_sibling(sibling); + return (sibling); +} + +xml_tree::node_t +xml_tree::select_following_before(xml_tree::node_t x, + std::unordered_set *tags, + xml_tree::node_t limit) const +{ + auto min = xml_tree::NIL; + auto aux = xml_tree::NIL; + for(auto tag = tags->begin(); tag != tags->end(); ++tag){ + aux = tagged_following_before(x, *tag, limit); + if ((unsigned int) aux < (unsigned int) min) min = aux; + } + return min; +} + +void xml_tree::save(int fd, char* s) +{ + FILE* fp; + int fd2 = dup(fd); + + fp = fdopen(fd2, "w"); + + if (fd == 0) throw(std::runtime_error(strerror(errno))); + saveTree(par, fp); //TODO use new api + + int ntags = tag_names->size(); + + ufwrite(&ntags, sizeof(int), 1, fp); + for (int i = 0; i < ntags;i++) + fprintf(fp, "%s\n", tag_names->at(i).c_str()); + + tags->save(fp); + + ufwrite(&bits_per_tag, sizeof(uint),1,fp); + ufwrite(&tag_seq_len, sizeof(uint),1,fp); + ufwrite(tag_seq, sizeof(uint), uint_len(bits_per_tag, tag_seq_len), fp); + + bool disable_tc = text_collection == 0 || text_positions == 0; + ufwrite(&disable_tc, sizeof(bool),1,fp); + + text_positions->save(fp); + if (!disable_tc) { + ufwrite(&text_index_type, + sizeof(TextCollectionBuilder::index_type_t), + 1, fp); + + std::string file(s); + switch (text_index_type){ + case TextCollectionBuilder::index_type_default: + file.append("_default"); + break; + case TextCollectionBuilder::index_type_swcsa: + file.append("_swcsa"); + break; + case TextCollectionBuilder::index_type_rlcsa: + file.append("_rlcsa"); + break; + }; + + text_collection->Save(fp, file.c_str()); + }; + + fflush(fp); + fclose(fp); + +} +//static xml_tree* load(char*, bool, int); +// void print(int, node_t, bool no_text=false); +xml_tree* xml_tree::load(int fd, char* name, bool load_tc, int sf) +{ + FILE *fp; + char buffer[1024]; + + int i; + buffer[1023] = '\0'; + fp = fdopen(fd, "r"); + + xml_tree *tree = new xml_tree(); + + tree->par = loadTree(fp); //TODO use new api + tree->tag_names = new std::vector(); + tree->tag_ids = new std::unordered_map(); + std::string s; + int ntags; + + ufread(&ntags, sizeof(int), 1, fp); + + for (i = 0; i < ntags; i++) { + if (fgets(buffer, 1022, fp) != buffer) + throw std::runtime_error("xml_tree::load: cannot read tag list"); + s = buffer; + // remove the trailing \n + s.erase(s.size()-1); + tree->tag_names->push_back(s); + tree->tag_ids->insert(std::make_pair(s, + static_cast(i))); + + }; + + tree->tags = static_sequence::load(fp); + ufread(&tree->bits_per_tag, sizeof(uint), 1, fp); + ufread(&tree->tag_seq_len, sizeof(uint), 1, fp); + size_t size = uint_len(tree->bits_per_tag, tree->tag_seq_len); + tree->tag_seq = new uint[size]; + ufread(tree->tag_seq, sizeof(uint), size, fp); + + bool disable_tc; + ufread(&disable_tc, sizeof(bool), 1, fp); + + if (disable_tc || !load_tc) { + tree->text_positions = 0; + tree->text_collection = 0; + } else { + + tree->text_positions = static_bitsequence_rrr02::load(fp); + + ufread(&tree->text_index_type, + sizeof(TextCollectionBuilder::index_type_t), 1, fp); + + std::string file(name); + switch (tree->text_index_type){ + case TextCollectionBuilder::index_type_default: + file.append("_default"); + break; + case TextCollectionBuilder::index_type_swcsa: + file.append("_swcsa"); + break; + case TextCollectionBuilder::index_type_rlcsa: + file.append("_rlcsa"); + break; + }; + + + tree->text_collection = + TextCollection::Load(fp, + file.c_str(), + TextCollection::index_mode_default, + sf); + + }; + return tree; +} + +uint32_t xml_tree::subtree_elements(xml_tree::node_t x) const +{ + + uint32_t size = bp_subtree_size(par, x); + if (x == root()){ + x = bp_first_child(par,x); + size = size - 1; + }; + + int s = x + 2*size - 1; + int ntext = + tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, s) - + tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, x-1); + + size = size - ntext; + xml_tree::node_t fin = bp_find_close(par, x); + xml_tree::node_t y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, x); + while (y != xml_tree::NIL && y < fin){ + size -= subtree_size(y); + y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, y); + }; + return size; + } + +uint32_t xml_tree::num_children(xml_tree::node_t x) const +{ + return bp_degree(par, x); +} + +uint32_t xml_tree::child_pos(xml_tree::node_t x) const +{ + return bp_child_rank(par, x); +} + +xml_tree::node_t xml_tree::child(xml_tree::node_t x, uint32_t i) const +{ + if (i < 10) return bp_naive_child(par, x, i); + else bp_child(par, x, i); +} + +std::pair xml_tree::text_id_range(xml_tree::node_t x) const +{ + int32_t i, j; + i = text_positions->rank1(x - 1); + j = text_positions->rank1(x + 2 * bp_subtree_size(par, x) - 2); + if (i == j) + return std::make_pair(xml_tree::NIL, xml_tree::NIL); + else + return std::make_pair(i + 1, j); +} + +int32_t xml_tree::text_id(xml_tree::node_t x) const +{ + return (int32_t) text_positions->rank1(x) - 1; +} + +unsigned char* xml_tree::get_text(int32_t id) const +{ + unsigned char * s = text_collection->GetText(id); + return s + (s[0] == 1); +} + +void xml_tree::uflush(int fd) +{ + size_t size = print_buffer->size(); + if (size < BUFFER_SIZE) return; + uflush_r(fd, size); +} +void xml_tree::flush(int fd) +{ + uflush_r(fd, print_buffer->size()); +} + + +void xml_tree::uflush_r(int fd, size_t s) +{ + if (s == 0) return; + size_t written; + while (1) { + written = write(fd, print_buffer->data(), s); + if ((written < 0) && (errno == EAGAIN || errno == EINTR)) + continue; + break; + }; + print_buffer->clear(); +} +void xml_tree::uput_str(std::string s, int fd) +{ + print_buffer->append(s); + uflush(fd); +} +void xml_tree::uputs(const char* s, int fd) +{ + print_buffer->append(s); + uflush(fd); +} +void xml_tree::uputc(const char c, int fd) +{ + print_buffer->push_back(c); + uflush(fd); +} + +const char * xml_tree::get_tag_name_by_ref(xml_tree::tag_t tagid) const +{ + + unsigned char *s; + if (tagid < 0 || tagid >= tag_names->size()) + return ""; + + return (const char *) (*tag_names)[tagid].c_str(); +} + +xml_tree::tag_t xml_tree::register_tag(char *s) +{ + auto found = tag_ids->find(std::string(s)); + if (found == tag_ids->end()) + return xml_tree::NIL_TAG_ID; + else + return (*found).second; +} + +size_t xml_tree::uprintf(const char*s, int fd) +{ + if (s == 0) return 0; + size_t i; + char c; + for (i = 0; (c = s[i]); i++) { + switch (c) { + case '"': + uputs(""", fd); + break; + case '&': + uputs("&", fd); + break; + case '\'': + uputs("'", fd); + break; + case '<': + uputs("<", fd); + break; + case '>': + uputs(">", fd); + break; + default: + uputc(c, fd); + + }; + }; + return i; +} + +void xml_tree::print(int fd, xml_tree::node_t x, bool no_text) +{ + if (print_buffer == 0) { + print_buffer = new std::string(BUFFER_SIZE, 0); + print_buffer->clear(); + print_stack = new std::vector(); + print_stack->reserve(256); + }; + xml_tree::node_t fin = bp_find_close(par, x); + xml_tree::node_t n = x; + xml_tree::tag_t label = tag(n); + unsigned char * orig_text; + unsigned char * current_text; + + auto r = text_id_range(x); + if (r.first == xml_tree::NIL) + current_text = 0; + else + current_text = get_text(r.first); + + orig_text = current_text; + size_t read = 0; + + while (n <= fin) { + + if (bp_inspect(par, n)) { + if (label == xml_tree::PCDATA_OPEN_TAG_ID){ + if (no_text) { + uputs("<$/>", fd); + } else { + read = uprintf( (const char*) current_text, fd); + current_text += read + 1; + }; + n += 2; // skip closin $ + label = tag(n); + } else { + + uputc('<', fd); + uput_str((*tag_names)[label], fd); + n++; + if (bp_inspect(par, n)) { + print_stack->push_back((*tag_names)[label]); + label = tag(n); + if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) { + n++; + if (no_text) uputs("><@@>", fd); + + while (bp_inspect(par, n)) + if (no_text) { + uputc('<', fd); + uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); + uputc('>', fd); + uputs("<$@/>', fd); + n += 4; + } else { + uputc(' ', fd); + uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); + n++; + uputs("=\"", fd); + read = uprintf((const char*) current_text, fd); + current_text += read + 1; + uputc('"', fd); + n += 3; + }; + + if (no_text) + uputs("", fd); + else uputc('>', fd); + n++; + label = tag(n); + } else + uputc('>', fd); + } else { + uputs("/>", fd); + n++; + label = tag(n); + }; + }; + } else do { + uputs("back(), fd); + uputc('>', fd); + print_stack->pop_back(); + n++; + } while (!bp_inspect(par, n) || print_stack->empty()); + label = tag(n); + }; + uputc('\n', fd); + if (orig_text && text_index_type != TextCollectionBuilder::index_type_default) + if (*orig_text == '\0') + text_collection->DeleteText(orig_text - 1); + else + text_collection->DeleteText(orig_text); + +} diff --git a/xml-tree.hpp b/xml-tree.hpp new file mode 100644 index 0000000..950f0ef --- /dev/null +++ b/xml-tree.hpp @@ -0,0 +1,159 @@ +#ifndef XML_TREE_HPP_ +#define XML_TREE_HPP_ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bit-vector.hpp" + +#undef W +#undef Wminusone +#undef WW + +#include +#include + +class xml_tree_builder; + +class xml_tree { +public: + typedef int32_t node_t; + typedef int32_t tag_t; + + static const node_t NIL = -1; + static const node_t ROOT = 0; + + static const char* NIL_TAG; + static const tag_t NIL_TAG_ID = -1; + static const char* DOCUMENT_OPEN_TAG; + static const tag_t DOCUMENT_OPEN_TAG_ID = 0; + static const char* ATTRIBUTE_OPEN_TAG; + static const tag_t ATTRIBUTE_OPEN_TAG_ID = 1; + static const char* PCDATA_OPEN_TAG; + static const tag_t PCDATA_OPEN_TAG_ID = 2; + static const char* ATTRIBUTE_DATA_OPEN_TAG; + static const tag_t ATTRIBUTE_DATA_OPEN_TAG_ID = 3; + static const char* CLOSE_TAG; + static const tag_t CLOSE_TAG_ID = 4; + + + ~xml_tree(); + + //Counting functions + inline uint32_t size() const; + inline uint32_t num_tags() const; + inline uint32_t subtree_size(node_t) const; + inline uint32_t subtree_tags(node_t, tag_t) const; + uint32_t subtree_elements(node_t) const; + uint32_t num_children(node_t) const; + uint32_t child_pos(node_t) const; + + + //Node_T tests + inline bool is_leaf(node_t) const; + inline bool is_ancestor(node_t, node_t) const; + inline bool is_right_descendant(node_t, node_t) const; + bool is_child(node_t, node_t) const; + inline bool is_first_child(node_t) const; + inline bool is_nil(node_t) const; + + uint32_t depth(node_t) const; + uint32_t preorder(node_t) const; + uint32_t postorder(node_t) const; + + //Tag functions + inline tag_t tag(node_t) const; + const char* get_tag_name_by_ref(tag_t) const; + tag_t register_tag(char *s); + + //Navigation functions + inline node_t root () const; + node_t child(node_t, uint32_t) const; + inline node_t parent(node_t) const; + inline node_t first_child(node_t) const; + inline node_t last_child(node_t) const; + inline node_t next_sibling(node_t) const; + inline node_t prev_sibling(node_t) const; + inline node_t first_element(node_t) const; + inline node_t next_element(node_t) const; + inline node_t tagged_next(node_t, tag_t) const; + inline node_t tagged_descendant(node_t, tag_t) const; + inline node_t tagged_following_before(node_t, tag_t, node_t) const; + inline node_t tagged_child(node_t, tag_t) const; + inline node_t tagged_sibling(node_t, tag_t) const; + node_t select_child(node_t, std::unordered_set*) const; + node_t select_descendant(node_t, std::unordered_set*) const; + node_t select_sibling(node_t, std::unordered_set*) const; + node_t select_following_before (node_t, + std::unordered_set*, node_t) const; + inline node_t closing(node_t) const; + + //Text functions + inline node_t parent_node(int32_t) const; + inline SXSI::TextCollection *get_text_collection() const; + std::pair text_id_range(node_t) const; + int32_t text_id(node_t) const; + unsigned char* get_text(int32_t) const; + + SXSI::TextCollection::document_result prefix(uchar const *s) const; + SXSI::TextCollection::document_result suffix(uchar const *s) const; + SXSI::TextCollection::document_result equals(uchar const *s) const; + SXSI::TextCollection::document_result contains(uchar const *s) const; + SXSI::TextCollection::document_result less_than(uchar const *s) const; + + //I/O functions + void save(int, char*); + static xml_tree* load(int, char*, bool, int); + void print(int, node_t, bool no_text=false); + void flush(int); + +private: + friend class xml_tree_builder; + xml_tree(); + xml_tree(std::vector*, + std::unordered_map*, + bit_vector*, + bool, + SXSI::TextCollectionBuilder *, + SXSI::TextCollectionBuilder::index_type_t, + bit_vector*); + + //Parenthesis sequence + bp *par; + //tag sequence + static_sequence *tags; + uint32_t *tag_seq; + uint32_t tag_seq_len; + uint32_t bits_per_tag; + //Mapping from tag_t identifiers to/from tagnames + std::vector *tag_names; + std::unordered_map *tag_ids; + //Text index + SXSI::TextCollection *text_collection; + static_bitsequence *text_positions; + SXSI::TextCollectionBuilder::index_type_t text_index_type; + //auxiliary data structures for efficient printing. + std::vector *print_stack; + std::string *print_buffer; + +#define BUFFER_SIZE (8192*2) + + void uflush(int); + void uflush_r(int, size_t); + void uput_str(std::string s, int fd); + void uputs(const char* s, int fd); + void uputc(const char c, int fd); + size_t uprintf(const char*s, int fd); +}; + +#define XML_TREE_INTERNAL__ 1 +#include "xml-tree-inc.hpp" + +#endif //XML_TREE_HPP_ -- 2.17.1