-INC_FLAGS=-I.. -I../libcds/includes/
+INC_FLAGS=-I.. -I../libcds/includes/ -I.
ifeq ($(VERBOSE), true)
ifeq ($(DEBUG), true)
OPT_FLAGS=-O0 -g $(POPCOUNT_FLAG) -static
else
- OPT_FLAGS=-O4 $(POPCOUNT_FLAG) -static -flto
+ OPT_FLAGS=-O3 $(POPCOUNT_FLAG) -static -flto
endif
ifeq ($(PROFILE), true)
CXXFLAGS=-std=c++0x $(INC_FLAGS) $(OPT_FLAGS) $(PROF_FLAGS)
CXX=g++
-OBJECTS_XMLTREE=XMLTree.o XMLTreeBuilder.o
-XMLTREE_A=libXMLTree.a
+#OBJECTS_XMLTREE=XMLTree.o XMLTreeBuilder.o
+#XMLTREE_A=libXMLTree.a
+OBJECTS_XMLTREE=bit-vector.o xml-tree.o xml-tree-builder.o
+XMLTREE_A=libxml-tree.a
+++ /dev/null
-#include "common.h"\r
-#include "XMLTree.h"\r
-#include "timings.h"\r
-#include <errno.h>\r
-using std::cout;\r
-using std::endl;\r
-using std::min;\r
-using std::string;\r
-\r
-// functions to convert tag positions to the corresponding tree node and viceversa.\r
-// These are implemented in order to be able to change the tree and Tags representations,\r
-// without affecting the code so much.\r
-// Current implementation corresponds to balanced-parentheses representation for\r
-// the tree, and storing 2 tags per tree node (opening and closing tags).\r
-\r
-\r
-static int bits8 (int t ) {\r
- int r = bits(t);\r
- if (r <= 8)\r
- return 8;\r
- else if (r <= 16)\r
- return 16;\r
- else\r
- return r;\r
-}\r
-\r
-\r
-\r
-static treeNode fast_sibling(bp* Par,treeNode x,TagType tag){\r
-\r
- if (tag == PCDATA_TAG_ID){\r
- x = x+2;\r
- return bp_inspect(Par,x)==OP ? x : NULLT;\r
- } else return bp_next_sibling(Par,x);\r
-\r
-}\r
-\r
-\r
-\r
-\r
-inline uint get_field_no_power(uint *A, uint len, uint index) {\r
-\r
- register uint i=index*len/W, j=index*len-W*i;\r
- return (j+len <= W) ? (A[i] << (W-j-len)) >> (W-len) : (A[i] >> j) | (A[i+1] << (WW-j-len)) >> (W-len);\r
-\r
-}\r
-\r
-static uint fast_get_field(uint* A,int len, int idx)\r
-{\r
- uint f1, f2;\r
- switch (len) {\r
- case 8:\r
- return (uint) (((uchar*)A)[idx]);\r
- case 16:\r
- f2 = ((unsigned short*)A)[idx];\r
- f1 = ((unsigned short*)A)[idx+1];\r
- return (f1 << 16) + f2;\r
- default:\r
- return get_field_no_power (A,len,idx);\r
- };\r
-\r
-}\r
-\r
-\r
-\r
-\r
-XMLTree::XMLTree( pb * const par, uint npar, vector<string> * const TN, TagIdMap * const tim,\r
- uint *empty_texts_bmp, TagType *tags,\r
- TextCollectionBuilder * const TCB, bool dis_tc,\r
- TextCollectionBuilder::index_type_t _index_type )\r
- {\r
- buffer = 0;\r
- print_stack = 0;\r
- // creates the data structure for the tree topology\r
- STARTTIMER();\r
- Par = bp_construct(npar, (pb*) par, OPT_DEGREE|0);\r
- STOPTIMER(Building);\r
- PRINTTIME("Building parenthesis struct", Building);\r
- STARTTIMER();\r
-\r
-\r
- // creates structure for tags\r
-\r
- TagName = (vector<string>*)TN;\r
- tIdMap = (TagIdMap *) tim;\r
-\r
- uint max_tag = TN->size() - 1;\r
-\r
- static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray();\r
- alphabet_mapper *am = new alphabet_mapper_none();\r
- Tags = new static_sequence_bs((uint*)tags,npar,am,bmb);\r
-\r
- //cout << "Tags test: " << Tags->test((uint*)tags,npar) << endl;\r
-\r
- //Ensures that for small tag numbers, we are on an 8bit boundary.\r
- //Makes tag access way faster with negligeable waste of space.\r
- tags_blen = bits8(max_tag);\r
- std::cerr << "Tags blen is " << tags_blen << "\n";\r
- tags_len = (uint)npar;\r
- tags_fix = new uint[uint_len(tags_blen,tags_len)];\r
- for(uint i=0;i<(uint)npar;i++)\r
- set_field(tags_fix,tags_blen,i,tags[i]);\r
- delete bmb;\r
- free(tags);\r
- tags = NULL;\r
-\r
- STOPTIMER(Building);\r
- PRINTTIME("Building Tag Structure", Building);\r
-\r
- EBVector = new static_bitsequence_rrr02(empty_texts_bmp,npar,32);\r
- free(empty_texts_bmp);\r
- empty_texts_bmp = NULL;\r
-\r
-\r
- disable_tc = dis_tc;\r
- text_index_type = _index_type;\r
- if (!disable_tc) {\r
- assert(TCB != 0);\r
- STARTTIMER();\r
- Text = TCB->InitTextCollection();\r
- delete TCB;\r
- STOPTIMER(Building);\r
- PRINTTIME("Building TextCollection", Building);\r
-\r
- } else {\r
- Text = NULL;\r
- }\r
-\r
- std::cerr << "Number of distinct tags " << TagName->size() << "\n";\r
- //std::cerr.flush();\r
- }\r
-\r
-\r
-// ~XMLTree: frees memory of XML tree.\r
-XMLTree::~XMLTree()\r
- {\r
- int i;\r
-\r
- bp_delete(Par);\r
- Par = NULL;\r
-\r
- delete tIdMap;\r
- tIdMap = NULL;\r
-\r
- delete TagName;\r
- TagName = NULL;\r
-\r
- delete Tags;\r
- Tags = NULL;\r
-\r
- delete Text;\r
- Text = NULL;\r
-\r
- delete EBVector;\r
- EBVector = NULL;\r
-\r
- }\r
-\r
-\r
-void XMLTree::print_stats()\r
- {\r
- uint total_space = Tags->size()+sizeof(static_sequence*);\r
- total_space += sizeof(uint*)+sizeof(uint)*(2+uint_len(tags_blen,tags_len));\r
- cout << "Space usage for XMLTree:" << endl\r
- << " - tags static_sequence: " << Tags->size()+sizeof(static_sequence*) << endl\r
- << " - tags access array: " << sizeof(uint*)+sizeof(uint)*(2+uint_len(tags_blen,tags_len)) << endl\r
- << " ... add Diego structures ... " << endl\r
- << " *total* " << total_space << endl;\r
- }\r
-\r
-// Save: saves XML tree data structure to file.\r
-void XMLTree::Save(int fd, char * name)\r
- {\r
- FILE *fp;\r
- int i;\r
- off_t pos = lseek(fd, 0, SEEK_CUR);\r
- int fd2 = dup(fd);\r
- fp = fdopen(fd2, "w");\r
- fseek(fp, pos, SEEK_SET);\r
- // first stores the tree topology\r
- saveTree(Par, fp);\r
-\r
- // stores the table with tag names\r
- int ntags = TagName->size();\r
-\r
- ufwrite(&ntags, sizeof(int), 1, fp);\r
- for (i = 0; i<ntags;i++)\r
- fprintf(fp, "%s\n",TagName->at(i).c_str());\r
-\r
-\r
- // stores the tags\r
- Tags->save(fp);\r
- ufwrite(&tags_blen,sizeof(uint),1,fp);\r
- ufwrite(&tags_len,sizeof(uint),1,fp);\r
- ufwrite(tags_fix,sizeof(uint),uint_len(tags_blen,tags_len),fp);\r
-\r
- // flags\r
- ufwrite(&disable_tc, sizeof(bool),1,fp);\r
-\r
- //text positions\r
- EBVector->save(fp);\r
- std::cerr << "TC Index position: " << ftell(fp) << std::endl;\r
- // stores the texts\r
- if (!disable_tc) {\r
- std::cerr << "Writing " << sizeof(TextCollectionBuilder::index_type_t) << " bytes\n" << std::endl;\r
- ufwrite(&text_index_type, sizeof(TextCollectionBuilder::index_type_t), 1, fp);\r
-\r
-\r
- string file(name);\r
- switch (text_index_type){\r
- case TextCollectionBuilder::index_type_default:\r
- file.append("_default");\r
- break;\r
- case TextCollectionBuilder::index_type_swcsa:\r
- file.append("_swcsa");\r
- break;\r
- case TextCollectionBuilder::index_type_rlcsa:\r
- file.append("_rlcsa");\r
- break;\r
- };\r
-\r
- Text->Save(fp, file.c_str());\r
-\r
-\r
- }\r
- fflush(fp);\r
- fclose(fp);\r
- }\r
-\r
-// Load: loads XML tree data structure from file. Returns\r
-// a pointer to the loaded data structure\r
-XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor, char * name)\r
- {\r
-\r
- FILE *fp;\r
- char buffer[1024];\r
- XMLTree *XML_Tree;\r
- int i;\r
-\r
- buffer[1023] = '\0';\r
-\r
- fp = fdopen(fd, "r");\r
-\r
- XML_Tree = new XMLTree();\r
- STARTTIMER();\r
- // Load the tree structure\r
- XML_Tree->Par = loadTree(fp);\r
- STOPTIMER(Loading);\r
- PRINTTIME("Loading parenthesis struct", Loading);\r
- STARTTIMER();\r
-\r
- XML_Tree->TagName = new std::vector<std::string>();\r
- XML_Tree->tIdMap = new std::unordered_map<std::string,int>();\r
- std::string s;\r
- int ntags;\r
-\r
- // Load the tag names\r
- ufread(&ntags, sizeof(int), 1, fp);\r
-\r
- for (i=0; i<ntags;i++) {\r
- if (fgets(buffer,1022,fp) != buffer)\r
- throw "Cannot read tag list";\r
- s = buffer;\r
- // remove the trailing \n\r
- s.erase(s.size()-1);\r
- XML_Tree->TagName->push_back(s);\r
- XML_Tree->tIdMap->insert(std::make_pair(s,i));\r
-\r
- };\r
- STOPTIMER(Loading);\r
- PRINTTIME("Loading tag names struct", Loading);\r
- STARTTIMER();\r
-\r
- // loads the tag structure\r
- XML_Tree->Tags = static_sequence::load(fp);\r
- ufread(&XML_Tree->tags_blen,sizeof(uint),1,fp);\r
- std::cerr << "tags_blen is "<< XML_Tree->tags_blen <<"\n";\r
- ufread(&XML_Tree->tags_len,sizeof(uint),1,fp);\r
- XML_Tree->tags_fix = new uint[uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)];\r
- ufread(XML_Tree->tags_fix,sizeof(uint),uint_len(XML_Tree->tags_blen,XML_Tree->tags_len),fp);\r
-\r
- // TODO ask francisco about this\r
- /// FIXME:UGLY tests!\r
- //uint * seq = new uint[XML_Tree->tags_len];\r
- //for(uint i=0;i<XML_Tree->tags_len;i++)\r
- // seq[i] = get_field(XML_Tree->tags_fix,XML_Tree->tags_blen,i);\r
- //cout << "Tags test: " << XML_Tree->Tags->test(seq,XML_Tree->tags_len) << endl;\r
- //XML_Tree->Tags->test(seq,XML_Tree->tags_len);\r
- //delete [] seq;\r
- /// End ugly tests\r
-\r
- STOPTIMER(Loading);\r
- std::cerr << (uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)*sizeof(uint))/(1024*1024) << " MB for tag sequence" << std::endl;\r
- PRINTTIME("Loading tag struct", Loading);\r
- STARTTIMER();\r
-\r
- // loads the flags\r
-\r
- ufread(&(XML_Tree->disable_tc), sizeof(bool), 1, fp);\r
- if (load_tc) {\r
- XML_Tree->EBVector = static_bitsequence_rrr02::load(fp);\r
-\r
- STOPTIMER(Loading);\r
- PRINTTIME("Loading text bitvector struct", Loading);\r
- STARTTIMER();\r
- std::cerr << "TC Load Index position: " << ftell(fp) << std::endl;\r
- // Not used\r
- // loads the texts\r
- if (!XML_Tree->disable_tc){\r
- ufread(&(XML_Tree->text_index_type),\r
- sizeof(TextCollectionBuilder::index_type_t), 1, fp);\r
- string file(name);\r
- switch (XML_Tree->text_index_type){\r
- case TextCollectionBuilder::index_type_default:\r
- file.append("_default");\r
- break;\r
- case TextCollectionBuilder::index_type_swcsa:\r
- file.append("_swcsa");\r
- break;\r
- case TextCollectionBuilder::index_type_rlcsa:\r
- file.append("_rlcsa");\r
- break;\r
- };\r
-\r
-\r
- XML_Tree->Text = TextCollection::Load(fp, file.c_str(), TextCollection::index_mode_default, sample_factor);\r
-\r
- }\r
- else XML_Tree->Text = NULL;\r
- STOPTIMER(Loading);\r
- PRINTTIME("Loading TextCollection", Loading);\r
- STARTTIMER();\r
- }\r
- else {\r
- XML_Tree->EBVector = NULL;\r
- XML_Tree->Text = NULL;\r
- XML_Tree->disable_tc = true;\r
- };\r
-\r
-\r
- return XML_Tree;\r
- }\r
-\r
-\r
-\r
-\r
-int XMLTree::SubtreeElements(treeNode x)\r
- {\r
-\r
- int size = bp_subtree_size(Par, x);\r
- if (x == Root()){\r
- x = bp_first_child(Par,x);\r
- size = size - 1;\r
- };\r
-\r
- int s = x + 2*size - 1;\r
- int ntext = Tags->rank(PCDATA_TAG_ID, s) - Tags->rank(PCDATA_TAG_ID, node2tagpos(x)-1);\r
- size = size - ntext;\r
- treeNode fin = bp_find_close(Par,x);\r
- treeNode y = Tags->select_next(ATTRIBUTE_TAG_ID,node2tagpos(x));\r
- while (y != NULLT && y < fin){\r
- size -= SubtreeSize(y);\r
- y = Tags->select_next(ATTRIBUTE_TAG_ID, node2tagpos(y));\r
- };\r
- return size;\r
- }\r
-\r
-// IsLeaf(x): returns whether node x is leaf or not. In the succinct representation\r
-// this is just a bit inspection.\r
-bool XMLTree::IsLeaf(treeNode x)\r
- {\r
- NULLT_IF(x==NULLT);\r
- return bp_isleaf(Par, x);\r
- }\r
-\r
-// IsAncestor(x,y): returns whether node x is ancestor of node y.\r
-bool XMLTree::IsAncestor(treeNode x, treeNode y)\r
- {\r
- return bp_is_ancestor(Par, x, y);\r
- }\r
-\r
-// IsChild(x,y): returns whether node x is parent of node y.\r
-bool XMLTree::IsChild(treeNode x, treeNode y)\r
- {\r
- if (!bp_is_ancestor(Par, x, y)) return false;\r
- return bp_depth(Par, x) == (bp_depth(Par, y) + 1);\r
- }\r
-\r
-\r
-// NumChildren(x): number of children of node x. Constant time with the data structure\r
-// of Sadakane.\r
-int XMLTree::NumChildren(treeNode x)\r
- {\r
- return bp_degree(Par, x);\r
- }\r
-\r
-// ChildNumber(x): returns i if node x is the i-th children of its parent.\r
-int XMLTree::ChildNumber(treeNode x)\r
- {\r
- return bp_child_rank(Par, x);\r
- }\r
-\r
-// Depth(x): depth of node x, a simple binary rank on the parentheses sequence.\r
-int XMLTree::Depth(treeNode x)\r
- {\r
- return bp_depth(Par, x);\r
- }\r
-\r
-// Preorder(x): returns the preorder number of node x, just counting the tree\r
-// nodes (i.e., tags, it disregards the texts in the tree).\r
-int XMLTree::Preorder(treeNode x)\r
- {\r
- return bp_preorder_rank(Par, x);\r
- }\r
-\r
-// Postorder(x): returns the postorder number of node x, just counting the tree\r
-// nodes (i.e., tags, it disregards the texts in the tree).\r
-int XMLTree::Postorder(treeNode x)\r
- {\r
- return bp_postorder_rank(Par, x);\r
- }\r
-\r
-// DocIds(x): returns the range of text identifiers that descend from node x.\r
-// returns {NULLT, NULLT} when there are no texts descending from x.\r
-range XMLTree::DocIds(treeNode x)\r
- {\r
- range r;\r
- if (x == NULLT) {\r
- r.min = NULLT;\r
- r.max = NULLT;\r
- return r;\r
- };\r
- int min = EBVector->rank1(x-1);\r
- int max = EBVector->rank1(x+2*bp_subtree_size(Par, x)-2);\r
- if (min==max) { // range is empty, no texts within the subtree of x\r
- r.min = NULLT;\r
- r.max = NULLT;\r
- }\r
- else { // the range is non-empty, there are texts within the subtree of x\r
- r.min = min+1;\r
- r.max = max;\r
- }\r
- return r;\r
- }\r
-\r
-\r
-// Child(x,i): returns the i-th child of node x, assuming it exists.\r
-treeNode XMLTree::Child(treeNode x, int i)\r
-{\r
- if (i <= OPTD) return bp_naive_child(Par, x, i);\r
- else return bp_child(Par, x, i);\r
-}\r
-\r
-\r
-treeNode XMLTree::SelectChild(treeNode x, TagIdSet *tags)\r
-{\r
-\r
- NULLT_IF(x==NULLT || bp_isleaf(Par,x));\r
- int i;\r
- treeNode child = bp_first_child(Par, x);\r
- TagType t;\r
- while (child != NULLT) {\r
- t = Tag(child);\r
- if (tags->find(t) != tags->end()) return child;\r
- child = fast_sibling(Par, child,t);\r
- }\r
- return NULLT;\r
-}\r
-\r
-\r
-treeNode XMLTree::SelectFollowingSibling(treeNode x, TagIdSet *tags)\r
-{\r
-\r
- NULLT_IF(x==NULLT);\r
- int i;\r
- TagType t;\r
- treeNode sibling = bp_next_sibling(Par, x);\r
- while (sibling != NULLT) {\r
- t = Tag(sibling);\r
- if (tags->find(t) != tags->end()) return sibling;\r
- sibling = fast_sibling(Par, sibling,t);\r
- }\r
- return NULLT;\r
- }\r
-\r
-\r
-// TaggedPrec(x,tag): returns the first node tagged tag with smaller preorder than x and not an\r
-// ancestor of x. Returns NULLT if there is none.\r
-treeNode XMLTree::TaggedPreceding(treeNode x, TagType tag)\r
- {\r
- int r, s;\r
- treeNode node_s, root;\r
- r = (int)Tags->rank(tag, node2tagpos(x)-1);\r
- if (r==0) return NULLT; // there is no such node.\r
- s = (int)Tags->select(tag, r);\r
- root = bp_root_node(Par);\r
- node_s = tagpos2node(s);\r
- while (bp_is_ancestor(Par, node_s, x) && (node_s!=root)) { // the one that we found is an ancestor of x\r
- r--;\r
- if (r==0) return NULLT; // there is no such node\r
- s = (int)Tags->select(tag, r); // we should use select_prev instead when provided\r
- node_s = tagpos2node(s);\r
- }\r
- return NULLT; // there is no such node\r
- }\r
-\r
-\r
-// TaggedFoll(x,tag): returns the first node tagged tag with larger preorder than x and not in\r
-// the subtree of x. Returns NULLT if there is none.\r
-treeNode XMLTree::TaggedFollowing(treeNode x, TagType tag)\r
- {\r
- NULLT_IF (x ==NULLT || x == Root());\r
- return tagpos2node(Tags->select_next(tag, bp_find_close(Par, x)));\r
-\r
- }\r
-\r
-\r
-/* Here we inline TaggedFoll to find the min globally, and only at the end\r
- we check if the min is below the context node */\r
-treeNode XMLTree::SelectFollowingBelow(treeNode x, TagIdSet *tags, treeNode ancestor)\r
- {\r
-\r
- NULLT_IF(x==NULLT || x==Root());\r
-\r
- treeNode close = bp_find_close(Par,x);\r
- treeNode ns = close+1;\r
- if ( (bp_inspect(Par,ns) == OP) && (tags->find(Tag(ns)) != tags->end()))\r
- return ns;\r
-\r
- int i;\r
- treeNode min = NULLT;\r
- treeNode aux;\r
-\r
-\r
- TagIdSet::const_iterator tagit;\r
- for (tagit = tags->begin(); tagit != tags->end(); ++tagit) {\r
-\r
- aux = tagpos2node(Tags->select_next(*tagit, close));\r
- if (aux == NULLT) continue;\r
- if ((min == NULLT) || (aux < min)) min = aux;\r
- };\r
-\r
- // found the smallest node in preorder which is after x.\r
- // if ctx is the root node, just return what we found.\r
-\r
- if (ancestor == Root() || min == NULLT || min < bp_find_close(Par, ancestor)) return min;\r
- else return NULLT;\r
-\r
- }\r
-// TaggedAncestor(x, tag): returns the closest ancestor of x tagged tag. Return\r
-// NULLT is there is none.\r
-treeNode XMLTree::TaggedAncestor(treeNode x, TagType tag)\r
- {\r
- if (x == NULLT || x == Root())\r
- return NULLT;\r
-\r
- treeNode s = bp_parent(Par, x), r = Root();\r
- while (s != r) {\r
- if (Tag(s) == tag) return s;\r
- s = bp_parent(Par, s);\r
- }\r
- return NULLT;\r
- }\r
-\r
-\r
-\r
-// MyText(x): returns the document identifier of the text below node x,\r
-// or NULLT if x is not a leaf node or the text is empty. Assumes Doc\r
-// ids start from 0.\r
-DocID XMLTree::MyText(treeNode x)\r
-{\r
- TagType tag = Tag(x);\r
- // seems faster than testing EBVector->access(x);\r
-\r
- if (tag == PCDATA_TAG_ID || tag == ATTRIBUTE_DATA_TAG_ID)\r
- return (DocID) (EBVector->rank1(x)-1);\r
- else\r
- return (DocID) NULLT;\r
-\r
-}\r
-// MyText(x): returns the document identifier of the text below node x,\r
-// or NULLT if x is not a leaf node or the text is empty. Assumes Doc\r
-// ids start from 0.\r
-DocID XMLTree::MyTextUnsafe(treeNode x)\r
-{\r
- return (DocID) (EBVector->rank1(x)-1); //-1 because document ids start from 0\r
-\r
-}\r
-// TextXMLId(d): returns the preorder of document with identifier d in the tree consisting of\r
-// all tree nodes and all text nodes. Assumes that the tree root has preorder 1.\r
-int XMLTree::TextXMLId(DocID d)\r
- {\r
- NULLT_IF(d == NULLT);\r
- int s = EBVector->select1(d+1);\r
- return bp_rank_open(Par, s) + d + 1; // +1 because root has preorder 1\r
-\r
- }\r
-\r
-// NodeXMLId(x): returns the preorder of node x in the tree consisting\r
-// of all tree nodes and all text nodes. Assumes that the tree root has\r
-// preorder 0;\r
-int XMLTree::NodeXMLId(treeNode x)\r
- {\r
- NULLT_IF(x == NULLT);\r
- if (x == Root()) return 1; // root node has preorder 1\r
- return bp_rank_open(Par, x) + EBVector->rank1(x-1);\r
- }\r
-\r
-// ParentNode(d): returns the parent node of document identifier d.\r
-treeNode XMLTree::ParentNode(DocID d)\r
- {\r
- NULLT_IF (d == NULLT);\r
- return (treeNode) EBVector->select1(d+1);\r
- }\r
-\r
-// GetTagId: returns the tag identifier corresponding to a given tag name.\r
-// Returns NULLT in case that the tag name does not exists.\r
-TagType XMLTree::GetTagId(unsigned char *tagname)\r
- {\r
-\r
- string s = (char *) tagname;\r
- TagIdMapIT it = tIdMap->find(s);\r
- return (TagType) ((it != tIdMap->end()) ? it->second : -1);\r
-\r
- }\r
-\r
-\r
-// GetTagName(tagid): returns the tag name of a given tag identifier.\r
-// Returns NULL in case that the tag identifier is not valid.\r
-unsigned char *XMLTree::GetTagName(TagType tagid)\r
- {\r
- unsigned char *s;\r
- if ( tagid < 0 || tagid >= TagName->size())\r
- return (unsigned char *) "<INVALID TAG>";\r
- strcpy((char *)s, (*TagName)[tagid].c_str());\r
-\r
- return (s == NULL ? (unsigned char*) "<INVALID TAG>" : s);\r
- }\r
-\r
-\r
-const unsigned char *XMLTree::GetTagNameByRef(TagType tagid)\r
- {\r
-\r
- unsigned char *s;\r
- if ( tagid < 0 || tagid >= TagName->size())\r
- return (unsigned char *) "<INVALID TAG>";\r
-\r
- return (const unsigned char *) (*TagName)[tagid].c_str();\r
-\r
- }\r
-\r
-\r
-\r
-TagType XMLTree::RegisterTag(unsigned char *tagname)\r
- {\r
- TagType id = XMLTree::GetTagId(tagname);\r
- if (id == NULLT) {\r
- string s = (char *) tagname;\r
- REGISTER_TAG(TagName,tIdMap,s);\r
- };\r
-\r
- return id;\r
- }\r
-\r
-\r
-treeNode XMLTree::Closing(treeNode x) {\r
- return bp_find_close(Par,x);\r
-}\r
-bool XMLTree::IsOpen(treeNode x) { return bp_inspect(Par,x); }\r
-\r
-//WARNING this uses directly the underlying implementation for plain text\r
-\r
-\r
-void XMLTree::Print(int fd,treeNode x, bool no_text){\r
-\r
- if (buffer == 0) {\r
- buffer = new string(BUFFER_ALLOC, 0);\r
- buffer->clear();\r
- print_stack = new std::vector<string *>();\r
- print_stack->reserve(256);\r
- };\r
-\r
- treeNode fin = bp_find_close(Par,x);\r
- treeNode n = x;\r
- TagType tag = Tag(n);\r
-\r
- range r = DocIds(x);\r
- treeNode first_idx;\r
- treeNode first_text = (tag == PCDATA_TAG_ID ? x : ParentNode(r.min-1));\r
- treeNode first_att = NULLT;\r
-\r
- if (first_att == NULLT)\r
- first_idx = first_text;\r
- else if (first_text == NULLT)\r
- first_idx = first_att;\r
- else\r
- first_idx = min(first_att,first_text);\r
-\r
- uchar * current_text=NULL;\r
-\r
- if (first_idx != NULLT)\r
- current_text = GetText(MyTextUnsafe(first_idx));\r
- uchar * orig_text = current_text;\r
- size_t read = 0;\r
-\r
- while (n <= fin){\r
- if (bp_inspect(Par,n)){\r
- if (tag == PCDATA_TAG_ID) {\r
-\r
- if (no_text)\r
- _dputs("<$/>", fd);\r
- else {\r
- read = _dprintf((const char*) current_text, fd);\r
- current_text += (read + 1);\r
- };\r
- n+=2; // skip closing $\r
- tag = Tag(n);\r
-\r
- } else {\r
-\r
- _dputc('<',fd);\r
- _dput_str((*TagName)[tag], fd);\r
- n++;\r
- if (bp_inspect(Par,n)) {\r
- print_stack->push_back(&((*TagName)[tag]));\r
- tag = Tag(n);\r
- if (tag == ATTRIBUTE_TAG_ID){\r
- n++;\r
- if (no_text) _dputs("><@@>",fd);\r
-\r
- while (bp_inspect(Par,n)){\r
- if (no_text) {\r
- _dputc('<', fd);\r
- _dputs((const char*) &(GetTagNameByRef(Tag(n))[3]), fd);\r
- _dputc('>', fd);\r
- _dputs("<$@/></", fd);\r
- _dputs((const char*) &(GetTagNameByRef(Tag(n))[3]), fd);\r
- _dputc('>', fd);\r
- n+= 4;\r
- } else {\r
- _dputc(' ', fd);\r
- _dputs((const char*) &(GetTagNameByRef(Tag(n))[3]), fd);\r
- n++;\r
- _dputs("=\"", fd);\r
- read = _dprintf((const char*) current_text, fd);\r
- current_text += (read + 1);\r
- _dputc('"', fd);\r
- n+=3;\r
- }\r
- };\r
- if (no_text) _dputs("</@@>", fd);\r
- else _dputc('>', fd);\r
- n++;\r
- tag=Tag(n);\r
-\r
- } else\r
- _dputc('>', fd);\r
-\r
- } else {// <foo /> tag\r
- _dputs("/>", fd);\r
- n++;\r
- tag=Tag(n);\r
- };\r
- };\r
- } else do {\r
- _dputs("</", fd);\r
- _dput_str(*(print_stack->back()), fd);\r
- _dputc('>', fd);\r
- print_stack->pop_back();\r
- n++;\r
- } while (!(bp_inspect(Par,n) || print_stack->empty()));\r
- tag = Tag(n);\r
- };\r
- _dputc('\n', fd);\r
- if (orig_text && text_index_type != (TextCollectionBuilder::index_type_default))\r
- Text->DeleteText(orig_text);\r
- //_flush(fd);\r
-}\r
+++ /dev/null
-/******************************************************************************
- * Copyright (C) 2008 by Diego Arroyuelo *
- * Interface for the in-memory XQuery/XPath engine *
- * *
- * This program is free software; you can redistribute it and/or modify *
- * it under the terms of the GNU Lesser General Public License as published *
- * by the Free Software Foundation; either version 2 of the License, or *
- * (at your option) any later version. *
- * *
- * This program is distributed in the hope that it will be useful, *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
- * GNU Lesser General Public License for more details. *
- * *
- * You should have received a copy of the GNU Lesser General Public License *
- * along with this program; if not, write to the *
- * Free Software Foundation, Inc., *
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
- ******************************************************************************/
-
-#ifndef XMLTREE_H_
-#define XMLTREE_H_
-
-
-#include <unordered_set>
-#include <unordered_map>
-#include <sstream>
-#include <TextCollection/TextCollectionBuilder.h>
-
-#undef W
-#undef WW
-#undef Wminusone
-
-#include <bp/bp.h>
-#include <bp/bp-darray.h>
-#include <libcds/includes/basics.h>
-#include <libcds/includes/static_bitsequence.h>
-#include <libcds/includes/alphabet_mapper.h>
-#include <libcds/includes/static_sequence.h>
-
-using SXSI::TextCollection;
-using SXSI::TextCollectionBuilder;
-
-
-// this constant is used to efficiently compute the child operation in the tree
-#define OPTD 10
-
-#define NULLT -1
-
-#define PERM_SAMPLE 10
-
-
-typedef int treeNode;
-typedef int TagType;
-typedef int DocID;
-
-typedef struct {
- int min;
- int max;
-} range;
-
-// Encoding of the XML Document :
-// The following TAGs and IDs are fixed, "" is the tag of the root.
-// a TextNode is represented by a leaf <<$>></<$>> The DocId in the TextCollection
-// of that leaf is kept in a bit sequence.
-// a TextNode below an attribute is likewise represented by a leaf <<@$>><</@$>>
-// An element <e a1="v1" a2="v2" ... an="vn" > ...</e> the representation is:
-// <e><<@>> <<@>a1> <<$@>>DocID(v1)</<$@>></<@>a1> ... </<@>> .... </e>
-// Hence the attributes (if any) are always below the first child of their element,
-// as the children of a fake node <@>.
-
-
-#define DOCUMENT_OPEN_TAG ""
-#define DOCUMENT_TAG_ID 0
-#define ATTRIBUTE_OPEN_TAG "<@>"
-#define ATTRIBUTE_TAG_ID 1
-#define PCDATA_OPEN_TAG "<$>"
-#define PCDATA_TAG_ID 2
-#define ATTRIBUTE_DATA_OPEN_TAG "<@$>"
-#define ATTRIBUTE_DATA_TAG_ID 3
-#define CLOSING_TAG "</>"
-#define CLOSING_TAG_ID 4
-#define DOCUMENT_CLOSE_TAG "/"
-#define ATTRIBUTE_CLOSE_TAG "/<@>"
-#define PCDATA_CLOSE_TAG "/<$>"
-#define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>"
-
-
-typedef std::unordered_set<int> TagIdSet;
-typedef std::unordered_map<std::string,int> TagIdMap;
-typedef TagIdMap::const_iterator TagIdMapIT;
-
-#define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\
- (v)->push_back(t); } while (false)
-
-// returns NULLT if the test is true
-#define NULLT_IF(x) do { if (x) return NULLT; } while (0)
-#define IS_NIL(x) ((x) < 0)
-
-// Direct calls to sarray library
-
-#define BUFFER_ALLOC (8192 * 2)
-#define BUFFER_SIZE (BUFFER_ALLOC / 2)
-
-// tag position -> tree node
-static treeNode tagpos2node(int t)
- {
- return (treeNode) t;
- }
-// tree node -> tag position
-static int node2tagpos(treeNode x)
-{
- return (int)x;
-}
-
-
-class XMLTreeBuilder;
-
-class XMLTree {
-
- // Only the builder can access the constructor
- friend class XMLTreeBuilder;
-
- private:
- /** Balanced parentheses representation of the tree */
- bp *Par;
-
- /** Mapping from tag identifer to tag name */
- std::vector<std::string> *TagName;
- TagIdMap * tIdMap;
-
- /** Bit vector indicating with a 1 the positions of the non-empty texts. */
- static_bitsequence *EBVector;
-
- /** Tag sequence represented with a data structure for rank and select */
- static_sequence *Tags;
- uint * tags_fix;
- uint tags_blen, tags_len;
-
- /** The texts in the XML document */
- TextCollection *Text;
-
- // Allows to disable the TextCollection for benchmarkin purposes
- bool disable_tc;
- SXSI::TextCollectionBuilder::index_type_t text_index_type;
-
- std::string *buffer;
- std::vector<std::string *> *print_stack;
-
-
- void _real_flush(int fd, size_t size) {
- if (size == 0) return;
- size_t written;
- while (1) {
- written = write(fd, buffer->data(), size);
- if ((written < 0) && (errno == EAGAIN || errno == EINTR))
- continue;
- break;
- };
- buffer->clear();
-
- }
-
- void _flush(int fd){
- size_t size = buffer->size();
- if (size < BUFFER_SIZE) return;
- _real_flush(fd, size);
- }
-
- void _dput_str(std::string s, int fd){
- buffer->append(s);
- _flush(fd);
- }
-
- void _dputs(const char* s, int fd){
- buffer->append(s);
- _flush(fd);
- }
-
- void _dputc(const char c, int fd){
- buffer->push_back(c);
- }
-
- size_t _dprintf(const char* s, int fd){
- if (s == NULL) return 0;
- size_t i;
- char c;
- for (i = 0; (c = s[i]); i++) {
- switch (c) {
- case '"':
- _dputs(""", fd);
- break;
- case '&':
- _dputs("&", fd);
- break;
- case '\'':
- _dputs("'", fd);
- break;
- case '<':
- _dputs("<", fd);
- break;
- case '>':
- _dputs(">", fd);
- break;
- default:
- _dputc(c, fd);
-
- };
- };
- return i;
- }
-
- void PrintNode(treeNode n, int fd);
- /** Data structure constructors */
- XMLTree(){ buffer = 0;};
-
- // non const pointer are freed by this method.
- XMLTree( pb * const par,
- uint npar,
- std::vector<std::string> * const TN,
- TagIdMap * const tim, uint *empty_texts_bmp,
- TagType *tags,
- TextCollectionBuilder * const TCB, bool dis_tc,
- TextCollectionBuilder::index_type_t _index_type );
-
-public:
- /** Data structure destructor */
- ~XMLTree();
-
- /** root(): returns the tree root. */
- treeNode Root() { return 0; }
-
- /** Size() : Number of parenthesis */
- unsigned int Size(){
- return tags_len/2;
- }
-
-
- /** NumTags() : Number of distinct tags */
- unsigned int NumTags() {
- return TagName->size();
- }
-
- int TagsBinaryLength(){ return tags_blen; };
- unsigned int TagStructLength(){ return uint_len(tags_blen,tags_len); };
- unsigned int * TagStruct() { return tags_fix; };
-
-
- /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of
- * node x. */
- int SubtreeSize(treeNode x) { return bp_subtree_size(Par, x); }
-
- /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree
- * of node x. */
- int SubtreeTags(treeNode x, TagType tag){
- //int s = x + 2*subtree_size(Par, x) - 1;
- treeNode y = bp_find_close(Par, x);
-
- if (y - x < 10) {
- int count = 0;
- for(int i = x; i < y; i++)
- count += (Tag(i) == tag);
- return count;
- }
- else
- return (Tags->rank(tag, y) - Tags->rank(tag, x));
- };
-
- /** SubtreeElements(x) of element nodes in the subtree of x
- */
- int SubtreeElements(treeNode x);
-
- /** IsLeaf(x): returns whether node x is leaf or not. In the succinct
- * representation this is just a bit inspection. */
-
- bool IsLeaf(treeNode x);
-
- /** IsAncestor(x,y): returns whether node x is ancestor of node y. */
-
- bool IsAncestor(treeNode x, treeNode y);
-
-
- /** IsRigthDescendant returns true if y is a descendant of x and y is
- not a descendant of the first child of x */
- bool IsRightDescendant(treeNode x, treeNode y) {
- if (x <= Root()) return false;
- treeNode z = bp_parent_close(Par, x);
- treeNode c = bp_find_close(Par, x);
- return (y > c && y < z );
- }
-
-
- /** IsChild(x,y): returns whether node x is parent of node y. */
- bool IsChild(treeNode x, treeNode y);
-
- /** IsFirstChild(x): returns whether node x is the first child of its parent. */
- /* SLOW
- bool IsFirstChild(treeNode x) {
- return ((x != NULLT)&&(x==Root() || bp_prev_sibling(Par,x) == (treeNode)-1));
- };
- */
-
- bool IsFirstChild(treeNode x) {
- return (x <= Root()) || (bp_inspect(Par,x-1) == OP);
- }
-
- /** NumChildren(x): number of children of node x. Constant time with the
- * data structure of Sadakane. */
- int NumChildren(treeNode x);
-
- /** ChildNumber(x): returns i if node x is the i-th children of its
- * parent. */
- int ChildNumber(treeNode x);
-
- /** Depth(x): depth of node x, a simple binary rank on the parentheses
- * sequence. */
- int Depth(treeNode x);
-
- /** Preorder(x): returns the preorder number of node x, just regarding tree
- * nodes (and not texts). */
- int Preorder(treeNode x);
-
- /** Postorder(x): returns the postorder number of node x, just regarding
- * tree nodes (and not texts). */
- int Postorder(treeNode x);
-
-
- /** DocIds(x): returns the range (i.e., a pair of integers) of document
- * identifiers that descend from node x. */
- range DocIds(treeNode x);
-
- /** Parent(x): returns the parent node of node x. */
- treeNode Parent(treeNode x) {
- return (x == Root()) ? NULLT : bp_parent(Par, x);
- };
-
- treeNode BinaryParent(treeNode x){
- if (x <= Root())
- return NULLT;
- else {
- treeNode prev = x - 1;
- return (bp_inspect(Par, prev) == OP) ? prev : bp_find_open(Par, prev);
- };
- };
-
- /* Assumes x is neither 0 nor -1 */
-
- /** Child(x,i): returns the i-th child of node x, assuming it exists. */
- treeNode Child(treeNode x, int i);
-
-
-
- /** LastChild(x): returns the last child of node x. */
- treeNode LastChild(treeNode x) {
- NULLT_IF(x == NULLT || bp_isleaf(Par,x));
- return bp_find_open(Par, bp_find_close(Par, x)-1);
- }
-
- /** PrevSibling(x): returns the previous sibling of node x, assuming it
- * exists. */
-
- treeNode PrevSibling(treeNode x)
- {
- NULLT_IF(x==NULLT);
- return bp_prev_sibling(Par, x);
- }
-
-
- /** TaggedChild(x,tag): returns the first child of node x tagged tag, or
- * NULLT if there is none. Because of the balanced-parentheses representation
- * of the tree, this operation is not supported efficiently, just iterating
- * among the children of node x until finding the desired child. */
-
-
- treeNode SelectChild(treeNode x, TagIdSet * tags);
-
- /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or
- * NULLT if there is none. */
-
- treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags);
-
-
-
-
- treeNode SelectDescendant(treeNode x, TagIdSet * tags) {
- NULLT_IF (x == NULLT);
- treeNode fc = x+1;
- if (bp_inspect(Par, fc) == CP) return NULLT;
- treeNode min = NULLT;
- treeNode aux;
-
- TagIdSet::const_iterator tagit;
- for (tagit = tags->begin(); tagit != tags->end(); ++tagit) {
- aux = TaggedDescendant(x, (TagType) *tagit);
- if (((unsigned int) aux) < ((unsigned int) min)) min = aux;
- };
- return min;
- }
-
- /** TaggedPrec(x,tag): returns the first node tagged tag with smaller
- * preorder than x and not an ancestor of x. Returns NULLT if there
- * is none. */
- treeNode TaggedPreceding(treeNode x, TagType tag);
-
- /** TaggedFoll(x,tag): returns the first node tagged tag with larger
- * preorder than x and not in the subtree of x. Returns NULLT if there
- * is none. */
- treeNode TaggedFollowing(treeNode x, TagType tag);
-
-
-
- treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor);
-
- // treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing);
-
- treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode ancestor_closing)
- {
-
- NULLT_IF(x <= 0);
-
- treeNode close = bp_find_close(Par,x);
-
-
- treeNode min = NULLT;
- treeNode aux;
-
-
- TagIdSet::const_iterator tagit;
- for (tagit = tags->begin(); tagit != tags->end(); ++tagit) {
-
- aux = tagpos2node(Tags->select_next(*tagit, close));
-
- if (((unsigned int) aux) < ((unsigned int) min)) min = aux;
- };
-
-
- return (min < ancestor_closing) ? min : NULLT;
-
- }
-
- /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged
- * tag. Return NULLT is there is none. */
- treeNode TaggedAncestor(treeNode x, TagType tag);
-
- /** PrevText(x): returns the document identifier of the text to the left of
- * node x, or NULLT if x is the root node. */
- DocID PrevText(treeNode x);
-
- /** NextText(x): returns the document identifier of the text to the right of
- * node x, or NULLT if x is the root node. */
- DocID NextText(treeNode x);
-
- /** MyText(x): returns the document identifier of the text below node x, or
- * NULLT if x is not a leaf node. */
- DocID MyText(treeNode x);
- DocID MyTextUnsafe(treeNode x);
-
- /** TextXMLId(d): returns the preorder of document with identifier d in the
- * tree consisting of all tree nodes and all text nodes. */
- int TextXMLId(DocID d);
-
- /** NodeXMLId(x): returns the preorder of node x in the tree consisting of
- * all tree nodes and all text nodes. */
- int NodeXMLId(treeNode x);
-
- /** ParentNode(d): returns the parent node of document identifier d. */
- treeNode ParentNode(DocID d);
-
- treeNode PrevNode(DocID d);
-
- /** GetTagId(tagname): returns the tag identifier corresponding to a given
- * tag name. Returns NULLT in case that the tag name does not exists. */
- TagType GetTagId(unsigned char *tagname);
-
- /** GetTagName(tagid): returns the tag name of a given tag identifier.
- * Returns NULL in case that the tag identifier is not valid.*/
- unsigned char *GetTagName(TagType tagid);
-
- /** GetTagName(tagid): returns the tag name of a given tag identifier.
- * The result is just a reference and should not be freed by the caller.
- */
- const unsigned char *GetTagNameByRef(TagType tagid);
-
- /** RegisterTag adds a new tag to the tag collection this is needed
- * if the query contains a tag which is not in the document, we need
- * to give this new tag a fresh id and store it somewhere. A logical
- * choice is here.
- * We might want to use a hashtable instead of an array though.
- */
- TagType RegisterTag(unsigned char *tagname);
-
- bool EmptyText(DocID i) {
- return Text->EmptyText(i);
- }
-
- /** Prefix(s): search for texts prefixed by string s. */
- TextCollection::document_result Prefix(uchar const *s) {
- return Text->Prefix(s);
- }
-
- /** Suffix(s): search for texts having string s as a suffix. */
- TextCollection::document_result Suffix(uchar const *s) {
- return Text->Suffix(s);
- }
-
- /** Equal(s): search for texts equal to string s. */
- TextCollection::document_result Equals(uchar const *s) {
- return Text->Equal(s);
- }
-
- /** Contains(s): search for texts containing string s. */
- TextCollection::document_result Contains(uchar const *s) {
- return Text->Contains(s);
- }
-
- /** LessThan(s): returns document identifiers for the texts that
- * are lexicographically smaller than string s. */
- TextCollection::document_result LessThan(uchar const *s) {
- return Text->LessThan(s);
- }
-
- /** IsPrefix(x): returns true if there is a text prefixed by string s. */
- bool IsPrefix(uchar const *s) {
- return Text->IsPrefix(s);
- }
-
- /** IsSuffix(s): returns true if there is a text having string s as a
- * suffix.*/
- bool IsSuffix(uchar const *s) {
- return Text->IsSuffix(s);
- }
-
- /** IsEqual(s): returns true if there is a text that equals given
- * string s. */
- bool IsEqual(uchar const *s) {
- return Text->IsEqual(s);
- }
-
- /** IsContains(s): returns true if there is a text containing string s. */
- bool IsContains(uchar const *s) {
- return Text->IsContains(s);
- }
-
- /** IsLessThan(s): returns true if there is at least a text that is
- * lexicographically smaller than string s. */
- bool IsLessThan(uchar const *s) {
- return Text->IsLessThan(s);
- }
-
- /** Count(s): Global counting */
- unsigned Count(uchar const *s) {
- return Text->Count(s);
- }
-
- /** CountPrefix(s): counting version of Prefix(s). */
- unsigned CountPrefix(uchar const *s) {
- return Text->CountPrefix(s);
- }
-
- /** CountSuffix(s): counting version of Suffix(s). */
- unsigned CountSuffix(uchar const *s) {
- return Text->CountSuffix(s);
- }
-
- /** CountEqual(s): counting version of Equal(s). */
- unsigned CountEqual(uchar const *s) {
- return Text->CountEqual(s);
- }
-
- /** CountContains(s): counting version of Contains(s). */
- unsigned CountContains(uchar const *s) {
- return Text->CountContains(s);
- }
-
- /** CountLessThan(s): counting version of LessThan(s). */
- unsigned CountLessThan(uchar const *s) {
- return Text->CountLessThan(s);
- }
-
- /** GetText(d): returns the text corresponding to document with
- * id d. */
- uchar* GetText(DocID d) {
-
- uchar * s = Text->GetText(d);
- return (s[0] == 1 ? (s+1) : s);
- }
-
- /** GetText(i, j): returns the texts corresponding to documents with
- * ids i, i+1, ..., j. Texts are separated by '\0' character. */
- // uchar* GetText(DocID i, DocID j) {
- // uchar * s = Text->GetText(i, j);
- // return (s[0] == 1 ? (uchar*)"" : s);
- //}
-
- TextCollection *getTextCollection() {
- return Text;
- }
-
- /** Save: saves XML tree data structure to file. */
- void Save(int fd, char* name );
-
- /** Load: loads XML tree data structure from file. sample_rate_text
- * indicates the sample rate for the text search data structure. */
- static XMLTree *Load(int fd, bool load_tc, int sample_factor, char * name);
-
- void insertTag(TagType tag, uint position);
-
- void print_stats();
-
-
- /** Parenthesis functions */
- treeNode Closing(treeNode x);
-
- bool IsOpen(treeNode x);
-
-
- /** Print procedure */
- void Print(int fd,treeNode x, bool no_text);
- void Print(int fd,treeNode x) { Print(fd,x,false); }
- void Flush(int fd){ if (buffer) _real_flush(fd, buffer->size()); }
-
- // The following are inlined here for speed
- /** Tag(x): returns the tag identifier of node x. */
-
- inline TagType Tag(treeNode x) const throw () {
- if (tags_blen == 8)
- return (TagType) (((uchar*)tags_fix)[(int) x]);
- else
- return get_field(tags_fix, tags_blen, x);
- /*
- {
- size_t idxlen = x * tags_blen;
- size_t j = idxlen % W;
- size_t i = idxlen / W;
- size_t offset = W - tags_blen;
- size_t offset2 = offset - j;
- size_t w = tags_fix[i];
- return (offset2 >= 0)
- ? ( w << offset2 ) >> offset
- : ( w >> j) | (tags_fix[i+1] << (W+offset2)) >> offset;
- }; */
-
- }
-
- /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf
- */
- treeNode FirstChild(treeNode x) {
- NULLT_IF(x==NULLT);
- return bp_first_child(Par, x);
- };
-
-
- /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT
- * if none.
- */
- treeNode FirstElement(treeNode node){
- {
- NULLT_IF(node == NULLT);
- treeNode x = bp_first_child(Par, node);
- NULLT_IF(x == NULLT);
- switch (Tag(x)){
-
- case ATTRIBUTE_TAG_ID:
- x = bp_next_sibling(Par,x);
- if (x == NULLT || Tag(x) != PCDATA_TAG_ID) return x;
-
- case PCDATA_TAG_ID:
- x = x+2;
- return (bp_inspect(Par,x)==OP)? x : NULLT;
-
- default:
- return x;
- }
- }
- };
-
- /** NextSibling(x): returns the next sibling of node x, or NULLT if none
- * exists. */
-
- treeNode NextSibling(treeNode x) {
- NULLT_IF (x <= 0);
- return bp_next_sibling(Par, x);
- };
-
- /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT
- * if none.
- */
- treeNode NextElement(treeNode x)
- {
- NULLT_IF(x <= 0);
- x = bp_next_sibling(Par, x);
- NULLT_IF(x == NULLT);
- if (Tag(x) == PCDATA_TAG_ID){
- int y = x+2;
- return (bp_inspect(Par, y) == OP) ? y : NULLT;
- }
- else return x;
- };
- /** TaggedDesc(x,tag): returns the first node tagged tag with larger
- * preorder than x and within the subtree of x. Returns NULT if there
- * is none. */
- inline treeNode TaggedNext(treeNode x, TagType tag)
- {
- return tagpos2node(Tags->select_next(tag,node2tagpos(x)));
- }
- inline treeNode TaggedDescendant(treeNode x, TagType tag)
- {
-
- int s = (int) Tags->select_next(tag,node2tagpos(x));
- NULLT_IF (s == -1);
-
- treeNode y = tagpos2node(s); // transforms the tag position into a node position
-
- return (bp_is_ancestor(Par,x,y) ? y : NULLT);
- };
-
- inline treeNode TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor)
- {
- treeNode close = bp_find_close(Par, x);
- treeNode s = tagpos2node(Tags->select_next(tag, close));
-
- if (ancestor == Root() || s == NULLT || s < bp_find_close(Par,ancestor)) return s;
- else return NULLT;
- };
-
- inline treeNode TaggedFollowingBefore(treeNode x, TagType tag, treeNode ancestor_closing)
- {
- treeNode close = bp_find_close(Par, x);
- treeNode s = tagpos2node(Tags->select_next(tag, close));
-
- if (ancestor_closing == Root() || s == NULLT || s < ancestor_closing) return s;
- else return NULLT;
- };
-
- inline treeNode NextNodeBefore(treeNode x, treeNode ancestor_closing)
- {
- treeNode close = bp_find_close(Par, x);
- int rank = bp_rank_open(Par, close);
- treeNode y = bp_select_open(Par, rank+1);
- return (y < ancestor_closing) ? y : NULLT;
- };
-
-// TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or NULLT if there is none.
-treeNode TaggedFollowingSibling(treeNode x, TagType tag)
-{
- NULLT_IF(x==NULLT);
- treeNode sibling = x;
- TagType ctag;
- while ((sibling = bp_next_sibling(Par, sibling)) != NULLT) {
- ctag = Tag(sibling);
- if (ctag == tag) return sibling;
- }
- return NULLT;
-};
-
-treeNode TaggedChild(treeNode x, TagType tag)
-{
-
- NULLT_IF(x==NULLT || bp_isleaf(Par,x));
- treeNode child;
- child = bp_first_child(Par, x);
-
-if (Tag(child) == tag)
- return child;
- else
- return TaggedFollowingSibling(child, tag);
-};
-
-
-};
-
-
-#endif
-
+++ /dev/null
-#include "common.h"\r
-#include "XMLTreeBuilder.h"\r
-#include "timings.h"\r
-using std::string;\r
-\r
-XMLTreeBuilder::~XMLTreeBuilder(){\r
- //free(par_aux);\r
- free(tags_aux);\r
- //delete other stuff.\r
-\r
-}\r
-\r
-// OpenDocument(empty_texts): it starts the construction of the data structure for\r
-// the XML document. Parameter empty_texts indicates whether we index empty texts\r
-// in document or not. Returns a non-zero value upon success, NULLT in case of error.\r
-int XMLTreeBuilder::OpenDocument(bool empty_texts,\r
- int sample_rate_text,\r
- bool dtc,\r
- TextCollectionBuilder::index_type_t index_type)\r
- {\r
- npar = 0;\r
- parArraySize = 1;\r
- disable_tc = dtc;\r
- text_index_type = index_type;\r
- STARTTIMER();\r
-\r
- par_aux = (pb *)umalloc(sizeof(pb)*parArraySize);\r
-\r
- tags_aux = (TagType *) umalloc(sizeof(TagType));\r
-\r
- TagName = new vector<string>();\r
- tIdMap = new std::unordered_map<string,int>();\r
-\r
- REGISTER_TAG(TagName,tIdMap,DOCUMENT_OPEN_TAG);\r
- REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_OPEN_TAG);\r
- REGISTER_TAG(TagName,tIdMap,PCDATA_OPEN_TAG);\r
- REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_DATA_OPEN_TAG);\r
- REGISTER_TAG(TagName,tIdMap,CLOSING_TAG);\r
- REGISTER_TAG(TagName,tIdMap,DOCUMENT_CLOSE_TAG);\r
- REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_CLOSE_TAG);\r
- REGISTER_TAG(TagName,tIdMap,PCDATA_CLOSE_TAG);\r
- REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_DATA_CLOSE_TAG);\r
-\r
-\r
- if (disable_tc)\r
- TextBuilder = 0;\r
- else\r
- TextBuilder = TextCollectionBuilder::create((unsigned)sample_rate_text, index_type);\r
-\r
- Text = 0;\r
- empty_texts_aux = (unsigned int *)ucalloc(sizeof(unsigned int),1);\r
- eta_size = sizeof(unsigned int);\r
- return 1; // indicates success in the initialization of the data structure\r
- }\r
-\r
-// CloseDocument(): it finishes the construction of the data structure for the XML\r
-// document. Tree and tags are represented in the final form, dynamic data\r
-// structures are made static, and the flag "finished" is set to true. After that,\r
-// the data structure can be queried.\r
-XMLTree *XMLTreeBuilder::CloseDocument()\r
- {\r
- //closing parenthesis for the tree root\r
- //par_aux = (pb *)urealloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb))));\r
- //setbit(par_aux, npar, CP);\r
- //npar++;\r
-\r
- // makes the text collection static\r
- STOPTIMER(Parsing);\r
- PRINTTIME("Parsing XML Document", Parsing);\r
-\r
- XMLTree *T = new XMLTree(par_aux,\r
- npar,\r
- TagName,\r
- tIdMap,\r
- empty_texts_aux, // freed by the constructor\r
- tags_aux, // freed by the constructor\r
- TextBuilder, // freed by the constructor\r
- disable_tc,\r
- text_index_type);\r
- tags_aux = 0;\r
- empty_texts_aux = 0;\r
- return T;\r
- }\r
-\r
-\r
-// NewOpenTag(tagname): indicates the event of finding a new opening tag in the document.\r
-// Tag name is given. Returns a non-zero value upon success, and returns NULLT\r
-// in case of failing when trying to insert the new tag.\r
-int XMLTreeBuilder::NewOpenTag(string tagname)\r
- {\r
- int i;\r
- // inserts a new opening parentheses in the bit sequence\r
- if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis\r
-\r
- // If array is already 1GB, be gentler when resizing:\r
- if (sizeof(pb)*parArraySize >= 1024*1024*1024)\r
- parArraySize += (128*1024*1024);\r
- else\r
- parArraySize *= 2;\r
- par_aux = (pb *) urealloc(par_aux, sizeof(pb)*parArraySize);\r
- };\r
-\r
- bp_setbit(par_aux,npar,OP); // marks a new opening parenthesis\r
-\r
- TagIdMapIT tag_id = tIdMap->find(tagname);\r
-\r
- if (tag_id == tIdMap->end()){\r
- REGISTER_TAG(TagName,tIdMap,tagname);\r
- i = TagName->size() - 1;\r
- }\r
- else\r
- i = tag_id->second;\r
-\r
- if (tagname.compare(PCDATA_OPEN_TAG) == 0 ||\r
- tagname.compare(ATTRIBUTE_DATA_OPEN_TAG) == 0){\r
- };\r
-\r
-\r
- tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1));\r
-\r
- tags_aux[npar] = i; // inserts the new tag id within the preorder sequence of tags\r
-\r
- npar++;\r
-\r
- return 1; // success\r
- }\r
-\r
-\r
-// NewClosingTag(tagname): indicates the event of finding a new closing tag in the document.\r
-// Tag name is given. Returns a non-zero value upon success, and returns NULLT\r
-// in case of failing when trying to insert the new tag.\r
-int XMLTreeBuilder::NewClosingTag(string tagname)\r
- {\r
- int i;\r
- // inserts a new closing parentheses in the bit sequence\r
- if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis\r
- // If array is already 1GB, be gentler when resizing:\r
- if (sizeof(pb)*parArraySize >= 1024*1024*1024)\r
- parArraySize += (128*1024*1024);\r
- else\r
- parArraySize *= 2;\r
- par_aux = (pb *)urealloc(par_aux, sizeof(pb)*parArraySize);\r
- };\r
-\r
- bp_setbit(par_aux,npar,CP); // marks a new closing parenthesis\r
-\r
- //tagname.insert(0,"/");\r
-\r
- //TagIdMapIT tag_id = tIdMap->find(tagname);\r
-\r
- // if (tag_id == tIdMap->end()){\r
- // REGISTER_TAG(TagName,tIdMap,tagname);\r
- // i = TagName->size() - 1;\r
- // }\r
- // else\r
- // i = tag_id->second;\r
-\r
- tags_aux = (TagType *)urealloc(tags_aux, sizeof(TagType)*(npar + 1));\r
-\r
- tags_aux[npar] = CLOSING_TAG_ID; // inserts the new tag id within the preorder sequence of tags\r
-\r
- npar++;\r
-\r
- return 1; // success\r
- }\r
-\r
-\r
-// NewText(s): indicates the event of finding a new (non-empty) text s in the document.\r
-// The new text is inserted within the text collection. Returns a non-zero value upon\r
-// success, NULLT in case of error.\r
-int XMLTreeBuilder::NewText(string text)\r
- {\r
- if (!disable_tc) {\r
- if (text.empty())\r
- TextBuilder->InsertText((uchar *)"\001");\r
- else\r
- TextBuilder->InsertText((uchar *) text.c_str());\r
- };\r
-\r
- int n_eta_size = sizeof(uint)*(1+(npar-1)/(8*sizeof(uint)));\r
- //see basics.h, recalloc resizes and sets the new area to 0.\r
-\r
- empty_texts_aux = (uint *)urecalloc(empty_texts_aux,eta_size,n_eta_size);\r
- eta_size = n_eta_size;\r
- bitset(empty_texts_aux, npar-1); // marks the non-empty text with a 1 in the bit vector\r
-\r
- return 1; // success\r
- }\r
-\r
-\r
+++ /dev/null
-\r
-/******************************************************************************\r
- * Copyright (C) 2009 by Diego Arroyuelo *\r
- * Builder class for the in-memory XQuery/XPath engine *\r
- * *\r
- * This program is free software; you can redistribute it and/or modify *\r
- * it under the terms of the GNU Lesser General Public License as published *\r
- * by the Free Software Foundation; either version 2 of the License, or *\r
- * (at your option) any later version. *\r
- * *\r
- * This program is distributed in the hope that it will be useful, *\r
- * but WITHOUT ANY WARRANTY; without even the implied warranty of *\r
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *\r
- * GNU Lesser General Public License for more details. *\r
- * *\r
- * You should have received a copy of the GNU Lesser General Public License *\r
- * along with this program; if not, write to the *\r
- * Free Software Foundation, Inc., *\r
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *\r
- ******************************************************************************/\r
-\r
-#ifndef XMLTREEBUILDER_H_\r
-#define XMLTREEBUILDER_H_\r
-\r
-#include <TextCollection/TextCollectionBuilder.h>\r
-#undef W\r
-#undef WW\r
-#undef Wminusone\r
-\r
-\r
-#include "XMLTree.h"\r
-\r
-using SXSI::TextCollection;\r
-using SXSI::TextCollectionBuilder;\r
-\r
-#define NULLT -1\r
-\r
- // sets bit p in e\r
-#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W)))\r
- // cleans bit p in e\r
-#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W)))\r
-\r
-\r
-\r
-class XMLTreeBuilder {\r
-\r
- /** Array containing the balanced parentheses sequence */\r
- pb *par_aux;\r
- int parArraySize;\r
- int npar;\r
-\r
- /** Mapping from tag identifer to tag name */\r
- std::vector<std::string> *TagName;\r
- TagIdMap * tIdMap;\r
- /** Array containing the sequence of tags */\r
- TagType *tags_aux;\r
-\r
- /** The texts in the XML document */\r
- TextCollectionBuilder *TextBuilder;\r
- TextCollection *Text;\r
-\r
- /** The texts in the XML document (cached for faster display) */\r
-\r
- std::vector<std::string> *CachedText;\r
-\r
- unsigned int *empty_texts_aux;\r
- int eta_size;\r
- // Allows to disable the TextCollection for benchmarkin purposes\r
- bool disable_tc;\r
- TextCollectionBuilder::index_type_t text_index_type;\r
-public:\r
-\r
- XMLTreeBuilder() {;};\r
-\r
- ~XMLTreeBuilder();\r
-\r
- /** OpenDocument(sample_rate_text,dtc): initilizes the construction\r
- * of the data structure for the XML document. Parameter\r
- * sample_rate_text indicates the sampling rate for the text searching data\r
- * structures (small values get faster searching but a bigger space\r
- * requirement). dtc disable the use of the TextCollection\r
- * (i.e. everything is considered an empty text *)\r
- * Returns a non-zero value upon success, NULLT in case of\r
- * error. */\r
- int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc,\r
- TextCollectionBuilder::index_type_t index_type);\r
-\r
- /** CloseDocument(): finishes the construction of the data structure for\r
- * the XML document. Tree and tags are represented in the final form,\r
- * dynamic data structures are made static, returning the resulting\r
- * XMLTree. After that, the XMLTree data structure can be queried. */\r
- XMLTree *CloseDocument();\r
-\r
- /** NewOpenTag(tagname): indicates the event of finding a new opening tag\r
- * in the document. Tag name is given. Returns a non-zero value upon\r
- * success, and returns NULLT in case of error. */\r
- int NewOpenTag(std::string tagname);\r
-\r
- /** NewClosingTag(tagname): indicates the event of finding a new closing tag\r
- * in the document. Tag name is given. Returns a non-zero value upon\r
- * success, and returns NULLT in case of error. */\r
- int NewClosingTag(std::string tagname);\r
-\r
- /** NewText(s): indicates the event of finding a new text s in\r
- * the document. The new text is inserted within the text collection.\r
- * Returns a non-zero value upon success, NULLT in case of error.\r
- * If the string is empty, which is legal in attributes, then\r
- * the string the sequence '\0x01\0x00' is inserted in the TextCollection\r
- * It is ok to do so since a non printable character cannot occur in an XML document\r
- */\r
- int NewText(std::string text);\r
-\r
-\r
-};\r
-#endif\r
-\r
--- /dev/null
+#include <stdexcept>
+#include "bit-vector.hpp"
+
+static inline uint8_t pow2_mult8(uint8_t x){
+ return (~((x & (x-1))==0) + 1) & (x >> 3);
+
+}
+
+static inline size_t uint32bits(size_t bits){
+ return ((bits-1) / W) + 1;
+}
+
+
+
+void array_copy(const uint32_t * src, uint32_t * dst, size_t len)
+{
+ while (len--) *dst++ = *src++;
+}
+
+size_t bit_vector::allocforsize(size_t bits)
+{
+ size_t i = _alloc;
+ while (uint32bits(bits) >= i)
+ i = 2*i+1;
+ return i;
+}
+
+void bit_vector::grow(size_t alloc)
+{
+ if (alloc <= _alloc)
+ return;
+
+ uint32_t* vector = new uint32_t[ alloc ];
+
+ array_copy(_vector,vector,_alloc);
+
+ if (!_leaked) delete [] _vector;
+ _vector = vector;
+ _alloc = alloc;
+ _leaked = false;
+ return;
+}
+
+
+bit_vector::bit_vector (size_t len, size_t blen, uint32_t init)
+{
+ _size = len*blen;
+ _leaked = false;
+ if (_size == 0) {
+ _alloc = 0;
+ _vector = 0;
+ return;
+ };
+ _alloc = uint32bits(_size);
+ _vector = new uint32_t[_alloc];
+
+ for(size_t i = 0; i < len; ++i) set_field(i,blen, init);
+ return;
+}
+
+bit_vector::bit_vector(int fd)
+{
+ ssize_t toread = sizeof(size_t);
+ auto msg = std::runtime_error("I/O error in bit_vector::read");
+ _leaked = false;
+ if (toread != read(fd, &_size, toread))
+ throw msg;
+
+ _alloc = uint32bits(_size);
+ _vector = new uint32_t[_alloc];
+
+ toread = sizeof(uint32_t);
+ for(size_t i = 0; i < _alloc; ++i)
+ if (toread != read(fd, &_vector[i], toread)){
+ delete [] _vector;
+ _vector = 0;
+ throw msg;
+ };
+ return;
+
+}
+
+bit_vector::bit_vector (const bit_vector& src)
+{
+ _leaked = false;
+ _size = src._size;
+ _vector = new uint32_t[ _alloc = src._alloc ];
+ array_copy(src._vector,_vector,_alloc);
+}
+
+bit_vector & bit_vector::operator=(const bit_vector& src)
+{
+ if (this != &src) {
+ _size = src._size;
+ delete [] _vector;
+ _vector = new uint32_t[ _alloc = src._alloc ];
+ array_copy(src._vector,_vector,_alloc);
+ _leaked = false;
+ };
+ return *this;
+}
+
+bit_vector::~bit_vector()
+{
+ if (!_leaked) delete [] _vector;
+ _size = 0;
+ _vector = 0;
+ _alloc = 0;
+}
+
+
+
+void bit_vector::save(int fd) const
+{
+
+ ssize_t towrite = sizeof(size_t);
+ std::string msg = "I/O error in bit_vector::write";
+
+ if (towrite != write(fd,&_size,towrite))
+ throw msg;
+
+ towrite = sizeof(uint32_t);
+
+ for (size_t i = 0; i < _alloc; ++i)
+ if (towrite != write(fd, &_vector[i], towrite))
+ throw msg;
+
+ return;
+
+}
+
+
+void bit_vector::pack()
+{
+
+ size_t nalloc = uint32bits(_size);
+ uint32_t * nvector = new uint32_t[nalloc];
+ array_copy(_vector,nvector,nalloc);
+ if (!_leaked) delete [] _vector;
+ _alloc = nalloc;
+ _vector = nvector;
+ _leaked = false;
+
+}
+
+//sets the idxth bit to b
+void bit_vector::set(size_t idx, bool b)
+{
+ size_t i = idx / sizeof(uint32_t);
+ size_t j = idx % sizeof(uint32_t);
+ grow(allocforsize(idx+1));
+ if (idx >= _size)
+ _size = idx+1;
+ uint32_t mask = 1 << j;
+ if (b)
+ _vector[i] |= mask;
+ else
+ _vector[i] &= ~mask;
+}
+
+void bit_vector::push_back(bool b)
+{
+ set(_size, b);
+}
+
+bool bit_vector::pop_back()
+{
+ _size--;
+ return unsafe_get(_size);
+}
+
+bool bit_vector::flip(size_t idx)
+{
+ bool b = get(idx);
+ set(idx, !b);
+ return b;
+}
+
+void bit_vector::set_field(size_t index, size_t len, uint32_t v)
+{
+ size_t i=index*len/W, j=index*len-i*W;
+
+ grow(allocforsize((index+2)*len));
+ if (j+len > W){
+ if ((index+1)*len >= _size)
+ _size = (index+2)*len;
+ } else if (index*len >= _size)
+ _size = (index+1)*len;
+
+ uint32_t mask = ((j+len) < W ? ~0U << (j+len) : 0)
+ | ((W-j) < W ? ~0U >> (W-j) : 0);
+ _vector[i] = (_vector[i] & mask) | v << j;
+
+ if (j+len>W) {
+ mask = ((~0U) << (len+j-W));
+ _vector[i+1] = (_vector[i+1] & mask)| v >> (W-j);
+ }
+}
+
+bool bit_vector::get(size_t index) const
+{
+ if (index >= _size)
+ throw (std::out_of_range("bit_vector::get"));
+
+ return unsafe_get(index);
+}
+
+uint32_t bit_vector::get_field(size_t index, size_t len) const
+{
+ if ((index+1)*len > _size)
+ throw (std::out_of_range("bit_vector::get_field: index out of bound"));
+ if (len > W)
+ throw (std::invalid_argument("bit_vector::get_field: len > 32" ));
+
+ return unsafe_get_field(index, len);
+}
+
+uint32_t * bit_vector::get_vector() const
+{
+ uint32_t * vector = new uint32_t[_alloc];
+ array_copy(_vector,vector,_alloc);
+ return vector;
+}
--- /dev/null
+#ifndef __BIT_VECTOR_HPP__
+#define __BIT_VECTOR_HPP__
+
+#include <cstdint>
+#include <cstddef>
+#include <climits>
+
+
+#ifndef W
+#define W (sizeof(uint32_t)*CHAR_BIT)
+#define WW (W*2)
+#endif
+
+
+/** Simple class implementing a vector of bits as an uint32_t array.
+ */
+
+class bit_vector {
+
+ public:
+ /** Builds a bit vector capable of holding len elements of blen bits each,
+ initializes the vector with the copies len times the blen lowermost bits
+ of init.
+ Throws an exception if blen == 0 or blen > 32.
+ */
+ bit_vector (size_t len=0, size_t blen=1, uint32_t init=0);
+
+ /** Load the bit_vector from a file descriptor */
+ bit_vector (int fd);
+
+ /** copy constructor */
+ bit_vector (const bit_vector& src);
+
+ bit_vector & operator=(const bit_vector& src);
+
+ /** Destructor
+ */
+ ~bit_vector ();
+
+ /** Writes the content of the the bit_vector to the file descriptor fd, in
+ binary format.
+ */
+ void save(int fd) const;
+
+ /** Returns the size of the bit_vector in bits */
+ size_t size() const { return _size; };
+
+ /** Returns the maximum number of bits that can be added without
+ reallocation
+ */
+ size_t max_size() const { return _alloc*W; };
+
+ /** Resizes the vector so that it can store size() bits and such that at most
+ sizeof(uint32_t) - 1 bits are wasted.*/
+ void pack();
+
+ /** Sets the idxth bit to b. The bit_vector is resized if necessary */
+ void set(size_t idx, bool b);
+
+ /** Adds bit b at the end of the bit_vector. The bit_vector is resized if
+ necessary */
+ void push_back(bool b);
+
+ /** Returns the last bit of the bit vector, which is removed */
+ bool pop_back();
+
+ /** Sets bits [idx*len - (idx+1)*len] of the bit vector to the len
+ lowermost bits of v.
+ Throws an exception if len == 0 or len > W
+ The bit_vector is resized if necessary.
+ */
+ void set_field(size_t idx, size_t len, uint32_t v);
+
+ /** Flips the idxth bit of the bit_vector. Returns the value of the bit
+ before flipping */
+ bool flip(size_t idx);
+
+ /** Returns the idxth bit. Performs bound checking */
+ bool get(size_t idx) const;
+
+ /** Returns the bits idx*len to (idx+1)*len, packed into an uint32_t.
+ Throws an exception if len == 0 or len > sizeof(uint32_t).
+ Performs bound checking.
+ */
+ uint32_t get_field(size_t idx, size_t len) const;
+
+ /** returns a copy of the internal vector
+ */
+ uint32_t* get_vector() const;
+ /** returns a pointer to the internal vector
+ */
+ uint32_t* get_vector_ptr() { _leaked = true; return _vector; }
+
+ /** Efficient unsafe get/get_field */
+ bool unsafe_get(size_t idx) const;
+ uint32_t unsafe_get_field(size_t idx, size_t len) const;
+
+
+private:
+ // true if get_vector_ptr() was called
+ bool _leaked;
+ // Size in bits
+ size_t _size;
+ // Allocated space in sizeof(uint32_t)
+ size_t _alloc;
+ uint32_t * _vector;
+ /** Increase the size of _vector
+ */
+
+ size_t allocforsize(size_t bits);
+ void grow(size_t alloc);
+
+
+};
+
+/** Unsafe version of get. Does not perform bound checking
+ */
+inline bool bit_vector::unsafe_get(size_t idx) const
+{
+ return (_vector[idx / W] >> (idx % W)) & true;
+};
+
+/** Unsafe version of get_field. Does not perform bound checking.
+ Assumes 1<= len <= sizeof(uint32_t).
+*/
+inline uint32_t bit_vector::unsafe_get_field(size_t idx, size_t len) const
+{
+ switch (len){
+ case 8:
+ return ((uint8_t*)_vector)[idx];
+ case 16:
+ return ((uint16_t*)_vector)[idx];
+ case 32:
+ return _vector[idx];
+ case 1:
+ return unsafe_get(idx);
+ default:
+ size_t idxlen = idx*len;
+ size_t j = idxlen % W;
+ size_t i = idxlen / W;
+ size_t offset = W - len;
+ return (offset >= j)
+ ? (_vector[i] << (offset-j)) >> offset
+ : (_vector[i] >> j) | (_vector [i+1] << (W+offset-j)) >> offset;
+ };
+};
+
+#endif
+++ /dev/null
-#ifndef COMMON_H_\r
-#define COMMON_H_\r
-\r
-\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-//#include <string.h> // for memset\r
-#include <sys/types.h>\r
-#include <unistd.h>\r
-#include <errno.h>\r
-#define B_ERROR(msg) do { fprintf(stderr,"%s\n", msg); exit(1); } while (0)\r
-\r
-inline void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream)\r
- {\r
- size_t res;\r
- res = fread(ptr,size,nmemb,stream);\r
- if (res < nmemb)\r
- B_ERROR ("ufread I/O error" );\r
- return;\r
- }\r
-\r
-inline void ufwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)\r
- {\r
- size_t res;\r
- res = fwrite(ptr,size,nmemb,stream);\r
- if (res < nmemb)\r
- B_ERROR("ufwrite I/O error");\r
- return;\r
- }\r
-\r
-inline void *urealloc(void *ptr, size_t size)\r
- {\r
- void *dest = realloc(ptr,size);\r
- //don't fail if we requested size 0\r
- if (dest == NULL && size > 0 )\r
- B_ERROR("urealoc error");\r
- return dest;\r
- }\r
-// realloc and set to 0\r
-inline void *urecalloc(void *ptr, size_t o_size, size_t n_size)\r
- {\r
- if (o_size == n_size)\r
- return ptr;\r
- \r
- void *dest = realloc(ptr,n_size);\r
- //don't fail if we requested size 0\r
- if (dest == NULL && n_size > 0 )\r
- B_ERROR("urecalloc error");\r
- // Initialize the new area with 0\r
- void * n_area_start = &(((char*) dest)[o_size]);\r
- // memset(n_area_start,0, n_size - o_size);\r
- for(size_t i = 0; i < n_size - o_size;i++)\r
- ((char *) n_area_start)[i] = 0;\r
- return dest;\r
- }\r
-\r
-inline void *ucalloc(size_t nmemb, size_t size)\r
- {\r
- void * dest = calloc(nmemb,size);\r
- //don't fail if we requested size 0\r
- if (dest == NULL && nmemb > 0 && size > 0 )\r
- B_ERROR("ucalloc error");\r
- return dest;\r
- }\r
-\r
-inline void *umalloc(size_t size) \r
- {\r
- void * dest = malloc(size);\r
- if (dest == NULL && size > 0)\r
- B_ERROR("umaloc error");\r
- return dest;\r
- }\r
-\r
-#endif\r
+++ /dev/null
-#ifndef TIMINGS_H_
-#define TIMINGS_H_
-//Timing Macro's
-#include <sys/time.h>
-#include <time.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-static double tParsing = 0;
-static unsigned int cParsing = 0;
-
-static double tLoading = 0;
-static unsigned int cLoading = 0;
-static double tBuilding = 0;
-static unsigned int cBuilding = 0;
-
-static struct timeval tmpv1;
-static struct timeval tmpv2;
-static std::string mem1;
-static std::string mem2;
-
-static void read_procmem(std::string& memstr) {
- std::string buf;
- pid_t pid = getpid();
- std::stringstream path;
- path << "/proc/" << pid << "/status";
- std::ifstream infile;
- infile.open (path.str().c_str(), std::ifstream::in);
- while (infile.good()){
- getline(infile,buf);
- if (infile.eof()) {
- memstr = "Could not read memory";
- return;
- };
- int idx = buf.find("VmRSS");
- if (idx == 0){
- memstr = buf;
- return;
- };
- };
- memstr = "Could not read memory";
- return;
-
-}
-
-#define STARTTIMER() do { \
- read_procmem(mem1); \
- gettimeofday(&tmpv1,NULL); \
- } while (0) \
-
-#define STOPTIMER(x) do { \
- gettimeofday(&tmpv2,NULL); \
- read_procmem(mem2); \
- (t##x) = ((tmpv2.tv_sec - tmpv1.tv_sec) * 1000000.0 + \
- (tmpv2.tv_usec - tmpv1.tv_usec))/1000.0; \
- (c##x)= (c##x)+1; \
- } while (0)
-
-#define PRINTTIME(s,x) do { \
- std::cerr << (s) << " : " << (t##x) << "ms" << std::endl; \
- std::cerr << "Mem use before: " << mem1 << std::endl; \
- std::cerr << "Mem use after: " << mem2 << std::endl; \
- } while (0) \
-
-
-
-
-
-
-
-
-#endif
--- /dev/null
+#include "xml-tree-builder.hpp"
+#include <stdexcept>
+#include <utility>
+
+using namespace SXSI;
+
+xml_tree_builder::xml_tree_builder()
+{
+ opened = false;
+ par = 0;
+ tags = 0;
+ tag_ids = 0;
+ text_positions = 0;
+ disable_text_index = false;
+ tc_builder = 0;
+}
+
+xml_tree_builder::~xml_tree_builder()
+{
+ if (opened) reset();
+}
+
+void xml_tree_builder::reset()
+{
+ delete par;
+ delete tags;
+ delete tag_ids;
+ if (!disable_text_index){
+ delete tc_builder;
+ delete text_positions;
+ };
+}
+
+
+int32_t xml_tree_builder::register_tag(std::string tag, int32_t id)
+{
+ auto found = tag_ids->find(tag);
+
+ if (found == tag_ids->end()) {
+ if (id != current_tag)
+ throw std::runtime_error("xml-tree-builder: inconsistant tag id");
+
+ tag_ids->insert(std::make_pair(tag, id));
+ current_tag++;
+ return id;
+ } else
+ return found->second;
+
+}
+
+int32_t xml_tree_builder::register_tag(std::string tag)
+{
+ return register_tag(tag, current_tag);
+}
+
+void
+xml_tree_builder::open_document(bool disable_text_index,
+ unsigned int sample_rate,
+ TextCollectionBuilder::index_type_t idx_type)
+{
+ if (opened) reset();
+ opened = true;
+ par = new bit_vector();
+ tags = new std::vector<int32_t>();
+ current_tag = 0;
+ tag_ids = new std::unordered_map<std::string, int32_t>();
+
+ register_tag(xml_tree::DOCUMENT_OPEN_TAG, xml_tree::DOCUMENT_OPEN_TAG_ID);
+ register_tag(xml_tree::DOCUMENT_OPEN_TAG, xml_tree::DOCUMENT_OPEN_TAG_ID);
+
+ register_tag(xml_tree::ATTRIBUTE_OPEN_TAG, xml_tree::ATTRIBUTE_OPEN_TAG_ID);
+ register_tag(xml_tree::ATTRIBUTE_OPEN_TAG, xml_tree::ATTRIBUTE_OPEN_TAG_ID);
+
+ register_tag(xml_tree::PCDATA_OPEN_TAG, xml_tree::PCDATA_OPEN_TAG_ID);
+ register_tag(xml_tree::PCDATA_OPEN_TAG, xml_tree::PCDATA_OPEN_TAG_ID);
+
+ register_tag(xml_tree::ATTRIBUTE_DATA_OPEN_TAG,
+ xml_tree::ATTRIBUTE_DATA_OPEN_TAG_ID);
+
+ register_tag(xml_tree::ATTRIBUTE_DATA_OPEN_TAG,
+ xml_tree::ATTRIBUTE_DATA_OPEN_TAG_ID);
+
+
+ this->disable_text_index = disable_text_index;
+ if (!disable_text_index){
+ tc_builder = TextCollectionBuilder::create(sample_rate, idx_type);
+ text_positions = new bit_vector();
+ text_index_type = idx_type;
+ };
+}
+
+void xml_tree_builder::open_tag(std::string tag)
+{
+ int32_t id = register_tag(tag);
+ tags->push_back(id);
+ par->push_back(true);
+ if (!disable_text_index) text_positions->push_back(false);
+}
+
+void xml_tree_builder::close_tag(std::string)
+{
+ xml_tree::tag_t t = xml_tree::CLOSE_TAG_ID;
+ tags->push_back(t);
+ par->push_back(false);
+ if (!disable_text_index) text_positions->push_back(false);
+}
+
+void xml_tree_builder::text(std::string s)
+{
+ if (!disable_text_index){
+ if (s.empty()) s = "\001";
+ tc_builder->InsertText((const unsigned char *) s.c_str());
+ text_positions->set(text_positions->size() - 1, true);
+ }
+}
+
+xml_tree *xml_tree_builder::close_document()
+{
+ if (opened) {
+ opened = false;
+ auto tags_ = tags;
+ auto tag_ids_ = tag_ids;
+ auto par_ = par;
+ auto tc_builder_ = tc_builder;
+ auto text_positions_ = text_positions;
+ tc_builder = 0;
+ text_positions = 0;
+ tags = 0;
+ tag_ids = 0;
+ par = 0;
+ return new xml_tree(tags_, tag_ids_, par_,
+ disable_text_index,
+ tc_builder_,
+ text_index_type,
+ text_positions_);
+ };
+
+ throw std::runtime_error("xml_tree_builder: inconsistent parser state");
+}
+
--- /dev/null
+#ifndef XML_TREE_BUILDER_HPP_
+#define XML_TREE_BUILDER_HPP_
+
+#include <cstdint>
+#include <unordered_map>
+#include "xml-tree.hpp"
+#include "bit-vector.hpp"
+#undef W
+#undef WW
+#undef Wminusone
+#include <TextCollection/TextCollectionBuilder.h>
+
+class xml_tree_builder {
+
+public:
+ xml_tree_builder();
+ ~xml_tree_builder();
+ void open_document(bool disable_text_index,
+ unsigned int sample_rate,
+ SXSI::TextCollectionBuilder::index_type_t idx_type);
+
+ xml_tree *close_document();
+ void open_tag(std::string);
+ void close_tag(std::string);
+ void text(std::string);
+
+private:
+ void reset();
+ int32_t register_tag(std::string);
+ int32_t register_tag(std::string, int32_t);
+
+
+ bit_vector *par;
+ std::vector<int32_t> *tags;
+ int32_t current_tag;
+ std::unordered_map<std::string, int32_t> *tag_ids;
+ bool opened;
+
+
+ bit_vector *text_positions;
+ SXSI::TextCollectionBuilder *tc_builder;
+ bool disable_text_index;
+ SXSI::TextCollectionBuilder::index_type_t text_index_type;
+
+};
+
+
+#endif
--- /dev/null
+#ifndef XML_TREE_INTERNAL__
+#error "xml-tree-inc.hpp should not be included directly"
+#endif
+
+#ifndef XML_TREE_INC_HPP_
+#define XML_TREE_INC_HPP_
+
+inline uint32_t xml_tree::size() const
+{
+ return tag_seq_len / 2;
+}
+
+inline uint32_t xml_tree::num_tags() const
+{
+ return tag_names->size();
+}
+
+inline uint32_t xml_tree::subtree_size(xml_tree::node_t x) const
+{
+ return bp_subtree_size(this->par, x);
+}
+
+inline uint32_t
+xml_tree::subtree_tags(xml_tree::node_t x, xml_tree::tag_t label) const
+{
+ xml_tree::node_t y = bp_find_close(this->par, x);
+ if (y - x < 10) {
+ uint32_t count = 0;
+ for(xml_tree::node_t i = x; i < y; i++)
+ count += (tag(i) == label);
+ return count;
+ } else {
+ return tags->rank(label, y) - tags->rank(label, x);
+ };
+}
+
+inline bool xml_tree::is_leaf(xml_tree::node_t x) const
+{
+ return !bp_inspect(this->par, x+1);
+}
+
+inline bool xml_tree::is_ancestor(xml_tree::node_t x,
+ xml_tree::node_t y) const
+{
+ return bp_is_ancestor(this->par, x, y);
+}
+
+inline bool
+xml_tree::is_right_descendant(xml_tree::node_t x,
+ xml_tree::node_t y) const
+{
+ return
+ (x > root())
+ && (y <= bp_parent_close(this->par, x))
+ && (y >= bp_find_close(this->par, x));
+}
+
+inline bool xml_tree::is_first_child(xml_tree::node_t x) const
+{
+ return (x <= this->root()) || (bp_inspect(this->par, x - 1));
+}
+
+inline bool xml_tree::is_nil(xml_tree::node_t x) const
+{
+ return (x == xml_tree::NIL);
+}
+
+inline xml_tree::tag_t xml_tree::tag(xml_tree::node_t x) const
+{
+ if (bits_per_tag == 8)
+ return (xml_tree::tag_t) (((unsigned char*) tag_seq)[x]);
+ else
+ return get_field(tag_seq, bits_per_tag, x);
+}
+
+inline xml_tree::node_t xml_tree::root() const
+{
+ return xml_tree::ROOT;
+}
+
+
+inline xml_tree::node_t xml_tree::parent(xml_tree::node_t x) const
+{
+ return bp_parent(this->par, x);
+}
+
+inline xml_tree::node_t xml_tree::first_child(node_t x) const
+{
+ return bp_first_child(this->par, x);
+}
+
+inline xml_tree::node_t xml_tree::last_child(xml_tree::node_t x) const
+{
+ if (is_leaf(x))
+ return xml_tree::NIL;
+ else
+ return bp_find_open(this->par, bp_find_close(this->par, x) - 1);
+}
+
+inline xml_tree::node_t xml_tree::next_sibling(xml_tree::node_t x) const
+{
+ return bp_next_sibling(this->par, x);
+}
+
+inline xml_tree::node_t xml_tree::prev_sibling(xml_tree::node_t x) const
+{
+ return bp_prev_sibling(this->par, x);
+}
+
+inline xml_tree::node_t xml_tree::first_element(xml_tree::node_t x) const
+{
+ xml_tree::node_t n = first_child(x);
+ if (is_nil(n)) return xml_tree::NIL;
+ switch (tag(n)) {
+ case xml_tree::ATTRIBUTE_OPEN_TAG_ID:
+ n = next_sibling(n);
+ if (is_nil(n) || tag(n) != xml_tree::PCDATA_OPEN_TAG_ID) return n;
+ //Fallthrough
+ case PCDATA_OPEN_TAG_ID:
+ n = n + 2;
+ return bp_inspect(this->par, n) ? n : xml_tree::NIL;
+ };
+}
+
+inline xml_tree::node_t xml_tree::next_element(xml_tree::node_t x) const
+{
+ x = next_sibling(x);
+ if (is_nil(x)) return x;
+ if (tag(x) == xml_tree::PCDATA_OPEN_TAG_ID){
+ xml_tree::node_t y = x + 2;
+ return bp_inspect(this->par, y) ? y : xml_tree::NIL;
+ } else
+ return x;
+}
+
+inline xml_tree::node_t xml_tree::tagged_next(node_t x, tag_t tag) const
+{
+ return this->tags->select_next(tag, x);
+}
+
+inline xml_tree::node_t
+xml_tree::tagged_descendant(xml_tree::node_t x,
+ xml_tree::tag_t tag) const
+{
+ xml_tree::node_t y = tagged_next(x, tag);
+ return is_nil(y) || !is_ancestor(x, y) ? xml_tree::NIL : y;
+}
+
+inline xml_tree::node_t
+xml_tree::tagged_following_before(xml_tree::node_t x,
+ xml_tree::tag_t tag,
+ xml_tree::node_t limit) const
+{
+ xml_tree::node_t close = bp_find_close(this->par, x);
+ xml_tree::node_t s = tagged_next(close, tag);
+ return (s < limit) ? s : xml_tree::NIL;
+}
+
+
+inline xml_tree::node_t xml_tree::tagged_child(xml_tree::node_t x,
+ xml_tree::tag_t t) const
+{
+ xml_tree::node_t c = first_child(x);
+ if (is_nil(c) || tag(c) == t)
+ return c;
+ else
+ tagged_sibling(c, t);
+}
+
+inline xml_tree::node_t xml_tree::tagged_sibling(xml_tree::node_t x,
+ xml_tree::tag_t t) const
+{
+ xml_tree::node_t sibling = next_sibling(x);
+ while(!is_nil(sibling) && tag(sibling) != t) sibling = next_sibling(sibling);
+ return sibling;
+}
+
+xml_tree::node_t xml_tree::closing(xml_tree::node_t x) const
+{
+ return bp_find_close(this->par, x);
+}
+
+
+inline SXSI::TextCollection *xml_tree::get_text_collection() const
+{
+ return text_collection;
+}
+
+inline xml_tree::node_t xml_tree::parent_node(int32_t d) const
+{
+ return (xml_tree::node_t) text_positions->select1(d + 1);
+}
+
+inline SXSI::TextCollection::document_result
+xml_tree::prefix(uchar const *s) const
+{
+ return text_collection->Prefix(s);
+}
+
+inline SXSI::TextCollection::document_result
+xml_tree::suffix(uchar const *s) const
+{
+ return text_collection->Suffix(s);
+}
+
+inline SXSI::TextCollection::document_result
+xml_tree::equals(uchar const *s) const
+{
+ return text_collection->Equal(s);
+}
+
+inline SXSI::TextCollection::document_result
+xml_tree::contains(uchar const *s) const
+{
+ return text_collection->Contains(s);
+}
+
+inline SXSI::TextCollection::document_result
+xml_tree::less_than(uchar const *s) const
+{
+ return text_collection->LessThan(s);
+}
+
+
+#endif //XML_TREE_INC_HPP_
--- /dev/null
+extern "C" {
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+}
+
+#include <stdexcept>
+#include <cerrno>
+#include <cstring>
+#include <cstdio>
+#include "xml-tree.hpp"
+
+using namespace SXSI;
+
+const xml_tree::tag_t xml_tree::NIL_TAG_ID;
+const char* xml_tree::NIL_TAG = "<!NIL!>";
+const xml_tree::tag_t xml_tree::DOCUMENT_OPEN_TAG_ID;
+const char* xml_tree::DOCUMENT_OPEN_TAG = "";
+const xml_tree::tag_t xml_tree::ATTRIBUTE_OPEN_TAG_ID;
+const char* xml_tree::ATTRIBUTE_OPEN_TAG = "<@>";
+const xml_tree::tag_t xml_tree::PCDATA_OPEN_TAG_ID;
+const char* xml_tree::PCDATA_OPEN_TAG = "<$>";
+const xml_tree::tag_t xml_tree::ATTRIBUTE_DATA_OPEN_TAG_ID;
+const char* xml_tree::ATTRIBUTE_DATA_OPEN_TAG = "<@$>";
+const xml_tree::tag_t xml_tree::CLOSE_TAG_ID;
+const char* xml_tree::CLOSE_TAG = "</>";
+
+static int bits8 (int t ) {
+ int r = bits(t);
+ if (r <= 8)
+ return 8;
+ else if (r <= 16)
+ return 16;
+ else
+ return r;
+}
+
+
+
+
+static void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream)
+ {
+ size_t res;
+ res = fread(ptr,size,nmemb,stream);
+ if (res < nmemb)
+ throw std::runtime_error("ufread I/O error" );
+ return;
+ }
+
+static void ufwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
+ {
+ size_t res;
+ res = fwrite(ptr,size,nmemb,stream);
+ if (res < nmemb)
+ throw std::runtime_error("ufwrite I/O error");
+ return;
+ }
+
+
+
+xml_tree::xml_tree()
+{
+ print_stack = 0;
+ print_buffer = 0;
+}
+
+xml_tree::xml_tree(std::vector<int32_t> *tags,
+ std::unordered_map<std::string, int32_t> *tag_ids,
+ bit_vector *parbitmap,
+ bool disable_tc,
+ SXSI::TextCollectionBuilder *tc_builder,
+ SXSI::TextCollectionBuilder::index_type_t idx_type,
+ bit_vector *textbitmap)
+{
+ print_stack = 0;
+ print_buffer = 0;
+
+ size_t npar = parbitmap->size();
+ parbitmap->pack();
+ par = bp_construct(npar,
+ parbitmap->get_vector_ptr(),
+ OPT_DEGREE);
+ delete parbitmap;
+
+ this->tag_ids = tag_ids;
+ tag_names = new std::vector<std::string>();
+ tag_names->resize(tag_ids->size());
+ for(auto val : *(this->tag_ids))
+ (*this->tag_names)[val.second] = val.first;
+
+ uint32_t max_tag = tag_names->size() - 1;
+ static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray();
+ alphabet_mapper *am = new alphabet_mapper_none();
+ this->tags = new static_sequence_bs((uint32_t *) &tags[0], npar, am, bmb);
+ bits_per_tag = bits8(max_tag);
+ tag_seq_len = npar;
+ tag_seq = new uint32_t[uint_len(bits_per_tag, tag_seq_len)];
+ for(size_t i = 0; i < (size_t) tag_seq_len; i++)
+ set_field(tag_seq, bits_per_tag, i, (uint32_t) (*tags)[i]);
+
+ delete tags;
+
+ if (disable_tc) {
+ text_positions = 0;
+ text_collection = 0;
+ } else {
+ textbitmap->pack();
+ uint32_t * textbm = textbitmap->get_vector_ptr();
+ text_positions = new static_bitsequence_rrr02(textbm,
+ npar,
+ 32);
+ delete [] textbm;
+
+ this->text_index_type = idx_type;
+ text_collection = tc_builder->InitTextCollection();
+ delete tc_builder;
+ };
+
+
+}
+
+xml_tree::~xml_tree()
+{
+ bp_delete(par);
+ delete tags;
+ delete [] tag_seq;
+ delete tag_names;
+ delete tag_ids;
+
+ if (text_collection) delete text_collection;
+ if (text_positions) delete text_positions;
+}
+
+bool xml_tree::is_child(xml_tree::node_t x, xml_tree::node_t y) const
+{
+ return !is_leaf(x) && is_ancestor(y, x) && depth(x)+1 == depth(y);
+}
+
+uint32_t xml_tree::depth(xml_tree::node_t x) const
+{
+ return bp_depth(this->par, x);
+}
+
+
+uint32_t xml_tree::preorder(xml_tree::node_t x) const
+{
+ return bp_preorder_rank(this->par, x);
+}
+
+uint32_t xml_tree::postorder(xml_tree::node_t x) const
+{
+ return bp_postorder_rank(this->par, x);
+}
+
+xml_tree::node_t
+xml_tree::select_child(xml_tree::node_t x,
+ std::unordered_set<tag_t> *tags) const
+{
+ if (is_nil(x) || is_leaf(x)) return xml_tree::NIL;
+ xml_tree::node_t child = first_child(x);
+ if (tags->find(tag(child)) != tags->end()) return child;
+ return select_sibling(child, tags);
+}
+
+xml_tree::node_t
+xml_tree::select_descendant(xml_tree::node_t x,
+ std::unordered_set<tag_t> *tags) const
+{
+ if (is_leaf(x)) return xml_tree::NIL;
+ auto min = xml_tree::NIL;
+ auto aux = xml_tree::NIL;
+ for(auto tag = tags->begin(); tag != tags->end(); ++ tag){
+ aux = tagged_descendant(x, *tag);
+ if ((unsigned int) aux < (unsigned int) min) min = aux;
+ };
+ return min;
+}
+
+
+xml_tree::node_t
+xml_tree::select_sibling(xml_tree::node_t x,
+ std::unordered_set<tag_t> *tags) const
+{
+ xml_tree::node_t sibling = next_sibling(x);
+ while(!is_nil(sibling) && tags->find(tag(sibling)) == tags->end())
+ sibling = next_sibling(sibling);
+ return (sibling);
+}
+
+xml_tree::node_t
+xml_tree::select_following_before(xml_tree::node_t x,
+ std::unordered_set<tag_t> *tags,
+ xml_tree::node_t limit) const
+{
+ auto min = xml_tree::NIL;
+ auto aux = xml_tree::NIL;
+ for(auto tag = tags->begin(); tag != tags->end(); ++tag){
+ aux = tagged_following_before(x, *tag, limit);
+ if ((unsigned int) aux < (unsigned int) min) min = aux;
+ }
+ return min;
+}
+
+void xml_tree::save(int fd, char* s)
+{
+ FILE* fp;
+ int fd2 = dup(fd);
+
+ fp = fdopen(fd2, "w");
+
+ if (fd == 0) throw(std::runtime_error(strerror(errno)));
+ saveTree(par, fp); //TODO use new api
+
+ int ntags = tag_names->size();
+
+ ufwrite(&ntags, sizeof(int), 1, fp);
+ for (int i = 0; i < ntags;i++)
+ fprintf(fp, "%s\n", tag_names->at(i).c_str());
+
+ tags->save(fp);
+
+ ufwrite(&bits_per_tag, sizeof(uint),1,fp);
+ ufwrite(&tag_seq_len, sizeof(uint),1,fp);
+ ufwrite(tag_seq, sizeof(uint), uint_len(bits_per_tag, tag_seq_len), fp);
+
+ bool disable_tc = text_collection == 0 || text_positions == 0;
+ ufwrite(&disable_tc, sizeof(bool),1,fp);
+
+ text_positions->save(fp);
+ if (!disable_tc) {
+ ufwrite(&text_index_type,
+ sizeof(TextCollectionBuilder::index_type_t),
+ 1, fp);
+
+ std::string file(s);
+ switch (text_index_type){
+ case TextCollectionBuilder::index_type_default:
+ file.append("_default");
+ break;
+ case TextCollectionBuilder::index_type_swcsa:
+ file.append("_swcsa");
+ break;
+ case TextCollectionBuilder::index_type_rlcsa:
+ file.append("_rlcsa");
+ break;
+ };
+
+ text_collection->Save(fp, file.c_str());
+ };
+
+ fflush(fp);
+ fclose(fp);
+
+}
+//static xml_tree* load(char*, bool, int);
+// void print(int, node_t, bool no_text=false);
+xml_tree* xml_tree::load(int fd, char* name, bool load_tc, int sf)
+{
+ FILE *fp;
+ char buffer[1024];
+
+ int i;
+ buffer[1023] = '\0';
+ fp = fdopen(fd, "r");
+
+ xml_tree *tree = new xml_tree();
+
+ tree->par = loadTree(fp); //TODO use new api
+ tree->tag_names = new std::vector<std::string>();
+ tree->tag_ids = new std::unordered_map<std::string, xml_tree::tag_t>();
+ std::string s;
+ int ntags;
+
+ ufread(&ntags, sizeof(int), 1, fp);
+
+ for (i = 0; i < ntags; i++) {
+ if (fgets(buffer, 1022, fp) != buffer)
+ throw std::runtime_error("xml_tree::load: cannot read tag list");
+ s = buffer;
+ // remove the trailing \n
+ s.erase(s.size()-1);
+ tree->tag_names->push_back(s);
+ tree->tag_ids->insert(std::make_pair(s,
+ static_cast<xml_tree::tag_t>(i)));
+
+ };
+
+ tree->tags = static_sequence::load(fp);
+ ufread(&tree->bits_per_tag, sizeof(uint), 1, fp);
+ ufread(&tree->tag_seq_len, sizeof(uint), 1, fp);
+ size_t size = uint_len(tree->bits_per_tag, tree->tag_seq_len);
+ tree->tag_seq = new uint[size];
+ ufread(tree->tag_seq, sizeof(uint), size, fp);
+
+ bool disable_tc;
+ ufread(&disable_tc, sizeof(bool), 1, fp);
+
+ if (disable_tc || !load_tc) {
+ tree->text_positions = 0;
+ tree->text_collection = 0;
+ } else {
+
+ tree->text_positions = static_bitsequence_rrr02::load(fp);
+
+ ufread(&tree->text_index_type,
+ sizeof(TextCollectionBuilder::index_type_t), 1, fp);
+
+ std::string file(name);
+ switch (tree->text_index_type){
+ case TextCollectionBuilder::index_type_default:
+ file.append("_default");
+ break;
+ case TextCollectionBuilder::index_type_swcsa:
+ file.append("_swcsa");
+ break;
+ case TextCollectionBuilder::index_type_rlcsa:
+ file.append("_rlcsa");
+ break;
+ };
+
+
+ tree->text_collection =
+ TextCollection::Load(fp,
+ file.c_str(),
+ TextCollection::index_mode_default,
+ sf);
+
+ };
+ return tree;
+}
+
+uint32_t xml_tree::subtree_elements(xml_tree::node_t x) const
+{
+
+ uint32_t size = bp_subtree_size(par, x);
+ if (x == root()){
+ x = bp_first_child(par,x);
+ size = size - 1;
+ };
+
+ int s = x + 2*size - 1;
+ int ntext =
+ tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, s) -
+ tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, x-1);
+
+ size = size - ntext;
+ xml_tree::node_t fin = bp_find_close(par, x);
+ xml_tree::node_t y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, x);
+ while (y != xml_tree::NIL && y < fin){
+ size -= subtree_size(y);
+ y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, y);
+ };
+ return size;
+ }
+
+uint32_t xml_tree::num_children(xml_tree::node_t x) const
+{
+ return bp_degree(par, x);
+}
+
+uint32_t xml_tree::child_pos(xml_tree::node_t x) const
+{
+ return bp_child_rank(par, x);
+}
+
+xml_tree::node_t xml_tree::child(xml_tree::node_t x, uint32_t i) const
+{
+ if (i < 10) return bp_naive_child(par, x, i);
+ else bp_child(par, x, i);
+}
+
+std::pair<int32_t, int32_t> xml_tree::text_id_range(xml_tree::node_t x) const
+{
+ int32_t i, j;
+ i = text_positions->rank1(x - 1);
+ j = text_positions->rank1(x + 2 * bp_subtree_size(par, x) - 2);
+ if (i == j)
+ return std::make_pair(xml_tree::NIL, xml_tree::NIL);
+ else
+ return std::make_pair(i + 1, j);
+}
+
+int32_t xml_tree::text_id(xml_tree::node_t x) const
+{
+ return (int32_t) text_positions->rank1(x) - 1;
+}
+
+unsigned char* xml_tree::get_text(int32_t id) const
+{
+ unsigned char * s = text_collection->GetText(id);
+ return s + (s[0] == 1);
+}
+
+void xml_tree::uflush(int fd)
+{
+ size_t size = print_buffer->size();
+ if (size < BUFFER_SIZE) return;
+ uflush_r(fd, size);
+}
+void xml_tree::flush(int fd)
+{
+ uflush_r(fd, print_buffer->size());
+}
+
+
+void xml_tree::uflush_r(int fd, size_t s)
+{
+ if (s == 0) return;
+ size_t written;
+ while (1) {
+ written = write(fd, print_buffer->data(), s);
+ if ((written < 0) && (errno == EAGAIN || errno == EINTR))
+ continue;
+ break;
+ };
+ print_buffer->clear();
+}
+void xml_tree::uput_str(std::string s, int fd)
+{
+ print_buffer->append(s);
+ uflush(fd);
+}
+void xml_tree::uputs(const char* s, int fd)
+{
+ print_buffer->append(s);
+ uflush(fd);
+}
+void xml_tree::uputc(const char c, int fd)
+{
+ print_buffer->push_back(c);
+ uflush(fd);
+}
+
+const char * xml_tree::get_tag_name_by_ref(xml_tree::tag_t tagid) const
+{
+
+ unsigned char *s;
+ if (tagid < 0 || tagid >= tag_names->size())
+ return "<INVALID TAG>";
+
+ return (const char *) (*tag_names)[tagid].c_str();
+}
+
+xml_tree::tag_t xml_tree::register_tag(char *s)
+{
+ auto found = tag_ids->find(std::string(s));
+ if (found == tag_ids->end())
+ return xml_tree::NIL_TAG_ID;
+ else
+ return (*found).second;
+}
+
+size_t xml_tree::uprintf(const char*s, int fd)
+{
+ if (s == 0) return 0;
+ size_t i;
+ char c;
+ for (i = 0; (c = s[i]); i++) {
+ switch (c) {
+ case '"':
+ uputs(""", fd);
+ break;
+ case '&':
+ uputs("&", fd);
+ break;
+ case '\'':
+ uputs("'", fd);
+ break;
+ case '<':
+ uputs("<", fd);
+ break;
+ case '>':
+ uputs(">", fd);
+ break;
+ default:
+ uputc(c, fd);
+
+ };
+ };
+ return i;
+}
+
+void xml_tree::print(int fd, xml_tree::node_t x, bool no_text)
+{
+ if (print_buffer == 0) {
+ print_buffer = new std::string(BUFFER_SIZE, 0);
+ print_buffer->clear();
+ print_stack = new std::vector<std::string>();
+ print_stack->reserve(256);
+ };
+ xml_tree::node_t fin = bp_find_close(par, x);
+ xml_tree::node_t n = x;
+ xml_tree::tag_t label = tag(n);
+ unsigned char * orig_text;
+ unsigned char * current_text;
+
+ auto r = text_id_range(x);
+ if (r.first == xml_tree::NIL)
+ current_text = 0;
+ else
+ current_text = get_text(r.first);
+
+ orig_text = current_text;
+ size_t read = 0;
+
+ while (n <= fin) {
+
+ if (bp_inspect(par, n)) {
+ if (label == xml_tree::PCDATA_OPEN_TAG_ID){
+ if (no_text) {
+ uputs("<$/>", fd);
+ } else {
+ read = uprintf( (const char*) current_text, fd);
+ current_text += read + 1;
+ };
+ n += 2; // skip closin $
+ label = tag(n);
+ } else {
+
+ uputc('<', fd);
+ uput_str((*tag_names)[label], fd);
+ n++;
+ if (bp_inspect(par, n)) {
+ print_stack->push_back((*tag_names)[label]);
+ label = tag(n);
+ if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) {
+ n++;
+ if (no_text) uputs("><@@>", fd);
+
+ while (bp_inspect(par, n))
+ if (no_text) {
+ uputc('<', fd);
+ uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
+ uputc('>', fd);
+ uputs("<$@/></", fd);
+ uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
+ uputc('>', fd);
+ n += 4;
+ } else {
+ uputc(' ', fd);
+ uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
+ n++;
+ uputs("=\"", fd);
+ read = uprintf((const char*) current_text, fd);
+ current_text += read + 1;
+ uputc('"', fd);
+ n += 3;
+ };
+
+ if (no_text)
+ uputs("</@@>", fd);
+ else uputc('>', fd);
+ n++;
+ label = tag(n);
+ } else
+ uputc('>', fd);
+ } else {
+ uputs("/>", fd);
+ n++;
+ label = tag(n);
+ };
+ };
+ } else do {
+ uputs("</", fd);
+ uput_str(print_stack->back(), fd);
+ uputc('>', fd);
+ print_stack->pop_back();
+ n++;
+ } while (!bp_inspect(par, n) || print_stack->empty());
+ label = tag(n);
+ };
+ uputc('\n', fd);
+ if (orig_text && text_index_type != TextCollectionBuilder::index_type_default)
+ if (*orig_text == '\0')
+ text_collection->DeleteText(orig_text - 1);
+ else
+ text_collection->DeleteText(orig_text);
+
+}
--- /dev/null
+#ifndef XML_TREE_HPP_
+#define XML_TREE_HPP_
+
+
+#include <cstdint>
+#include <unordered_set>
+#include <unordered_map>
+#include <bp/bp.h>
+#include <bp/bp-darray.h>
+#include <libcds/includes/basics.h>
+#include <libcds/includes/static_bitsequence.h>
+#include <libcds/includes/alphabet_mapper.h>
+#include <libcds/includes/static_sequence.h>
+#include "bit-vector.hpp"
+
+#undef W
+#undef Wminusone
+#undef WW
+
+#include <TextCollection/TextCollection.h>
+#include <TextCollection/TextCollectionBuilder.h>
+
+class xml_tree_builder;
+
+class xml_tree {
+public:
+ typedef int32_t node_t;
+ typedef int32_t tag_t;
+
+ static const node_t NIL = -1;
+ static const node_t ROOT = 0;
+
+ static const char* NIL_TAG;
+ static const tag_t NIL_TAG_ID = -1;
+ static const char* DOCUMENT_OPEN_TAG;
+ static const tag_t DOCUMENT_OPEN_TAG_ID = 0;
+ static const char* ATTRIBUTE_OPEN_TAG;
+ static const tag_t ATTRIBUTE_OPEN_TAG_ID = 1;
+ static const char* PCDATA_OPEN_TAG;
+ static const tag_t PCDATA_OPEN_TAG_ID = 2;
+ static const char* ATTRIBUTE_DATA_OPEN_TAG;
+ static const tag_t ATTRIBUTE_DATA_OPEN_TAG_ID = 3;
+ static const char* CLOSE_TAG;
+ static const tag_t CLOSE_TAG_ID = 4;
+
+
+ ~xml_tree();
+
+ //Counting functions
+ inline uint32_t size() const;
+ inline uint32_t num_tags() const;
+ inline uint32_t subtree_size(node_t) const;
+ inline uint32_t subtree_tags(node_t, tag_t) const;
+ uint32_t subtree_elements(node_t) const;
+ uint32_t num_children(node_t) const;
+ uint32_t child_pos(node_t) const;
+
+
+ //Node_T tests
+ inline bool is_leaf(node_t) const;
+ inline bool is_ancestor(node_t, node_t) const;
+ inline bool is_right_descendant(node_t, node_t) const;
+ bool is_child(node_t, node_t) const;
+ inline bool is_first_child(node_t) const;
+ inline bool is_nil(node_t) const;
+
+ uint32_t depth(node_t) const;
+ uint32_t preorder(node_t) const;
+ uint32_t postorder(node_t) const;
+
+ //Tag functions
+ inline tag_t tag(node_t) const;
+ const char* get_tag_name_by_ref(tag_t) const;
+ tag_t register_tag(char *s);
+
+ //Navigation functions
+ inline node_t root () const;
+ node_t child(node_t, uint32_t) const;
+ inline node_t parent(node_t) const;
+ inline node_t first_child(node_t) const;
+ inline node_t last_child(node_t) const;
+ inline node_t next_sibling(node_t) const;
+ inline node_t prev_sibling(node_t) const;
+ inline node_t first_element(node_t) const;
+ inline node_t next_element(node_t) const;
+ inline node_t tagged_next(node_t, tag_t) const;
+ inline node_t tagged_descendant(node_t, tag_t) const;
+ inline node_t tagged_following_before(node_t, tag_t, node_t) const;
+ inline node_t tagged_child(node_t, tag_t) const;
+ inline node_t tagged_sibling(node_t, tag_t) const;
+ node_t select_child(node_t, std::unordered_set<tag_t>*) const;
+ node_t select_descendant(node_t, std::unordered_set<tag_t>*) const;
+ node_t select_sibling(node_t, std::unordered_set<tag_t>*) const;
+ node_t select_following_before (node_t,
+ std::unordered_set<tag_t>*, node_t) const;
+ inline node_t closing(node_t) const;
+
+ //Text functions
+ inline node_t parent_node(int32_t) const;
+ inline SXSI::TextCollection *get_text_collection() const;
+ std::pair<int32_t, int32_t> text_id_range(node_t) const;
+ int32_t text_id(node_t) const;
+ unsigned char* get_text(int32_t) const;
+
+ SXSI::TextCollection::document_result prefix(uchar const *s) const;
+ SXSI::TextCollection::document_result suffix(uchar const *s) const;
+ SXSI::TextCollection::document_result equals(uchar const *s) const;
+ SXSI::TextCollection::document_result contains(uchar const *s) const;
+ SXSI::TextCollection::document_result less_than(uchar const *s) const;
+
+ //I/O functions
+ void save(int, char*);
+ static xml_tree* load(int, char*, bool, int);
+ void print(int, node_t, bool no_text=false);
+ void flush(int);
+
+private:
+ friend class xml_tree_builder;
+ xml_tree();
+ xml_tree(std::vector<int32_t>*,
+ std::unordered_map<std::string, int32_t>*,
+ bit_vector*,
+ bool,
+ SXSI::TextCollectionBuilder *,
+ SXSI::TextCollectionBuilder::index_type_t,
+ bit_vector*);
+
+ //Parenthesis sequence
+ bp *par;
+ //tag sequence
+ static_sequence *tags;
+ uint32_t *tag_seq;
+ uint32_t tag_seq_len;
+ uint32_t bits_per_tag;
+ //Mapping from tag_t identifiers to/from tagnames
+ std::vector<std::string> *tag_names;
+ std::unordered_map<std::string, tag_t> *tag_ids;
+ //Text index
+ SXSI::TextCollection *text_collection;
+ static_bitsequence *text_positions;
+ SXSI::TextCollectionBuilder::index_type_t text_index_type;
+ //auxiliary data structures for efficient printing.
+ std::vector<std::string> *print_stack;
+ std::string *print_buffer;
+
+#define BUFFER_SIZE (8192*2)
+
+ void uflush(int);
+ void uflush_r(int, size_t);
+ void uput_str(std::string s, int fd);
+ void uputs(const char* s, int fd);
+ void uputc(const char c, int fd);
+ size_t uprintf(const char*s, int fd);
+};
+
+#define XML_TREE_INTERNAL__ 1
+#include "xml-tree-inc.hpp"
+
+#endif //XML_TREE_HPP_