X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTree.h;fp=XMLTree.h;h=0000000000000000000000000000000000000000;hb=c6266d8fd1872fad45b18d3d554410d080b65099;hp=74a71e65ab145858500ef611dd5f05ca4a2f4e6c;hpb=03125db103d98200f7a313a38db54c56748283d1;p=SXSI%2FXMLTree.git diff --git a/XMLTree.h b/XMLTree.h deleted file mode 100644 index 74a71e6..0000000 --- a/XMLTree.h +++ /dev/null @@ -1,774 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2008 by Diego Arroyuelo * - * Interface for the in-memory XQuery/XPath engine * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU Lesser General Public License as published * - * by the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. * - * * - * You should have received a copy of the GNU Lesser General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - ******************************************************************************/ - -#ifndef XMLTREE_H_ -#define XMLTREE_H_ - - -#include -#include -#include -#include - -#undef W -#undef WW -#undef Wminusone - -#include -#include -#include -#include -#include -#include - -using SXSI::TextCollection; -using SXSI::TextCollectionBuilder; - - -// this constant is used to efficiently compute the child operation in the tree -#define OPTD 10 - -#define NULLT -1 - -#define PERM_SAMPLE 10 - - -typedef int treeNode; -typedef int TagType; -typedef int DocID; - -typedef struct { - int min; - int max; -} range; - -// Encoding of the XML Document : -// The following TAGs and IDs are fixed, "" is the tag of the root. -// a TextNode is represented by a leaf <<$>>> The DocId in the TextCollection -// of that leaf is kept in a bit sequence. -// a TextNode below an attribute is likewise represented by a leaf <<@$>><> -// An element ... the representation is: -// <<@>> <<@>a1> <<$@>>DocID(v1)>a1> ... > .... -// Hence the attributes (if any) are always below the first child of their element, -// as the children of a fake node <@>. - - -#define DOCUMENT_OPEN_TAG "" -#define DOCUMENT_TAG_ID 0 -#define ATTRIBUTE_OPEN_TAG "<@>" -#define ATTRIBUTE_TAG_ID 1 -#define PCDATA_OPEN_TAG "<$>" -#define PCDATA_TAG_ID 2 -#define ATTRIBUTE_DATA_OPEN_TAG "<@$>" -#define ATTRIBUTE_DATA_TAG_ID 3 -#define CLOSING_TAG "" -#define CLOSING_TAG_ID 4 -#define DOCUMENT_CLOSE_TAG "/" -#define ATTRIBUTE_CLOSE_TAG "/<@>" -#define PCDATA_CLOSE_TAG "/<$>" -#define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>" - - -typedef std::unordered_set TagIdSet; -typedef std::unordered_map TagIdMap; -typedef TagIdMap::const_iterator TagIdMapIT; - -#define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\ - (v)->push_back(t); } while (false) - -// returns NULLT if the test is true -#define NULLT_IF(x) do { if (x) return NULLT; } while (0) -#define IS_NIL(x) ((x) < 0) - -// Direct calls to sarray library - -#define BUFFER_ALLOC (8192 * 2) -#define BUFFER_SIZE (BUFFER_ALLOC / 2) - -// tag position -> tree node -static treeNode tagpos2node(int t) - { - return (treeNode) t; - } -// tree node -> tag position -static int node2tagpos(treeNode x) -{ - return (int)x; -} - - -class XMLTreeBuilder; - -class XMLTree { - - // Only the builder can access the constructor - friend class XMLTreeBuilder; - - private: - /** Balanced parentheses representation of the tree */ - bp *Par; - - /** Mapping from tag identifer to tag name */ - std::vector *TagName; - TagIdMap * tIdMap; - - /** Bit vector indicating with a 1 the positions of the non-empty texts. */ - static_bitsequence *EBVector; - - /** Tag sequence represented with a data structure for rank and select */ - static_sequence *Tags; - uint * tags_fix; - uint tags_blen, tags_len; - - /** The texts in the XML document */ - TextCollection *Text; - - // Allows to disable the TextCollection for benchmarkin purposes - bool disable_tc; - SXSI::TextCollectionBuilder::index_type_t text_index_type; - - std::string *buffer; - std::vector *print_stack; - - - void _real_flush(int fd, size_t size) { - if (size == 0) return; - size_t written; - while (1) { - written = write(fd, buffer->data(), size); - if ((written < 0) && (errno == EAGAIN || errno == EINTR)) - continue; - break; - }; - buffer->clear(); - - } - - void _flush(int fd){ - size_t size = buffer->size(); - if (size < BUFFER_SIZE) return; - _real_flush(fd, size); - } - - void _dput_str(std::string s, int fd){ - buffer->append(s); - _flush(fd); - } - - void _dputs(const char* s, int fd){ - buffer->append(s); - _flush(fd); - } - - void _dputc(const char c, int fd){ - buffer->push_back(c); - } - - size_t _dprintf(const char* s, int fd){ - if (s == NULL) return 0; - size_t i; - char c; - for (i = 0; (c = s[i]); i++) { - switch (c) { - case '"': - _dputs(""", fd); - break; - case '&': - _dputs("&", fd); - break; - case '\'': - _dputs("'", fd); - break; - case '<': - _dputs("<", fd); - break; - case '>': - _dputs(">", fd); - break; - default: - _dputc(c, fd); - - }; - }; - return i; - } - - void PrintNode(treeNode n, int fd); - /** Data structure constructors */ - XMLTree(){ buffer = 0;}; - - // non const pointer are freed by this method. - XMLTree( pb * const par, - uint npar, - std::vector * const TN, - TagIdMap * const tim, uint *empty_texts_bmp, - TagType *tags, - TextCollectionBuilder * const TCB, bool dis_tc, - TextCollectionBuilder::index_type_t _index_type ); - -public: - /** Data structure destructor */ - ~XMLTree(); - - /** root(): returns the tree root. */ - treeNode Root() { return 0; } - - /** Size() : Number of parenthesis */ - unsigned int Size(){ - return tags_len/2; - } - - - /** NumTags() : Number of distinct tags */ - unsigned int NumTags() { - return TagName->size(); - } - - int TagsBinaryLength(){ return tags_blen; }; - unsigned int TagStructLength(){ return uint_len(tags_blen,tags_len); }; - unsigned int * TagStruct() { return tags_fix; }; - - - /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of - * node x. */ - int SubtreeSize(treeNode x) { return bp_subtree_size(Par, x); } - - /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree - * of node x. */ - int SubtreeTags(treeNode x, TagType tag){ - //int s = x + 2*subtree_size(Par, x) - 1; - treeNode y = bp_find_close(Par, x); - - if (y - x < 10) { - int count = 0; - for(int i = x; i < y; i++) - count += (Tag(i) == tag); - return count; - } - else - return (Tags->rank(tag, y) - Tags->rank(tag, x)); - }; - - /** SubtreeElements(x) of element nodes in the subtree of x - */ - int SubtreeElements(treeNode x); - - /** IsLeaf(x): returns whether node x is leaf or not. In the succinct - * representation this is just a bit inspection. */ - - bool IsLeaf(treeNode x); - - /** IsAncestor(x,y): returns whether node x is ancestor of node y. */ - - bool IsAncestor(treeNode x, treeNode y); - - - /** IsRigthDescendant returns true if y is a descendant of x and y is - not a descendant of the first child of x */ - bool IsRightDescendant(treeNode x, treeNode y) { - if (x <= Root()) return false; - treeNode z = bp_parent_close(Par, x); - treeNode c = bp_find_close(Par, x); - return (y > c && y < z ); - } - - - /** IsChild(x,y): returns whether node x is parent of node y. */ - bool IsChild(treeNode x, treeNode y); - - /** IsFirstChild(x): returns whether node x is the first child of its parent. */ - /* SLOW - bool IsFirstChild(treeNode x) { - return ((x != NULLT)&&(x==Root() || bp_prev_sibling(Par,x) == (treeNode)-1)); - }; - */ - - bool IsFirstChild(treeNode x) { - return (x <= Root()) || (bp_inspect(Par,x-1) == OP); - } - - /** NumChildren(x): number of children of node x. Constant time with the - * data structure of Sadakane. */ - int NumChildren(treeNode x); - - /** ChildNumber(x): returns i if node x is the i-th children of its - * parent. */ - int ChildNumber(treeNode x); - - /** Depth(x): depth of node x, a simple binary rank on the parentheses - * sequence. */ - int Depth(treeNode x); - - /** Preorder(x): returns the preorder number of node x, just regarding tree - * nodes (and not texts). */ - int Preorder(treeNode x); - - /** Postorder(x): returns the postorder number of node x, just regarding - * tree nodes (and not texts). */ - int Postorder(treeNode x); - - - /** DocIds(x): returns the range (i.e., a pair of integers) of document - * identifiers that descend from node x. */ - range DocIds(treeNode x); - - /** Parent(x): returns the parent node of node x. */ - treeNode Parent(treeNode x) { - return (x == Root()) ? NULLT : bp_parent(Par, x); - }; - - treeNode BinaryParent(treeNode x){ - if (x <= Root()) - return NULLT; - else { - treeNode prev = x - 1; - return (bp_inspect(Par, prev) == OP) ? prev : bp_find_open(Par, prev); - }; - }; - - /* Assumes x is neither 0 nor -1 */ - - /** Child(x,i): returns the i-th child of node x, assuming it exists. */ - treeNode Child(treeNode x, int i); - - - - /** LastChild(x): returns the last child of node x. */ - treeNode LastChild(treeNode x) { - NULLT_IF(x == NULLT || bp_isleaf(Par,x)); - return bp_find_open(Par, bp_find_close(Par, x)-1); - } - - /** PrevSibling(x): returns the previous sibling of node x, assuming it - * exists. */ - - treeNode PrevSibling(treeNode x) - { - NULLT_IF(x==NULLT); - return bp_prev_sibling(Par, x); - } - - - /** TaggedChild(x,tag): returns the first child of node x tagged tag, or - * NULLT if there is none. Because of the balanced-parentheses representation - * of the tree, this operation is not supported efficiently, just iterating - * among the children of node x until finding the desired child. */ - - - treeNode SelectChild(treeNode x, TagIdSet * tags); - - /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or - * NULLT if there is none. */ - - treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags); - - - - - treeNode SelectDescendant(treeNode x, TagIdSet * tags) { - NULLT_IF (x == NULLT); - treeNode fc = x+1; - if (bp_inspect(Par, fc) == CP) return NULLT; - treeNode min = NULLT; - treeNode aux; - - TagIdSet::const_iterator tagit; - for (tagit = tags->begin(); tagit != tags->end(); ++tagit) { - aux = TaggedDescendant(x, (TagType) *tagit); - if (((unsigned int) aux) < ((unsigned int) min)) min = aux; - }; - return min; - } - - /** TaggedPrec(x,tag): returns the first node tagged tag with smaller - * preorder than x and not an ancestor of x. Returns NULLT if there - * is none. */ - treeNode TaggedPreceding(treeNode x, TagType tag); - - /** TaggedFoll(x,tag): returns the first node tagged tag with larger - * preorder than x and not in the subtree of x. Returns NULLT if there - * is none. */ - treeNode TaggedFollowing(treeNode x, TagType tag); - - - - treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor); - - // treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing); - - treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode ancestor_closing) - { - - NULLT_IF(x <= 0); - - treeNode close = bp_find_close(Par,x); - - - treeNode min = NULLT; - treeNode aux; - - - TagIdSet::const_iterator tagit; - for (tagit = tags->begin(); tagit != tags->end(); ++tagit) { - - aux = tagpos2node(Tags->select_next(*tagit, close)); - - if (((unsigned int) aux) < ((unsigned int) min)) min = aux; - }; - - - return (min < ancestor_closing) ? min : NULLT; - - } - - /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged - * tag. Return NULLT is there is none. */ - treeNode TaggedAncestor(treeNode x, TagType tag); - - /** PrevText(x): returns the document identifier of the text to the left of - * node x, or NULLT if x is the root node. */ - DocID PrevText(treeNode x); - - /** NextText(x): returns the document identifier of the text to the right of - * node x, or NULLT if x is the root node. */ - DocID NextText(treeNode x); - - /** MyText(x): returns the document identifier of the text below node x, or - * NULLT if x is not a leaf node. */ - DocID MyText(treeNode x); - DocID MyTextUnsafe(treeNode x); - - /** TextXMLId(d): returns the preorder of document with identifier d in the - * tree consisting of all tree nodes and all text nodes. */ - int TextXMLId(DocID d); - - /** NodeXMLId(x): returns the preorder of node x in the tree consisting of - * all tree nodes and all text nodes. */ - int NodeXMLId(treeNode x); - - /** ParentNode(d): returns the parent node of document identifier d. */ - treeNode ParentNode(DocID d); - - treeNode PrevNode(DocID d); - - /** GetTagId(tagname): returns the tag identifier corresponding to a given - * tag name. Returns NULLT in case that the tag name does not exists. */ - TagType GetTagId(unsigned char *tagname); - - /** GetTagName(tagid): returns the tag name of a given tag identifier. - * Returns NULL in case that the tag identifier is not valid.*/ - unsigned char *GetTagName(TagType tagid); - - /** GetTagName(tagid): returns the tag name of a given tag identifier. - * The result is just a reference and should not be freed by the caller. - */ - const unsigned char *GetTagNameByRef(TagType tagid); - - /** RegisterTag adds a new tag to the tag collection this is needed - * if the query contains a tag which is not in the document, we need - * to give this new tag a fresh id and store it somewhere. A logical - * choice is here. - * We might want to use a hashtable instead of an array though. - */ - TagType RegisterTag(unsigned char *tagname); - - bool EmptyText(DocID i) { - return Text->EmptyText(i); - } - - /** Prefix(s): search for texts prefixed by string s. */ - TextCollection::document_result Prefix(uchar const *s) { - return Text->Prefix(s); - } - - /** Suffix(s): search for texts having string s as a suffix. */ - TextCollection::document_result Suffix(uchar const *s) { - return Text->Suffix(s); - } - - /** Equal(s): search for texts equal to string s. */ - TextCollection::document_result Equals(uchar const *s) { - return Text->Equal(s); - } - - /** Contains(s): search for texts containing string s. */ - TextCollection::document_result Contains(uchar const *s) { - return Text->Contains(s); - } - - /** LessThan(s): returns document identifiers for the texts that - * are lexicographically smaller than string s. */ - TextCollection::document_result LessThan(uchar const *s) { - return Text->LessThan(s); - } - - /** IsPrefix(x): returns true if there is a text prefixed by string s. */ - bool IsPrefix(uchar const *s) { - return Text->IsPrefix(s); - } - - /** IsSuffix(s): returns true if there is a text having string s as a - * suffix.*/ - bool IsSuffix(uchar const *s) { - return Text->IsSuffix(s); - } - - /** IsEqual(s): returns true if there is a text that equals given - * string s. */ - bool IsEqual(uchar const *s) { - return Text->IsEqual(s); - } - - /** IsContains(s): returns true if there is a text containing string s. */ - bool IsContains(uchar const *s) { - return Text->IsContains(s); - } - - /** IsLessThan(s): returns true if there is at least a text that is - * lexicographically smaller than string s. */ - bool IsLessThan(uchar const *s) { - return Text->IsLessThan(s); - } - - /** Count(s): Global counting */ - unsigned Count(uchar const *s) { - return Text->Count(s); - } - - /** CountPrefix(s): counting version of Prefix(s). */ - unsigned CountPrefix(uchar const *s) { - return Text->CountPrefix(s); - } - - /** CountSuffix(s): counting version of Suffix(s). */ - unsigned CountSuffix(uchar const *s) { - return Text->CountSuffix(s); - } - - /** CountEqual(s): counting version of Equal(s). */ - unsigned CountEqual(uchar const *s) { - return Text->CountEqual(s); - } - - /** CountContains(s): counting version of Contains(s). */ - unsigned CountContains(uchar const *s) { - return Text->CountContains(s); - } - - /** CountLessThan(s): counting version of LessThan(s). */ - unsigned CountLessThan(uchar const *s) { - return Text->CountLessThan(s); - } - - /** GetText(d): returns the text corresponding to document with - * id d. */ - uchar* GetText(DocID d) { - - uchar * s = Text->GetText(d); - return (s[0] == 1 ? (s+1) : s); - } - - /** GetText(i, j): returns the texts corresponding to documents with - * ids i, i+1, ..., j. Texts are separated by '\0' character. */ - // uchar* GetText(DocID i, DocID j) { - // uchar * s = Text->GetText(i, j); - // return (s[0] == 1 ? (uchar*)"" : s); - //} - - TextCollection *getTextCollection() { - return Text; - } - - /** Save: saves XML tree data structure to file. */ - void Save(int fd, char* name ); - - /** Load: loads XML tree data structure from file. sample_rate_text - * indicates the sample rate for the text search data structure. */ - static XMLTree *Load(int fd, bool load_tc, int sample_factor, char * name); - - void insertTag(TagType tag, uint position); - - void print_stats(); - - - /** Parenthesis functions */ - treeNode Closing(treeNode x); - - bool IsOpen(treeNode x); - - - /** Print procedure */ - void Print(int fd,treeNode x, bool no_text); - void Print(int fd,treeNode x) { Print(fd,x,false); } - void Flush(int fd){ if (buffer) _real_flush(fd, buffer->size()); } - - // The following are inlined here for speed - /** Tag(x): returns the tag identifier of node x. */ - - inline TagType Tag(treeNode x) const throw () { - if (tags_blen == 8) - return (TagType) (((uchar*)tags_fix)[(int) x]); - else - return get_field(tags_fix, tags_blen, x); - /* - { - size_t idxlen = x * tags_blen; - size_t j = idxlen % W; - size_t i = idxlen / W; - size_t offset = W - tags_blen; - size_t offset2 = offset - j; - size_t w = tags_fix[i]; - return (offset2 >= 0) - ? ( w << offset2 ) >> offset - : ( w >> j) | (tags_fix[i+1] << (W+offset2)) >> offset; - }; */ - - } - - /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf - */ - treeNode FirstChild(treeNode x) { - NULLT_IF(x==NULLT); - return bp_first_child(Par, x); - }; - - - /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT - * if none. - */ - treeNode FirstElement(treeNode node){ - { - NULLT_IF(node == NULLT); - treeNode x = bp_first_child(Par, node); - NULLT_IF(x == NULLT); - switch (Tag(x)){ - - case ATTRIBUTE_TAG_ID: - x = bp_next_sibling(Par,x); - if (x == NULLT || Tag(x) != PCDATA_TAG_ID) return x; - - case PCDATA_TAG_ID: - x = x+2; - return (bp_inspect(Par,x)==OP)? x : NULLT; - - default: - return x; - } - } - }; - - /** NextSibling(x): returns the next sibling of node x, or NULLT if none - * exists. */ - - treeNode NextSibling(treeNode x) { - NULLT_IF (x <= 0); - return bp_next_sibling(Par, x); - }; - - /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT - * if none. - */ - treeNode NextElement(treeNode x) - { - NULLT_IF(x <= 0); - x = bp_next_sibling(Par, x); - NULLT_IF(x == NULLT); - if (Tag(x) == PCDATA_TAG_ID){ - int y = x+2; - return (bp_inspect(Par, y) == OP) ? y : NULLT; - } - else return x; - }; - /** TaggedDesc(x,tag): returns the first node tagged tag with larger - * preorder than x and within the subtree of x. Returns NULT if there - * is none. */ - inline treeNode TaggedNext(treeNode x, TagType tag) - { - return tagpos2node(Tags->select_next(tag,node2tagpos(x))); - } - inline treeNode TaggedDescendant(treeNode x, TagType tag) - { - - int s = (int) Tags->select_next(tag,node2tagpos(x)); - NULLT_IF (s == -1); - - treeNode y = tagpos2node(s); // transforms the tag position into a node position - - return (bp_is_ancestor(Par,x,y) ? y : NULLT); - }; - - inline treeNode TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor) - { - treeNode close = bp_find_close(Par, x); - treeNode s = tagpos2node(Tags->select_next(tag, close)); - - if (ancestor == Root() || s == NULLT || s < bp_find_close(Par,ancestor)) return s; - else return NULLT; - }; - - inline treeNode TaggedFollowingBefore(treeNode x, TagType tag, treeNode ancestor_closing) - { - treeNode close = bp_find_close(Par, x); - treeNode s = tagpos2node(Tags->select_next(tag, close)); - - if (ancestor_closing == Root() || s == NULLT || s < ancestor_closing) return s; - else return NULLT; - }; - - inline treeNode NextNodeBefore(treeNode x, treeNode ancestor_closing) - { - treeNode close = bp_find_close(Par, x); - int rank = bp_rank_open(Par, close); - treeNode y = bp_select_open(Par, rank+1); - return (y < ancestor_closing) ? y : NULLT; - }; - -// TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or NULLT if there is none. -treeNode TaggedFollowingSibling(treeNode x, TagType tag) -{ - NULLT_IF(x==NULLT); - treeNode sibling = x; - TagType ctag; - while ((sibling = bp_next_sibling(Par, sibling)) != NULLT) { - ctag = Tag(sibling); - if (ctag == tag) return sibling; - } - return NULLT; -}; - -treeNode TaggedChild(treeNode x, TagType tag) -{ - - NULLT_IF(x==NULLT || bp_isleaf(Par,x)); - treeNode child; - child = bp_first_child(Par, x); - -if (Tag(child) == tag) - return child; - else - return TaggedFollowingSibling(child, tag); -}; - - -}; - - -#endif -