+++ /dev/null
-/******************************************************************************
- * Copyright (C) 2008 by Diego Arroyuelo *
- * Interface for the in-memory XQuery/XPath engine *
- * *
- * This program is free software; you can redistribute it and/or modify *
- * it under the terms of the GNU Lesser General Public License as published *
- * by the Free Software Foundation; either version 2 of the License, or *
- * (at your option) any later version. *
- * *
- * This program is distributed in the hope that it will be useful, *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
- * GNU Lesser General Public License for more details. *
- * *
- * You should have received a copy of the GNU Lesser General Public License *
- * along with this program; if not, write to the *
- * Free Software Foundation, Inc., *
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
- ******************************************************************************/
-
-#ifndef XMLTREE_H_
-#define XMLTREE_H_
-
-
-#include <unordered_set>
-#include <unordered_map>
-#include <sstream>
-#include <TextCollection/TextCollectionBuilder.h>
-
-#undef W
-#undef WW
-#undef Wminusone
-
-#include <bp/bp.h>
-#include <bp/bp-darray.h>
-#include <libcds/includes/basics.h>
-#include <libcds/includes/static_bitsequence.h>
-#include <libcds/includes/alphabet_mapper.h>
-#include <libcds/includes/static_sequence.h>
-
-using SXSI::TextCollection;
-using SXSI::TextCollectionBuilder;
-
-
-// this constant is used to efficiently compute the child operation in the tree
-#define OPTD 10
-
-#define NULLT -1
-
-#define PERM_SAMPLE 10
-
-
-typedef int treeNode;
-typedef int TagType;
-typedef int DocID;
-
-typedef struct {
- int min;
- int max;
-} range;
-
-// Encoding of the XML Document :
-// The following TAGs and IDs are fixed, "" is the tag of the root.
-// a TextNode is represented by a leaf <<$>></<$>> The DocId in the TextCollection
-// of that leaf is kept in a bit sequence.
-// a TextNode below an attribute is likewise represented by a leaf <<@$>><</@$>>
-// An element <e a1="v1" a2="v2" ... an="vn" > ...</e> the representation is:
-// <e><<@>> <<@>a1> <<$@>>DocID(v1)</<$@>></<@>a1> ... </<@>> .... </e>
-// Hence the attributes (if any) are always below the first child of their element,
-// as the children of a fake node <@>.
-
-
-#define DOCUMENT_OPEN_TAG ""
-#define DOCUMENT_TAG_ID 0
-#define ATTRIBUTE_OPEN_TAG "<@>"
-#define ATTRIBUTE_TAG_ID 1
-#define PCDATA_OPEN_TAG "<$>"
-#define PCDATA_TAG_ID 2
-#define ATTRIBUTE_DATA_OPEN_TAG "<@$>"
-#define ATTRIBUTE_DATA_TAG_ID 3
-#define CLOSING_TAG "</>"
-#define CLOSING_TAG_ID 4
-#define DOCUMENT_CLOSE_TAG "/"
-#define ATTRIBUTE_CLOSE_TAG "/<@>"
-#define PCDATA_CLOSE_TAG "/<$>"
-#define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>"
-
-
-typedef std::unordered_set<int> TagIdSet;
-typedef std::unordered_map<std::string,int> TagIdMap;
-typedef TagIdMap::const_iterator TagIdMapIT;
-
-#define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\
- (v)->push_back(t); } while (false)
-
-// returns NULLT if the test is true
-#define NULLT_IF(x) do { if (x) return NULLT; } while (0)
-#define IS_NIL(x) ((x) < 0)
-
-// Direct calls to sarray library
-
-#define BUFFER_ALLOC (8192 * 2)
-#define BUFFER_SIZE (BUFFER_ALLOC / 2)
-
-// tag position -> tree node
-static treeNode tagpos2node(int t)
- {
- return (treeNode) t;
- }
-// tree node -> tag position
-static int node2tagpos(treeNode x)
-{
- return (int)x;
-}
-
-
-class XMLTreeBuilder;
-
-class XMLTree {
-
- // Only the builder can access the constructor
- friend class XMLTreeBuilder;
-
- private:
- /** Balanced parentheses representation of the tree */
- bp *Par;
-
- /** Mapping from tag identifer to tag name */
- std::vector<std::string> *TagName;
- TagIdMap * tIdMap;
-
- /** Bit vector indicating with a 1 the positions of the non-empty texts. */
- static_bitsequence *EBVector;
-
- /** Tag sequence represented with a data structure for rank and select */
- static_sequence *Tags;
- uint * tags_fix;
- uint tags_blen, tags_len;
-
- /** The texts in the XML document */
- TextCollection *Text;
-
- // Allows to disable the TextCollection for benchmarkin purposes
- bool disable_tc;
- SXSI::TextCollectionBuilder::index_type_t text_index_type;
-
- std::string *buffer;
- std::vector<std::string *> *print_stack;
-
-
- void _real_flush(int fd, size_t size) {
- if (size == 0) return;
- size_t written;
- while (1) {
- written = write(fd, buffer->data(), size);
- if ((written < 0) && (errno == EAGAIN || errno == EINTR))
- continue;
- break;
- };
- buffer->clear();
-
- }
-
- void _flush(int fd){
- size_t size = buffer->size();
- if (size < BUFFER_SIZE) return;
- _real_flush(fd, size);
- }
-
- void _dput_str(std::string s, int fd){
- buffer->append(s);
- _flush(fd);
- }
-
- void _dputs(const char* s, int fd){
- buffer->append(s);
- _flush(fd);
- }
-
- void _dputc(const char c, int fd){
- buffer->push_back(c);
- }
-
- size_t _dprintf(const char* s, int fd){
- if (s == NULL) return 0;
- size_t i;
- char c;
- for (i = 0; (c = s[i]); i++) {
- switch (c) {
- case '"':
- _dputs(""", fd);
- break;
- case '&':
- _dputs("&", fd);
- break;
- case '\'':
- _dputs("'", fd);
- break;
- case '<':
- _dputs("<", fd);
- break;
- case '>':
- _dputs(">", fd);
- break;
- default:
- _dputc(c, fd);
-
- };
- };
- return i;
- }
-
- void PrintNode(treeNode n, int fd);
- /** Data structure constructors */
- XMLTree(){ buffer = 0;};
-
- // non const pointer are freed by this method.
- XMLTree( pb * const par,
- uint npar,
- std::vector<std::string> * const TN,
- TagIdMap * const tim, uint *empty_texts_bmp,
- TagType *tags,
- TextCollectionBuilder * const TCB, bool dis_tc,
- TextCollectionBuilder::index_type_t _index_type );
-
-public:
- /** Data structure destructor */
- ~XMLTree();
-
- /** root(): returns the tree root. */
- treeNode Root() { return 0; }
-
- /** Size() : Number of parenthesis */
- unsigned int Size(){
- return tags_len/2;
- }
-
-
- /** NumTags() : Number of distinct tags */
- unsigned int NumTags() {
- return TagName->size();
- }
-
- int TagsBinaryLength(){ return tags_blen; };
- unsigned int TagStructLength(){ return uint_len(tags_blen,tags_len); };
- unsigned int * TagStruct() { return tags_fix; };
-
-
- /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of
- * node x. */
- int SubtreeSize(treeNode x) { return bp_subtree_size(Par, x); }
-
- /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree
- * of node x. */
- int SubtreeTags(treeNode x, TagType tag){
- //int s = x + 2*subtree_size(Par, x) - 1;
- treeNode y = bp_find_close(Par, x);
-
- if (y - x < 10) {
- int count = 0;
- for(int i = x; i < y; i++)
- count += (Tag(i) == tag);
- return count;
- }
- else
- return (Tags->rank(tag, y) - Tags->rank(tag, x));
- };
-
- /** SubtreeElements(x) of element nodes in the subtree of x
- */
- int SubtreeElements(treeNode x);
-
- /** IsLeaf(x): returns whether node x is leaf or not. In the succinct
- * representation this is just a bit inspection. */
-
- bool IsLeaf(treeNode x);
-
- /** IsAncestor(x,y): returns whether node x is ancestor of node y. */
-
- bool IsAncestor(treeNode x, treeNode y);
-
-
- /** IsRigthDescendant returns true if y is a descendant of x and y is
- not a descendant of the first child of x */
- bool IsRightDescendant(treeNode x, treeNode y) {
- if (x <= Root()) return false;
- treeNode z = bp_parent_close(Par, x);
- treeNode c = bp_find_close(Par, x);
- return (y > c && y < z );
- }
-
-
- /** IsChild(x,y): returns whether node x is parent of node y. */
- bool IsChild(treeNode x, treeNode y);
-
- /** IsFirstChild(x): returns whether node x is the first child of its parent. */
- /* SLOW
- bool IsFirstChild(treeNode x) {
- return ((x != NULLT)&&(x==Root() || bp_prev_sibling(Par,x) == (treeNode)-1));
- };
- */
-
- bool IsFirstChild(treeNode x) {
- return (x <= Root()) || (bp_inspect(Par,x-1) == OP);
- }
-
- /** NumChildren(x): number of children of node x. Constant time with the
- * data structure of Sadakane. */
- int NumChildren(treeNode x);
-
- /** ChildNumber(x): returns i if node x is the i-th children of its
- * parent. */
- int ChildNumber(treeNode x);
-
- /** Depth(x): depth of node x, a simple binary rank on the parentheses
- * sequence. */
- int Depth(treeNode x);
-
- /** Preorder(x): returns the preorder number of node x, just regarding tree
- * nodes (and not texts). */
- int Preorder(treeNode x);
-
- /** Postorder(x): returns the postorder number of node x, just regarding
- * tree nodes (and not texts). */
- int Postorder(treeNode x);
-
-
- /** DocIds(x): returns the range (i.e., a pair of integers) of document
- * identifiers that descend from node x. */
- range DocIds(treeNode x);
-
- /** Parent(x): returns the parent node of node x. */
- treeNode Parent(treeNode x) {
- return (x == Root()) ? NULLT : bp_parent(Par, x);
- };
-
- treeNode BinaryParent(treeNode x){
- if (x <= Root())
- return NULLT;
- else {
- treeNode prev = x - 1;
- return (bp_inspect(Par, prev) == OP) ? prev : bp_find_open(Par, prev);
- };
- };
-
- /* Assumes x is neither 0 nor -1 */
-
- /** Child(x,i): returns the i-th child of node x, assuming it exists. */
- treeNode Child(treeNode x, int i);
-
-
-
- /** LastChild(x): returns the last child of node x. */
- treeNode LastChild(treeNode x) {
- NULLT_IF(x == NULLT || bp_isleaf(Par,x));
- return bp_find_open(Par, bp_find_close(Par, x)-1);
- }
-
- /** PrevSibling(x): returns the previous sibling of node x, assuming it
- * exists. */
-
- treeNode PrevSibling(treeNode x)
- {
- NULLT_IF(x==NULLT);
- return bp_prev_sibling(Par, x);
- }
-
-
- /** TaggedChild(x,tag): returns the first child of node x tagged tag, or
- * NULLT if there is none. Because of the balanced-parentheses representation
- * of the tree, this operation is not supported efficiently, just iterating
- * among the children of node x until finding the desired child. */
-
-
- treeNode SelectChild(treeNode x, TagIdSet * tags);
-
- /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or
- * NULLT if there is none. */
-
- treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags);
-
-
-
-
- treeNode SelectDescendant(treeNode x, TagIdSet * tags) {
- NULLT_IF (x == NULLT);
- treeNode fc = x+1;
- if (bp_inspect(Par, fc) == CP) return NULLT;
- treeNode min = NULLT;
- treeNode aux;
-
- TagIdSet::const_iterator tagit;
- for (tagit = tags->begin(); tagit != tags->end(); ++tagit) {
- aux = TaggedDescendant(x, (TagType) *tagit);
- if (((unsigned int) aux) < ((unsigned int) min)) min = aux;
- };
- return min;
- }
-
- /** TaggedPrec(x,tag): returns the first node tagged tag with smaller
- * preorder than x and not an ancestor of x. Returns NULLT if there
- * is none. */
- treeNode TaggedPreceding(treeNode x, TagType tag);
-
- /** TaggedFoll(x,tag): returns the first node tagged tag with larger
- * preorder than x and not in the subtree of x. Returns NULLT if there
- * is none. */
- treeNode TaggedFollowing(treeNode x, TagType tag);
-
-
-
- treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor);
-
- // treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing);
-
- treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode ancestor_closing)
- {
-
- NULLT_IF(x <= 0);
-
- treeNode close = bp_find_close(Par,x);
-
-
- treeNode min = NULLT;
- treeNode aux;
-
-
- TagIdSet::const_iterator tagit;
- for (tagit = tags->begin(); tagit != tags->end(); ++tagit) {
-
- aux = tagpos2node(Tags->select_next(*tagit, close));
-
- if (((unsigned int) aux) < ((unsigned int) min)) min = aux;
- };
-
-
- return (min < ancestor_closing) ? min : NULLT;
-
- }
-
- /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged
- * tag. Return NULLT is there is none. */
- treeNode TaggedAncestor(treeNode x, TagType tag);
-
- /** PrevText(x): returns the document identifier of the text to the left of
- * node x, or NULLT if x is the root node. */
- DocID PrevText(treeNode x);
-
- /** NextText(x): returns the document identifier of the text to the right of
- * node x, or NULLT if x is the root node. */
- DocID NextText(treeNode x);
-
- /** MyText(x): returns the document identifier of the text below node x, or
- * NULLT if x is not a leaf node. */
- DocID MyText(treeNode x);
- DocID MyTextUnsafe(treeNode x);
-
- /** TextXMLId(d): returns the preorder of document with identifier d in the
- * tree consisting of all tree nodes and all text nodes. */
- int TextXMLId(DocID d);
-
- /** NodeXMLId(x): returns the preorder of node x in the tree consisting of
- * all tree nodes and all text nodes. */
- int NodeXMLId(treeNode x);
-
- /** ParentNode(d): returns the parent node of document identifier d. */
- treeNode ParentNode(DocID d);
-
- treeNode PrevNode(DocID d);
-
- /** GetTagId(tagname): returns the tag identifier corresponding to a given
- * tag name. Returns NULLT in case that the tag name does not exists. */
- TagType GetTagId(unsigned char *tagname);
-
- /** GetTagName(tagid): returns the tag name of a given tag identifier.
- * Returns NULL in case that the tag identifier is not valid.*/
- unsigned char *GetTagName(TagType tagid);
-
- /** GetTagName(tagid): returns the tag name of a given tag identifier.
- * The result is just a reference and should not be freed by the caller.
- */
- const unsigned char *GetTagNameByRef(TagType tagid);
-
- /** RegisterTag adds a new tag to the tag collection this is needed
- * if the query contains a tag which is not in the document, we need
- * to give this new tag a fresh id and store it somewhere. A logical
- * choice is here.
- * We might want to use a hashtable instead of an array though.
- */
- TagType RegisterTag(unsigned char *tagname);
-
- bool EmptyText(DocID i) {
- return Text->EmptyText(i);
- }
-
- /** Prefix(s): search for texts prefixed by string s. */
- TextCollection::document_result Prefix(uchar const *s) {
- return Text->Prefix(s);
- }
-
- /** Suffix(s): search for texts having string s as a suffix. */
- TextCollection::document_result Suffix(uchar const *s) {
- return Text->Suffix(s);
- }
-
- /** Equal(s): search for texts equal to string s. */
- TextCollection::document_result Equals(uchar const *s) {
- return Text->Equal(s);
- }
-
- /** Contains(s): search for texts containing string s. */
- TextCollection::document_result Contains(uchar const *s) {
- return Text->Contains(s);
- }
-
- /** LessThan(s): returns document identifiers for the texts that
- * are lexicographically smaller than string s. */
- TextCollection::document_result LessThan(uchar const *s) {
- return Text->LessThan(s);
- }
-
- /** IsPrefix(x): returns true if there is a text prefixed by string s. */
- bool IsPrefix(uchar const *s) {
- return Text->IsPrefix(s);
- }
-
- /** IsSuffix(s): returns true if there is a text having string s as a
- * suffix.*/
- bool IsSuffix(uchar const *s) {
- return Text->IsSuffix(s);
- }
-
- /** IsEqual(s): returns true if there is a text that equals given
- * string s. */
- bool IsEqual(uchar const *s) {
- return Text->IsEqual(s);
- }
-
- /** IsContains(s): returns true if there is a text containing string s. */
- bool IsContains(uchar const *s) {
- return Text->IsContains(s);
- }
-
- /** IsLessThan(s): returns true if there is at least a text that is
- * lexicographically smaller than string s. */
- bool IsLessThan(uchar const *s) {
- return Text->IsLessThan(s);
- }
-
- /** Count(s): Global counting */
- unsigned Count(uchar const *s) {
- return Text->Count(s);
- }
-
- /** CountPrefix(s): counting version of Prefix(s). */
- unsigned CountPrefix(uchar const *s) {
- return Text->CountPrefix(s);
- }
-
- /** CountSuffix(s): counting version of Suffix(s). */
- unsigned CountSuffix(uchar const *s) {
- return Text->CountSuffix(s);
- }
-
- /** CountEqual(s): counting version of Equal(s). */
- unsigned CountEqual(uchar const *s) {
- return Text->CountEqual(s);
- }
-
- /** CountContains(s): counting version of Contains(s). */
- unsigned CountContains(uchar const *s) {
- return Text->CountContains(s);
- }
-
- /** CountLessThan(s): counting version of LessThan(s). */
- unsigned CountLessThan(uchar const *s) {
- return Text->CountLessThan(s);
- }
-
- /** GetText(d): returns the text corresponding to document with
- * id d. */
- uchar* GetText(DocID d) {
-
- uchar * s = Text->GetText(d);
- return (s[0] == 1 ? (s+1) : s);
- }
-
- /** GetText(i, j): returns the texts corresponding to documents with
- * ids i, i+1, ..., j. Texts are separated by '\0' character. */
- // uchar* GetText(DocID i, DocID j) {
- // uchar * s = Text->GetText(i, j);
- // return (s[0] == 1 ? (uchar*)"" : s);
- //}
-
- TextCollection *getTextCollection() {
- return Text;
- }
-
- /** Save: saves XML tree data structure to file. */
- void Save(int fd, char* name );
-
- /** Load: loads XML tree data structure from file. sample_rate_text
- * indicates the sample rate for the text search data structure. */
- static XMLTree *Load(int fd, bool load_tc, int sample_factor, char * name);
-
- void insertTag(TagType tag, uint position);
-
- void print_stats();
-
-
- /** Parenthesis functions */
- treeNode Closing(treeNode x);
-
- bool IsOpen(treeNode x);
-
-
- /** Print procedure */
- void Print(int fd,treeNode x, bool no_text);
- void Print(int fd,treeNode x) { Print(fd,x,false); }
- void Flush(int fd){ if (buffer) _real_flush(fd, buffer->size()); }
-
- // The following are inlined here for speed
- /** Tag(x): returns the tag identifier of node x. */
-
- inline TagType Tag(treeNode x) const throw () {
- if (tags_blen == 8)
- return (TagType) (((uchar*)tags_fix)[(int) x]);
- else
- return get_field(tags_fix, tags_blen, x);
- /*
- {
- size_t idxlen = x * tags_blen;
- size_t j = idxlen % W;
- size_t i = idxlen / W;
- size_t offset = W - tags_blen;
- size_t offset2 = offset - j;
- size_t w = tags_fix[i];
- return (offset2 >= 0)
- ? ( w << offset2 ) >> offset
- : ( w >> j) | (tags_fix[i+1] << (W+offset2)) >> offset;
- }; */
-
- }
-
- /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf
- */
- treeNode FirstChild(treeNode x) {
- NULLT_IF(x==NULLT);
- return bp_first_child(Par, x);
- };
-
-
- /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT
- * if none.
- */
- treeNode FirstElement(treeNode node){
- {
- NULLT_IF(node == NULLT);
- treeNode x = bp_first_child(Par, node);
- NULLT_IF(x == NULLT);
- switch (Tag(x)){
-
- case ATTRIBUTE_TAG_ID:
- x = bp_next_sibling(Par,x);
- if (x == NULLT || Tag(x) != PCDATA_TAG_ID) return x;
-
- case PCDATA_TAG_ID:
- x = x+2;
- return (bp_inspect(Par,x)==OP)? x : NULLT;
-
- default:
- return x;
- }
- }
- };
-
- /** NextSibling(x): returns the next sibling of node x, or NULLT if none
- * exists. */
-
- treeNode NextSibling(treeNode x) {
- NULLT_IF (x <= 0);
- return bp_next_sibling(Par, x);
- };
-
- /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT
- * if none.
- */
- treeNode NextElement(treeNode x)
- {
- NULLT_IF(x <= 0);
- x = bp_next_sibling(Par, x);
- NULLT_IF(x == NULLT);
- if (Tag(x) == PCDATA_TAG_ID){
- int y = x+2;
- return (bp_inspect(Par, y) == OP) ? y : NULLT;
- }
- else return x;
- };
- /** TaggedDesc(x,tag): returns the first node tagged tag with larger
- * preorder than x and within the subtree of x. Returns NULT if there
- * is none. */
- inline treeNode TaggedNext(treeNode x, TagType tag)
- {
- return tagpos2node(Tags->select_next(tag,node2tagpos(x)));
- }
- inline treeNode TaggedDescendant(treeNode x, TagType tag)
- {
-
- int s = (int) Tags->select_next(tag,node2tagpos(x));
- NULLT_IF (s == -1);
-
- treeNode y = tagpos2node(s); // transforms the tag position into a node position
-
- return (bp_is_ancestor(Par,x,y) ? y : NULLT);
- };
-
- inline treeNode TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor)
- {
- treeNode close = bp_find_close(Par, x);
- treeNode s = tagpos2node(Tags->select_next(tag, close));
-
- if (ancestor == Root() || s == NULLT || s < bp_find_close(Par,ancestor)) return s;
- else return NULLT;
- };
-
- inline treeNode TaggedFollowingBefore(treeNode x, TagType tag, treeNode ancestor_closing)
- {
- treeNode close = bp_find_close(Par, x);
- treeNode s = tagpos2node(Tags->select_next(tag, close));
-
- if (ancestor_closing == Root() || s == NULLT || s < ancestor_closing) return s;
- else return NULLT;
- };
-
- inline treeNode NextNodeBefore(treeNode x, treeNode ancestor_closing)
- {
- treeNode close = bp_find_close(Par, x);
- int rank = bp_rank_open(Par, close);
- treeNode y = bp_select_open(Par, rank+1);
- return (y < ancestor_closing) ? y : NULLT;
- };
-
-// TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or NULLT if there is none.
-treeNode TaggedFollowingSibling(treeNode x, TagType tag)
-{
- NULLT_IF(x==NULLT);
- treeNode sibling = x;
- TagType ctag;
- while ((sibling = bp_next_sibling(Par, sibling)) != NULLT) {
- ctag = Tag(sibling);
- if (ctag == tag) return sibling;
- }
- return NULLT;
-};
-
-treeNode TaggedChild(treeNode x, TagType tag)
-{
-
- NULLT_IF(x==NULLT || bp_isleaf(Par,x));
- treeNode child;
- child = bp_first_child(Par, x);
-
-if (Tag(child) == tag)
- return child;
- else
- return TaggedFollowingSibling(child, tag);
-};
-
-
-};
-
-
-#endif
-