X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTree.h;h=351db204fd81f1d8e3301c7fcedce62d11847b47;hb=d79d6498e2d585560d915592ef59f3ad6a57b3c7;hp=f605286662ab0897722246ca456809411863248f;hpb=efe894650813a19a0e1408eb5807e59f037afc3b;p=SXSI%2FXMLTree.git diff --git a/XMLTree.h b/XMLTree.h index f605286..351db20 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -1,349 +1,769 @@ - -/****************************************************************************** - * Copyright (C) 2008 by Diego Arroyuelo * - * Interface for the in-memory XQuery/XPath engine * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU Lesser General Public License as published * - * by the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. * - * * - * You should have received a copy of the GNU Lesser General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - ******************************************************************************/ - -#ifndef XMLTREE_H_ -#define XMLTREE_H_ -#include "TextCollection/TextCollection.h" -#include -#include -#include -#include "bp.h" -#include -#include -#include -using SXSI::TextCollection; - - -// this constant is used to efficiently compute the child operation in the tree -#define OPTD 10 - -#define NULLT -1 - - // sets bit p in e -#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) - // cleans bit p in e -#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) - - -typedef int treeNode; -typedef int TagType; -typedef int DocID; - -typedef struct { - int min; - int max; -} range; - - -class XMLTree { - /** Balanced parentheses representation of the tree */ - bp *Par; - - /** Mapping from tag identifer to tag name */ - unsigned char **TagName; - - /** boolean flag indicating whether we are indexing empty texts or not */ - bool indexing_empty_texts; - - /** Bit vector indicating with a 1 the positions of the non-empty texts. */ - static_bitsequence_rrr02 *EBVector; - - /** Tag sequence represented with a data structure for rank and select */ - static_sequence_wvtree *Tags; - - /** The texts in the XML document */ - TextCollection *Text; - - /** Flag indicating whether the whole data structure has been constructed or - * not. If the value is true, you cannot add more texts, nodes, etc. */ - bool finished; - - /** Flag indicating whether the construction of the data structure has been - * initialized or not (by calling method OpenDocument()). If this is true, - * you cannot insert new tags or texts. */ - bool initialized; - - /* the following components are used for construction purposes */ - pb *par_aux; - TagType *tags_aux; - int npar; - int ntagnames; - unsigned int *empty_texts_aux; - -public: - - /** Data structure constructor */ - XMLTree() {finished = false; initialized = false;}; - - /** Data structure destructor */ - ~XMLTree(); - - /** root(): returns the tree root. */ - treeNode Root(); - - /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of - * node x. */ - int SubtreeSize(treeNode x); - - /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree - * of node x. */ - int SubtreeTags(treeNode x, TagType tag); - - /** IsLeaf(x): returns whether node x is leaf or not. In the succinct - * representation this is just a bit inspection. */ - bool IsLeaf(treeNode x); - - /** IsAncestor(x,y): returns whether node x is ancestor of node y. */ - bool IsAncestor(treeNode x, treeNode y); - - /** IsChild(x,y): returns whether node x is parent of node y. */ - bool IsChild(treeNode x, treeNode y); - - /** NumChildren(x): number of children of node x. Constant time with the - * data structure of Sadakane. */ - int NumChildren(treeNode x); - - /** ChildNumber(x): returns i if node x is the i-th children of its - * parent. */ - inline int ChildNumber(treeNode x); - - /** Depth(x): depth of node x, a simple binary rank on the parentheses - * sequence. */ - int Depth(treeNode x); - - /** Preorder(x): returns the preorder number of node x, just regarding tree - * nodes (and not texts). */ - int Preorder(treeNode x); - - /** Postorder(x): returns the postorder number of node x, just regarding - * tree nodes (and not texts). */ - int Postorder(treeNode x); - - /** Tag(x): returns the tag identifier of node x. */ - TagType Tag(treeNode x); - - /** DocIds(x): returns the range (i.e., a pair of integers) of document - * identifiers that descend from node x. */ - range DocIds(treeNode x); - - /** Parent(x): returns the parent node of node x. */ - treeNode Parent(treeNode x); - - /** Child(x,i): returns the i-th child of node x, assuming it exists. */ - treeNode Child(treeNode x, int i); - - /** FirstChild(x): returns the first child of node x, assuming it exists. - * Very fast in BP. */ - treeNode FirstChild(treeNode x); - - /** NextSibling(x): returns the next sibling of node x, assuming it - * exists. */ - treeNode NextSibling(treeNode x); - - /** PrevSibling(x): returns the previous sibling of node x, assuming it - * exists. */ - treeNode PrevSibling(treeNode x); - - /** TaggedChild(x,i,tag): returns the i-th child of node x tagged tag, or - * NULLT if there is none. Because of the balanced-parentheses representation - * of the tree, this operation is not supported efficiently, just iterating - * among the children of node x until finding the desired child. */ - treeNode TaggedChild(treeNode x, int i, TagType tag); - - /** TaggedDesc(x,tag): returns the first node tagged tag with larger - * preorder than x and within the subtree of x. Returns NULT if there - * is none. */ - treeNode TaggedDesc(treeNode x, TagType tag); - - /** TaggedPrec(x,tag): returns the first node tagged tag with smaller - * preorder than x and not an ancestor of x. Returns NULLT if there - * is none. */ - treeNode TaggedPrec(treeNode x, TagType tag); - - /** TaggedFoll(x,tag): returns the first node tagged tag with larger - * preorder than x and not in the subtree of x. Returns NULLT if there - * is none. */ - treeNode TaggedFoll(treeNode x, TagType tag); - - /** PrevText(x): returns the document identifier of the text to the left of - * node x, or NULLT if x is the root node. */ - DocID PrevText(treeNode x); - - /** NextText(x): returns the document identifier of the text to the right of - * node x, or NULLT if x is the root node. */ - DocID NextText(treeNode x); - - /** MyText(x): returns the document identifier of the text below node x, or - * NULLT if x is not a leaf node. */ - DocID MyText(treeNode x); - - /** TextXMLId(d): returns the preorder of document with identifier d in the - * tree consisting of all tree nodes and all text nodes. */ - int TextXMLId(DocID d); - - /** NodeXMLId(x): returns the preorder of node x in the tree consisting of - * all tree nodes and all text nodes. */ - int NodeXMLId(treeNode x); - - /** ParentNode(d): returns the parent node of document identifier d. */ - treeNode ParentNode(DocID d); - - /** OpenDocument(empty_texts,sample_rate_text): initilizes the construction - * of the data structure for the XML document. Parameter empty_texts - * indicates whether we index empty texts in document or not. Parameter - * sample_rate_text indicates the sampling rate for the text searching data - * structures (small values get faster searching but a bigger space - * requirement). Returns a non-zero value upon success, NULLT in case of - * error. */ - int OpenDocument(bool empty_texts, int sample_rate_text); - - /** CloseDocument(): finishes the construction of the data structure for - * the XML document. Tree and tags are represented in the final form, - * dynamic data structures are made static, and the flag "finished" is set - * to true. After that, the data structure can be queried. */ - int CloseDocument(); - - /** NewOpenTag(tagname): indicates the event of finding a new opening tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewOpenTag(unsigned char *tagname); - - /** NewClosingTag(tagname): indicates the event of finding a new closing tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewClosingTag(unsigned char *tagname); - - /** NewText(s): indicates the event of finding a new (non-empty) text s in - * the document. The new text is inserted within the text collection. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewText(unsigned char *s); - - /** NewEmptyText(): indicates the event of finding a new empty text in the - * document. In case of indexing empty and non-empty texts, we insert the - * empty texts into the text collection. In case of indexing only non-empty - * texts, it just indicates an empty text in the bit vector of empty texts. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewEmptyText(); - - /** GetTagId(tagname): returns the tag identifier corresponding to a given - * tag name. Returns NULLT in case that the tag name does not exists. */ - TagType GetTagId(unsigned char *tagname); - - /** GetTagName(tagid): returns the tag name of a given tag identifier. - * Returns NULL in case that the tag identifier is not valid.*/ - unsigned char *GetTagName(TagType tagid); - - /** Prefix(s): search for texts prefixed by string s. */ - TextCollection::document_result Prefix(uchar const *s) { - return Text->Prefix(s); - } - - /** Suffix(s): search for texts having string s as a suffix. */ - TextCollection::document_result Suffix(uchar const *s) { - return Text->Suffix(s); - } - - /** Equal(s): search for texts equal to string s. */ - TextCollection::document_result Equal(uchar const *s) { - return Text->Equal(s); - } - - /** Contains(s): search for texts containing string s. */ - TextCollection::document_result Contains(uchar const *s) { - return Text->Contains(s); - } - - /** LessThan(s): returns document identifiers for the texts that - * are lexicographically smaller than string s. */ - TextCollection::document_result LessThan(uchar const *s) { - return Text->LessThan(s); - } - - /** IsPrefix(x): returns true if there is a text prefixed by string s. */ - bool IsPrefix(uchar const *s) { - return Text->IsPrefix(s); - } - - /** IsSuffix(s): returns true if there is a text having string s as a - * suffix.*/ - bool IsSuffix(uchar const *s) { - return Text->IsSuffix(s); - } - - /** IsEqual(s): returns true if there is a text that equals given - * string s. */ - bool IsEqual(uchar const *s) { - return Text->IsEqual(s); - } - - /** IsContains(s): returns true if there is a text containing string s. */ - bool IsContains(uchar const *s) { - return Text->IsContains(s); - } - - /** IsLessThan(s): returns true if there is at least a text that is - * lexicographically smaller than string s. */ - bool IsLessThan(uchar const *s) { - return Text->IsLessThan(s); - } - - /** CountPrefix(s): counting version of Prefix(s). */ - unsigned CountPrefix(uchar const *s) { - return Text->CountPrefix(s); - } - - /** CountSuffix(s): counting version of Suffix(s). */ - unsigned CountSuffix(uchar const *s) { - return Text->CountSuffix(s); - } - - /** CountEqual(s): counting version of Equal(s). */ - unsigned CountEqual(uchar const *s) { - return Text->CountEqual(s); - } - - /** CountContains(s): counting version of Contains(s). */ - unsigned CountContains(uchar const *s) { - return Text->CountContains(s); - } - - /** CountLessThan(s): counting version of LessThan(s). */ - unsigned CountLessThan(uchar const *s) { - return CountLessThan(s); - } - - /** GetText(d): returns the text corresponding to document with - * id d. */ - uchar* GetText(DocID d) { - return Text->GetText(d); - } - - TextCollection *getTextCollection() { - return Text; - } - /** Save: saves XML tree data structure to file. */ - void Save(unsigned char *filename); - - /** Load: loads XML tree data structure from file. sample_rate_text - * indicates the sample rate for the text search data structure. */ - static XMLTree *Load(unsigned char *filename, int sample_rate_text); -}; -#endif +/****************************************************************************** + * Copyright (C) 2008 by Diego Arroyuelo * + * Interface for the in-memory XQuery/XPath engine * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +#ifndef XMLTREE_H_ +#define XMLTREE_H_ + + +#include +#include +#include +#include + +#undef W +#undef WW +#undef Wminusone + +#include +#include +#include +#include +#include +#include + +using SXSI::TextCollection; +using SXSI::TextCollectionBuilder; + + +// this constant is used to efficiently compute the child operation in the tree +#define OPTD 10 + +#define NULLT -1 + +#define PERM_SAMPLE 10 + + +typedef int treeNode; +typedef int TagType; +typedef int DocID; + +typedef struct { + int min; + int max; +} range; + +// Encoding of the XML Document : +// The following TAGs and IDs are fixed, "" is the tag of the root. +// a TextNode is represented by a leaf <<$>>> The DocId in the TextCollection +// of that leaf is kept in a bit sequence. +// a TextNode below an attribute is likewise represented by a leaf <<@$>><> +// An element ... the representation is: +// <<@>> <<@>a1> <<$@>>DocID(v1)>a1> ... > .... +// Hence the attributes (if any) are always below the first child of their element, +// as the children of a fake node <@>. + + +#define DOCUMENT_OPEN_TAG "" +#define DOCUMENT_TAG_ID 0 +#define ATTRIBUTE_OPEN_TAG "<@>" +#define ATTRIBUTE_TAG_ID 1 +#define PCDATA_OPEN_TAG "<$>" +#define PCDATA_TAG_ID 2 +#define ATTRIBUTE_DATA_OPEN_TAG "<@$>" +#define ATTRIBUTE_DATA_TAG_ID 3 +#define CLOSING_TAG "" +#define CLOSING_TAG_ID 4 +#define DOCUMENT_CLOSE_TAG "/" +#define ATTRIBUTE_CLOSE_TAG "/<@>" +#define PCDATA_CLOSE_TAG "/<$>" +#define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>" + + +typedef std::unordered_set TagIdSet; +typedef std::unordered_map TagIdMap; +typedef TagIdMap::const_iterator TagIdMapIT; + +#define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\ + (v)->push_back(t); } while (false) + +// returns NULLT if the test is true +#define NULLT_IF(x) do { if (x) return NULLT; } while (0) +#define IS_NIL(x) ((x) < 0) + +// Direct calls to sarray library + +#define BUFFER_ALLOC (8192 * 2) +#define BUFFER_SIZE (BUFFER_ALLOC / 2) + +// tag position -> tree node +static treeNode tagpos2node(int t) + { + return (treeNode) t; + } +// tree node -> tag position +static int node2tagpos(treeNode x) +{ + return (int)x; +} + + +class XMLTreeBuilder; + +class XMLTree { + + // Only the builder can access the constructor + friend class XMLTreeBuilder; + + private: + /** Balanced parentheses representation of the tree */ + bp *Par; + + /** Mapping from tag identifer to tag name */ + std::vector *TagName; + TagIdMap * tIdMap; + + /** Bit vector indicating with a 1 the positions of the non-empty texts. */ + static_bitsequence *EBVector; + + /** Tag sequence represented with a data structure for rank and select */ + static_sequence *Tags; + uint * tags_fix; + uint tags_blen, tags_len; + + /** The texts in the XML document */ + TextCollection *Text; + + // Allows to disable the TextCollection for benchmarkin purposes + bool disable_tc; + SXSI::TextCollectionBuilder::index_type_t text_index_type; + + std::string *buffer; + std::vector *print_stack; + + + void _real_flush(int fd, size_t size) { + if (size == 0) return; + size_t written; + while (1) { + written = write(fd, buffer->data(), size); + if ((written < 0) && (errno == EAGAIN || errno == EINTR)) + continue; + break; + }; + buffer->clear(); + + } + + void _flush(int fd){ + size_t size = buffer->size(); + if (size < BUFFER_SIZE) return; + _real_flush(fd, size); + } + + void _dput_str(std::string s, int fd){ + buffer->append(s); + _flush(fd); + } + + void _dputs(const char* s, int fd){ + buffer->append(s); + _flush(fd); + } + + void _dputc(const char c, int fd){ + buffer->push_back(c); + } + + size_t _dprintf(const char* s, int fd){ + if (s == NULL) return 0; + size_t i; + char c; + for (i = 0; (c = s[i]); i++) { + switch (c) { + case '"': + _dputs(""", fd); + break; + case '&': + _dputs("&", fd); + break; + case '\'': + _dputs("'", fd); + break; + case '<': + _dputs("<", fd); + break; + case '>': + _dputs(">", fd); + break; + default: + _dputc(c, fd); + + }; + }; + return i; + } + + void PrintNode(treeNode n, int fd); + /** Data structure constructors */ + XMLTree(){ buffer = 0;}; + + // non const pointer are freed by this method. + XMLTree( pb * const par, + uint npar, + std::vector * const TN, + TagIdMap * const tim, uint *empty_texts_bmp, + TagType *tags, + TextCollectionBuilder * const TCB, bool dis_tc, + TextCollectionBuilder::index_type_t _index_type ); + +public: + /** Data structure destructor */ + ~XMLTree(); + + /** root(): returns the tree root. */ + treeNode Root() { return 0; } + + /** Size() : Number of parenthesis */ + unsigned int Size(){ + return tags_len/2; + } + + + /** NumTags() : Number of distinct tags */ + unsigned int NumTags() { + return TagName->size(); + } + + int TagsBinaryLength(){ return tags_blen; }; + unsigned int TagStructLength(){ return uint_len(tags_blen,tags_len); }; + unsigned int * TagStruct() { return tags_fix; }; + + + /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of + * node x. */ + int SubtreeSize(treeNode x) { return bp_subtree_size(Par, x); } + + /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree + * of node x. */ + int SubtreeTags(treeNode x, TagType tag){ + //int s = x + 2*subtree_size(Par, x) - 1; + treeNode y = bp_find_close(Par, x); + + if (y - x < 10) { + int count = 0; + for(int i = x; i < y; i++) + count += (Tag(i) == tag); + return count; + } + else + return (Tags->rank(tag, y) - Tags->rank(tag, x)); + }; + + /** SubtreeElements(x) of element nodes in the subtree of x + */ + int SubtreeElements(treeNode x); + + /** IsLeaf(x): returns whether node x is leaf or not. In the succinct + * representation this is just a bit inspection. */ + + bool IsLeaf(treeNode x); + + /** IsAncestor(x,y): returns whether node x is ancestor of node y. */ + + bool IsAncestor(treeNode x, treeNode y); + + + /** IsRigthDescendant returns true if y is a descendant of x and y is + not a descendant of the first child of x */ + bool IsRightDescendant(treeNode x, treeNode y) { + if (x <= Root()) return false; + treeNode z = bp_parent_close(Par, x); + treeNode c = bp_find_close(Par, x); + return (y > c && y < z ); + } + + + /** IsChild(x,y): returns whether node x is parent of node y. */ + bool IsChild(treeNode x, treeNode y); + + /** IsFirstChild(x): returns whether node x is the first child of its parent. */ + /* OCAML */ + bool IsFirstChild(treeNode x) { + return ((x != NULLT)&&(x==Root() || bp_prev_sibling(Par,x) == (treeNode)-1)); + }; + + /** NumChildren(x): number of children of node x. Constant time with the + * data structure of Sadakane. */ + int NumChildren(treeNode x); + + /** ChildNumber(x): returns i if node x is the i-th children of its + * parent. */ + int ChildNumber(treeNode x); + + /** Depth(x): depth of node x, a simple binary rank on the parentheses + * sequence. */ + int Depth(treeNode x); + + /** Preorder(x): returns the preorder number of node x, just regarding tree + * nodes (and not texts). */ + int Preorder(treeNode x); + + /** Postorder(x): returns the postorder number of node x, just regarding + * tree nodes (and not texts). */ + int Postorder(treeNode x); + + + /** DocIds(x): returns the range (i.e., a pair of integers) of document + * identifiers that descend from node x. */ + range DocIds(treeNode x); + + /** Parent(x): returns the parent node of node x. */ + treeNode Parent(treeNode x) { + return (x == Root()) ? NULLT : bp_parent(Par, x); + }; + + treeNode BinaryParent(treeNode x){ + if (x <= Root()) + return NULLT; + else { + treeNode prev = x - 1; + return (bp_inspect(Par, prev) == OP) ? prev : bp_find_open(Par, prev); + }; + }; + + /* Assumes x is neither 0 nor -1 */ + + /** Child(x,i): returns the i-th child of node x, assuming it exists. */ + treeNode Child(treeNode x, int i); + + + + /** LastChild(x): returns the last child of node x. */ + treeNode LastChild(treeNode x) { + NULLT_IF(x == NULLT || bp_isleaf(Par,x)); + return bp_find_open(Par, bp_find_close(Par, x)-1); + } + + /** PrevSibling(x): returns the previous sibling of node x, assuming it + * exists. */ + + treeNode PrevSibling(treeNode x) + { + NULLT_IF(x==NULLT); + return bp_prev_sibling(Par, x); + } + + + /** TaggedChild(x,tag): returns the first child of node x tagged tag, or + * NULLT if there is none. Because of the balanced-parentheses representation + * of the tree, this operation is not supported efficiently, just iterating + * among the children of node x until finding the desired child. */ + + + treeNode SelectChild(treeNode x, TagIdSet * tags); + + /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or + * NULLT if there is none. */ + + treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags); + + + + + treeNode SelectDescendant(treeNode x, TagIdSet * tags) { + NULLT_IF (x == NULLT); + treeNode fc = x+1; + if (bp_inspect(Par, fc) == CP) return NULLT; + treeNode min = NULLT; + treeNode aux; + + TagIdSet::const_iterator tagit; + for (tagit = tags->begin(); tagit != tags->end(); ++tagit) { + aux = TaggedDescendant(x, (TagType) *tagit); + if (((unsigned int) aux) < ((unsigned int) min)) min = aux; + }; + return min; + } + + /** TaggedPrec(x,tag): returns the first node tagged tag with smaller + * preorder than x and not an ancestor of x. Returns NULLT if there + * is none. */ + treeNode TaggedPreceding(treeNode x, TagType tag); + + /** TaggedFoll(x,tag): returns the first node tagged tag with larger + * preorder than x and not in the subtree of x. Returns NULLT if there + * is none. */ + treeNode TaggedFollowing(treeNode x, TagType tag); + + + + treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor); + + // treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing); + + treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode ancestor_closing) + { + + NULLT_IF(x <= 0); + + treeNode close = bp_find_close(Par,x); + + + treeNode min = NULLT; + treeNode aux; + + + TagIdSet::const_iterator tagit; + for (tagit = tags->begin(); tagit != tags->end(); ++tagit) { + + aux = tagpos2node(Tags->select_next(*tagit, close)); + + if (((unsigned int) aux) < ((unsigned int) min)) min = aux; + }; + + + return (min < ancestor_closing) ? min : NULLT; + + } + + /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged + * tag. Return NULLT is there is none. */ + treeNode TaggedAncestor(treeNode x, TagType tag); + + /** PrevText(x): returns the document identifier of the text to the left of + * node x, or NULLT if x is the root node. */ + DocID PrevText(treeNode x); + + /** NextText(x): returns the document identifier of the text to the right of + * node x, or NULLT if x is the root node. */ + DocID NextText(treeNode x); + + /** MyText(x): returns the document identifier of the text below node x, or + * NULLT if x is not a leaf node. */ + DocID MyText(treeNode x); + DocID MyTextUnsafe(treeNode x); + + /** TextXMLId(d): returns the preorder of document with identifier d in the + * tree consisting of all tree nodes and all text nodes. */ + int TextXMLId(DocID d); + + /** NodeXMLId(x): returns the preorder of node x in the tree consisting of + * all tree nodes and all text nodes. */ + int NodeXMLId(treeNode x); + + /** ParentNode(d): returns the parent node of document identifier d. */ + treeNode ParentNode(DocID d); + + treeNode PrevNode(DocID d); + + /** GetTagId(tagname): returns the tag identifier corresponding to a given + * tag name. Returns NULLT in case that the tag name does not exists. */ + TagType GetTagId(unsigned char *tagname); + + /** GetTagName(tagid): returns the tag name of a given tag identifier. + * Returns NULL in case that the tag identifier is not valid.*/ + unsigned char *GetTagName(TagType tagid); + + /** GetTagName(tagid): returns the tag name of a given tag identifier. + * The result is just a reference and should not be freed by the caller. + */ + const unsigned char *GetTagNameByRef(TagType tagid); + + /** RegisterTag adds a new tag to the tag collection this is needed + * if the query contains a tag which is not in the document, we need + * to give this new tag a fresh id and store it somewhere. A logical + * choice is here. + * We might want to use a hashtable instead of an array though. + */ + TagType RegisterTag(unsigned char *tagname); + + bool EmptyText(DocID i) { + return Text->EmptyText(i); + } + + /** Prefix(s): search for texts prefixed by string s. */ + TextCollection::document_result Prefix(uchar const *s) { + return Text->Prefix(s); + } + + /** Suffix(s): search for texts having string s as a suffix. */ + TextCollection::document_result Suffix(uchar const *s) { + return Text->Suffix(s); + } + + /** Equal(s): search for texts equal to string s. */ + TextCollection::document_result Equals(uchar const *s) { + return Text->Equal(s); + } + + /** Contains(s): search for texts containing string s. */ + TextCollection::document_result Contains(uchar const *s) { + return Text->Contains(s); + } + + /** LessThan(s): returns document identifiers for the texts that + * are lexicographically smaller than string s. */ + TextCollection::document_result LessThan(uchar const *s) { + return Text->LessThan(s); + } + + /** IsPrefix(x): returns true if there is a text prefixed by string s. */ + bool IsPrefix(uchar const *s) { + return Text->IsPrefix(s); + } + + /** IsSuffix(s): returns true if there is a text having string s as a + * suffix.*/ + bool IsSuffix(uchar const *s) { + return Text->IsSuffix(s); + } + + /** IsEqual(s): returns true if there is a text that equals given + * string s. */ + bool IsEqual(uchar const *s) { + return Text->IsEqual(s); + } + + /** IsContains(s): returns true if there is a text containing string s. */ + bool IsContains(uchar const *s) { + return Text->IsContains(s); + } + + /** IsLessThan(s): returns true if there is at least a text that is + * lexicographically smaller than string s. */ + bool IsLessThan(uchar const *s) { + return Text->IsLessThan(s); + } + + /** Count(s): Global counting */ + unsigned Count(uchar const *s) { + return Text->Count(s); + } + + /** CountPrefix(s): counting version of Prefix(s). */ + unsigned CountPrefix(uchar const *s) { + return Text->CountPrefix(s); + } + + /** CountSuffix(s): counting version of Suffix(s). */ + unsigned CountSuffix(uchar const *s) { + return Text->CountSuffix(s); + } + + /** CountEqual(s): counting version of Equal(s). */ + unsigned CountEqual(uchar const *s) { + return Text->CountEqual(s); + } + + /** CountContains(s): counting version of Contains(s). */ + unsigned CountContains(uchar const *s) { + return Text->CountContains(s); + } + + /** CountLessThan(s): counting version of LessThan(s). */ + unsigned CountLessThan(uchar const *s) { + return Text->CountLessThan(s); + } + + /** GetText(d): returns the text corresponding to document with + * id d. */ + uchar* GetText(DocID d) { + + uchar * s = Text->GetText(d); + return (s[0] == 1 ? (s+1) : s); + } + + /** GetText(i, j): returns the texts corresponding to documents with + * ids i, i+1, ..., j. Texts are separated by '\0' character. */ + // uchar* GetText(DocID i, DocID j) { + // uchar * s = Text->GetText(i, j); + // return (s[0] == 1 ? (uchar*)"" : s); + //} + + TextCollection *getTextCollection() { + return Text; + } + + /** Save: saves XML tree data structure to file. */ + void Save(int fd, char* name ); + + /** Load: loads XML tree data structure from file. sample_rate_text + * indicates the sample rate for the text search data structure. */ + static XMLTree *Load(int fd, bool load_tc, int sample_factor, char * name); + + void insertTag(TagType tag, uint position); + + void print_stats(); + + + /** Parenthesis functions */ + treeNode Closing(treeNode x); + + bool IsOpen(treeNode x); + + + /** Print procedure */ + void Print(int fd,treeNode x, bool no_text); + void Print(int fd,treeNode x) { Print(fd,x,false); } + void Flush(int fd){ if (buffer) _real_flush(fd, buffer->size()); } + + // The following are inlined here for speed + /** Tag(x): returns the tag identifier of node x. */ + + inline TagType Tag(treeNode x) const throw () { + if (tags_blen == 8) + return (TagType) (((uchar*)tags_fix)[(int) x]); + else + return get_field(tags_fix, tags_blen, x); + /* + { + size_t idxlen = x * tags_blen; + size_t j = idxlen % W; + size_t i = idxlen / W; + size_t offset = W - tags_blen; + size_t offset2 = offset - j; + size_t w = tags_fix[i]; + return (offset2 >= 0) + ? ( w << offset2 ) >> offset + : ( w >> j) | (tags_fix[i+1] << (W+offset2)) >> offset; + }; */ + + } + + /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf + */ + treeNode FirstChild(treeNode x) { + NULLT_IF(x==NULLT); + return bp_first_child(Par, x); + }; + + + /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT + * if none. + */ + treeNode FirstElement(treeNode node){ + { + NULLT_IF(node == NULLT); + treeNode x = bp_first_child(Par, node); + NULLT_IF(x == NULLT); + switch (Tag(x)){ + + case ATTRIBUTE_TAG_ID: + x = bp_next_sibling(Par,x); + if (x == NULLT || Tag(x) != PCDATA_TAG_ID) return x; + + case PCDATA_TAG_ID: + x = x+2; + return (bp_inspect(Par,x)==OP)? x : NULLT; + + default: + return x; + } + } + }; + + /** NextSibling(x): returns the next sibling of node x, or NULLT if none + * exists. */ + + treeNode NextSibling(treeNode x) { + NULLT_IF (x <= 0); + return bp_next_sibling(Par, x); + }; + + /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT + * if none. + */ + treeNode NextElement(treeNode x) + { + NULLT_IF(x <= 0); + x = bp_next_sibling(Par, x); + NULLT_IF(x == NULLT); + if (Tag(x) == PCDATA_TAG_ID){ + int y = x+2; + return (bp_inspect(Par, y) == OP) ? y : NULLT; + } + else return x; + }; + /** TaggedDesc(x,tag): returns the first node tagged tag with larger + * preorder than x and within the subtree of x. Returns NULT if there + * is none. */ + inline treeNode TaggedNext(treeNode x, TagType tag) + { + return tagpos2node(Tags->select_next(tag,node2tagpos(x))); + } + inline treeNode TaggedDescendant(treeNode x, TagType tag) + { + + int s = (int) Tags->select_next(tag,node2tagpos(x)); + NULLT_IF (s == -1); + + treeNode y = tagpos2node(s); // transforms the tag position into a node position + + return (bp_is_ancestor(Par,x,y) ? y : NULLT); + }; + + inline treeNode TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor) + { + treeNode close = bp_find_close(Par, x); + treeNode s = tagpos2node(Tags->select_next(tag, close)); + + if (ancestor == Root() || s == NULLT || s < bp_find_close(Par,ancestor)) return s; + else return NULLT; + }; + + inline treeNode TaggedFollowingBefore(treeNode x, TagType tag, treeNode ancestor_closing) + { + treeNode close = bp_find_close(Par, x); + treeNode s = tagpos2node(Tags->select_next(tag, close)); + + if (ancestor_closing == Root() || s == NULLT || s < ancestor_closing) return s; + else return NULLT; + }; + + inline treeNode NextNodeBefore(treeNode x, treeNode ancestor_closing) + { + treeNode close = bp_find_close(Par, x); + int rank = bp_rank_open(Par, close); + treeNode y = bp_select_open(Par, rank+1); + return (y < ancestor_closing) ? y : NULLT; + }; + +// TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or NULLT if there is none. +treeNode TaggedFollowingSibling(treeNode x, TagType tag) +{ + NULLT_IF(x==NULLT); + treeNode sibling = x; + TagType ctag; + while ((sibling = bp_next_sibling(Par, sibling)) != NULLT) { + ctag = Tag(sibling); + if (ctag == tag) return sibling; + } + return NULLT; +}; + +treeNode TaggedChild(treeNode x, TagType tag) +{ + + NULLT_IF(x==NULLT || bp_isleaf(Par,x)); + treeNode child; + child = bp_first_child(Par, x); + +if (Tag(child) == tag) + return child; + else + return TaggedFollowingSibling(child, tag); +}; + + +}; + + +#endif +