X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTree.h;h=6d11d78de3e5c2fa624eeb937209f3c8a5f8b9e6;hb=f32808a35be7a1e62830a5972473178014fa44e5;hp=275b9bcab03562029094017c787b0a590714f012;hpb=1413ae2197d87e87571c9d8d6fc9f20f691fcea3;p=SXSI%2FXMLTree.git diff --git a/XMLTree.h b/XMLTree.h index 275b9bc..6d11d78 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -1,4 +1,3 @@ - /****************************************************************************** * Copyright (C) 2008 by Diego Arroyuelo * * Interface for the in-memory XQuery/XPath engine * @@ -21,20 +20,19 @@ #ifndef XMLTREE_H_ #define XMLTREE_H_ + + #include #include +#include #include "TextCollection/TextCollectionBuilder.h" -#include -#include -#include - #undef W #undef WW #undef Wminusone #include "bp.h" - +#include #include #include #include @@ -78,20 +76,30 @@ typedef struct { #define PCDATA_TAG_ID 2 #define ATTRIBUTE_DATA_OPEN_TAG "<@$>" #define ATTRIBUTE_DATA_TAG_ID 3 +#define CLOSING_TAG "" +#define CLOSING_TAG_ID 4 #define DOCUMENT_CLOSE_TAG "/" #define ATTRIBUTE_CLOSE_TAG "/<@>" #define PCDATA_CLOSE_TAG "/<$>" #define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>" - -typedef std::unordered_map TagIdMap; +typedef std::unordered_set TagIdSet; +typedef std::unordered_map TagIdMap; typedef TagIdMap::const_iterator TagIdMapIT; #define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\ (v)->push_back(t); } while (false) +// returns NULLT if the test is true +#define NULLT_IF(x) do { if (x) return NULLT; } while (0) + + + + + + class XMLTreeBuilder; class XMLTree { @@ -104,7 +112,7 @@ class XMLTree { bp *Par; /** Mapping from tag identifer to tag name */ - vector *TagName; + std::vector *TagName; TagIdMap * tIdMap; /** Bit vector indicating with a 1 the positions of the non-empty texts. */ @@ -121,12 +129,48 @@ class XMLTree { // Allows to disable the TextCollection for benchmarkin purposes bool disable_tc; + FILE* stream; + int stream_fd; + std::string * buffer; + void myfputs(const char* s, FILE * fp){ + buffer->append(s); + if (buffer->size() >= 100000){ + fputs(buffer->c_str(),fp); + buffer->clear(); + }; + + } + void myfputc(const char c, FILE*fp){ + buffer->append(1,c); + if (buffer->size() >= 100000){ + fputs(buffer->c_str(),fp); + buffer->clear(); + }; + } + void mybufferflush(FILE* fp){ + fputs(buffer->c_str(), fp); + buffer->clear(); + } + + size_t myfprintf(const char* s, FILE * fp){ + if (s == NULL) + return 0; + size_t i = buffer->size(); + buffer->append(s); + size_t j = buffer->size(); + if (buffer->size() >= 100000){ + fputs(buffer->c_str(),fp); + buffer->clear(); + }; + return (j-i); + } + void PrintNode(treeNode n, int fd); /** Data structure constructors */ - XMLTree(){;}; + XMLTree(){ buffer = 0;}; // non const pointer are freed by this method. - XMLTree( pb * const par, uint npar, vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, + XMLTree( pb * const par, uint npar, std::vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, TextCollection * const TC, bool dis_tc); public: @@ -134,36 +178,48 @@ public: ~XMLTree(); /** root(): returns the tree root. */ - treeNode Root(); - + treeNode Root() { return 0; } + + /** Size() : Number of parenthesis */ + unsigned int Size(){ + return tags_len/2; + } + /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of * node x. */ int SubtreeSize(treeNode x); - + /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree * of node x. */ int SubtreeTags(treeNode x, TagType tag); + /** SubtreeElements(x) of element nodes in the subtree of x + */ + int SubtreeElements(treeNode x); + /** IsLeaf(x): returns whether node x is leaf or not. In the succinct * representation this is just a bit inspection. */ + bool IsLeaf(treeNode x); - + /** IsAncestor(x,y): returns whether node x is ancestor of node y. */ + bool IsAncestor(treeNode x, treeNode y); /** IsChild(x,y): returns whether node x is parent of node y. */ bool IsChild(treeNode x, treeNode y); /** IsFirstChild(x): returns whether node x is the first child of its parent. */ + /* OCAML */ bool IsFirstChild(treeNode x); - + /** NumChildren(x): number of children of node x. Constant time with the * data structure of Sadakane. */ int NumChildren(treeNode x); - + /** ChildNumber(x): returns i if node x is the i-th children of its * parent. */ - inline int ChildNumber(treeNode x); + int ChildNumber(treeNode x); /** Depth(x): depth of node x, a simple binary rank on the parentheses * sequence. */ @@ -176,35 +232,51 @@ public: /** Postorder(x): returns the postorder number of node x, just regarding * tree nodes (and not texts). */ int Postorder(treeNode x); - + /** Tag(x): returns the tag identifier of node x. */ - TagType Tag(treeNode x); - + TagType Tag(treeNode x) { + if (tags_blen == 8) + return (TagType) (((uchar*)tags_fix)[(int) x]); + else + return (TagType) get_field(tags_fix,tags_blen, (int) x); + } + /** DocIds(x): returns the range (i.e., a pair of integers) of document * identifiers that descend from node x. */ range DocIds(treeNode x); - + /** Parent(x): returns the parent node of node x. */ treeNode Parent(treeNode x); + /* Assumes x is neither 0 nor -1 */ /** Child(x,i): returns the i-th child of node x, assuming it exists. */ treeNode Child(treeNode x, int i); - - /** FirstChild(x): returns the first child of node x, assuming it exists. - * Very fast in BP. */ + + /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf + */ treeNode FirstChild(treeNode x); + + /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT + * if none. + */ treeNode FirstElement(treeNode x); /** LastChild(x): returns the last child of node x. */ treeNode LastChild(treeNode x); - /** NextSibling(x): returns the next sibling of node x, assuming it + /** NextSibling(x): returns the next sibling of node x, or NULLT if none * exists. */ + treeNode NextSibling(treeNode x); + + /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT + * if none. + */ treeNode NextElement(treeNode x); - + /** PrevSibling(x): returns the previous sibling of node x, assuming it * exists. */ + treeNode PrevSibling(treeNode x); /** TaggedChild(x,tag): returns the first child of node x tagged tag, or @@ -213,38 +285,38 @@ public: * among the children of node x until finding the desired child. */ treeNode TaggedChild(treeNode x, TagType tag); - treeNode SelectChild(treeNode x, std::unordered_set * tags); + treeNode SelectChild(treeNode x, TagIdSet * tags); - /** TaggedFollSibling(x,tag): returns the first sibling of node x tagged tag, or + /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or * NULLT if there is none. */ - treeNode TaggedFollSibling(treeNode x, TagType tag); + treeNode TaggedFollowingSibling(treeNode x, TagType tag); - treeNode SelectFollSibling(treeNode x, std::unordered_set * tags); + treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags); /** TaggedDesc(x,tag): returns the first node tagged tag with larger * preorder than x and within the subtree of x. Returns NULT if there * is none. */ - treeNode TaggedDesc(treeNode x, TagType tag); - - treeNode SelectDesc(treeNode x, std::unordered_set * tags); + treeNode TaggedDescendant(treeNode x, TagType tag); + treeNode SelectDescendant(treeNode x, TagIdSet * tags); /** TaggedPrec(x,tag): returns the first node tagged tag with smaller * preorder than x and not an ancestor of x. Returns NULLT if there * is none. */ - treeNode TaggedPrec(treeNode x, TagType tag); + treeNode TaggedPreceding(treeNode x, TagType tag); /** TaggedFoll(x,tag): returns the first node tagged tag with larger * preorder than x and not in the subtree of x. Returns NULLT if there * is none. */ - treeNode TaggedFoll(treeNode x, TagType tag); + treeNode TaggedFollowing(treeNode x, TagType tag); - treeNode TaggedFollBelow(treeNode x, TagType tag,treeNode root); - - treeNode SelectFollBelow(treeNode x, std::unordered_set * tags, treeNode root); + treeNode TaggedFollowingBelow(treeNode x, TagType tag,treeNode ancestor); - /** TaggedFollowingSibling(x,tag) */ - treeNode TaggedFollowingSibling(treeNode x, TagType tag); + treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor); + + treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing); + + treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode closing); /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged * tag. Return NULLT is there is none. */ @@ -261,7 +333,8 @@ public: /** MyText(x): returns the document identifier of the text below node x, or * NULLT if x is not a leaf node. */ DocID MyText(treeNode x); - + DocID MyTextUnsafe(treeNode x); + /** TextXMLId(d): returns the preorder of document with identifier d in the * tree consisting of all tree nodes and all text nodes. */ int TextXMLId(DocID d); @@ -311,7 +384,7 @@ public: } /** Equal(s): search for texts equal to string s. */ - TextCollection::document_result Equal(uchar const *s) { + TextCollection::document_result Equals(uchar const *s) { return Text->Equal(s); } @@ -387,8 +460,9 @@ public: /** GetText(d): returns the text corresponding to document with * id d. */ uchar* GetText(DocID d) { - uchar * s = Text->GetText(d); - return (s[0] == 1 ? (uchar*)"" : s); + + uchar * s = Text->GetText(d); + return (s[0] == 1 ? (s+1) : s); } /** GetText(i, j): returns the texts corresponding to documents with @@ -407,11 +481,27 @@ public: /** Load: loads XML tree data structure from file. sample_rate_text * indicates the sample rate for the text search data structure. */ - static XMLTree *Load(int fd); + static XMLTree *Load(int fd,bool load_tc, int sample_factor); void insertTag(TagType tag, uint position); void print_stats(); + + + /** Parenthesis functions */ + treeNode Closing(treeNode x); + + bool IsOpen(treeNode x); + + + /** Print procedure */ + void Print(int fd,treeNode x, bool no_text); + void Print(int fd,treeNode x) { Print(fd,x,false); } + }; + + + + #endif