X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTree.h;h=8057773c7f0899463cddfd3d078966c07ae8855d;hb=235e3214904e390d2f101c5d5bf7def98745b132;hp=ea9b938ff80b7356f81ec89334006ca73ef697d4;hpb=aa6692a9fd2badf8e8e686b92075f041dc03bbef;p=SXSI%2FXMLTree.git diff --git a/XMLTree.h b/XMLTree.h index ea9b938..8057773 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -21,6 +21,8 @@ #ifndef XMLTREE_H_ #define XMLTREE_H_ +#include +#include #include "TextCollection/TextCollectionBuilder.h" #include #include @@ -32,7 +34,7 @@ #undef Wminusone #include "bp.h" -//#include "basics.h" + #include #include #include @@ -57,27 +59,54 @@ typedef struct { int max; } range; -typedef struct nd { - uint position; - struct nd *next; -} ListNode; +// Encoding of the XML Document : +// The following TAGs and IDs are fixed, "" is the tag of the root. +// a TextNode is represented by a leaf <<$>>> The DocId in the TextCollection +// of that leaf is kept in a bit sequence. +// a TextNode below an attribute is likewise represented by a leaf <<@$>><> +// An element ... the representation is: +// <<@>> <<@>a1> <<$@>>DocID(v1)>a1> ... > .... +// Hence the attributes (if any) are always below the first child of their element, +// as the children of a fake node <@>. -typedef struct { - ListNode *first; - ListNode *last; -} TagArrayEntry; + +#define DOCUMENT_OPEN_TAG "" +#define DOCUMENT_TAG_ID 0 +#define ATTRIBUTE_OPEN_TAG "<@>" +#define ATTRIBUTE_TAG_ID 1 +#define PCDATA_OPEN_TAG "<$>" +#define PCDATA_TAG_ID 2 +#define ATTRIBUTE_DATA_OPEN_TAG "<@$>" +#define ATTRIBUTE_DATA_TAG_ID 3 +#define DOCUMENT_CLOSE_TAG "/" +#define ATTRIBUTE_CLOSE_TAG "/<@>" +#define PCDATA_CLOSE_TAG "/<$>" +#define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>" + + + +typedef std::unordered_map TagIdMap; +typedef TagIdMap::const_iterator TagIdMapIT; + +#define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\ + (v)->push_back(t); } while (false) + + +class XMLTreeBuilder; class XMLTree { + + // Only the builder can access the constructor + friend class XMLTreeBuilder; + + private: /** Balanced parentheses representation of the tree */ bp *Par; /** Mapping from tag identifer to tag name */ - unsigned char **TagName; - uint ntagnames; + vector *TagName; + TagIdMap * tIdMap; - /** boolean flag indicating whether we are indexing empty texts or not */ - bool indexing_empty_texts; - /** Bit vector indicating with a 1 the positions of the non-empty texts. */ static_bitsequence *EBVector; @@ -90,20 +119,20 @@ class XMLTree { TextCollection *Text; /** The texts in the XML document (cached for faster display) */ - vector CachedText; - - TagArrayEntry *TagArray; + vector *CachedText; // Allows to disable the TextCollection for benchmarkin purposes bool disable_tc; -public: + /** Data structure constructors */ - XMLTree() {;}; + XMLTree(){;}; - XMLTree(pb *par, uint npar, unsigned char **TN, uint ntagnames, uint *empty_texts_bmp, TagType *tags, - TextCollection *TC, vector CT, bool indexing_empty_t, bool dis_tc); - + // non const pointer are freed by this method. + XMLTree( pb * const par, uint npar, vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, + TextCollection * const TC, vector * const CT, bool dis_tc); + +public: /** Data structure destructor */ ~XMLTree(); @@ -167,13 +196,15 @@ public: /** FirstChild(x): returns the first child of node x, assuming it exists. * Very fast in BP. */ treeNode FirstChild(treeNode x); - + treeNode FirstElement(treeNode x); + /** LastChild(x): returns the last child of node x. */ treeNode LastChild(treeNode x); /** NextSibling(x): returns the next sibling of node x, assuming it * exists. */ treeNode NextSibling(treeNode x); + treeNode NextElement(treeNode x); /** PrevSibling(x): returns the previous sibling of node x, assuming it * exists. */ @@ -185,35 +216,21 @@ public: * among the children of node x until finding the desired child. */ treeNode TaggedChild(treeNode x, TagType tag); - treeNode SelectChild(treeNode x, TagType *tags, int ntags); + treeNode SelectChild(treeNode x, std::unordered_set * tags); - /** TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or + /** TaggedFollSibling(x,tag): returns the first sibling of node x tagged tag, or * NULLT if there is none. */ - treeNode TaggedSibling(treeNode x, TagType tag); + treeNode TaggedFollSibling(treeNode x, TagType tag); - treeNode SelectSibling(treeNode x, TagType *tags, int ntags); + treeNode SelectFollSibling(treeNode x, std::unordered_set * tags); /** TaggedDesc(x,tag): returns the first node tagged tag with larger * preorder than x and within the subtree of x. Returns NULT if there * is none. */ treeNode TaggedDesc(treeNode x, TagType tag); - treeNode SelectDesc(treeNode x, TagType *tags, int ntags); + treeNode SelectDesc(treeNode x, std::unordered_set * tags); - treeNode TaggedBelow(treeNode x, TagType *childtags, unsigned int ctlen, - TagType *desctags, unsigned int dtlen); - - treeNode TaggedNext(treeNode x, TagType *childtags, unsigned int ctlen, - TagType *folltags, unsigned int flen,treeNode root); - - treeNode TaggedDescOnly(treeNode x, TagType *desctags, unsigned int dtlen); - - treeNode TaggedDescOrFollOnly(treeNode x, TagType *folltags, unsigned int flen, - treeNode root); - - treeNode TaggedFollOnly(treeNode x, TagType *folltags, unsigned int flen, - treeNode root); - /** TaggedPrec(x,tag): returns the first node tagged tag with smaller * preorder than x and not an ancestor of x. Returns NULLT if there @@ -227,7 +244,7 @@ public: treeNode TaggedFollBelow(treeNode x, TagType tag,treeNode root); - treeNode SelectFollBelow(treeNode x, TagType *tags, int ntags, treeNode ctx); + treeNode SelectFollBelow(treeNode x, std::unordered_set * tags, treeNode root); /** TaggedFollowingSibling(x,tag) */ treeNode TaggedFollowingSibling(treeNode x, TagType tag); @@ -373,12 +390,13 @@ public: /** GetText(d): returns the text corresponding to document with * id d. */ uchar* GetText(DocID d) { - return Text->GetText(d); + uchar * s = Text->GetText(d); + return (s[0] == 1 ? (uchar*)"" : s); } uchar* GetCachedText(DocID d) { - uchar * str = (uchar*) calloc(sizeof(char),(CachedText.at(d).size() + 1)); - strcpy((char*) str,(const char*) CachedText.at(d).c_str()); + uchar * str = (uchar*) calloc(sizeof(char),(CachedText->at(d).size() + 1)); + strcpy((char*) str,(const char*) CachedText->at(d).c_str()); return (uchar*) (str); } @@ -387,11 +405,11 @@ public: } /** Save: saves XML tree data structure to file. */ - void Save(unsigned char *filename); + void Save(int fd); /** Load: loads XML tree data structure from file. sample_rate_text * indicates the sample rate for the text search data structure. */ - static XMLTree *Load(unsigned char *filename, int sample_rate_text); + static XMLTree *Load(int fd); void insertTag(TagType tag, uint position);