X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTree.h;h=6d11d78de3e5c2fa624eeb937209f3c8a5f8b9e6;hb=f32808a35be7a1e62830a5972473178014fa44e5;hp=b6aa410e46a7d6adb8baeb659d91a63c92d24ae1;hpb=ac53090f1525b3ef9cbde0b51da5fbb238a06744;p=SXSI%2FXMLTree.git diff --git a/XMLTree.h b/XMLTree.h index b6aa410..6d11d78 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -1,4 +1,3 @@ - /****************************************************************************** * Copyright (C) 2008 by Diego Arroyuelo * * Interface for the in-memory XQuery/XPath engine * @@ -17,26 +16,28 @@ * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - ******************************************************************************/ + ******************************************************************************/ #ifndef XMLTREE_H_ #define XMLTREE_H_ -#include "TextCollection/TextCollection.h" -#include -#include -#include -//KIM : OJO -//clash between TextCollection/Tools.h and libcds/includes/basics.h + +#include +#include +#include +#include "TextCollection/TextCollectionBuilder.h" + #undef W #undef WW #undef Wminusone #include "bp.h" +#include #include #include #include using SXSI::TextCollection; +using SXSI::TextCollectionBuilder; // this constant is used to efficiently compute the child operation in the tree @@ -46,11 +47,6 @@ using SXSI::TextCollection; #define PERM_SAMPLE 10 - // sets bit p in e -#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) - // cleans bit p in e -#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) - typedef int treeNode; typedef int TagType; @@ -61,100 +57,169 @@ typedef struct { int max; } range; +// Encoding of the XML Document : +// The following TAGs and IDs are fixed, "" is the tag of the root. +// a TextNode is represented by a leaf <<$>>> The DocId in the TextCollection +// of that leaf is kept in a bit sequence. +// a TextNode below an attribute is likewise represented by a leaf <<@$>><> +// An element ... the representation is: +// <<@>> <<@>a1> <<$@>>DocID(v1)>a1> ... > .... +// Hence the attributes (if any) are always below the first child of their element, +// as the children of a fake node <@>. + + +#define DOCUMENT_OPEN_TAG "" +#define DOCUMENT_TAG_ID 0 +#define ATTRIBUTE_OPEN_TAG "<@>" +#define ATTRIBUTE_TAG_ID 1 +#define PCDATA_OPEN_TAG "<$>" +#define PCDATA_TAG_ID 2 +#define ATTRIBUTE_DATA_OPEN_TAG "<@$>" +#define ATTRIBUTE_DATA_TAG_ID 3 +#define CLOSING_TAG "" +#define CLOSING_TAG_ID 4 +#define DOCUMENT_CLOSE_TAG "/" +#define ATTRIBUTE_CLOSE_TAG "/<@>" +#define PCDATA_CLOSE_TAG "/<$>" +#define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>" + + +typedef std::unordered_set TagIdSet; +typedef std::unordered_map TagIdMap; +typedef TagIdMap::const_iterator TagIdMapIT; + +#define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\ + (v)->push_back(t); } while (false) + + +// returns NULLT if the test is true +#define NULLT_IF(x) do { if (x) return NULLT; } while (0) + + -//KIM : OJO -// I know this class implements the working draft that we have but the logics seem flawed here... -// We should have two classes. One XMLTreeBuilder and one XMLTree. -// XMLTreeBuilder would have OpenDocument, NewOpenTag,... and CloseDocument would return an XMLTree -// XMLTree would have only an initialized structure. If find it really ugly to check (!finished) or (!initialized) -// in every function (FirstChild....). + + + +class XMLTreeBuilder; class XMLTree { + + // Only the builder can access the constructor + friend class XMLTreeBuilder; + + private: /** Balanced parentheses representation of the tree */ bp *Par; /** Mapping from tag identifer to tag name */ - unsigned char **TagName; + std::vector *TagName; + TagIdMap * tIdMap; - /** boolean flag indicating whether we are indexing empty texts or not */ - bool indexing_empty_texts; - /** Bit vector indicating with a 1 the positions of the non-empty texts. */ - static_bitsequence_rrr02 *EBVector; + static_bitsequence *EBVector; /** Tag sequence represented with a data structure for rank and select */ static_sequence *Tags; + uint * tags_fix; + uint tags_blen, tags_len; /** The texts in the XML document */ TextCollection *Text; - /** The texts in the XML document (cached for faster display) */ - vector CachedText; - - /** Flag indicating whether the whole data structure has been constructed or - * not. If the value is true, you cannot add more texts, nodes, etc. */ - bool finished; - - /** Flag indicating whether the construction of the data structure has been - * initialized or not (by calling method OpenDocument()). If this is true, - * you cannot insert new tags or texts. */ - bool initialized; - - /* the following components are used for construction purposes */ - pb *par_aux; - TagType *tags_aux; - int npar; - int parArraySize; - int ntagnames; - unsigned int *empty_texts_aux; - - // KIM : OJO - // I added those two. The TagName array should always contains two special tags - // <@> for attributes and <$> for PCDATA. - // <$> can never be in a document (since we handle the text differently) - // but <@> can be returned by the parser. This boolean is needed for the construction - // of the Tag bitmap to know if <@> must be taken into account or not - bool found_attributes; - - // KIM : OJO + // Allows to disable the TextCollection for benchmarkin purposes bool disable_tc; -public: + FILE* stream; + int stream_fd; + std::string * buffer; + void myfputs(const char* s, FILE * fp){ + buffer->append(s); + if (buffer->size() >= 100000){ + fputs(buffer->c_str(),fp); + buffer->clear(); + }; - /** Data structure constructor */ - XMLTree() {finished = false; initialized = false;}; - + } + void myfputc(const char c, FILE*fp){ + buffer->append(1,c); + if (buffer->size() >= 100000){ + fputs(buffer->c_str(),fp); + buffer->clear(); + }; + } + void mybufferflush(FILE* fp){ + fputs(buffer->c_str(), fp); + buffer->clear(); + } + + size_t myfprintf(const char* s, FILE * fp){ + if (s == NULL) + return 0; + size_t i = buffer->size(); + buffer->append(s); + size_t j = buffer->size(); + if (buffer->size() >= 100000){ + fputs(buffer->c_str(),fp); + buffer->clear(); + }; + return (j-i); + } + + void PrintNode(treeNode n, int fd); + /** Data structure constructors */ + XMLTree(){ buffer = 0;}; + + // non const pointer are freed by this method. + XMLTree( pb * const par, uint npar, std::vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, + TextCollection * const TC, bool dis_tc); + +public: /** Data structure destructor */ ~XMLTree(); /** root(): returns the tree root. */ - treeNode Root(); - + treeNode Root() { return 0; } + + /** Size() : Number of parenthesis */ + unsigned int Size(){ + return tags_len/2; + } + /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of * node x. */ int SubtreeSize(treeNode x); - + /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree * of node x. */ int SubtreeTags(treeNode x, TagType tag); + /** SubtreeElements(x) of element nodes in the subtree of x + */ + int SubtreeElements(treeNode x); + /** IsLeaf(x): returns whether node x is leaf or not. In the succinct * representation this is just a bit inspection. */ + bool IsLeaf(treeNode x); - + /** IsAncestor(x,y): returns whether node x is ancestor of node y. */ + bool IsAncestor(treeNode x, treeNode y); /** IsChild(x,y): returns whether node x is parent of node y. */ bool IsChild(treeNode x, treeNode y); - + + /** IsFirstChild(x): returns whether node x is the first child of its parent. */ + /* OCAML */ + bool IsFirstChild(treeNode x); + /** NumChildren(x): number of children of node x. Constant time with the * data structure of Sadakane. */ int NumChildren(treeNode x); - + /** ChildNumber(x): returns i if node x is the i-th children of its * parent. */ - inline int ChildNumber(treeNode x); + int ChildNumber(treeNode x); /** Depth(x): depth of node x, a simple binary rank on the parentheses * sequence. */ @@ -167,59 +232,91 @@ public: /** Postorder(x): returns the postorder number of node x, just regarding * tree nodes (and not texts). */ int Postorder(treeNode x); - + /** Tag(x): returns the tag identifier of node x. */ - TagType Tag(treeNode x); - + TagType Tag(treeNode x) { + if (tags_blen == 8) + return (TagType) (((uchar*)tags_fix)[(int) x]); + else + return (TagType) get_field(tags_fix,tags_blen, (int) x); + } + /** DocIds(x): returns the range (i.e., a pair of integers) of document * identifiers that descend from node x. */ range DocIds(treeNode x); - + /** Parent(x): returns the parent node of node x. */ treeNode Parent(treeNode x); + /* Assumes x is neither 0 nor -1 */ /** Child(x,i): returns the i-th child of node x, assuming it exists. */ treeNode Child(treeNode x, int i); - - /** FirstChild(x): returns the first child of node x, assuming it exists. - * Very fast in BP. */ + + /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf + */ treeNode FirstChild(treeNode x); + + /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT + * if none. + */ + treeNode FirstElement(treeNode x); + + /** LastChild(x): returns the last child of node x. */ + treeNode LastChild(treeNode x); - /** NextSibling(x): returns the next sibling of node x, assuming it + /** NextSibling(x): returns the next sibling of node x, or NULLT if none * exists. */ + treeNode NextSibling(treeNode x); - + + /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT + * if none. + */ + treeNode NextElement(treeNode x); + /** PrevSibling(x): returns the previous sibling of node x, assuming it * exists. */ + treeNode PrevSibling(treeNode x); - /** TaggedChild(x,i,tag): returns the i-th child of node x tagged tag, or + /** TaggedChild(x,tag): returns the first child of node x tagged tag, or * NULLT if there is none. Because of the balanced-parentheses representation * of the tree, this operation is not supported efficiently, just iterating * among the children of node x until finding the desired child. */ - treeNode TaggedChild(treeNode x, int i, TagType tag); + treeNode TaggedChild(treeNode x, TagType tag); + + treeNode SelectChild(treeNode x, TagIdSet * tags); + + /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or + * NULLT if there is none. */ + treeNode TaggedFollowingSibling(treeNode x, TagType tag); + treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags); + /** TaggedDesc(x,tag): returns the first node tagged tag with larger * preorder than x and within the subtree of x. Returns NULT if there * is none. */ - treeNode TaggedDesc(treeNode x, TagType tag); + treeNode TaggedDescendant(treeNode x, TagType tag); - /** TaggedNext(x,tag): returns the first node tagged tag with larger - * preorder than x. Returns NULT if there is none. */ - treeNode TaggedNext(treeNode x, TagType tag); + treeNode SelectDescendant(treeNode x, TagIdSet * tags); /** TaggedPrec(x,tag): returns the first node tagged tag with smaller * preorder than x and not an ancestor of x. Returns NULLT if there * is none. */ - treeNode TaggedPrec(treeNode x, TagType tag); + treeNode TaggedPreceding(treeNode x, TagType tag); /** TaggedFoll(x,tag): returns the first node tagged tag with larger * preorder than x and not in the subtree of x. Returns NULLT if there * is none. */ - treeNode TaggedFoll(treeNode x, TagType tag); - - /** TaggedFollowingSibling(x,tag) */ - treeNode TaggedFollowingSibling(treeNode x, TagType tag); + treeNode TaggedFollowing(treeNode x, TagType tag); + + treeNode TaggedFollowingBelow(treeNode x, TagType tag,treeNode ancestor); + + treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor); + + treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing); + + treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode closing); /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged * tag. Return NULLT is there is none. */ @@ -236,7 +333,8 @@ public: /** MyText(x): returns the document identifier of the text below node x, or * NULLT if x is not a leaf node. */ DocID MyText(treeNode x); - + DocID MyTextUnsafe(treeNode x); + /** TextXMLId(d): returns the preorder of document with identifier d in the * tree consisting of all tree nodes and all text nodes. */ int TextXMLId(DocID d); @@ -247,46 +345,8 @@ public: /** ParentNode(d): returns the parent node of document identifier d. */ treeNode ParentNode(DocID d); - - /** OpenDocument(empty_texts,sample_rate_text,dtc): initilizes the construction - * of the data structure for the XML document. Parameter empty_texts - * indicates whether we index empty texts in document or not. Parameter - * sample_rate_text indicates the sampling rate for the text searching data - * structures (small values get faster searching but a bigger space - * requirement). dtc disable the use of the TextCollection - * (i.e. everything is considered an empty text *) - * Returns a non-zero value upon success, NULLT in case of - * error. */ - - int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc); - - /** CloseDocument(): finishes the construction of the data structure for - * the XML document. Tree and tags are represented in the final form, - * dynamic data structures are made static, and the flag "finished" is set - * to true. After that, the data structure can be queried. */ - int CloseDocument(); - - /** NewOpenTag(tagname): indicates the event of finding a new opening tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewOpenTag(unsigned char *tagname); - /** NewClosingTag(tagname): indicates the event of finding a new closing tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewClosingTag(unsigned char *tagname); - - /** NewText(s): indicates the event of finding a new (non-empty) text s in - * the document. The new text is inserted within the text collection. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewText(unsigned char *s); - - /** NewEmptyText(): indicates the event of finding a new empty text in the - * document. In case of indexing empty and non-empty texts, we insert the - * empty texts into the text collection. In case of indexing only non-empty - * texts, it just indicates an empty text in the bit vector of empty texts. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewEmptyText(); + treeNode PrevNode(DocID d); /** GetTagId(tagname): returns the tag identifier corresponding to a given * tag name. Returns NULLT in case that the tag name does not exists. */ @@ -296,14 +356,11 @@ public: * Returns NULL in case that the tag identifier is not valid.*/ unsigned char *GetTagName(TagType tagid); - - // OJO /** GetTagName(tagid): returns the tag name of a given tag identifier. * The result is just a reference and should not be freed by the caller. */ const unsigned char *GetTagNameByRef(TagType tagid); - //OJO /** RegisterTag adds a new tag to the tag collection this is needed * if the query contains a tag which is not in the document, we need * to give this new tag a fresh id and store it somewhere. A logical @@ -313,8 +370,9 @@ public: TagType RegisterTag(unsigned char *tagname); bool EmptyText(DocID i) { - return Text->EmptyText(i); + return Text->EmptyText(i); } + /** Prefix(s): search for texts prefixed by string s. */ TextCollection::document_result Prefix(uchar const *s) { return Text->Prefix(s); @@ -326,7 +384,7 @@ public: } /** Equal(s): search for texts equal to string s. */ - TextCollection::document_result Equal(uchar const *s) { + TextCollection::document_result Equals(uchar const *s) { return Text->Equal(s); } @@ -368,6 +426,11 @@ public: bool IsLessThan(uchar const *s) { return Text->IsLessThan(s); } + + /** Count(s): Global counting */ + unsigned Count(uchar const *s) { + return Text->Count(s); + } /** CountPrefix(s): counting version of Prefix(s). */ unsigned CountPrefix(uchar const *s) { @@ -391,29 +454,54 @@ public: /** CountLessThan(s): counting version of LessThan(s). */ unsigned CountLessThan(uchar const *s) { - return CountLessThan(s); + return Text->CountLessThan(s); } /** GetText(d): returns the text corresponding to document with * id d. */ uchar* GetText(DocID d) { - return Text->GetText(d); + + uchar * s = Text->GetText(d); + return (s[0] == 1 ? (s+1) : s); } - uchar* GetCachedText(DocID d) { - uchar * str = (uchar*) calloc(sizeof(char),(CachedText.at(d).size() + 1)); - strcpy((char*) str,(const char*) CachedText.at(d).c_str()); - return (uchar*) (str); - } - + /** GetText(i, j): returns the texts corresponding to documents with + * ids i, i+1, ..., j. Texts are separated by '\0' character. */ + // uchar* GetText(DocID i, DocID j) { + // uchar * s = Text->GetText(i, j); + // return (s[0] == 1 ? (uchar*)"" : s); + //} + TextCollection *getTextCollection() { return Text; } + /** Save: saves XML tree data structure to file. */ - void Save(unsigned char *filename); + void Save(int fd); /** Load: loads XML tree data structure from file. sample_rate_text * indicates the sample rate for the text search data structure. */ - static XMLTree *Load(unsigned char *filename, int sample_rate_text); + static XMLTree *Load(int fd,bool load_tc, int sample_factor); + + void insertTag(TagType tag, uint position); + + void print_stats(); + + + /** Parenthesis functions */ + treeNode Closing(treeNode x); + + bool IsOpen(treeNode x); + + + /** Print procedure */ + void Print(int fd,treeNode x, bool no_text); + void Print(int fd,treeNode x) { Print(fd,x,false); } + }; + + + + #endif +