X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTree.h;h=6a722f9b3be8ca172e1efa154504b40add3ae2cc;hb=fdf29cca5a2df608373ffbeb3a4155876a55df2c;hp=cc5f0896bf5f7663b4cce79eefc300c45d889a28;hpb=add1a8a0b8a7ca05bb099ec38a60f3c52b9dfd71;p=SXSI%2FXMLTree.git diff --git a/XMLTree.h b/XMLTree.h index cc5f089..6a722f9 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -17,19 +17,29 @@ * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - ******************************************************************************/ + ******************************************************************************/ #ifndef XMLTREE_H_ #define XMLTREE_H_ -#include "TextCollection/TextCollection.h" +#include +#include +#include "TextCollection/TextCollectionBuilder.h" #include #include #include + + +#undef W +#undef WW +#undef Wminusone + #include "bp.h" + #include #include #include using SXSI::TextCollection; +using SXSI::TextCollectionBuilder; // this constant is used to efficiently compute the child operation in the tree @@ -39,11 +49,6 @@ using SXSI::TextCollection; #define PERM_SAMPLE 10 - // sets bit p in e -#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) - // cleans bit p in e -#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) - typedef int treeNode; typedef int TagType; @@ -54,49 +59,80 @@ typedef struct { int max; } range; +// Encoding of the XML Document : +// The following TAGs and IDs are fixed, "" is the tag of the root. +// a TextNode is represented by a leaf <<$>>> The DocId in the TextCollection +// of that leaf is kept in a bit sequence. +// a TextNode below an attribute is likewise represented by a leaf <<@$>><> +// An element ... the representation is: +// <<@>> <<@>a1> <<$@>>DocID(v1)>a1> ... > .... +// Hence the attributes (if any) are always below the first child of their element, +// as the children of a fake node <@>. + + +#define DOCUMENT_OPEN_TAG "" +#define DOCUMENT_TAG_ID 0 +#define ATTRIBUTE_OPEN_TAG "<@>" +#define ATTRIBUTE_TAG_ID 1 +#define PCDATA_OPEN_TAG "<$>" +#define PCDATA_TAG_ID 2 +#define ATTRIBUTE_DATA_OPEN_TAG "<@$>" +#define ATTRIBUTE_DATA_TAG_ID 3 +#define DOCUMENT_CLOSE_TAG "/" +#define ATTRIBUTE_CLOSE_TAG "/<@>" +#define PCDATA_CLOSE_TAG "/<$>" +#define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>" + + + +typedef std::unordered_map TagIdMap; +typedef TagIdMap::const_iterator TagIdMapIT; + +#define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\ + (v)->push_back(t); } while (false) + + +class XMLTreeBuilder; class XMLTree { + + // Only the builder can access the constructor + friend class XMLTreeBuilder; + + private: /** Balanced parentheses representation of the tree */ bp *Par; /** Mapping from tag identifer to tag name */ - unsigned char **TagName; + vector *TagName; + TagIdMap * tIdMap; - /** boolean flag indicating whether we are indexing empty texts or not */ - bool indexing_empty_texts; - /** Bit vector indicating with a 1 the positions of the non-empty texts. */ - static_bitsequence_rrr02 *EBVector; + static_bitsequence *EBVector; /** Tag sequence represented with a data structure for rank and select */ static_sequence *Tags; + uint * tags_fix; + uint tags_blen, tags_len; /** The texts in the XML document */ TextCollection *Text; - - /** Flag indicating whether the whole data structure has been constructed or - * not. If the value is true, you cannot add more texts, nodes, etc. */ - bool finished; - /** Flag indicating whether the construction of the data structure has been - * initialized or not (by calling method OpenDocument()). If this is true, - * you cannot insert new tags or texts. */ - bool initialized; - - /* the following components are used for construction purposes */ - pb *par_aux; - TagType *tags_aux; - int npar; - int parArraySize; - int ntagnames; - unsigned int *empty_texts_aux; - bool found_attributes; + /** The texts in the XML document (cached for faster display) */ + vector *CachedText; + + // Allows to disable the TextCollection for benchmarkin purposes + bool disable_tc; -public: - /** Data structure constructor */ - XMLTree() {finished = false; initialized = false;}; - + /** Data structure constructors */ + XMLTree(){;}; + + // non const pointer are freed by this method. + XMLTree( pb * const par, uint npar, vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, + TextCollection * const TC, vector * const CT, bool dis_tc); + +public: /** Data structure destructor */ ~XMLTree(); @@ -120,7 +156,10 @@ public: /** IsChild(x,y): returns whether node x is parent of node y. */ bool IsChild(treeNode x, treeNode y); - + + /** IsFirstChild(x): returns whether node x is the first child of its parent. */ + bool IsFirstChild(treeNode x); + /** NumChildren(x): number of children of node x. Constant time with the * data structure of Sadakane. */ int NumChildren(treeNode x); @@ -157,26 +196,42 @@ public: /** FirstChild(x): returns the first child of node x, assuming it exists. * Very fast in BP. */ treeNode FirstChild(treeNode x); + treeNode FirstElement(treeNode x); + + /** LastChild(x): returns the last child of node x. */ + treeNode LastChild(treeNode x); /** NextSibling(x): returns the next sibling of node x, assuming it * exists. */ treeNode NextSibling(treeNode x); + treeNode NextElement(treeNode x); /** PrevSibling(x): returns the previous sibling of node x, assuming it * exists. */ treeNode PrevSibling(treeNode x); - /** TaggedChild(x,i,tag): returns the i-th child of node x tagged tag, or + /** TaggedChild(x,tag): returns the first child of node x tagged tag, or * NULLT if there is none. Because of the balanced-parentheses representation * of the tree, this operation is not supported efficiently, just iterating * among the children of node x until finding the desired child. */ - treeNode TaggedChild(treeNode x, int i, TagType tag); + treeNode TaggedChild(treeNode x, TagType tag); + + treeNode SelectChild(treeNode x, std::unordered_set * tags); + + /** TaggedFollSibling(x,tag): returns the first sibling of node x tagged tag, or + * NULLT if there is none. */ + treeNode TaggedFollSibling(treeNode x, TagType tag); + treeNode SelectFollSibling(treeNode x, std::unordered_set * tags); + /** TaggedDesc(x,tag): returns the first node tagged tag with larger * preorder than x and within the subtree of x. Returns NULT if there * is none. */ treeNode TaggedDesc(treeNode x, TagType tag); + treeNode SelectDesc(treeNode x, std::unordered_set * tags); + + /** TaggedPrec(x,tag): returns the first node tagged tag with smaller * preorder than x and not an ancestor of x. Returns NULLT if there * is none. */ @@ -186,7 +241,18 @@ public: * preorder than x and not in the subtree of x. Returns NULLT if there * is none. */ treeNode TaggedFoll(treeNode x, TagType tag); + + treeNode TaggedFollBelow(treeNode x, TagType tag,treeNode root); + treeNode SelectFollBelow(treeNode x, std::unordered_set * tags, treeNode root); + + /** TaggedFollowingSibling(x,tag) */ + treeNode TaggedFollowingSibling(treeNode x, TagType tag); + + /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged + * tag. Return NULLT is there is none. */ + treeNode TaggedAncestor(treeNode x, TagType tag); + /** PrevText(x): returns the document identifier of the text to the left of * node x, or NULLT if x is the root node. */ DocID PrevText(treeNode x); @@ -209,43 +275,8 @@ public: /** ParentNode(d): returns the parent node of document identifier d. */ treeNode ParentNode(DocID d); - - /** OpenDocument(empty_texts,sample_rate_text): initilizes the construction - * of the data structure for the XML document. Parameter empty_texts - * indicates whether we index empty texts in document or not. Parameter - * sample_rate_text indicates the sampling rate for the text searching data - * structures (small values get faster searching but a bigger space - * requirement). Returns a non-zero value upon success, NULLT in case of - * error. */ - int OpenDocument(bool empty_texts, int sample_rate_text); - - /** CloseDocument(): finishes the construction of the data structure for - * the XML document. Tree and tags are represented in the final form, - * dynamic data structures are made static, and the flag "finished" is set - * to true. After that, the data structure can be queried. */ - int CloseDocument(); - - /** NewOpenTag(tagname): indicates the event of finding a new opening tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewOpenTag(unsigned char *tagname); - - /** NewClosingTag(tagname): indicates the event of finding a new closing tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewClosingTag(unsigned char *tagname); - - /** NewText(s): indicates the event of finding a new (non-empty) text s in - * the document. The new text is inserted within the text collection. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewText(unsigned char *s); - - /** NewEmptyText(): indicates the event of finding a new empty text in the - * document. In case of indexing empty and non-empty texts, we insert the - * empty texts into the text collection. In case of indexing only non-empty - * texts, it just indicates an empty text in the bit vector of empty texts. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewEmptyText(); + + treeNode PrevNode(DocID d); /** GetTagId(tagname): returns the tag identifier corresponding to a given * tag name. Returns NULLT in case that the tag name does not exists. */ @@ -255,12 +286,23 @@ public: * Returns NULL in case that the tag identifier is not valid.*/ unsigned char *GetTagName(TagType tagid); + /** GetTagName(tagid): returns the tag name of a given tag identifier. + * The result is just a reference and should not be freed by the caller. + */ + const unsigned char *GetTagNameByRef(TagType tagid); + /** RegisterTag adds a new tag to the tag collection this is needed + * if the query contains a tag which is not in the document, we need + * to give this new tag a fresh id and store it somewhere. A logical + * choice is here. + * We might want to use a hashtable instead of an array though. + */ TagType RegisterTag(unsigned char *tagname); bool EmptyText(DocID i) { - return Text->EmptyText(i); + return Text->EmptyText(i); } + /** Prefix(s): search for texts prefixed by string s. */ TextCollection::document_result Prefix(uchar const *s) { return Text->Prefix(s); @@ -314,6 +356,11 @@ public: bool IsLessThan(uchar const *s) { return Text->IsLessThan(s); } + + /** Count(s): Global counting */ + unsigned Count(uchar const *s) { + return Text->Count(s); + } /** CountPrefix(s): counting version of Prefix(s). */ unsigned CountPrefix(uchar const *s) { @@ -337,23 +384,43 @@ public: /** CountLessThan(s): counting version of LessThan(s). */ unsigned CountLessThan(uchar const *s) { - return CountLessThan(s); + return Text->CountLessThan(s); } /** GetText(d): returns the text corresponding to document with * id d. */ uchar* GetText(DocID d) { - return Text->GetText(d); + uchar * s = Text->GetText(d); + return (s[0] == 1 ? (uchar*)"" : s); + } + + /** GetText(i, j): returns the texts corresponding to documents with + * ids i, i+1, ..., j. Texts are separated by '\0' character. */ + uchar* GetText(DocID i, DocID j) { + uchar * s = Text->GetText(i, j); + return (s[0] == 1 ? (uchar*)"" : s); + } + + uchar* GetCachedText(DocID d) { + uchar * str = (uchar*) calloc(sizeof(char),(CachedText->at(d).size() + 1)); + strcpy((char*) str,(const char*) CachedText->at(d).c_str()); + return (uchar*) (str); } TextCollection *getTextCollection() { return Text; } + /** Save: saves XML tree data structure to file. */ - void Save(unsigned char *filename); + void Save(int fd); /** Load: loads XML tree data structure from file. sample_rate_text * indicates the sample rate for the text search data structure. */ - static XMLTree *Load(unsigned char *filename, int sample_rate_text); + static XMLTree *Load(int fd); + + void insertTag(TagType tag, uint position); + + void print_stats(); }; #endif +