X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTree.h;h=ea9b938ff80b7356f81ec89334006ca73ef697d4;hb=aa6692a9fd2badf8e8e686b92075f041dc03bbef;hp=1268cb6b015c369e0bfdc87f7425879d8b88cc47;hpb=5db16dd3e0bf609bc0fa84ee7d067f6bbc58013e;p=SXSI%2FXMLTree.git diff --git a/XMLTree.h b/XMLTree.h index 1268cb6..ea9b938 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -17,7 +17,7 @@ * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - ******************************************************************************/ + ******************************************************************************/ #ifndef XMLTREE_H_ #define XMLTREE_H_ @@ -26,13 +26,13 @@ #include #include -//KIM : OJO -//clash between TextCollection/Tools.h and libcds/includes/basics.h + #undef W #undef WW #undef Wminusone #include "bp.h" +//#include "basics.h" #include #include #include @@ -47,11 +47,6 @@ using SXSI::TextCollectionBuilder; #define PERM_SAMPLE 10 - // sets bit p in e -#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) - // cleans bit p in e -#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) - typedef int treeNode; typedef int TagType; @@ -62,13 +57,15 @@ typedef struct { int max; } range; +typedef struct nd { + uint position; + struct nd *next; +} ListNode; -//KIM : OJO -// I know this class implements the working draft that we have but the logics seem flawed here... -// We should have two classes. One XMLTreeBuilder and one XMLTree. -// XMLTreeBuilder would have OpenDocument, NewOpenTag,... and CloseDocument would return an XMLTree -// XMLTree would have only an initialized structure. If find it really ugly to check (!finished) or (!initialized) -// in every function (FirstChild....). +typedef struct { + ListNode *first; + ListNode *last; +} TagArrayEntry; class XMLTree { /** Balanced parentheses representation of the tree */ @@ -76,59 +73,36 @@ class XMLTree { /** Mapping from tag identifer to tag name */ unsigned char **TagName; + uint ntagnames; /** boolean flag indicating whether we are indexing empty texts or not */ bool indexing_empty_texts; /** Bit vector indicating with a 1 the positions of the non-empty texts. */ - static_bitsequence_rrr02 *EBVector; + static_bitsequence *EBVector; /** Tag sequence represented with a data structure for rank and select */ static_sequence *Tags; - uint * tags_fix; - uint tags_blen, tags_len; + uint * tags_fix; + uint tags_blen, tags_len; /** The texts in the XML document */ - TextCollectionBuilder *TextBuilder; TextCollection *Text; /** The texts in the XML document (cached for faster display) */ vector CachedText; - /** Flag indicating whether the whole data structure has been constructed or - * not. If the value is true, you cannot add more texts, nodes, etc. */ - bool finished; - - /** Flag indicating whether the construction of the data structure has been - * initialized or not (by calling method OpenDocument()). If this is true, - * you cannot insert new tags or texts. */ - bool initialized; - - /* the following components are used for construction purposes */ - pb *par_aux; - TagType *tags_aux; - int npar; - int parArraySize; - int ntagnames; - unsigned int *empty_texts_aux; - - // KIM : OJO - // I added those two. The TagName array should always contains two special tags - // <@> for attributes and <$> for PCDATA. - // <$> can never be in a document (since we handle the text differently) - // but <@> can be returned by the parser. This boolean is needed for the construction - // of the Tag bitmap to know if <@> must be taken into account or not - bool found_attributes; - - // KIM : OJO + TagArrayEntry *TagArray; + // Allows to disable the TextCollection for benchmarkin purposes bool disable_tc; public: - void print_stats(); + /** Data structure constructors */ + XMLTree() {;}; - /** Data structure constructor */ - XMLTree() {finished = false; initialized = false; Text = 0; TextBuilder = 0; }; + XMLTree(pb *par, uint npar, unsigned char **TN, uint ntagnames, uint *empty_texts_bmp, TagType *tags, + TextCollection *TC, vector CT, bool indexing_empty_t, bool dis_tc); /** Data structure destructor */ ~XMLTree(); @@ -154,9 +128,9 @@ public: /** IsChild(x,y): returns whether node x is parent of node y. */ bool IsChild(treeNode x, treeNode y); - /** IsChild(x,y): returns whether node x is the first child of its parent */ + /** IsFirstChild(x): returns whether node x is the first child of its parent. */ bool IsFirstChild(treeNode x); - + /** NumChildren(x): number of children of node x. Constant time with the * data structure of Sadakane. */ int NumChildren(treeNode x); @@ -194,30 +168,37 @@ public: * Very fast in BP. */ treeNode FirstChild(treeNode x); - /** LastChild(x): returns the last child of node x. - * Implemented by Kim naively. */ + /** LastChild(x): returns the last child of node x. */ treeNode LastChild(treeNode x); - + /** NextSibling(x): returns the next sibling of node x, assuming it * exists. */ - treeNode NextSibling(treeNode x); /** PrevSibling(x): returns the previous sibling of node x, assuming it * exists. */ treeNode PrevSibling(treeNode x); - /** TaggedChild(x,i,tag): returns the i-th child of node x tagged tag, or + /** TaggedChild(x,tag): returns the first child of node x tagged tag, or * NULLT if there is none. Because of the balanced-parentheses representation * of the tree, this operation is not supported efficiently, just iterating * among the children of node x until finding the desired child. */ - treeNode TaggedChild(treeNode x, int i, TagType tag); + treeNode TaggedChild(treeNode x, TagType tag); + treeNode SelectChild(treeNode x, TagType *tags, int ntags); + + /** TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or + * NULLT if there is none. */ + treeNode TaggedSibling(treeNode x, TagType tag); + + treeNode SelectSibling(treeNode x, TagType *tags, int ntags); + /** TaggedDesc(x,tag): returns the first node tagged tag with larger * preorder than x and within the subtree of x. Returns NULT if there * is none. */ treeNode TaggedDesc(treeNode x, TagType tag); + treeNode SelectDesc(treeNode x, TagType *tags, int ntags); treeNode TaggedBelow(treeNode x, TagType *childtags, unsigned int ctlen, TagType *desctags, unsigned int dtlen); @@ -245,7 +226,9 @@ public: treeNode TaggedFoll(treeNode x, TagType tag); treeNode TaggedFollBelow(treeNode x, TagType tag,treeNode root); - + + treeNode SelectFollBelow(treeNode x, TagType *tags, int ntags, treeNode ctx); + /** TaggedFollowingSibling(x,tag) */ treeNode TaggedFollowingSibling(treeNode x, TagType tag); @@ -275,48 +258,9 @@ public: /** ParentNode(d): returns the parent node of document identifier d. */ treeNode ParentNode(DocID d); + treeNode PrevNode(DocID d); - /** OpenDocument(empty_texts,sample_rate_text,dtc): initilizes the construction - * of the data structure for the XML document. Parameter empty_texts - * indicates whether we index empty texts in document or not. Parameter - * sample_rate_text indicates the sampling rate for the text searching data - * structures (small values get faster searching but a bigger space - * requirement). dtc disable the use of the TextCollection - * (i.e. everything is considered an empty text *) - * Returns a non-zero value upon success, NULLT in case of - * error. */ - - int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc); - - /** CloseDocument(): finishes the construction of the data structure for - * the XML document. Tree and tags are represented in the final form, - * dynamic data structures are made static, and the flag "finished" is set - * to true. After that, the data structure can be queried. */ - int CloseDocument(); - - /** NewOpenTag(tagname): indicates the event of finding a new opening tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewOpenTag(unsigned char *tagname); - - /** NewClosingTag(tagname): indicates the event of finding a new closing tag - * in the document. Tag name is given. Returns a non-zero value upon - * success, and returns NULLT in case of error. */ - int NewClosingTag(unsigned char *tagname); - - /** NewText(s): indicates the event of finding a new (non-empty) text s in - * the document. The new text is inserted within the text collection. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewText(unsigned char *s); - - /** NewEmptyText(): indicates the event of finding a new empty text in the - * document. In case of indexing empty and non-empty texts, we insert the - * empty texts into the text collection. In case of indexing only non-empty - * texts, it just indicates an empty text in the bit vector of empty texts. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewEmptyText(); - /** GetTagId(tagname): returns the tag identifier corresponding to a given * tag name. Returns NULLT in case that the tag name does not exists. */ TagType GetTagId(unsigned char *tagname); @@ -325,14 +269,11 @@ public: * Returns NULL in case that the tag identifier is not valid.*/ unsigned char *GetTagName(TagType tagid); - - // OJO /** GetTagName(tagid): returns the tag name of a given tag identifier. * The result is just a reference and should not be freed by the caller. */ const unsigned char *GetTagNameByRef(TagType tagid); - //OJO /** RegisterTag adds a new tag to the tag collection this is needed * if the query contains a tag which is not in the document, we need * to give this new tag a fresh id and store it somewhere. A logical @@ -344,6 +285,7 @@ public: bool EmptyText(DocID i) { return Text->EmptyText(i); } + /** Prefix(s): search for texts prefixed by string s. */ TextCollection::document_result Prefix(uchar const *s) { return Text->Prefix(s); @@ -425,7 +367,7 @@ public: /** CountLessThan(s): counting version of LessThan(s). */ unsigned CountLessThan(uchar const *s) { - return CountLessThan(s); + return Text->CountLessThan(s); } /** GetText(d): returns the text corresponding to document with @@ -443,11 +385,17 @@ public: TextCollection *getTextCollection() { return Text; } + /** Save: saves XML tree data structure to file. */ void Save(unsigned char *filename); /** Load: loads XML tree data structure from file. sample_rate_text * indicates the sample rate for the text search data structure. */ static XMLTree *Load(unsigned char *filename, int sample_rate_text); + + void insertTag(TagType tag, uint position); + + void print_stats(); }; #endif +