X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTreeBuilder.h;h=336ca2dfff33fe86765143f8fe028ff912422397;hb=235e3214904e390d2f101c5d5bf7def98745b132;hp=38d81baf7a3849f477249203845ed6cec2f5b4b1;hpb=aa6692a9fd2badf8e8e686b92075f041dc03bbef;p=SXSI%2FXMLTree.git diff --git a/XMLTreeBuilder.h b/XMLTreeBuilder.h index 38d81ba..336ca2d 100644 --- a/XMLTreeBuilder.h +++ b/XMLTreeBuilder.h @@ -21,6 +21,7 @@ #ifndef XMLTREEBUILDER_H_ #define XMLTREEBUILDER_H_ +#include #include "TextCollection/TextCollectionBuilder.h" #include #include @@ -31,8 +32,9 @@ #undef WW #undef Wminusone -#include "bp.h" + #include "XMLTree.h" +#include "bp.h" #include #include #include @@ -47,6 +49,7 @@ using SXSI::TextCollectionBuilder; #define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) + class XMLTreeBuilder { /** Array containing the balanced parentheses sequence */ @@ -55,9 +58,8 @@ class XMLTreeBuilder { int npar; /** Mapping from tag identifer to tag name */ - unsigned char **TagName; - int ntagnames; - + vector *TagName; + TagIdMap * tIdMap; /** Array containing the sequence of tags */ TagType *tags_aux; @@ -66,19 +68,11 @@ class XMLTreeBuilder { TextCollection *Text; /** The texts in the XML document (cached for faster display) */ - vector CachedText; - /** boolean flag indicating whether we are indexing empty texts or not */ - bool indexing_empty_texts; - unsigned int *empty_texts_aux; - - // The TagName array should always contains two special tags - // <@> for attributes and <$> for PCDATA. - // <$> can never be in a document (since we handle the text differently) - // but <@> can be returned by the parser. This boolean is needed for the construction - // of the Tag bitmap to know if <@> must be taken into account or not - bool found_attributes; + vector *CachedText; + unsigned int *empty_texts_aux; + int eta_size; // Allows to disable the TextCollection for benchmarkin purposes bool disable_tc; @@ -88,9 +82,8 @@ public: ~XMLTreeBuilder(); - /** OpenDocument(empty_texts,sample_rate_text,dtc): initilizes the construction - * of the data structure for the XML document. Parameter empty_texts - * indicates whether we index empty texts in document or not. Parameter + /** OpenDocument(sample_rate_text,dtc): initilizes the construction + * of the data structure for the XML document. Parameter * sample_rate_text indicates the sampling rate for the text searching data * structures (small values get faster searching but a bigger space * requirement). dtc disable the use of the TextCollection @@ -108,24 +101,23 @@ public: /** NewOpenTag(tagname): indicates the event of finding a new opening tag * in the document. Tag name is given. Returns a non-zero value upon * success, and returns NULLT in case of error. */ - int NewOpenTag(unsigned char *tagname); + int NewOpenTag(string tagname); /** NewClosingTag(tagname): indicates the event of finding a new closing tag * in the document. Tag name is given. Returns a non-zero value upon * success, and returns NULLT in case of error. */ - int NewClosingTag(unsigned char *tagname); + int NewClosingTag(string tagname); - /** NewText(s): indicates the event of finding a new (non-empty) text s in + /** NewText(s): indicates the event of finding a new text s in * the document. The new text is inserted within the text collection. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewText(unsigned char *s); - - /** NewEmptyText(): indicates the event of finding a new empty text in the - * document. In case of indexing empty and non-empty texts, we insert the - * empty texts into the text collection. In case of indexing only non-empty - * texts, it just indicates an empty text in the bit vector of empty texts. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewEmptyText(); + * Returns a non-zero value upon success, NULLT in case of error. + * If the string is empty, which is legal in attributes, then + * the string the sequence '\0x01\0x00' is inserted in the TextCollection + * It is ok to do so since a non printable character cannot occur in an XML document + */ + int NewText(string text); + + }; #endif