X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTreeBuilder.h;h=5253dd0c7bf902dca3d3bc2cf62e9fd9a5be80e0;hb=d79d6498e2d585560d915592ef59f3ad6a57b3c7;hp=38d81baf7a3849f477249203845ed6cec2f5b4b1;hpb=aa6692a9fd2badf8e8e686b92075f041dc03bbef;p=SXSI%2FXMLTree.git diff --git a/XMLTreeBuilder.h b/XMLTreeBuilder.h index 38d81ba..5253dd0 100644 --- a/XMLTreeBuilder.h +++ b/XMLTreeBuilder.h @@ -17,25 +17,19 @@ * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - ******************************************************************************/ + ******************************************************************************/ #ifndef XMLTREEBUILDER_H_ #define XMLTREEBUILDER_H_ -#include "TextCollection/TextCollectionBuilder.h" -#include -#include -#include - +#include #undef W #undef WW #undef Wminusone -#include "bp.h" + #include "XMLTree.h" -#include -#include -#include + using SXSI::TextCollection; using SXSI::TextCollectionBuilder; @@ -47,85 +41,76 @@ using SXSI::TextCollectionBuilder; #define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) + class XMLTreeBuilder { - + /** Array containing the balanced parentheses sequence */ pb *par_aux; int parArraySize; int npar; - /** Mapping from tag identifer to tag name */ - unsigned char **TagName; - int ntagnames; - + /** Mapping from tag identifer to tag name */ + std::vector *TagName; + TagIdMap * tIdMap; /** Array containing the sequence of tags */ TagType *tags_aux; - + /** The texts in the XML document */ TextCollectionBuilder *TextBuilder; TextCollection *Text; - - /** The texts in the XML document (cached for faster display) */ - vector CachedText; - /** boolean flag indicating whether we are indexing empty texts or not */ - bool indexing_empty_texts; - unsigned int *empty_texts_aux; + /** The texts in the XML document (cached for faster display) */ - // The TagName array should always contains two special tags - // <@> for attributes and <$> for PCDATA. - // <$> can never be in a document (since we handle the text differently) - // but <@> can be returned by the parser. This boolean is needed for the construction - // of the Tag bitmap to know if <@> must be taken into account or not - bool found_attributes; + std::vector *CachedText; + unsigned int *empty_texts_aux; + int eta_size; // Allows to disable the TextCollection for benchmarkin purposes bool disable_tc; - + TextCollectionBuilder::index_type_t text_index_type; public: XMLTreeBuilder() {;}; ~XMLTreeBuilder(); - - /** OpenDocument(empty_texts,sample_rate_text,dtc): initilizes the construction - * of the data structure for the XML document. Parameter empty_texts - * indicates whether we index empty texts in document or not. Parameter + + /** OpenDocument(sample_rate_text,dtc): initilizes the construction + * of the data structure for the XML document. Parameter * sample_rate_text indicates the sampling rate for the text searching data - * structures (small values get faster searching but a bigger space + * structures (small values get faster searching but a bigger space * requirement). dtc disable the use of the TextCollection * (i.e. everything is considered an empty text *) - * Returns a non-zero value upon success, NULLT in case of + * Returns a non-zero value upon success, NULLT in case of * error. */ - int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc); + int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc, + TextCollectionBuilder::index_type_t index_type); - /** CloseDocument(): finishes the construction of the data structure for - * the XML document. Tree and tags are represented in the final form, + /** CloseDocument(): finishes the construction of the data structure for + * the XML document. Tree and tags are represented in the final form, * dynamic data structures are made static, returning the resulting * XMLTree. After that, the XMLTree data structure can be queried. */ XMLTree *CloseDocument(); - /** NewOpenTag(tagname): indicates the event of finding a new opening tag - * in the document. Tag name is given. Returns a non-zero value upon + /** NewOpenTag(tagname): indicates the event of finding a new opening tag + * in the document. Tag name is given. Returns a non-zero value upon * success, and returns NULLT in case of error. */ - int NewOpenTag(unsigned char *tagname); - + int NewOpenTag(std::string tagname); + /** NewClosingTag(tagname): indicates the event of finding a new closing tag - * in the document. Tag name is given. Returns a non-zero value upon + * in the document. Tag name is given. Returns a non-zero value upon * success, and returns NULLT in case of error. */ - int NewClosingTag(unsigned char *tagname); - - /** NewText(s): indicates the event of finding a new (non-empty) text s in - * the document. The new text is inserted within the text collection. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewText(unsigned char *s); - - /** NewEmptyText(): indicates the event of finding a new empty text in the - * document. In case of indexing empty and non-empty texts, we insert the - * empty texts into the text collection. In case of indexing only non-empty - * texts, it just indicates an empty text in the bit vector of empty texts. - * Returns a non-zero value upon success, NULLT in case of error. */ - int NewEmptyText(); + int NewClosingTag(std::string tagname); + + /** NewText(s): indicates the event of finding a new text s in + * the document. The new text is inserted within the text collection. + * Returns a non-zero value upon success, NULLT in case of error. + * If the string is empty, which is legal in attributes, then + * the string the sequence '\0x01\0x00' is inserted in the TextCollection + * It is ok to do so since a non printable character cannot occur in an XML document + */ + int NewText(std::string text); + + }; #endif