X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLTreeBuilder.h;fp=XMLTreeBuilder.h;h=38d81baf7a3849f477249203845ed6cec2f5b4b1;hb=aa6692a9fd2badf8e8e686b92075f041dc03bbef;hp=0000000000000000000000000000000000000000;hpb=5db16dd3e0bf609bc0fa84ee7d067f6bbc58013e;p=SXSI%2FXMLTree.git diff --git a/XMLTreeBuilder.h b/XMLTreeBuilder.h new file mode 100644 index 0000000..38d81ba --- /dev/null +++ b/XMLTreeBuilder.h @@ -0,0 +1,131 @@ + +/****************************************************************************** + * Copyright (C) 2009 by Diego Arroyuelo * + * Builder class for the in-memory XQuery/XPath engine * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +#ifndef XMLTREEBUILDER_H_ +#define XMLTREEBUILDER_H_ +#include "TextCollection/TextCollectionBuilder.h" +#include +#include +#include + + +#undef W +#undef WW +#undef Wminusone + +#include "bp.h" +#include "XMLTree.h" +#include +#include +#include +using SXSI::TextCollection; +using SXSI::TextCollectionBuilder; + +#define NULLT -1 + + // sets bit p in e +#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) + // cleans bit p in e +#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) + + +class XMLTreeBuilder { + + /** Array containing the balanced parentheses sequence */ + pb *par_aux; + int parArraySize; + int npar; + + /** Mapping from tag identifer to tag name */ + unsigned char **TagName; + int ntagnames; + + /** Array containing the sequence of tags */ + TagType *tags_aux; + + /** The texts in the XML document */ + TextCollectionBuilder *TextBuilder; + TextCollection *Text; + + /** The texts in the XML document (cached for faster display) */ + vector CachedText; + + /** boolean flag indicating whether we are indexing empty texts or not */ + bool indexing_empty_texts; + unsigned int *empty_texts_aux; + + // The TagName array should always contains two special tags + // <@> for attributes and <$> for PCDATA. + // <$> can never be in a document (since we handle the text differently) + // but <@> can be returned by the parser. This boolean is needed for the construction + // of the Tag bitmap to know if <@> must be taken into account or not + bool found_attributes; + + // Allows to disable the TextCollection for benchmarkin purposes + bool disable_tc; + +public: + + XMLTreeBuilder() {;}; + + ~XMLTreeBuilder(); + + /** OpenDocument(empty_texts,sample_rate_text,dtc): initilizes the construction + * of the data structure for the XML document. Parameter empty_texts + * indicates whether we index empty texts in document or not. Parameter + * sample_rate_text indicates the sampling rate for the text searching data + * structures (small values get faster searching but a bigger space + * requirement). dtc disable the use of the TextCollection + * (i.e. everything is considered an empty text *) + * Returns a non-zero value upon success, NULLT in case of + * error. */ + int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc); + + /** CloseDocument(): finishes the construction of the data structure for + * the XML document. Tree and tags are represented in the final form, + * dynamic data structures are made static, returning the resulting + * XMLTree. After that, the XMLTree data structure can be queried. */ + XMLTree *CloseDocument(); + + /** NewOpenTag(tagname): indicates the event of finding a new opening tag + * in the document. Tag name is given. Returns a non-zero value upon + * success, and returns NULLT in case of error. */ + int NewOpenTag(unsigned char *tagname); + + /** NewClosingTag(tagname): indicates the event of finding a new closing tag + * in the document. Tag name is given. Returns a non-zero value upon + * success, and returns NULLT in case of error. */ + int NewClosingTag(unsigned char *tagname); + + /** NewText(s): indicates the event of finding a new (non-empty) text s in + * the document. The new text is inserted within the text collection. + * Returns a non-zero value upon success, NULLT in case of error. */ + int NewText(unsigned char *s); + + /** NewEmptyText(): indicates the event of finding a new empty text in the + * document. In case of indexing empty and non-empty texts, we insert the + * empty texts into the text collection. In case of indexing only non-empty + * texts, it just indicates an empty text in the bit vector of empty texts. + * Returns a non-zero value upon success, NULLT in case of error. */ + int NewEmptyText(); +}; +#endif +