* along with this program; if not, write to the *\r
* Free Software Foundation, Inc., *\r
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *\r
- ******************************************************************************/ \r
+ ******************************************************************************/\r
\r
#ifndef XMLTREEBUILDER_H_\r
#define XMLTREEBUILDER_H_\r
-#include "TextCollection/TextCollectionBuilder.h"\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#include <cstring>\r
-\r
\r
+#include <TextCollection/TextCollectionBuilder.h>\r
#undef W\r
#undef WW\r
#undef Wminusone\r
\r
-#include "bp.h"\r
+\r
#include "XMLTree.h"\r
-#include <static_bitsequence.h>\r
-#include <alphabet_mapper.h>\r
-#include <static_sequence.h>\r
+\r
using SXSI::TextCollection;\r
using SXSI::TextCollectionBuilder;\r
\r
#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W)))\r
\r
\r
+\r
class XMLTreeBuilder {\r
- \r
+\r
/** Array containing the balanced parentheses sequence */\r
pb *par_aux;\r
int parArraySize;\r
int npar;\r
\r
- /** Mapping from tag identifer to tag name */ \r
- unsigned char **TagName;\r
- int ntagnames;\r
-\r
+ /** Mapping from tag identifier to tag name */\r
+ std::vector<std::string> *TagName;\r
+ TagIdMap * tIdMap;\r
/** Array containing the sequence of tags */\r
TagType *tags_aux;\r
- \r
+\r
/** The texts in the XML document */\r
TextCollectionBuilder *TextBuilder;\r
TextCollection *Text;\r
- \r
- /** The texts in the XML document (cached for faster display) */\r
- vector<string> CachedText;\r
\r
- /** boolean flag indicating whether we are indexing empty texts or not */\r
- bool indexing_empty_texts; \r
- unsigned int *empty_texts_aux;\r
+ /** The texts in the XML document (cached for faster display) */\r
\r
- // The TagName array should always contains two special tags\r
- // <@> for attributes and <$> for PCDATA.\r
- // <$> can never be in a document (since we handle the text differently)\r
- // but <@> can be returned by the parser. This boolean is needed for the construction\r
- // of the Tag bitmap to know if <@> must be taken into account or not\r
- bool found_attributes;\r
+ std::vector<std::string> *CachedText;\r
\r
+ unsigned int *empty_texts_aux;\r
+ int eta_size;\r
// Allows to disable the TextCollection for benchmarkin purposes\r
bool disable_tc;\r
-\r
+ TextCollectionBuilder::index_type_t text_index_type;\r
public:\r
\r
XMLTreeBuilder() {;};\r
\r
~XMLTreeBuilder();\r
- \r
- /** OpenDocument(empty_texts,sample_rate_text,dtc): initilizes the construction\r
- * of the data structure for the XML document. Parameter empty_texts \r
- * indicates whether we index empty texts in document or not. Parameter \r
+\r
+ /** OpenDocument(empty_texts,sample_rate_text,dtc,index_type): initializes the construction\r
+ * of the data structure for the XML document. Parameter\r
* sample_rate_text indicates the sampling rate for the text searching data\r
- * structures (small values get faster searching but a bigger space \r
+ * structures (small values get faster searching but a bigger space\r
* requirement). dtc disable the use of the TextCollection\r
* (i.e. everything is considered an empty text *)\r
- * Returns a non-zero value upon success, NULLT in case of \r
+ * Returns a non-zero value upon success, NULLT in case of\r
* error. */\r
- int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc);\r
+ int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc,\r
+ TextCollectionBuilder::index_type_t index_type);\r
\r
- /** CloseDocument(): finishes the construction of the data structure for \r
- * the XML document. Tree and tags are represented in the final form, \r
+ /** CloseDocument(): finishes the construction of the data structure for\r
+ * the XML document. Tree and tags are represented in the final form,\r
* dynamic data structures are made static, returning the resulting\r
* XMLTree. After that, the XMLTree data structure can be queried. */\r
XMLTree *CloseDocument();\r
\r
- /** NewOpenTag(tagname): indicates the event of finding a new opening tag \r
- * in the document. Tag name is given. Returns a non-zero value upon \r
+ /** NewOpenTag(tagname): indicates the event of finding a new opening tag\r
+ * in the document. Tag name is given. Returns a non-zero value upon\r
* success, and returns NULLT in case of error. */\r
- int NewOpenTag(unsigned char *tagname);\r
- \r
+ int NewOpenTag(std::string tagname);\r
+\r
/** NewClosingTag(tagname): indicates the event of finding a new closing tag\r
- * in the document. Tag name is given. Returns a non-zero value upon \r
+ * in the document. Tag name is given. Returns a non-zero value upon\r
* success, and returns NULLT in case of error. */\r
- int NewClosingTag(unsigned char *tagname);\r
- \r
- /** NewText(s): indicates the event of finding a new (non-empty) text s in \r
- * the document. The new text is inserted within the text collection. \r
- * Returns a non-zero value upon success, NULLT in case of error. */\r
- int NewText(unsigned char *s);\r
-\r
- /** NewEmptyText(): indicates the event of finding a new empty text in the \r
- * document. In case of indexing empty and non-empty texts, we insert the \r
- * empty texts into the text collection. In case of indexing only non-empty\r
- * texts, it just indicates an empty text in the bit vector of empty texts. \r
- * Returns a non-zero value upon success, NULLT in case of error. */\r
- int NewEmptyText();\r
+ int NewClosingTag(std::string tagname);\r
+\r
+ /** NewText(s): indicates the event of finding a new text s in\r
+ * the document. The new text is inserted within the text collection.\r
+ * Returns a non-zero value upon success, NULLT in case of error.\r
+ * If the string is empty, which is legal in attributes, then\r
+ * the sequence '\0x01\0x00' is inserted in the TextCollection instead.\r
+ * This is safe because a non-printable character cannot occur in an XML document.\r
+ */\r
+ int NewText(std::string text);\r
+\r
+\r
};\r
#endif\r
\r