Don't create the text collection during parsing but afterwards.

[SXSI/XMLTree.git] / XMLTreeBuilder.h
diff --git a/XMLTreeBuilder.h b/XMLTreeBuilder.h

index 38d81ba..5253dd0 100644 (file)
--- a/XMLTreeBuilder.h
+++ b/XMLTreeBuilder.h
@@ -17,25 +17,19 @@
   *   along with this program; if not, write to the                            *\r
   *   Free Software Foundation, Inc.,                                          *\r
   *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.                *\r
- ******************************************************************************/ \r
+ ******************************************************************************/\r
  \r
  #ifndef XMLTREEBUILDER_H_\r
  #define XMLTREEBUILDER_H_\r
-#include "TextCollection/TextCollectionBuilder.h"\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#include <cstring>\r
-\r
  \r
+#include <TextCollection/TextCollectionBuilder.h>\r
  #undef W\r
  #undef WW\r
  #undef Wminusone\r
  \r
-#include "bp.h"\r
+\r
  #include "XMLTree.h"\r
-#include <static_bitsequence.h>\r
-#include <alphabet_mapper.h>\r
-#include <static_sequence.h>\r
+\r
  using SXSI::TextCollection;\r
  using SXSI::TextCollectionBuilder;\r
  \r
@@ -47,85 +41,76 @@ using SXSI::TextCollectionBuilder;
  #define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W)))\r
  \r
  \r
+\r
  class XMLTreeBuilder {\r
-  \r
+\r
     /** Array containing the balanced parentheses sequence */\r
     pb *par_aux;\r
     int parArraySize;\r
     int npar;\r
  \r
-   /** Mapping from tag identifer to tag name */  \r
-   unsigned char **TagName;\r
-   int ntagnames;\r
-\r
+   /** Mapping from tag identifier to tag name */\r
+   std::vector<std::string> *TagName;\r
+   TagIdMap * tIdMap;\r
     /** Array containing the sequence of tags */\r
     TagType *tags_aux;\r
-   \r
+\r
     /** The texts in the XML document */\r
     TextCollectionBuilder *TextBuilder;\r
     TextCollection *Text;\r
-   \r
-   /** The texts in the XML document (cached for faster display) */\r
-   vector<string> CachedText;\r
  \r
-   /** boolean flag indicating whether we are indexing empty texts or not */\r
-   bool indexing_empty_texts; \r
-   unsigned int *empty_texts_aux;\r
+   /** The texts in the XML document (cached for faster display) */\r
  \r
-   // The TagName array should always contains two special tags\r
-   // <@> for attributes and <$> for PCDATA.\r
-   // <$> can never be in a document (since we handle the text differently)\r
-   // but <@> can be returned by the parser. This boolean is needed for the construction\r
-   // of the Tag bitmap to know if <@> must be taken into account or not\r
-   bool found_attributes;\r
+   std::vector<std::string> *CachedText;\r
  \r
+   unsigned int *empty_texts_aux;\r
+   int eta_size;\r
     // Allows to disable the TextCollection for benchmarkin purposes\r
     bool disable_tc;\r
-\r
+   TextCollectionBuilder::index_type_t text_index_type;\r
  public:\r
  \r
     XMLTreeBuilder() {;};\r
  \r
     ~XMLTreeBuilder();\r
-   \r
-   /** OpenDocument(empty_texts,sample_rate_text,dtc): initilizes the construction\r
-    * of the data structure for the XML document. Parameter empty_texts \r
-    * indicates whether we index empty texts in document or not. Parameter \r
+\r
+   /** OpenDocument(empty_texts,sample_rate_text,dtc,index_type): initializes the construction\r
+    * of the data structure for the XML document.  Parameter\r
      * sample_rate_text indicates the sampling rate for the text searching data\r
-    * structures (small values get faster searching but a bigger space \r
+    * structures (small values get faster searching but a bigger space\r
      * requirement). dtc disable the use of the TextCollection\r
      * (i.e. everything is considered an empty text *)\r
-    * Returns a non-zero value upon success, NULLT in case of \r
+    * Returns a non-zero value upon success, NULLT in case of\r
      * error. */\r
-   int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc);\r
+   int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc,\r
+                   TextCollectionBuilder::index_type_t index_type);\r
  \r
-   /** CloseDocument(): finishes the construction of the data structure for \r
-    * the XML document. Tree and tags are represented in the final form, \r
+   /** CloseDocument(): finishes the construction of the data structure for\r
+    * the XML document. Tree and tags are represented in the final form,\r
      * dynamic data structures are made static, returning the resulting\r
      * XMLTree. After that, the XMLTree data structure can be queried. */\r
     XMLTree *CloseDocument();\r
  \r
-   /** NewOpenTag(tagname): indicates the event of finding a new opening tag \r
-    * in the document. Tag name is given. Returns a non-zero value upon \r
+   /** NewOpenTag(tagname): indicates the event of finding a new opening tag\r
+    * in the document. Tag name is given. Returns a non-zero value upon\r
      * success, and returns NULLT in case of error. */\r
-   int NewOpenTag(unsigned char *tagname);\r
-   \r
+   int NewOpenTag(std::string tagname);\r
+\r
     /** NewClosingTag(tagname): indicates the event of finding a new closing tag\r
-    *  in the document. Tag name is given. Returns a non-zero value upon \r
+    *  in the document. Tag name is given. Returns a non-zero value upon\r
      *  success, and returns NULLT in case of error. */\r
-   int NewClosingTag(unsigned char *tagname);\r
- \r
-   /** NewText(s): indicates the event of finding a new (non-empty) text s in \r
-    * the document. The new text is inserted within the text collection. \r
-    * Returns a non-zero value upon success, NULLT in case of error. */\r
-   int NewText(unsigned char *s);\r
-\r
-   /** NewEmptyText(): indicates the event of finding a new empty text in the \r
-    * document. In case of indexing empty and non-empty texts, we insert the \r
-    * empty texts into the text collection. In case of indexing only non-empty\r
-    * texts, it just indicates an empty text in the bit vector of empty texts. \r
-    * Returns a non-zero value upon success, NULLT in case of error. */\r
-   int NewEmptyText();\r
+   int NewClosingTag(std::string tagname);\r
+\r
+   /** NewText(s): indicates the event of finding a new text s in\r
+    * the document. The new text is inserted within the text collection.\r
+    * Returns a non-zero value upon success, NULLT in case of error.\r
+    * If the string is empty, which is legal in attributes, then\r
+    * the byte sequence '\0x01\0x00' is inserted in the TextCollection instead.\r
+    * It is ok to do so since a non-printable character cannot occur in an XML document.\r
+    */\r
+   int NewText(std::string text);\r
+\r
+\r
  };\r
  #endif\r
  \r