2 /******************************************************************************
\r
3 * Copyright (C) 2009 by Diego Arroyuelo *
\r
4 * Builder class for the in-memory XQuery/XPath engine *
\r
6 * This program is free software; you can redistribute it and/or modify *
\r
7 * it under the terms of the GNU Lesser General Public License as published *
\r
8 * by the Free Software Foundation; either version 2 of the License, or *
\r
9 * (at your option) any later version. *
\r
11 * This program is distributed in the hope that it will be useful, *
\r
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
\r
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
\r
14 * GNU Lesser General Public License for more details. *
\r
16 * You should have received a copy of the GNU Lesser General Public License *
\r
17 * along with this program; if not, write to the *
\r
18 * Free Software Foundation, Inc., *
\r
19 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
\r
20 ******************************************************************************/
\r
22 #ifndef XMLTREEBUILDER_H_
\r
23 #define XMLTREEBUILDER_H_
\r
24 #include "TextCollection/TextCollectionBuilder.h"
\r
35 #include "XMLTree.h"
\r
36 #include <static_bitsequence.h>
\r
37 #include <alphabet_mapper.h>
\r
38 #include <static_sequence.h>
\r
39 using SXSI::TextCollection;
\r
40 using SXSI::TextCollectionBuilder;
\r
// Bit-array helpers. `e` is an array of words and `p` a bit position; W is the
// number of bits per word (not defined in the visible part of this header --
// presumably supplied by an earlier include; confirm).
// NOTE: `p` is expanded twice in each macro, so it must not have side effects.
// The mask is built from an unsigned literal (1u) so that selecting the top bit
// of a 32-bit word does not shift a signed int into its sign bit.
// sets bit p in e
#define bitset(e,p) ((e)[(p)/W] |= (1u<<((p)%W)))
// clears bit p in e
#define bitclean(e,p) ((e)[(p)/W] &= ~(1u<<((p)%W)))
\r
50 class XMLTreeBuilder {
\r
52 /** Array containing the balanced parentheses sequence */
\r
57 /** Mapping from tag identifier to tag name */
\r
58 unsigned char **TagName;
\r
61 /** Array containing the sequence of tags */
\r
64 /** The texts in the XML document */
\r
65 TextCollectionBuilder *TextBuilder;
\r
66 TextCollection *Text;
\r
68 /** The texts in the XML document (cached for faster display) */
\r
69 vector<string> CachedText;
\r
71 /** boolean flag indicating whether we are indexing empty texts or not */
\r
72 bool indexing_empty_texts;
\r
73 unsigned int *empty_texts_aux;
\r
75 // The TagName array should always contain two special tags
\r
76 // <@> for attributes and <$> for PCDATA.
\r
77 // <$> can never be in a document (since we handle the text differently)
\r
78 // but <@> can be returned by the parser. This boolean is needed for the construction
\r
79 // of the Tag bitmap to know if <@> must be taken into account or not
\r
80 bool found_attributes;
\r
82 // Allows disabling the TextCollection for benchmarking purposes
\r
87 XMLTreeBuilder() {;};
\r
91 /** OpenDocument(empty_texts,sample_rate_text,dtc): initializes the construction
\r
92 * of the data structure for the XML document. Parameter empty_texts
\r
93 * indicates whether we index empty texts in document or not. Parameter
\r
94 * sample_rate_text indicates the sampling rate for the text searching data
\r
95 * structures (small values get faster searching but a bigger space
\r
96 * requirement). dtc disables the use of the TextCollection
\r
97 * (i.e. everything is considered an empty text).
\r
98 * Returns a non-zero value upon success, NULLT in case of error. */
\r
100 int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc);
\r
102 /** CloseDocument(): finishes the construction of the data structure for
\r
103 * the XML document. Tree and tags are represented in the final form,
\r
104 * dynamic data structures are made static, returning the resulting
\r
105 * XMLTree. After that, the XMLTree data structure can be queried. */
\r
106 XMLTree *CloseDocument();
\r
108 /** NewOpenTag(tagname): indicates the event of finding a new opening tag
\r
109 * in the document. Tag name is given. Returns a non-zero value upon
\r
110 * success, and returns NULLT in case of error. */
\r
111 int NewOpenTag(unsigned char *tagname);
\r
113 /** NewClosingTag(tagname): indicates the event of finding a new closing tag
\r
114 * in the document. Tag name is given. Returns a non-zero value upon
\r
115 * success, and returns NULLT in case of error. */
\r
116 int NewClosingTag(unsigned char *tagname);
\r
118 /** NewText(s): indicates the event of finding a new (non-empty) text s in
\r
119 * the document. The new text is inserted within the text collection.
\r
120 * Returns a non-zero value upon success, NULLT in case of error. */
\r
121 int NewText(unsigned char *s);
\r
123 /** NewEmptyText(): indicates the event of finding a new empty text in the
\r
124 * document. In case of indexing empty and non-empty texts, we insert the
\r
125 * empty texts into the text collection. In case of indexing only non-empty
\r
126 * texts, it just indicates an empty text in the bit vector of empty texts.
\r
127 * Returns a non-zero value upon success, NULLT in case of error. */
\r
128 int NewEmptyText();
\r