Update HACKING file with new build instructions
[SXSI/xpathcomp.git] / src / XMLDocShredder.cpp
1 /**********************************************************
2  * XMLDocShredder.cpp
3  * ---------------------
4  * Implementation of the class that receives events from the XML parser and 
5  * invokes corresponding construction methods of the storage interface.
6  * 
7  * Author: Greg Leighton
8  * Date: 02/11/08
9  * Changes:
10  *              05/11/08 -- Fixed bug related to parsing empty elements
11  *                               -- Set parser properties to automatically resolve
12  *                                      entity references and load external DTD if present
13  *                               -- Modified processEndDocument() by adding a nodeFinished()
14  *                                      call to the storage interface to close off the 
15  *                                      document node
16  *
17  */
18
19 #include <iostream>
20 #include "XMLDocShredder.h"
21 #include <libxml++/exceptions/parse_error.h>
22 #include "Utils.h"
23
24 using namespace Glib;
25
26 void XMLDocShredder::doText(){
27
28   if (!buffer.empty()){
29     tb->NewOpenTag(PCDATA_OPEN_TAG);
30     tb->NewText(buffer);
31     tb->NewClosingTag(PCDATA_OPEN_TAG);
32   };
33   buffer.clear();
34
35 }
36
37 void XMLDocShredder::setProperties(){
38   /* instruct the parser to expand entity references and report as 
39    * regular PCDATA
40    */ 
41   reader_->set_parser_property(
42                                TextReader::SubstEntities, true);
43                 
44   /* instruct parser to read external DTD, if present.  This is 
45          * needed to obtain any entity definitions in the DTD
46          */
47   reader_->set_parser_property(
48                                TextReader::LoadDtd, true);
49   
50   
51   /* 
52    */
53   reader_->set_parser_property(
54                                TextReader::DefaultAttrs, true);
55   
56
57   /* but we don't want to do validation since it would slow us down
58    */
59
60
61   reader_->set_parser_property(
62                                TextReader::Validate, false);
63   
64 }
65 XMLDocShredder::XMLDocShredder(const unsigned char * data,
66                                TextReader::size_type size,
67                                int sf, 
68                                bool iet, 
69                                bool dtc,
70                                TextCollectionBuilder::index_type_t index_type
71                                )                        
72 {
73   tree = NULL;
74   reader_ = new TextReader(data,size,"");
75   setProperties();
76   tb  = new XMLTreeBuilder();
77   buffer.clear();
78   tb->OpenDocument(iet,sf,dtc, index_type);
79 }
80
81 XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc,
82                                TextCollectionBuilder::index_type_t index_type
83                                )
84 {
85   tree = NULL;
86   reader_ = new TextReader(inFileName);
87   setProperties();
88   tb = new XMLTreeBuilder();
89   buffer.clear();
90   tb->OpenDocument(iet,sf,dtc,index_type);
91 }
92
93 XMLDocShredder::~XMLDocShredder()
94 {
95         delete reader_;
96         reader_ = NULL;
97         delete tb;
98         tb = NULL;
99
100 }
101
102
103 void XMLDocShredder::processStartElement()
104 {
105   doText();
106   // fetch element name; this will be the full qualified name
107   ustring name = reader_->get_name();
108   bool empty = false;
109   size_t found = name.find_first_of(':');
110   if (found == ustring::npos)  
111     tb->NewOpenTag(name);
112   else
113     tb->NewOpenTag(name.substr(found+1,name.length() - found - 1));
114   
115   /* We must be really carefull here. calling process attributes moves
116      the document pointer on the last attribute, hence calling reader_->is_empty
117      afterwards will yield the wrong result. It is better to call it while we are
118      on the element and generate a nodeFinished() call at the end */
119   empty = reader_->is_empty_element();
120   
121   
122   // now, process attributes
123   if (reader_->has_attributes())
124     processAttributes();
125   
126   
127   if (empty)
128     tb->NewClosingTag(name);
129   
130   
131 }
132
133 void XMLDocShredder::processEndElement()
134 {
135   doText();
136   ustring name = reader_->get_name();
137   tb->NewClosingTag(name);
138 }
139
140 void XMLDocShredder::processPCDATA()
141 {
142   // send the content of this PCDATA node to the storage interface as a text node
143   if (reader_->has_value())
144     buffer += reader_->get_value();
145
146 }
147
148 void XMLDocShredder::processAttributes()
149 {
150         reader_->move_to_first_attribute();
151                 
152         string nspaceStr = "xmlns";
153         tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
154         do
155           {
156                 ustring name = reader_->get_name();
157                 ustring value = reader_->get_value();
158                 
159                 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
160                 * so we have to extract it and build a namespace uri node out of it before
161                 * passing to the storage interface */
162                 
163                 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
164                 {
165                   //TODO 
166                 }
167                 
168                 /* otherwise, this is an ordinary attribute, so we construct a new child node of the 
169                  * parent element to store the attribute name, possessing a child text node storing the 
170                  * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
171                  */
172                  
173                 else
174                 {
175                   string attname = "<@>"+name;
176                   tb->NewOpenTag(attname);
177                   tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
178                   tb->NewText(value);
179                   tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
180                   tb->NewClosingTag(attname);
181                 }
182         }
183         while (reader_->move_to_next_attribute());
184         tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
185 }
186
187 void XMLDocShredder::processSignificantWhitespace()
188 {
189   if (reader_->has_value())
190     buffer += reader_->get_value();
191
192 }
193
194 void XMLDocShredder::processStartDocument(const string docName)
195 {
196   // tell storage interface to construct the document name
197
198   tb->NewOpenTag(DOCUMENT_OPEN_TAG);
199   
200 }
201
202 void XMLDocShredder::processEndDocument()
203 {
204   doText();
205   /* tell the storage interface that document parsing has finished, and structures
206    * can now be written to disk. */
207   tb->NewClosingTag(DOCUMENT_OPEN_TAG);
208   tree = tb->CloseDocument();
209
210 }
211
212 void XMLDocShredder::processComment()
213 {
214   //storageIfc_->newChild("!" + reader_->get_value());
215   //storageIfc_->nodeFinished();
216 }
217
218 void XMLDocShredder::processProcessingInstruction()
219 {
220         ustring name = reader_->get_name();
221         ustring value = reader_->get_value();   
222         
223         /* Create a child node to store the target of the PI, append a text node to it to store 
224          * the PI data, send to the storage interface.  Close off the PI node with a call to
225          * nodeFinished
226          */
227         
228         // storageIfc_->newChild("?" + name);
229         // storageIfc_->newText(value);
230         // storageIfc_->nodeFinished();
231 }
232
233 void XMLDocShredder::processDocTypeDeclaration()
234 {
235         /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton 
236          * in case we do want to process it in the future.
237         */
238 }
239
240 void XMLDocShredder::processCDATASection()
241 {
242         /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
243          * model.  Instead, we simply pass the converted text value to the storage interface as 
244          * a text node attached to the current context node.
245          */
246   if (reader_->has_value())
247     buffer+= reader_->get_value();
248 }
249
250 void XMLDocShredder::processUnknownNodeType()
251 {
252         cout << "unknown token encountered during parsing" << endl;
253         throw xmlpp::parse_error("unknown token encountered during parsing");
254                 
255 }
256
257 void XMLDocShredder::parse()
258 {       
259         while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
260         {
261                 switch (reader_->get_node_type())
262                 {
263                         case TextReader::Element:
264                                 processStartElement();
265                                 break;
266                                 
267                         case TextReader::Text:
268                                 processPCDATA();
269                                 break;
270                                 
271                         case TextReader::EndElement:
272                                 processEndElement();
273                                 break;
274                                 
275                         case TextReader::SignificantWhitespace:
276                                 processSignificantWhitespace();
277                                 break;
278                                 
279                         case TextReader::Comment:
280                                 processComment();
281                                 break;
282                         
283                         case TextReader::DocumentType:
284                                 processDocTypeDeclaration();
285                                 break;
286                                 
287                         case TextReader::ProcessingInstruction:
288                                 processProcessingInstruction();
289                                 break;
290                         
291                         case TextReader::CDATA:
292                                 processCDATASection();
293                                 break;
294                         
295                         case TextReader::None:
296                                 processUnknownNodeType();
297                                 break;
298                                 
299                         default:
300                                 int type = reader_->get_node_type();
301                                 cout << "  Node type: " << type << endl;
302                                 break;  
303                         
304                 }
305         }                       
306 }