.
[SXSI/xpathcomp.git] / XMLDocShredder.cpp
1 /**********************************************************
2  * XMLDocShredder.cpp
3  * ---------------------
4  * Implementation of the class that receives events from the XML parser and 
5  * invokes corresponding construction methods of the storage interface.
6  * 
7  * Author: Greg Leighton
8  * Date: 02/11/08
9  * Changes:
10  *              05/11/08 -- Fixed bug related to parsing empty elements
11  *                               -- Set parser properties to automatically resolve
12  *                                      entity references and load external DTD if present
13  *                               -- Modified processEndDocument() by adding a nodeFinished()
14  *                                      call to the storage interface to close off the 
15  *                                      document node
16  *
17  */
18
19 #include <iostream>
20 #include "XMLDocShredder.h"
21 #include "SXSIStorageInterface.h"
22 #include <libxml++/exceptions/parse_error.h>
23 #include "Utils.h"
24
25 using namespace Glib;
26
27 void XMLDocShredder::setProperties(){
28   /* instruct the parser to expand entity references and report as 
29    * regular PCDATA
30    */ 
31   reader_->set_parser_property(
32                                TextReader::SubstEntities, true);
33                 
34   /* instruct parser to read external DTD, if present.  This is 
35          * needed to obtain any entity definitions in the DTD
36          */
37   reader_->set_parser_property(
38                                TextReader::LoadDtd, true);
39   
40   
41   /* 
42    */
43   reader_->set_parser_property(
44                                TextReader::DefaultAttrs, true);
45   
46
47   /* but we don't want to do validation since it would slow us down
48    */
49
50
51   reader_->set_parser_property(
52                                TextReader::Validate, false);
53   
54 }
55 XMLDocShredder::XMLDocShredder(const unsigned char * data,
56                                TextReader::size_type size)                      
57 {
58   last_text = false;
59   reader_ = new TextReader(data,size,"");
60   setProperties();
61   storageIfc_ = new SXSIStorageInterface();
62   buffer = "";
63 }
64
65 XMLDocShredder::XMLDocShredder(const string inFileName)
66 {
67   last_text = false;
68   reader_ = new TextReader(inFileName);
69   setProperties();
70   storageIfc_ = new SXSIStorageInterface();
71   buffer = "";
72 }
73
74 XMLDocShredder::~XMLDocShredder()
75 {
76         delete reader_;
77         delete storageIfc_;
78
79 }
80
81
82 void XMLDocShredder::processStartElement()
83 {
84         // fetch element name; this will be the full qualified name
85         ustring name = reader_->get_name();
86         bool empty = false;
87         
88         storageIfc_->newText(buffer); //prevText
89         buffer.erase();
90         
91         storageIfc_->newChild(name);
92
93         /* We must be really carefull here. calling process attributes moves
94            the document pointer on the last attribute, hence calling reader_->is_empty
95            afterwards will yield the wrong result. It is better to call it while we are
96            on the element and generate a nodeFinished() call at the end */
97         empty = reader_->is_empty_element();
98
99
100         // now, process attributes
101         if (reader_->has_attributes())
102           {
103             processAttributes();
104           };
105
106         
107         if (empty){
108             storageIfc_->newText("");  //myText
109             storageIfc_->nodeFinished(name);       
110         };
111
112
113 }
114
115 void XMLDocShredder::processEndElement()
116 {
117   // tell the storage interface that the current node has been completely processed
118   storageIfc_->newText(buffer); //prevText
119   buffer.erase();
120   storageIfc_->nodeFinished(reader_->get_name());
121 }
122
123 void XMLDocShredder::processPCDATA()
124 {
125         // send the content of this PCDATA node to the storage interface as a text node
126          
127         if (reader_->has_value())
128         {
129           buffer += reader_->get_value();
130
131         };
132
133 }
134
135 void XMLDocShredder::processAttributes()
136 {
137         reader_->move_to_first_attribute();
138                 
139         string nspaceStr = "xmlns";
140         storageIfc_->newText(""); //prevText
141         storageIfc_->newChild("<@>");
142         do
143         {
144                 ustring name = reader_->get_name();
145                 ustring value = reader_->get_value();
146                 
147                 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
148                 * so we have to extract it and build a namespace uri node out of it before
149                 * passing to the storage interface */
150                 
151                 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
152                 {
153                         storageIfc_->newChild(":" + value);
154                         storageIfc_->nodeFinished(":" + value); 
155                 }
156                 
157                 /* otherwise, this is an ordinary attribute, so we construct a new child node of the 
158                  * parent element to store the attribute name, possessing a child text node storing the 
159                  * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
160                  */
161                  
162                 else
163                 {
164                   storageIfc_->newText(""); //prevText
165                   storageIfc_->newChild(name);
166                   storageIfc_->newText(value);
167                   storageIfc_->nodeFinished(name);
168                 }
169         }
170         while (reader_->move_to_next_attribute());
171         storageIfc_->newText(""); //nextText
172         storageIfc_->nodeFinished("<@>");
173 }
174
175 void XMLDocShredder::processSignificantWhitespace()
176 {
177   // each significant whitespace sequence constructs a text node
178   buffer += reader_->get_value();       
179         
180 }
181
182 void XMLDocShredder::processStartDocument(const string docName)
183 {
184   // tell storage interface to construct the document name
185   storageIfc_->newChild("");  
186   
187 }
188
189 void XMLDocShredder::processEndDocument()
190 {
191         /* tell the storage interface that document parsing has finished, and structures
192          * can now be written to disk. */
193   storageIfc_->newText("");
194   storageIfc_->nodeFinished("");
195   storageIfc_->parsingFinished();       
196 }
197
198 void XMLDocShredder::processComment()
199 {
200   //storageIfc_->newChild("!" + reader_->get_value());
201   //storageIfc_->nodeFinished();
202 }
203
204 void XMLDocShredder::processProcessingInstruction()
205 {
206         ustring name = reader_->get_name();
207         ustring value = reader_->get_value();   
208         
209         /* Create a child node to store the target of the PI, append a text node to it to store 
210          * the PI data, send to the storage interface.  Close off the PI node with a call to
211          * nodeFinished
212          */
213         
214         // storageIfc_->newChild("?" + name);
215         // storageIfc_->newText(value);
216         // storageIfc_->nodeFinished();
217 }
218
219 void XMLDocShredder::processDocTypeDeclaration()
220 {
221         /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton 
222          * in case we do want to process it in the future.
223         */
224 }
225
226 void XMLDocShredder::processCDATASection()
227 {
228         /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
229          * model.  Instead, we simply pass the converted text value to the storage interface as 
230          * a text node attached to the current context node.
231          */
232   
233         ustring value = reader_->get_value();
234         storageIfc_->newText(value);
235         last_text = true;
236
237 }
238
239 void XMLDocShredder::processUnknownNodeType()
240 {
241         cout << "unknown token encountered during parsing" << endl;
242         throw xmlpp::parse_error("unknown token encountered during parsing");
243                 
244 }
245
246 void XMLDocShredder::parse()
247 {       
248         while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
249         {
250                 switch (reader_->get_node_type())
251                 {
252                         case TextReader::Element:
253                                 processStartElement();
254                                 break;
255                                 
256                         case TextReader::Text:
257                                 processPCDATA();
258                                 break;
259                                 
260                         case TextReader::EndElement:
261                                 processEndElement();
262                                 break;
263                                 
264                         case TextReader::SignificantWhitespace:
265                                 processSignificantWhitespace();
266                                 break;
267                                 
268                         case TextReader::Comment:
269                                 processComment();
270                                 break;
271                         
272                         case TextReader::DocumentType:
273                                 processDocTypeDeclaration();
274                                 break;
275                                 
276                         case TextReader::ProcessingInstruction:
277                                 processProcessingInstruction();
278                                 break;
279                         
280                         case TextReader::CDATA:
281                                 processCDATASection();
282                                 break;
283                         
284                         case TextReader::None:
285                                 processUnknownNodeType();
286                                 break;
287                                 
288                         default:
289                                 int type = reader_->get_node_type();
290                                 cout << "  Node type: " << type << endl;
291                                 break;  
292                         
293                 }
294         }                       
295 }