Cleaning dead code
[SXSI/xpathcomp.git] / XMLDocShredder.cpp
1 /**********************************************************
2  * XMLDocShredder.cpp
3  * ---------------------
4  * Implementation of the class that receives events from the XML parser and 
5  * invokes corresponding construction methods of the storage interface.
6  * 
7  * Author: Greg Leighton
8  * Date: 02/11/08
9  * Changes:
10  *              05/11/08 -- Fixed bug related to parsing empty elements
11  *                               -- Set parser properties to automatically resolve
12  *                                      entity references and load external DTD if present
13  *                               -- Modified processEndDocument() by adding a nodeFinished()
14  *                                      call to the storage interface to close off the 
15  *                                      document node
16  *
17  */
18
19 #include <iostream>
20 #include "XMLDocShredder.h"
21 #include "SXSIStorageInterface.h"
22 #include <libxml++/exceptions/parse_error.h>
23 #include "Utils.h"
24
25 using namespace Glib;
26
27 void XMLDocShredder::setProperties(){
28   /* instruct the parser to expand entity references and report as 
29    * regular PCDATA
30    */ 
31   reader_->set_parser_property(
32                                TextReader::SubstEntities, true);
33                 
34   /* instruct parser to read external DTD, if present.  This is 
35          * needed to obtain any entity definitions in the DTD
36          */
37   reader_->set_parser_property(
38                                TextReader::LoadDtd, true);
39   
40   
41   /* 
42    */
43   reader_->set_parser_property(
44                                TextReader::DefaultAttrs, true);
45   
46
47   /* but we don't want to do validation since it would slow us down
48    */
49
50
51   reader_->set_parser_property(
52                                TextReader::Validate, false);
53   
54 }
55 XMLDocShredder::XMLDocShredder(const unsigned char * data,
56                                TextReader::size_type size,
57                                int sf, 
58                                bool iet, 
59                                bool dtc)                        
60 {
61   last_text = false;
62   reader_ = new TextReader(data,size,"");
63   setProperties();
64   storageIfc_ = new SXSIStorageInterface(sf,iet,dtc);
65   buffer = "";
66 }
67
68 XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc)
69 {
70   last_text = false;
71   reader_ = new TextReader(inFileName);
72   setProperties();
73   storageIfc_ = new SXSIStorageInterface(sf,iet,dtc);
74   buffer = "";
75 }
76
77 XMLDocShredder::~XMLDocShredder()
78 {
79         delete reader_;
80         delete storageIfc_;
81
82 }
83
84
85 void XMLDocShredder::processStartElement()
86 {
87         // fetch element name; this will be the full qualified name
88         ustring name = reader_->get_name();
89         bool empty = false;
90         
91         storageIfc_->newText(buffer); //prevText
92         buffer.erase();
93         
94         storageIfc_->newChild(name);
95
96         /* We must be really carefull here. calling process attributes moves
97            the document pointer on the last attribute, hence calling reader_->is_empty
98            afterwards will yield the wrong result. It is better to call it while we are
99            on the element and generate a nodeFinished() call at the end */
100         empty = reader_->is_empty_element();
101
102
103         // now, process attributes
104         if (reader_->has_attributes())
105           {
106             processAttributes();
107           };
108
109         
110         if (empty){
111             storageIfc_->newText("");  //myText
112             storageIfc_->nodeFinished(name);       
113         };
114
115
116 }
117
118 void XMLDocShredder::processEndElement()
119 {
120   // tell the storage interface that the current node has been completely processed
121   storageIfc_->newText(buffer); //prevText
122   buffer.erase();
123   storageIfc_->nodeFinished(reader_->get_name());
124 }
125
126 void XMLDocShredder::processPCDATA()
127 {
128         // send the content of this PCDATA node to the storage interface as a text node
129          
130         if (reader_->has_value())
131         {
132           buffer += reader_->get_value();
133         };
134
135 }
136
137 void XMLDocShredder::processAttributes()
138 {
139         reader_->move_to_first_attribute();
140                 
141         string nspaceStr = "xmlns";
142         storageIfc_->newText(""); //prevText
143         storageIfc_->newChild("<@>");
144         do
145         {
146                 ustring name = reader_->get_name();
147                 ustring value = reader_->get_value();
148                 
149                 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
150                 * so we have to extract it and build a namespace uri node out of it before
151                 * passing to the storage interface */
152                 
153                 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
154                 {
155                         storageIfc_->newChild(":" + value);
156                         storageIfc_->nodeFinished(":" + value); 
157                 }
158                 
159                 /* otherwise, this is an ordinary attribute, so we construct a new child node of the 
160                  * parent element to store the attribute name, possessing a child text node storing the 
161                  * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
162                  */
163                  
164                 else
165                 {
166                   storageIfc_->newText(""); //prevText
167                   storageIfc_->newChild(name);
168                   storageIfc_->newText(value);
169                   storageIfc_->nodeFinished(name);
170                 }
171         }
172         while (reader_->move_to_next_attribute());
173         storageIfc_->newText(""); //nextText
174         storageIfc_->nodeFinished("<@>");
175 }
176
177 void XMLDocShredder::processSignificantWhitespace()
178 {
179   // each significant whitespace sequence constructs a text node
180   buffer += reader_->get_value();       
181         
182 }
183
184 void XMLDocShredder::processStartDocument(const string docName)
185 {
186   // tell storage interface to construct the document name
187   storageIfc_->newChild("");  
188   
189 }
190
191 void XMLDocShredder::processEndDocument()
192 {
193         /* tell the storage interface that document parsing has finished, and structures
194          * can now be written to disk. */
195   storageIfc_->newText("");
196   storageIfc_->nodeFinished("");
197   storageIfc_->parsingFinished();       
198 }
199
200 void XMLDocShredder::processComment()
201 {
202   //storageIfc_->newChild("!" + reader_->get_value());
203   //storageIfc_->nodeFinished();
204 }
205
206 void XMLDocShredder::processProcessingInstruction()
207 {
208         ustring name = reader_->get_name();
209         ustring value = reader_->get_value();   
210         
211         /* Create a child node to store the target of the PI, append a text node to it to store 
212          * the PI data, send to the storage interface.  Close off the PI node with a call to
213          * nodeFinished
214          */
215         
216         // storageIfc_->newChild("?" + name);
217         // storageIfc_->newText(value);
218         // storageIfc_->nodeFinished();
219 }
220
221 void XMLDocShredder::processDocTypeDeclaration()
222 {
223         /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton 
224          * in case we do want to process it in the future.
225         */
226 }
227
228 void XMLDocShredder::processCDATASection()
229 {
230         /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
231          * model.  Instead, we simply pass the converted text value to the storage interface as 
232          * a text node attached to the current context node.
233          */
234   
235         ustring value = reader_->get_value();
236         storageIfc_->newText(value);
237         last_text = true;
238
239 }
240
241 void XMLDocShredder::processUnknownNodeType()
242 {
243         cout << "unknown token encountered during parsing" << endl;
244         throw xmlpp::parse_error("unknown token encountered during parsing");
245                 
246 }
247
248 void XMLDocShredder::parse()
249 {       
250         while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
251         {
252                 switch (reader_->get_node_type())
253                 {
254                         case TextReader::Element:
255                                 processStartElement();
256                                 break;
257                                 
258                         case TextReader::Text:
259                                 processPCDATA();
260                                 break;
261                                 
262                         case TextReader::EndElement:
263                                 processEndElement();
264                                 break;
265                                 
266                         case TextReader::SignificantWhitespace:
267                                 processSignificantWhitespace();
268                                 break;
269                                 
270                         case TextReader::Comment:
271                                 processComment();
272                                 break;
273                         
274                         case TextReader::DocumentType:
275                                 processDocTypeDeclaration();
276                                 break;
277                                 
278                         case TextReader::ProcessingInstruction:
279                                 processProcessingInstruction();
280                                 break;
281                         
282                         case TextReader::CDATA:
283                                 processCDATASection();
284                                 break;
285                         
286                         case TextReader::None:
287                                 processUnknownNodeType();
288                                 break;
289                                 
290                         default:
291                                 int type = reader_->get_node_type();
292                                 cout << "  Node type: " << type << endl;
293                                 break;  
294                         
295                 }
296         }                       
297 }