3daaf45dfbbcdc4a53cef508465a013e42d1d29f
[SXSI/xpathcomp.git] / XMLDocShredder.cpp
1 /**********************************************************
2  * XMLDocShredder.cpp
3  * ---------------------
4  * Implementation of the class that receives events from the XML parser and 
5  * invokes corresponding construction methods of the storage interface.
6  * 
7  * Author: Greg Leighton
8  * Date: 02/11/08
9  * Changes:
10  *              05/11/08 -- Fixed bug related to parsing empty elements
11  *                               -- Set parser properties to automatically resolve
12  *                                      entity references and load external DTD if present
13  *                               -- Modified processEndDocument() by adding a nodeFinished()
14  *                                      call to the storage interface to close off the 
15  *                                      document node
16  *
17  */
18
19 #include <iostream>
20 #include "XMLDocShredder.h"
21 #include "SXSIStorageInterface.h"
22 #include <libxml++/exceptions/parse_error.h>
23 #include "Utils.h"
24
25 using namespace Glib;
26
27 void XMLDocShredder::setProperties(){
28   /* instruct the parser to expand entity references and report as 
29    * regular PCDATA
30    */ 
31   reader_->set_parser_property(
32                                TextReader::SubstEntities, true);
33                 
34   /* instruct parser to read external DTD, if present.  This is 
35          * needed to obtain any entity definitions in the DTD
36          */
37   reader_->set_parser_property(
38                                TextReader::LoadDtd, true);
39   
40   
41   /* 
42    */
43   reader_->set_parser_property(
44                                TextReader::DefaultAttrs, true);
45   
46
47   /* but we don't want to do validation since it would slow us down
48    */
49
50
51   reader_->set_parser_property(
52                                TextReader::Validate, false);
53   
54 }
55 XMLDocShredder::XMLDocShredder(const unsigned char * data,
56                                TextReader::size_type size)                      
57 {
58   last_text = false;
59   reader_ = new TextReader(data,size,"");
60   setProperties();
61   storageIfc_ = new SXSIStorageInterface();
62 }
63
64 XMLDocShredder::XMLDocShredder(const string inFileName)
65 {
66   last_text = false;
67   reader_ = new TextReader(inFileName);
68   setProperties();
69   storageIfc_ = new SXSIStorageInterface();
70
71 }
72
73 XMLDocShredder::~XMLDocShredder()
74 {
75         delete reader_;
76         delete storageIfc_;
77
78 }
79
80
81 void XMLDocShredder::processStartElement()
82 {
83         // fetch element name; this will be the full qualified name
84         ustring name = reader_->get_name();
85         bool empty = false;
86         
87         if (!last_text)
88           storageIfc_->newText(""); //prevText
89         last_text = false;
90
91         storageIfc_->newChild(name);
92
93         /* We must be really carefull here. calling process attributes moves
94            the document pointer on the last attribute, hence calling reader_->is_empty
95            afterwards will yield the wrong result. It is better to call it while we are
96            on the element and generate a nodeFinished() call at the end */
97         empty = reader_->is_empty_element();
98
99
100         // now, process attributes
101         if (reader_->has_attributes())
102           {
103             processAttributes();
104           };
105
106         
107         if (empty){
108             storageIfc_->newText("");  //myText
109             storageIfc_->nodeFinished(name);       
110         };
111
112
113
114
115
116 }
117
118 void XMLDocShredder::processEndElement()
119 {
120   // tell the storage interface that the current node has been completely processed
121   if (!last_text)
122     storageIfc_->newText(""); //nextText of previous node
123   last_text = false;
124   storageIfc_->nodeFinished(reader_->get_name());
125 }
126
127 void XMLDocShredder::processPCDATA()
128 {
129         // send the content of this PCDATA node to the storage interface as a text node
130          
131         if (reader_->has_value())
132         {         
133           storageIfc_->newText(reader_->get_value());
134           last_text = true;
135         }
136         else 
137           storageIfc_->newText("");
138 }
139
140 void XMLDocShredder::processAttributes()
141 {
142         reader_->move_to_first_attribute();
143                 
144         string nspaceStr = "xmlns";     
145         storageIfc_->newChild("<@>");
146         do
147         {
148                 ustring name = reader_->get_name();
149                 ustring value = reader_->get_value();
150                 
151                 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
152                 * so we have to extract it and build a namespace uri node out of it before
153                 * passing to the storage interface */
154                 
155                 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
156                 {
157                         storageIfc_->newChild(":" + value);
158                         storageIfc_->nodeFinished(":" + value); 
159                 }
160                 
161                 /* otherwise, this is an ordinary attribute, so we construct a new child node of the 
162                  * parent element to store the attribute name, possessing a child text node storing the 
163                  * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
164                  */
165                  
166                 else
167                 {
168                   storageIfc_->newText(""); //prevText
169                   storageIfc_->newChild(name);
170                   storageIfc_->newText(value);
171                   storageIfc_->nodeFinished(name);
172                 }
173         }
174         while (reader_->move_to_next_attribute());
175         storageIfc_->newText(""); //nextText
176         storageIfc_->nodeFinished("<@>");
177 }
178
179 void XMLDocShredder::processSignificantWhitespace()
180 {
181         ustring value = reader_->get_value();   
182         // each significant whitespace sequence constructs a text node
183         storageIfc_->newText(value);
184         
185 }
186
187 void XMLDocShredder::processStartDocument(const string docName)
188 {
189   // tell storage interface to construct the document name
190   storageIfc_->newChild("");  
191   
192 }
193
194 void XMLDocShredder::processEndDocument()
195 {
196         /* tell the storage interface that document parsing has finished, and structures
197          * can now be written to disk. */
198   storageIfc_->newText("");
199   storageIfc_->nodeFinished("");
200   storageIfc_->parsingFinished();       
201 }
202
203 void XMLDocShredder::processComment()
204 {
205   //storageIfc_->newChild("!" + reader_->get_value());
206   //storageIfc_->nodeFinished();
207 }
208
209 void XMLDocShredder::processProcessingInstruction()
210 {
211         ustring name = reader_->get_name();
212         ustring value = reader_->get_value();   
213         
214         /* Create a child node to store the target of the PI, append a text node to it to store 
215          * the PI data, send to the storage interface.  Close off the PI node with a call to
216          * nodeFinished
217          */
218         
219         // storageIfc_->newChild("?" + name);
220         // storageIfc_->newText(value);
221         // storageIfc_->nodeFinished();
222 }
223
224 void XMLDocShredder::processDocTypeDeclaration()
225 {
226         /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton 
227          * in case we do want to process it in the future.
228         */
229 }
230
231 void XMLDocShredder::processCDATASection()
232 {
233         /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
234          * model.  Instead, we simply pass the converted text value to the storage interface as 
235          * a text node attached to the current context node.
236          */
237   
238         ustring value = reader_->get_value();
239         storageIfc_->newText(value);
240         last_text = true;
241
242 }
243
244 void XMLDocShredder::processUnknownNodeType()
245 {
246         cout << "unknown token encountered during parsing" << endl;
247         throw xmlpp::parse_error("unknown token encountered during parsing");
248                 
249 }
250
251 void XMLDocShredder::parse()
252 {       
253         while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
254         {
255                 switch (reader_->get_node_type())
256                 {
257                         case TextReader::Element:
258                                 processStartElement();
259                                 break;
260                                 
261                         case TextReader::Text:
262                                 processPCDATA();
263                                 break;
264                                 
265                         case TextReader::EndElement:
266                                 processEndElement();
267                                 break;
268                                 
269                         case TextReader::SignificantWhitespace:
270                                 processSignificantWhitespace();
271                                 break;
272                                 
273                         case TextReader::Comment:
274                                 processComment();
275                                 break;
276                         
277                         case TextReader::DocumentType:
278                                 processDocTypeDeclaration();
279                                 break;
280                                 
281                         case TextReader::ProcessingInstruction:
282                                 processProcessingInstruction();
283                                 break;
284                         
285                         case TextReader::CDATA:
286                                 processCDATASection();
287                                 break;
288                         
289                         case TextReader::None:
290                                 processUnknownNodeType();
291                                 break;
292                                 
293                         default:
294                                 int type = reader_->get_node_type();
295                                 cout << "  Node type: " << type << endl;
296                                 break;  
297                         
298                 }
299         }                       
300 }