Initial commit
[SXSI/xpathcomp.git] / XMLDocShredder.cpp
1 /**********************************************************
2  * XMLDocShredder.cpp
3  * ---------------------
4  * Implementation of the class that receives events from the XML parser and 
5  * invokes corresponding construction methods of the storage interface.
6  * 
7  * Author: Greg Leighton
8  * Date: 02/11/08
9  * Changes:
10  *              05/11/08 -- Fixed bug related to parsing empty elements
11  *                               -- Set parser properties to automatically resolve
12  *                                      entity references and load external DTD if present
13  *                               -- Modified processEndDocument() by adding a nodeFinished()
14  *                                      call to the storage interface to close off the 
15  *                                      document node
16  *
17  */
18
19 #include <iostream>
20 #include "XMLDocShredder.h"
21 #include "OCamlStorageInterface.h"
22 #include <libxml++/exceptions/parse_error.h>
23 #include "Utils.h"
24
25 using namespace Glib;
26
27 void XMLDocShredder::setProperties(){
28   /* instruct the parser to expand entity references and report as 
29    * regular PCDATA
30    */ 
31   reader_->set_parser_property(
32                                TextReader::SubstEntities, true);
33                 
34   /* instruct parser to read external DTD, if present.  This is 
35          * needed to obtain any entity definitions in the DTD
36          */
37   reader_->set_parser_property(
38                                TextReader::LoadDtd, true);
39   
40   
41   /* 
42    */
43   reader_->set_parser_property(
44                                TextReader::DefaultAttrs, true);
45   
46
47   /* but we don't want to do validation since it would slow us down
48    */
49
50
51   reader_->set_parser_property(
52                                TextReader::Validate, false);
53   
54 }
55 XMLDocShredder::XMLDocShredder(const unsigned char * data,
56                                TextReader::size_type size)                      
57 {
58   reader_ = new TextReader(data,size,"");
59   setProperties();
60   storageIfc_ = new OCamlStorageInterface();
61   //tagsID_ = new unordered_map<int,string>(107);
62   //idTags_ = new unordered_map<string,int>(107);
63 }
64
65 XMLDocShredder::XMLDocShredder(const string inFileName)
66 {
67   reader_ = new TextReader(inFileName);
68   setProperties();
69   storageIfc_ = new OCamlStorageInterface();
70   //  tagsID_ = new unordered_map<int,string>(107);
71   // idTags_ = new unordered_map<string,int>(107);
72 }
73
74 XMLDocShredder::~XMLDocShredder()
75 {
76         delete reader_;
77         delete storageIfc_;
78
79 }
80
81 int XMLDocShredder::tagID(string name)
82 {
83   int res = tagsID_[name];
84   return  res;
85 }
86 string XMLDocShredder::idTag(int id)
87 {
88
89   return  idTags_[id];
90 }
91
92
93 void XMLDocShredder::processStartElement()
94 {
95         // fetch element name; this will be the full qualified name
96         ustring name = reader_->get_name();
97         bool empty = false;
98
99         storageIfc_->newChild(name);
100
101         /* We must be really carefull here. calling process attributes moves
102            the document pointer on the last attribute, hence calling reader_->is_empty
103            afterwards will yield the wrong result. It is better to call it while we are
104            on the element and generate a nodeFinished() call at the end */
105         empty = reader_->is_empty_element();
106
107
108         // now, process attributes
109         if (reader_->has_attributes())
110           {
111             processAttributes();
112           };
113
114         
115         if (empty){
116           DPRINT("Node " << name <<" is empty!\n")
117             storageIfc_->nodeFinished();
118         };
119
120
121
122
123
124 }
125
126 void XMLDocShredder::processEndElement()
127 {
128         // tell the storage interface that the current node has been completely processed
129         storageIfc_->nodeFinished();
130 }
131
132 void XMLDocShredder::processPCDATA()
133 {
134         // send the content of this PCDATA node to the storage interface as a text node
135         if (reader_->has_value())
136         {         
137           storageIfc_->newChild("<$>");
138           storageIfc_->newText(reader_->get_value());
139         }
140 }
141
142 void XMLDocShredder::processAttributes()
143 {
144         reader_->move_to_first_attribute();
145                 
146         string nspaceStr = "xmlns";     
147         storageIfc_->newChild("<@>");
148         do
149         {
150                 ustring name = reader_->get_name();
151                 ustring value = reader_->get_value();
152                 
153                 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
154                 * so we have to extract it and build a namespace uri node out of it before
155                 * passing to the storage interface */
156                 
157                 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
158                 {
159                         storageIfc_->newChild(":" + value);
160                         storageIfc_->nodeFinished();    
161                 }
162                 
163                 /* otherwise, this is an ordinary attribute, so we construct a new child node of the 
164                  * parent element to store the attribute name, possessing a child text node storing the 
165                  * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
166                  */
167                  
168                 else
169                 {
170                         storageIfc_->newChild(name);
171                         storageIfc_->newChild("<$>");
172                         storageIfc_->newText(value);
173                         storageIfc_->nodeFinished();
174                         //                      storageIfc_->nodeFinished();
175                 }
176         }
177         while (reader_->move_to_next_attribute());
178         storageIfc_->nodeFinished();
179 }
180
181 void XMLDocShredder::processSignificantWhitespace()
182 {
183         ustring value = reader_->get_value();
184         
185         // each significant whitespace sequence constructs a text node
186         storageIfc_->newChild("<$>");
187         storageIfc_->newText(value);
188         //storageIfc_->nodeFinished();
189         
190 }
191
192 void XMLDocShredder::processStartDocument(const string docName)
193 {
194   // tell storage interface to construct the document name
195   //  storageIfc_->newChild("");  
196 }
197
198 void XMLDocShredder::processEndDocument()
199 {
200         /* tell the storage interface that document parsing has finished, and structures
201          * can now be written to disk. */
202   //  storageIfc_->nodeFinished();
203   storageIfc_->parsingFinished();       
204 }
205
206 void XMLDocShredder::processComment()
207 {
208   //storageIfc_->newChild("!" + reader_->get_value());
209   //storageIfc_->nodeFinished();
210 }
211
212 void XMLDocShredder::processProcessingInstruction()
213 {
214         ustring name = reader_->get_name();
215         ustring value = reader_->get_value();   
216         
217         /* Create a child node to store the target of the PI, append a text node to it to store 
218          * the PI data, send to the storage interface.  Close off the PI node with a call to
219          * nodeFinished
220          */
221         
222         // storageIfc_->newChild("?" + name);
223         // storageIfc_->newText(value);
224         // storageIfc_->nodeFinished();
225 }
226
227 void XMLDocShredder::processDocTypeDeclaration()
228 {
229         /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton 
230          * in case we do want to process it in the future.
231         */
232 }
233
234 void XMLDocShredder::processCDATASection()
235 {
236         /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
237          * model.  Instead, we simply pass the converted text value to the storage interface as 
238          * a text node attached to the current context node.
239          */
240         ustring value = reader_->get_value();
241         storageIfc_->newChild("<$>");
242         storageIfc_->newText(value);
243         //      storageIfc_->nodeFinished();
244
245 }
246
247 void XMLDocShredder::processUnknownNodeType()
248 {
249         cout << "unknown token encountered during parsing" << endl;
250         throw xmlpp::parse_error("unknown token encountered during parsing");
251                 
252 }
253
254 void XMLDocShredder::parse()
255 {       
256         while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
257         {
258                 switch (reader_->get_node_type())
259                 {
260                         case TextReader::Element:
261                                 processStartElement();
262                                 break;
263                                 
264                         case TextReader::Text:
265                                 processPCDATA();
266                                 break;
267                                 
268                         case TextReader::EndElement:
269                                 processEndElement();
270                                 break;
271                                 
272                         case TextReader::SignificantWhitespace:
273                                 processSignificantWhitespace();
274                                 break;
275                                 
276                         case TextReader::Comment:
277                                 processComment();
278                                 break;
279                         
280                         case TextReader::DocumentType:
281                                 processDocTypeDeclaration();
282                                 break;
283                                 
284                         case TextReader::ProcessingInstruction:
285                                 processProcessingInstruction();
286                                 break;
287                         
288                         case TextReader::CDATA:
289                                 processCDATASection();
290                                 break;
291                         
292                         case TextReader::None:
293                                 processUnknownNodeType();
294                                 break;
295                                 
296                         default:
297                                 int type = reader_->get_node_type();
298                                 cout << "  Node type: " << type << endl;
299                                 break;  
300                         
301                 }
302         }                       
303 }