1 /**********************************************************
3 * ---------------------
4 * Implementation of the class that receives events from the XML parser and
5 * invokes corresponding construction methods of the storage interface.
7 * Author: Greg Leighton
10 * 05/11/08 -- Fixed bug related to parsing empty elements
11 * -- Set parser properties to automatically resolve
12 * entity references and load external DTD if present
13 * -- Modified processEndDocument() by adding a nodeFinished()
14 * call to the storage interface to close off the
20 #include "XMLDocShredder.h"
21 #include "OCamlStorageInterface.h"
22 #include <libxml++/exceptions/parse_error.h>
27 void XMLDocShredder::setProperties(){
28 /* instruct the parser to expand entity references and report as
31 reader_->set_parser_property(
32 TextReader::SubstEntities, true);
34 /* instruct parser to read external DTD, if present. This is
35 * needed to obtain any entity definitions in the DTD
37 reader_->set_parser_property(
38 TextReader::LoadDtd, true);
43 reader_->set_parser_property(
44 TextReader::DefaultAttrs, true);
47 /* but we don't want to do validation since it would slow us down
51 reader_->set_parser_property(
52 TextReader::Validate, false);
// Constructor for an in-memory document: allocates a TextReader over
// (data, size) and a new OCamlStorageInterface, both with plain new.
// NOTE(review): this listing skips a line between the two allocations, so a
// statement may be missing from view here -- confirm against the full file.
55 XMLDocShredder::XMLDocShredder(const unsigned char * data,
56 TextReader::size_type size)
58 reader_ = new TextReader(data,size,"");
60 storageIfc_ = new OCamlStorageInterface();
// The two tag-map allocations below are disabled.
61 //tagsID_ = new unordered_map<int,string>(107);
62 //idTags_ = new unordered_map<string,int>(107);
// Constructor for a document stored in a file: allocates a TextReader on
// inFileName and a new OCamlStorageInterface, both with plain new.
// NOTE(review): this listing skips a line between the two allocations, so a
// statement may be missing from view here -- confirm against the full file.
65 XMLDocShredder::XMLDocShredder(const string inFileName)
67 reader_ = new TextReader(inFileName);
69 storageIfc_ = new OCamlStorageInterface();
// The two tag-map allocations below are disabled.
70 // tagsID_ = new unordered_map<int,string>(107);
71 // idTags_ = new unordered_map<string,int>(107);
// Destructor. Its body is not visible in this listing.
// NOTE(review): both constructors allocate reader_ and storageIfc_ with new;
// confirm that they are released here.
74 XMLDocShredder::~XMLDocShredder()
// Looks up `name` in tagsID_ via operator[] and keeps the result in res.
// NOTE(review): the remainder of the body (presumably `return res;`) is not
// visible in this listing. If tagsID_ is a standard map type, operator[]
// inserts a default entry for an unknown name -- confirm that is intended.
81 int XMLDocShredder::tagID(string name)
83 int res = tagsID_[name];
// Presumably the inverse of tagID(): maps a numeric id back to a tag name.
// NOTE(review): the body is not visible in this listing -- confirm.
86 string XMLDocShredder::idTag(int id)
// Handles an element node: opens a child node named after the element in the
// storage interface, records whether the element is empty, then deals with
// its attributes.
// NOTE(review): the declaration of `empty`, the bodies of the conditionals
// and the tail of this function are not visible in this listing; the visible
// comments indicate that attributes are processed when present and that
// nodeFinished() is emitted for empty elements -- confirm against the full file.
93 void XMLDocShredder::processStartElement()
95 // fetch element name; this will be the fully qualified name
96 ustring name = reader_->get_name();
99 storageIfc_->newChild(name);
101 /* We must be really careful here. Calling processAttributes() moves
102 the document pointer to the last attribute, hence calling reader_->is_empty_element()
103 afterwards will yield the wrong result. It is better to call it while we are
104 still on the element and generate a nodeFinished() call at the end. */
105 empty = reader_->is_empty_element();
108 // now, process attributes
109 if (reader_->has_attributes())
116 DPRINT("Node " << name <<" is empty!\n")
117 storageIfc_->nodeFinished();
126 void XMLDocShredder::processEndElement()
128 // tell the storage interface that the current node has been completely processed
129 storageIfc_->nodeFinished();
132 void XMLDocShredder::processPCDATA()
134 // send the content of this PCDATA node to the storage interface as a text node
135 if (reader_->has_value())
137 storageIfc_->newChild("<$>");
138 storageIfc_->newText(reader_->get_value());
142 void XMLDocShredder::processAttributes()
144 reader_->move_to_first_attribute();
146 string nspaceStr = "xmlns";
147 storageIfc_->newChild("<@>");
150 ustring name = reader_->get_name();
151 ustring value = reader_->get_value();
153 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
154 * so we have to extract it and build a namespace uri node out of it before
155 * passing to the storage interface */
157 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
159 storageIfc_->newChild(":" + value);
160 storageIfc_->nodeFinished();
163 /* otherwise, this is an ordinary attribute, so we construct a new child node of the
164 * parent element to store the attribute name, possessing a child text node storing the
165 * attribute value. Then, we close off the attribute node with a call to nodeFinished()
170 storageIfc_->newChild(name);
171 storageIfc_->newChild("<$>");
172 storageIfc_->newText(value);
173 storageIfc_->nodeFinished();
174 // storageIfc_->nodeFinished();
177 while (reader_->move_to_next_attribute());
178 storageIfc_->nodeFinished();
181 void XMLDocShredder::processSignificantWhitespace()
183 ustring value = reader_->get_value();
185 // each significant whitespace sequence constructs a text node
186 storageIfc_->newChild("<$>");
187 storageIfc_->newText(value);
188 //storageIfc_->nodeFinished();
192 void XMLDocShredder::processStartDocument(const string docName)
194 // tell storage interface to construct the document name
195 // storageIfc_->newChild("");
198 void XMLDocShredder::processEndDocument()
200 /* tell the storage interface that document parsing has finished, and structures
201 * can now be written to disk. */
202 // storageIfc_->nodeFinished();
203 storageIfc_->parsingFinished();
206 void XMLDocShredder::processComment()
208 //storageIfc_->newChild("!" + reader_->get_value());
209 //storageIfc_->nodeFinished();
212 void XMLDocShredder::processProcessingInstruction()
214 ustring name = reader_->get_name();
215 ustring value = reader_->get_value();
217 /* Create a child node to store the target of the PI, append a text node to it to store
218 * the PI data, send to the storage interface. Close off the PI node with a call to
222 // storageIfc_->newChild("?" + name);
223 // storageIfc_->newText(value);
224 // storageIfc_->nodeFinished();
227 void XMLDocShredder::processDocTypeDeclaration()
229 /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton
230 * in case we do want to process it in the future.
234 void XMLDocShredder::processCDATASection()
236 /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
237 * model. Instead, we simply pass the converted text value to the storage interface as
238 * a text node attached to the current context node.
240 ustring value = reader_->get_value();
241 storageIfc_->newChild("<$>");
242 storageIfc_->newText(value);
243 // storageIfc_->nodeFinished();
247 void XMLDocShredder::processUnknownNodeType()
249 cout << "unknown token encountered during parsing" << endl;
250 throw xmlpp::parse_error("unknown token encountered during parsing");
// Main parsing loop: keeps reading nodes while read() succeeds and the
// reader's state is not TextReader::Error, dispatching on the node type.
// NOTE(review): several case bodies, the break statements and the end of the
// switch/loop are not visible in this listing; only the handler calls shown
// below can be confirmed from here.
254 void XMLDocShredder::parse()
256 while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
258 switch (reader_->get_node_type())
260 case TextReader::Element:
261 processStartElement();
// handler call for Text nodes is not visible here
264 case TextReader::Text:
// handler call for EndElement nodes is not visible here
268 case TextReader::EndElement:
272 case TextReader::SignificantWhitespace:
273 processSignificantWhitespace();
// handler call for Comment nodes is not visible here
276 case TextReader::Comment:
280 case TextReader::DocumentType:
281 processDocTypeDeclaration();
284 case TextReader::ProcessingInstruction:
285 processProcessingInstruction();
288 case TextReader::CDATA:
289 processCDATASection();
292 case TextReader::None:
293 processUnknownNodeType();
// Prints the numeric node type; presumably the fallback branch for node
// types without a dedicated case -- confirm against the full file.
297 int type = reader_->get_node_type();
298 cout << " Node type: " << type << endl;