1 /**********************************************************
3 * ---------------------
4 * Implementation of the class that receives events from the XML parser and
5 * invokes corresponding construction methods of the storage interface.
7 * Author: Greg Leighton
10 * 05/11/08 -- Fixed bug related to parsing empty elements
11 * -- Set parser properties to automatically resolve
12 * entity references and load external DTD if present
13 * -- Modified processEndDocument() by adding a nodeFinished()
14 * call to the storage interface to close off the
20 #include "XMLDocShredder.h"
21 #include <libxml++/exceptions/parse_error.h>
// Emits a PCDATA node through the tree builder: an opening PCDATA_OPEN_TAG
// followed by the matching closing tag.
// NOTE(review): the statements between and around the two tag calls are not
// visible in this listing; presumably the text accumulated in `buffer` is
// emitted between them - confirm against the full source.
26 void XMLDocShredder::doText(){
29 tb->NewOpenTag(PCDATA_OPEN_TAG);
31 tb->NewClosingTag(PCDATA_OPEN_TAG);
// Configures the underlying TextReader (reader_) before parsing.
// Properties set here:
//   SubstEntities = true   (expand entity references)
//   LoadDtd       = true   (read the external DTD, if present)
//   DefaultAttrs  = true
//   Validate      = false  (no validation, for speed)
37 void XMLDocShredder::setProperties(){
38 /* instruct the parser to expand entity references and report as
41 reader_->set_parser_property(
42 TextReader::SubstEntities, true);
44 /* instruct parser to read external DTD, if present. This is
45 * needed to obtain any entity definitions in the DTD
47 reader_->set_parser_property(
48 TextReader::LoadDtd, true);
// NOTE(review): presumably enabled so that attribute defaults declared in the
// DTD are reported by the reader - confirm against the TextReader documentation.
53 reader_->set_parser_property(
54 TextReader::DefaultAttrs, true);
57 /* but we don't want to do validation since it would slow us down
61 reader_->set_parser_property(
62 TextReader::Validate, false);
// Constructs a shredder that parses an XML document held in memory.
//   data, size  - passed to the TextReader constructor together with an
//                 empty string as third argument.
//   index_type  - forwarded to XMLTreeBuilder::OpenDocument.
// The visible body allocates reader_ and tb with new and opens the document
// on the builder with (iet, sf, dtc, index_type).
// NOTE(review): part of the parameter list (declarations of sf, iet, dtc) and
// part of the body are not visible in this listing; the meaning of those
// flags cannot be determined from here.
65 XMLDocShredder::XMLDocShredder(const unsigned char * data,
66 TextReader::size_type size,
70 TextCollectionBuilder::index_type_t index_type
74 reader_ = new TextReader(data,size,"");
76 tb = new XMLTreeBuilder();
78 tb->OpenDocument(iet,sf,dtc, index_type);
// Constructs a shredder that parses the XML document stored in a file.
//   inFileName       - passed to the TextReader constructor.
//   sf, iet, dtc,
//   index_type       - forwarded to XMLTreeBuilder::OpenDocument (note the
//                      call order there is iet, sf, dtc, index_type).
// The visible body allocates reader_ and tb with new and opens the document
// on the builder.
// NOTE(review): the meaning of sf / iet / dtc is not visible in this file;
// see XMLTreeBuilder::OpenDocument.
81 XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc,
82 TextCollectionBuilder::index_type_t index_type
86 reader_ = new TextReader(inFileName);
88 tb = new XMLTreeBuilder();
90 tb->OpenDocument(iet,sf,dtc,index_type);
// Destructor.
// NOTE(review): both constructors allocate reader_ and tb with new; the body
// of this destructor is not visible in this listing - confirm that both
// objects are released here.
93 XMLDocShredder::~XMLDocShredder()
// Handles an element-start event from the reader.
// Steps visible here:
//   1. read the element name and open a tag on the builder, using only the
//      part after the first ':' when the name carries a prefix;
//   2. remember in `empty` whether the element is an empty element (must be
//      queried before the attributes are visited, see the comment below);
//   3. process the attributes, if any;
//   4. emit a closing tag (presumably only for empty elements - the guarding
//      lines are not visible in this listing).
// NOTE(review): the open tag uses the prefix-stripped name while the closing
// tag on the last line is given the full qualified `name`; for prefixed
// elements the two strings differ - confirm this is intended.
103 void XMLDocShredder::processStartElement()
106 // fetch element name; this will be the full qualified name
107 ustring name = reader_->get_name();
// position of the first ':' (npos when the name has no prefix)
109 size_t found = name.find_first_of(':');
110 if (found == ustring::npos)
111 tb->NewOpenTag(name);
113 tb->NewOpenTag(name.substr(found+1,name.length() - found - 1));
115 /* We must be really careful here. calling process attributes moves
116 the document pointer on the last attribute, hence calling reader_->is_empty
117 afterwards will yield the wrong result. It is better to call it while we are
118 on the element and generate a nodeFinished() call at the end */
119 empty = reader_->is_empty_element();
122 // now, process attributes
123 if (reader_->has_attributes())
128 tb->NewClosingTag(name);
// Handles an element-end event: emits a closing tag on the builder using the
// name reported by the reader.
// NOTE(review): the name is passed as reported (including any prefix),
// whereas processStartElement() opens the tag with the prefix stripped -
// confirm the builder does not require the two to match.
133 void XMLDocShredder::processEndElement()
136 ustring name = reader_->get_name();
137 tb->NewClosingTag(name);
// Handles a text node: appends its value, when it has one, to the member
// `buffer`. Nothing is sent to the tree builder in the visible lines.
140 void XMLDocShredder::processPCDATA()
142 // append the content of this PCDATA node to buffer rather than emitting it here
143 if (reader_->has_value())
144 buffer += reader_->get_value();
// Emits the attributes of the current element to the tree builder.
// Visible structure:
//   - move the reader to the first attribute and open ATTRIBUTE_OPEN_TAG;
//   - for each attribute, read its name and value; names that begin with
//     "xmlns" take a separate branch (its body is not visible in this
//     listing); any other attribute is emitted as a tag named "<@>" + name
//     enclosing an ATTRIBUTE_DATA_OPEN_TAG open/close pair;
//   - advance with move_to_next_attribute() until it returns false, then
//     close ATTRIBUTE_OPEN_TAG.
// NOTE(review): the loop header and the statement that emits the attribute
// value are not visible here; presumably a do/while loop with the value
// emitted between the ATTRIBUTE_DATA_OPEN_TAG calls - confirm.
// On return the reader is positioned on an attribute, not on the element
// (see the remark in processStartElement()).
148 void XMLDocShredder::processAttributes()
150 reader_->move_to_first_attribute();
152 string nspaceStr = "xmlns";
153 tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
156 ustring name = reader_->get_name();
157 ustring value = reader_->get_value();
159 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
160 * so we have to extract it and build a namespace uri node out of it before
161 * passing to the storage interface */
// true when the attribute name starts with the five characters "xmlns"
163 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
168 /* otherwise, this is an ordinary attribute, so we construct a new child node of the
169 * parent element to store the attribute name, possessing a child text node storing the
170 * attribute value. Then, we close off the attribute node with a call to nodeFinished()
175 string attname = "<@>"+name;
176 tb->NewOpenTag(attname);
177 tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
179 tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
180 tb->NewClosingTag(attname);
183 while (reader_->move_to_next_attribute());
184 tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
// Handles a significant-whitespace node: appends its value, when it has one,
// to the member `buffer`, exactly as processPCDATA() does for text nodes.
187 void XMLDocShredder::processSignificantWhitespace()
189 if (reader_->has_value())
190 buffer += reader_->get_value();
// Starts the document on the tree builder by opening DOCUMENT_OPEN_TAG.
//   docName - not used by any statement visible in this listing.
194 void XMLDocShredder::processStartDocument(const string docName)
196 // tell storage interface to construct the document name
198 tb->NewOpenTag(DOCUMENT_OPEN_TAG);
// Finishes the document: closes DOCUMENT_OPEN_TAG on the tree builder and
// stores the result of tb->CloseDocument() in the member `tree`.
202 void XMLDocShredder::processEndDocument()
205 /* tell the storage interface that document parsing has finished, and structures
206 * can now be written to disk. */
207 tb->NewClosingTag(DOCUMENT_OPEN_TAG);
208 tree = tb->CloseDocument();
// Handles an XML comment node. The only statements visible here are commented
// out (calls on an older storageIfc_ interface), so no comment content is
// forwarded to the tree builder by the lines shown.
212 void XMLDocShredder::processComment()
214 //storageIfc_->newChild("!" + reader_->get_value());
215 //storageIfc_->nodeFinished();
// Handles a processing-instruction node. The target (name) and data (value)
// are read from the reader, but the calls that would forward them are
// commented out, so the visible lines emit nothing to the tree builder.
218 void XMLDocShredder::processProcessingInstruction()
220 ustring name = reader_->get_name();
221 ustring value = reader_->get_value();
223 /* Create a child node to store the target of the PI, append a text node to it to store
224 * the PI data, send to the storage interface. Close off the PI node with a call to
228 // storageIfc_->newChild("?" + name);
229 // storageIfc_->newText(value);
230 // storageIfc_->nodeFinished();
// Handles a DOCTYPE declaration node. No statements are visible in the body;
// the declaration is ignored.
233 void XMLDocShredder::processDocTypeDeclaration()
235 /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton
236 * in case we do want to process it in the future.
// Handles a CDATA section: its value, when present, is appended to the member
// `buffer` like ordinary text; the visible lines do not emit it to the tree
// builder directly, and the CDATA wrapper itself is not preserved.
240 void XMLDocShredder::processCDATASection()
242 /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
243 * model. Instead, we simply pass the converted text value to the storage interface as
244 * a text node attached to the current context node.
246 if (reader_->has_value())
247 buffer+= reader_->get_value();
// Reports an unexpected node: writes a diagnostic line to standard output and
// then throws xmlpp::parse_error with the same message.
250 void XMLDocShredder::processUnknownNodeType()
252 cout << "unknown token encountered during parsing" << endl;
253 throw xmlpp::parse_error("unknown token encountered during parsing");
// Main parsing loop: reads nodes from the reader for as long as read()
// succeeds and the read state is not TextReader::Error, and dispatches on the
// node type to the process*() handlers of this class.
// NOTE(review): several case bodies (Text, EndElement, Comment) and the break
// statements are not visible in this listing, and the function appears to
// continue past the last line shown here.
257 void XMLDocShredder::parse()
259 while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
261 switch (reader_->get_node_type())
263 case TextReader::Element:
264 processStartElement();
267 case TextReader::Text:
271 case TextReader::EndElement:
275 case TextReader::SignificantWhitespace:
276 processSignificantWhitespace();
279 case TextReader::Comment:
283 case TextReader::DocumentType:
284 processDocTypeDeclaration();
287 case TextReader::ProcessingInstruction:
288 processProcessingInstruction();
291 case TextReader::CDATA:
292 processCDATASection();
295 case TextReader::None:
296 processUnknownNodeType();
// prints the numeric node type to standard output (presumably the fallback
// for node types without a dedicated case - the case label is not visible)
300 int type = reader_->get_node_type();
301 cout << " Node type: " << type << endl;