--- /dev/null
+/**********************************************************
+ * XMLDocShredder.cpp
+ * ---------------------
+ * Implementation of the class that receives events from the XML parser and
+ * invokes corresponding construction methods of the storage interface.
+ *
+ * Author: Greg Leighton
+ * Date: 02/11/08
+ * Changes:
+ * 05/11/08 -- Fixed bug related to parsing empty elements
+ * -- Set parser properties to automatically resolve
+ * entity references and load external DTD if present
+ * -- Modified processEndDocument() by adding a nodeFinished()
+ * call to the storage interface to close off the
+ * document node
+ *
+ */
+
+#include <iostream>
+#include "XMLDocShredder.h"
+#include <libxml++/exceptions/parse_error.h>
+#include "Utils.h"
+
+using namespace Glib;
+
+/*
+ * Flush the pending character data. When text has been accumulated in
+ * `buffer`, it is handed to the tree builder wrapped in a PCDATA_OPEN_TAG
+ * open/close pair. The buffer is emptied in every case.
+ */
+void XMLDocShredder::doText()
+{
+    const bool hasPendingText = !buffer.empty();
+
+    if (hasPendingText)
+    {
+        tb->NewOpenTag(PCDATA_OPEN_TAG);
+        tb->NewText(buffer);
+        tb->NewClosingTag(PCDATA_OPEN_TAG);
+    }
+
+    buffer.clear();
+}
+
+/*
+ * Configure the underlying TextReader. Must be called after reader_ has been
+ * created, since every setting is applied through it.
+ */
+void XMLDocShredder::setProperties()
+{
+    TextReader & reader = *reader_;
+
+    // expand entity references so they are reported as regular PCDATA
+    reader.set_parser_property(TextReader::SubstEntities, true);
+
+    // read the external DTD, if present; it is needed to obtain any entity
+    // definitions it contains
+    reader.set_parser_property(TextReader::LoadDtd, true);
+
+    // NOTE(review): presumably makes the parser supply the attribute defaults
+    // declared in the DTD -- confirm against the libxml2 parser property docs
+    reader.set_parser_property(TextReader::DefaultAttrs, true);
+
+    // no validation, since it would slow us down
+    reader.set_parser_property(TextReader::Validate, false);
+}
+/*
+ * Build a shredder that parses an XML document held in memory.
+ *
+ * data, size  -- the in-memory document handed to the TextReader
+ * sf, iet, dtc, index_type
+ *             -- forwarded unchanged to XMLTreeBuilder::OpenDocument()
+ *
+ * If any step after the reader has been allocated throws, the objects
+ * allocated so far are released before the exception is re-thrown. The
+ * destructor does not run for a partially constructed object, so they would
+ * otherwise leak.
+ */
+XMLDocShredder::XMLDocShredder(const unsigned char * data,
+                               TextReader::size_type size,
+                               int sf,
+                               bool iet,
+                               bool dtc,
+                               TextCollectionBuilder::index_type_t index_type
+                               )
+{
+    tree = NULL;
+    tb = NULL;
+    reader_ = new TextReader(data, size, "");
+    try
+    {
+        setProperties();
+        tb = new XMLTreeBuilder();
+        buffer.clear();
+        tb->OpenDocument(iet, sf, dtc, index_type);
+    }
+    catch (...)
+    {
+        // undo the partial construction, then let the caller see the error
+        delete tb;
+        tb = NULL;
+        delete reader_;
+        reader_ = NULL;
+        throw;
+    }
+}
+
+/*
+ * Build a shredder that parses the XML document stored in a file.
+ *
+ * inFileName  -- name of the file handed to the TextReader
+ * sf, iet, dtc, index_type
+ *             -- forwarded unchanged to XMLTreeBuilder::OpenDocument()
+ *
+ * If any step after the reader has been allocated throws, the objects
+ * allocated so far are released before the exception is re-thrown. The
+ * destructor does not run for a partially constructed object, so they would
+ * otherwise leak.
+ */
+XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc,
+                               TextCollectionBuilder::index_type_t index_type
+                               )
+{
+    tree = NULL;
+    tb = NULL;
+    reader_ = new TextReader(inFileName);
+    try
+    {
+        setProperties();
+        tb = new XMLTreeBuilder();
+        buffer.clear();
+        tb->OpenDocument(iet, sf, dtc, index_type);
+    }
+    catch (...)
+    {
+        // undo the partial construction, then let the caller see the error
+        delete tb;
+        tb = NULL;
+        delete reader_;
+        reader_ = NULL;
+        throw;
+    }
+}
+
+/*
+ * Release the parser and the tree builder created by the constructors. The
+ * `tree` member is not touched here.
+ */
+XMLDocShredder::~XMLDocShredder()
+{
+    // the reader goes first, then the builder; both pointers are reset
+    delete reader_;
+    reader_ = NULL;
+
+    delete tb;
+    tb = NULL;
+}
+
+
+/*
+ * Handle an element start tag: flush pending text, open the element in the
+ * tree builder, emit its attributes and, for an empty element (<a/>), close
+ * it again immediately, since the reader will not deliver a separate end tag
+ * event for it.
+ *
+ * The element is opened AND closed under its local name (namespace prefix
+ * stripped), so that the closing tag always matches the tag that was opened.
+ */
+void XMLDocShredder::processStartElement()
+{
+    doText();
+
+    // get_name() yields the fully qualified name; keep only the part after
+    // the namespace prefix, if there is one
+    const ustring qualifiedName = reader_->get_name();
+    const ustring::size_type colon = qualifiedName.find_first_of(':');
+    const ustring tagName =
+        (colon == ustring::npos) ? qualifiedName : qualifiedName.substr(colon + 1);
+
+    tb->NewOpenTag(tagName);
+
+    /* We must be really careful here. Calling processAttributes() moves the
+       reader onto the last attribute, hence calling is_empty_element()
+       afterwards would yield the wrong result. It is queried while the reader
+       is still on the element, and the closing tag is generated at the end. */
+    const bool empty = reader_->is_empty_element();
+
+    // now, process attributes
+    if (reader_->has_attributes())
+        processAttributes();
+
+    if (empty)
+        tb->NewClosingTag(tagName);
+}
+
+/*
+ * Handle an element end tag: flush pending text, then close the element in
+ * the tree builder.
+ *
+ * processStartElement() opens elements under their local name (namespace
+ * prefix stripped), so the closing tag is reported under the local name as
+ * well; otherwise the open and closing tags of a prefixed element would not
+ * match.
+ */
+void XMLDocShredder::processEndElement()
+{
+    doText();
+
+    const ustring qualifiedName = reader_->get_name();
+    const ustring::size_type colon = qualifiedName.find_first_of(':');
+
+    if (colon == ustring::npos)
+        tb->NewClosingTag(qualifiedName);
+    else
+        tb->NewClosingTag(qualifiedName.substr(colon + 1));
+}
+
+/*
+ * Handle a text node: its value is appended to `buffer`. Nothing is sent to
+ * the tree builder here; the accumulated text is emitted later by doText().
+ */
+void XMLDocShredder::processPCDATA()
+{
+    if (!reader_->has_value())
+        return;
+
+    buffer += reader_->get_value();
+}
+
+/*
+ * Emit the attributes of the element the reader is currently positioned on.
+ *
+ * All attributes are wrapped in one ATTRIBUTE_OPEN_TAG node. Each ordinary
+ * attribute becomes a child node named "<@>" + attribute name, which holds
+ * an ATTRIBUTE_DATA_OPEN_TAG node carrying the attribute value as text.
+ * Attributes whose name starts with "xmlns" are currently skipped.
+ *
+ * On return the reader is positioned back on the owning element, so callers
+ * may keep querying the element afterwards.
+ */
+void XMLDocShredder::processAttributes()
+{
+    // Without this guard the loop below would read the element itself as if
+    // it were an attribute when the reader cannot be moved onto one.
+    if (!reader_->move_to_first_attribute())
+        return;
+
+    const string nspaceStr = "xmlns";
+    tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
+    do
+    {
+        ustring name = reader_->get_name();
+        ustring value = reader_->get_value();
+
+        /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
+         * so we have to extract it and build a namespace uri node out of it before
+         * passing to the storage interface */
+        const bool isNamespaceDecl = (name.find(nspaceStr.c_str(), 0, 5) == 0);
+
+        if (isNamespaceDecl)
+        {
+            //TODO: namespace declarations are not stored yet
+        }
+
+        /* otherwise, this is an ordinary attribute, so we construct a new child node of the
+         * parent element to store the attribute name, possessing a child node storing the
+         * attribute value as text. Then, we close off the attribute node.
+         */
+        else
+        {
+            string attname = "<@>"+name;
+            tb->NewOpenTag(attname);
+            tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
+            tb->NewText(value);
+            tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
+            tb->NewClosingTag(attname);
+        }
+    }
+    while (reader_->move_to_next_attribute());
+    tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
+
+    // leave the reader on the element rather than on its last attribute
+    reader_->move_to_element();
+}
+
+/*
+ * Handle a significant-whitespace node: it is treated like character data,
+ * i.e. its value is appended to `buffer`.
+ */
+void XMLDocShredder::processSignificantWhitespace()
+{
+    if (!reader_->has_value())
+        return;
+
+    buffer += reader_->get_value();
+}
+
+/*
+ * Open the document node in the tree builder.
+ *
+ * docName -- currently unused
+ */
+void XMLDocShredder::processStartDocument(const string docName)
+{
+    tb->NewOpenTag(DOCUMENT_OPEN_TAG);
+}
+
+/*
+ * Finish the document: flush pending text, close the document node and tell
+ * the tree builder that parsing is complete. Whatever CloseDocument() returns
+ * is kept in `tree`.
+ */
+void XMLDocShredder::processEndDocument()
+{
+    // any text still buffered belongs inside the document node
+    doText();
+
+    tb->NewClosingTag(DOCUMENT_OPEN_TAG);
+
+    tree = tb->CloseDocument();
+}
+
+/*
+ * Handle a comment node. Comments are currently dropped; the disabled code
+ * below is kept as a sketch of how they were once stored.
+ */
+void XMLDocShredder::processComment()
+{
+    // storageIfc_->newChild("!" + reader_->get_value());
+    // storageIfc_->nodeFinished();
+}
+
+/*
+ * Handle a processing instruction. Processing instructions are currently
+ * dropped, so the target and data are no longer fetched from the reader;
+ * previously they were copied into locals that were never used.
+ *
+ * The disabled code below sketches the intended handling: create a child
+ * node to store the target of the PI, append a text node to it to store the
+ * PI data, then close off the PI node.
+ */
+void XMLDocShredder::processProcessingInstruction()
+{
+    // storageIfc_->newChild("?" + reader_->get_name());
+    // storageIfc_->newText(reader_->get_value());
+    // storageIfc_->nodeFinished();
+}
+
+/*
+ * Handle the DOCTYPE declaration. It is deliberately ignored for now; the
+ * method exists as a hook in case it needs to be processed in the future.
+ */
+void XMLDocShredder::processDocTypeDeclaration()
+{
+    // intentionally empty
+}
+
+/*
+ * Handle a CDATA section. CDATA sections are not preserved as such, since
+ * they aren't part of the XPath data model. Their content is appended to
+ * `buffer` like ordinary character data.
+ */
+void XMLDocShredder::processCDATASection()
+{
+    if (!reader_->has_value())
+        return;
+
+    buffer += reader_->get_value();
+}
+
+/*
+ * Report a node the shredder cannot interpret: the message is printed on
+ * standard output and then raised as an xmlpp::parse_error.
+ */
+void XMLDocShredder::processUnknownNodeType()
+{
+    const char * const message = "unknown token encountered during parsing";
+
+    cout << message << endl;
+    throw xmlpp::parse_error(message);
+}
+
+/*
+ * Main parsing loop: pull nodes from the reader one at a time and dispatch
+ * each to the matching process*() handler. The loop ends when read() reports
+ * that no node is available or when the reader enters the Error state.
+ * Node types without a dedicated handler are only logged on standard output.
+ */
+void XMLDocShredder::parse()
+{
+    for (;;)
+    {
+        if (!reader_->read())
+            break;
+        if (reader_->get_read_state() == TextReader::Error)
+            break;
+
+        // fetched once; also used for the diagnostic in the default branch
+        const int nodeType = reader_->get_node_type();
+
+        switch (nodeType)
+        {
+            case TextReader::Element:
+                processStartElement();
+                break;
+
+            case TextReader::EndElement:
+                processEndElement();
+                break;
+
+            case TextReader::Text:
+                processPCDATA();
+                break;
+
+            case TextReader::CDATA:
+                processCDATASection();
+                break;
+
+            case TextReader::SignificantWhitespace:
+                processSignificantWhitespace();
+                break;
+
+            case TextReader::Comment:
+                processComment();
+                break;
+
+            case TextReader::ProcessingInstruction:
+                processProcessingInstruction();
+                break;
+
+            case TextReader::DocumentType:
+                processDocTypeDeclaration();
+                break;
+
+            case TextReader::None:
+                processUnknownNodeType();
+                break;
+
+            default:
+                cout << " Node type: " << nodeType << endl;
+                break;
+        }
+    }
+}