+++ /dev/null
-/**********************************************************
- * XMLDocShredder.cpp
- * ---------------------
- * Implementation of the class that receives events from the XML parser and
- * invokes corresponding construction methods of the storage interface.
- *
- * Author: Greg Leighton
- * Date: 02/11/08
- * Changes:
- * 05/11/08 -- Fixed bug related to parsing empty elements
- * -- Set parser properties to automatically resolve
- * entity references and load external DTD if present
- * -- Modified processEndDocument() by adding a nodeFinished()
- * call to the storage interface to close off the
- * document node
- *
- */
-
-#include <iostream>
-#include "XMLDocShredder.h"
-#include <libxml++/exceptions/parse_error.h>
-#include "Utils.h"
-
-using namespace Glib;
-
-/**
- * Flushes the character data accumulated in `buffer` as one text node.
- *
- * When the buffer is non-empty its contents are handed to the tree builder
- * wrapped in a PCDATA_OPEN_TAG open/close pair. The buffer is emptied in
- * every case, so two consecutive calls never emit the same text twice.
- */
-void XMLDocShredder::doText()
-{
-    const bool hasPendingText = !buffer.empty();
-
-    if (hasPendingText)
-    {
-        tb->NewOpenTag(PCDATA_OPEN_TAG);
-        tb->NewText(buffer);
-        // The closing call is keyed by the same PCDATA_OPEN_TAG constant.
-        tb->NewClosingTag(PCDATA_OPEN_TAG);
-    }
-
-    buffer.clear();
-}
-
-/**
- * Configures the TextReader held in `reader_`.
- *
- * Four parser properties are set, in this order: SubstEntities, LoadDtd and
- * DefaultAttrs are switched on, Validate is switched off.
- */
-void XMLDocShredder::setProperties()
-{
-    // Expand entity references so they are reported as regular PCDATA.
-    reader_->set_parser_property(TextReader::SubstEntities, true);
-
-    // Read the external DTD, if present; it is needed to obtain any entity
-    // definitions it contains.
-    reader_->set_parser_property(TextReader::LoadDtd, true);
-
-    // NOTE(review): presumably makes the parser supply attribute defaults
-    // declared in the DTD -- confirm against the libxml++ documentation.
-    reader_->set_parser_property(TextReader::DefaultAttrs, true);
-
-    // Validation stays off because it would slow parsing down.
-    reader_->set_parser_property(TextReader::Validate, false);
-}
-/**
- * Builds a shredder that reads an XML document from a memory buffer.
- *
- * @param data        start of the buffer handed to the TextReader
- * @param size        size of that buffer, as handed to the TextReader
- * @param sf          forwarded unchanged to XMLTreeBuilder::OpenDocument()
- * @param iet         forwarded unchanged to XMLTreeBuilder::OpenDocument()
- * @param dtc         forwarded unchanged to XMLTreeBuilder::OpenDocument()
- * @param index_type  forwarded unchanged to XMLTreeBuilder::OpenDocument()
- *
- * An empty string is passed as the TextReader's third argument.
- *
- * If any construction step throws, the objects allocated so far are released
- * before the exception is propagated: the destructor does not run for an
- * object whose constructor failed, so they would otherwise leak.
- */
-XMLDocShredder::XMLDocShredder(const unsigned char * data,
-                               TextReader::size_type size,
-                               int sf,
-                               bool iet,
-                               bool dtc,
-                               TextCollectionBuilder::index_type_t index_type
-                               )
-{
-    // Start from a known state so the cleanup path below can delete safely.
-    tree = NULL;
-    reader_ = NULL;
-    tb = NULL;
-
-    try
-    {
-        reader_ = new TextReader(data, size, "");
-        setProperties();
-        tb = new XMLTreeBuilder();
-        buffer.clear();
-        tb->OpenDocument(iet, sf, dtc, index_type);
-    }
-    catch (...)
-    {
-        // Release in the same order as the destructor, then let the caller
-        // see the original error.
-        delete reader_;
-        reader_ = NULL;
-        delete tb;
-        tb = NULL;
-        throw;
-    }
-}
-
-/**
- * Builds a shredder that reads an XML document from a file.
- *
- * @param inFileName  handed to the TextReader constructor
- * @param sf          forwarded unchanged to XMLTreeBuilder::OpenDocument()
- * @param iet         forwarded unchanged to XMLTreeBuilder::OpenDocument()
- * @param dtc         forwarded unchanged to XMLTreeBuilder::OpenDocument()
- * @param index_type  forwarded unchanged to XMLTreeBuilder::OpenDocument()
- *
- * If any construction step throws, the objects allocated so far are released
- * before the exception is propagated: the destructor does not run for an
- * object whose constructor failed, so they would otherwise leak.
- */
-XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc,
-                               TextCollectionBuilder::index_type_t index_type
-                               )
-{
-    // Start from a known state so the cleanup path below can delete safely.
-    tree = NULL;
-    reader_ = NULL;
-    tb = NULL;
-
-    try
-    {
-        reader_ = new TextReader(inFileName);
-        setProperties();
-        tb = new XMLTreeBuilder();
-        buffer.clear();
-        tb->OpenDocument(iet, sf, dtc, index_type);
-    }
-    catch (...)
-    {
-        // Release in the same order as the destructor, then let the caller
-        // see the original error.
-        delete reader_;
-        reader_ = NULL;
-        delete tb;
-        tb = NULL;
-        throw;
-    }
-}
-
-/**
- * Releases the TextReader and the XMLTreeBuilder held by this object.
- *
- * `tree` is not deleted here.
- */
-XMLDocShredder::~XMLDocShredder()
-{
-    // Each pointer is reset right after deletion so no stale address remains.
-    delete reader_;
-    reader_ = NULL;
-
-    delete tb;
-    tb = NULL;
-}
-
-
-/**
- * Handles an Element node: opens the tag, emits its attributes and, for an
- * empty element, closes the tag again.
- *
- * The tag name is the part of the reader's qualified name that follows the
- * first ':' (the whole name when there is no ':'). The same name is used for
- * the opening and the closing call, so a prefixed empty element is closed
- * under the name it was opened with.
- */
-void XMLDocShredder::processStartElement()
-{
-    // Emit text gathered so far before the new tag is opened.
-    doText();
-
-    // get_name() yields the qualified name; drop everything up to and
-    // including the first ':'.
-    const ustring qualifiedName = reader_->get_name();
-    const ustring::size_type colon = qualifiedName.find_first_of(':');
-    const ustring tagName =
-        (colon == ustring::npos) ? qualifiedName : qualifiedName.substr(colon + 1);
-
-    tb->NewOpenTag(tagName);
-
-    /* processAttributes() leaves the reader positioned on the last
-     * attribute, where is_empty_element() would give the wrong answer, so
-     * the flag is captured while the reader is still on the element. */
-    const bool empty = reader_->is_empty_element();
-
-    if (reader_->has_attributes())
-        processAttributes();
-
-    // An empty element produces no EndElement event; close it here.
-    if (empty)
-        tb->NewClosingTag(tagName);
-}
-
-/**
- * Handles an EndElement node: flushes pending text and closes the tag.
- *
- * The namespace prefix (everything up to and including the first ':') is
- * removed from the reader's qualified name, matching the name under which
- * processStartElement() opens the tag.
- */
-void XMLDocShredder::processEndElement()
-{
-    // Text collected inside the element must be emitted before it is closed.
-    doText();
-
-    const ustring qualifiedName = reader_->get_name();
-    const ustring::size_type colon = qualifiedName.find_first_of(':');
-
-    if (colon == ustring::npos)
-        tb->NewClosingTag(qualifiedName);
-    else
-        tb->NewClosingTag(qualifiedName.substr(colon + 1));
-}
-
-/**
- * Handles a Text node by appending its value, if it has one, to `buffer`.
- * Nothing is handed to the tree builder here; doText() does that later.
- */
-void XMLDocShredder::processPCDATA()
-{
-    if (!reader_->has_value())
-        return;
-
-    buffer += reader_->get_value();
-}
-
-/**
- * Emits the attributes of the current element as a subtree.
- *
- * All attributes are grouped under one ATTRIBUTE_OPEN_TAG node. Each
- * ordinary attribute becomes a node named "<@>" followed by the attribute
- * name, holding an ATTRIBUTE_DATA_OPEN_TAG node that carries the value.
- * Attributes whose name starts with "xmlns" are skipped for now.
- *
- * The result of move_to_first_attribute() is not checked;
- * processStartElement() only calls this when has_attributes() is true.
- */
-void XMLDocShredder::processAttributes()
-{
-    const char* const namespacePrefix = "xmlns";
-
-    reader_->move_to_first_attribute();
-    tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
-
-    do
-    {
-        const ustring attrName = reader_->get_name();
-        const ustring attrValue = reader_->get_value();
-
-        /* The libxml++ TextReader reports xmlns declarations like ordinary
-         * attributes, so they are recognised by the first five characters
-         * of their name. */
-        const bool isNamespaceDecl = (attrName.find(namespacePrefix, 0, 5) == 0);
-        if (isNamespaceDecl)
-        {
-            //TODO: build a namespace node for this declaration
-            continue;   // in a do/while this proceeds to the loop condition
-        }
-
-        // Ordinary attribute: name node -> data node -> value text.
-        const string attname = "<@>" + attrName;
-        tb->NewOpenTag(attname);
-        tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
-        tb->NewText(attrValue);
-        tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
-        tb->NewClosingTag(attname);
-    }
-    while (reader_->move_to_next_attribute());
-
-    tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
-}
-
-/**
- * Handles a SignificantWhitespace node by appending its value, if it has
- * one, to `buffer`, exactly as processPCDATA() does for Text nodes.
- */
-void XMLDocShredder::processSignificantWhitespace()
-{
-    if (!reader_->has_value())
-        return;
-
-    buffer += reader_->get_value();
-}
-
-/**
- * Opens the document node in the tree builder.
- *
- * @param docName  currently unused
- */
-void XMLDocShredder::processStartDocument(const string docName)
-{
-    // Closed again by processEndDocument().
-    tb->NewOpenTag(DOCUMENT_OPEN_TAG);
-}
-
-/**
- * Finishes the document: flushes pending text, closes the document node and
- * stores the result of XMLTreeBuilder::CloseDocument() in `tree`.
- */
-void XMLDocShredder::processEndDocument()
-{
-    // Trailing text must be emitted before the document node is closed.
-    doText();
-
-    tb->NewClosingTag(DOCUMENT_OPEN_TAG);
-
-    // Parsing is over; ask the builder for the finished structure.
-    tree = tb->CloseDocument();
-}
-
-/**
- * Handles a Comment node. Comments are currently dropped: nothing is
- * buffered and nothing is sent to the tree builder.
- */
-void XMLDocShredder::processComment()
-{
-    // Earlier storage-interface calls, kept for reference:
-    //   storageIfc_->newChild("!" + reader_->get_value());
-    //   storageIfc_->nodeFinished();
-}
-
-/**
- * Handles a ProcessingInstruction node.
- *
- * The name and value are read from the reader, but nothing is emitted yet:
- * the calls that would store them are still commented out.
- */
-void XMLDocShredder::processProcessingInstruction()
-{
-    // Fetched but not used so far.
-    const ustring piName = reader_->get_name();
-    const ustring piValue = reader_->get_value();
-
-    /* Intended output, once enabled: a child node for the PI target with a
-     * text node holding the PI data, closed by nodeFinished():
-     *
-     *   storageIfc_->newChild("?" + name);
-     *   storageIfc_->newText(value);
-     *   storageIfc_->nodeFinished();
-     */
-}
-
-/**
- * Handles a DocumentType node. The DOCTYPE declaration is ignored for now;
- * the method exists as a hook in case it needs processing later.
- */
-void XMLDocShredder::processDocTypeDeclaration()
-{
-    // Intentionally empty.
-}
-
-/**
- * Handles a CDATA node.
- *
- * CDATA sections are not preserved as such, since they are not part of the
- * XPath data model: the value, if there is one, is appended to `buffer`
- * like any other text.
- */
-void XMLDocShredder::processCDATASection()
-{
-    if (!reader_->has_value())
-        return;
-
-    buffer += reader_->get_value();
-}
-
-/**
- * Reports a node the shredder cannot handle.
- *
- * The message is written to standard output and then thrown as an
- * xmlpp::parse_error.
- *
- * @throws xmlpp::parse_error always
- */
-void XMLDocShredder::processUnknownNodeType()
-{
-    const char* const message = "unknown token encountered during parsing";
-
-    cout << message << endl;
-    throw xmlpp::parse_error(message);
-}
-
-/**
- * Drives the TextReader over the document and dispatches every node to the
- * matching process*() handler.
- *
- * The loop ends when read() returns false or the reader enters the Error
- * state. Node types without a dedicated handler are only reported on
- * standard output. A node of type None goes to processUnknownNodeType(),
- * which throws.
- */
-void XMLDocShredder::parse()
-{
-    for (;;)
-    {
-        // Advance first; the read state is only inspected after a
-        // successful read().
-        if (!reader_->read())
-            break;
-        if (reader_->get_read_state() == TextReader::Error)
-            break;
-
-        switch (reader_->get_node_type())
-        {
-        case TextReader::Element:
-            processStartElement();
-            break;
-
-        case TextReader::Text:
-            processPCDATA();
-            break;
-
-        case TextReader::EndElement:
-            processEndElement();
-            break;
-
-        case TextReader::SignificantWhitespace:
-            processSignificantWhitespace();
-            break;
-
-        case TextReader::Comment:
-            processComment();
-            break;
-
-        case TextReader::DocumentType:
-            processDocTypeDeclaration();
-            break;
-
-        case TextReader::ProcessingInstruction:
-            processProcessingInstruction();
-            break;
-
-        case TextReader::CDATA:
-            processCDATASection();
-            break;
-
-        case TextReader::None:
-            processUnknownNodeType();
-            break;
-
-        default:
-        {
-            // No handler for this node type: print its numeric code.
-            const int nodeType = reader_->get_node_type();
-            cout << " Node type: " << nodeType << endl;
-            break;
-        }
-        }
-    }
-}