X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=src%2FXMLDocShredder.cpp;fp=src%2FXMLDocShredder.cpp;h=ffe4de7616064f3fa5d57153c0099e15017fa308;hb=4b52da1a20a4fe031930bb96d2ca46bec06dc529;hp=0000000000000000000000000000000000000000;hpb=a223af3254fb51c279cfbccdc18c59484fdca74e;p=SXSI%2Fxpathcomp.git diff --git a/src/XMLDocShredder.cpp b/src/XMLDocShredder.cpp new file mode 100644 index 0000000..ffe4de7 --- /dev/null +++ b/src/XMLDocShredder.cpp @@ -0,0 +1,306 @@ +/********************************************************** + * XMLDocShredder.cpp + * --------------------- + * Implementation of the class that receives events from the XML parser and + * invokes corresponding construction methods of the storage interface. + * + * Author: Greg Leighton + * Date: 02/11/08 + * Changes: + * 05/11/08 -- Fixed bug related to parsing empty elements + * -- Set parser properties to automatically resolve + * entity references and load external DTD if present + * -- Modified processEndDocument() by adding a nodeFinished() + * call to the storage interface to close off the + * document node + * + */ + +#include +#include "XMLDocShredder.h" +#include +#include "Utils.h" + +using namespace Glib; + +void XMLDocShredder::doText(){ + + if (!buffer.empty()){ + tb->NewOpenTag(PCDATA_OPEN_TAG); + tb->NewText(buffer); + tb->NewClosingTag(PCDATA_OPEN_TAG); + }; + buffer.clear(); + +} + +void XMLDocShredder::setProperties(){ + /* instruct the parser to expand entity references and report as + * regular PCDATA + */ + reader_->set_parser_property( + TextReader::SubstEntities, true); + + /* instruct parser to read external DTD, if present. This is + * needed to obtain any entity definitions in the DTD + */ + reader_->set_parser_property( + TextReader::LoadDtd, true); + + + /* + */ + reader_->set_parser_property( + TextReader::DefaultAttrs, true); + + + /* but we don't want to do validation since it would slow us down + */ + + + reader_->set_parser_property( + TextReader::Validate, false); + +} +XMLDocShredder::XMLDocShredder(const unsigned char * data, + TextReader::size_type size, + int sf, + bool iet, + bool dtc, + TextCollectionBuilder::index_type_t index_type + ) +{ + tree = NULL; + reader_ = new TextReader(data,size,""); + setProperties(); + tb = new XMLTreeBuilder(); + buffer.clear(); + tb->OpenDocument(iet,sf,dtc, index_type); +} + +XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc, + TextCollectionBuilder::index_type_t index_type + ) +{ + tree = NULL; + reader_ = new TextReader(inFileName); + setProperties(); + tb = new XMLTreeBuilder(); + buffer.clear(); + tb->OpenDocument(iet,sf,dtc,index_type); +} + +XMLDocShredder::~XMLDocShredder() +{ + delete reader_; + reader_ = NULL; + delete tb; + tb = NULL; + +} + + +void XMLDocShredder::processStartElement() +{ + doText(); + // fetch element name; this will be the full qualified name + ustring name = reader_->get_name(); + bool empty = false; + size_t found = name.find_first_of(':'); + if (found == ustring::npos) + tb->NewOpenTag(name); + else + tb->NewOpenTag(name.substr(found+1,name.length() - found - 1)); + + /* We must be really carefull here. calling process attributes moves + the document pointer on the last attribute, hence calling reader_->is_empty + afterwards will yield the wrong result. It is better to call it while we are + on the element and generate a nodeFinished() call at the end */ + empty = reader_->is_empty_element(); + + + // now, process attributes + if (reader_->has_attributes()) + processAttributes(); + + + if (empty) + tb->NewClosingTag(name); + + +} + +void XMLDocShredder::processEndElement() +{ + doText(); + ustring name = reader_->get_name(); + tb->NewClosingTag(name); +} + +void XMLDocShredder::processPCDATA() +{ + // send the content of this PCDATA node to the storage interface as a text node + if (reader_->has_value()) + buffer += reader_->get_value(); + +} + +void XMLDocShredder::processAttributes() +{ + reader_->move_to_first_attribute(); + + string nspaceStr = "xmlns"; + tb->NewOpenTag(ATTRIBUTE_OPEN_TAG); + do + { + ustring name = reader_->get_name(); + ustring value = reader_->get_value(); + + /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute, + * so we have to extract it and build a namespace uri node out of it before + * passing to the storage interface */ + + if ((name.find(nspaceStr.c_str(), 0, 5)) == 0) + { + //TODO + } + + /* otherwise, this is an ordinary attribute, so we construct a new child node of the + * parent element to store the attribute name, possessing a child text node storing the + * attribute value. Then, we close off the attribute node with a call to nodeFinished() + */ + + else + { + string attname = "<@>"+name; + tb->NewOpenTag(attname); + tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG); + tb->NewText(value); + tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG); + tb->NewClosingTag(attname); + } + } + while (reader_->move_to_next_attribute()); + tb->NewClosingTag(ATTRIBUTE_OPEN_TAG); +} + +void XMLDocShredder::processSignificantWhitespace() +{ + if (reader_->has_value()) + buffer += reader_->get_value(); + +} + +void XMLDocShredder::processStartDocument(const string docName) +{ + // tell storage interface to construct the document name + + tb->NewOpenTag(DOCUMENT_OPEN_TAG); + +} + +void XMLDocShredder::processEndDocument() +{ + doText(); + /* tell the storage interface that document parsing has finished, and structures + * can now be written to disk. */ + tb->NewClosingTag(DOCUMENT_OPEN_TAG); + tree = tb->CloseDocument(); + +} + +void XMLDocShredder::processComment() +{ + //storageIfc_->newChild("!" + reader_->get_value()); + //storageIfc_->nodeFinished(); +} + +void XMLDocShredder::processProcessingInstruction() +{ + ustring name = reader_->get_name(); + ustring value = reader_->get_value(); + + /* Create a child node to store the target of the PI, append a text node to it to store + * the PI data, send to the storage interface. Close off the PI node with a call to + * nodeFinished + */ + + // storageIfc_->newChild("?" + name); + // storageIfc_->newText(value); + // storageIfc_->nodeFinished(); +} + +void XMLDocShredder::processDocTypeDeclaration() +{ + /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton + * in case we do want to process it in the future. + */ +} + +void XMLDocShredder::processCDATASection() +{ + /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data + * model. Instead, we simply pass the converted text value to the storage interface as + * a text node attached to the current context node. + */ + if (reader_->has_value()) + buffer+= reader_->get_value(); +} + +void XMLDocShredder::processUnknownNodeType() +{ + cout << "unknown token encountered during parsing" << endl; + throw xmlpp::parse_error("unknown token encountered during parsing"); + +} + +void XMLDocShredder::parse() +{ + while (reader_->read() && (reader_->get_read_state() != TextReader::Error)) + { + switch (reader_->get_node_type()) + { + case TextReader::Element: + processStartElement(); + break; + + case TextReader::Text: + processPCDATA(); + break; + + case TextReader::EndElement: + processEndElement(); + break; + + case TextReader::SignificantWhitespace: + processSignificantWhitespace(); + break; + + case TextReader::Comment: + processComment(); + break; + + case TextReader::DocumentType: + processDocTypeDeclaration(); + break; + + case TextReader::ProcessingInstruction: + processProcessingInstruction(); + break; + + case TextReader::CDATA: + processCDATASection(); + break; + + case TextReader::None: + processUnknownNodeType(); + break; + + default: + int type = reader_->get_node_type(); + cout << " Node type: " << type << endl; + break; + + } + } +}