X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=src%2FXMLDocShredder.cpp;fp=src%2FXMLDocShredder.cpp;h=0000000000000000000000000000000000000000;hb=ff13d22656fdbdffb2d909192bd17ba135606224;hp=ffe4de7616064f3fa5d57153c0099e15017fa308;hpb=d7e793e387b3a59877765e15c54649a9d74d137e;p=SXSI%2Fxpathcomp.git diff --git a/src/XMLDocShredder.cpp b/src/XMLDocShredder.cpp deleted file mode 100644 index ffe4de7..0000000 --- a/src/XMLDocShredder.cpp +++ /dev/null @@ -1,306 +0,0 @@ -/********************************************************** - * XMLDocShredder.cpp - * --------------------- - * Implementation of the class that receives events from the XML parser and - * invokes corresponding construction methods of the storage interface. - * - * Author: Greg Leighton - * Date: 02/11/08 - * Changes: - * 05/11/08 -- Fixed bug related to parsing empty elements - * -- Set parser properties to automatically resolve - * entity references and load external DTD if present - * -- Modified processEndDocument() by adding a nodeFinished() - * call to the storage interface to close off the - * document node - * - */ - -#include -#include "XMLDocShredder.h" -#include -#include "Utils.h" - -using namespace Glib; - -void XMLDocShredder::doText(){ - - if (!buffer.empty()){ - tb->NewOpenTag(PCDATA_OPEN_TAG); - tb->NewText(buffer); - tb->NewClosingTag(PCDATA_OPEN_TAG); - }; - buffer.clear(); - -} - -void XMLDocShredder::setProperties(){ - /* instruct the parser to expand entity references and report as - * regular PCDATA - */ - reader_->set_parser_property( - TextReader::SubstEntities, true); - - /* instruct parser to read external DTD, if present. This is - * needed to obtain any entity definitions in the DTD - */ - reader_->set_parser_property( - TextReader::LoadDtd, true); - - - /* - */ - reader_->set_parser_property( - TextReader::DefaultAttrs, true); - - - /* but we don't want to do validation since it would slow us down - */ - - - reader_->set_parser_property( - TextReader::Validate, false); - -} -XMLDocShredder::XMLDocShredder(const unsigned char * data, - TextReader::size_type size, - int sf, - bool iet, - bool dtc, - TextCollectionBuilder::index_type_t index_type - ) -{ - tree = NULL; - reader_ = new TextReader(data,size,""); - setProperties(); - tb = new XMLTreeBuilder(); - buffer.clear(); - tb->OpenDocument(iet,sf,dtc, index_type); -} - -XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc, - TextCollectionBuilder::index_type_t index_type - ) -{ - tree = NULL; - reader_ = new TextReader(inFileName); - setProperties(); - tb = new XMLTreeBuilder(); - buffer.clear(); - tb->OpenDocument(iet,sf,dtc,index_type); -} - -XMLDocShredder::~XMLDocShredder() -{ - delete reader_; - reader_ = NULL; - delete tb; - tb = NULL; - -} - - -void XMLDocShredder::processStartElement() -{ - doText(); - // fetch element name; this will be the full qualified name - ustring name = reader_->get_name(); - bool empty = false; - size_t found = name.find_first_of(':'); - if (found == ustring::npos) - tb->NewOpenTag(name); - else - tb->NewOpenTag(name.substr(found+1,name.length() - found - 1)); - - /* We must be really carefull here. calling process attributes moves - the document pointer on the last attribute, hence calling reader_->is_empty - afterwards will yield the wrong result. It is better to call it while we are - on the element and generate a nodeFinished() call at the end */ - empty = reader_->is_empty_element(); - - - // now, process attributes - if (reader_->has_attributes()) - processAttributes(); - - - if (empty) - tb->NewClosingTag(name); - - -} - -void XMLDocShredder::processEndElement() -{ - doText(); - ustring name = reader_->get_name(); - tb->NewClosingTag(name); -} - -void XMLDocShredder::processPCDATA() -{ - // send the content of this PCDATA node to the storage interface as a text node - if (reader_->has_value()) - buffer += reader_->get_value(); - -} - -void XMLDocShredder::processAttributes() -{ - reader_->move_to_first_attribute(); - - string nspaceStr = "xmlns"; - tb->NewOpenTag(ATTRIBUTE_OPEN_TAG); - do - { - ustring name = reader_->get_name(); - ustring value = reader_->get_value(); - - /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute, - * so we have to extract it and build a namespace uri node out of it before - * passing to the storage interface */ - - if ((name.find(nspaceStr.c_str(), 0, 5)) == 0) - { - //TODO - } - - /* otherwise, this is an ordinary attribute, so we construct a new child node of the - * parent element to store the attribute name, possessing a child text node storing the - * attribute value. Then, we close off the attribute node with a call to nodeFinished() - */ - - else - { - string attname = "<@>"+name; - tb->NewOpenTag(attname); - tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG); - tb->NewText(value); - tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG); - tb->NewClosingTag(attname); - } - } - while (reader_->move_to_next_attribute()); - tb->NewClosingTag(ATTRIBUTE_OPEN_TAG); -} - -void XMLDocShredder::processSignificantWhitespace() -{ - if (reader_->has_value()) - buffer += reader_->get_value(); - -} - -void XMLDocShredder::processStartDocument(const string docName) -{ - // tell storage interface to construct the document name - - tb->NewOpenTag(DOCUMENT_OPEN_TAG); - -} - -void XMLDocShredder::processEndDocument() -{ - doText(); - /* tell the storage interface that document parsing has finished, and structures - * can now be written to disk. */ - tb->NewClosingTag(DOCUMENT_OPEN_TAG); - tree = tb->CloseDocument(); - -} - -void XMLDocShredder::processComment() -{ - //storageIfc_->newChild("!" + reader_->get_value()); - //storageIfc_->nodeFinished(); -} - -void XMLDocShredder::processProcessingInstruction() -{ - ustring name = reader_->get_name(); - ustring value = reader_->get_value(); - - /* Create a child node to store the target of the PI, append a text node to it to store - * the PI data, send to the storage interface. Close off the PI node with a call to - * nodeFinished - */ - - // storageIfc_->newChild("?" + name); - // storageIfc_->newText(value); - // storageIfc_->nodeFinished(); -} - -void XMLDocShredder::processDocTypeDeclaration() -{ - /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton - * in case we do want to process it in the future. - */ -} - -void XMLDocShredder::processCDATASection() -{ - /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data - * model. Instead, we simply pass the converted text value to the storage interface as - * a text node attached to the current context node. - */ - if (reader_->has_value()) - buffer+= reader_->get_value(); -} - -void XMLDocShredder::processUnknownNodeType() -{ - cout << "unknown token encountered during parsing" << endl; - throw xmlpp::parse_error("unknown token encountered during parsing"); - -} - -void XMLDocShredder::parse() -{ - while (reader_->read() && (reader_->get_read_state() != TextReader::Error)) - { - switch (reader_->get_node_type()) - { - case TextReader::Element: - processStartElement(); - break; - - case TextReader::Text: - processPCDATA(); - break; - - case TextReader::EndElement: - processEndElement(); - break; - - case TextReader::SignificantWhitespace: - processSignificantWhitespace(); - break; - - case TextReader::Comment: - processComment(); - break; - - case TextReader::DocumentType: - processDocTypeDeclaration(); - break; - - case TextReader::ProcessingInstruction: - processProcessingInstruction(); - break; - - case TextReader::CDATA: - processCDATASection(); - break; - - case TextReader::None: - processUnknownNodeType(); - break; - - default: - int type = reader_->get_node_type(); - cout << " Node type: " << type << endl; - break; - - } - } -}