X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLDocShredder.cpp;h=cb70e19590eef87217c0daaa82c62f66061ca7c7;hb=df5fdb22632be887ecd9f5c46a014e7e970148a2;hp=6e9c41a3e5f2ff138bedc4a91edd4cec972e7b1f;hpb=3623eefccfb5fc69e19ad975a3669f51a2a8b276;p=SXSI%2Fxpathcomp.git diff --git a/XMLDocShredder.cpp b/XMLDocShredder.cpp index 6e9c41a..cb70e19 100644 --- a/XMLDocShredder.cpp +++ b/XMLDocShredder.cpp @@ -18,12 +18,22 @@ #include #include "XMLDocShredder.h" -#include "OCamlStorageInterface.h" #include #include "Utils.h" using namespace Glib; +void XMLDocShredder::doText(){ + + if (!buffer.empty()){ + tb->NewOpenTag(PCDATA_OPEN_TAG); + tb->NewText(buffer); + tb->NewClosingTag(PCDATA_OPEN_TAG); + }; + buffer.clear(); + +} + void XMLDocShredder::setProperties(){ /* instruct the parser to expand entity references and report as * regular PCDATA @@ -53,100 +63,92 @@ void XMLDocShredder::setProperties(){ } XMLDocShredder::XMLDocShredder(const unsigned char * data, - TextReader::size_type size) + TextReader::size_type size, + int sf, + bool iet, + bool dtc) { + tree = NULL; reader_ = new TextReader(data,size,""); setProperties(); - storageIfc_ = new OCamlStorageInterface(); - //tagsID_ = new unordered_map(107); - //idTags_ = new unordered_map(107); + tb = new XMLTreeBuilder(); + buffer.clear(); + tb->OpenDocument(iet,sf,dtc); } -XMLDocShredder::XMLDocShredder(const string inFileName) +XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc) { + tree = NULL; reader_ = new TextReader(inFileName); setProperties(); - storageIfc_ = new OCamlStorageInterface(); - // tagsID_ = new unordered_map(107); - // idTags_ = new unordered_map(107); + tb = new XMLTreeBuilder(); + buffer.clear(); + tb->OpenDocument(iet,sf,dtc); } XMLDocShredder::~XMLDocShredder() { delete reader_; - delete storageIfc_; + reader_ = NULL; + delete tb; + tb = NULL; } -int XMLDocShredder::tagID(string name) -{ - int res = tagsID_[name]; - return res; -} -string XMLDocShredder::idTag(int id) -{ - - return idTags_[id]; -} - void XMLDocShredder::processStartElement() { - // fetch element name; this will be the full qualified name - ustring name = reader_->get_name(); - bool empty = false; - - storageIfc_->newChild(name); - - /* We must be really carefull here. calling process attributes moves - the document pointer on the last attribute, hence calling reader_->is_empty - afterwards will yield the wrong result. It is better to call it while we are - on the element and generate a nodeFinished() call at the end */ - empty = reader_->is_empty_element(); - - - // now, process attributes - if (reader_->has_attributes()) - { - processAttributes(); - }; - - - if (empty){ - DPRINT("Node " << name <<" is empty!\n") - storageIfc_->nodeFinished(); - }; - - - - - + doText(); + // fetch element name; this will be the full qualified name + ustring name = reader_->get_name(); + bool empty = false; + size_t found = name.find_first_of(':'); + if (found == ustring::npos) + tb->NewOpenTag(name); + else + tb->NewOpenTag(name.substr(found+1,name.length() - found - 1)); + + /* We must be really carefull here. calling process attributes moves + the document pointer on the last attribute, hence calling reader_->is_empty + afterwards will yield the wrong result. It is better to call it while we are + on the element and generate a nodeFinished() call at the end */ + empty = reader_->is_empty_element(); + + + // now, process attributes + if (reader_->has_attributes()) + processAttributes(); + + + if (empty) + tb->NewClosingTag(name); + + } void XMLDocShredder::processEndElement() { - // tell the storage interface that the current node has been completely processed - storageIfc_->nodeFinished(); + doText(); + ustring name = reader_->get_name(); + tb->NewClosingTag(name); } void XMLDocShredder::processPCDATA() { - // send the content of this PCDATA node to the storage interface as a text node - if (reader_->has_value()) - { - storageIfc_->newChild("<$>"); - storageIfc_->newText(reader_->get_value()); - } + // send the content of this PCDATA node to the storage interface as a text node + if (reader_->has_value()) + buffer += reader_->get_value(); + } void XMLDocShredder::processAttributes() { reader_->move_to_first_attribute(); - string nspaceStr = "xmlns"; - storageIfc_->newChild("<@>"); + string nspaceStr = "xmlns"; + tb->NewOpenTag(ATTRIBUTE_OPEN_TAG); do - { + { ustring name = reader_->get_name(); ustring value = reader_->get_value(); @@ -156,8 +158,7 @@ void XMLDocShredder::processAttributes() if ((name.find(nspaceStr.c_str(), 0, 5)) == 0) { - storageIfc_->newChild(":" + value); - storageIfc_->nodeFinished(); + //TODO } /* otherwise, this is an ordinary attribute, so we construct a new child node of the @@ -167,40 +168,41 @@ void XMLDocShredder::processAttributes() else { - storageIfc_->newChild(name); - storageIfc_->newChild("<$>"); - storageIfc_->newText(value); - storageIfc_->nodeFinished(); - // storageIfc_->nodeFinished(); + string attname = "<@>"+name; + tb->NewOpenTag(attname); + tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG); + tb->NewText(value); + tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG); + tb->NewClosingTag(attname); } } while (reader_->move_to_next_attribute()); - storageIfc_->nodeFinished(); + tb->NewClosingTag(ATTRIBUTE_OPEN_TAG); } void XMLDocShredder::processSignificantWhitespace() { - ustring value = reader_->get_value(); - - // each significant whitespace sequence constructs a text node - storageIfc_->newChild("<$>"); - storageIfc_->newText(value); - //storageIfc_->nodeFinished(); - + if (reader_->has_value()) + buffer += reader_->get_value(); + } void XMLDocShredder::processStartDocument(const string docName) { // tell storage interface to construct the document name - // storageIfc_->newChild(""); + + tb->NewOpenTag(DOCUMENT_OPEN_TAG); + } void XMLDocShredder::processEndDocument() { - /* tell the storage interface that document parsing has finished, and structures - * can now be written to disk. */ - // storageIfc_->nodeFinished(); - storageIfc_->parsingFinished(); + doText(); + /* tell the storage interface that document parsing has finished, and structures + * can now be written to disk. */ + tb->NewClosingTag(DOCUMENT_OPEN_TAG); + tree = tb->CloseDocument(); + } void XMLDocShredder::processComment() @@ -237,11 +239,8 @@ void XMLDocShredder::processCDATASection() * model. Instead, we simply pass the converted text value to the storage interface as * a text node attached to the current context node. */ - ustring value = reader_->get_value(); - storageIfc_->newChild("<$>"); - storageIfc_->newText(value); - // storageIfc_->nodeFinished(); - + if (reader_->has_value()) + buffer+= reader_->get_value(); } void XMLDocShredder::processUnknownNodeType()