XMLDocShredder.cpp

   1 /**********************************************************
   2  * XMLDocShredder.cpp
   3  * ---------------------
   4  * Implementation of the class that receives events from the XML parser and
   5  * invokes corresponding construction methods of the storage interface.
   6  *
   7  * Author: Greg Leighton
   8  * Date: 02/11/08
   9  * Changes:
  10  *              05/11/08 -- Fixed bug related to parsing empty elements
  11  *                               -- Set parser properties to automatically resolve
  12  *                                      entity references and load external DTD if present
  13  *                               -- Modified processEndDocument() by adding a nodeFinished()
  14  *                                      call to the storage interface to close off the
  15  *                                      document node
  16  *
  17  */
  18
  19 #include <iostream>
  20 #include "XMLDocShredder.h"
  21 #include <libxml++/exceptions/parse_error.h>
  22 #include "Utils.h"
  23
  24 using namespace Glib;
  25
  26 void XMLDocShredder::doText(){
  27
  28   if (!buffer.empty()){
  29     tb->NewOpenTag(PCDATA_OPEN_TAG);
  30     tb->NewText(buffer);
  31     tb->NewClosingTag(PCDATA_OPEN_TAG);
  32   };
  33   buffer.clear();
  34
  35 }
  36
  37 void XMLDocShredder::setProperties(){
  38   /* instruct the parser to expand entity references and report as
  39    * regular PCDATA
  40    */
  41   reader_->set_parser_property(
  42                                TextReader::SubstEntities, true);
  43
  44   /* instruct parser to read external DTD, if present.  This is
  45          * needed to obtain any entity definitions in the DTD
  46          */
  47   reader_->set_parser_property(
  48                                TextReader::LoadDtd, true);
  49
  50
  51   /*
  52    */
  53   reader_->set_parser_property(
  54                                TextReader::DefaultAttrs, true);
  55
  56
  57   /* but we don't want to do validation since it would slow us down
  58    */
  59
  60
  61   reader_->set_parser_property(
  62                                TextReader::Validate, false);
  63
  64 }
  65 XMLDocShredder::XMLDocShredder(const unsigned char * data,
  66                                TextReader::size_type size,
  67                                int sf,
  68                                bool iet,
  69                                bool dtc)
  70 {
  71   tree = NULL;
  72   reader_ = new TextReader(data,size,"");
  73   setProperties();
  74   tb  = new XMLTreeBuilder();
  75   buffer.clear();
  76   tb->OpenDocument(iet,sf,dtc);
  77 }
  78
  79 XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc)
  80 {
  81   tree = NULL;
  82   reader_ = new TextReader(inFileName);
  83   setProperties();
  84   tb = new XMLTreeBuilder();
  85   buffer.clear();
  86   tb->OpenDocument(iet,sf,dtc);
  87 }
  88
  89 XMLDocShredder::~XMLDocShredder()
  90 {
  91         delete reader_;
  92         reader_ = NULL;
  93         delete tb;
  94         tb = NULL;
  95
  96 }
  97
  98
  99 void XMLDocShredder::processStartElement()
 100 {
 101   doText();
 102   // fetch element name; this will be the full qualified name
 103   ustring name = reader_->get_name();
 104   bool empty = false;
 105   size_t found = name.find_first_of(':');
 106   if (found == ustring::npos)
 107     tb->NewOpenTag(name);
 108   else
 109     tb->NewOpenTag(name.substr(found+1,name.length() - found - 1));
 110
 111   /* We must be really carefull here. calling process attributes moves
 112      the document pointer on the last attribute, hence calling reader_->is_empty
 113      afterwards will yield the wrong result. It is better to call it while we are
 114      on the element and generate a nodeFinished() call at the end */
 115   empty = reader_->is_empty_element();
 116
 117
 118   // now, process attributes
 119   if (reader_->has_attributes())
 120     processAttributes();
 121
 122
 123   if (empty)
 124     tb->NewClosingTag(name);
 125
 126
 127 }
 128
 129 void XMLDocShredder::processEndElement()
 130 {
 131   doText();
 132   ustring name = reader_->get_name();
 133   tb->NewClosingTag(name);
 134 }
 135
 136 void XMLDocShredder::processPCDATA()
 137 {
 138   // send the content of this PCDATA node to the storage interface as a text node
 139   if (reader_->has_value())
 140     buffer += reader_->get_value();
 141
 142 }
 143
 144 void XMLDocShredder::processAttributes()
 145 {
 146         reader_->move_to_first_attribute();
 147
 148         string nspaceStr = "xmlns";
 149         tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
 150         do
 151           {
 152                 ustring name = reader_->get_name();
 153                 ustring value = reader_->get_value();
 154
 155                 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
 156                 * so we have to extract it and build a namespace uri node out of it before
 157                 * passing to the storage interface */
 158
 159                 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
 160                 {
 161                   //TODO
 162                 }
 163
 164                 /* otherwise, this is an ordinary attribute, so we construct a new child node of the
 165                  * parent element to store the attribute name, possessing a child text node storing the
 166                  * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
 167                  */
 168
 169                 else
 170                 {
 171                   string attname = "<@>"+name;
 172                   tb->NewOpenTag(attname);
 173                   tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
 174                   tb->NewText(value);
 175                   tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
 176                   tb->NewClosingTag(attname);
 177                 }
 178         }
 179         while (reader_->move_to_next_attribute());
 180         tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
 181 }
 182
 183 void XMLDocShredder::processSignificantWhitespace()
 184 {
 185   if (reader_->has_value())
 186     buffer += reader_->get_value();
 187
 188 }
 189
 190 void XMLDocShredder::processStartDocument(const string docName)
 191 {
 192   // tell storage interface to construct the document name
 193
 194   tb->NewOpenTag(DOCUMENT_OPEN_TAG);
 195
 196 }
 197
 198 void XMLDocShredder::processEndDocument()
 199 {
 200   doText();
 201   /* tell the storage interface that document parsing has finished, and structures
 202    * can now be written to disk. */
 203   tb->NewClosingTag(DOCUMENT_OPEN_TAG);
 204   tree = tb->CloseDocument();
 205
 206 }
 207
 208 void XMLDocShredder::processComment()
 209 {
 210   //storageIfc_->newChild("!" + reader_->get_value());
 211   //storageIfc_->nodeFinished();
 212 }
 213
 214 void XMLDocShredder::processProcessingInstruction()
 215 {
 216         ustring name = reader_->get_name();
 217         ustring value = reader_->get_value();
 218
 219         /* Create a child node to store the target of the PI, append a text node to it to store
 220          * the PI data, send to the storage interface.  Close off the PI node with a call to
 221          * nodeFinished
 222          */
 223
 224         // storageIfc_->newChild("?" + name);
 225         // storageIfc_->newText(value);
 226         // storageIfc_->nodeFinished();
 227 }
 228
 229 void XMLDocShredder::processDocTypeDeclaration()
 230 {
 231         /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton
 232          * in case we do want to process it in the future.
 233         */
 234 }
 235
 236 void XMLDocShredder::processCDATASection()
 237 {
 238         /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
 239          * model.  Instead, we simply pass the converted text value to the storage interface as
 240          * a text node attached to the current context node.
 241          */
 242   if (reader_->has_value())
 243     buffer+= reader_->get_value();
 244 }
 245
 246 void XMLDocShredder::processUnknownNodeType()
 247 {
 248         cout << "unknown token encountered during parsing" << endl;
 249         throw xmlpp::parse_error("unknown token encountered during parsing");
 250
 251 }
 252
 253 void XMLDocShredder::parse()
 254 {
 255         while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
 256         {
 257                 switch (reader_->get_node_type())
 258                 {
 259                         case TextReader::Element:
 260                                 processStartElement();
 261                                 break;
 262
 263                         case TextReader::Text:
 264                                 processPCDATA();
 265                                 break;
 266
 267                         case TextReader::EndElement:
 268                                 processEndElement();
 269                                 break;
 270
 271                         case TextReader::SignificantWhitespace:
 272                                 processSignificantWhitespace();
 273                                 break;
 274
 275                         case TextReader::Comment:
 276                                 processComment();
 277                                 break;
 278
 279                         case TextReader::DocumentType:
 280                                 processDocTypeDeclaration();
 281                                 break;
 282
 283                         case TextReader::ProcessingInstruction:
 284                                 processProcessingInstruction();
 285                                 break;
 286
 287                         case TextReader::CDATA:
 288                                 processCDATASection();
 289                                 break;
 290
 291                         case TextReader::None:
 292                                 processUnknownNodeType();
 293                                 break;
 294
 295                         default:
 296                                 int type = reader_->get_node_type();
 297                                 cout << "  Node type: " << type << endl;
 298                                 break;
 299
 300                 }
 301         }
 302 }