XMLDocShredder.cpp

   1 /**********************************************************
   2  * XMLDocShredder.cpp
   3  * ---------------------
   4  * Implementation of the class that receives events from the XML parser and
   5  * invokes corresponding construction methods of the storage interface.
   6  *
   7  * Author: Greg Leighton
   8  * Date: 02/11/08
   9  * Changes:
  10  *              05/11/08 -- Fixed bug related to parsing empty elements
  11  *                               -- Set parser properties to automatically resolve
  12  *                                      entity references and load external DTD if present
  13  *                               -- Modified processEndDocument() by adding a nodeFinished()
  14  *                                      call to the storage interface to close off the
  15  *                                      document node
  16  *
  17  */
  18
  19 #include <iostream>
  20 #include "XMLDocShredder.h"
  21 #include "SXSIStorageInterface.h"
  22 #include <libxml++/exceptions/parse_error.h>
  23 #include "Utils.h"
  24
  25 using namespace Glib;
  26
  27 void XMLDocShredder::setProperties(){
  28   /* instruct the parser to expand entity references and report as
  29    * regular PCDATA
  30    */
  31   reader_->set_parser_property(
  32                                TextReader::SubstEntities, true);
  33
  34   /* instruct parser to read external DTD, if present.  This is
  35          * needed to obtain any entity definitions in the DTD
  36          */
  37   reader_->set_parser_property(
  38                                TextReader::LoadDtd, true);
  39
  40
  41   /*
  42    */
  43   reader_->set_parser_property(
  44                                TextReader::DefaultAttrs, true);
  45
  46
  47   /* but we don't want to do validation since it would slow us down
  48    */
  49
  50
  51   reader_->set_parser_property(
  52                                TextReader::Validate, false);
  53
  54 }
  55 XMLDocShredder::XMLDocShredder(const unsigned char * data,
  56                                TextReader::size_type size,
  57                                int sf,
  58                                bool iet,
  59                                bool dtc)
  60 {
  61   last_text = false;
  62   reader_ = new TextReader(data,size,"");
  63   setProperties();
  64   storageIfc_ = new SXSIStorageInterface(sf,iet,dtc);
  65   buffer = "";
  66 }
  67
  68 XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc)
  69 {
  70   last_text = false;
  71   reader_ = new TextReader(inFileName);
  72   setProperties();
  73   storageIfc_ = new SXSIStorageInterface(sf,iet,dtc);
  74   buffer = "";
  75 }
  76
  77 XMLDocShredder::~XMLDocShredder()
  78 {
  79         delete reader_;
  80         delete storageIfc_;
  81
  82 }
  83
  84
  85 void XMLDocShredder::processStartElement()
  86 {
  87         // fetch element name; this will be the full qualified name
  88         ustring name = reader_->get_name();
  89         bool empty = false;
  90
  91         storageIfc_->newText(buffer); //prevText
  92         buffer.erase();
  93
  94         storageIfc_->newChild(name);
  95
  96         /* We must be really carefull here. calling process attributes moves
  97            the document pointer on the last attribute, hence calling reader_->is_empty
  98            afterwards will yield the wrong result. It is better to call it while we are
  99            on the element and generate a nodeFinished() call at the end */
 100         empty = reader_->is_empty_element();
 101
 102
 103         // now, process attributes
 104         if (reader_->has_attributes())
 105           {
 106             processAttributes();
 107           };
 108
 109
 110         if (empty){
 111             storageIfc_->newText("");  //myText
 112             storageIfc_->nodeFinished(name);
 113         };
 114
 115
 116 }
 117
 118 void XMLDocShredder::processEndElement()
 119 {
 120   // tell the storage interface that the current node has been completely processed
 121   storageIfc_->newText(buffer); //prevText
 122   buffer.erase();
 123   storageIfc_->nodeFinished(reader_->get_name());
 124 }
 125
 126 void XMLDocShredder::processPCDATA()
 127 {
 128         // send the content of this PCDATA node to the storage interface as a text node
 129
 130         if (reader_->has_value())
 131         {
 132           buffer += reader_->get_value();
 133         };
 134
 135 }
 136
 137 void XMLDocShredder::processAttributes()
 138 {
 139         reader_->move_to_first_attribute();
 140
 141         string nspaceStr = "xmlns";
 142         storageIfc_->newText(""); //prevText
 143         storageIfc_->newChild("<@>");
 144         do
 145         {
 146                 ustring name = reader_->get_name();
 147                 ustring value = reader_->get_value();
 148
 149                 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
 150                 * so we have to extract it and build a namespace uri node out of it before
 151                 * passing to the storage interface */
 152
 153                 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
 154                 {
 155                         storageIfc_->newChild(":" + value);
 156                         storageIfc_->nodeFinished(":" + value);
 157                 }
 158
 159                 /* otherwise, this is an ordinary attribute, so we construct a new child node of the
 160                  * parent element to store the attribute name, possessing a child text node storing the
 161                  * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
 162                  */
 163
 164                 else
 165                 {
 166                   storageIfc_->newText(""); //prevText
 167                   storageIfc_->newChild(name);
 168                   storageIfc_->newText(value);
 169                   storageIfc_->nodeFinished(name);
 170                 }
 171         }
 172         while (reader_->move_to_next_attribute());
 173         storageIfc_->newText(""); //nextText
 174         storageIfc_->nodeFinished("<@>");
 175 }
 176
 177 void XMLDocShredder::processSignificantWhitespace()
 178 {
 179   // each significant whitespace sequence constructs a text node
 180   buffer += reader_->get_value();
 181
 182 }
 183
 184 void XMLDocShredder::processStartDocument(const string docName)
 185 {
 186   // tell storage interface to construct the document name
 187   storageIfc_->newChild("");
 188
 189 }
 190
 191 void XMLDocShredder::processEndDocument()
 192 {
 193         /* tell the storage interface that document parsing has finished, and structures
 194          * can now be written to disk. */
 195   storageIfc_->newText("");
 196   storageIfc_->nodeFinished("");
 197   storageIfc_->parsingFinished();
 198 }
 199
 200 void XMLDocShredder::processComment()
 201 {
 202   //storageIfc_->newChild("!" + reader_->get_value());
 203   //storageIfc_->nodeFinished();
 204 }
 205
 206 void XMLDocShredder::processProcessingInstruction()
 207 {
 208         ustring name = reader_->get_name();
 209         ustring value = reader_->get_value();
 210
 211         /* Create a child node to store the target of the PI, append a text node to it to store
 212          * the PI data, send to the storage interface.  Close off the PI node with a call to
 213          * nodeFinished
 214          */
 215
 216         // storageIfc_->newChild("?" + name);
 217         // storageIfc_->newText(value);
 218         // storageIfc_->nodeFinished();
 219 }
 220
 221 void XMLDocShredder::processDocTypeDeclaration()
 222 {
 223         /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton
 224          * in case we do want to process it in the future.
 225         */
 226 }
 227
 228 void XMLDocShredder::processCDATASection()
 229 {
 230         /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
 231          * model.  Instead, we simply pass the converted text value to the storage interface as
 232          * a text node attached to the current context node.
 233          */
 234
 235         ustring value = reader_->get_value();
 236         storageIfc_->newText(value);
 237         last_text = true;
 238
 239 }
 240
 241 void XMLDocShredder::processUnknownNodeType()
 242 {
 243         cout << "unknown token encountered during parsing" << endl;
 244         throw xmlpp::parse_error("unknown token encountered during parsing");
 245
 246 }
 247
 248 void XMLDocShredder::parse()
 249 {
 250         while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
 251         {
 252                 switch (reader_->get_node_type())
 253                 {
 254                         case TextReader::Element:
 255                                 processStartElement();
 256                                 break;
 257
 258                         case TextReader::Text:
 259                                 processPCDATA();
 260                                 break;
 261
 262                         case TextReader::EndElement:
 263                                 processEndElement();
 264                                 break;
 265
 266                         case TextReader::SignificantWhitespace:
 267                                 processSignificantWhitespace();
 268                                 break;
 269
 270                         case TextReader::Comment:
 271                                 processComment();
 272                                 break;
 273
 274                         case TextReader::DocumentType:
 275                                 processDocTypeDeclaration();
 276                                 break;
 277
 278                         case TextReader::ProcessingInstruction:
 279                                 processProcessingInstruction();
 280                                 break;
 281
 282                         case TextReader::CDATA:
 283                                 processCDATASection();
 284                                 break;
 285
 286                         case TextReader::None:
 287                                 processUnknownNodeType();
 288                                 break;
 289
 290                         default:
 291                                 int type = reader_->get_node_type();
 292                                 cout << "  Node type: " << type << endl;
 293                                 break;
 294
 295                 }
 296         }
 297 }