XMLDocShredder.cpp

   1 /**********************************************************
   2  * XMLDocShredder.cpp
   3  * ---------------------
   4  * Implementation of the class that receives events from the XML parser and
   5  * invokes corresponding construction methods of the storage interface.
   6  *
   7  * Author: Greg Leighton
   8  * Date: 02/11/08
   9  * Changes:
  10  *              05/11/08 -- Fixed bug related to parsing empty elements
  11  *                               -- Set parser properties to automatically resolve
  12  *                                      entity references and load external DTD if present
  13  *                               -- Modified processEndDocument() by adding a nodeFinished()
  14  *                                      call to the storage interface to close off the
  15  *                                      document node
  16  *
  17  */
  18
  19 #include <iostream>
  20 #include "XMLDocShredder.h"
  21 #include "SXSIStorageInterface.h"
  22 #include <libxml++/exceptions/parse_error.h>
  23 #include "Utils.h"
  24
  25 using namespace Glib;
  26
  27 void XMLDocShredder::setProperties(){
  28   /* instruct the parser to expand entity references and report as
  29    * regular PCDATA
  30    */
  31   reader_->set_parser_property(
  32                                TextReader::SubstEntities, true);
  33
  34   /* instruct parser to read external DTD, if present.  This is
  35          * needed to obtain any entity definitions in the DTD
  36          */
  37   reader_->set_parser_property(
  38                                TextReader::LoadDtd, true);
  39
  40
  41   /*
  42    */
  43   reader_->set_parser_property(
  44                                TextReader::DefaultAttrs, true);
  45
  46
  47   /* but we don't want to do validation since it would slow us down
  48    */
  49
  50
  51   reader_->set_parser_property(
  52                                TextReader::Validate, false);
  53
  54 }
  55 XMLDocShredder::XMLDocShredder(const unsigned char * data,
  56                                TextReader::size_type size)
  57 {
  58   last_text = false;
  59   reader_ = new TextReader(data,size,"");
  60   setProperties();
  61   storageIfc_ = new SXSIStorageInterface();
  62 }
  63
  64 XMLDocShredder::XMLDocShredder(const string inFileName)
  65 {
  66   last_text = false;
  67   reader_ = new TextReader(inFileName);
  68   setProperties();
  69   storageIfc_ = new SXSIStorageInterface();
  70
  71 }
  72
  73 XMLDocShredder::~XMLDocShredder()
  74 {
  75         delete reader_;
  76         delete storageIfc_;
  77
  78 }
  79
  80
  81 void XMLDocShredder::processStartElement()
  82 {
  83         // fetch element name; this will be the full qualified name
  84         ustring name = reader_->get_name();
  85         bool empty = false;
  86
  87         if (!last_text)
  88           storageIfc_->newText(""); //prevText
  89         last_text = false;
  90
  91         storageIfc_->newChild(name);
  92
  93         /* We must be really carefull here. calling process attributes moves
  94            the document pointer on the last attribute, hence calling reader_->is_empty
  95            afterwards will yield the wrong result. It is better to call it while we are
  96            on the element and generate a nodeFinished() call at the end */
  97         empty = reader_->is_empty_element();
  98
  99
 100         // now, process attributes
 101         if (reader_->has_attributes())
 102           {
 103             processAttributes();
 104           };
 105
 106
 107         if (empty){
 108           DPRINT("Node " << name <<" is empty!\n")
 109             storageIfc_->newText("");  //myText
 110             storageIfc_->nodeFinished(name);
 111             storageIfc_->newText("");  //nextText
 112         };
 113
 114
 115
 116
 117
 118 }
 119
 120 void XMLDocShredder::processEndElement()
 121 {
 122   // tell the storage interface that the current node has been completely processed
 123   if (!last_text)
 124     storageIfc_->newText(""); //nextText of previous node
 125   last_text = false;
 126   storageIfc_->nodeFinished(reader_->get_name());
 127 }
 128
 129 void XMLDocShredder::processPCDATA()
 130 {
 131         // send the content of this PCDATA node to the storage interface as a text node
 132
 133         if (reader_->has_value())
 134         {
 135           storageIfc_->newChild("<$>");
 136           storageIfc_->newText(reader_->get_value());
 137           last_text = true;
 138         }
 139         else
 140           storageIfc_->newText("");
 141 }
 142
 143 void XMLDocShredder::processAttributes()
 144 {
 145         reader_->move_to_first_attribute();
 146
 147         string nspaceStr = "xmlns";
 148         storageIfc_->newChild("<@>");
 149         do
 150         {
 151                 ustring name = reader_->get_name();
 152                 ustring value = reader_->get_value();
 153
 154                 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
 155                 * so we have to extract it and build a namespace uri node out of it before
 156                 * passing to the storage interface */
 157
 158                 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
 159                 {
 160                         storageIfc_->newChild(":" + value);
 161                         storageIfc_->nodeFinished(":" + value);
 162                 }
 163
 164                 /* otherwise, this is an ordinary attribute, so we construct a new child node of the
 165                  * parent element to store the attribute name, possessing a child text node storing the
 166                  * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
 167                  */
 168
 169                 else
 170                 {
 171                         storageIfc_->newChild(name);
 172                         storageIfc_->newChild("<$>");
 173                         storageIfc_->newText(value);
 174                         storageIfc_->nodeFinished("<$>");
 175                 }
 176         }
 177         while (reader_->move_to_next_attribute());
 178         storageIfc_->nodeFinished("<@>");
 179 }
 180
 181 void XMLDocShredder::processSignificantWhitespace()
 182 {
 183         ustring value = reader_->get_value();
 184
 185         // each significant whitespace sequence constructs a text node
 186         storageIfc_->newChild("<$>");
 187         storageIfc_->newText(value);
 188
 189
 190 }
 191
 192 void XMLDocShredder::processStartDocument(const string docName)
 193 {
 194   // tell storage interface to construct the document name
 195   storageIfc_->newChild("ROOT");
 196
 197 }
 198
 199 void XMLDocShredder::processEndDocument()
 200 {
 201         /* tell the storage interface that document parsing has finished, and structures
 202          * can now be written to disk. */
 203   storageIfc_->nodeFinished("ROOT");
 204   storageIfc_->parsingFinished();
 205 }
 206
 207 void XMLDocShredder::processComment()
 208 {
 209   //storageIfc_->newChild("!" + reader_->get_value());
 210   //storageIfc_->nodeFinished();
 211 }
 212
 213 void XMLDocShredder::processProcessingInstruction()
 214 {
 215         ustring name = reader_->get_name();
 216         ustring value = reader_->get_value();
 217
 218         /* Create a child node to store the target of the PI, append a text node to it to store
 219          * the PI data, send to the storage interface.  Close off the PI node with a call to
 220          * nodeFinished
 221          */
 222
 223         // storageIfc_->newChild("?" + name);
 224         // storageIfc_->newText(value);
 225         // storageIfc_->nodeFinished();
 226 }
 227
 228 void XMLDocShredder::processDocTypeDeclaration()
 229 {
 230         /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton
 231          * in case we do want to process it in the future.
 232         */
 233 }
 234
 235 void XMLDocShredder::processCDATASection()
 236 {
 237         /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
 238          * model.  Instead, we simply pass the converted text value to the storage interface as
 239          * a text node attached to the current context node.
 240          */
 241
 242         ustring value = reader_->get_value();
 243         storageIfc_->newChild("<$>");
 244         storageIfc_->newText(value);
 245         last_text = true;
 246         //      storageIfc_->nodeFinished();
 247
 248 }
 249
 250 void XMLDocShredder::processUnknownNodeType()
 251 {
 252         cout << "unknown token encountered during parsing" << endl;
 253         throw xmlpp::parse_error("unknown token encountered during parsing");
 254
 255 }
 256
 257 void XMLDocShredder::parse()
 258 {
 259         while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
 260         {
 261                 switch (reader_->get_node_type())
 262                 {
 263                         case TextReader::Element:
 264                                 processStartElement();
 265                                 break;
 266
 267                         case TextReader::Text:
 268                                 processPCDATA();
 269                                 break;
 270
 271                         case TextReader::EndElement:
 272                                 processEndElement();
 273                                 break;
 274
 275                         case TextReader::SignificantWhitespace:
 276                                 processSignificantWhitespace();
 277                                 break;
 278
 279                         case TextReader::Comment:
 280                                 processComment();
 281                                 break;
 282
 283                         case TextReader::DocumentType:
 284                                 processDocTypeDeclaration();
 285                                 break;
 286
 287                         case TextReader::ProcessingInstruction:
 288                                 processProcessingInstruction();
 289                                 break;
 290
 291                         case TextReader::CDATA:
 292                                 processCDATASection();
 293                                 break;
 294
 295                         case TextReader::None:
 296                                 processUnknownNodeType();
 297                                 break;
 298
 299                         default:
 300                                 int type = reader_->get_node_type();
 301                                 cout << "  Node type: " << type << endl;
 302                                 break;
 303
 304                 }
 305         }
 306 }