XMLDocShredder.cpp

   1 /**********************************************************
   2  * XMLDocShredder.cpp
   3  * ---------------------
   4  * Implementation of the class that receives events from the XML parser and
   5  * invokes corresponding construction methods of the storage interface.
   6  *
   7  * Author: Greg Leighton
   8  * Date: 02/11/08
   9  * Changes:
  10  *              05/11/08 -- Fixed bug related to parsing empty elements
  11  *                               -- Set parser properties to automatically resolve
  12  *                                      entity references and load external DTD if present
  13  *                               -- Modified processEndDocument() by adding a nodeFinished()
  14  *                                      call to the storage interface to close off the
  15  *                                      document node
  16  *
  17  */
  18
  19 #include <iostream>
  20 #include "XMLDocShredder.h"
  21 #include <libxml++/exceptions/parse_error.h>
  22 #include "Utils.h"
  23
  24 using namespace Glib;
  25
  26 void XMLDocShredder::doText(){
  27
  28   if (!buffer.empty()){
  29     tb->NewOpenTag(PCDATA_OPEN_TAG);
  30     tb->NewText(buffer);
  31     tb->NewClosingTag(PCDATA_OPEN_TAG);
  32   };
  33   buffer.clear();
  34
  35 }
  36
  37 void XMLDocShredder::setProperties(){
  38   /* instruct the parser to expand entity references and report as
  39    * regular PCDATA
  40    */
  41   reader_->set_parser_property(
  42                                TextReader::SubstEntities, true);
  43
  44   /* instruct parser to read external DTD, if present.  This is
  45          * needed to obtain any entity definitions in the DTD
  46          */
  47   reader_->set_parser_property(
  48                                TextReader::LoadDtd, true);
  49
  50
  51   /*
  52    */
  53   reader_->set_parser_property(
  54                                TextReader::DefaultAttrs, true);
  55
  56
  57   /* but we don't want to do validation since it would slow us down
  58    */
  59
  60
  61   reader_->set_parser_property(
  62                                TextReader::Validate, false);
  63
  64 }
  65 XMLDocShredder::XMLDocShredder(const unsigned char * data,
  66                                TextReader::size_type size,
  67                                int sf,
  68                                bool iet,
  69                                bool dtc)
  70 {
  71   tree = NULL;
  72   reader_ = new TextReader(data,size,"");
  73   setProperties();
  74   tb  = new XMLTreeBuilder();
  75   buffer.clear();
  76   tb->OpenDocument(iet,sf,dtc);
  77 }
  78
  79 XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc)
  80 {
  81   tree = NULL;
  82   reader_ = new TextReader(inFileName);
  83   setProperties();
  84   tb = new XMLTreeBuilder();
  85   buffer.clear();
  86   tb->OpenDocument(iet,sf,dtc);
  87 }
  88
  89 XMLDocShredder::~XMLDocShredder()
  90 {
  91         delete reader_;
  92         reader_ = NULL;
  93         delete tb;
  94         tb = NULL;
  95
  96 }
  97
  98
  99 void XMLDocShredder::processStartElement()
 100 {
 101   doText();
 102   // fetch element name; this will be the full qualified name
 103   ustring name = reader_->get_name();
 104   bool empty = false;
 105
 106   tb->NewOpenTag(name);
 107
 108   /* We must be really carefull here. calling process attributes moves
 109      the document pointer on the last attribute, hence calling reader_->is_empty
 110      afterwards will yield the wrong result. It is better to call it while we are
 111      on the element and generate a nodeFinished() call at the end */
 112   empty = reader_->is_empty_element();
 113
 114
 115   // now, process attributes
 116   if (reader_->has_attributes())
 117     processAttributes();
 118
 119
 120   if (empty)
 121     tb->NewClosingTag(name);
 122
 123
 124 }
 125
 126 void XMLDocShredder::processEndElement()
 127 {
 128   doText();
 129   ustring name = reader_->get_name();
 130   tb->NewClosingTag(name);
 131 }
 132
 133 void XMLDocShredder::processPCDATA()
 134 {
 135   // send the content of this PCDATA node to the storage interface as a text node
 136   if (reader_->has_value())
 137     buffer += reader_->get_value();
 138
 139 }
 140
 141 void XMLDocShredder::processAttributes()
 142 {
 143         reader_->move_to_first_attribute();
 144
 145         string nspaceStr = "xmlns";
 146         tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
 147         do
 148           {
 149                 ustring name = reader_->get_name();
 150                 ustring value = reader_->get_value();
 151
 152                 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
 153                 * so we have to extract it and build a namespace uri node out of it before
 154                 * passing to the storage interface */
 155
 156                 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
 157                 {
 158                   //TODO
 159                 }
 160
 161                 /* otherwise, this is an ordinary attribute, so we construct a new child node of the
 162                  * parent element to store the attribute name, possessing a child text node storing the
 163                  * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
 164                  */
 165
 166                 else
 167                 {
 168                   string attname = "<@>"+name;
 169                   tb->NewOpenTag(attname);
 170                   tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
 171                   tb->NewText(value);
 172                   tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
 173                   tb->NewClosingTag(attname);
 174                 }
 175         }
 176         while (reader_->move_to_next_attribute());
 177         tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
 178 }
 179
 180 void XMLDocShredder::processSignificantWhitespace()
 181 {
 182   if (reader_->has_value())
 183     buffer += reader_->get_value();
 184
 185 }
 186
 187 void XMLDocShredder::processStartDocument(const string docName)
 188 {
 189   // tell storage interface to construct the document name
 190
 191   tb->NewOpenTag(DOCUMENT_OPEN_TAG);
 192
 193 }
 194
 195 void XMLDocShredder::processEndDocument()
 196 {
 197   doText();
 198   /* tell the storage interface that document parsing has finished, and structures
 199    * can now be written to disk. */
 200   tb->NewClosingTag(DOCUMENT_OPEN_TAG);
 201   tree = tb->CloseDocument();
 202
 203 }
 204
 205 void XMLDocShredder::processComment()
 206 {
 207   //storageIfc_->newChild("!" + reader_->get_value());
 208   //storageIfc_->nodeFinished();
 209 }
 210
 211 void XMLDocShredder::processProcessingInstruction()
 212 {
 213         ustring name = reader_->get_name();
 214         ustring value = reader_->get_value();
 215
 216         /* Create a child node to store the target of the PI, append a text node to it to store
 217          * the PI data, send to the storage interface.  Close off the PI node with a call to
 218          * nodeFinished
 219          */
 220
 221         // storageIfc_->newChild("?" + name);
 222         // storageIfc_->newText(value);
 223         // storageIfc_->nodeFinished();
 224 }
 225
 226 void XMLDocShredder::processDocTypeDeclaration()
 227 {
 228         /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton
 229          * in case we do want to process it in the future.
 230         */
 231 }
 232
 233 void XMLDocShredder::processCDATASection()
 234 {
 235         /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
 236          * model.  Instead, we simply pass the converted text value to the storage interface as
 237          * a text node attached to the current context node.
 238          */
 239   if (reader_->has_value())
 240     buffer+= reader_->get_value();
 241 }
 242
 243 void XMLDocShredder::processUnknownNodeType()
 244 {
 245         cout << "unknown token encountered during parsing" << endl;
 246         throw xmlpp::parse_error("unknown token encountered during parsing");
 247
 248 }
 249
 250 void XMLDocShredder::parse()
 251 {
 252         while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
 253         {
 254                 switch (reader_->get_node_type())
 255                 {
 256                         case TextReader::Element:
 257                                 processStartElement();
 258                                 break;
 259
 260                         case TextReader::Text:
 261                                 processPCDATA();
 262                                 break;
 263
 264                         case TextReader::EndElement:
 265                                 processEndElement();
 266                                 break;
 267
 268                         case TextReader::SignificantWhitespace:
 269                                 processSignificantWhitespace();
 270                                 break;
 271
 272                         case TextReader::Comment:
 273                                 processComment();
 274                                 break;
 275
 276                         case TextReader::DocumentType:
 277                                 processDocTypeDeclaration();
 278                                 break;
 279
 280                         case TextReader::ProcessingInstruction:
 281                                 processProcessingInstruction();
 282                                 break;
 283
 284                         case TextReader::CDATA:
 285                                 processCDATASection();
 286                                 break;
 287
 288                         case TextReader::None:
 289                                 processUnknownNodeType();
 290                                 break;
 291
 292                         default:
 293                                 int type = reader_->get_node_type();
 294                                 cout << "  Node type: " << type << endl;
 295                                 break;
 296
 297                 }
 298         }
 299 }