XMLDocShredder.cpp

   1 /**********************************************************
   2  * XMLDocShredder.cpp
   3  * ---------------------
   4  * Implementation of the class that receives events from the XML parser and
   5  * invokes corresponding construction methods of the storage interface.
   6  *
   7  * Author: Greg Leighton
   8  * Date: 02/11/08
   9  * Changes:
  10  *              05/11/08 -- Fixed bug related to parsing empty elements
  11  *                               -- Set parser properties to automatically resolve
  12  *                                      entity references and load external DTD if present
  13  *                               -- Modified processEndDocument() by adding a nodeFinished()
  14  *                                      call to the storage interface to close off the
  15  *                                      document node
  16  *
  17  */
  18
  19 #include <iostream>
  20 #include "XMLDocShredder.h"
  21 #include "OCamlStorageInterface.h"
  22 #include <libxml++/exceptions/parse_error.h>
  23 #include "Utils.h"
  24
  25 using namespace Glib;
  26
  27 void XMLDocShredder::setProperties(){
  28   /* instruct the parser to expand entity references and report as
  29    * regular PCDATA
  30    */
  31   reader_->set_parser_property(
  32                                TextReader::SubstEntities, true);
  33
  34   /* instruct parser to read external DTD, if present.  This is
  35          * needed to obtain any entity definitions in the DTD
  36          */
  37   reader_->set_parser_property(
  38                                TextReader::LoadDtd, true);
  39
  40
  41   /*
  42    */
  43   reader_->set_parser_property(
  44                                TextReader::DefaultAttrs, true);
  45
  46
  47   /* but we don't want to do validation since it would slow us down
  48    */
  49
  50
  51   reader_->set_parser_property(
  52                                TextReader::Validate, false);
  53
  54 }
  55 XMLDocShredder::XMLDocShredder(const unsigned char * data,
  56                                TextReader::size_type size)
  57 {
  58   reader_ = new TextReader(data,size,"");
  59   setProperties();
  60   storageIfc_ = new OCamlStorageInterface();
  61   //tagsID_ = new unordered_map<int,string>(107);
  62   //idTags_ = new unordered_map<string,int>(107);
  63 }
  64
  65 XMLDocShredder::XMLDocShredder(const string inFileName)
  66 {
  67   reader_ = new TextReader(inFileName);
  68   setProperties();
  69   storageIfc_ = new OCamlStorageInterface();
  70   //  tagsID_ = new unordered_map<int,string>(107);
  71   // idTags_ = new unordered_map<string,int>(107);
  72 }
  73
  74 XMLDocShredder::~XMLDocShredder()
  75 {
  76         delete reader_;
  77         delete storageIfc_;
  78
  79 }
  80
  81 int XMLDocShredder::tagID(string name)
  82 {
  83   int res = tagsID_[name];
  84   return  res;
  85 }
  86 string XMLDocShredder::idTag(int id)
  87 {
  88
  89   return  idTags_[id];
  90 }
  91
  92
  93 void XMLDocShredder::processStartElement()
  94 {
  95         // fetch element name; this will be the full qualified name
  96         ustring name = reader_->get_name();
  97         bool empty = false;
  98
  99         storageIfc_->newChild(name);
 100
 101         /* We must be really carefull here. calling process attributes moves
 102            the document pointer on the last attribute, hence calling reader_->is_empty
 103            afterwards will yield the wrong result. It is better to call it while we are
 104            on the element and generate a nodeFinished() call at the end */
 105         empty = reader_->is_empty_element();
 106
 107
 108         // now, process attributes
 109         if (reader_->has_attributes())
 110           {
 111             processAttributes();
 112           };
 113
 114
 115         if (empty){
 116           DPRINT("Node " << name <<" is empty!\n")
 117             storageIfc_->nodeFinished();
 118         };
 119
 120
 121
 122
 123
 124 }
 125
 126 void XMLDocShredder::processEndElement()
 127 {
 128         // tell the storage interface that the current node has been completely processed
 129         storageIfc_->nodeFinished();
 130 }
 131
 132 void XMLDocShredder::processPCDATA()
 133 {
 134         // send the content of this PCDATA node to the storage interface as a text node
 135         if (reader_->has_value())
 136         {
 137           storageIfc_->newChild("<$>");
 138           storageIfc_->newText(reader_->get_value());
 139         }
 140 }
 141
 142 void XMLDocShredder::processAttributes()
 143 {
 144         reader_->move_to_first_attribute();
 145
 146         string nspaceStr = "xmlns";
 147         storageIfc_->newChild("<@>");
 148         do
 149         {
 150                 ustring name = reader_->get_name();
 151                 ustring value = reader_->get_value();
 152
 153                 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
 154                 * so we have to extract it and build a namespace uri node out of it before
 155                 * passing to the storage interface */
 156
 157                 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
 158                 {
 159                         storageIfc_->newChild(":" + value);
 160                         storageIfc_->nodeFinished();
 161                 }
 162
 163                 /* otherwise, this is an ordinary attribute, so we construct a new child node of the
 164                  * parent element to store the attribute name, possessing a child text node storing the
 165                  * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
 166                  */
 167
 168                 else
 169                 {
 170                         storageIfc_->newChild(name);
 171                         storageIfc_->newChild("<$>");
 172                         storageIfc_->newText(value);
 173                         storageIfc_->nodeFinished();
 174                         //                      storageIfc_->nodeFinished();
 175                 }
 176         }
 177         while (reader_->move_to_next_attribute());
 178         storageIfc_->nodeFinished();
 179 }
 180
 181 void XMLDocShredder::processSignificantWhitespace()
 182 {
 183         ustring value = reader_->get_value();
 184
 185         // each significant whitespace sequence constructs a text node
 186         storageIfc_->newChild("<$>");
 187         storageIfc_->newText(value);
 188         //storageIfc_->nodeFinished();
 189
 190 }
 191
 192 void XMLDocShredder::processStartDocument(const string docName)
 193 {
 194   // tell storage interface to construct the document name
 195   //  storageIfc_->newChild("");
 196 }
 197
 198 void XMLDocShredder::processEndDocument()
 199 {
 200         /* tell the storage interface that document parsing has finished, and structures
 201          * can now be written to disk. */
 202   //  storageIfc_->nodeFinished();
 203   storageIfc_->parsingFinished();
 204 }
 205
 206 void XMLDocShredder::processComment()
 207 {
 208   //storageIfc_->newChild("!" + reader_->get_value());
 209   //storageIfc_->nodeFinished();
 210 }
 211
 212 void XMLDocShredder::processProcessingInstruction()
 213 {
 214         ustring name = reader_->get_name();
 215         ustring value = reader_->get_value();
 216
 217         /* Create a child node to store the target of the PI, append a text node to it to store
 218          * the PI data, send to the storage interface.  Close off the PI node with a call to
 219          * nodeFinished
 220          */
 221
 222         // storageIfc_->newChild("?" + name);
 223         // storageIfc_->newText(value);
 224         // storageIfc_->nodeFinished();
 225 }
 226
 227 void XMLDocShredder::processDocTypeDeclaration()
 228 {
 229         /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton
 230          * in case we do want to process it in the future.
 231         */
 232 }
 233
 234 void XMLDocShredder::processCDATASection()
 235 {
 236         /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
 237          * model.  Instead, we simply pass the converted text value to the storage interface as
 238          * a text node attached to the current context node.
 239          */
 240         ustring value = reader_->get_value();
 241         storageIfc_->newChild("<$>");
 242         storageIfc_->newText(value);
 243         //      storageIfc_->nodeFinished();
 244
 245 }
 246
 247 void XMLDocShredder::processUnknownNodeType()
 248 {
 249         cout << "unknown token encountered during parsing" << endl;
 250         throw xmlpp::parse_error("unknown token encountered during parsing");
 251
 252 }
 253
 254 void XMLDocShredder::parse()
 255 {
 256         while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
 257         {
 258                 switch (reader_->get_node_type())
 259                 {
 260                         case TextReader::Element:
 261                                 processStartElement();
 262                                 break;
 263
 264                         case TextReader::Text:
 265                                 processPCDATA();
 266                                 break;
 267
 268                         case TextReader::EndElement:
 269                                 processEndElement();
 270                                 break;
 271
 272                         case TextReader::SignificantWhitespace:
 273                                 processSignificantWhitespace();
 274                                 break;
 275
 276                         case TextReader::Comment:
 277                                 processComment();
 278                                 break;
 279
 280                         case TextReader::DocumentType:
 281                                 processDocTypeDeclaration();
 282                                 break;
 283
 284                         case TextReader::ProcessingInstruction:
 285                                 processProcessingInstruction();
 286                                 break;
 287
 288                         case TextReader::CDATA:
 289                                 processCDATASection();
 290                                 break;
 291
 292                         case TextReader::None:
 293                                 processUnknownNodeType();
 294                                 break;
 295
 296                         default:
 297                                 int type = reader_->get_node_type();
 298                                 cout << "  Node type: " << type << endl;
 299                                 break;
 300
 301                 }
 302         }
 303 }