src/XMLDocShredder.cpp

   1 /**********************************************************
   2  * XMLDocShredder.cpp
   3  * ---------------------
   4  * Implementation of the class that receives events from the XML parser and
   5  * invokes corresponding construction methods of the storage interface.
   6  *
   7  * Author: Greg Leighton
   8  * Date: 02/11/08
   9  * Changes:
  10  *              05/11/08 -- Fixed bug related to parsing empty elements
  11  *                               -- Set parser properties to automatically resolve
  12  *                                      entity references and load external DTD if present
  13  *                               -- Modified processEndDocument() by adding a nodeFinished()
  14  *                                      call to the storage interface to close off the
  15  *                                      document node
  16  *
  17  */
  18
  19 #include <iostream>
  20 #include "XMLDocShredder.h"
  21 #include <libxml++/exceptions/parse_error.h>
  22 #include "Utils.h"
  23
  24 using namespace Glib;
  25
  26 void XMLDocShredder::doText(){
  27
  28   if (!buffer.empty()){
  29     tb->NewOpenTag(PCDATA_OPEN_TAG);
  30     tb->NewText(buffer);
  31     tb->NewClosingTag(PCDATA_OPEN_TAG);
  32   };
  33   buffer.clear();
  34
  35 }
  36
  37 void XMLDocShredder::setProperties(){
  38   /* instruct the parser to expand entity references and report as
  39    * regular PCDATA
  40    */
  41   reader_->set_parser_property(
  42                                TextReader::SubstEntities, true);
  43
  44   /* instruct parser to read external DTD, if present.  This is
  45          * needed to obtain any entity definitions in the DTD
  46          */
  47   reader_->set_parser_property(
  48                                TextReader::LoadDtd, true);
  49
  50
  51   /*
  52    */
  53   reader_->set_parser_property(
  54                                TextReader::DefaultAttrs, true);
  55
  56
  57   /* but we don't want to do validation since it would slow us down
  58    */
  59
  60
  61   reader_->set_parser_property(
  62                                TextReader::Validate, false);
  63
  64 }
  65 XMLDocShredder::XMLDocShredder(const unsigned char * data,
  66                                TextReader::size_type size,
  67                                int sf,
  68                                bool iet,
  69                                bool dtc,
  70                                TextCollectionBuilder::index_type_t index_type
  71                                )
  72 {
  73   tree = NULL;
  74   reader_ = new TextReader(data,size,"");
  75   setProperties();
  76   tb  = new XMLTreeBuilder();
  77   buffer.clear();
  78   tb->OpenDocument(iet,sf,dtc, index_type);
  79 }
  80
  81 XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc,
  82                                TextCollectionBuilder::index_type_t index_type
  83                                )
  84 {
  85   tree = NULL;
  86   reader_ = new TextReader(inFileName);
  87   setProperties();
  88   tb = new XMLTreeBuilder();
  89   buffer.clear();
  90   tb->OpenDocument(iet,sf,dtc,index_type);
  91 }
  92
  93 XMLDocShredder::~XMLDocShredder()
  94 {
  95         delete reader_;
  96         reader_ = NULL;
  97         delete tb;
  98         tb = NULL;
  99
 100 }
 101
 102
 103 void XMLDocShredder::processStartElement()
 104 {
 105   doText();
 106   // fetch element name; this will be the full qualified name
 107   ustring name = reader_->get_name();
 108   bool empty = false;
 109   size_t found = name.find_first_of(':');
 110   if (found == ustring::npos)
 111     tb->NewOpenTag(name);
 112   else
 113     tb->NewOpenTag(name.substr(found+1,name.length() - found - 1));
 114
 115   /* We must be really carefull here. calling process attributes moves
 116      the document pointer on the last attribute, hence calling reader_->is_empty
 117      afterwards will yield the wrong result. It is better to call it while we are
 118      on the element and generate a nodeFinished() call at the end */
 119   empty = reader_->is_empty_element();
 120
 121
 122   // now, process attributes
 123   if (reader_->has_attributes())
 124     processAttributes();
 125
 126
 127   if (empty)
 128     tb->NewClosingTag(name);
 129
 130
 131 }
 132
 133 void XMLDocShredder::processEndElement()
 134 {
 135   doText();
 136   ustring name = reader_->get_name();
 137   tb->NewClosingTag(name);
 138 }
 139
 140 void XMLDocShredder::processPCDATA()
 141 {
 142   // send the content of this PCDATA node to the storage interface as a text node
 143   if (reader_->has_value())
 144     buffer += reader_->get_value();
 145
 146 }
 147
 148 void XMLDocShredder::processAttributes()
 149 {
 150         reader_->move_to_first_attribute();
 151
 152         string nspaceStr = "xmlns";
 153         tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
 154         do
 155           {
 156                 ustring name = reader_->get_name();
 157                 ustring value = reader_->get_value();
 158
 159                 /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
 160                 * so we have to extract it and build a namespace uri node out of it before
 161                 * passing to the storage interface */
 162
 163                 if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
 164                 {
 165                   //TODO
 166                 }
 167
 168                 /* otherwise, this is an ordinary attribute, so we construct a new child node of the
 169                  * parent element to store the attribute name, possessing a child text node storing the
 170                  * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
 171                  */
 172
 173                 else
 174                 {
 175                   string attname = "<@>"+name;
 176                   tb->NewOpenTag(attname);
 177                   tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
 178                   tb->NewText(value);
 179                   tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
 180                   tb->NewClosingTag(attname);
 181                 }
 182         }
 183         while (reader_->move_to_next_attribute());
 184         tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
 185 }
 186
 187 void XMLDocShredder::processSignificantWhitespace()
 188 {
 189   if (reader_->has_value())
 190     buffer += reader_->get_value();
 191
 192 }
 193
 194 void XMLDocShredder::processStartDocument(const string docName)
 195 {
 196   // tell storage interface to construct the document name
 197
 198   tb->NewOpenTag(DOCUMENT_OPEN_TAG);
 199
 200 }
 201
 202 void XMLDocShredder::processEndDocument()
 203 {
 204   doText();
 205   /* tell the storage interface that document parsing has finished, and structures
 206    * can now be written to disk. */
 207   tb->NewClosingTag(DOCUMENT_OPEN_TAG);
 208   tree = tb->CloseDocument();
 209
 210 }
 211
 212 void XMLDocShredder::processComment()
 213 {
 214   //storageIfc_->newChild("!" + reader_->get_value());
 215   //storageIfc_->nodeFinished();
 216 }
 217
 218 void XMLDocShredder::processProcessingInstruction()
 219 {
 220         ustring name = reader_->get_name();
 221         ustring value = reader_->get_value();
 222
 223         /* Create a child node to store the target of the PI, append a text node to it to store
 224          * the PI data, send to the storage interface.  Close off the PI node with a call to
 225          * nodeFinished
 226          */
 227
 228         // storageIfc_->newChild("?" + name);
 229         // storageIfc_->newText(value);
 230         // storageIfc_->nodeFinished();
 231 }
 232
 233 void XMLDocShredder::processDocTypeDeclaration()
 234 {
 235         /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton
 236          * in case we do want to process it in the future.
 237         */
 238 }
 239
 240 void XMLDocShredder::processCDATASection()
 241 {
 242         /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
 243          * model.  Instead, we simply pass the converted text value to the storage interface as
 244          * a text node attached to the current context node.
 245          */
 246   if (reader_->has_value())
 247     buffer+= reader_->get_value();
 248 }
 249
 250 void XMLDocShredder::processUnknownNodeType()
 251 {
 252         cout << "unknown token encountered during parsing" << endl;
 253         throw xmlpp::parse_error("unknown token encountered during parsing");
 254
 255 }
 256
 257 void XMLDocShredder::parse()
 258 {
 259         while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
 260         {
 261                 switch (reader_->get_node_type())
 262                 {
 263                         case TextReader::Element:
 264                                 processStartElement();
 265                                 break;
 266
 267                         case TextReader::Text:
 268                                 processPCDATA();
 269                                 break;
 270
 271                         case TextReader::EndElement:
 272                                 processEndElement();
 273                                 break;
 274
 275                         case TextReader::SignificantWhitespace:
 276                                 processSignificantWhitespace();
 277                                 break;
 278
 279                         case TextReader::Comment:
 280                                 processComment();
 281                                 break;
 282
 283                         case TextReader::DocumentType:
 284                                 processDocTypeDeclaration();
 285                                 break;
 286
 287                         case TextReader::ProcessingInstruction:
 288                                 processProcessingInstruction();
 289                                 break;
 290
 291                         case TextReader::CDATA:
 292                                 processCDATASection();
 293                                 break;
 294
 295                         case TextReader::None:
 296                                 processUnknownNodeType();
 297                                 break;
 298
 299                         default:
 300                                 int type = reader_->get_node_type();
 301                                 cout << "  Node type: " << type << endl;
 302                                 break;
 303
 304                 }
 305         }
 306 }