Merge branch 'local-ocamlbuild' into local-trunk
[SXSI/xpathcomp.git] / src / XMLDocShredder.cpp
diff --git a/src/XMLDocShredder.cpp b/src/XMLDocShredder.cpp
new file mode 100644 (file)
index 0000000..ffe4de7
--- /dev/null
@@ -0,0 +1,306 @@
+/**********************************************************
+ * XMLDocShredder.cpp
+ * ---------------------
+ * Implementation of the class that receives events from the XML parser and 
+ * invokes corresponding construction methods of the storage interface.
+ * 
+ * Author: Greg Leighton
+ * Date: 02/11/08
+ * Changes:
+ *             05/11/08 -- Fixed bug related to parsing empty elements
+ *                              -- Set parser properties to automatically resolve
+ *                                     entity references and load external DTD if present
+ *                              -- Modified processEndDocument() by adding a nodeFinished()
+ *                                     call to the storage interface to close off the 
+ *                                     document node
+ *
+ */
+
+#include <iostream>
+#include "XMLDocShredder.h"
+#include <libxml++/exceptions/parse_error.h>
+#include "Utils.h"
+
+using namespace Glib;
+
+/**
+ * Flushes the character data accumulated in `buffer`.
+ *
+ * When the buffer holds something, it is handed to the tree builder through
+ * NewText(), bracketed by an open/close pair of PCDATA_OPEN_TAG, and the
+ * buffer is emptied afterwards.  An empty buffer produces no builder calls.
+ */
+void XMLDocShredder::doText()
+{
+  // Nothing pending: emit nothing (the buffer is already empty).
+  if (buffer.empty())
+    return;
+
+  tb->NewOpenTag(PCDATA_OPEN_TAG);
+  tb->NewText(buffer);
+  tb->NewClosingTag(PCDATA_OPEN_TAG);
+
+  buffer.clear();
+}
+
+void XMLDocShredder::setProperties(){
+  /* instruct the parser to expand entity references and report as 
+   * regular PCDATA
+   */ 
+  reader_->set_parser_property(
+                              TextReader::SubstEntities, true);
+               
+  /* instruct parser to read external DTD, if present.  This is 
+        * needed to obtain any entity definitions in the DTD
+        */
+  reader_->set_parser_property(
+                              TextReader::LoadDtd, true);
+  
+  
+  /* 
+   */
+  reader_->set_parser_property(
+                              TextReader::DefaultAttrs, true);
+  
+
+  /* but we don't want to do validation since it would slow us down
+   */
+
+
+  reader_->set_parser_property(
+                              TextReader::Validate, false);
+  
+}
+XMLDocShredder::XMLDocShredder(const unsigned char * data,
+                              TextReader::size_type size,
+                              int sf, 
+                              bool iet, 
+                              bool dtc,
+                              TextCollectionBuilder::index_type_t index_type
+                              )                        
+{
+  tree = NULL;
+  reader_ = new TextReader(data,size,"");
+  setProperties();
+  tb  = new XMLTreeBuilder();
+  buffer.clear();
+  tb->OpenDocument(iet,sf,dtc, index_type);
+}
+
+/**
+ * Creates a shredder that reads an XML document from a file.
+ *
+ * @param inFileName  passed to the TextReader constructor
+ * @param sf          forwarded unchanged to tb->OpenDocument()
+ * @param iet         forwarded unchanged to tb->OpenDocument()
+ * @param dtc         forwarded unchanged to tb->OpenDocument()
+ * @param index_type  forwarded unchanged to tb->OpenDocument()
+ *
+ * NOTE(review): reader_ and tb are raw owning pointers.  If a statement
+ * after an allocation throws, the destructor does not run and the earlier
+ * allocation is not released.
+ */
+XMLDocShredder::XMLDocShredder(const string inFileName,
+                              int sf,
+                              bool iet,
+                              bool dtc,
+                              TextCollectionBuilder::index_type_t index_type)
+{
+  // No tree exists until processEndDocument() closes the document.
+  tree = NULL;
+  buffer.clear();
+
+  // Reader over the named file, configured before first use.
+  reader_ = new TextReader(inFileName);
+  setProperties();
+
+  // Builder that receives the parse events.
+  tb = new XMLTreeBuilder();
+  tb->OpenDocument(iet, sf, dtc, index_type);
+}
+
+/**
+ * Destroys the reader and the tree builder created by the constructor.
+ * The `tree` pointer is not deleted here.
+ */
+XMLDocShredder::~XMLDocShredder()
+{
+  delete reader_;
+  delete tb;
+
+  // Leave no dangling pointers behind.
+  reader_ = NULL;
+  tb = NULL;
+}
+
+
+void XMLDocShredder::processStartElement()
+{
+  doText();
+  // fetch element name; this will be the full qualified name
+  ustring name = reader_->get_name();
+  bool empty = false;
+  size_t found = name.find_first_of(':');
+  if (found == ustring::npos)  
+    tb->NewOpenTag(name);
+  else
+    tb->NewOpenTag(name.substr(found+1,name.length() - found - 1));
+  
+  /* We must be really carefull here. calling process attributes moves
+     the document pointer on the last attribute, hence calling reader_->is_empty
+     afterwards will yield the wrong result. It is better to call it while we are
+     on the element and generate a nodeFinished() call at the end */
+  empty = reader_->is_empty_element();
+  
+  
+  // now, process attributes
+  if (reader_->has_attributes())
+    processAttributes();
+  
+  
+  if (empty)
+    tb->NewClosingTag(name);
+  
+  
+}
+
+void XMLDocShredder::processEndElement()
+{
+  doText();
+  ustring name = reader_->get_name();
+  tb->NewClosingTag(name);
+}
+
+/**
+ * Appends the value of the current text node to `buffer`.
+ * Nothing is sent to the builder here; doText() does that later.
+ */
+void XMLDocShredder::processPCDATA()
+{
+  if (!reader_->has_value())
+    return;
+
+  buffer += reader_->get_value();
+}
+
+void XMLDocShredder::processAttributes()
+{
+       reader_->move_to_first_attribute();
+               
+       string nspaceStr = "xmlns";
+       tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
+       do
+         {
+               ustring name = reader_->get_name();
+               ustring value = reader_->get_value();
+               
+               /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
+               * so we have to extract it and build a namespace uri node out of it before
+               * passing to the storage interface */
+               
+               if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
+               {
+                 //TODO 
+               }
+               
+               /* otherwise, this is an ordinary attribute, so we construct a new child node of the 
+                * parent element to store the attribute name, possessing a child text node storing the 
+                * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
+                */
+                
+               else
+               {
+                 string attname = "<@>"+name;
+                 tb->NewOpenTag(attname);
+                 tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
+                 tb->NewText(value);
+                 tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
+                 tb->NewClosingTag(attname);
+               }
+       }
+       while (reader_->move_to_next_attribute());
+       tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
+}
+
+/**
+ * Appends the value of the current significant-whitespace node to `buffer`,
+ * exactly as processPCDATA() does for text nodes.
+ */
+void XMLDocShredder::processSignificantWhitespace()
+{
+  if (!reader_->has_value())
+    return;
+
+  buffer += reader_->get_value();
+}
+
+/**
+ * Opens the document node by emitting DOCUMENT_OPEN_TAG to the builder.
+ *
+ * @param docName  not used by this implementation
+ */
+void XMLDocShredder::processStartDocument(const string docName)
+{
+  (void) docName;  // accepted for the interface only
+
+  tb->NewOpenTag(DOCUMENT_OPEN_TAG);
+}
+
+/**
+ * Finishes the document: flushes pending text, closes the document node
+ * opened with DOCUMENT_OPEN_TAG and stores the result of
+ * tb->CloseDocument() in `tree`.
+ */
+void XMLDocShredder::processEndDocument()
+{
+  // Text that followed the last tag must go out before the document closes.
+  doText();
+
+  tb->NewClosingTag(DOCUMENT_OPEN_TAG);
+
+  // Parsing is over; let the builder finalize its structures.
+  tree = tb->CloseDocument();
+}
+
+/**
+ * Handles an XML comment node.
+ *
+ * Comments are currently ignored: the body contains no active code, so
+ * nothing is sent to the tree builder and pending text is not flushed.
+ * The commented-out lines below sketch an earlier approach based on a
+ * `storageIfc_` object that no active code in this file uses.
+ */
+void XMLDocShredder::processComment()
+{
+  //storageIfc_->newChild("!" + reader_->get_value());
+  //storageIfc_->nodeFinished();
+}
+
+/**
+ * Handles a processing-instruction node.
+ *
+ * Processing instructions are currently ignored: nothing is sent to the
+ * tree builder.  The target and data of the instruction are therefore not
+ * fetched from the reader at all.
+ *
+ * The intended handling, kept below as a sketch, was to create a child node
+ * for the PI target, attach a text node holding the PI data and then close
+ * the node.
+ */
+void XMLDocShredder::processProcessingInstruction()
+{
+  // storageIfc_->newChild("?" + reader_->get_name());
+  // storageIfc_->newText(reader_->get_value());
+  // storageIfc_->nodeFinished();
+}
+
+/**
+ * Handles a DOCTYPE declaration node.  Intentionally does nothing.
+ */
+void XMLDocShredder::processDocTypeDeclaration()
+{
+       /* We currently ignore the DOCTYPE declaration, but we provide this method skeleton
+        * in case we do want to process it in the future.
+       */
+}
+
+/**
+ * Appends the content of the current CDATA section to `buffer`.
+ *
+ * CDATA sections are not preserved as such since they are not part of the
+ * XPath data model; their text is merged with the surrounding character
+ * data and emitted by the next doText() call.
+ */
+void XMLDocShredder::processCDATASection()
+{
+  if (!reader_->has_value())
+    return;
+
+  buffer += reader_->get_value();
+}
+
+/**
+ * Reports a node of type TextReader::None: prints a message on standard
+ * output and aborts parsing.
+ *
+ * @throws xmlpp::parse_error always
+ */
+void XMLDocShredder::processUnknownNodeType()
+{
+  const char * const message = "unknown token encountered during parsing";
+
+  cout << message << endl;
+  throw xmlpp::parse_error(message);
+}
+
+void XMLDocShredder::parse()
+{      
+       while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
+       {
+               switch (reader_->get_node_type())
+               {
+                       case TextReader::Element:
+                               processStartElement();
+                               break;
+                               
+                       case TextReader::Text:
+                               processPCDATA();
+                               break;
+                               
+                       case TextReader::EndElement:
+                               processEndElement();
+                               break;
+                               
+                       case TextReader::SignificantWhitespace:
+                               processSignificantWhitespace();
+                               break;
+                               
+                       case TextReader::Comment:
+                               processComment();
+                               break;
+                       
+                       case TextReader::DocumentType:
+                               processDocTypeDeclaration();
+                               break;
+                               
+                       case TextReader::ProcessingInstruction:
+                               processProcessingInstruction();
+                               break;
+                       
+                       case TextReader::CDATA:
+                               processCDATASection();
+                               break;
+                       
+                       case TextReader::None:
+                               processUnknownNodeType();
+                               break;
+                               
+                       default:
+                               int type = reader_->get_node_type();
+                               cout << "  Node type: " << type << endl;
+                               break;  
+                       
+               }
+       }                       
+}