X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=src%2FXMLDocShredder.cpp;fp=src%2FXMLDocShredder.cpp;h=ffe4de7616064f3fa5d57153c0099e15017fa308;hb=4b52da1a20a4fe031930bb96d2ca46bec06dc529;hp=0000000000000000000000000000000000000000;hpb=a223af3254fb51c279cfbccdc18c59484fdca74e;p=SXSI%2Fxpathcomp.git

diff --git a/src/XMLDocShredder.cpp b/src/XMLDocShredder.cpp
new file mode 100644
index 0000000..ffe4de7
--- /dev/null
+++ b/src/XMLDocShredder.cpp
@@ -0,0 +1,306 @@
+/**********************************************************
+ * XMLDocShredder.cpp
+ * ---------------------
+ * Implementation of the class that receives events from the XML parser and 
+ * invokes corresponding construction methods of the storage interface.
+ * 
+ * Author: Greg Leighton
+ * Date: 02/11/08
+ * Changes:
+ * 		05/11/08 -- Fixed bug related to parsing empty elements
+ * 				 -- Set parser properties to automatically resolve
+ * 					entity references and load external DTD if present
+ * 				 -- Modified processEndDocument() by adding a nodeFinished()
+ * 					call to the storage interface to close off the 
+ * 					document node
+ *
+ */
+
+#include <iostream>
+#include "XMLDocShredder.h"
+#include <libxml++/exceptions/parse_error.h>
+#include "Utils.h"
+
+using namespace Glib;
+
+void XMLDocShredder::doText(){
+  // Emit the text gathered in `buffer` as one PCDATA node (open tag, text, closing
+  // tag) and reset the buffer.  Nothing is emitted when no text is pending.
+  if (buffer.empty())
+    return;
+  tb->NewOpenTag(PCDATA_OPEN_TAG);
+  tb->NewText(buffer);
+  tb->NewClosingTag(PCDATA_OPEN_TAG);
+  buffer.clear();
+}
+
+void XMLDocShredder::setProperties(){
+  /* Configure the freshly created reader; both constructors call this before
+   * the first node is read.
+   */
+  TextReader & parser = *reader_;
+
+  /* expand entity references, so that their replacement text is reported as
+   * regular PCDATA
+   */
+  parser.set_parser_property(TextReader::SubstEntities, true);
+
+  /* read the external DTD, if present: any entity definitions it holds are
+   * needed for the expansion above
+   */
+  parser.set_parser_property(TextReader::LoadDtd, true);
+
+  /* switch the DefaultAttrs property on
+   * NOTE(review): presumably this makes the reader report attribute defaults
+   * declared in the DTD -- confirm against the libxml++ documentation
+   */
+  parser.set_parser_property(TextReader::DefaultAttrs, true);
+
+  /* no validation, though: the DTD is only loaded for its definitions and
+   * validating would slow us down
+   */
+  parser.set_parser_property(TextReader::Validate, false);
+
+}
+/* Shred an XML document held in memory (`size` bytes starting at `data`).  sf, iet, dtc and
+ * index_type are handed unchanged to XMLTreeBuilder::OpenDocument(iet, sf, dtc, index_type). */
+XMLDocShredder::XMLDocShredder(const unsigned char * data, TextReader::size_type size,
+			       int sf, bool iet, bool dtc, TextCollectionBuilder::index_type_t index_type)
+{
+  tree = NULL;
+  tb = NULL;
+  reader_ = new TextReader(data,size,"");
+  try {  // ~XMLDocShredder() does not run when a constructor throws: free by hand
+    setProperties();
+    tb = new XMLTreeBuilder();
+    buffer.clear();
+    tb->OpenDocument(iet,sf,dtc, index_type);
+  } catch (...) { delete tb; delete reader_; throw; }
+}
+
+XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc,
+			       TextCollectionBuilder::index_type_t index_type)
+{ // Shred the XML file inFileName; iet, sf, dtc, index_type go unchanged to OpenDocument().
+  tree = NULL; tb = NULL;
+  reader_ = new TextReader(inFileName);
+  try {  // ~XMLDocShredder() does not run when a constructor throws: free by hand
+    setProperties();
+    tb = new XMLTreeBuilder(); buffer.clear();
+    tb->OpenDocument(iet,sf,dtc,index_type);
+  } catch (...) { delete tb; delete reader_; throw; }
+}
+
+XMLDocShredder::~XMLDocShredder()
+{
+	// Free the parser, then the builder, and null both pointers; `tree` is not touched here.
+	delete reader_;
+	delete tb;
+	reader_ = NULL;
+	tb = NULL;
+}
+
+
+/* Handle an Element node: flush pending text, open the element's tag, emit its
+ * attributes and, when the element is empty (<a/>), close the tag straight away.
+ * Tags are registered under the local name: a leading "prefix:" is stripped, for
+ * the opening tag and for the closing tag alike. */
+void XMLDocShredder::processStartElement()
+{
+  doText();
+
+  // get_name() returns the qualified name; keep only what follows the first ':'
+  const ustring qname = reader_->get_name();
+  const ustring::size_type colon = qname.find_first_of(':');
+  const ustring tag = (colon == ustring::npos) ? qname : qname.substr(colon + 1);
+  tb->NewOpenTag(tag);
+
+  /* We must be really careful here: processAttributes() leaves the reader on the
+     last attribute, so is_empty_element() would give the wrong answer afterwards.
+     Query it while the reader is still on the element itself. */
+  const bool empty = reader_->is_empty_element();
+
+  // now, process attributes
+  if (reader_->has_attributes())
+    processAttributes();
+
+  /* An empty element is closed right here.  The closing tag must carry the same
+     stripped name as the opening tag, otherwise <p:a/> would be opened as "a"
+     but closed as "p:a". */
+  if (empty)
+    tb->NewClosingTag(tag);
+}
+
+void XMLDocShredder::processEndElement()
+{
+  doText();  // flush buffered text so that it lands inside the element being closed
+  const ustring qname = reader_->get_name();  // qualified name, possibly "prefix:local"
+  tb->NewClosingTag(qname.substr(qname.find_first_of(':') + 1)); // local name only, as processStartElement() opens it; npos + 1 == 0 keeps an unprefixed name whole
+}
+
+void XMLDocShredder::processPCDATA()
+{
+  // Append the node's text to `buffer`; doText() emits it later as one text node.
+  if (!reader_->has_value())
+    return;
+  buffer += reader_->get_value();
+}
+
+/* Emit the attributes of the current element: one ATTRIBUTE_OPEN_TAG node that holds,
+ * for each attribute, a "<@>name" node wrapping an ATTRIBUTE_DATA_OPEN_TAG node whose
+ * text is the attribute value.  Namespace declarations are skipped for now.
+ * The caller is expected to have checked has_attributes(); on return the reader is
+ * positioned on the last attribute of the element. */
+void XMLDocShredder::processAttributes()
+{
+	reader_->move_to_first_attribute();
+	tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
+	do
+	{
+		const ustring name = reader_->get_name();
+		/* the libxml++ TextReader treats namespace declarations like ordinary
+		 * attributes.  Only "xmlns" itself and names of the form "xmlns:prefix" are
+		 * declarations; a bare prefix test would also swallow e.g. "xmlnsfoo". */
+		const string & rawName = name.raw();
+		const bool isNamespaceDecl =
+		  (rawName == "xmlns") || (rawName.compare(0, 6, "xmlns:") == 0);
+
+		if (isNamespaceDecl)
+		{
+		  //TODO: build a namespace uri node; until then the declaration is dropped
+		}
+		else
+		{
+		  /* ordinary attribute: "<@>name" node with a data child holding the value */
+		  const ustring value = reader_->get_value();
+		  const string attname = "<@>"+name;
+		  tb->NewOpenTag(attname);
+		  tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
+		  tb->NewText(value);
+		  tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
+		  tb->NewClosingTag(attname);
+		}
+	}
+	while (reader_->move_to_next_attribute());
+	tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
+}
+
+void XMLDocShredder::processSignificantWhitespace()
+{
+  // Significant whitespace is appended to `buffer`, exactly like Text nodes.
+  if (!reader_->has_value()) return;
+  buffer += reader_->get_value();
+}
+
+void XMLDocShredder::processStartDocument(const string docName)
+{
+  // Open the node that represents the whole document (DOCUMENT_OPEN_TAG);
+  // processEndDocument() closes it.  docName is not used by this implementation.
+
+  tb->NewOpenTag(DOCUMENT_OPEN_TAG);
+}
+
+void XMLDocShredder::processEndDocument()
+{
+  // flush any text still pending in `buffer` before the document node is closed
+  doText();
+  /* close the node opened with DOCUMENT_OPEN_TAG in processStartDocument(), then have
+   * the builder finish the document; whatever CloseDocument() returns is kept in `tree` */
+  tb->NewClosingTag(DOCUMENT_OPEN_TAG);
+  tree = tb->CloseDocument();
+}
+
+void XMLDocShredder::processComment()
+{
+  // Comments are ignored for now.  Disabled code, kept for reference:
+  //   storageIfc_->newChild("!" + reader_->get_value()); storageIfc_->nodeFinished();
+}
+
+void XMLDocShredder::processProcessingInstruction()
+{
+	/* Processing instructions are currently ignored.  Nothing is fetched from the
+	 * reader here: the PI target (get_name()) and data (get_value()) are only worth
+	 * copying once something consumes them.
+	 *
+	 * Disabled sketch of how a PI would be stored: a child node named after the
+	 * target, a text node holding the PI data, then the node is closed off with a
+	 * call to nodeFinished:
+	 */
+	// storageIfc_->newChild("?" + reader_->get_name());
+	// storageIfc_->newText(reader_->get_value());
+	// storageIfc_->nodeFinished();
+}
+
+void XMLDocShredder::processDocTypeDeclaration()
+{
+	/* The DOCTYPE declaration is currently ignored; this empty handler is kept so
+	 * that processing can be added later without touching the dispatch in parse().
+	*/
+}
+
+void XMLDocShredder::processCDATASection()
+{
+	/* CDATA sections are not preserved as such, since they aren't part of the XPath
+	 * data model: their content is appended to `buffer` like any other text and is
+	 * emitted by a later doText() call. */
+  if (!reader_->has_value())
+    return;
+  buffer += reader_->get_value();
+}
+
+void XMLDocShredder::processUnknownNodeType()
+{
+	const char * const message = "unknown token encountered during parsing";
+	cout << message << endl;            // report on stdout first ...
+	throw xmlpp::parse_error(message);  // ... then abort the parse with the same text
+}
+
+/* Main loop: pull nodes from the reader one at a time and dispatch each one to the
+ * matching process*() handler, until read() reports no further node or the reader is
+ * in the Error state.  processStartDocument()/processEndDocument() are not called here. */
+void XMLDocShredder::parse()
+{
+	while (reader_->read())
+	{
+		if (reader_->get_read_state() == TextReader::Error)
+			break;
+		const int nodeType = reader_->get_node_type();  // fetched once, reused by default:
+		switch (nodeType)
+		{
+			/* element structure */
+			case TextReader::Element:
+				processStartElement();
+				break;
+			case TextReader::EndElement:
+				processEndElement();
+				break;
+			/* character data, gathered in `buffer` by the handlers */
+			case TextReader::Text:
+				processPCDATA();
+				break;
+			case TextReader::SignificantWhitespace:
+				processSignificantWhitespace();
+				break;
+			case TextReader::CDATA:
+				processCDATASection();
+				break;
+			/* nodes whose handlers currently emit nothing */
+			case TextReader::Comment:
+				processComment();
+				break;
+			case TextReader::DocumentType:
+				processDocTypeDeclaration();
+				break;
+			case TextReader::ProcessingInstruction:
+				processProcessingInstruction();
+				break;
+			/* a node of type None is treated as a failure: the handler throws */
+			case TextReader::None:
+				processUnknownNodeType();
+				break;
+			/* any other node type is only reported on stdout */
+			default:
+				cout << "  Node type: " << nodeType << endl;
+				break;
+		}
+	}
+}