X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=src%2FXMLDocShredder.cpp;fp=src%2FXMLDocShredder.cpp;h=0000000000000000000000000000000000000000;hb=ff13d22656fdbdffb2d909192bd17ba135606224;hp=ffe4de7616064f3fa5d57153c0099e15017fa308;hpb=d7e793e387b3a59877765e15c54649a9d74d137e;p=SXSI%2Fxpathcomp.git

diff --git a/src/XMLDocShredder.cpp b/src/XMLDocShredder.cpp
deleted file mode 100644
index ffe4de7..0000000
--- a/src/XMLDocShredder.cpp
+++ /dev/null
@@ -1,306 +0,0 @@
-/**********************************************************
- * XMLDocShredder.cpp
- * ---------------------
- * Implementation of the class that receives events from the XML parser and 
- * invokes corresponding construction methods of the storage interface.
- * 
- * Author: Greg Leighton
- * Date: 02/11/08
- * Changes:
- * 		05/11/08 -- Fixed bug related to parsing empty elements
- * 				 -- Set parser properties to automatically resolve
- * 					entity references and load external DTD if present
- * 				 -- Modified processEndDocument() by adding a nodeFinished()
- * 					call to the storage interface to close off the 
- * 					document node
- *
- */
-
-#include <iostream>
-#include "XMLDocShredder.h"
-#include <libxml++/exceptions/parse_error.h>
-#include "Utils.h"
-
-using namespace Glib;
-
-void XMLDocShredder::doText(){
-  // Flush pending text: a non-empty `buffer` is emitted as open tag / text / closing tag.
-  if (!buffer.empty()){
-    tb->NewOpenTag(PCDATA_OPEN_TAG);
-    tb->NewText(buffer);
-    tb->NewClosingTag(PCDATA_OPEN_TAG);
-  };
-  buffer.clear();  // always reset, so the next run of text starts from an empty buffer
-
-}
-
-void XMLDocShredder::setProperties(){  // set the parser properties of reader_; both constructors call this
-  /* instruct the parser to expand entity references and report as 
-   * regular PCDATA
-   */ 
-  reader_->set_parser_property(
-			       TextReader::SubstEntities, true);
-		
-  /* instruct parser to read external DTD, if present.  This is 
-	 * needed to obtain any entity definitions in the DTD
-	 */
-  reader_->set_parser_property(
-			       TextReader::LoadDtd, true);
-  
-  
-  /* turn on the DefaultAttrs property (presumably so that attribute defaults
-   * declared in the DTD are reported -- TODO confirm in the libxml++ docs) */
-  reader_->set_parser_property(
-			       TextReader::DefaultAttrs, true);
-  
-
-  /* but we don't want to do validation since it would slow us down
-   */
-
-
-  reader_->set_parser_property(
-			       TextReader::Validate, false);
-  
-}
-XMLDocShredder::XMLDocShredder(const unsigned char * data,  // in-memory document handed to TextReader
-			       TextReader::size_type size,  // size passed to TextReader together with data
-			       int sf,   // sf, iet, dtc and index_type are forwarded
-			       bool iet,   // unchanged to tb->OpenDocument();
-			       bool dtc,  // their meaning is not visible in this file
-			       TextCollectionBuilder::index_type_t index_type
-			       ) 			
-{
-  tree = NULL;  // stays NULL until processEndDocument() stores the result of CloseDocument()
-  reader_ = new TextReader(data,size,"");  // third argument is an empty string (presumably the document URI -- confirm)
-  setProperties();
-  tb  = new XMLTreeBuilder();
-  buffer.clear();
-  tb->OpenDocument(iet,sf,dtc, index_type);  // note the argument order: iet, sf, dtc
-}
-
-XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc,  // variant that reads from inFileName
-			       TextCollectionBuilder::index_type_t index_type
-			       )
-{
-  tree = NULL;  // stays NULL until processEndDocument() stores the result of CloseDocument()
-  reader_ = new TextReader(inFileName);  // TextReader constructed from inFileName
-  setProperties();
-  tb = new XMLTreeBuilder();
-  buffer.clear();
-  tb->OpenDocument(iet,sf,dtc,index_type);  // sf, iet, dtc, index_type are passed through as-is (order: iet, sf, dtc)
-}
-
-XMLDocShredder::~XMLDocShredder()
-{  // release the reader and the tree builder allocated by the constructors
-	delete reader_;
-	reader_ = NULL;
-	delete tb;
-	tb = NULL;
-	// `tree` is not deleted here; NOTE(review): presumably owned by whoever retrieves it -- confirm
-}
-
-
-void XMLDocShredder::processStartElement()
-{  // Element node: open a tag for it, emit its attributes, and close it at once if it is empty
-  doText();  // flush text buffered before this element
-  // fetch element name; this will be the full qualified name
-  ustring name = reader_->get_name();
-  bool empty = false;
-  size_t found = name.find_first_of(':');
-  if (found == ustring::npos)  
-    tb->NewOpenTag(name);
-  else
-    tb->NewOpenTag(name.substr(found+1,name.length() - found - 1));  // keep only what follows the first ':'
-  
-  /* We must be really careful here. Calling processAttributes() moves
-     the document pointer to the last attribute, hence calling reader_->is_empty_element()
-     afterwards will yield the wrong result. It is better to call it while we are
-     on the element and generate the NewClosingTag() call at the end */
-  empty = reader_->is_empty_element();
-  
-  
-  // now, process attributes
-  if (reader_->has_attributes())
-    processAttributes();
-  // NOTE(review): the tag is closed with the qualified name although it was opened with
-  // the prefix stripped -- confirm that NewClosingTag() does not need the two to match
-  if (empty)
-    tb->NewClosingTag(name);
-  
-  
-}
-
-void XMLDocShredder::processEndElement()
-{  // EndElement node: flush pending text, then close the element
-  doText();
-  ustring name = reader_->get_name();
-  tb->NewClosingTag(name);  // NOTE(review): qualified name, while processStartElement() opens with the prefix stripped -- confirm
-}
-
-void XMLDocShredder::processPCDATA()
-{
-  // append this text node's content to `buffer`; doText() later emits the buffer as a text node
-  if (reader_->has_value())
-    buffer += reader_->get_value();
-
-}
-
-void XMLDocShredder::processAttributes()
-{  // emit the attributes of the current element, all wrapped in one ATTRIBUTE_OPEN_TAG node
-	reader_->move_to_first_attribute();
-		
-	string nspaceStr = "xmlns";
-	tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
-	do
-	  {
-		ustring name = reader_->get_name();
-		ustring value = reader_->get_value();
-		
-		/* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute.
-		* Building a namespace uri node out of it is still a TODO, so every attribute whose
-		* name starts with "xmlns" is currently skipped without emitting anything */
-		
-		if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
-		{
-		  //TODO 
-		}
-		
-		/* otherwise, this is an ordinary attribute, so we open a child tag named "<@>" + name 
-		 * under the attribute node, put an ATTRIBUTE_DATA_OPEN_TAG node holding the attribute 
-		 * value as text inside it, and then close both tags again
-		 */
-		 
-		else
-		{
-		  string attname = "<@>"+name;
-		  tb->NewOpenTag(attname);
-		  tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
-		  tb->NewText(value);
-		  tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
-		  tb->NewClosingTag(attname);
-		}
-	}
-	while (reader_->move_to_next_attribute());
-	tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
-}
-
-void XMLDocShredder::processSignificantWhitespace()
-{  // significant whitespace is appended to `buffer`, so doText() emits it together with other text
-  if (reader_->has_value())
-    buffer += reader_->get_value();
-
-}
-
-void XMLDocShredder::processStartDocument(const string docName)
-{
-  // open the document node; note that docName is not used by this implementation
-
-  tb->NewOpenTag(DOCUMENT_OPEN_TAG);
-  
-}
-
-void XMLDocShredder::processEndDocument()
-{
-  doText();  // flush any text that is still pending
-  /* close the DOCUMENT_OPEN_TAG node, then let the builder finish the document;
-   * whatever CloseDocument() returns is kept in `tree`. */
-  tb->NewClosingTag(DOCUMENT_OPEN_TAG);
-  tree = tb->CloseDocument();
-
-}
-
-void XMLDocShredder::processComment()
-{  // comments are ignored: the function body consists only of the commented-out calls below
-  //storageIfc_->newChild("!" + reader_->get_value());
-  //storageIfc_->nodeFinished();
-}
-
-void XMLDocShredder::processProcessingInstruction()
-{
-	ustring name = reader_->get_name();
-	ustring value = reader_->get_value();	
-	
-	/* Processing instructions are not stored at the moment: name and value are read above 
-	 * but never used, because the calls below are commented out.  Those calls would create 
-	 * a "?" + target child node, attach the PI data as text, and finish the node.
-	 */
-	
-	// storageIfc_->newChild("?" + name);
-	// storageIfc_->newText(value);
-	// storageIfc_->nodeFinished();
-}
-
-void XMLDocShredder::processDocTypeDeclaration()
-{  // intentionally empty
-	/* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton 
-	 * in case we do want to process it in the future.
-	 */
-}
-
-void XMLDocShredder::processCDATASection()
-{
-	/* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
-	 * model.  Instead, we simply append the section's text value to `buffer`, so that it is 
-	 * emitted by doText() together with any other buffered text.
-	 */
-  if (reader_->has_value())
-    buffer+= reader_->get_value();
-}
-
-void XMLDocShredder::processUnknownNodeType()
-{  // used by parse() for TextReader::None: print a message on cout, then throw xmlpp::parse_error
-	cout << "unknown token encountered during parsing" << endl;
-	throw xmlpp::parse_error("unknown token encountered during parsing");
-		
-}
-
-void XMLDocShredder::parse()
-{	// main loop: read the document node by node and dispatch on the node type
-	while (reader_->read() && (reader_->get_read_state() != TextReader::Error))  // stop when read() returns false or the reader is in the Error state
-	{
-		switch (reader_->get_node_type())
-		{
-			case TextReader::Element:
-				processStartElement();  // also closes the element itself when it is empty
-				break;
-				
-			case TextReader::Text:
-				processPCDATA();
-				break;
-				
-			case TextReader::EndElement:
-				processEndElement();
-				break;
-				
-			case TextReader::SignificantWhitespace:
-				processSignificantWhitespace();
-				break;
-				
-			case TextReader::Comment:
-				processComment();
-				break;
-			
-			case TextReader::DocumentType:
-				processDocTypeDeclaration();
-				break;
-				
-			case TextReader::ProcessingInstruction:
-				processProcessingInstruction();
-				break;
-			
-			case TextReader::CDATA:
-				processCDATASection();
-				break;
-			
-			case TextReader::None:
-				processUnknownNodeType();  // throws xmlpp::parse_error
-				break;
-				
-			default:
-				int type = reader_->get_node_type();  // any other node type is only printed, not processed
-				cout << "  Node type: " << type << endl;
-				break;	
-			
-		}
-	}			// NOTE: processStartDocument() / processEndDocument() are not called from this function
-}