X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=XMLDocShredder.cpp;h=cb70e19590eef87217c0daaa82c62f66061ca7c7;hb=df5fdb22632be887ecd9f5c46a014e7e970148a2;hp=6e9c41a3e5f2ff138bedc4a91edd4cec972e7b1f;hpb=3623eefccfb5fc69e19ad975a3669f51a2a8b276;p=SXSI%2Fxpathcomp.git

diff --git a/XMLDocShredder.cpp b/XMLDocShredder.cpp
index 6e9c41a..cb70e19 100644
--- a/XMLDocShredder.cpp
+++ b/XMLDocShredder.cpp
@@ -18,12 +18,22 @@
 
 #include <iostream>
 #include "XMLDocShredder.h"
-#include "OCamlStorageInterface.h"
 #include <libxml++/exceptions/parse_error.h>
 #include "Utils.h"
 
 using namespace Glib;
 
+void XMLDocShredder::doText(){
+
+  if (!buffer.empty()){
+    tb->NewOpenTag(PCDATA_OPEN_TAG);
+    tb->NewText(buffer);
+    tb->NewClosingTag(PCDATA_OPEN_TAG);
+  };
+  buffer.clear();
+
+}
+
 void XMLDocShredder::setProperties(){
   /* instruct the parser to expand entity references and report as 
    * regular PCDATA
@@ -53,100 +63,92 @@ void XMLDocShredder::setProperties(){
   
 }
 XMLDocShredder::XMLDocShredder(const unsigned char * data,
-			       TextReader::size_type size) 			
+			       TextReader::size_type size,
+			       int sf, 
+			       bool iet, 
+			       bool dtc) 			
 {
+  tree = NULL;
   reader_ = new TextReader(data,size,"");
   setProperties();
-  storageIfc_ = new OCamlStorageInterface();
-  //tagsID_ = new unordered_map<int,string>(107);
-  //idTags_ = new unordered_map<string,int>(107);
+  tb  = new XMLTreeBuilder();
+  buffer.clear();
+  tb->OpenDocument(iet,sf,dtc);
 }
 
-XMLDocShredder::XMLDocShredder(const string inFileName)
+XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc)
 {
+  tree = NULL;
   reader_ = new TextReader(inFileName);
   setProperties();
-  storageIfc_ = new OCamlStorageInterface();
-  //  tagsID_ = new unordered_map<int,string>(107);
-  // idTags_ = new unordered_map<string,int>(107);
+  tb = new XMLTreeBuilder();
+  buffer.clear();
+  tb->OpenDocument(iet,sf,dtc);
 }
 
 XMLDocShredder::~XMLDocShredder()
 {
 	delete reader_;
-	delete storageIfc_;
+	reader_ = NULL;
+	delete tb;
+	tb = NULL;
 
 }
 
-int XMLDocShredder::tagID(string name)
-{
-  int res = tagsID_[name];
-  return  res;
-}
-string XMLDocShredder::idTag(int id)
-{
-
-  return  idTags_[id];
-}
-
 
 void XMLDocShredder::processStartElement()
 {
-	// fetch element name; this will be the full qualified name
-	ustring name = reader_->get_name();
-	bool empty = false;
-
-	storageIfc_->newChild(name);
-
-	/* We must be really carefull here. calling process attributes moves
-	   the document pointer on the last attribute, hence calling reader_->is_empty
-	   afterwards will yield the wrong result. It is better to call it while we are
-	   on the element and generate a nodeFinished() call at the end */
-	empty = reader_->is_empty_element();
-
-
-	// now, process attributes
-	if (reader_->has_attributes())
-	  {
-	    processAttributes();
-	  };
-
-	
-	if (empty){
-	  DPRINT("Node " << name <<" is empty!\n")
-	    storageIfc_->nodeFinished();
-	};
-
-
-
-
-
+  doText();
+  // fetch element name; this will be the fully qualified name
+  ustring name = reader_->get_name();
+  bool empty = false;
+  size_t found = name.find_first_of(':');
+  if (found == ustring::npos)  
+    tb->NewOpenTag(name);
+  else
+    tb->NewOpenTag(name.substr(found+1,name.length() - found - 1));
+  
+  /* Be careful here: processAttributes() moves the reader onto the last
+     attribute, so calling reader_->is_empty_element() afterwards would yield
+     the wrong result. Query it while still on the element and emit the closing
+     tag at the end. NOTE(review): that close uses the qualified name -- confirm. */
+  empty = reader_->is_empty_element();
+  
+  
+  // now, process attributes
+  if (reader_->has_attributes())
+    processAttributes();
+  
+  
+  if (empty)
+    tb->NewClosingTag(name);
+  
+  
 }
 
 void XMLDocShredder::processEndElement()
 {
-	// tell the storage interface that the current node has been completely processed
-	storageIfc_->nodeFinished();
+  doText();
+  ustring name = reader_->get_name();
+  tb->NewClosingTag(name);
 }
 
 void XMLDocShredder::processPCDATA()
 {
-	// send the content of this PCDATA node to the storage interface as a text node
-	if (reader_->has_value())
-	{	  
-	  storageIfc_->newChild("<$>");
-	  storageIfc_->newText(reader_->get_value());
-	}
+  // buffer the content of this PCDATA node; doText() later emits it as a text node
+  if (reader_->has_value())
+    buffer += reader_->get_value();
+
 }
 
 void XMLDocShredder::processAttributes()
 {
 	reader_->move_to_first_attribute();
 		
-	string nspaceStr = "xmlns";	
-	storageIfc_->newChild("<@>");
+	string nspaceStr = "xmlns";
+	tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
 	do
-	{
+	  {
 		ustring name = reader_->get_name();
 		ustring value = reader_->get_value();
 		
@@ -156,8 +158,7 @@ void XMLDocShredder::processAttributes()
 		
 		if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
 		{
-			storageIfc_->newChild(":" + value);
-			storageIfc_->nodeFinished();	
+		  //TODO: handle namespace declarations (xmlns attributes are currently skipped)
 		}
 		
 		/* otherwise, this is an ordinary attribute, so we construct a new child node of the 
@@ -167,40 +168,41 @@ void XMLDocShredder::processAttributes()
 		 
 		else
 		{
-			storageIfc_->newChild(name);
-			storageIfc_->newChild("<$>");
-			storageIfc_->newText(value);
-			storageIfc_->nodeFinished();
-			//			storageIfc_->nodeFinished();
+		  string attname = "<@>"+name;
+		  tb->NewOpenTag(attname);
+		  tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
+		  tb->NewText(value);
+		  tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
+		  tb->NewClosingTag(attname);
 		}
 	}
 	while (reader_->move_to_next_attribute());
-	storageIfc_->nodeFinished();
+	tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
 }
 
 void XMLDocShredder::processSignificantWhitespace()
 {
-	ustring value = reader_->get_value();
-	
-	// each significant whitespace sequence constructs a text node
-	storageIfc_->newChild("<$>");
-	storageIfc_->newText(value);
-	//storageIfc_->nodeFinished();
-	
+  if (reader_->has_value())
+    buffer += reader_->get_value();
+
 }
 
 void XMLDocShredder::processStartDocument(const string docName)
 {
   // tell storage interface to construct the document name
-  //  storageIfc_->newChild("");  
+
+  tb->NewOpenTag(DOCUMENT_OPEN_TAG);
+  
 }
 
 void XMLDocShredder::processEndDocument()
 {
-	/* tell the storage interface that document parsing has finished, and structures
-	 * can now be written to disk. */
-  //  storageIfc_->nodeFinished();
-  storageIfc_->parsingFinished();	
+  doText();
+  /* close the document root tag and finish building; the result of
+   * CloseDocument() is kept in tree. */
+  tb->NewClosingTag(DOCUMENT_OPEN_TAG);
+  tree = tb->CloseDocument();
+
 }
 
 void XMLDocShredder::processComment()
@@ -237,11 +239,8 @@ void XMLDocShredder::processCDATASection()
 	 * model.  Instead, we simply pass the converted text value to the storage interface as 
 	 * a text node attached to the current context node.
 	 */
-	ustring value = reader_->get_value();
-	storageIfc_->newChild("<$>");
-	storageIfc_->newText(value);
-	//	storageIfc_->nodeFinished();
-
+  if (reader_->has_value())
+    buffer+= reader_->get_value();
 }
 
 void XMLDocShredder::processUnknownNodeType()