Don't index empty texts
[SXSI/xpathcomp.git] / XMLDocShredder.cpp
index f81251e..d2e4a75 100644 (file)
@@ -59,6 +59,7 @@ XMLDocShredder::XMLDocShredder(const unsigned char * data,
   reader_ = new TextReader(data,size,"");
   setProperties();
   storageIfc_ = new SXSIStorageInterface();
+  buffer = "";
 }
 
 XMLDocShredder::XMLDocShredder(const string inFileName)
@@ -67,7 +68,7 @@ XMLDocShredder::XMLDocShredder(const string inFileName)
   reader_ = new TextReader(inFileName);
   setProperties();
   storageIfc_ = new SXSIStorageInterface();
-
+  buffer = "";
 }
 
 XMLDocShredder::~XMLDocShredder()
@@ -84,10 +85,9 @@ void XMLDocShredder::processStartElement()
        ustring name = reader_->get_name();
        bool empty = false;
        
-       if (!last_text)
-         storageIfc_->newText(""); //prevText
-       last_text = false;
-
+       storageIfc_->newText(buffer); //prevText
+       buffer.erase();
+       
        storageIfc_->newChild(name);
 
        /* We must be really carefull here. calling process attributes moves
@@ -105,24 +105,18 @@ void XMLDocShredder::processStartElement()
 
        
        if (empty){
-         DPRINT("Node " << name <<" is empty!\n")
            storageIfc_->newText("");  //myText
-           storageIfc_->nodeFinished(name);
-           storageIfc_->newText("");  //nextText
+           storageIfc_->nodeFinished(name);       
        };
 
 
-
-
-
 }
 
 void XMLDocShredder::processEndElement()
 {
   // tell the storage interface that the current node has been completely processed
-  if (!last_text)
-    storageIfc_->newText(""); //nextText of previous node
-  last_text = false;
+  storageIfc_->newText(buffer); //prevText
+  buffer.erase();
   storageIfc_->nodeFinished(reader_->get_name());
 }
 
@@ -131,20 +125,18 @@ void XMLDocShredder::processPCDATA()
        // send the content of this PCDATA node to the storage interface as a text node
          
        if (reader_->has_value())
-       {         
-         storageIfc_->newChild("<$>");
-         storageIfc_->newText(reader_->get_value());
-         last_text = true;
-       }
-       else 
-         storageIfc_->newText("");
+       {
+         buffer += reader_->get_value();
+       };
+
 }
 
 void XMLDocShredder::processAttributes()
 {
        reader_->move_to_first_attribute();
                
-       string nspaceStr = "xmlns";     
+       string nspaceStr = "xmlns";
+       storageIfc_->newText(""); //prevText
        storageIfc_->newChild("<@>");
        do
        {
@@ -168,31 +160,28 @@ void XMLDocShredder::processAttributes()
                 
                else
                {
-                       storageIfc_->newChild(name);
-                       storageIfc_->newChild("<$>");
-                       storageIfc_->newText(value);
-                       storageIfc_->nodeFinished("<$>");
+                 storageIfc_->newText(""); //prevText
+                 storageIfc_->newChild(name);
+                 storageIfc_->newText(value);
+                 storageIfc_->nodeFinished(name);
                }
        }
        while (reader_->move_to_next_attribute());
+       storageIfc_->newText(""); //nextText
        storageIfc_->nodeFinished("<@>");
 }
 
 void XMLDocShredder::processSignificantWhitespace()
 {
-       ustring value = reader_->get_value();
-       
-       // each significant whitespace sequence constructs a text node
-       storageIfc_->newChild("<$>");
-       storageIfc_->newText(value);
-
+  // each significant whitespace sequence constructs a text node
+  buffer += reader_->get_value();      
        
 }
 
 void XMLDocShredder::processStartDocument(const string docName)
 {
   // tell storage interface to construct the document name
-  storageIfc_->newChild("ROOT");  
+  storageIfc_->newChild("");  
   
 }
 
@@ -200,7 +189,8 @@ void XMLDocShredder::processEndDocument()
 {
        /* tell the storage interface that document parsing has finished, and structures
         * can now be written to disk. */
-  storageIfc_->nodeFinished("ROOT");
+  storageIfc_->newText("");
+  storageIfc_->nodeFinished("");
   storageIfc_->parsingFinished();      
 }
 
@@ -240,10 +230,8 @@ void XMLDocShredder::processCDATASection()
         */
   
        ustring value = reader_->get_value();
-       storageIfc_->newChild("<$>");
        storageIfc_->newText(value);
        last_text = true;
-       //      storageIfc_->nodeFinished();
 
 }