improvements...
authorfclaude <fclaude@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Sun, 26 Apr 2009 04:01:12 +0000 (04:01 +0000)
committerfclaude <fclaude@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Sun, 26 Apr 2009 04:01:12 +0000 (04:01 +0000)
git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/xpathcomp@345 3cdefd35-fc62-479d-8e8d-bae585ffb9ca

Makefile
benchmark/depend
myTimeXMLTree.cpp [new file with mode: 0644]

index 26f4e45..956e51d 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -110,7 +110,7 @@ clean:
        $(HIDE) rm -rf .libs
 
 
-timeXMLTree: $(CXXOBJECTS) XMLTree/XMLTree.a  timeXMLTree.cpp testXMLTree.cpp
+timeXMLTree: $(CXXOBJECTS) XMLTree/XMLTree.a  timeXMLTree.cpp myTimeXMLTree.cpp
        mkdir -p .libs/
        cd .libs/ && ar x ../XMLTree/XMLTree.a
        $(CXX) -o timeXMLTree $(CXXFLAGS) $(CXXINCLUDES) XMLDocShredder.o \
@@ -119,9 +119,9 @@ timeXMLTree: $(CXXOBJECTS) XMLTree/XMLTree.a  timeXMLTree.cpp testXMLTree.cpp
        $(CXX) -o myTimeXMLTree $(CXXFLAGS) $(CXXINCLUDES) XMLDocShredder.o \
        SXSIStorageInterface.o StorageInterface.o ./.libs/*.o \
        $(LIBS) myTimeXMLTree.cpp
-       $(CXX) -o testXMLTree $(CXXFLAGS) $(CXXINCLUDES) XMLDocShredder.o \
-       SXSIStorageInterface.o StorageInterface.o ./.libs/*.o \
-       $(LIBS) testXMLTree.cpp
+#      $(CXX) -o testXMLTree $(CXXFLAGS) $(CXXINCLUDES) XMLDocShredder.o \
+#      SXSIStorageInterface.o StorageInterface.o ./.libs/*.o \
+#      $(LIBS) testXMLTree.cpp
        rm -rf .libs
 
 SXSIStorageInterface.o: SXSIStorageInterface.h SXSIStorageInterface.cpp StorageInterface.h
index 9572b5f..7526dc6 100644 (file)
@@ -1,5 +1,2 @@
 benchmark.cmo: benchmark.cmi 
 benchmark.cmx: benchmark.cmi 
-main.cmo: benchmark.cmi 
-main.cmx: benchmark.cmx 
-benchmark.cmi: 
diff --git a/myTimeXMLTree.cpp b/myTimeXMLTree.cpp
new file mode 100644 (file)
index 0000000..25bc5aa
--- /dev/null
@@ -0,0 +1,336 @@
+#include "XMLDocShredder.h"
+#include "XMLTree.h"
+#include "Utils.h"
+#include <sys/time.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <vector>
+#include <queue>
+
+using namespace std;
+
+/* Time meassuring */
+double ticks= (double)sysconf(_SC_CLK_TCK);
+struct tms t1,t2;
+
+void start_clock() {
+  times (&t1);
+}
+
+
+double stop_clock() {
+  times (&t2);
+  return (t2.tms_utime-t1.tms_utime)/ticks;
+}
+
+
+/* end Time meassuring */
+
+void printStats(double time, string fname, uint queries) {
+  cout.width(15);
+  cout << std::left << fname;
+  cout << " : ";
+  cout.width(8);
+  cout << std::right << queries << " calls, ";
+  cout.width(8);
+  cout << std::right << time << "ms, mean: ";
+  cout.width(8);
+  cout << std::right << time/queries << endl;
+}
+
+
+#define STATS1(fname,vect) {\
+    start_clock(); \
+    uint q = 0; \
+    while(q<treenodeQueries.size()) \
+    { \
+      acc += tree->fname((vect)[q]); \
+      q++; \
+    } \
+    double t = 1000.0*stop_clock(); \
+    printStats(t,#fname,(vect).size()); \
+  }
+
+#define STATS1p(fname,vect) {\
+    start_clock(); \
+    uint q = 0; \
+    while(q<treenodeQueries.size()) \
+    { \
+      acc += tree->fname((vect)[q]).min; \
+      q++; \
+    } \
+    double t = 1000.0*stop_clock(); \
+    printStats(t,#fname,(vect).size()); \
+  }
+
+#define STATS2(fname,vect) {\
+    start_clock(); \
+    uint q = 0; \
+    while(q<treenodeQueries.size()) \
+    { \
+      acc += tree->fname((vect)[q].first,(vect)[q].second); \
+      q++; \
+    } \
+    double t = 1000.0*stop_clock(); \
+    printStats(t,#fname,(vect).size()); \
+  }
+
+      TagType target_tag = -1;
+vector<treeNode> treenodeQueries;
+vector<pair<treeNode,TagType> > treenodetagQueries;
+vector<DocID> docidQueries;
+
+uint acc = 0;
+
+void runQueries(XMLTree * tree) {
+  STATS1(Tag,treenodeQueries);
+  STATS1(Parent,treenodeQueries);
+  STATS1p(DocIds,treenodeQueries);
+  STATS1(MyText,treenodeQueries);
+  STATS1(PrevText,treenodeQueries);
+  STATS1(NextText,treenodeQueries);
+  STATS1(FirstChild,treenodeQueries);
+  STATS1(NextSibling,treenodeQueries);
+  STATS1(ParentNode,docidQueries);
+  STATS1(PrevNode,treenodeQueries);
+  STATS2(TaggedDesc,treenodetagQueries);
+  STATS2(TaggedFoll,treenodetagQueries);
+}
+
+
+void fill_queries(XMLTree * tree, treeNode node,unsigned char* targettagname) {
+  treeNode res1,res2;
+  TagType tag;
+  DocID id1,id2,id3;
+  queue<treeNode> q;
+  q.push(node);
+  while(!q.empty()) {
+    node = q.front();
+    q.pop();
+    if (node != NULLT) {
+      tag = tree->Tag(node);
+      if (target_tag == -1) {
+        const unsigned char * tagname;
+        tagname = tree->GetTagNameByRef(tag);
+        if (strcmp( (char*) tagname, (char*) targettagname) == 0)
+          target_tag = tag;
+      }
+      treenodeQueries.push_back(node);
+      treenodetagQueries.push_back(pair<treeNode,TagType>(node,tag));
+      id1 = tree->MyText(node);
+      id2 = tree->PrevText(node);
+      id3 = tree->NextText(node);
+      id1 = max(id1, max(id2,id3));
+      docidQueries.push_back(id1);
+      res1 = tree->FirstChild(node);
+      res2 = tree->NextSibling(node);
+      q.push(res1);
+      q.push(res2);
+    }
+  }
+}
+
+
+vector<treeNode> traversalQueries;
+uint cFullTraversal = 0;
+
+void traversal_time(XMLTree * tree) {
+  start_clock();
+  uint q = 0;
+  while(q<traversalQueries.size()) {
+    treeNode node = traversalQueries[q];
+    acc += tree->FirstChild(node);
+    acc += tree->NextSibling(node);
+    q++;
+  }
+  double t = 1000.0*stop_clock();
+  printStats(t,"FullTraversal",traversalQueries.size());
+}
+
+
+unsigned int traversal(XMLTree *tree,treeNode node) {
+  uint ret = 0;
+  TagType tag;
+  queue<treeNode> q;
+  q.push(node);
+  while(!q.empty()) {
+    node = q.front();
+    q.pop();
+    if (node != NULLT) {
+      cFullTraversal++;
+      tag = tree->Tag(node);
+      if (tag == target_tag)
+        ret++;
+                       treeNode t1 = tree->FirstChild(node);
+      q.push(tree->FirstChild(node));
+                       treeNode t2 = tree->NextSibling(node);
+      q.push(tree->NextSibling(node));
+                       if(t1!=NULLT)
+      traversalQueries.push_back(t1);
+                       if(t2!=NULLT)
+      traversalQueries.push_back(t2);
+    }
+  }
+  return ret;
+}
+
+
+vector<pair<treeNode,treeNode> > jumpQueries;
+uint cJumpTraversal = 0;
+
+void jump_time(XMLTree * tree) {
+  start_clock();
+  uint q = 0;
+  while(q<jumpQueries.size()) {
+    treeNode node = jumpQueries[q].first;
+    treeNode root = jumpQueries[q].second;
+    acc += tree->TaggedDesc(node,target_tag);
+    acc += tree->TaggedFollBelow(node,target_tag,root);
+    q++;
+  }
+  double t = 1000.0*stop_clock();
+  printStats(t,"JumpTraversal",jumpQueries.size());
+}
+
+
+/* This simulates the run function of the jumping automata*/
+unsigned int jump_traversal(XMLTree* tree, treeNode node,treeNode root) {
+  uint ret = 0;
+  TagType tag;
+  queue<pair<treeNode,treeNode> > q;
+  q.push(pair<treeNode,treeNode>(node,root));
+  while(!q.empty()) {
+    pair<treeNode,treeNode> p = q.front();
+    q.pop();
+    node = p.first;
+    root = p.second;
+    if (node != NULLT) {
+      cJumpTraversal++;
+      tag = tree->Tag(node);
+      if (tag == target_tag)
+        ret++;
+      pair<treeNode,treeNode> p1(tree->TaggedDesc(node,target_tag),node);
+      pair<treeNode,treeNode> p2(tree->TaggedFollBelow(node,target_tag,root),root);
+                       if(p1.first!=NULLT)
+             jumpQueries.push_back(p1);
+                       if(p2.first!=NULLT)
+             jumpQueries.push_back(p2);
+      q.push(p1);
+      q.push(p2);
+    }
+  }
+  return ret;
+}
+
+
+int usage(char ** argv) {
+  std::cout << "usage : " << argv[0] << " [-d] [-s] file.{xml,.srx} tagname\n";
+  return 1;
+}
+
+
+int main(int argc, char ** argv) {
+  unsigned int count1,count2;
+  unsigned char * tagname;
+  string arg,filename,ext;
+  bool disable_tc = false;
+  bool save = false;
+  bool srx;
+  XMLTree * tree;
+
+  int i = 1;
+  if ( i >= argc)
+    return usage(argv);
+
+  arg = argv[i];
+  if (arg.compare("-d") == 0) {
+    disable_tc = true;
+    i++;
+    if ( i >= argc)
+      return usage(argv);
+    arg = argv[i];
+  }
+
+  if (arg.compare("-s") == 0) {
+    save = true;
+    i++;
+    if ( i >= argc)
+      return usage(argv);
+    arg = argv[i];
+  }
+
+  // The filename
+  if (arg.size() < 4)
+    return usage(argv);
+
+  ext=(arg.substr(arg.size()-4,4));
+  if (ext.compare(".srx") == 0) {
+    // must truncate
+    filename = arg.substr(0,arg.size()-4);
+    srx = true;
+  }
+  else if (ext.compare(".xml")==0) {
+    filename = arg;
+    srx = false;
+  }
+  else
+    return usage(argv);
+  i++;
+
+  if (i >= argc)
+    return usage(argv);
+
+  tagname = (unsigned char*) argv[i];
+
+  if (srx)
+    // The samplerate is not taken into account for loading anymore
+    tree = XMLTree::Load((unsigned char*) filename.c_str(),64);
+  else {
+    try
+    {
+      //filename, sampling factor, index empty texts, disable tc
+      XMLDocShredder shredder(filename.c_str(),64,false,disable_tc);
+      shredder.processStartDocument("");
+      shredder.parse();
+      shredder.processEndDocument();
+      tree = (XMLTree *) shredder.storageIfc_->returnDocument();
+      if (save) {
+        filename = filename.substr(0,filename.size()-4).append(".srx");
+        struct stat stats;
+        int exists = stat(filename.c_str(),&stats);
+        if(exists == 0) {
+          std::cout << "Warning : indexed file " << filename << " exists, not overwriting\n";
+        }
+        else {
+          tree->Save((unsigned char*) filename.substr(0,filename.size()-4).c_str());
+        }
+
+      }
+    }
+    catch (const std::exception& e) {
+      cout << "Error during parsing : " << e.what() << "\n";
+      return 2;
+    }
+  }
+
+  fill_queries(tree,tree->Root(),tagname);
+  runQueries(tree);
+
+  if (target_tag == -1) {
+    cout << "Warning:  tag " << tagname << " was not found in the document!\n"
+      << "Warning:  not timing traversal and jumping functions\n";
+    return 0;
+  }
+
+  count1 = traversal(tree,tree->Root());
+  count2 = jump_traversal(tree,tree->Root(),tree->Root());
+
+  cout << "Full traversal found " << count1 << " '" << tagname << "' nodes, "
+    << cFullTraversal << " function calls." << endl;
+  traversal_time(tree);
+  cout << "Jump traversal found " << count2 << " '" << tagname << "' nodes, "
+    << cJumpTraversal << " function calls." << endl;
+  jump_time(tree);
+
+  return 0;
+}