From 0611dd6406735f44d7136850eb068648817a9fb9 Mon Sep 17 00:00:00 2001 From: fclaude Date: Sun, 29 Mar 2009 17:24:18 +0000 Subject: [PATCH 1/1] new timeXMLTree git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/xpathcomp@299 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- Makefile | 6 +- myTimeXMLTree.cpp | 339 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 344 insertions(+), 1 deletion(-) create mode 100644 myTimeXMLTree.cpp diff --git a/Makefile b/Makefile index e6ce245..9d2f39a 100644 --- a/Makefile +++ b/Makefile @@ -106,7 +106,8 @@ libcamlshredder.a: $(CXXOBJECTS) XMLTree/XMLTree.a clean: @echo [CLEAN] - $(HIDE) rm -f *~ *.cm* *.[oa] *.so main .libs + $(HIDE) rm -f *~ *.cm* *.[oa] *.so main + $(HIDE) rm -rf .libs timeXMLTree: $(CXXOBJECTS) XMLTree/XMLTree.a timeXMLTree.cpp @@ -115,6 +116,9 @@ timeXMLTree: $(CXXOBJECTS) XMLTree/XMLTree.a timeXMLTree.cpp $(CXX) -o timeXMLTree $(CXXFLAGS) $(CXXINCLUDES) XMLDocShredder.o \ SXSIStorageInterface.o StorageInterface.o ./.libs/*.o \ $(LIBS) timeXMLTree.cpp + $(CXX) -o myTimeXMLTree $(CXXFLAGS) $(CXXINCLUDES) XMLDocShredder.o \ + SXSIStorageInterface.o StorageInterface.o ./.libs/*.o \ + $(LIBS) myTimeXMLTree.cpp rm -rf .libs SXSIStorageInterface.o: SXSIStorageInterface.h SXSIStorageInterface.cpp StorageInterface.h diff --git a/myTimeXMLTree.cpp b/myTimeXMLTree.cpp new file mode 100644 index 0000000..e7b97b2 --- /dev/null +++ b/myTimeXMLTree.cpp @@ -0,0 +1,339 @@ +#include "XMLDocShredder.h" +#include "XMLTree.h" +#include "Utils.h" +#include +#include +#include +#include +#include + +using namespace std; + +/* Time meassuring */ +double ticks= (double)sysconf(_SC_CLK_TCK); +struct tms t1,t2; + +void start_clock() { + times (&t1); +} + + +double stop_clock() { + times (&t2); + return (t2.tms_utime-t1.tms_utime)/ticks; +} + + +/* end Time meassuring */ + +void printStats(double time, string fname, uint queries) { + cout.width(15); + cout << std::left << fname; + cout << " : "; + cout.width(8); + cout << std::right << queries << " calls, "; + cout.width(8); + cout << std::right << time << "ms, mean: "; + cout.width(8); + cout << std::right << time/queries << endl; +} + + +#define STATS1(fname,vect) {\ + start_clock(); \ + uint q = 0; \ + while(q<(vect).size()) \ + { \ + acc += tree->fname((vect)[q]); \ + q++; \ + } \ + double t = 1000.0*stop_clock(); \ + printStats(t,#fname,(vect).size()); \ + } + +#define STATS1p(fname,vect) {\ + start_clock(); \ + uint q = 0; \ + while(q<(vect).size()) \ + { \ + acc += tree->fname((vect)[q]).min; \ + q++; \ + } \ + double t = 1000.0*stop_clock(); \ + printStats(t,#fname,(vect).size()); \ + } + +#define STATS2(fname,vect) {\ + start_clock(); \ + uint q = 0; \ + while(q<(vect).size()) \ + { \ + acc += tree->fname((vect)[q].first,(vect)[q].second); \ + q++; \ + } \ + double t = 1000.0*stop_clock(); \ + printStats(t,#fname,(vect).size()); \ + cout.flush();\ + } + +TagType target_tag = -1; +vector treenodeQueries; +vector > treenodetagQueries; +vector docidQueries; + +uint acc = 0; + +void runQueries(XMLTree * tree) { + STATS1(Tag,treenodeQueries); + STATS1(Parent,treenodeQueries); + STATS1p(DocIds,treenodeQueries); + STATS1(MyText,treenodeQueries); + STATS1(PrevText,treenodeQueries); + STATS1(NextText,treenodeQueries); + STATS1(FirstChild,treenodeQueries); + STATS1(NextSibling,treenodeQueries); + STATS1(ParentNode,docidQueries); + STATS1(PrevNode,docidQueries); + STATS2(TaggedDesc,treenodetagQueries); + STATS2(TaggedFoll,treenodetagQueries); +} + + +void fill_queries(XMLTree * tree, treeNode node,unsigned char* targettagname) { + treeNode res1,res2; + TagType tag; + DocID id1,id2,id3; + queue q; + q.push(node); + while(!q.empty()) { + node = q.front(); + q.pop(); + if (node != NULLT) { + tag = tree->Tag(node); + if (target_tag == -1) { + const unsigned char * tagname; + tagname = tree->GetTagNameByRef(tag); + if (strcmp( (char*) tagname, (char*) targettagname) == 0) + target_tag = tag; + } + treenodeQueries.push_back(node); + treenodetagQueries.push_back(pair(node,tag)); + id1 = tree->MyText(node); + id2 = tree->PrevText(node); + id3 = tree->NextText(node); + id1 = max(id1, max(id2,id3)); + docidQueries.push_back(id1); + res1 = tree->FirstChild(node); + res2 = tree->NextSibling(node); + q.push(res1); + q.push(res2); + } + } +} + + +vector traversalQueries; +uint cFullTraversal = 0; + +void traversal_time(XMLTree * tree) { + start_clock(); + uint q = 0; + while(qFirstChild(node); + acc += tree->NextSibling(node); + q++; + } + double t = 1000.0*stop_clock(); + printStats(t,"FullTraversal",traversalQueries.size()); +} + + +unsigned int traversal(XMLTree *tree,treeNode node) { + uint ret = 0; + TagType tag; + queue q; + q.push(node); + while(!q.empty()) { + node = q.front(); + q.pop(); + if (node != NULLT) { + cFullTraversal++; + tag = tree->Tag(node); + if (tag == target_tag) + ret++; + treeNode t1 = tree->FirstChild(node); + q.push(tree->FirstChild(node)); + treeNode t2 = tree->NextSibling(node); + q.push(tree->NextSibling(node)); + if(t1!=NULLT) + traversalQueries.push_back(t1); + if(t2!=NULLT) + traversalQueries.push_back(t2); + } + } + return ret; +} + + +vector > jumpQueries; +uint cJumpTraversal = 0; + +void jump_time(XMLTree * tree) { + start_clock(); + uint q = 0; + while(qTaggedDesc(node,target_tag); + acc += tree->TaggedFollBelow(node,target_tag,root); + q++; + } + double t = 1000.0*stop_clock(); + printStats(t,"JumpTraversal",jumpQueries.size()); +} + + +/* This simulates the run function of the jumping automata*/ +unsigned int jump_traversal(XMLTree* tree, treeNode node,treeNode root) { + uint ret = 0; + TagType tag; + queue > q; + q.push(pair(node,root)); + while(!q.empty()) { + pair p = q.front(); + q.pop(); + node = p.first; + root = p.second; + if (node != NULLT) { + cJumpTraversal++; + tag = tree->Tag(node); + if (tag == target_tag) + ret++; + pair p1(tree->TaggedDesc(node,target_tag),node); + pair p2(tree->TaggedFollBelow(node,target_tag,root),root); + if(p1.first!=NULLT) + jumpQueries.push_back(p1); + if(p2.first!=NULLT) + jumpQueries.push_back(p2); + q.push(p1); + q.push(p2); + } + } + return ret; +} + + +int usage(char ** argv) { + std::cout << "usage : " << argv[0] << " [-d] [-s] file.{xml,.srx} tagname\n"; + return 1; +} + + +int main(int argc, char ** argv) { + unsigned int count1,count2; + unsigned char * tagname; + string arg,filename,ext; + bool disable_tc = false; + bool save = false; + bool srx; + XMLTree * tree; + + int i = 1; + if ( i >= argc) + return usage(argv); + + arg = argv[i]; + if (arg.compare("-d") == 0) { + disable_tc = true; + i++; + if ( i >= argc) + return usage(argv); + arg = argv[i]; + } + + if (arg.compare("-s") == 0) { + save = true; + i++; + if ( i >= argc) + return usage(argv); + arg = argv[i]; + } + + // The filename + if (arg.size() < 4) + return usage(argv); + + ext=(arg.substr(arg.size()-4,4)); + if (ext.compare(".srx") == 0) { + // must truncate + filename = arg.substr(0,arg.size()-4); + srx = true; + } + else if (ext.compare(".xml")==0) { + filename = arg; + srx = false; + } + else + return usage(argv); + i++; + + if (i >= argc) + return usage(argv); + + tagname = (unsigned char*) argv[i]; + + if (srx) + // The samplerate is not taken into account for loading anymore + tree = XMLTree::Load((unsigned char*) filename.c_str(),64); + else { + try + { + //filename, sampling factor, index empty texts, disable tc + XMLDocShredder shredder(filename.c_str(),64,false,disable_tc); + shredder.processStartDocument(""); + shredder.parse(); + shredder.processEndDocument(); + tree = (XMLTree *) shredder.storageIfc_->returnDocument(); + if (save) { + filename = filename.substr(0,filename.size()-4).append(".srx"); + struct stat stats; + int exists = stat(filename.c_str(),&stats); + if(exists == 0) { + std::cout << "Warning : indexed file " << filename << " exists, not overwriting\n"; + } + else { + tree->Save((unsigned char*) filename.substr(0,filename.size()-4).c_str()); + } + + } + } + catch (const std::exception& e) { + cout << "Error during parsing : " << e.what() << "\n"; + return 2; + } + } + + fill_queries(tree,tree->Root(),tagname); + runQueries(tree); + + if (target_tag == -1) { + cout << "Warning: tag " << tagname << " was not found in the document!\n" + << "Warning: not timing traversal and jumping functions\n"; + return 0; + } + + count1 = traversal(tree,tree->Root()); + count2 = jump_traversal(tree,tree->Root(),tree->Root()); + + cout << endl << endl; + cout << "Full traversal found " << count1 << " '" << tagname << "' nodes, " + << cFullTraversal << " function calls." << endl; + traversal_time(tree); + cout << endl << endl; + cout << "Jump traversal found " << count2 << " '" << tagname << "' nodes, " + << cJumpTraversal << " function calls." << endl; + jump_time(tree); + + return 0; +} -- 2.17.1