-#include "XMLDocShredder.h"
-#include "XMLTree.h"
-#include "Utils.h"
-#include <sys/time.h>
-#include <time.h>
-#include <sys/stat.h>
-#include <vector>
-#include <queue>
-
-using namespace std;
-
-/* Time meassuring */
-double ticks= (double)sysconf(_SC_CLK_TCK);
-struct tms t1,t2;
-
-void start_clock() {
- times (&t1);
-}
-
-
-double stop_clock() {
- times (&t2);
- return (t2.tms_utime-t1.tms_utime)/ticks;
-}
-
-
-/* end Time meassuring */
-
-void printStats(double time, string fname, uint queries) {
- cout.width(15);
- cout << std::left << fname;
- cout << " : ";
- cout.width(8);
- cout << std::right << queries << " calls, ";
- cout.width(8);
- cout << std::right << time << "ms, mean: ";
- cout.width(8);
- cout << std::right << time/queries << endl;
-}
-
-
-#define STATS1(fname,vect) {\
- start_clock(); \
- uint q = 0; \
- while(q<treenodeQueries.size()) \
- { \
- acc += tree->fname((vect)[q]); \
- q++; \
- } \
- double t = 1000.0*stop_clock(); \
- printStats(t,#fname,(vect).size()); \
- }
-
-#define STATS1p(fname,vect) {\
- start_clock(); \
- uint q = 0; \
- while(q<treenodeQueries.size()) \
- { \
- acc += tree->fname((vect)[q]).min; \
- q++; \
- } \
- double t = 1000.0*stop_clock(); \
- printStats(t,#fname,(vect).size()); \
- }
-
-#define STATS2(fname,vect) {\
- start_clock(); \
- uint q = 0; \
- while(q<treenodeQueries.size()) \
- { \
- acc += tree->fname((vect)[q].first,(vect)[q].second); \
- q++; \
- } \
- double t = 1000.0*stop_clock(); \
- printStats(t,#fname,(vect).size()); \
- }
-
- TagType target_tag = -1;
-vector<treeNode> treenodeQueries;
-vector<pair<treeNode,TagType> > treenodetagQueries;
-vector<DocID> docidQueries;
-
-uint acc = 0;
-
-void runQueries(XMLTree * tree) {
- STATS1(Tag,treenodeQueries);
- STATS1(Parent,treenodeQueries);
- STATS1p(DocIds,treenodeQueries);
- STATS1(MyText,treenodeQueries);
- STATS1(PrevText,treenodeQueries);
- STATS1(NextText,treenodeQueries);
- STATS1(FirstChild,treenodeQueries);
- STATS1(NextSibling,treenodeQueries);
- STATS1(ParentNode,docidQueries);
- STATS1(PrevNode,treenodeQueries);
- STATS2(TaggedDesc,treenodetagQueries);
- STATS2(TaggedFoll,treenodetagQueries);
-}
-
-
-void fill_queries(XMLTree * tree, treeNode node,unsigned char* targettagname) {
- treeNode res1,res2;
- TagType tag;
- DocID id1,id2,id3;
- queue<treeNode> q;
- q.push(node);
- while(!q.empty()) {
- node = q.front();
- q.pop();
- if (node != NULLT) {
- tag = tree->Tag(node);
- if (target_tag == -1) {
- const unsigned char * tagname;
- tagname = tree->GetTagNameByRef(tag);
- if (strcmp( (char*) tagname, (char*) targettagname) == 0)
- target_tag = tag;
- }
- treenodeQueries.push_back(node);
- treenodetagQueries.push_back(pair<treeNode,TagType>(node,tag));
- id1 = tree->MyText(node);
- id2 = tree->PrevText(node);
- id3 = tree->NextText(node);
- id1 = max(id1, max(id2,id3));
- docidQueries.push_back(id1);
- res1 = tree->FirstChild(node);
- res2 = tree->NextSibling(node);
- q.push(res1);
- q.push(res2);
- }
- }
-}
-
-
-vector<treeNode> traversalQueries;
-uint cFullTraversal = 0;
-
-void traversal_time(XMLTree * tree) {
- start_clock();
- uint q = 0;
- while(q<traversalQueries.size()) {
- treeNode node = traversalQueries[q];
- acc += tree->FirstChild(node);
- acc += tree->NextSibling(node);
- q++;
- }
- double t = 1000.0*stop_clock();
- printStats(t,"FullTraversal",traversalQueries.size());
-}
-
-
-unsigned int traversal(XMLTree *tree,treeNode node) {
- uint ret = 0;
- TagType tag;
- queue<treeNode> q;
- q.push(node);
- while(!q.empty()) {
- node = q.front();
- q.pop();
- if (node != NULLT) {
- cFullTraversal++;
- tag = tree->Tag(node);
- if (tag == target_tag)
- ret++;
- treeNode t1 = tree->FirstChild(node);
- q.push(tree->FirstChild(node));
- treeNode t2 = tree->NextSibling(node);
- q.push(tree->NextSibling(node));
- if(t1!=NULLT)
- traversalQueries.push_back(t1);
- if(t2!=NULLT)
- traversalQueries.push_back(t2);
- }
- }
- return ret;
-}
-
-
-vector<pair<treeNode,treeNode> > jumpQueries;
-uint cJumpTraversal = 0;
-
-void jump_time(XMLTree * tree) {
- start_clock();
- uint q = 0;
- while(q<jumpQueries.size()) {
- treeNode node = jumpQueries[q].first;
- treeNode root = jumpQueries[q].second;
- acc += tree->TaggedDesc(node,target_tag);
- acc += tree->TaggedFollBelow(node,target_tag,root);
- q++;
- }
- double t = 1000.0*stop_clock();
- printStats(t,"JumpTraversal",jumpQueries.size());
-}
-
-
-/* This simulates the run function of the jumping automata*/
-unsigned int jump_traversal(XMLTree* tree, treeNode node,treeNode root) {
- uint ret = 0;
- TagType tag;
- queue<pair<treeNode,treeNode> > q;
- q.push(pair<treeNode,treeNode>(node,root));
- while(!q.empty()) {
- pair<treeNode,treeNode> p = q.front();
- q.pop();
- node = p.first;
- root = p.second;
- if (node != NULLT) {
- cJumpTraversal++;
- tag = tree->Tag(node);
- if (tag == target_tag)
- ret++;
- pair<treeNode,treeNode> p1(tree->TaggedDesc(node,target_tag),node);
- pair<treeNode,treeNode> p2(tree->TaggedFollBelow(node,target_tag,root),root);
- if(p1.first!=NULLT)
- jumpQueries.push_back(p1);
- if(p2.first!=NULLT)
- jumpQueries.push_back(p2);
- q.push(p1);
- q.push(p2);
- }
- }
- return ret;
-}
-
-
-int usage(char ** argv) {
- std::cout << "usage : " << argv[0] << " [-d] [-s] file.{xml,.srx} tagname\n";
- return 1;
-}
-
-
-int main(int argc, char ** argv) {
- unsigned int count1,count2;
- unsigned char * tagname;
- string arg,filename,ext;
- bool disable_tc = false;
- bool save = false;
- bool srx;
- XMLTree * tree;
-
- int i = 1;
- if ( i >= argc)
- return usage(argv);
-
- arg = argv[i];
- if (arg.compare("-d") == 0) {
- disable_tc = true;
- i++;
- if ( i >= argc)
- return usage(argv);
- arg = argv[i];
- }
-
- if (arg.compare("-s") == 0) {
- save = true;
- i++;
- if ( i >= argc)
- return usage(argv);
- arg = argv[i];
- }
-
- // The filename
- if (arg.size() < 4)
- return usage(argv);
-
- ext=(arg.substr(arg.size()-4,4));
- if (ext.compare(".srx") == 0) {
- // must truncate
- filename = arg.substr(0,arg.size()-4);
- srx = true;
- }
- else if (ext.compare(".xml")==0) {
- filename = arg;
- srx = false;
- }
- else
- return usage(argv);
- i++;
-
- if (i >= argc)
- return usage(argv);
-
- tagname = (unsigned char*) argv[i];
-
- if (srx)
- // The samplerate is not taken into account for loading anymore
- tree = XMLTree::Load((unsigned char*) filename.c_str(),64);
- else {
- try
- {
- //filename, sampling factor, index empty texts, disable tc
- XMLDocShredder shredder(filename.c_str(),64,false,disable_tc);
- shredder.processStartDocument("");
- shredder.parse();
- shredder.processEndDocument();
- tree = (XMLTree *) shredder.storageIfc_->returnDocument();
- if (save) {
- filename = filename.substr(0,filename.size()-4).append(".srx");
- struct stat stats;
- int exists = stat(filename.c_str(),&stats);
- if(exists == 0) {
- std::cout << "Warning : indexed file " << filename << " exists, not overwriting\n";
- }
- else {
- tree->Save((unsigned char*) filename.substr(0,filename.size()-4).c_str());
- }
-
- }
- }
- catch (const std::exception& e) {
- cout << "Error during parsing : " << e.what() << "\n";
- return 2;
- }
- }
-
- fill_queries(tree,tree->Root(),tagname);
- runQueries(tree);
-
- if (target_tag == -1) {
- cout << "Warning: tag " << tagname << " was not found in the document!\n"
- << "Warning: not timing traversal and jumping functions\n";
- return 0;
- }
-
- count1 = traversal(tree,tree->Root());
- count2 = jump_traversal(tree,tree->Root(),tree->Root());
-
- cout << "Full traversal found " << count1 << " '" << tagname << "' nodes, "
- << cFullTraversal << " function calls." << endl;
- traversal_time(tree);
- cout << "Jump traversal found " << count2 << " '" << tagname << "' nodes, "
- << cJumpTraversal << " function calls." << endl;
- jump_time(tree);
-
- return 0;
-}