1 #include "XMLDocShredder.h"
// Value of sysconf(_SC_CLK_TCK) (clock ticks per second per POSIX), stored as a
// double so the tick difference below is divided in floating point.
13 double ticks= (double)sysconf(_SC_CLK_TCK);
// User-CPU tick difference between the samples t1 and t2, divided by `ticks`.
// NOTE(review): the enclosing function header is not visible in this excerpt;
// presumably this is the return of stop_clock(), whose result the callers in
// this file multiply by 1000.0 - confirm.
23 return (t2.tms_utime-t1.tms_utime)/ticks;
27 /* end Time measuring */
// Prints one statistics line to cout for a benchmarked function.
//   time    - total elapsed time; it is labelled "ms" in the output, and the
//             callers in this file pass 1000.0*stop_clock().
//   fname   - label printed first, left-aligned.
//   queries - number of calls; also the divisor for the mean.
// NOTE(review): with queries == 0 the mean is a floating-point division by
// zero and will print inf or nan rather than a number.
29 void printStats(double time, string fname, uint queries) {
31 cout << std::left << fname;
34 cout << std::right << queries << " calls, ";
36 cout << std::right << time << "ms, mean: ";
// Mean time per call; endl also flushes the stream.
38 cout << std::right << time/queries << endl;
// STATS1(fname,vect): timing macro for a one-argument XMLTree method.
// While q < vect.size(), calls tree->fname(vect[q]) and adds the result to acc;
// afterwards it reports 1000.0*stop_clock() through printStats, using the
// stringified method name (#fname) as the label and vect.size() as the call
// count. A variable named `tree` must be in scope where the macro is expanded.
// NOTE(review): the declarations of q/acc and the clock start are not visible
// in this excerpt.
42 #define STATS1(fname,vect) {\
45 while(q<(vect).size()) \
47 acc += tree->fname((vect)[q]); \
50 double t = 1000.0*stop_clock(); \
51 printStats(t,#fname,(vect).size()); \
// STATS1p(fname,vect): same shape as STATS1, but for methods whose result has
// a `.min` member; only that member is added to acc. Reports
// 1000.0*stop_clock() through printStats with #fname as the label and
// vect.size() as the call count. Requires `tree` in scope at the expansion site.
54 #define STATS1p(fname,vect) {\
57 while(q<(vect).size()) \
59 acc += tree->fname((vect)[q]).min; \
62 double t = 1000.0*stop_clock(); \
63 printStats(t,#fname,(vect).size()); \
// STATS2(fname,vect): timing macro for a two-argument XMLTree method.
// `vect` holds pairs; each call is tree->fname(vect[q].first, vect[q].second)
// and its result is added to acc. Reports 1000.0*stop_clock() through
// printStats with #fname as the label and vect.size() as the call count.
// Requires `tree` in scope at the expansion site.
66 #define STATS2(fname,vect) {\
69 while(q<(vect).size()) \
71 acc += tree->fname((vect)[q].first,(vect)[q].second); \
74 double t = 1000.0*stop_clock(); \
75 printStats(t,#fname,(vect).size()); \
// Tag compared against node tags in traversal()/jump_traversal(). Starts at -1;
// fill_queries() and main() both test for -1 to detect "not found yet".
79 TagType target_tag = -1;
// Nodes recorded by fill_queries(); input for the one-argument node benchmarks.
80 vector<treeNode> treenodeQueries;
// (node, tag-of-node) pairs recorded by fill_queries(); input for STATS2 benchmarks.
81 vector<pair<treeNode,TagType> > treenodetagQueries;
// One DocID per visited node, recorded by fill_queries(); input for ParentNode/PrevNode.
82 vector<DocID> docidQueries;
// Times individual XMLTree methods by replaying the query vectors filled by
// fill_queries(). Each line expands a STATS macro that prints one statistics
// line through printStats.
//   tree - the index under test; the macros refer to it by this exact name.
86 void runQueries(XMLTree * tree) {
// Methods taking a treeNode.
87 STATS1(Tag,treenodeQueries);
88 STATS1(Parent,treenodeQueries);
// DocIds goes through STATS1p, which accumulates only the `.min` member of the result.
89 STATS1p(DocIds,treenodeQueries);
90 STATS1(MyText,treenodeQueries);
91 STATS1(PrevText,treenodeQueries);
92 STATS1(NextText,treenodeQueries);
93 STATS1(FirstChild,treenodeQueries);
94 STATS1(NextSibling,treenodeQueries);
// Methods taking a DocID.
95 STATS1(ParentNode,docidQueries);
96 STATS1(PrevNode,docidQueries);
// Methods taking a (treeNode, TagType) pair.
97 STATS2(TaggedDesc,treenodetagQueries);
98 STATS2(TaggedFoll,treenodetagQueries);
// Records benchmark inputs for `node` into the global query vectors.
//   tree          - index to query.
//   node          - node to record.
//   targettagname - tag name, compared with strcmp against the node's tag name
//                   while target_tag is still -1.
// NOTE(review): the statements between the visible lines are missing from this
// excerpt; presumably target_tag is assigned on a name match and the function
// continues into res1/res2 - confirm against the full file.
102 void fill_queries(XMLTree * tree, treeNode node,unsigned char* targettagname) {
112 tag = tree->Tag(node);
// Only look up and compare tag names until the target tag has been found.
113 if (target_tag == -1) {
114 const unsigned char * tagname;
115 tagname = tree->GetTagNameByRef(tag);
116 if (strcmp( (char*) tagname, (char*) targettagname) == 0)
119 treenodeQueries.push_back(node);
120 treenodetagQueries.push_back(pair<treeNode,TagType>(node,tag));
121 id1 = tree->MyText(node);
122 id2 = tree->PrevText(node);
123 id3 = tree->NextText(node);
// Keep the largest of the three ids as the DocID query for this node.
124 id1 = max(id1, max(id2,id3));
125 docidQueries.push_back(id1);
126 res1 = tree->FirstChild(node);
127 res2 = tree->NextSibling(node);
// Nodes recorded by traversal() and replayed by traversal_time().
135 vector<treeNode> traversalQueries;
// Counter printed by main() as the number of function calls of the full traversal.
// NOTE(review): where it is incremented is not visible in this excerpt.
136 uint cFullTraversal = 0;
// Replays the nodes recorded in traversalQueries, calling FirstChild and
// NextSibling on each, and prints the elapsed time under the label
// "FullTraversal" with traversalQueries.size() as the call count.
138 void traversal_time(XMLTree * tree) {
141 while(q<traversalQueries.size()) {
142 treeNode node = traversalQueries[q];
// Results are summed into acc.
143 acc += tree->FirstChild(node);
144 acc += tree->NextSibling(node);
// stop_clock() result scaled by 1000; printStats labels the value "ms".
147 double t = 1000.0*stop_clock();
148 printStats(t,"FullTraversal",traversalQueries.size());
// Walks the tree from `node` via FirstChild/NextSibling, using the container q
// as a work list, and records the child/sibling values in traversalQueries
// for later replay by traversal_time().
// main() prints the return value as the number of nodes found for the tag.
// NOTE(review): the loop structure, the conditions guarding the push_back
// calls and the counter updates are not visible in this excerpt.
152 unsigned int traversal(XMLTree *tree,treeNode node) {
162 tag = tree->Tag(node);
163 if (tag == target_tag)
// NOTE(review): FirstChild and NextSibling are each called twice for the same
// node (once for t1/t2, once for the push); pushing t1/t2 would avoid the
// repeated call, assuming the calls have no side effects.
165 treeNode t1 = tree->FirstChild(node);
166 q.push(tree->FirstChild(node));
167 treeNode t2 = tree->NextSibling(node);
168 q.push(tree->NextSibling(node));
170 traversalQueries.push_back(t1);
172 traversalQueries.push_back(t2);
// (node, root) pairs recorded by jump_traversal() and replayed by jump_time().
179 vector<pair<treeNode,treeNode> > jumpQueries;
// Counter printed by main() as the number of function calls of the jump traversal.
// NOTE(review): where it is incremented is not visible in this excerpt.
180 uint cJumpTraversal = 0;
// Replays the (node, root) pairs recorded in jumpQueries, calling TaggedDesc
// and TaggedFollBelow with target_tag on each, and prints the elapsed time
// under the label "JumpTraversal" with jumpQueries.size() as the call count.
182 void jump_time(XMLTree * tree) {
185 while(q<jumpQueries.size()) {
// first = node to jump from, second = root passed to TaggedFollBelow.
186 treeNode node = jumpQueries[q].first;
187 treeNode root = jumpQueries[q].second;
188 acc += tree->TaggedDesc(node,target_tag);
189 acc += tree->TaggedFollBelow(node,target_tag,root);
// stop_clock() result scaled by 1000; printStats labels the value "ms".
192 double t = 1000.0*stop_clock();
193 printStats(t,"JumpTraversal",jumpQueries.size());
197 /* This simulates the run function of the jumping automata*/
// Starting from (node, root), works through a queue of (node, root) pairs.
// For the current node it builds two follow-up pairs:
//   p1 = (TaggedDesc(node, target_tag), node)
//   p2 = (TaggedFollBelow(node, target_tag, root), root)
// and records them in jumpQueries for later replay by jump_time().
// main() prints the return value as the number of nodes found for the tag.
// NOTE(review): the loop, the unpacking of p, the conditions guarding the
// push_back calls and the counter updates are not visible in this excerpt.
198 unsigned int jump_traversal(XMLTree* tree, treeNode node,treeNode root) {
201 queue<pair<treeNode,treeNode> > q;
202 q.push(pair<treeNode,treeNode>(node,root));
204 pair<treeNode,treeNode> p = q.front();
210 tag = tree->Tag(node);
211 if (tag == target_tag)
// p1 uses the current node as its second element; p2 keeps the current root.
213 pair<treeNode,treeNode> p1(tree->TaggedDesc(node,target_tag),node);
214 pair<treeNode,treeNode> p2(tree->TaggedFollBelow(node,target_tag,root),root);
216 jumpQueries.push_back(p1);
218 jumpQueries.push_back(p2);
// Prints the command-line synopsis to std::cout, using argv[0] as the program name.
// NOTE(review): the return statement is not visible in this excerpt.
227 int usage(char ** argv) {
228 std::cout << "usage : " << argv[0] << " [-d] [-s] file.{xml,.srx} tagname\n";
// Entry point of the benchmark. From the visible statements it:
//   1. parses the options -d and -s, an input file ending in .srx or .xml, and a tag name;
//   2. obtains an XMLTree, either via XMLTree::Load or by running an XMLDocShredder
//      over the input and, unless the target already exists, saving the result as .srx;
//   3. collects queries with fill_queries() and, if the tag was found, runs the
//      full and jump traversals and prints their counts and timings.
// NOTE(review): many statements (loops, branches, returns) are missing from this
// excerpt, so the exact control flow between the visible lines is not shown.
233 int main(int argc, char ** argv) {
234 unsigned int count1,count2;
235 unsigned char * tagname;
236 string arg,filename,ext;
237 bool disable_tc = false;
247 if (arg.compare("-d") == 0) {
255 if (arg.compare("-s") == 0) {
// The last four characters of the argument select the input format.
// NOTE(review): if arg is shorter than 4 characters, arg.size()-4 wraps around
// (size() is unsigned) and substr throws std::out_of_range; whether this is
// reached or caught depends on code not visible here - confirm.
267 ext=(arg.substr(arg.size()-4,4));
268 if (ext.compare(".srx") == 0) {
// Strip the ".srx" extension; Load receives the name without it.
270 filename = arg.substr(0,arg.size()-4);
273 else if (ext.compare(".xml")==0) {
// The tag name is used straight from argv, without copying.
284 tagname = (unsigned char*) argv[i];
287 // The samplerate is not taken into account for loading anymore
288 tree = XMLTree::Load((unsigned char*) filename.c_str(),64);
292 //filename, sampling factor, index empty texts, disable tc
293 XMLDocShredder shredder(filename.c_str(),64,false,disable_tc);
294 shredder.processStartDocument("");
296 shredder.processEndDocument();
297 tree = (XMLTree *) shredder.storageIfc_->returnDocument();
// Replace the 4-character extension with ".srx" to get the index file name.
299 filename = filename.substr(0,filename.size()-4).append(".srx");
// stat() result is used to detect an already existing index file.
301 int exists = stat(filename.c_str(),&stats);
303 std::cout << "Warning : indexed file " << filename << " exists, not overwriting\n";
// Save is given the name with the ".srx" suffix removed again.
306 tree->Save((unsigned char*) filename.substr(0,filename.size()-4).c_str());
311 catch (const std::exception& e) {
312 cout << "Error during parsing : " << e.what() << "\n";
// Collect the per-node queries, starting at the root.
317 fill_queries(tree,tree->Root(),tagname);
// target_tag still at its initial -1 means no node carried the requested tag name.
320 if (target_tag == -1) {
321 cout << "Warning: tag " << tagname << " was not found in the document!\n"
322 << "Warning: not timing traversal and jumping functions\n";
// Record the traversal queries; the return values are printed below as node counts.
326 count1 = traversal(tree,tree->Root());
327 count2 = jump_traversal(tree,tree->Root(),tree->Root());
329 cout << endl << endl;
330 cout << "Full traversal found " << count1 << " '" << tagname << "' nodes, "
331 << cFullTraversal << " function calls." << endl;
332 traversal_time(tree);
333 cout << endl << endl;
334 cout << "Jump traversal found " << count2 << " '" << tagname << "' nodes, "
335 << cJumpTraversal << " function calls." << endl;