1 #include "XMLDocShredder.h"
// Number of clock ticks per second as reported by sysconf(_SC_CLK_TCK),
// stored as a double so that tick differences can be divided into seconds.
13 double ticks= (double)sysconf(_SC_CLK_TCK);
// User-time tick difference between the t1 and t2 samples, divided by ticks.
// NOTE(review): the enclosing function header and the declarations of t1/t2
// are not visible here — presumably this is the body of stop_clock() and
// t1/t2 are struct tms samples; confirm.
23 return (t2.tms_utime-t1.tms_utime)/ticks;
27 /* end Time measuring */
// Prints one line of timing statistics to cout.
//   time    - total elapsed time; printed with the unit label "ms".
//   fname   - label printed first, left-aligned.
//   queries - number of calls that were timed.
// The line ends with the mean time per call (time/queries) and endl.
// NOTE(review): nothing visible here guards against queries == 0, in which
// case the floating-point division prints inf or nan — confirm that callers
// never pass an empty query set.
29 void printStats(double time, string fname, uint queries) {
31 cout << std::left << fname;
34 cout << std::right << queries << " calls, ";
36 cout << std::right << time << "ms, mean: ";
// Mean time per call.
38 cout << std::right << time/queries << endl;
// STATS1(fname, vect): benchmarks the one-argument member function
// tree->fname(x) over the elements of `vect`, adding each result to `acc`,
// then prints one statistics line through printStats.
//   fname - name of the member function; also stringified (#fname) as the label.
//   vect  - vector of arguments; (vect).size() is reported as the call count.
// Expects a variable named `tree` to be in scope at the expansion site.
// The value of stop_clock() is multiplied by 1000.0 before being reported.
// The loop is bounded by (vect).size() — the vector that is actually being
// indexed — instead of the global treenodeQueries, so the macro stays in
// bounds for any query vector it is handed (e.g. docidQueries).
42 #define STATS1(fname,vect) {\
45 while(q<(vect).size()) \
47 acc += tree->fname((vect)[q]); \
50 double t = 1000.0*stop_clock(); \
51 printStats(t,#fname,(vect).size()); \
// STATS1p(fname, vect): same benchmark as the one-argument form, but for
// member functions whose result has a `min` member; that member is what is
// added to `acc`. Prints one statistics line through printStats.
//   fname - name of the member function; also stringified (#fname) as the label.
//   vect  - vector of arguments; (vect).size() is reported as the call count.
// Expects a variable named `tree` to be in scope at the expansion site.
// The value of stop_clock() is multiplied by 1000.0 before being reported.
// The loop is bounded by (vect).size() — the vector that is actually being
// indexed — instead of the global treenodeQueries, so indexing cannot run
// past the end of the vector passed in.
54 #define STATS1p(fname,vect) {\
57 while(q<(vect).size()) \
59 acc += tree->fname((vect)[q]).min; \
62 double t = 1000.0*stop_clock(); \
63 printStats(t,#fname,(vect).size()); \
// STATS2(fname, vect): benchmarks the two-argument member function
// tree->fname(a, b) over a vector of pairs, passing .first and .second of
// each element and adding the result to `acc`, then prints one statistics
// line through printStats.
//   fname - name of the member function; also stringified (#fname) as the label.
//   vect  - vector of pairs; (vect).size() is reported as the call count.
// Expects a variable named `tree` to be in scope at the expansion site.
// The value of stop_clock() is multiplied by 1000.0 before being reported.
// The loop is bounded by (vect).size() — the vector that is actually being
// indexed — instead of the global treenodeQueries, so indexing cannot run
// past the end of the vector passed in.
66 #define STATS2(fname,vect) {\
69 while(q<(vect).size()) \
71 acc += tree->fname((vect)[q].first,(vect)[q].second); \
74 double t = 1000.0*stop_clock(); \
75 printStats(t,#fname,(vect).size()); \
// Tag the benchmarks look for. Starts at -1; fill_queries and main both test
// against -1 to detect that no matching tag has been recorded yet.
78 TagType target_tag = -1;
// Nodes collected by fill_queries; arguments for the single-node benchmarks.
79 vector<treeNode> treenodeQueries;
// (node, tag of that node) pairs collected by fill_queries; arguments for the
// two-argument benchmarks.
80 vector<pair<treeNode,TagType> > treenodetagQueries;
// One identifier per visited node, collected by fill_queries: the largest of
// the values returned by MyText, PrevText and NextText for that node.
81 vector<DocID> docidQueries;
// Benchmarks a series of XMLTree member functions against the query vectors
// filled by fill_queries. Each STATS* line times one member function and
// prints one statistics line labelled with the function's name.
//   tree - the tree whose member functions are invoked by the macros.
85 void runQueries(XMLTree * tree) {
// Single-node queries.
86 STATS1(Tag,treenodeQueries);
87 STATS1(Parent,treenodeQueries);
// DocIds is timed with STATS1p, which accumulates the `min` member of its result.
88 STATS1p(DocIds,treenodeQueries);
89 STATS1(MyText,treenodeQueries);
90 STATS1(PrevText,treenodeQueries);
91 STATS1(NextText,treenodeQueries);
92 STATS1(FirstChild,treenodeQueries);
93 STATS1(NextSibling,treenodeQueries);
// ParentNode is the only benchmark driven by docidQueries.
94 STATS1(ParentNode,docidQueries);
95 STATS1(PrevNode,treenodeQueries);
// (node, tag) queries.
96 STATS2(TaggedDesc,treenodetagQueries);
97 STATS2(TaggedFoll,treenodetagQueries);
// Records benchmark queries for `node` in the global query vectors.
//   tree          - tree being inspected.
//   node          - node to record.
//   targettagname - name of the tag to look for, as a C string.
// NOTE(review): only part of the body is visible here; FirstChild and
// NextSibling are fetched at the end, presumably so the function can continue
// with the rest of the tree — confirm.
101 void fill_queries(XMLTree * tree, treeNode node,unsigned char* targettagname) {
111 tag = tree->Tag(node);
// While target_tag is still -1, compare this node's tag name with the
// requested name.
112 if (target_tag == -1) {
113 const unsigned char * tagname;
114 tagname = tree->GetTagNameByRef(tag);
// NOTE(review): the statement executed on a match is not visible here —
// presumably it stores `tag` in target_tag; confirm.
115 if (strcmp( (char*) tagname, (char*) targettagname) == 0)
// Record the node itself and the (node, tag) pair.
118 treenodeQueries.push_back(node);
119 treenodetagQueries.push_back(pair<treeNode,TagType>(node,tag));
// Record the largest of the three text identifiers associated with the node.
120 id1 = tree->MyText(node);
121 id2 = tree->PrevText(node);
122 id3 = tree->NextText(node);
123 id1 = max(id1, max(id2,id3));
124 docidQueries.push_back(id1);
125 res1 = tree->FirstChild(node);
126 res2 = tree->NextSibling(node);
// Nodes recorded by traversal() and replayed by traversal_time().
134 vector<treeNode> traversalQueries;
// Counter printed by main as the number of function calls of the full traversal.
135 uint cFullTraversal = 0;
// Replays the nodes stored in traversalQueries, calling FirstChild and
// NextSibling on each one, and prints the elapsed time under the label
// "FullTraversal" with traversalQueries.size() as the call count.
//   tree - tree on which the navigation functions are called.
137 void traversal_time(XMLTree * tree) {
140 while(q<traversalQueries.size()) {
141 treeNode node = traversalQueries[q];
// Results are added to acc; presumably this keeps the calls from being
// optimised away — confirm.
142 acc += tree->FirstChild(node);
143 acc += tree->NextSibling(node);
// stop_clock() is multiplied by 1000.0 before being reported.
146 double t = 1000.0*stop_clock();
147 printStats(t,"FullTraversal",traversalQueries.size());
// Walks the tree starting at `node` through FirstChild and NextSibling,
// comparing each visited node's tag with target_tag and recording the
// children/siblings it obtains in traversalQueries.
//   tree - tree being traversed.
//   node - node at which the traversal starts.
// main prints the return value as the number of matching nodes found.
// NOTE(review): only part of the body is visible here; the declaration of
// the container `q` and the loop around these statements are not shown.
151 unsigned int traversal(XMLTree *tree,treeNode node) {
161 tag = tree->Tag(node);
162 if (tag == target_tag)
// NOTE(review): FirstChild and NextSibling are each called a second time for
// the push instead of reusing t1/t2.
164 treeNode t1 = tree->FirstChild(node);
165 q.push(tree->FirstChild(node));
166 treeNode t2 = tree->NextSibling(node);
167 q.push(tree->NextSibling(node));
// Remember the nodes so that traversal_time() can replay them.
169 traversalQueries.push_back(t1);
171 traversalQueries.push_back(t2);
// (node, root) pairs recorded by jump_traversal() and replayed by jump_time().
178 vector<pair<treeNode,treeNode> > jumpQueries;
// Counter printed by main as the number of function calls of the jump traversal.
179 uint cJumpTraversal = 0;
// Replays the (node, root) pairs stored in jumpQueries, calling TaggedDesc
// and TaggedFollBelow with target_tag on each one, and prints the elapsed
// time under the label "JumpTraversal" with jumpQueries.size() as the call
// count.
//   tree - tree on which the jump functions are called.
181 void jump_time(XMLTree * tree) {
184 while(q<jumpQueries.size()) {
185 treeNode node = jumpQueries[q].first;
186 treeNode root = jumpQueries[q].second;
// Results are added to acc; presumably this keeps the calls from being
// optimised away — confirm.
187 acc += tree->TaggedDesc(node,target_tag);
188 acc += tree->TaggedFollBelow(node,target_tag,root);
// stop_clock() is multiplied by 1000.0 before being reported.
191 double t = 1000.0*stop_clock();
192 printStats(t,"JumpTraversal",jumpQueries.size());
196 /* This simulates the run function of the jumping automata*/
// Visits nodes by jumping with TaggedDesc and TaggedFollBelow on target_tag,
// using a queue of (node, root) pairs seeded with the arguments, and records
// every pair it produces in jumpQueries.
//   tree - tree being traversed.
//   node - node at which the traversal starts.
//   root - node passed as the third argument of TaggedFollBelow.
// main prints the return value as the number of matching nodes found.
// NOTE(review): only part of the body is visible here; the loop around the
// queue and the unpacking of `p` into node/root are not shown.
197 unsigned int jump_traversal(XMLTree* tree, treeNode node,treeNode root) {
200 queue<pair<treeNode,treeNode> > q;
201 q.push(pair<treeNode,treeNode>(node,root));
203 pair<treeNode,treeNode> p = q.front();
209 tag = tree->Tag(node);
210 if (tag == target_tag)
// p1 pairs the TaggedDesc result with the current node; p2 pairs the
// TaggedFollBelow result with the current root.
212 pair<treeNode,treeNode> p1(tree->TaggedDesc(node,target_tag),node);
213 pair<treeNode,treeNode> p2(tree->TaggedFollBelow(node,target_tag,root),root);
// Remember the pairs so that jump_time() can replay them.
215 jumpQueries.push_back(p1);
217 jumpQueries.push_back(p2);
// Prints the command-line synopsis to std::cout.
//   argv - program arguments; argv[0] is printed as the program name.
// The extension list is written as file.{xml,srx}: main() compares the last
// four characters of the file argument against ".xml" and ".srx", so a
// second dot inside the braces would describe a name like "file..srx".
226 int usage(char ** argv) {
227 std::cout << "usage : " << argv[0] << " [-d] [-s] file.{xml,srx} tagname\n";
// Entry point of the benchmark. From the visible code it:
//   1. handles the options "-d" and "-s" and a file argument ending in
//      ".srx" or ".xml", followed by a tag name;
//   2. obtains an XMLTree, either via XMLTree::Load or by running an
//      XMLDocShredder over the document;
//   3. collects queries with fill_queries starting at the root;
//   4. runs the full and jump traversals and prints their results.
// NOTE(review): large parts of the body (argument loop, branch bodies,
// declarations of `tree`, `i` and `stats`) are not visible here.
232 int main(int argc, char ** argv) {
233 unsigned int count1,count2;
234 unsigned char * tagname;
235 string arg,filename,ext;
236 bool disable_tc = false;
// Option handling; the bodies of these branches are not visible here.
246 if (arg.compare("-d") == 0) {
254 if (arg.compare("-s") == 0) {
// The last four characters of the argument select the kind of input.
// NOTE(review): for an argument shorter than four characters arg.size()-4
// wraps around and substr throws std::out_of_range, unless a length check
// precedes this line — confirm.
266 ext=(arg.substr(arg.size()-4,4));
267 if (ext.compare(".srx") == 0) {
// Keep the name without its ".srx" suffix.
269 filename = arg.substr(0,arg.size()-4);
272 else if (ext.compare(".xml")==0) {
// The argument at index i is kept as the tag name to search for.
283 tagname = (unsigned char*) argv[i];
286 // The samplerate is not taken into account for loading anymore
287 tree = XMLTree::Load((unsigned char*) filename.c_str(),64);
291 //filename, sampling factor, index empty texts, disable tc
292 XMLDocShredder shredder(filename.c_str(),64,false,disable_tc);
293 shredder.processStartDocument("");
295 shredder.processEndDocument();
// The shredder's storage interface hands back the document, used here as an XMLTree.
296 tree = (XMLTree *) shredder.storageIfc_->returnDocument();
// Replace the last four characters of the file name with ".srx".
298 filename = filename.substr(0,filename.size()-4).append(".srx");
// stat() result for the ".srx" name; the warning below reports an existing file.
300 int exists = stat(filename.c_str(),&stats);
302 std::cout << "Warning : indexed file " << filename << " exists, not overwriting\n";
// Save is given the name with the ".srx" suffix removed again.
// NOTE(review): presumably Save appends the extension itself — confirm.
305 tree->Save((unsigned char*) filename.substr(0,filename.size()-4).c_str());
310 catch (const std::exception& e) {
311 cout << "Error during parsing : " << e.what() << "\n";
// Collect the benchmark queries, starting at the root of the tree.
316 fill_queries(tree,tree->Root(),tagname);
// target_tag still at its initial value -1: warn that the tag was not found.
319 if (target_tag == -1) {
320 cout << "Warning: tag " << tagname << " was not found in the document!\n"
321 << "Warning: not timing traversal and jumping functions\n";
// Run both traversals from the root; the jump traversal also uses the root
// as its `root` argument.
325 count1 = traversal(tree,tree->Root());
326 count2 = jump_traversal(tree,tree->Root(),tree->Root());
328 cout << "Full traversal found " << count1 << " '" << tagname << "' nodes, "
329 << cFullTraversal << " function calls." << endl;
330 traversal_time(tree);
331 cout << "Jump traversal found " << count2 << " '" << tagname << "' nodes, "
332 << cJumpTraversal << " function calls." << endl;