1 #include "XMLDocShredder.h"
// Per-operation profiling state for this translation unit.
//
// For every measured operation X there is a time accumulator tX and a counter
// cX. The STOPTIMER and PRINTSTATS macros reach them through token pasting
// (t##x / c##x), so the suffix of each name must match the macro argument.

// Accumulated elapsed time per operation. STOPTIMER adds
// (microseconds elapsed) / 1000.0, i.e. the values are in milliseconds.
13 static double tFirstChild = 0;
14 static double tNextSibling = 0;
15 static double tParent = 0;
16 static double tTaggedAncestor = 0;
17 static double tTaggedChild = 0;
18 static double tTaggedDesc = 0;
19 static double tTaggedFoll = 0;
20 static double tParentNode = 0;
21 static double tPrevNode = 0;
22 static double tTag = 0;
23 static double tMyText = 0;
24 static double tPrevText = 0;
25 static double tNextText = 0;
26 static double tDocIds = 0;
// Totals for the two whole-tree runs timed in main().
28 static double tFullTraversal = 0;
29 static double tJumpTraversal = 0;
// Per-operation counters; PRINTSTATS prints them as "<n> calls" and divides
// the matching tX by them.
// NOTE(review): the statement that increments these is not visible in this
// excerpt -- presumably it lives in STOPTIMER; confirm.
31 static unsigned int cFirstChild = 0;
32 static unsigned int cNextSibling = 0;
33 static unsigned int cParent = 0;
34 static unsigned int cTaggedAncestor = 0;
35 static unsigned int cTaggedChild = 0;
36 static unsigned int cTaggedDesc = 0;
37 static unsigned int cTaggedFoll = 0;
38 static unsigned int cParentNode = 0;
39 static unsigned int cPrevNode = 0;
40 static unsigned int cTag = 0;
41 static unsigned int cMyText = 0;
42 static unsigned int cPrevText = 0;
43 static unsigned int cNextText = 0;
44 static unsigned int cDocIds = 0;
46 static unsigned int cFullTraversal = 0;
47 static unsigned int cJumpTraversal = 0;
// Scratch timestamps: STARTTIMER writes tmpv1, STOPTIMER writes tmpv2.
// There is a single pair, so START/STOP intervals cannot be nested.
50 static struct timeval tmpv1;
51 static struct timeval tmpv2;
// Tag searched for by time_traversal() and time_jump(). -1 is the "not
// resolved" sentinel tested in traversal() and main().
// NOTE(review): the assignment of a real value is not visible here --
// presumably traversal() sets it when a tag name matches; confirm.
53 static TagType target_tag = -1;
// Timing helpers built on gettimeofday() and the file-level tX / cX variables.
//
//   STARTTIMER()   stores the current time in tmpv1.
//   STOPTIMER(x)   stores the current time in tmpv2 and adds the interval since
//                  tmpv1 to t<x>; the interval is computed in microseconds and
//                  divided by 1000.0, so t<x> accumulates milliseconds.
//   PRINTSTATS(x)  writes to std::cout: the name x left-aligned in a field of
//                  width 15, then c<x> followed by " calls, ", then t<x>, then
//                  the ratio t<x> / c<x>.
//
// The argument x is pasted onto the t / c prefixes, so it must be the suffix of
// an existing tX / cX pair (FirstChild, TaggedDesc, ...).
// The ratio is a double divided by a counter, so a counter that is still zero
// gives a non-finite value rather than a trap.
// NOTE(review): the closing lines of STOPTIMER and PRINTSTATS are not visible
// in this excerpt; whatever else they do (e.g. incrementing c<x>) should be
// confirmed against the full file.
// Keep comments out of the macro bodies: every body line ends in a line
// continuation, so a line comment there would swallow the following line.
55 #define STARTTIMER() (gettimeofday(&tmpv1,NULL))
56 #define STOPTIMER(x) do { \
57 gettimeofday(&tmpv2,NULL); \
58 (t##x) = (t##x) + ((tmpv2.tv_sec - tmpv1.tv_sec) * 1000000.0 + \
59 (tmpv2.tv_usec - tmpv1.tv_usec))/1000.0; \
63 #define PRINTSTATS(x) do { \
64 std::cout.width(15); \
65 std::cout << std::left << #x; \
68 std::cout << std::right << c##x << " calls, "; \
70 std::cout << std::right << (t##x) \
73 std::cout << std::right \
74 << (t##x) *1.00 / c##x \
78 void traversal(XMLTree * tree, treeNode node,unsigned char* targettagname){
// Profiling walk: at `node`, calls each XMLTree navigation / text primitive
// once and accumulates its cost with STOPTIMER, then recurses into the first
// child and the next sibling.
//
//   tree           tree being measured.
//   node           current node.
//   targettagname  tag name compared (strcmp) with this node's tag name while
//                  target_tag is still -1.
//
// NOTE(review): several statements are not visible in this excerpt -- the
// local declarations, the STARTTIMER() calls, the action taken when the tag
// name matches, and the condition that stops the recursion. Confirm them
// against the full file before changing this function.
83 const unsigned char * tagname;
87 tag = tree->Tag(node);
// Only look at tag names until the target tag has been resolved.
89 if (target_tag == -1){
90 tagname = tree->GetTagNameByRef(tag);
91 if (strcmp( (char*) tagname, (char*) targettagname) == 0)
95 res1 = tree->Parent(node);
// The Tagged* primitives are all queried with this node's own tag.
99 res1 = tree->TaggedChild(node,0,tag);
100 STOPTIMER(TaggedChild);
103 res1 = tree->TaggedAncestor(node,tag);
104 STOPTIMER(TaggedAncestor);
107 res1 = tree->TaggedDesc(node,tag);
108 STOPTIMER(TaggedDesc);
111 res1 = tree->TaggedFoll(node,tag);
112 STOPTIMER(TaggedFoll);
// Text-related primitives for this node.
115 rg = tree->DocIds(node);
119 id1 = tree->MyText(node);
123 id2 = tree->PrevText(node);
127 id3 = tree->NextText(node);
// The largest of the three ids is the argument for the two id-based lookups
// below.
130 id1 = max(id1, max(id2,id3));
133 res1 = tree->ParentNode(id1);
134 STOPTIMER(ParentNode);
137 res1 = tree->PrevNode(id1);
// res1 / res2 are reused here as the two nodes to recurse into.
141 res1 = tree->FirstChild(node);
142 STOPTIMER(FirstChild);
145 res2 = tree->NextSibling(node);
146 STOPTIMER(NextSibling);
148 traversal(tree,res1,targettagname);
149 traversal(tree,res2,targettagname);
155 /* Simulates the run function of the automaton: visits nodes one by one
       through FirstChild / NextSibling and compares each node's tag with
       target_tag. main() reports the returned value as the number of nodes
       found by the full traversal.
       NOTE(review): the local declaration, the recursion's stop condition and
       the beginning of the return expression in the matching branch are not
       visible in this excerpt; the matching branch presumably adds one to the
       sum of the two recursive calls -- confirm. */
157 unsigned int time_traversal(XMLTree *tree,treeNode node){
161 tag = tree->Tag(node);
// Both branches recurse into the first child and the next sibling.
162 if (tag == target_tag)
164 time_traversal(tree,tree->FirstChild(node)) +
165 time_traversal(tree,tree->NextSibling(node));
167 return time_traversal(tree,tree->FirstChild(node)) +
168 time_traversal(tree,tree->NextSibling(node));
175 /* Simulates the run function of the jumping automaton: instead of stepping
       through FirstChild / NextSibling it moves with TaggedDesc and
       TaggedFollBelow, both queried with target_tag. main() reports the
       returned value as the number of nodes found by the jump traversal.
       `root` is passed to TaggedFollBelow as its third argument; the
       descendant recursion passes the current node as the new root, the
       following recursion keeps the same root.
       NOTE(review): the local declaration, the recursion's stop condition and
       the beginning of the return expression in the matching branch are not
       visible in this excerpt -- confirm against the full file. */
176 unsigned int time_jump(XMLTree* tree, treeNode node,treeNode root){
180 tag = tree->Tag(node);
// Both branches make the same two recursive calls.
181 if (tag == target_tag)
183 time_jump(tree, tree->TaggedDesc(node,target_tag),node) +
184 time_jump(tree, tree->TaggedFollBelow(node,target_tag,root), root);
187 return time_jump(tree, tree->TaggedDesc(node,target_tag),node) +
188 time_jump(tree, tree->TaggedFollBelow(node,target_tag,root), root);
// Prints the command-line synopsis to std::cout, using argv[0] as the program
// name.
// NOTE(review): the return statement is not visible in this excerpt.
// NOTE(review): the message spells the extensions as "file.{xml,.srx}", which
// reads as "file..srx"; main() tests for ".xml" and ".srx" -- possibly a typo
// in the help text.
195 int usage(char ** argv){
197 std::cout << "usage : " << argv[0] << " [-d] [-s] file.{xml,.srx} tagname\n";
203 int main(int argc, char ** argv){
204 unsigned int count1,count2;
205 unsigned char * tagname;
206 string arg,filename,ext;
207 bool disable_tc = false;
217 if (arg.compare("-d") == 0){
225 if (arg.compare("-s") == 0){
238 ext=(arg.substr(arg.size()-4,4));
239 if (ext.compare(".srx") == 0){
241 filename = arg.substr(0,arg.size()-4);
245 else if (ext.compare(".xml")==0) {
256 tagname = (unsigned char*) argv[i];
261 // The sample rate is no longer taken into account when loading
262 tree = XMLTree::Load((unsigned char*) filename.c_str(),64);
265 // Arguments: filename, sampling factor, index empty texts, disable tc
266 XMLDocShredder shredder(filename.c_str(),64,false,disable_tc);
267 shredder.processStartDocument("");
269 shredder.processEndDocument();
270 tree = (XMLTree *) shredder.storageIfc_->returnDocument();
272 filename = filename.substr(0,filename.size()-4).append(".srx");
274 int exists = stat(filename.c_str(),&stats);
276 std::cout << "Warning : indexed file " << filename << " exists, not overwriting\n";
279 tree->Save((unsigned char*) filename.substr(0,filename.size()-4).c_str());
284 catch (const std::exception& e){
285 std::cout << "Error during parsing : " << e.what() << "\n";
289 traversal(tree,tree->Root(),tagname);
294 PRINTSTATS(FirstChild);
295 PRINTSTATS(NextSibling);
297 PRINTSTATS(TaggedAncestor);
298 PRINTSTATS(TaggedChild);
300 PRINTSTATS(TaggedDesc);
301 PRINTSTATS(TaggedFoll);
302 PRINTSTATS(PrevText);
304 PRINTSTATS(NextText);
305 PRINTSTATS(ParentNode);
306 PRINTSTATS(PrevNode);
309 if (target_tag == -1){
310 std::cout << "Warning: tag " << tagname << " was not found in the document!\n"
311 << "Warning: not timing traversal and jumping functions\n";
316 count1 = time_traversal(tree,tree->Root());
317 STOPTIMER(FullTraversal);
319 count2 = time_jump(tree,tree->Root(),tree->Root());
320 STOPTIMER(JumpTraversal);
322 std::cout << "Full traversal found " << count1 << " " << tagname << " nodes\n";
323 PRINTSTATS(FullTraversal);
325 std::cout << "Jump traversal found " << count2 << " " << tagname << " nodes\n";
326 PRINTSTATS(JumpTraversal);