1 #include "XMLDocShredder.h"
10 ((uintnat)(intern_src[-4]) << 24) + (intern_src[-3] << 16) + \
11 (intern_src[-2] << 8) + intern_src[-1])
// Accumulated elapsed time, in milliseconds, for each timed XMLTree
// operation. STOPTIMER(x) adds the most recently measured interval to t<x>.
18 static double tFirstChild = 0;
19 static double tNextSibling = 0;
20 static double tParent = 0;
21 static double tTaggedAncestor = 0;
22 static double tTaggedChild = 0;
23 static double tTaggedDesc = 0;
24 static double tTaggedFoll = 0;
25 static double tParentNode = 0;
26 static double tPrevNode = 0;
27 static double tTag = 0;
28 static double tMyText = 0;
29 static double tPrevText = 0;
30 static double tNextText = 0;
31 static double tDocIds = 0;
// Accumulated elapsed time for the two whole-tree walks that main() times
// (time_traversal and time_jump).
33 static double tFullTraversal = 0;
34 static double tJumpTraversal = 0;
// Per-operation counters. PRINTSTATS(x) reports c<x> as the number of calls
// and uses it as the divisor when printing the mean time.
36 static unsigned int cFirstChild = 0;
37 static unsigned int cNextSibling = 0;
38 static unsigned int cParent = 0;
39 static unsigned int cTaggedAncestor = 0;
40 static unsigned int cTaggedChild = 0;
41 static unsigned int cTaggedDesc = 0;
42 static unsigned int cTaggedFoll = 0;
43 static unsigned int cParentNode = 0;
44 static unsigned int cPrevNode = 0;
45 static unsigned int cTag = 0;
46 static unsigned int cMyText = 0;
47 static unsigned int cPrevText = 0;
48 static unsigned int cNextText = 0;
49 static unsigned int cDocIds = 0;
51 static unsigned int cFullTraversal = 0;
52 static unsigned int cJumpTraversal = 0;
// Scratch timestamps: STARTTIMER() fills tmpv1, STOPTIMER(x) fills tmpv2.
55 static struct timeval tmpv1;
56 static struct timeval tmpv2;
// Tag identifier being searched for. The value -1 is tested elsewhere in this
// file as "no matching tag has been resolved yet".
58 static TagType target_tag = -1;
// Stores the current time of day in tmpv1 as the start of a measurement.
60 #define STARTTIMER() (gettimeofday(&tmpv1,NULL))
// Stores the current time of day in tmpv2 and adds the interval elapsed since
// the last STARTTIMER() to t<x>. The interval is computed in microseconds
// (seconds * 1000000.0 + microseconds) and divided by 1000.0, so t<x> is kept
// in milliseconds.
// NOTE(review): both macros share the single tmpv1/tmpv2 pair, so a
// measurement must be stopped before the next one is started.
61 #define STOPTIMER(x) do { \
62 gettimeofday(&tmpv2,NULL); \
63 (t##x) = (t##x) + ((tmpv2.tv_sec - tmpv1.tv_sec) * 1000000.0 + \
64 (tmpv2.tv_usec - tmpv1.tv_usec))/1000.0; \
// Prints one statistics entry for operation x on std::cout: the operation
// name (stringified macro argument) left-aligned in a 15-character field,
// then the call counter c<x>, the accumulated time t<x>, and the mean
// t<x> / c<x>, each right-aligned.
// NOTE(review): c<x> is not checked for zero before the division, so an
// operation that was never counted prints a non-finite mean - confirm that
// this is acceptable.
68 #define PRINTSTATS(x) do { \
69 std::cout.width(15); \
70 std::cout << std::left << #x; \
73 std::cout << std::right << c##x << " calls, "; \
75 std::cout << std::right << (t##x) \
78 std::cout << std::right \
79 << (t##x) *1.00 / c##x \
// Exercises the XMLTree accessors on `node` so that the STOPTIMER calls can
// accumulate their cost, then recurses into the node's first child and next
// sibling.
//
// tree          - tree whose accessors are being timed.
// node          - node currently being visited.
// targettagname - tag name compared (with strcmp) against the name of this
//                 node's tag while target_tag is still -1.
//
// res1 is overwritten by each successive accessor call; only the FirstChild
// result (res1) and the NextSibling result (res2) are used afterwards.
83 void traversal(XMLTree * tree, treeNode node,unsigned char* targettagname){
88 const unsigned char * tagname;
92 tag = tree->Tag(node);
// The tag name is only looked up while no target tag has been resolved.
94 if (target_tag == -1){
95 tagname = tree->GetTagNameByRef(tag);
96 if (strcmp( (char*) tagname, (char*) targettagname) == 0)
100 res1 = tree->Parent(node);
// Tag-based navigation, each queried with the current node's own tag.
104 res1 = tree->TaggedChild(node,0,tag);
105 STOPTIMER(TaggedChild);
108 res1 = tree->TaggedAncestor(node,tag);
109 STOPTIMER(TaggedAncestor);
112 res1 = tree->TaggedDesc(node,tag);
113 STOPTIMER(TaggedDesc);
116 res1 = tree->TaggedFoll(node,tag);
117 STOPTIMER(TaggedFoll);
120 rg = tree->DocIds(node);
124 id1 = tree->MyText(node);
128 id2 = tree->PrevText(node);
132 id3 = tree->NextText(node);
// Keep the largest of the three identifiers returned by MyText, PrevText and
// NextText as the argument for ParentNode and PrevNode below.
135 id1 = max(id1, max(id2,id3));
138 res1 = tree->ParentNode(id1);
139 STOPTIMER(ParentNode);
142 res1 = tree->PrevNode(id1);
// First child and next sibling are stored separately because both are
// recursed into.
146 res1 = tree->FirstChild(node);
147 STOPTIMER(FirstChild);
150 res2 = tree->NextSibling(node);
151 STOPTIMER(NextSibling);
153 traversal(tree,res1,targettagname);
154 traversal(tree,res2,targettagname);
160 /* This simulates the run function of the automata */
// Walks the tree from `node` through FirstChild and NextSibling only, reading
// the tag of each visited node and comparing it with target_tag.
// Returns the sum of the two recursive results as an unsigned int.
// NOTE(review): both branches of the tag test recurse identically; the
// matching branch presumably adds one to the sum, but that line is not
// visible here - confirm.
162 unsigned int time_traversal(XMLTree *tree,treeNode node){
166 tag = tree->Tag(node);
167 if (tag == target_tag)
169 time_traversal(tree,tree->FirstChild(node)) +
170 time_traversal(tree,tree->NextSibling(node));
172 return time_traversal(tree,tree->FirstChild(node)) +
173 time_traversal(tree,tree->NextSibling(node));
180 /* This simulates the run function of the jumping automata*/
// Walks the tree from `node` moving only to the nodes returned by
// TaggedDesc(node, target_tag) and TaggedFollBelow(node, target_tag, root),
// and returns the sum of the two recursive results.
//
// root - passed to TaggedFollBelow as its third argument. The TaggedDesc
//        recursion uses the current node as the new root, while the
//        TaggedFollBelow recursion keeps the caller's root.
// NOTE(review): both branches of the tag test recurse identically; the
// matching branch presumably adds one to the sum, but that line is not
// visible here - confirm.
181 unsigned int time_jump(XMLTree* tree, treeNode node,treeNode root){
185 tag = tree->Tag(node);
186 if (tag == target_tag)
188 time_jump(tree, tree->TaggedDesc(node,target_tag),node) +
189 time_jump(tree, tree->TaggedFollBelow(node,target_tag,root), root);
192 return time_jump(tree, tree->TaggedDesc(node,target_tag),node) +
193 time_jump(tree, tree->TaggedFollBelow(node,target_tag,root), root);
// Prints the command-line synopsis to std::cout, using argv[0] as the
// program name. The returned int is set on a line not visible here.
200 int usage(char ** argv){
202 std::cout << "usage : " << argv[0] << " [-d] [-s] file.{xml,.srx} tagname\n";
// Entry point of the timing harness. From the lines visible here it:
//   1. recognises the "-d" and "-s" options, a file name ending in ".srx" or
//      ".xml", and a tag name;
//   2. obtains an XMLTree, either with XMLTree::Load or by running an
//      XMLDocShredder over the file and saving the result;
//   3. runs traversal() from the root and prints the per-operation statistics;
//   4. unless target_tag is still -1, times time_traversal() and time_jump()
//      and prints how many nodes each reported.
208 int main(int argc, char ** argv){
209 unsigned int count1,count2;
210 unsigned char * tagname;
211 string arg,filename,ext;
212 bool disable_tc = false;
222 if (arg.compare("-d") == 0){
230 if (arg.compare("-s") == 0){
// NOTE(review): arg.size()-4 wraps around for an argument shorter than four
// characters, in which case substr throws std::out_of_range - confirm that a
// length check or an enclosing handler exists on the lines not visible here.
243 ext=(arg.substr(arg.size()-4,4));
244 if (ext.compare(".srx") == 0){
// The stored file name drops the four-character extension.
246 filename = arg.substr(0,arg.size()-4);
250 else if (ext.compare(".xml")==0) {
261 tagname = (unsigned char*) argv[i];
266 // The samplerate is not taken into account for loading anymore
267 tree = XMLTree::Load((unsigned char*) filename.c_str(),64);
270 //filename, sampling factor, index empty texts, disable tc
271 XMLDocShredder shredder(filename.c_str(),64,false,disable_tc);
272 shredder.processStartDocument("");
274 shredder.processEndDocument();
275 tree = (XMLTree *) shredder.storageIfc_->returnDocument();
// Replace the last four characters of the file name with ".srx" and use stat
// to see whether that file is already present.
277 filename = filename.substr(0,filename.size()-4).append(".srx");
279 int exists = stat(filename.c_str(),&stats);
281 std::cout << "Warning : indexed file " << filename << " exists, not overwriting\n";
// Save is given the name without its last four characters.
284 tree->Save((unsigned char*) filename.substr(0,filename.size()-4).c_str());
289 catch (const std::exception& e){
290 std::cout << "Error during parsing : " << e.what() << "\n";
// Time the individual accessors over the tree, then report them.
294 traversal(tree,tree->Root(),tagname);
299 PRINTSTATS(FirstChild);
300 PRINTSTATS(NextSibling);
302 PRINTSTATS(TaggedAncestor);
303 PRINTSTATS(TaggedChild);
305 PRINTSTATS(TaggedDesc);
306 PRINTSTATS(TaggedFoll);
307 PRINTSTATS(PrevText);
309 PRINTSTATS(NextText);
310 PRINTSTATS(ParentNode);
311 PRINTSTATS(PrevNode);
// target_tag still at -1 here is reported as the tag not having been found.
314 if (target_tag == -1){
315 std::cout << "Warning: tag " << tagname << " was not found in the document!\n"
316 << "Warning: not timing traversal and jumping functions\n";
// Time both counting walks from the root and report their node counts.
321 count1 = time_traversal(tree,tree->Root());
322 STOPTIMER(FullTraversal);
324 count2 = time_jump(tree,tree->Root(),tree->Root());
325 STOPTIMER(JumpTraversal);
327 std::cout << "Full traversal found " << count1 << " " << tagname << " nodes\n";
328 PRINTSTATS(FullTraversal);
330 std::cout << "Jump traversal found " << count2 << " " << tagname << " nodes\n";
331 PRINTSTATS(JumpTraversal);