-#include "XMLDocShredder.h"
-#include "XMLTree.h"
-#include "Utils.h"
-#include <sys/time.h>
-#include <time.h>
-#include <sys/stat.h>
-
-/* Advances the cursor `intern_src` by four elements and yields the four
- * elements just passed, combined most-significant-first (shifted by 24, 16,
- * 8 and 0 bits and added together).
- * NOTE(review): neither `intern_src` nor `uintnat` is declared in the visible
- * part of this file, and nothing visible here invokes the macro -- presumably
- * both names come from an included header; confirm before relying on it. */
-#define read32u() \
- (intern_src += 4, \
- ((uintnat)(intern_src[-4]) << 24) + (intern_src[-3] << 16) + \
- (intern_src[-2] << 8) + intern_src[-1])
-
-using std::cout;
-using std::string;
-using std::left;
-using std::right;
-
-/* Bookkeeping shared by the STARTTIMER/STOPTIMER/PRINTSTATS macros.
- * For every operation name X there is an accumulator tX (elapsed time in
- * milliseconds) and a counter cX that STOPTIMER(X) bumps by one.  The t/c
- * prefixes must stay: the macros build these names by token pasting. */
-
-/* --- individual XMLTree queries timed by traversal() --- */
-static double       tTag = 0;
-static unsigned int cTag = 0;
-static double       tParent = 0;
-static unsigned int cParent = 0;
-static double       tFirstChild = 0;
-static unsigned int cFirstChild = 0;
-static double       tNextSibling = 0;
-static unsigned int cNextSibling = 0;
-static double       tTaggedAncestor = 0;
-static unsigned int cTaggedAncestor = 0;
-static double       tTaggedChild = 0;
-static unsigned int cTaggedChild = 0;
-static double       tTaggedDesc = 0;
-static unsigned int cTaggedDesc = 0;
-static double       tTaggedFoll = 0;
-static unsigned int cTaggedFoll = 0;
-static double       tDocIds = 0;
-static unsigned int cDocIds = 0;
-static double       tMyText = 0;
-static unsigned int cMyText = 0;
-static double       tPrevText = 0;
-static unsigned int cPrevText = 0;
-static double       tNextText = 0;
-static unsigned int cNextText = 0;
-static double       tParentNode = 0;
-static unsigned int cParentNode = 0;
-static double       tPrevNode = 0;
-static unsigned int cPrevNode = 0;
-
-/* --- whole-document runs; these two counters are additionally incremented
- *     once per visited node by time_traversal() and time_jump() --- */
-static double       tFullTraversal = 0;
-static unsigned int cFullTraversal = 0;
-static double       tJumpTraversal = 0;
-static unsigned int cJumpTraversal = 0;
-
-/* Timestamps written by STARTTIMER() (tmpv1) and STOPTIMER() (tmpv2). */
-static struct timeval tmpv1;
-static struct timeval tmpv2;
-
-/* Tag identifier that time_traversal() and time_jump() count.  It stays at -1
- * until traversal() meets a node whose tag name equals the requested one;
- * main() treats a value still equal to -1 as "tag not found". */
-static TagType target_tag = -1;
-
-/* STARTTIMER() records the current wall-clock time in tmpv1.
- * STOPTIMER(X) records the current time in tmpv2, adds the interval
- * tmpv1..tmpv2, converted from microseconds to milliseconds, to tX and
- * increments cX.  tmpv1 is left untouched, so a second STOPTIMER without a
- * new STARTTIMER() measures from the same starting point. */
-#define STARTTIMER() (gettimeofday(&tmpv1, NULL))
-#define STOPTIMER(x)                                             \
-  do {                                                           \
-    gettimeofday(&tmpv2, NULL);                                  \
-    (t##x) += ((tmpv2.tv_sec - tmpv1.tv_sec) * 1000000.0 +       \
-               (tmpv2.tv_usec - tmpv1.tv_usec)) / 1000.0;        \
-    ++(c##x);                                                    \
-  } while (0)
-
-/* Prints one statistics line for operation X on std::cout:
- *   "<name, left-aligned in 15 columns> : <cX> calls, <tX> ms, mean: <tX/cX>"
- * with each number right-aligned in 8 columns.
- * When the operation was never timed (cX == 0) the mean is reported as 0
- * rather than dividing by zero, which used to print NaN. */
-#define PRINTSTATS(x)                                              \
-  do {                                                             \
-    std::cout.width(15);                                           \
-    std::cout << std::left << #x;                                  \
-    std::cout << " : ";                                            \
-    std::cout.width(8);                                            \
-    std::cout << std::right << (c##x) << " calls, ";               \
-    std::cout.width(8);                                            \
-    std::cout << std::right << (t##x)                              \
-              << " ms, mean: ";                                    \
-    std::cout.width(8);                                            \
-    std::cout << std::right                                        \
-              << ((c##x) != 0 ? (t##x) * 1.00 / (c##x) : 0.0)      \
-              << "\n";                                             \
-  } while (0)
-
-/* Times the XMLTree navigation primitives on every node reachable from
- * `node`.  Each query is bracketed by STARTTIMER()/STOPTIMER(X), so its
- * elapsed time and call count accumulate in tX/cX.  As a side effect, while
- * target_tag is still -1, a node whose tag name equals `targettagname`
- * stores its tag in target_tag.
- *
- * tree           document being measured
- * node           current node; NULLT ends the recursion
- * targettagname  NUL-terminated tag name compared with strcmp
- *
- * TaggedChild and TaggedAncestor are deliberately not timed here. */
-void traversal(XMLTree * tree, treeNode node,unsigned char* targettagname){
-  if (node == NULLT)
-    return;
-
-  treeNode scratch;              // results that are computed only to be timed
-  treeNode child, sibling;       // where the walk continues
-  TagType  nodeTag;
-  DocID    ownText, prevText, nextText;
-  range    textRange;
-
-  STARTTIMER();
-  nodeTag = tree->Tag(node);
-  STOPTIMER(Tag);
-
-  // Resolve the requested name to a tag identifier once it is encountered.
-  if (target_tag == -1) {
-    const unsigned char * nodeTagName = tree->GetTagNameByRef(nodeTag);
-    if (!strcmp( (char*) nodeTagName, (char*) targettagname))
-      target_tag = nodeTag;
-  }
-
-  STARTTIMER();
-  scratch = tree->Parent(node);
-  STOPTIMER(Parent);
-
-  STARTTIMER();
-  scratch = tree->TaggedDesc(node,nodeTag);
-  STOPTIMER(TaggedDesc);
-
-  STARTTIMER();
-  scratch = tree->TaggedFoll(node,nodeTag);
-  STOPTIMER(TaggedFoll);
-
-  STARTTIMER();
-  textRange = tree->DocIds(node);
-  STOPTIMER(DocIds);
-
-  STARTTIMER();
-  ownText = tree->MyText(node);
-  STOPTIMER(MyText);
-
-  STARTTIMER();
-  prevText = tree->PrevText(node);
-  STOPTIMER(PrevText);
-
-  STARTTIMER();
-  nextText = tree->NextText(node);
-  STOPTIMER(NextText);
-
-  // The largest of the three text identifiers feeds the text-to-node queries.
-  ownText = max(ownText, max(prevText,nextText));
-
-  STARTTIMER();
-  scratch = tree->ParentNode(ownText);
-  STOPTIMER(ParentNode);
-
-  STARTTIMER();
-  scratch = tree->PrevNode(ownText);
-  STOPTIMER(PrevNode);
-
-  STARTTIMER();
-  child = tree->FirstChild(node);
-  STOPTIMER(FirstChild);
-
-  STARTTIMER();
-  sibling = tree->NextSibling(node);
-  STOPTIMER(NextSibling);
-
-  // Depth first: the whole child subtree before the following siblings.
-  traversal(tree,child,targettagname);
-  traversal(tree,sibling,targettagname);
-}
-
-/* Simulates the run function of the automata: visits every node reachable
- * from `node` through FirstChild/NextSibling, increments cFullTraversal once
- * per visited node and returns how many of them carry target_tag.
- * A NULLT node contributes 0. */
-unsigned int time_traversal(XMLTree *tree,treeNode node){
-  if (node == NULLT)
-    return 0;
-
-  cFullTraversal++;
-  const TagType nodeTag = tree->Tag(node);
-  const unsigned int self = (nodeTag == target_tag) ? 1 : 0;
-
-  return self +
-         time_traversal(tree,tree->FirstChild(node)) +
-         time_traversal(tree,tree->NextSibling(node));
-}
-
-/* Simulates the run function of the jumping automata: rather than stepping
- * through every node it moves from `node` to TaggedDesc(node, target_tag) and
- * to TaggedFollBelow(node, target_tag, root).  cJumpTraversal is incremented
- * once per visited node; the return value is the number of visited nodes
- * whose tag equals target_tag.  The descendant jump passes `node` as the new
- * root, the following jump keeps the current `root`. */
-unsigned int time_jump(XMLTree* tree, treeNode node,treeNode root){
-  if (node == NULLT)
-    return 0;
-
-  cJumpTraversal++;
-  const TagType nodeTag = tree->Tag(node);
-  const unsigned int self = (nodeTag == target_tag) ? 1 : 0;
-
-  return self +
-         time_jump(tree, tree->TaggedDesc(node,target_tag),node) +
-         time_jump(tree, tree->TaggedFollBelow(node,target_tag,root), root);
-}
-
-
-/* Prints the command-line synopsis on standard output.
- *
- * argv  argument vector; only argv[0] (the program name) is read.
- * Returns 1 unconditionally, so callers can write `return usage(argv);` to
- * leave main() with a failure status. */
-int usage(char ** argv){
-  // Accepted extensions are ".xml" and ".srx"; the alternation therefore reads
-  // {xml,srx} (the former "{xml,.srx}" spelled out as "file..srx").
-  std::cout << "usage : " << argv[0] << " [-d] [-s] file.{xml,srx} tagname\n";
-  return 1;
-}
-
-
-/* Entry point of the XMLTree timing benchmark.
- *
- * Command line:  [-d] [-s] file.xml|file.srx tagname
- *   -d  passes disable_tc = true to XMLDocShredder
- *   -s  after parsing an .xml input, saves the tree under the same name with
- *       an .srx extension unless such a file already exists
- * The two options may be given in either order.
- *
- * Steps: load (.srx) or parse (.xml) the document, time every navigation
- * primitive with traversal(), print the per-primitive statistics, then time
- * the full and the jumping traversal for `tagname`.
- *
- * Returns 0 on success, 1 after printing the usage message, 2 when parsing
- * throws a std::exception, 3 when `tagname` does not occur in the document. */
-int main(int argc, char ** argv){
-  unsigned int count1,count2;
-  unsigned char * tagname;
-  string arg,filename,ext;
-  bool disable_tc = false;
-  bool save = false;
-  bool srx;
-  XMLTree * tree;
-
-  int i = 1;
-  if ( i >= argc)
-    return usage(argv);
-  arg = argv[i];
-
-  // Leading options; every option must still be followed by another argument.
-  while (arg.compare("-d") == 0 || arg.compare("-s") == 0){
-    if (arg.compare("-d") == 0)
-      disable_tc = true;
-    else
-      save = true;
-    i++;
-    if ( i >= argc)
-      return usage(argv);
-    arg = argv[i];
-  }
-
-  // The filename: it must be long enough to carry a four-character extension
-  if (arg.size() < 4)
-    return usage(argv);
-
-  ext=(arg.substr(arg.size()-4,4));
-  if (ext.compare(".srx") == 0){
-    // must truncate: Load is given the name without the extension
-    filename = arg.substr(0,arg.size()-4);
-    srx = true;
-  }
-  else if (ext.compare(".xml")==0) {
-    filename = arg;
-    srx = false;
-  }
-  else
-    return usage(argv);
-  i++;
-
-  if (i >= argc)
-    return usage(argv);
-
-  tagname = (unsigned char*) argv[i];
-
-  if (srx)
-    // The samplerate is not taken into account for loading anymore
-    tree = XMLTree::Load((unsigned char*) filename.c_str(),64);
-  else {
-    try {
-      //filename, sampling factor, index empty texts, disable tc
-      XMLDocShredder shredder(filename.c_str(),64,false,disable_tc);
-      shredder.processStartDocument("");
-      shredder.parse();
-      shredder.processEndDocument();
-      tree = (XMLTree *) shredder.storageIfc_->returnDocument();
-      if (save){
-        // "doc.xml" -> "doc.srx": the name checked for existence below
-        filename = filename.substr(0,filename.size()-4).append(".srx");
-        struct stat stats;
-        int exists = stat(filename.c_str(),&stats);
-        if(exists == 0) {
-          std::cout << "Warning : indexed file " << filename << " exists, not overwriting\n";
-        }
-        else {
-          // Save receives the name without ".srx".
-          // NOTE(review): presumably Save appends the extension itself,
-          // mirroring Load being given the truncated name -- confirm.
-          tree->Save((unsigned char*) filename.substr(0,filename.size()-4).c_str());
-        }
-      }
-    }
-    catch (const std::exception& e){
-      std::cout << "Error during parsing : " << e.what() << "\n";
-      return 2;
-    }
-  }
-
-  // Times each primitive on every node and resolves target_tag on the way.
-  traversal(tree,tree->Root(),tagname);
-
-  PRINTSTATS(Tag);
-  PRINTSTATS(FirstChild);
-  PRINTSTATS(NextSibling);
-  PRINTSTATS(Parent);
-  PRINTSTATS(TaggedAncestor);
-  PRINTSTATS(TaggedChild);
-  PRINTSTATS(DocIds);
-  PRINTSTATS(TaggedDesc);
-  PRINTSTATS(TaggedFoll);
-  PRINTSTATS(PrevText);
-  PRINTSTATS(MyText);
-  PRINTSTATS(NextText);
-  PRINTSTATS(ParentNode);
-  PRINTSTATS(PrevNode);
-  std::cout << "\n";
-
-  if (target_tag == -1){
-    std::cout << "Warning: tag " << tagname << " was not found in the document!\n"
-              << "Warning: not timing traversal and jumping functions\n";
-    return 3;
-  }
-
-  STARTTIMER();
-  count1 = time_traversal(tree,tree->Root());
-  STOPTIMER(FullTraversal);
-
-  // Restart the clock: STOPTIMER leaves the start timestamp untouched, so
-  // without this the jump figure would also contain the full traversal.
-  STARTTIMER();
-  count2 = time_jump(tree,tree->Root(),tree->Root());
-  STOPTIMER(JumpTraversal);
-
-  // Both "calls" figures are the number of visited nodes plus the one
-  // increment performed by STOPTIMER itself.
-  std::cout << "Full traversal found " << count1 << " " << tagname << " nodes\n";
-  PRINTSTATS(FullTraversal);
-  std::cout << "\n";
-  std::cout << "Jump traversal found " << count2 << " " << tagname << " nodes\n";
-  PRINTSTATS(JumpTraversal);
-
-  return 0;
-}