#include "Utils.h"
#include <sys/time.h>
#include <time.h>
+#include <sys/stat.h>
using std::cout;
using std::string;
using std::left;
using std::right;
-static clock_t tFirstChild = 0;
-static clock_t tNextSibling = 0;
-static clock_t tTaggedDesc = 0;
-static clock_t tTaggedFoll = 0;
-static clock_t tParentNode = 0;
-static clock_t tPrevNode = 0;
-static clock_t tTag = 0;
-static clock_t tMyText = 0;
-static clock_t tPrevText = 0;
-static clock_t tNextText = 0;
-static clock_t tFullTraversal = 0;
-static clock_t tJumpTraversal = 0;
+static double tFirstChild = 0;
+static double tNextSibling = 0;
+static double tParent = 0;
+static double tTaggedAncestor = 0;
+static double tTaggedChild = 0;
+static double tTaggedDesc = 0;
+static double tTaggedFoll = 0;
+static double tParentNode = 0;
+static double tPrevNode = 0;
+static double tTag = 0;
+static double tMyText = 0;
+static double tPrevText = 0;
+static double tNextText = 0;
+static double tDocIds = 0;
+
+static double tFullTraversal = 0;
+static double tJumpTraversal = 0;
static unsigned int cFirstChild = 0;
static unsigned int cNextSibling = 0;
+static unsigned int cParent = 0;
+static unsigned int cTaggedAncestor = 0;
+static unsigned int cTaggedChild = 0;
static unsigned int cTaggedDesc = 0;
static unsigned int cTaggedFoll = 0;
static unsigned int cParentNode = 0;
static unsigned int cMyText = 0;
static unsigned int cPrevText = 0;
static unsigned int cNextText = 0;
+static unsigned int cDocIds = 0;
+
static unsigned int cFullTraversal = 0;
static unsigned int cJumpTraversal = 0;
-static clock_t tmp;
+
+static struct timeval tmpv1;
+static struct timeval tmpv2;
static TagType target_tag = -1;
-#define STARTTIMER() (tmp= clock())
-#define STOPTIMER(x) do { (t##x) = (t##x) + (clock() - tmp); (c##x)= (c##x)+1; } while (0)
+#define STARTTIMER() (gettimeofday(&tmpv1,NULL))
+#define STOPTIMER(x) do { \
+ gettimeofday(&tmpv2,NULL); \
+ (t##x) = (t##x) + ((tmpv2.tv_sec - tmpv1.tv_sec) * 1000000.0 + \
+ (tmpv2.tv_usec - tmpv1.tv_usec))/1000.0; \
+ (c##x)= (c##x)+1; \
+ } while (0)
+
#define PRINTSTATS(x) do { \
- std::cout.width(11); \
+ std::cout.width(15); \
std::cout << std::left << #x; \
std::cout << " : "; \
std::cout.width(8); \
- std::cout << std::right << c##x << " calls,"; \
+ std::cout << std::right << c##x << " calls, "; \
std::cout.width(8); \
- std::cout << std::right << t##x << " cycles, total:"; \
- std::cout.width(5); \
- std::cout << std::right << ((t##x) *1000.00) /CLOCKS_PER_SEC \
+ std::cout << std::right << (t##x) \
<< " ms, mean: "; \
- std::cout.width(5); \
+ std::cout.width(8); \
std::cout << std::right \
- << (((t##x)* 1000.00) /CLOCKS_PER_SEC) / c##x \
+ << (t##x) *1.00 / c##x \
<< "\n"; \
} while (0)
-
void traversal(XMLTree * tree, treeNode node,unsigned char* targettagname){
treeNode res1,res2;
TagType tag;
DocID id1,id2,id3;
+ range rg;
const unsigned char * tagname;
if (node != NULLT){
+
STARTTIMER();
tag = tree->Tag(node);
STOPTIMER(Tag);
target_tag = tag;
};
STARTTIMER();
+ res1 = tree->Parent(node);
+ STOPTIMER(Parent);
+ /*
+ STARTTIMER();
+ res1 = tree->TaggedChild(node,0,tag);
+ STOPTIMER(TaggedChild);
+
+ STARTTIMER();
+ res1 = tree->TaggedAncestor(node,tag);
+ STOPTIMER(TaggedAncestor);
+ */
+ STARTTIMER();
res1 = tree->TaggedDesc(node,tag);
STOPTIMER(TaggedDesc);
STARTTIMER();
res1 = tree->TaggedFoll(node,tag);
STOPTIMER(TaggedFoll);
+
+ STARTTIMER();
+ rg = tree->DocIds(node);
+ STOPTIMER(DocIds);
STARTTIMER();
id1 = tree->MyText(node);
STARTTIMER();
res2 = tree->NextSibling(node);
STOPTIMER(NextSibling);
+
traversal(tree,res1,targettagname);
traversal(tree,res2,targettagname);
}
-unsigned int time_traversal(XMLTree *tree,treeNode node,unsigned int count){
+/* This simulates the run function of the automata */
+
+unsigned int time_traversal(XMLTree *tree,treeNode node){
TagType tag;
if (node != NULLT) {
cFullTraversal++;
tag = tree->Tag(node);
- if (tag == target_tag)
- count = count + 1;
- return time_traversal(tree,tree->NextSibling(node),
- time_traversal(tree,tree->FirstChild(node),count));
+ if (tag == target_tag)
+ return 1 +
+ time_traversal(tree,tree->FirstChild(node)) +
+ time_traversal(tree,tree->NextSibling(node));
+ else
+ return time_traversal(tree,tree->FirstChild(node)) +
+ time_traversal(tree,tree->NextSibling(node));
}
else
- return count;
+ return 0;
}
-
-unsigned int time_jump(XMLTree* tree, treeNode node,unsigned int count,treeNode root){
+/* This simulates the run function of the jumping automata*/
+unsigned int time_jump(XMLTree* tree, treeNode node,treeNode root){
TagType tag;
if (node != NULLT) {
cJumpTraversal++;
tag = tree->Tag(node);
if (tag == target_tag)
- count = count + 1;
- return time_jump(tree,
- tree->TaggedFollBelow(node,target_tag,root),
- time_jump(tree,
- tree->TaggedDesc(node,target_tag),
- count,
- node),
- root);
-
+ return 1 +
+ time_jump(tree, tree->TaggedDesc(node,target_tag),node) +
+ time_jump(tree, tree->TaggedFollBelow(node,target_tag,root), root);
+
+ else
+ return time_jump(tree, tree->TaggedDesc(node,target_tag),node) +
+ time_jump(tree, tree->TaggedFollBelow(node,target_tag,root), root);
}
else
- return count;
+ return 0;
}
+int usage(char ** argv){
+
+ std::cout << "usage : " << argv[0] << " [-d] [-s] file.{xml,.srx} tagname\n";
+ return 1;
+}
int main(int argc, char ** argv){
unsigned int count1,count2;
- unsigned char * tagname = (unsigned char *) "keyword";
+ unsigned char * tagname;
+ string arg,filename,ext;
+ bool disable_tc = false;
+ bool save = false;
+ bool srx;
+ XMLTree * tree;
+
+ int i = 1;
+ if ( i >= argc)
+ return usage(argv);
+
+ arg = argv[i];
+ if (arg.compare("-d") == 0){
+ disable_tc = true;
+ i++;
+ if ( i >= argc)
+ return usage(argv);
+ arg = argv[i];
+ };
- if (argc != 2){
- std::cout << "Usage : " << argv[0] << " filename (without .srx)\n";
- return 1;
+ if (arg.compare("-s") == 0){
+ save = true;
+ i++;
+ if ( i >= argc)
+ return usage(argv);
+ arg = argv[i];
};
- // The samplerate is not taken into account for loading anymore
- XMLTree * tree = XMLTree::Load((unsigned char*) argv[1],64);
+
+ // The filename
+ if (arg.size() < 4)
+ return usage(argv);
- traversal(tree,tree->Root(),tagname);
+ ext=(arg.substr(arg.size()-4,4));
+ if (ext.compare(".srx") == 0){
+ // must truncate
+ filename = arg.substr(0,arg.size()-4);
- STARTTIMER();
- count1 = time_traversal(tree,tree->Root(),0);
- STOPTIMER(FullTraversal);
+ srx = true;
+ }
+ else if (ext.compare(".xml")==0) {
+ filename = arg;
+ srx = false;
+ }
+ else
+ return usage(argv);
+ i++;
- count2 = time_jump(tree,tree->Root(),0,tree->Root());
- STOPTIMER(JumpTraversal);
+ if (i >= argc)
+ return usage(argv);
+
+ tagname = (unsigned char*) argv[i];
+
+
+ if (srx)
+ // The samplerate is not taken into account for loading anymore
+ tree = XMLTree::Load((unsigned char*) filename.c_str(),64);
+ else {
+ try {
+ //filename, sampling factor, index empty texts, disable tc
+ XMLDocShredder shredder(filename.c_str(),64,false,disable_tc);
+ shredder.processStartDocument("");
+ shredder.parse();
+ shredder.processEndDocument();
+ tree = (XMLTree *) shredder.storageIfc_->returnDocument();
+ if (save){
+ filename = filename.substr(0,filename.size()-4).append(".srx");
+ struct stat stats;
+ int exists = stat(filename.c_str(),&stats);
+ if(exists == 0) {
+ std::cout << "Warning : indexed file " << filename << " exists, not overwriting\n";
+ }
+ else {
+ tree->Save((unsigned char*) filename.substr(0,filename.size()-4).c_str());
+ };
+
+ };
+ }
+ catch (const std::exception& e){
+ std::cout << "Error during parsing : " << e.what() << "\n";
+ return 2;
+ };
+ };
+ traversal(tree,tree->Root(),tagname);
+
+
+
+ PRINTSTATS(Tag);
PRINTSTATS(FirstChild);
PRINTSTATS(NextSibling);
- PRINTSTATS(Tag);
+ PRINTSTATS(Parent);
+ PRINTSTATS(TaggedAncestor);
+ PRINTSTATS(TaggedChild);
+ PRINTSTATS(DocIds);
PRINTSTATS(TaggedDesc);
PRINTSTATS(TaggedFoll);
PRINTSTATS(PrevText);
PRINTSTATS(ParentNode);
PRINTSTATS(PrevNode);
std::cout << "\n";
+
+ if (target_tag == -1){
+ std::cout << "Warning: tag " << tagname << " was not found in the document!\n"
+ << "Warning: not timing traversal and jumping functions\n";
+ return 3;
+ };
+
+ STARTTIMER();
+ count1 = time_traversal(tree,tree->Root());
+ STOPTIMER(FullTraversal);
+
+ count2 = time_jump(tree,tree->Root(),tree->Root());
+ STOPTIMER(JumpTraversal);
+
std::cout << "Full traversal found " << count1 << " " << tagname << " nodes\n";
PRINTSTATS(FullTraversal);
std::cout << "\n";
return 0;
+
}