added VectorCount index and the MergeJoin over it
authorSebastian Maneth <seba@sophisto.(none)>
Sun, 4 Nov 2012 18:32:57 +0000 (19:32 +0100)
committerSebastian Maneth <seba@sophisto.(none)>
Sun, 4 Nov 2012 18:32:57 +0000 (19:32 +0100)
src/lexindex_stub.cpp

index 7e5eba6..ce18cd3 100644 (file)
@@ -27,17 +27,23 @@ void printVector(vector<int32_t> v){
     }
 }
 
     }
 }
 
+void printPairVector(vector<pair<int32_t,int32_t>> v){
+    vector<pair<int32_t,int32_t>>::iterator i=v.begin();
+  if (i!=v.end()) {
+    printf("[(%i,%i)", (*i).first, (*i).second);
+      for (++i; i!=v.end(); ++i)
+       printf(",(%i,%i)", (*i).first, (*i).second);
+      printf("]\n");
+    }
+}
+
 //define a type for the lexicographic index
 class lex_index {
 public:
 //define a type for the lexicographic index
 class lex_index {
 public:
-  //The tag IDs
   xml_tree::tag_t tag;
   xml_tree::tag_t tag2;
   xml_tree::tag_t tag;
   xml_tree::tag_t tag2;
-  //The text data
   vector<int32_t> tagVector;           
   vector<int32_t> tagVector;           
-  vector<int32_t>::iterator tagVectorIt;
   vector<int32_t> tag2Vector;           
   vector<int32_t> tag2Vector;           
-  vector<int32_t>::iterator tag2VectorIt;
   void preorderTraverse(xml_tree * tree, xml_tree::tag_t parent_tag, xml_tree::node_t node);
   void createIndex(xml_tree * tree);
   void print();
   void preorderTraverse(xml_tree * tree, xml_tree::tag_t parent_tag, xml_tree::node_t node);
   void createIndex(xml_tree * tree);
   void print();
@@ -45,15 +51,13 @@ public:
 
 void lex_index::preorderTraverse(xml_tree * tree, xml_tree::tag_t parent_tag, xml_tree::node_t node){  
   if (tree->tag(node)==tree->PCDATA_OPEN_TAG_ID)
 
 void lex_index::preorderTraverse(xml_tree * tree, xml_tree::tag_t parent_tag, xml_tree::node_t node){  
   if (tree->tag(node)==tree->PCDATA_OPEN_TAG_ID)
-  if (parent_tag==tag) tagVectorIt = tagVector.insert(tagVectorIt, tree->text_id(node));
-  else if (parent_tag==tag2) tag2VectorIt = tag2Vector.insert(tag2VectorIt, tree->text_id(node));
+  if (parent_tag==tag) tagVector.push_back(tree->text_id(node));
+  else if (parent_tag==tag2) tag2Vector.push_back(tree->text_id(node));
   if (tree->tag(tree->first_child(node))!=0) preorderTraverse(tree,tree->tag(node),tree->first_child(node));
   if (tree->tag(tree->next_sibling(node))!=0) preorderTraverse(tree,parent_tag,tree->next_sibling(node));
 }
 
 void lex_index::createIndex(xml_tree * tree){
   if (tree->tag(tree->first_child(node))!=0) preorderTraverse(tree,tree->tag(node),tree->first_child(node));
   if (tree->tag(tree->next_sibling(node))!=0) preorderTraverse(tree,parent_tag,tree->next_sibling(node));
 }
 
 void lex_index::createIndex(xml_tree * tree){
-  tagVectorIt=tagVector.begin();
-  tag2VectorIt=tag2Vector.begin();
   preorderTraverse(tree, 0, tree->first_child(tree->ROOT));
   sort(tagVector.begin(), tagVector.end(), myobject); 
   sort(tag2Vector.begin(), tag2Vector.end(), myobject); 
   preorderTraverse(tree, 0, tree->first_child(tree->ROOT));
   sort(tagVector.begin(), tagVector.end(), myobject); 
   sort(tag2Vector.begin(), tag2Vector.end(), myobject); 
@@ -66,36 +70,109 @@ void lex_index::print(){
   printVector(tag2Vector);
 }
 
   printVector(tag2Vector);
 }
 
+class lex_index_count : public lex_index {
+public:
+  vector<pair<int32_t,int32_t>> tcountVector;           
+  vector<pair<int32_t,int32_t>> tcount2Vector;    
+  void createCounts(xml_tree * tree);
+  void print();
+};
+
+void lex_index_count::createCounts(xml_tree * tree){
+  int k;
+  int i=0;
+  while((i+1)<tagVector.size())
+    {
+      k=1;
+      while(strcmp((const char*) tree->get_text(tagVector[i]),
+                  (const char*) tree->get_text(tagVector[i+1]))==0){ 
+       k++; 
+       tcountVector.push_back(make_pair(tagVector[i],1));
+       i++;
+      }
+      if(k>1) tcountVector[i-k+1]=make_pair((tcountVector[i-k+1]).first,k);
+      tcountVector.push_back(make_pair(tagVector[i],1));
+      i++;
+    }    
+  if(i<tagVector.size()) 
+    tcountVector.push_back(make_pair(tagVector[i],1));
+  i=0;
+  while((i+1)<tag2Vector.size())
+    {
+      k=1;
+      while(strcmp((const char*) tree->get_text(tag2Vector[i]),
+                  (const char*) tree->get_text(tag2Vector[i+1]))==0){ 
+       k++; 
+       tcount2Vector.push_back(make_pair(tag2Vector[i],1));
+       i++;
+      }
+      if(k>1) tcount2Vector[i-k+1]=make_pair((tcount2Vector[i-k+1]).first,k);
+      tcount2Vector.push_back(make_pair(tag2Vector[i],1));
+      i++;
+    }    
+  if(i<tag2Vector.size()) 
+    tcount2Vector.push_back(make_pair(tag2Vector[i],1));
+}
+
+void lex_index_count::print(){
+  printf("%i-vector: ", tag);
+  printPairVector(tcountVector);
+  printf("%i-vector: ", tag2);
+  printPairVector(tcount2Vector);
+}
+  
 class VectorMergeJoin {
 public: 
   xml_tree * tree;
 class VectorMergeJoin {
 public: 
   xml_tree * tree;
-  //The tag IDs to be semi-joined
-  const char * tagname1;
-  const char * tagname2;
   vector<int32_t> resultVector;
   void mergeJoin(vector<int32_t> v1, vector<int32_t> v2);
 };
 
 void VectorMergeJoin::mergeJoin(vector<int32_t> v1, vector<int32_t> v2){
   vector<int32_t> resultVector;
   void mergeJoin(vector<int32_t> v1, vector<int32_t> v2);
 };
 
 void VectorMergeJoin::mergeJoin(vector<int32_t> v1, vector<int32_t> v2){
-  vector<int32_t>::iterator i=resultVector.begin();
-  vector<int32_t>::iterator i1=v1.begin();
-  vector<int32_t>::iterator i2=v2.begin();
+  int i1=0;
+  int i2=0;
   int k;
   int k;
+  while((i1!=v1.size()) && (i2!=v2.size())){
+    k = strcmp((const char*) tree->get_text(v1[i1]),
+              (const char*) tree->get_text(v2[i2]));
+    if (k==0) 
+      {
+       resultVector.push_back(v1[i1]);
+       i1++; //advance left
+      }
+    else if (k<0) i1++; //advance left
+    else i2++; //advance right
+  }
+}
 
 
-  while((i1!=v1.end()) && (i2!=v2.end())){
-    k = strcmp((const char*) tree->get_text(*i1),
-              (const char*) tree->get_text(*i2));
+class VectorCountMergeJoin {
+public: 
+  xml_tree * tree;
+  vector<pair<int32_t,int32_t>> resultVector;
+  void mergeJoin(vector<pair<int32_t,int32_t>> v1, vector<pair<int32_t,int32_t>> v2);
+};
+                
+void VectorCountMergeJoin::mergeJoin(vector<pair<int32_t,int32_t>> v1, vector<pair<int32_t,int32_t>> v2){
+  int comp=0;
+  int i1=0;
+  int i2=0;
+  int k;
+  while((i1<v1.size()) && (i2<v2.size())){
+    k = strcmp((const char*) tree->get_text(v1[i1].first),
+              (const char*) tree->get_text(v2[i2].first));
+    comp++;
     if (k==0) 
       {
     if (k==0) 
       {
-       i = resultVector.insert( i, *i1 );
-       //advance left
-       i1++;
-       //advance right     
-       i2++;
+       resultVector.push_back(v1[i1]);
+       i1++; //advance left
       }
     else if (k<0) i1++; //advance left
       }
     else if (k<0) i1++; //advance left
+    //    else if (k<0) i1=i1+v1[i1].second; //advance left
     else i2++; //advance right
     else i2++; //advance right
+    //    else i2=i2+v2[i2].second; //advance right
+
   }
   }
+  printf("Number of comparisons during MergeJoin: %i\n",comp);
 }
 
 // class prefix_treeNode {
 }
 
 // class prefix_treeNode {
@@ -117,6 +194,11 @@ static lex_index*& LEXINDEX(value v)
   return Obj_val<lex_index*>(v);
 }
 
   return Obj_val<lex_index*>(v);
 }
 
+static lex_index_count*& LEXINDEXCOUNT(value v)
+{
+  return Obj_val<lex_index_count*>(v);
+}
+
 static xml_tree::node_t TREENODE(value i)
 {
   return static_cast<xml_tree::node_t>(Int_val(i));
 static xml_tree::node_t TREENODE(value i)
 {
   return static_cast<xml_tree::node_t>(Int_val(i));
@@ -132,33 +214,56 @@ ML_BINDING value caml_build_lex_index(value vtree, value vtag, value vtag2)
   if ((TAG(vtag)==-1) || (TAG(vtag2)==-1)) caml_failwith("<INVALID TAG>");
   CAMLparam2(vtree, vtag);
   CAMLlocal1(vindex);
   if ((TAG(vtag)==-1) || (TAG(vtag2)==-1)) caml_failwith("<INVALID TAG>");
   CAMLparam2(vtree, vtag);
   CAMLlocal1(vindex);
-  vindex = sxsi_alloc_custom<lex_index*>();
+  //  Version without count:
+  //  vindex = sxsi_alloc_custom<lex_index*>();
+  //  Version with counts:
+  vindex = sxsi_alloc_custom<lex_index_count*>();
   xml_tree * tree = XMLTREE(vtree);
   myobject.tree = tree;
 
   xml_tree * tree = XMLTREE(vtree);
   myobject.tree = tree;
 
-  //create the index
-  lex_index* mylindex = new lex_index();
+  //  create the index
+  //  Version without count:
+  //  lex_index* mylindex = new lex_index();
+  //  Version with count:
+  lex_index_count* mylindex = new lex_index_count();
   mylindex->tag = TAG(vtag);
   mylindex->tag2 = TAG(vtag2);
   mylindex->createIndex(tree);
 
   mylindex->tag = TAG(vtag);
   mylindex->tag2 = TAG(vtag2);
   mylindex->createIndex(tree);
 
+  //  printf("%i-vector: ", mylindex->tag);
+  //  printVector(mylindex->tagVector);
+
+  // Version with count:
+  mylindex->createCounts(tree);
+
   //do a join
   //do a join
-  VectorMergeJoin* myVMJoin = new VectorMergeJoin();
+  //  VectorMergeJoin* myVMJoin = new VectorMergeJoin();
+  //  myVMJoin->tree = tree;
+  //  myVMJoin->mergeJoin(mylindex->tagVector, mylindex->tag2Vector);
+  //  printf("Result-vector: ");
+  //  printVector(myVMJoin->resultVector);
+
+  VectorCountMergeJoin* myVMJoin = new VectorCountMergeJoin();
   myVMJoin->tree = tree;
   myVMJoin->tree = tree;
-  myVMJoin->mergeJoin(mylindex->tagVector, mylindex->tag2Vector);
+  myVMJoin->mergeJoin(mylindex->tcountVector, mylindex->tcount2Vector);
   printf("Result-vector: ");
   printf("Result-vector: ");
-  printVector(myVMJoin->resultVector);
+  printPairVector(myVMJoin->resultVector);
 
 
-  LEXINDEX(vindex)=mylindex;
+  //  LEXINDEX(vindex)=mylindex;
+  LEXINDEXCOUNT(vindex)=mylindex;
 }
 
 ML_BINDING value caml_print_lex_index(value vindex)
 {
   CAMLparam1(vindex);
 }
 
 ML_BINDING value caml_print_lex_index(value vindex)
 {
   CAMLparam1(vindex);
-  lex_index* index = LEXINDEX(vindex);
+  //  Version without count:
+  //  lex_index* index = LEXINDEX(vindex);
+  //  Version without count:
+  lex_index_count* index = LEXINDEXCOUNT(vindex);
 
   //Print the index to the terminal
   //  caml_failwith("print_lex_index not implemented");
 
   //Print the index to the terminal
   //  caml_failwith("print_lex_index not implemented");
+
   index->print();
   CAMLreturn (Val_unit);
 }
   index->print();
   CAMLreturn (Val_unit);
 }