}
}
+void printPairVector(vector<pair<int32_t,int32_t>> v){
+ vector<pair<int32_t,int32_t>>::iterator i=v.begin();
+ if (i!=v.end()) {
+ printf("[(%i,%i)", (*i).first, (*i).second);
+ for (++i; i!=v.end(); ++i)
+ printf(",(%i,%i)", (*i).first, (*i).second);
+ printf("]\n");
+ }
+}
+
//define a type for the lexicographic index
class lex_index {
public:
- //The tag IDs
xml_tree::tag_t tag;
xml_tree::tag_t tag2;
- //The text data
vector<int32_t> tagVector;
- vector<int32_t>::iterator tagVectorIt;
vector<int32_t> tag2Vector;
- vector<int32_t>::iterator tag2VectorIt;
void preorderTraverse(xml_tree * tree, xml_tree::tag_t parent_tag, xml_tree::node_t node);
void createIndex(xml_tree * tree);
void print();
void lex_index::preorderTraverse(xml_tree * tree, xml_tree::tag_t parent_tag, xml_tree::node_t node){
if (tree->tag(node)==tree->PCDATA_OPEN_TAG_ID)
- if (parent_tag==tag) tagVectorIt = tagVector.insert(tagVectorIt, tree->text_id(node));
- else if (parent_tag==tag2) tag2VectorIt = tag2Vector.insert(tag2VectorIt, tree->text_id(node));
+ if (parent_tag==tag) tagVector.push_back(tree->text_id(node));
+ else if (parent_tag==tag2) tag2Vector.push_back(tree->text_id(node));
if (tree->tag(tree->first_child(node))!=0) preorderTraverse(tree,tree->tag(node),tree->first_child(node));
if (tree->tag(tree->next_sibling(node))!=0) preorderTraverse(tree,parent_tag,tree->next_sibling(node));
}
void lex_index::createIndex(xml_tree * tree){
- tagVectorIt=tagVector.begin();
- tag2VectorIt=tag2Vector.begin();
preorderTraverse(tree, 0, tree->first_child(tree->ROOT));
sort(tagVector.begin(), tagVector.end(), myobject);
sort(tag2Vector.begin(), tag2Vector.end(), myobject);
printVector(tag2Vector);
}
+class lex_index_count : public lex_index {
+public:
+ vector<pair<int32_t,int32_t>> tcountVector;
+ vector<pair<int32_t,int32_t>> tcount2Vector;
+ void createCounts(xml_tree * tree);
+ void print();
+};
+
+void lex_index_count::createCounts(xml_tree * tree){
+ int k;
+ int i=0;
+ while((i+1)<tagVector.size())
+ {
+ k=1;
+ while(strcmp((const char*) tree->get_text(tagVector[i]),
+ (const char*) tree->get_text(tagVector[i+1]))==0){
+ k++;
+ tcountVector.push_back(make_pair(tagVector[i],1));
+ i++;
+ }
+ if(k>1) tcountVector[i-k+1]=make_pair((tcountVector[i-k+1]).first,k);
+ tcountVector.push_back(make_pair(tagVector[i],1));
+ i++;
+ }
+ if(i<tagVector.size())
+ tcountVector.push_back(make_pair(tagVector[i],1));
+ i=0;
+ while((i+1)<tag2Vector.size())
+ {
+ k=1;
+ while(strcmp((const char*) tree->get_text(tag2Vector[i]),
+ (const char*) tree->get_text(tag2Vector[i+1]))==0){
+ k++;
+ tcount2Vector.push_back(make_pair(tag2Vector[i],1));
+ i++;
+ }
+ if(k>1) tcount2Vector[i-k+1]=make_pair((tcount2Vector[i-k+1]).first,k);
+ tcount2Vector.push_back(make_pair(tag2Vector[i],1));
+ i++;
+ }
+ if(i<tag2Vector.size())
+ tcount2Vector.push_back(make_pair(tag2Vector[i],1));
+}
+
+void lex_index_count::print(){
+ printf("%i-vector: ", tag);
+ printPairVector(tcountVector);
+ printf("%i-vector: ", tag2);
+ printPairVector(tcount2Vector);
+}
+
class VectorMergeJoin {
public:
xml_tree * tree;
- //The tag IDs to be semi-joined
- const char * tagname1;
- const char * tagname2;
vector<int32_t> resultVector;
void mergeJoin(vector<int32_t> v1, vector<int32_t> v2);
};
void VectorMergeJoin::mergeJoin(vector<int32_t> v1, vector<int32_t> v2){
- vector<int32_t>::iterator i=resultVector.begin();
- vector<int32_t>::iterator i1=v1.begin();
- vector<int32_t>::iterator i2=v2.begin();
+ int i1=0;
+ int i2=0;
int k;
+ while((i1!=v1.size()) && (i2!=v2.size())){
+ k = strcmp((const char*) tree->get_text(v1[i1]),
+ (const char*) tree->get_text(v2[i2]));
+ if (k==0)
+ {
+ resultVector.push_back(v1[i1]);
+ i1++; //advance left
+ }
+ else if (k<0) i1++; //advance left
+ else i2++; //advance right
+ }
+}
- while((i1!=v1.end()) && (i2!=v2.end())){
- k = strcmp((const char*) tree->get_text(*i1),
- (const char*) tree->get_text(*i2));
+class VectorCountMergeJoin {
+public:
+ xml_tree * tree;
+ vector<pair<int32_t,int32_t>> resultVector;
+ void mergeJoin(vector<pair<int32_t,int32_t>> v1, vector<pair<int32_t,int32_t>> v2);
+};
+
+void VectorCountMergeJoin::mergeJoin(vector<pair<int32_t,int32_t>> v1, vector<pair<int32_t,int32_t>> v2){
+ int comp=0;
+ int i1=0;
+ int i2=0;
+ int k;
+ while((i1<v1.size()) && (i2<v2.size())){
+ k = strcmp((const char*) tree->get_text(v1[i1].first),
+ (const char*) tree->get_text(v2[i2].first));
+ comp++;
if (k==0)
{
- i = resultVector.insert( i, *i1 );
- //advance left
- i1++;
- //advance right
- i2++;
+ resultVector.push_back(v1[i1]);
+ i1++; //advance left
}
else if (k<0) i1++; //advance left
+ // else if (k<0) i1=i1+v1[i1].second; //advance left
else i2++; //advance right
+ // else i2=i2+v2[i2].second; //advance right
+
}
+ printf("Number of comparisons during MergeJoin: %i\n",comp);
}
// class prefix_treeNode {
return Obj_val<lex_index*>(v);
}
+static lex_index_count*& LEXINDEXCOUNT(value v)
+{
+ return Obj_val<lex_index_count*>(v);
+}
+
static xml_tree::node_t TREENODE(value i)
{
return static_cast<xml_tree::node_t>(Int_val(i));
if ((TAG(vtag)==-1) || (TAG(vtag2)==-1)) caml_failwith("<INVALID TAG>");
CAMLparam2(vtree, vtag);
CAMLlocal1(vindex);
- vindex = sxsi_alloc_custom<lex_index*>();
+ // Version without count:
+ // vindex = sxsi_alloc_custom<lex_index*>();
+ // Version with counts:
+ vindex = sxsi_alloc_custom<lex_index_count*>();
xml_tree * tree = XMLTREE(vtree);
myobject.tree = tree;
- //create the index
- lex_index* mylindex = new lex_index();
+ // create the index
+ // Version without count:
+ // lex_index* mylindex = new lex_index();
+ // Version with count:
+ lex_index_count* mylindex = new lex_index_count();
mylindex->tag = TAG(vtag);
mylindex->tag2 = TAG(vtag2);
mylindex->createIndex(tree);
+ // printf("%i-vector: ", mylindex->tag);
+ // printVector(mylindex->tagVector);
+
+ // Version with count:
+ mylindex->createCounts(tree);
+
//do a join
- VectorMergeJoin* myVMJoin = new VectorMergeJoin();
+ // VectorMergeJoin* myVMJoin = new VectorMergeJoin();
+ // myVMJoin->tree = tree;
+ // myVMJoin->mergeJoin(mylindex->tagVector, mylindex->tag2Vector);
+ // printf("Result-vector: ");
+ // printVector(myVMJoin->resultVector);
+
+ VectorCountMergeJoin* myVMJoin = new VectorCountMergeJoin();
myVMJoin->tree = tree;
- myVMJoin->mergeJoin(mylindex->tagVector, mylindex->tag2Vector);
+ myVMJoin->mergeJoin(mylindex->tcountVector, mylindex->tcount2Vector);
printf("Result-vector: ");
- printVector(myVMJoin->resultVector);
+ printPairVector(myVMJoin->resultVector);
- LEXINDEX(vindex)=mylindex;
+ // LEXINDEX(vindex)=mylindex;
+ LEXINDEXCOUNT(vindex)=mylindex;
}
ML_BINDING value caml_print_lex_index(value vindex)
{
CAMLparam1(vindex);
- lex_index* index = LEXINDEX(vindex);
+ // Version without count:
+ // lex_index* index = LEXINDEX(vindex);
+ // Version without count:
+ lex_index_count* index = LEXINDEXCOUNT(vindex);
//Print the index to the terminal
// caml_failwith("print_lex_index not implemented");
+
index->print();
CAMLreturn (Val_unit);
}