From 04639fe524ee20f7f84c8b08387312d714c9bd56 Mon Sep 17 00:00:00 2001 From: kim Date: Thu, 30 Apr 2009 14:25:16 +0000 Subject: [PATCH] Cleaned up every thing, prepared to remove deprecated interface. git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/xpathcomp@367 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- OCamlDriver.cpp | 395 +++++++++++++-------------------------- SXSIStorageInterface.cpp | 9 +- XMLDocShredder.cpp | 141 +++++++------- XMLDocShredder.h | 24 ++- ata.ml | 192 +++++++++---------- ata.mli | 2 +- main.ml | 5 +- options.ml | 1 + ptset.ml | 34 ++-- ptset.mli | 5 +- tests/test.xml | 15 +- tree.ml | 303 ++++++++++++++++++++---------- tree.mli | 10 +- 13 files changed, 561 insertions(+), 575 deletions(-) diff --git a/OCamlDriver.cpp b/OCamlDriver.cpp index 5b7e0b6..a35cd75 100644 --- a/OCamlDriver.cpp +++ b/OCamlDriver.cpp @@ -9,6 +9,7 @@ */ /* OCaml memory managment */ +#include extern "C" { #include #include @@ -18,24 +19,6 @@ extern "C" { #include -#include -#include -#include -#include - - struct tms t1; - struct tms t2; - double ticks = (double) sysconf(_SC_CLK_TCK)/1000; - - void start_clock() { - times (&t1); - } - - - double stop_clock() { - times (&t2); - return (t2.tms_utime-t1.tms_utime)/ticks; - } } //extern C @@ -44,110 +27,121 @@ extern "C" { #include "XMLTree.h" #include "Utils.h" -#define CAMLRAISECPP(e) (caml_failwith( ((e).what()))) +#define CAMLRAISEMSG(msg) (caml_raise_with_string(*cpp_exception,(msg) )) #define NOT_IMPLEMENTED(s) (caml_failwith(s)) #define XMLTREE(x) ((XMLTree *)(* (XMLTree**) Data_custom_val(x))) +#define HSET(x) ((std::unordered_set*)((* (XMLTree**) Data_custom_val(x)))) #define TEXTCOLLECTION(x) #define TREENODEVAL(i) ((treeNode) (Int_val(i))) #define XMLTREE_ROOT 0 + + extern "C" { static struct custom_operations ops; - static bool initialized = false; + static struct custom_operations set_ops; + static value * cpp_exception = NULL; + static bool ops_initialized = false; + } extern "C" void caml_xml_tree_finalize(value tree){ delete XMLTREE(tree); return; } +extern "C" void caml_hset_finalize(value hblock){ + delete HSET(hblock); + return; +} -extern "C" void caml_init_ops () { - - if (initialized) - return; +extern "C" CAMLprim value caml_init_lib (value unit) { + CAMLparam1(unit); + if (!ops_initialized){ + + ops.identifier = (char*) "XMLTree"; ops.finalize = caml_xml_tree_finalize; - return; + set_ops.identifier = (char*) "unordered_set"; + set_ops.finalize = caml_hset_finalize; + + cpp_exception = caml_named_value("CPlusPlusError"); + + ops_initialized = true; + + }; + CAMLreturn(Val_unit); + +} +extern "C" CAMLprim value caml_shredder_parse(XMLDocShredder *shredder){ + CAMLparam0(); + CAMLlocal1(doc); + XMLTree * tree; + shredder->processStartDocument(""); + shredder->parse(); + shredder->processEndDocument(); + doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2); + tree = (XMLTree *) shredder->getXMLTree(); + memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*)); + CAMLreturn(doc); + } - extern "C" CAMLprim value caml_call_shredder_uri(value uri,value sf, value iet, value dtc){ CAMLparam1(uri); CAMLlocal1(doc); char *fn = String_val(uri); + XMLDocShredder * shredder; try { - XMLDocShredder shredder(fn,Int_val(sf),Bool_val(iet),Bool_val(dtc)); - XMLTree * tree; - shredder.processStartDocument(fn); - shredder.parse(); - shredder.processEndDocument(); - caml_init_ops(); - doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2); - tree = (XMLTree *) shredder.storageIfc_->returnDocument(); - memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*)); - CAMLreturn(doc); + shredder = new XMLDocShredder(fn,Int_val(sf),Bool_val(iet),Bool_val(dtc)); + doc = caml_shredder_parse(shredder); + delete shredder; } - catch (const std::exception& e){ - CAMLRAISECPP(e); - }; + catch (const std::exception& e){ CAMLRAISEMSG(e.what()); } + catch (string msg){ CAMLRAISEMSG(msg.c_str()); } + catch (char const * msg){ CAMLRAISEMSG(msg); }; + CAMLreturn (doc); } - extern "C" CAMLprim value caml_call_shredder_string(value data,value sf, value iet, value dtc){ CAMLparam1(data); CAMLlocal1(doc); + XMLDocShredder * shredder; unsigned int ln = string_length(data); unsigned char *fn = (unsigned char*) String_val(data); - try { - XMLDocShredder shredder(fn,ln,Int_val(sf),Bool_val(iet),Bool_val(dtc)); - XMLTree* tree; - shredder.processStartDocument(""); - shredder.parse(); - shredder.processEndDocument(); - caml_init_ops(); - doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2); - tree = (XMLTree *) shredder.storageIfc_->returnDocument(); - memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*)); - CAMLreturn(doc); + shredder = new XMLDocShredder (fn,ln,Int_val(sf),Bool_val(iet),Bool_val(dtc)); + doc = caml_shredder_parse(shredder); + delete shredder; } - catch (const std::exception& e) { - CAMLRAISECPP(e); - }; + catch (const std::exception& e){ CAMLRAISEMSG(e.what()); } + catch (string msg){ CAMLRAISEMSG(msg.c_str()); } + catch (char const * msg){ CAMLRAISEMSG(msg); }; + CAMLreturn(doc); } - - - -void traversal_rec(XMLTree* tree, treeNode id){ - DocID tid; - if (id == NULLT) - return; - //int tag = tree->Tag(id); - if (id) { - tid = tree->PrevText(id); - char * data = (char *) (tree->getTextCollection())->GetText(tid); - if (tree->IsLeaf(id)){ - tid = tree->MyText(id); - - data = (char*) (tree->getTextCollection())->GetText(tid); - }; - - if (tree->NextSibling(id) == NULLT){ - tid = tree->NextText(id); - data = (char*) (tree->getTextCollection())->GetText(tid); - }; - }; - traversal_rec(tree,tree->FirstChild(id)); - traversal_rec(tree,tree->NextSibling(id)); - return; +extern "C" CAMLprim value caml_xml_tree_save(value tree,value fd){ + CAMLparam2(tree,fd); + XMLTREE(tree)->Save(Int_val(fd)); + CAMLreturn (Val_unit); } -extern "C" CAMLprim value caml_cpp_traversal(value tree){ - CAMLparam1(tree); - traversal_rec(XMLTREE(tree),XMLTREE_ROOT); - CAMLreturn(Val_unit); +extern "C" CAMLprim value caml_xml_tree_load(value fd){ + CAMLparam1(fd); + CAMLlocal1(doc); + XMLTree * tree; + try { + tree = XMLTree::Load(Int_val(fd)); + doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2); + memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*)); + CAMLreturn(doc); + } + catch (const xmlpp::internal_error& e){ CAMLRAISEMSG(e.what()); } + catch (const std::exception& e){ CAMLRAISEMSG(e.what()); } + catch (string msg){ CAMLRAISEMSG(msg.c_str()); } + catch (char const * msg){ CAMLRAISEMSG(msg); }; } + + extern "C" CAMLprim value caml_text_collection_get_text(value tree, value id){ CAMLparam2(tree,id); CAMLlocal1(str); @@ -210,10 +204,7 @@ extern "C" CAMLprim value caml_text_collection_unsorted_contains(value tree,valu CAMLparam2(tree,str); uchar * cstr = (uchar *) String_val(str); std::vector results; - start_clock(); results = XMLTREE(tree)->Contains(cstr); - double d = stop_clock(); - std::cerr << "Internal timing " << d <<" ms\n"; CAMLreturn (Val_unit); } @@ -227,113 +218,80 @@ extern "C" CAMLprim value caml_xml_tree_text_collection(value tree){ CAMLreturn((value) XMLTREE(tree)->getTextCollection()); } extern "C" CAMLprim value caml_xml_tree_parent(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int (XMLTREE(tree)->Parent(TREENODEVAL(id)))); + return(Val_int (XMLTREE(tree)->Parent(TREENODEVAL(id)))); } extern "C" CAMLprim value caml_xml_tree_prev_sibling(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int (XMLTREE(tree)->PrevSibling(TREENODEVAL(id)))); + return(Val_int (XMLTREE(tree)->PrevSibling(TREENODEVAL(id)))); } extern "C" CAMLprim value caml_xml_tree_parent_doc(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int (XMLTREE(tree)->ParentNode((DocID) Int_val(id)))); + return (Val_int (XMLTREE(tree)->ParentNode((DocID) Int_val(id)))); } -extern "C" CAMLprim value caml_xml_tree_prev_doc(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int (XMLTREE(tree)->PrevNode((DocID) Int_val(id)))); -} extern "C" CAMLprim value caml_xml_tree_is_ancestor(value tree,value id1, value id2) { CAMLparam3(tree,id1,id2); CAMLreturn(Val_bool (XMLTREE(tree)->IsAncestor(TREENODEVAL(id1),TREENODEVAL(id2)))); } -extern "C" CAMLprim value caml_xml_tree_serialize(value tree, value filename){ - CAMLparam2(tree,filename); - NOT_IMPLEMENTED("caml_xml_tree_serialize"); - CAMLreturn(Val_unit); -} - -extern "C" CAMLprim value caml_xml_tree_unserialize(value filename){ - CAMLparam1(filename); - NOT_IMPLEMENTED("caml_xml_tree_unserialize"); - CAMLreturn(Val_unit); -} - extern "C" CAMLprim value caml_xml_tree_last_child(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int (XMLTREE(tree)->LastChild(TREENODEVAL(id)))); + return(Val_int (XMLTREE(tree)->LastChild(TREENODEVAL(id)))); } extern "C" CAMLprim value caml_xml_tree_is_first_child(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_bool (XMLTREE(tree)->IsFirstChild(TREENODEVAL(id)))); + return Val_bool (XMLTREE(tree)->IsFirstChild(TREENODEVAL(id))); } - extern "C" CAMLprim value caml_xml_tree_first_child(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int (XMLTREE(tree)->FirstChild(TREENODEVAL(id)))); + return(Val_int (XMLTREE(tree)->FirstChild(TREENODEVAL(id)))); +} +extern "C" CAMLprim value caml_xml_tree_first_element(value tree, value id){ + return(Val_int (XMLTREE(tree)->FirstElement(TREENODEVAL(id)))); } extern "C" CAMLprim value caml_xml_tree_tagged_child(value tree, value id, value tag){ - CAMLparam3(tree,id,tag); - CAMLreturn(Val_int (XMLTREE(tree)->TaggedChild(TREENODEVAL(id),Int_val(tag)))); + return(Val_int (XMLTREE(tree)->TaggedChild(TREENODEVAL(id),Int_val(tag)))); +} + +extern "C" CAMLprim value caml_xml_tree_next_sibling(value tree, value id){ + return(Val_int (XMLTREE(tree)->NextSibling(TREENODEVAL(id)))); +} + +extern "C" CAMLprim value caml_xml_tree_next_element(value tree, value id){ + return(Val_int (XMLTREE(tree)->NextSibling(TREENODEVAL(id)))); } extern "C" CAMLprim value caml_xml_tree_tagged_sibling(value tree, value id, value tag){ - CAMLparam3(tree,id,tag); - CAMLreturn(Val_int (XMLTREE(tree)->TaggedFollSibling(TREENODEVAL(id),Int_val(tag)))); + return(Val_int (XMLTREE(tree)->TaggedFollSibling(TREENODEVAL(id),Int_val(tag)))); } extern "C" CAMLprim value caml_xml_tree_is_leaf(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_bool (XMLTREE(tree)->IsLeaf(TREENODEVAL(id)))); + return(Val_bool (XMLTREE(tree)->IsLeaf(TREENODEVAL(id)))); } extern "C" CAMLprim value caml_xml_tree_tagged_desc(value tree, value id, value tag){ - CAMLparam3(tree,id,tag); - CAMLreturn(Val_int (XMLTREE(tree)->TaggedDesc(TREENODEVAL(id),(TagType) Int_val(tag)))); + return(Val_int (XMLTREE(tree)->TaggedDesc(TREENODEVAL(id),(TagType) Int_val(tag)))); } extern "C" CAMLprim value caml_xml_tree_tagged_foll(value tree, value id, value tag){ - CAMLparam3(tree,id,tag); - CAMLreturn(Val_int (XMLTREE(tree)->TaggedFoll(TREENODEVAL(id),(TagType) Int_val(tag)))); + return(Val_int (XMLTREE(tree)->TaggedFoll(TREENODEVAL(id),(TagType) Int_val(tag)))); } extern "C" CAMLprim value caml_xml_tree_tagged_foll_below(value tree, value id, value tag,value root){ - CAMLparam4(tree,id,tag,root); - CAMLreturn(Val_int (XMLTREE(tree)->TaggedFollBelow(TREENODEVAL(id),(TagType) Int_val(tag),TREENODEVAL(root)))); + return(Val_int (XMLTREE(tree)->TaggedFollBelow(TREENODEVAL(id),(TagType) Int_val(tag),TREENODEVAL(root)))); } -extern "C" CAMLprim value caml_xml_tree_next_sibling(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int (XMLTREE(tree)->NextSibling(TREENODEVAL(id)))); -} -extern "C" CAMLprim value caml_xml_tree_prev_text(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int((XMLTREE(tree)->PrevText(TREENODEVAL(id))))); -} -extern "C" CAMLprim value caml_xml_tree_next_text(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int((XMLTREE(tree)->NextText(TREENODEVAL(id))))); -} extern "C" CAMLprim value caml_xml_tree_my_text(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int((XMLTREE(tree)->MyText(TREENODEVAL(id))))); + return(Val_int((XMLTREE(tree)->MyText(TREENODEVAL(id))))); } extern "C" CAMLprim value caml_xml_tree_text_xml_id(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int((XMLTREE(tree)->TextXMLId(TREENODEVAL(id))))); + return(Val_int((XMLTREE(tree)->TextXMLId(TREENODEVAL(id))))); } extern "C" CAMLprim value caml_xml_tree_node_xml_id(value tree, value id){ - CAMLparam2(tree,id); - CAMLreturn(Val_int((XMLTREE(tree)->NodeXMLId(TREENODEVAL(id))))); + return(Val_int((XMLTREE(tree)->NodeXMLId(TREENODEVAL(id))))); } extern "C" CAMLprim value caml_xml_tree_tag_name(value tree, value tagid){ @@ -347,13 +305,11 @@ extern "C" CAMLprim value caml_xml_tree_tag_name(value tree, value tagid){ extern "C" CAMLprim value caml_xml_tree_tag_id(value tree,value id){ - CAMLparam2(tree,id); - CAMLreturn (Val_int(XMLTREE(tree)->Tag(TREENODEVAL(id)))); + return (Val_int(XMLTREE(tree)->Tag(TREENODEVAL(id)))); } extern "C" CAMLprim value caml_xml_tree_subtree_tags(value tree,value id,value tag){ - CAMLparam3(tree,id,tag); - CAMLreturn (Val_int(XMLTREE(tree)->SubtreeTags(TREENODEVAL(id),Int_val(tag)))); + return (Val_int(XMLTREE(tree)->SubtreeTags(TREENODEVAL(id),Int_val(tag)))); } @@ -367,138 +323,47 @@ extern "C" CAMLprim value caml_xml_tree_register_tag(value tree,value str){ } extern "C" CAMLprim value caml_xml_tree_nullt(value unit){ - CAMLparam1(unit); - CAMLreturn (NULLT); -} - -extern "C" CAMLprim value caml_xml_tree_save(value tree,value filename){ - CAMLparam2(tree,filename); - XMLTREE(tree)->Save((unsigned char *) String_val(filename)); - CAMLreturn (Val_unit); + return (NULLT); } -extern "C" CAMLprim value caml_xml_tree_load(value filename,value samplerate){ - CAMLparam2(filename,samplerate); - CAMLlocal1(doc); - XMLTree * tree; - tree = XMLTree::Load((unsigned char *) String_val(filename),Int_val(samplerate)); - caml_init_ops(); - doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2); - memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*)); - CAMLreturn(doc); +extern "C" CAMLprim value caml_unordered_set_length(value hset){ + CAMLparam1(hset); + CAMLreturn (Val_int((HSET(hset))->size())); } -extern "C" { - static int caml_empty_vector[] = { 0 }; -} - -extern "C" CAMLprim value caml_int_vector_empty(value unit){ - CAMLparam1(unit); - CAMLreturn ((value) caml_empty_vector); -} - -extern "C" CAMLprim value caml_int_vector_length(value vec){ - CAMLparam1(vec); - CAMLreturn (Val_int( ((int*) caml_empty_vector)[0] )); -} -extern "C" CAMLprim value caml_int_vector_alloc(value len){ +extern "C" CAMLprim value caml_unordered_set_alloc(value len){ CAMLparam1(len); - int * vec = (int *) malloc(sizeof(int)*(Int_val(len)+1)); - vec[0] = Int_val(len); - CAMLreturn ((value) vec); + CAMLlocal1(hset); + hset = caml_alloc_custom(&set_ops,sizeof(std::unordered_set*),1,2); + std::unordered_set* ht = new std::unordered_set(); + memcpy(Data_custom_val(hset),&ht,sizeof(std::unordered_set*)); + CAMLreturn (hset); } -extern "C" CAMLprim value caml_int_vector_set(value vec, value i, value v){ - CAMLparam3(vec,i,v); - ((int*) vec)[Int_val(i)+1] = Int_val(v); - CAMLreturn (Val_unit); +extern "C" CAMLprim value caml_unordered_set_set(value vec, value v){ + HSET(vec)->insert((int) Int_val(v)); + return (Val_unit); } - -#define VECT(x) ((int*) (x)) extern "C" CAMLprim value caml_xml_tree_select_desc(value tree, value node, value tags){ - CAMLparam3(tree,node,tags); - - CAMLreturn (Val_int (XMLTREE(tree)->SelectDesc(TREENODEVAL(node), - &(VECT(tags)[1]), - VECT(tags)[0]))); + return (Val_int (XMLTREE(tree)->SelectDesc(TREENODEVAL(node), + HSET(tags)))); } extern "C" CAMLprim value caml_xml_tree_select_child(value tree, value node, value tags){ - CAMLparam3(tree,node,tags); - - CAMLreturn (Val_int (XMLTREE(tree)->SelectChild(TREENODEVAL(node), - &(VECT(tags)[1]), - VECT(tags)[0]))); + return (Val_int (XMLTREE(tree)->SelectChild(TREENODEVAL(node), + HSET(tags)))); } extern "C" CAMLprim value caml_xml_tree_select_foll_sibling(value tree, value node, value tags){ - CAMLparam3(tree,node,tags); - - CAMLreturn (Val_int (XMLTREE(tree)->SelectFollSibling(TREENODEVAL(node), - &(VECT(tags)[1]), - VECT(tags)[0]))); + return (Val_int (XMLTREE(tree)->SelectFollSibling(TREENODEVAL(node), + HSET(tags)))); } extern "C" CAMLprim value caml_xml_tree_select_foll_below(value tree, value node, value tags,value ctx){ - CAMLparam4(tree,node,tags,ctx); - - CAMLreturn (Val_int (XMLTREE(tree)->SelectFollBelow(TREENODEVAL(node), - &(VECT(tags)[1]), - VECT(tags)[0],Int_val(ctx)))); -} - - - -/* -extern "C" CAMLprim value caml_xml_tree_select_below(value tree, value node, value ctags, value dtags){ - CAMLparam4(tree,node,ctags,dtags); - - CAMLreturn (Val_int ( - (XMLTREE(tree)->TaggedBelow(TREENODEVAL(node), - &(VECT(ctags)[1]), - VECT(ctags)[0], - &(VECT(dtags)[1]), - VECT(dtags)[0])))); - } -*/ -/* -extern "C" CAMLprim value caml_xml_tree_select_next(value tree, value node, value ctags, value ftags,value root){ - CAMLparam5(tree,node,ctags,ftags,root); - CAMLreturn (Val_int ( - (XMLTREE(tree)->TaggedNext(TREENODEVAL(node), - &(VECT(ctags)[1]), - VECT(ctags)[0], - &(VECT(ftags)[1]), - VECT(ftags)[0], - TREENODEVAL(root))))); -} -*/ -/* -extern "C" CAMLprim value caml_xml_tree_select_desc_only(value tree, value node,value dtags){ - CAMLparam3(tree,node,dtags); - - CAMLreturn (Val_int ( - (XMLTREE(tree)->TaggedDescOnly(TREENODEVAL(node), - &(VECT(dtags)[1]), - VECT(dtags)[0])))); -} - -extern "C" CAMLprim value caml_xml_tree_select_foll_only(value tree, value node, value ftags,value root){ - CAMLparam4(tree,node,ftags,root); - CAMLreturn (Val_int ( - (XMLTREE(tree)->TaggedFollOnly(TREENODEVAL(node), - &(VECT(ftags)[1]), - VECT(ftags)[0], - TREENODEVAL(root))))); -} - -extern "C" CAMLprim value caml_xml_tree_select_desc_or_foll_only(value tree, value node, value ftags,value root){ - CAMLparam4(tree,node,ftags,root); - CAMLreturn (Val_int ( - (XMLTREE(tree)->TaggedDescOrFollOnly(TREENODEVAL(node), - &(VECT(ftags)[1]), - VECT(ftags)[0], - TREENODEVAL(root))))); -} -*/ + return (Val_int (XMLTREE(tree)->SelectFollBelow(TREENODEVAL(node), + HSET(tags), + TREENODEVAL(ctx)))); +} + + extern "C" CAMLprim value caml_xml_tree_doc_ids(value tree, value node){ CAMLparam2(tree,node); CAMLlocal1(tuple); diff --git a/SXSIStorageInterface.cpp b/SXSIStorageInterface.cpp index 43ea155..2594bd2 100644 --- a/SXSIStorageInterface.cpp +++ b/SXSIStorageInterface.cpp @@ -24,23 +24,20 @@ SXSIStorageInterface::~SXSIStorageInterface() void SXSIStorageInterface::newChild(string name) { - _new_child++; - tb->NewOpenTag((unsigned char*) name.c_str()); + tb->NewOpenTag(name); } void SXSIStorageInterface::newText(string text) { - _new_text++; - _length_text += text.size(); - tb->NewText((unsigned char*) text.c_str()); + tb->NewText(text); } void SXSIStorageInterface::nodeFinished(string name) { - tb->NewClosingTag((unsigned char*) name.c_str()); + tb->NewClosingTag(name); } diff --git a/XMLDocShredder.cpp b/XMLDocShredder.cpp index 1516c9f..7f7d408 100644 --- a/XMLDocShredder.cpp +++ b/XMLDocShredder.cpp @@ -18,12 +18,22 @@ #include #include "XMLDocShredder.h" -#include "SXSIStorageInterface.h" #include #include "Utils.h" using namespace Glib; +void XMLDocShredder::doText(){ + + if (!buffer.empty()){ + tb->NewOpenTag(PCDATA_OPEN_TAG); + tb->NewText(buffer); + tb->NewClosingTag(PCDATA_OPEN_TAG); + }; + buffer.clear(); + +} + void XMLDocShredder::setProperties(){ /* instruct the parser to expand entity references and report as * regular PCDATA @@ -58,74 +68,74 @@ XMLDocShredder::XMLDocShredder(const unsigned char * data, bool iet, bool dtc) { - last_text = false; + tree = NULL; reader_ = new TextReader(data,size,""); setProperties(); - storageIfc_ = new SXSIStorageInterface(sf,iet,dtc); - buffer = ""; + tb = new XMLTreeBuilder(); + buffer.clear(); + tb->OpenDocument(iet,sf,dtc); } XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc) { - last_text = false; + tree = NULL; reader_ = new TextReader(inFileName); setProperties(); - storageIfc_ = new SXSIStorageInterface(sf,iet,dtc); - buffer = ""; + tb = new XMLTreeBuilder(); + buffer.clear(); + tb->OpenDocument(iet,sf,dtc); } XMLDocShredder::~XMLDocShredder() { delete reader_; - delete storageIfc_; + reader_ = NULL; + delete tb; + tb = NULL; } void XMLDocShredder::processStartElement() { - // fetch element name; this will be the full qualified name - ustring name = reader_->get_name(); - bool empty = false; - - storageIfc_->newChild(name); - - /* We must be really carefull here. calling process attributes moves - the document pointer on the last attribute, hence calling reader_->is_empty - afterwards will yield the wrong result. It is better to call it while we are - on the element and generate a nodeFinished() call at the end */ - empty = reader_->is_empty_element(); - - - // now, process attributes - if (reader_->has_attributes()) - { - processAttributes(); - }; - - - if (empty){ - storageIfc_->nodeFinished(name); - }; - - + doText(); + // fetch element name; this will be the full qualified name + ustring name = reader_->get_name(); + bool empty = false; + + tb->NewOpenTag(name); + + /* We must be really carefull here. calling process attributes moves + the document pointer on the last attribute, hence calling reader_->is_empty + afterwards will yield the wrong result. It is better to call it while we are + on the element and generate a nodeFinished() call at the end */ + empty = reader_->is_empty_element(); + + + // now, process attributes + if (reader_->has_attributes()) + processAttributes(); + + + if (empty) + tb->NewClosingTag(name); + + } void XMLDocShredder::processEndElement() { - // tell the storage interface that the current node has been completely processed - storageIfc_->nodeFinished(reader_->get_name()); + doText(); + ustring name = reader_->get_name(); + tb->NewClosingTag(name); } void XMLDocShredder::processPCDATA() { // send the content of this PCDATA node to the storage interface as a text node - - if (reader_->has_value()){ - storageIfc_->newChild("<$>"); - storageIfc_->newText(reader_->get_value()); - storageIfc_->nodeFinished("<$>"); - }; + if (reader_->has_value()) + buffer += reader_->get_value(); + } void XMLDocShredder::processAttributes() @@ -133,9 +143,9 @@ void XMLDocShredder::processAttributes() reader_->move_to_first_attribute(); string nspaceStr = "xmlns"; - storageIfc_->newChild("<@>"); + tb->NewOpenTag(ATTRIBUTE_OPEN_TAG); do - { + { ustring name = reader_->get_name(); ustring value = reader_->get_value(); @@ -145,8 +155,7 @@ void XMLDocShredder::processAttributes() if ((name.find(nspaceStr.c_str(), 0, 5)) == 0) { - storageIfc_->newChild(":" + value); - storageIfc_->nodeFinished(":" + value); + //TODO } /* otherwise, this is an ordinary attribute, so we construct a new child node of the @@ -157,40 +166,40 @@ void XMLDocShredder::processAttributes() else { string attname = "<@>"+name; - storageIfc_->newChild(attname); - storageIfc_->newChild("<@$>"); - storageIfc_->newText(value); - storageIfc_->nodeFinished("<@$>"); - storageIfc_->nodeFinished(attname); + tb->NewOpenTag(attname); + tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG); + tb->NewText(value); + tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG); + tb->NewClosingTag(attname); } } while (reader_->move_to_next_attribute()); - storageIfc_->nodeFinished("<@>"); + tb->NewClosingTag(ATTRIBUTE_OPEN_TAG); } void XMLDocShredder::processSignificantWhitespace() { - - if (reader_->has_value()){ - storageIfc_->newChild("<$>"); - storageIfc_->newText(reader_->get_value()); - storageIfc_->nodeFinished("<$>"); - }; + if (reader_->has_value()) + buffer += reader_->get_value(); + } void XMLDocShredder::processStartDocument(const string docName) { // tell storage interface to construct the document name - storageIfc_->newChild(""); + + tb->NewOpenTag(DOCUMENT_OPEN_TAG); } void XMLDocShredder::processEndDocument() { - /* tell the storage interface that document parsing has finished, and structures - * can now be written to disk. */ - storageIfc_->nodeFinished(""); - storageIfc_->parsingFinished(); + doText(); + /* tell the storage interface that document parsing has finished, and structures + * can now be written to disk. */ + tb->NewClosingTag(DOCUMENT_OPEN_TAG); + tree = tb->CloseDocument(); + } void XMLDocShredder::processComment() @@ -227,12 +236,8 @@ void XMLDocShredder::processCDATASection() * model. Instead, we simply pass the converted text value to the storage interface as * a text node attached to the current context node. */ - if (reader_->has_value()){ - storageIfc_->newChild("<$>"); - storageIfc_->newText(reader_->get_value()); - storageIfc_->nodeFinished("<$>"); - }; - + if (reader_->has_value()) + buffer+= reader_->get_value(); } void XMLDocShredder::processUnknownNodeType() diff --git a/XMLDocShredder.h b/XMLDocShredder.h index 58d4053..ca555fa 100644 --- a/XMLDocShredder.h +++ b/XMLDocShredder.h @@ -15,7 +15,8 @@ #include #include #include -#include "StorageInterface.h" +#include "XMLTree.h" +#include "XMLTreeBuilder.h" using namespace std; using namespace xmlpp; @@ -23,6 +24,7 @@ using namespace xmlpp; class XMLDocShredder { + void doText(); public: XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc); XMLDocShredder(const unsigned char * data, TextReader::size_type size,int sf, bool iet, bool dtc); @@ -41,16 +43,20 @@ public: virtual void processCDATASection(); virtual void parse(); - StorageInterface *storageIfc_; + XMLTree * getXMLTree(){ + return tree; + } -private: - TextReader *reader_; - void setProperties(); - bool last_text; - string buffer; - // used to coalece successive text events - // which can occur if we discard pi and comment nodes. + private: + XMLTreeBuilder * tb; + XMLTree * tree; + TextReader *reader_; + void setProperties(); + bool last_text; + string buffer; + // used to coalece successive text events + // which can occur if we discard pi and comment nodes. }; #endif /*XMLDOCSHREDDER_H_*/ diff --git a/ata.ml b/ata.ml index 13d3cce..ca137a7 100644 --- a/ata.ml +++ b/ata.ml @@ -25,14 +25,7 @@ struct if x < 0 then failwith (Printf.sprintf "State: Assertion %i < 0 failed" x) end -module StateSet = struct - include Ptset.Int - let print ppf s = - Format.pp_print_string ppf "{ "; - iter (fun i -> Format.fprintf ppf "%i " i) s; - Format.pp_print_string ppf "}"; - Format.pp_print_flush ppf () -end +module StateSet = Ptset.Int module Formula = struct @@ -374,22 +367,6 @@ module FTable = Hashtbl.Make( struct let hash (f,s,t) = HASHINT3(Formlist.uid f ,StateSet.uid s,StateSet.uid t);; end) -(* -module MemoFormlist = Memoizer.Make(FTable) - - Too slow - let eval_formlist = MemoFormlist.make_rec ( - fun eval (fl,((s1,s2)as sets)) -> - match Formlist.node fl with - | Formlist.Nil -> StateSet.empty,false,false,false,false - | Formlist.Cons(f,fll) -> - let q,mark,f,_ = Transition.node f in - let b,b1,b2 = eval_form_bool f s1 s2 in - let s,b',b1',b2',amark = eval (fll,sets) in - if b then (StateSet.add q s, b, b1'||b1,b2'||b2,mark||amark) - else s,b',b1',b2',amark ) -*) - let h_f = FTable.create BIG_H_SIZE @@ -403,11 +380,11 @@ let eval_formlist s1 s2 fl = | Formlist.Cons(f,fll) -> let q,mark,f,_ = Transition.node f in let b,b1,b2 = eval_form_bool f s1 s2 in - let s,b',b1',b2',amark = loop fll in - let r = if b then (StateSet.add q s, b, b1'||b1,b2'||b2,mark||amark) - else s,b',b1',b2',amark + let (s,(b',b1',b2',amark)) as res = loop fll in + let r = if b then (StateSet.add q s, (b, b1'||b1,b2'||b2,mark||amark)) + else res in FTable.add h_f (fl,s1,s2) r;r - | Formlist.Nil -> StateSet.empty,false,false,false,false + | Formlist.Nil -> StateSet.empty,(false,false,false,false) in loop fl let tags_of_state a q = @@ -447,7 +424,7 @@ let tags_of_state a q = val fold : ( elt -> 'a -> 'a) -> t -> 'a -> 'a val map : ( elt -> elt) -> t -> t val length : t -> int - val merge : bool -> bool -> bool -> bool -> elt -> t -> t -> t + val merge : (bool*bool*bool*bool) -> elt -> t -> t -> t end module Integer : ResultSet = @@ -461,7 +438,7 @@ let tags_of_state a q = let fold _ _ _ = failwith "fold not implemented" let map _ _ = failwith "map not implemented" let length x = x - let merge rb rb1 rb2 mark t res1 res2 = + let merge (rb,rb1,rb2,mark) t res1 res2 = if rb then let res1 = if rb1 then res1 else 0 and res2 = if rb2 then res2 else 0 @@ -513,7 +490,7 @@ let tags_of_state a q = in { l with node = loop l.node } - let merge rb rb1 rb2 mark t res1 res2 = + let merge (rb,rb1,rb2,mark) t res1 res2 = if rb then let res1 = if rb1 then res1 else empty and res2 = if rb2 then res2 else empty @@ -552,7 +529,7 @@ END let string_of_ts tags = (Ptset.Int.fold (fun t a -> a ^ " " ^ (Tag.to_string t) ) tags "{")^ " }" - let choose_jump tagset qtags1 qtagsn a f_nil f_t1 f_s1 f_tn f_sn f_notext = + let choose_jump tagset qtags1 qtagsn a f_nil f_t1 f_s1 f_tn f_sn f_notext f_maytext = let tags1,hastext1,fin1 = inter_text tagset (tags a qtags1) in let tagsn,hastextn,finn = inter_text tagset (tags a qtagsn) in (*if (hastext1||hastextn) then (`ANY,f_text) (* jumping to text nodes doesn't work really well *) @@ -570,24 +547,27 @@ END let tag = (Ptset.Int.choose tagsn) in (`TAG(tag),mk_app_fun f_tn tag (Tag.to_string tag)) else (* SelectDesc/Following *) (`ANY,mk_app_fun f_sn tagsn (string_of_ts tagsn)) + else if (hastext1||hastextn) then (`ANY,f_maytext) else (`ANY,f_notext) let choose_jump_down tree a b c d = choose_jump a b c d (mk_fun (fun _ -> Tree.nil) "Tree.mk_nil") (mk_fun (Tree.tagged_child tree) "Tree.tagged_child") - (mk_fun (Tree.select_child tree) "Tree.select_child") (* !! no select_child in Tree.ml *) + (mk_fun (Tree.select_child tree) "Tree.select_child") (mk_fun (Tree.tagged_desc tree) "Tree.tagged_desc") - (mk_fun (Tree.select_desc tree) "Tree.select_desc") (* !! no select_desc *) + (mk_fun (Tree.select_desc tree) "Tree.select_desc") + (mk_fun (Tree.first_element tree) "Tree.first_element") (mk_fun (Tree.first_child tree) "Tree.first_child") let choose_jump_next tree a b c d = choose_jump a b c d (mk_fun (fun _ _ -> Tree.nil) "Tree.mk_nil2") - (mk_fun (Tree.tagged_sibling_ctx tree) "Tree.tagged_sibling_ctx")(* !! no tagged_sibling in Tree.ml *) - (mk_fun (Tree.select_sibling_ctx tree) "Tree.select_sibling_ctx")(* !! no select_sibling in Tree.ml *) + (mk_fun (Tree.tagged_sibling_ctx tree) "Tree.tagged_sibling_ctx") + (mk_fun (Tree.select_sibling_ctx tree) "Tree.select_sibling_ctx") (mk_fun (Tree.tagged_foll_ctx tree) "Tree.tagged_foll_ctx") - (mk_fun (Tree.select_foll_ctx tree) "Tree.select_foll_ctx")(* !! no select_foll *) + (mk_fun (Tree.select_foll_ctx tree) "Tree.select_foll_ctx") + (mk_fun (Tree.next_element_ctx tree) "Tree.node_element_ctx") (mk_fun (Tree.next_sibling_ctx tree) "Tree.node_sibling_ctx") @@ -615,7 +595,8 @@ END if mark then RS.cons t (RS.concat res1 res2) else RS.concat res1 res2 else RS.empty - + + let top_down ?(noright=false) a tree t slist ctx slot_size = let pempty = empty_size slot_size in (* evaluation starts from the right so we put sl1,res1 at the end *) @@ -626,8 +607,8 @@ END | SList.Cons(s1,ll1), SList.Cons(s2,ll2), fl::fll -> - let r',rb,rb1,rb2,mark = eval_formlist s1 s2 fl in - let _ = res.(i) <- RS.merge rb rb1 rb2 mark t res1.(i) res2.(i) + let r',flags = eval_formlist s1 s2 fl in + let _ = res.(i) <- RS.merge flags t res1.(i) res2.(i) in fold ll1 ll2 fll (i+1) (SList.cons r' aq) @@ -650,68 +631,68 @@ END try CachedTransTable.find td_trans (tag,slist) with - | Not_found -> - let fl_list,llist,rlist,ca,da,sa,fa = - SList.fold - (fun set (fll_acc,lllacc,rllacc,ca,da,sa,fa) -> (* For each set *) - let fl,ll,rr,ca,da,sa,fa = - StateSet.fold - (fun q acc -> - List.fold_left - (fun ((fl_acc,ll_acc,rl_acc,c_acc,d_acc,s_acc,f_acc) as acc) - (ts,t) -> - if (TagSet.mem tag ts) - then - let _,_,f,_ = Transition.node t in - let (child,desc,below),(sibl,foll,after) = Formula.st f in - (Formlist.cons t fl_acc, - StateSet.union ll_acc below, - StateSet.union rl_acc after, - StateSet.union child c_acc, - StateSet.union desc d_acc, - StateSet.union sibl s_acc, - StateSet.union foll f_acc) - else acc ) acc ( - try Hashtbl.find a.trans q - with - Not_found -> Printf.eprintf "Looking for state %i, doesn't exist!!!\n%!" - q;[] - ) - - ) set (Formlist.nil,StateSet.empty,StateSet.empty,ca,da,sa,fa) - in fl::fll_acc, (SList.cons ll lllacc), (SList.cons rr rllacc),ca,da,sa,fa) - slist ([],SList.nil,SList.nil,StateSet.empty,StateSet.empty,StateSet.empty,StateSet.empty) - in - (* Logic to chose the first and next function *) - let tags_below,tags_after = Tree.tags tree tag in - let f_kind,first = choose_jump_down tree tags_below ca da a - and n_kind,next = if noright then (`NIL, fun _ _ -> Tree.nil ) - else choose_jump_next tree tags_after sa fa a in - let empty_res = null_result() in - let cont = - match f_kind,n_kind with - | `NIL,`NIL -> - (fun _ _ -> eval_fold2_slist fl_list t empty_res empty_res ) - | _,`NIL -> ( - match f_kind with - |`TAG(tag) -> - (fun t _ -> eval_fold2_slist fl_list t empty_res - (loop_tag tag (first t) llist t)) - | `ANY -> - (fun t _ -> eval_fold2_slist fl_list t empty_res - (loop (first t) llist t)) - | _ -> assert false) - + | Not_found -> + let fl_list,llist,rlist,ca,da,sa,fa = + SList.fold + (fun set (fll_acc,lllacc,rllacc,ca,da,sa,fa) -> (* For each set *) + let fl,ll,rr,ca,da,sa,fa = + StateSet.fold + (fun q acc -> + List.fold_left + (fun ((fl_acc,ll_acc,rl_acc,c_acc,d_acc,s_acc,f_acc) as acc) + (ts,t) -> + if (TagSet.mem tag ts) + then + let _,_,f,_ = Transition.node t in + let (child,desc,below),(sibl,foll,after) = Formula.st f in + (Formlist.cons t fl_acc, + StateSet.union ll_acc below, + StateSet.union rl_acc after, + StateSet.union child c_acc, + StateSet.union desc d_acc, + StateSet.union sibl s_acc, + StateSet.union foll f_acc) + else acc ) acc ( + try Hashtbl.find a.trans q + with + Not_found -> Printf.eprintf "Looking for state %i, doesn't exist!!!\n%!" + q;[] + ) + + ) set (Formlist.nil,StateSet.empty,StateSet.empty,ca,da,sa,fa) + in fl::fll_acc, (SList.cons ll lllacc), (SList.cons rr rllacc),ca,da,sa,fa) + slist ([],SList.nil,SList.nil,StateSet.empty,StateSet.empty,StateSet.empty,StateSet.empty) + in + (* Logic to chose the first and next function *) + let _,tags_below,_,tags_after = Tree.tags tree tag in + let f_kind,first = choose_jump_down tree tags_below ca da a + and n_kind,next = if noright then (`NIL, fun _ _ -> Tree.nil ) + else choose_jump_next tree tags_after sa fa a in + let empty_res = null_result() in + let cont = + match f_kind,n_kind with + | `NIL,`NIL -> + (fun _ _ -> eval_fold2_slist fl_list t empty_res empty_res ) + | _,`NIL -> ( + match f_kind with + |`TAG(tag) -> + (fun t _ -> eval_fold2_slist fl_list t empty_res + (loop_tag tag (first t) llist t)) + | `ANY -> + (fun t _ -> eval_fold2_slist fl_list t empty_res + (loop (first t) llist t)) + | _ -> assert false) + | `NIL,_ -> ( match n_kind with |`TAG(tag) -> (fun t ctx -> eval_fold2_slist fl_list t (loop_tag tag (next t ctx) rlist ctx) empty_res) - + | `ANY -> (fun t ctx -> eval_fold2_slist fl_list t (loop (next t ctx) rlist ctx) empty_res) - + | _ -> assert false) | `TAG(tag1),`TAG(tag2) -> @@ -735,19 +716,20 @@ END (loop (next t ctx) rlist ctx) (loop (first t) llist t) ) | _ -> assert false - in - let cont = D_IF_( (fun t ctx -> - let a,b = cont t ctx in - register_trace t (slist,a,fl_list,first,next,ctx); - (a,b) - ) ,cont) - in - (CachedTransTable.add td_trans (tag,slist) cont;cont) + in + let cont = D_IF_( (fun t ctx -> + let a,b = cont t ctx in + register_trace t (slist,a,fl_list,first,next,ctx); + (a,b) + ) ,cont) + in + (CachedTransTable.add td_trans (tag,slist) cont;cont) in cont t ctx + in (if noright then loop_no_right else loop) t slist ctx - - + + let run_top_down a tree = let init = SList.cons a.init SList.nil in let _,res = top_down a tree Tree.root init Tree.root 1 @@ -823,7 +805,7 @@ END match SList.node sl,fl with |SList.Nil,[] -> acc |SList.Cons(s,sll), formlist::fll -> - let r',rb,rb1,rb2,mark = + let r',(rb,rb1,rb2,mark) = let key = SList.hash sl,Formlist.hash formlist,dir in try Hashtbl.find h_fold key diff --git a/ata.mli b/ata.mli index ae4e479..24d0832 100644 --- a/ata.mli +++ b/ata.mli @@ -92,7 +92,7 @@ module type ResultSet = val fold : (elt -> 'a -> 'a) -> t -> 'a -> 'a val map : (elt -> elt) -> t -> t val length : t -> int - val merge : bool -> bool -> bool -> bool -> elt -> t -> t -> t + val merge : (bool*bool*bool*bool)-> elt -> t -> t -> t end module IdSet : ResultSet diff --git a/main.ml b/main.ml index 4fcc0af..021a18e 100644 --- a/main.ml +++ b/main.ml @@ -79,6 +79,7 @@ let main v query_string output = (if !Options.count_only then "(counting only)" else if !Options.backward then "(bottomup)" else ""); begin let _ = Gc.full_major();Gc.compact() in + let _ = Printf.eprintf "%!" in (* let _ = Gc.set (disabled_gc) in *) if !Options.backward && ((snd test_list) != `NOTHING )then @@ -109,7 +110,7 @@ let main v query_string output = IdSet.iter (fun t -> Tree.print_xml_fast oc v t; output_char oc '\n'; - output_string oc "----------\n"; + ) result) (); end; end; @@ -125,7 +126,7 @@ let v = begin Printf.eprintf "Loading from file : "; time (Tree.load ~sample:!Options.sample_factor ) - (Filename.chop_suffix !Options.input_file ".srx"); + !Options.input_file; end else let v = diff --git a/options.ml b/options.ml index 6698921..fdfc867 100644 --- a/options.ml +++ b/options.ml @@ -41,3 +41,4 @@ let parse_cmdline() = + diff --git a/ptset.ml b/ptset.ml index 3185da4..4fc92d6 100644 --- a/ptset.ml +++ b/ptset.ml @@ -377,15 +377,25 @@ let from_list l = List.fold_left (fun acc e -> add e acc) empty l end -module Int : S with type elt = int - = - Make ( struct type t = int - type data = t - external hash : t -> int = "%identity" - external uid : t -> int = "%identity" - let equal : t -> t -> bool = (==) - external make : t -> int = "%identity" - external node : t -> int = "%identity" - - end - ) +module Int : sig + include S with type elt = int + val print : Format.formatter -> t -> unit +end + = +struct + include Make ( struct type t = int + type data = t + external hash : t -> int = "%identity" + external uid : t -> int = "%identity" + let equal : t -> t -> bool = (==) + external make : t -> int = "%identity" + external node : t -> int = "%identity" + + end + ) + let print ppf s = + Format.pp_print_string ppf "{ "; + iter (fun i -> Format.fprintf ppf "%i " i) s; + Format.pp_print_string ppf "}"; + Format.pp_print_flush ppf () + end diff --git a/ptset.mli b/ptset.mli index cfdedae..3b46f5c 100644 --- a/ptset.mli +++ b/ptset.mli @@ -71,5 +71,8 @@ val node : t -> data end -module Int : S with type elt = int +module Int : sig + include S with type elt = int + val print : Format.formatter -> t -> unit +end module Make ( H : Hcons.S ) : S with type elt = H.t diff --git a/tests/test.xml b/tests/test.xml index 4c57628..0d548e4 100644 --- a/tests/test.xml +++ b/tests/test.xml @@ -1,4 +1,13 @@ -12 - -3 + + + + + + +

+ + +
+
+ diff --git a/tree.ml b/tree.ml index 9cb5ef6..9e80e6d 100644 --- a/tree.ml +++ b/tree.ml @@ -6,36 +6,43 @@ (******************************************************************************) INCLUDE "utils.ml" + +external init_lib : unit -> unit = "caml_init_lib" + +exception CPlusPlusError of string + +let () = Callback.register_exception "CPlusPlusError" (CPlusPlusError "") + +let () = init_lib () + + type tree -type 'a node = int +type 'a node = private int type node_kind = [`Text | `Tree ] - -let compare_node : 'a node -> 'a node -> int = (-) + +external inode : 'a node -> int = "%identity" +external nodei : int -> 'a node = "%identity" +let compare_node x y = (inode x) - (inode y) let equal_node : 'a node -> 'a node -> bool = (==) - -(* abstract type, values are pointers to a XMLTree C++ object *) -external int_of_node : 'a node -> int = "%identity" external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri" external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string" -external save_tree : tree -> string -> unit = "caml_xml_tree_save" -external load_tree : string -> int -> tree = "caml_xml_tree_load" +external tree_save : tree -> Unix.file_descr -> unit = "caml_xml_tree_save" +external tree_load : Unix.file_descr -> tree = "caml_xml_tree_load" external nullt : unit -> 'a node = "caml_xml_tree_nullt" -let nil : 'a node = -1 -let root : [`Tree ] node = 0 +let nil : [`Tree ] node = nodei ~-1 +let nulldoc : [`Text ] node = nodei ~-1 +let root : [`Tree ] node = nodei 0 external text_get_tc_text : tree -> [`Text] node -> string = "caml_text_collection_get_text" external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text" -let text_is_empty t n = - (equal_node nil n) || text_is_empty t n - - +let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains" external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains" @@ -43,67 +50,63 @@ external text_count : tree -> string -> int = "caml_text_collection_count" external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains" external text_unsorted_contains : tree -> string -> unit = "caml_text_collection_unsorted_contains" external text_get_cached_text : tree -> [`Text] node -> string = "caml_text_collection_get_cached_text" + +external tree_root : tree -> [`Tree] node = "caml_xml_tree_root" +let tree_is_nil x = equal_node x nil -external tree_serialize : tree -> string -> unit = "caml_xml_tree_serialize" +external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" "noalloc" +external tree_parent_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" "noalloc" +(*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *) +external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" "noalloc" +external tree_first_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_element" "noalloc" +external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc" +external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc" +external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc" +external tree_tagged_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_sibling" "noalloc" -external tree_unserialize : string -> tree = "caml_xml_tree_unserialize" - -external tree_root : tree -> [`Tree] node = "caml_xml_tree_root" +external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc" +external tree_is_leaf : tree -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" "noalloc" +external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc" +external tree_is_first_child : tree -> [`Tree] node -> bool = "caml_xml_tree_is_first_child" "noalloc" -let tree_is_nil x = equal_node x nil -external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" -external tree_parent_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" -external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" -external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" -external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" -external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" -external tree_tagged_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_sibling" - -external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" -external tree_is_leaf : tree -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" -external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" -external tree_is_first_child : tree -> [`Tree] node -> bool = "caml_xml_tree_is_first_child" - -(* external tag : tree -> [`Tree ] node -> T = "caml_xml_tree_tag"*) -external tree_tag_id : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" +external tree_tag_id : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" "noalloc" let tree_is_last t n = equal_node nil (tree_next_sibling t n) -external tree_prev_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" +(*external tree_prev_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" "noalloc" *) -external tree_my_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" -external tree_next_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" +external tree_my_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" "noalloc" +(*external tree_next_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" "noalloc" *) external tree_doc_ids : tree -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids" -let text_size tree = int_of_node (snd ( tree_doc_ids tree (Obj.magic 0) )) +let text_size tree = inode (snd ( tree_doc_ids tree root )) -let text_get_cached_text t x = - if x == -1 then "" +let text_get_cached_text t (x:[`Text] node) = + if x == nulldoc then "" else text_get_cached_text t x -external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" -external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" -external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" -external tree_tagged_desc : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" -external tree_tagged_foll_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below" -external tree_subtree_tags : tree -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" +external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc" +external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc" +external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc" +external tree_tagged_desc : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" "noalloc" +external tree_tagged_foll_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below" "noalloc" +external tree_subtree_tags : tree -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" "noalloc" +type unordered_set +external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc" +external unordered_set_length : unordered_set -> int = "caml_unordered_set_length" +external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc" -type int_vector -external int_vector_alloc : int -> int_vector = "caml_int_vector_alloc" -external int_vector_length : int_vector -> int = "caml_int_vector_length" -external int_vector_set : int_vector -> int -> int -> unit = "caml_int_vector_set" - -external tree_select_child : tree -> [`Tree ] node -> int_vector -> [`Tree] node = "caml_xml_tree_select_child" -external tree_select_foll_sibling : tree -> [`Tree ] node -> int_vector -> [`Tree] node = "caml_xml_tree_select_foll_sibling" -external tree_select_desc : tree -> [`Tree ] node -> int_vector -> [`Tree] node = "caml_xml_tree_select_desc" -external tree_select_foll_below : tree -> [`Tree ] node -> int_vector -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_foll_below" +external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc" +external tree_select_foll_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_foll_sibling" "noalloc" +external tree_select_desc : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_desc" "noalloc" +external tree_select_foll_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_foll_below" "noalloc" module HPtset = Hashtbl.Make(Ptset.Int) @@ -115,14 +118,14 @@ let ptset_to_vector s = HPtset.find vector_htbl s with Not_found -> - let v = int_vector_alloc (Ptset.Int.cardinal s) in - let _ = Ptset.Int.fold (fun e i -> int_vector_set v i e;i+1) s 0 in + let v = unordered_set_alloc (Ptset.Int.cardinal s) in + let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in HPtset.add vector_htbl s v; v type t = { doc : tree; - ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t)) Hashtbl.t; + ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t; } let text_size t = text_size t.doc @@ -138,6 +141,13 @@ module MemUnion = Hashtbl.Make (struct if x < y then HASHINT2(x,y) else HASHINT2(y,x) end) +module MemAdd = Hashtbl.Make ( + struct + type t = Tag.t*Ptset.Int.t + let equal (x,y) (z,t) = (x == z)&&(Ptset.Int.equal y t) + let hash (x,y) = HASHINT2(x,Ptset.Int.hash y) + end) + let collect_tags tree = let h_union = MemUnion.create BIG_H_SIZE in let pt_cup s1 s2 = @@ -148,41 +158,48 @@ let collect_tags tree = in MemUnion.add h_union (s1,s2) s;s in - let h_add = Hashtbl.create BIG_H_SIZE in - let pt_add t s = - let k = HASHINT2(Tag.hash t,Ptset.Int.hash s) in - try - Hashtbl.find h_add k - with + let h_add = MemAdd.create BIG_H_SIZE in + let pt_add t s = + try MemAdd.find h_add (t,s) + with | Not_found -> let r = Ptset.Int.add t s in - Hashtbl.add h_add k r;r + MemAdd.add h_add (t,s) r;r in let h = Hashtbl.create BIG_H_SIZE in - let update t sb sa = - let sbelow,safter = + let update t sc sb ss sa = + let schild,sbelow,ssibling,safter = try Hashtbl.find h t with | Not_found -> - (Ptset.Int.empty,Ptset.Int.empty) + (Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty) in - Hashtbl.replace h t (pt_cup sbelow sb, pt_cup safter sa) + Hashtbl.replace h t + (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa) in - let rec loop id acc = - if equal_node id nil - then (Ptset.Int.empty,acc) + let rec loop_right id acc_sibling acc_after= + if id == nil + then (acc_sibling,acc_after) else - let below2,after2 = loop (tree_next_sibling tree id) acc in - let below1,after1 = loop (tree_first_child tree id) after2 in + let sibling2,after2 = loop_right (tree_next_sibling tree id) acc_sibling acc_after in + let child1,below1 = loop_left (tree_first_child tree id) after2 in let tag = tree_tag_id tree id in - update tag below1 after2; - pt_add tag (pt_cup below1 below2), (pt_add tag after1) + update tag child1 below1 sibling2 after2; + (pt_add tag sibling2, (pt_add tag (pt_cup after2 below1))) + and loop_left id acc_after = + if id == nil + then (Ptset.Int.empty,Ptset.Int.empty) + else + let sibling2,after2 = loop_right (tree_next_sibling tree id) Ptset.Int.empty acc_after in + let child1,below1 = loop_left (tree_first_child tree id) after2 in + let tag = tree_tag_id tree id in + update tag child1 below1 sibling2 after2; + (pt_add tag sibling2,(pt_add tag (pt_cup after2 below1))) in - let _ = loop (tree_root tree) Ptset.Int.empty in h - - - - + let _ = loop_left (tree_root tree) Ptset.Int.empty in h + + + let contains_array = ref [| |] let contains_index = Hashtbl.create 4096 @@ -218,11 +235,11 @@ let init_naive_contains t s = let s = text_get_cached_text t.doc n in if matching s - then loop (n+1) (n::acc) (l+1) - else loop (n+1) acc l + then loop (nodei ((inode n)+1)) (n::acc) (l+1) + else loop (nodei ((inode n)+1)) acc l in let acc,l = loop i [] 0 in - let a = Array.create l nil in + let a = Array.create l nulldoc in let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc in contains_array := a @@ -263,18 +280,84 @@ let parse_xml_string str = parse parse_xml_string str external pool : tree -> Tag.pool = "%identity" -let save t str = (save_tree t.doc str) +let magic_string = "SXSI_INDEX" +let version_string = "1" + +let pos fd = + Unix.lseek fd 0 Unix.SEEK_CUR + +let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd) + +let write fd s = + let sl = String.length s in + let ssl = Printf.sprintf "%020i" sl in + ignore (Unix.write fd ssl 0 20); + ignore (Unix.write fd s 0 (String.length s)) + +let rec really_read fd buffer start length = + if length <= 0 then () else + match Unix.read fd buffer start length with + 0 -> raise End_of_file + | r -> really_read fd buffer (start + r) (length - r);; + +let read fd = + let buffer = String.create 20 in + let _ = really_read fd buffer 0 20 in + let size = int_of_string buffer in + let buffer = String.create size in + let _ = really_read fd buffer 0 size in + buffer + + +let save t str = + let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in + let out_c = Unix.out_channel_of_descr fd in + let _ = set_binary_mode_out out_c true in + output_string out_c magic_string; + output_char out_c '\n'; + output_string out_c version_string; + output_char out_c '\n'; + Marshal.to_channel out_c t.ttable [ ]; + (* we need to move the fd to the correct position *) + flush out_c; + ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET); + tree_save t.doc fd; + close_out out_c ;; let load ?(sample=64) str = - node_of_t (load_tree str sample) - + let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in + let in_c = Unix.in_channel_of_descr fd in + let _ = set_binary_mode_in in_c true in + (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file"); + (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file"); + let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t = + Marshal.from_channel in_c + in + let ntable = Hashtbl.create (Hashtbl.length table) in + Hashtbl.iter (fun k (s1,s2,s3,s4) -> + let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty + and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty + and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty + and ss4 = Ptset.Int.fold (Ptset.Int.add) s4 Ptset.Int.empty + in Hashtbl.add ntable k (ss1,ss2,ss3,ss4) + ) table; + Hashtbl.clear table; + (* The in_channel read a chunk of fd, so we might be after + the start of the XMLTree save file. Reset to the correct + position *) + ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET); + let tree = { doc = tree_load fd; + ttable = ntable;} + in close_in in_c; + tree + let tag_pool t = pool t.doc -let compare a b = a - b +let compare = compare_node let equal a b = a == b @@ -282,7 +365,7 @@ let nts = function -1 -> "Nil" | i -> Printf.sprintf "Node (%i)" i -let dump_node t = nts t +let dump_node t = nts (inode t) let is_left t n = tree_is_first_child t.doc n @@ -292,6 +375,7 @@ let is_below_right t n1 n2 = tree_is_ancestor t.doc (tree_parent t.doc n1) n2 let parent t n = tree_parent t.doc n let first_child t = (); fun n -> tree_first_child t.doc n +let first_element t = (); fun n -> tree_first_element t.doc n (* these function will be called in two times: first partial application on the tag, then application of the tag and the tree, then application of @@ -305,6 +389,8 @@ let select_child t = fun ts -> fun n -> tree_select_child t.doc n v let next_sibling t = (); fun n -> tree_next_sibling t.doc n +let next_element t = (); fun n -> tree_next_element t.doc n + let tagged_sibling t tag = (); fun n -> tree_tagged_sibling t.doc n tag let select_sibling t = fun ts -> @@ -312,6 +398,7 @@ let select_sibling t = fun ts -> fun n -> tree_select_foll_sibling t.doc n v let next_sibling_ctx t = (); fun n _ -> tree_next_sibling t.doc n +let next_element_ctx t = (); fun n _ -> tree_next_element t.doc n let tagged_sibling_ctx t tag = (); fun n _ -> tree_tagged_sibling t.doc n tag let select_sibling_ctx t = fun ts -> @@ -328,7 +415,7 @@ let select_desc t = fun ts -> let v = (ptset_to_vector ts) in (); fun n -> tree_select_desc t.doc n v -let tagged_foll_ctx t tag = (); fun n ctx -> tree_tagged_foll_below t.doc n tag ctx +let tagged_foll_ctx t tag = (); fun n ctx -> tree_tagged_foll_below t.doc n tag ctx let select_foll_ctx t = fun ts -> let v = (ptset_to_vector ts) in (); @@ -356,10 +443,15 @@ let array_find a i j = then let tagid = tree_tag_id tree.doc t in if tagid==Tag.pcdata - then output_string outc (text_get_cached_text tree.doc t); - if print_right - then loop (next_sibling tree t) - + then + begin + let tid = tree_my_text tree.doc t in + let _ = Printf.eprintf "my_text %i returned %i\n%!" (inode t) (inode tid) + in + output_string outc (text_get_cached_text tree.doc tid); + if print_right + then loop (next_sibling tree t); + end else let tagstr = Tag.to_string tagid in let l = first_child tree t @@ -391,14 +483,19 @@ let array_find a i j = output_char outc '>'; end; if print_right then loop r - and loop_attributes a = + and loop_attributes a = + if a != nil + then let s = (Tag.to_string (tag tree a)) in let attname = String.sub s 3 ((String.length s) -3) in + let fsa = first_child tree a in + let tid = tree_my_text tree.doc fsa in + let _ = Printf.eprintf "my_text %i returned %i\n%!" (inode fsa) (inode tid) + in output_char outc ' '; output_string outc attname; output_string outc "=\""; - output_string outc (text_get_cached_text tree.doc - (tree_my_text tree.doc (first_child tree a))); + output_string outc (text_get_cached_text tree.doc tid); output_char outc '"'; loop_attributes (next_sibling tree a) in @@ -410,11 +507,15 @@ let array_find a i j = print_xml_fast outc tree (first_child tree t) else print_xml_fast outc tree t +let tags_children t tag = + let a,_,_,_ = Hashtbl.find t.ttable tag in a let tags_below t tag = - fst(Hashtbl.find t.ttable tag) - + let _,a,_,_ = Hashtbl.find t.ttable tag in a +let tags_siblings t tag = + let _,_,a,_ = Hashtbl.find t.ttable tag in a let tags_after t tag = - snd(Hashtbl.find t.ttable tag) + let _,_,_,a = Hashtbl.find t.ttable tag in a + let tags t tag = Hashtbl.find t.ttable tag @@ -432,7 +533,7 @@ let subtree_tags t tag = (); let get_text t n = let tid = tree_my_text t.doc n in - if tid == nil then "" else + if tid == nulldoc then "" else text_get_cached_text t.doc tid diff --git a/tree.mli b/tree.mli index 6da77f5..c38ab02 100644 --- a/tree.mli +++ b/tree.mli @@ -18,7 +18,7 @@ val compare : [ `Tree ] node -> [ `Tree ] node -> int val dump_node : 'a node -> string -val nil : 'a node +val nil : [ `Tree ] node val root : [ `Tree ] node val is_root : [ `Tree ] node -> bool @@ -26,12 +26,16 @@ val is_nil : [ `Tree ] node -> bool val parent : t -> [ `Tree ] node -> [ `Tree ] node val first_child : t -> [ `Tree ] node -> [ `Tree ] node +val first_element : t -> [ `Tree ] node -> [ `Tree ] node val tagged_child : t -> Tag.t -> [ `Tree ] node -> [ `Tree ] node val select_child : t -> Ptset.Int.t -> [ `Tree ] node -> [ `Tree ] node val next_sibling : t -> [ `Tree ] node -> [ `Tree ] node +val next_element : t -> [ `Tree ] node -> [ `Tree ] node + val next_sibling_ctx : t -> [ `Tree ] node -> [ `Tree ] node -> [ `Tree ] node +val next_element_ctx : t -> [ `Tree ] node -> [ `Tree ] node -> [ `Tree ] node val tagged_sibling : t -> Tag.t -> [ `Tree ] node -> [ `Tree ] node val tagged_sibling_ctx : t -> Tag.t -> [ `Tree ] node -> [ `Tree ] node -> [ `Tree ] node @@ -53,9 +57,11 @@ val select_foll_ctx : t -> Ptset.Int.t -> [ `Tree ] node -> [`Tree] node -> [ ` val count : t -> string -> int val print_xml_fast : out_channel -> t -> [ `Tree ] node -> unit +val tags_children : t -> Tag.t -> Ptset.Int.t val tags_below : t -> Tag.t -> Ptset.Int.t +val tags_siblings : t -> Tag.t -> Ptset.Int.t val tags_after : t -> Tag.t -> Ptset.Int.t -val tags : t -> Tag.t -> Ptset.Int.t*Ptset.Int.t +val tags : t -> Tag.t -> Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t val is_below_right : t -> [`Tree] node -> [`Tree] node -> bool val is_left : t -> [`Tree] node -> bool -- 2.17.1