From ff13d22656fdbdffb2d909192bd17ba135606224 Mon Sep 17 00:00:00 2001 From: kim Date: Sat, 3 Dec 2011 21:52:16 +0000 Subject: [PATCH] Refactoring, 1st tier: - replace libxml++-2.6 by ocaml-expat - kill XMLDocSchredder.* - move common OCaml/C++ utility functions in common_stub.{hpp,cpp} git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/xpathcomp@1179 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- configure | 19 +-- myocamlbuild.ml | 2 +- myocamlbuild_config.ml.in | 2 +- src/OCamlDriver.cpp | 253 +++++++++++++------------------ src/XMLDocShredder.cpp | 306 -------------------------------------- src/XMLDocShredder.h | 68 --------- src/common_stub.cpp | 92 ++++++++++++ src/common_stub.hpp | 58 ++++++++ src/libcamlshredder.clib | 2 +- src/tree.ml | 109 ++++++++++++-- 10 files changed, 365 insertions(+), 546 deletions(-) delete mode 100644 src/XMLDocShredder.cpp delete mode 100644 src/XMLDocShredder.h create mode 100644 src/common_stub.cpp create mode 100644 src/common_stub.hpp diff --git a/configure b/configure index c985457..6d86ab3 100755 --- a/configure +++ b/configure @@ -14,18 +14,11 @@ Conf.check_prog "ocamlbuild" "ocamlbuild -version";; Conf.check_prog "ocamlfind" "ocamlfind printconf";; Conf.check_prog "pkg-config" "pkg-config --version";; -Conf.check_prog "libxml++-2.6" "pkg-config --exists libxml++-2.6" ;; +(*Conf.check_prog "libxml++-2.6" "pkg-config --exists libxml++-2.6" ;;*) Conf.check_prog "ulex" "ocamlfind query ulex";; -Conf.check "XMLTree" (Conf.absolute) ("%s/src/XMLTree/libXMLTree.a") (Sys.file_exists);; - -let _, libxmlI, _ = Conf.exec "pkg-config --cflags-only-I libxml++-2.6";; -let libxmlI = Conf.explode (libxmlI);; +Conf.check_prog "expat" "ocamlfind query expat";; -let _, libxmlL, _ = Conf.exec "pkg-config --libs-only-L libxml++-2.6";; -let libxmlL = Conf.explode (libxmlL);; - -let _, libxmll, _ = Conf.exec "pkg-config --libs-only-l libxml++-2.6";; -let libxmll = Conf.explode (libxmll);; +Conf.check "XMLTree" (Conf.absolute) ("%s/src/XMLTree/libXMLTree.a") (Sys.file_exists);; let libXMLTreeI = [ Conf.absolute "-I%s/src/XMLTree"; Conf.absolute "-I%s/src/XMLTree/libcds/includes"; @@ -39,9 +32,9 @@ let ocamlI = [ "-I" ^ ocamlI ];; Conf.def_str "cxx_cmd" "g++";; -Conf.def_list "cxx_includes" (libxmlI @ libXMLTreeI @ ocamlI);; -Conf.def_list "cxx_lpaths" (libxmlL @ libXMLTreeL);; -Conf.def_list "cxx_libs" ( libXMLTreel @ libxmll );; +Conf.def_list "cxx_includes" (libXMLTreeI @ ocamlI);; +Conf.def_list "cxx_lpaths" libXMLTreeL;; +Conf.def_list "cxx_libs" libXMLTreel;; diff --git a/myocamlbuild.ml b/myocamlbuild.ml index 23901cd..41c25b5 100644 --- a/myocamlbuild.ml +++ b/myocamlbuild.ml @@ -106,7 +106,7 @@ let cxx_compile env build = let src = env "%.cpp" and obj = env "%.o" in let local_include = Depends.cxx src in let local_dispatch = List.map (fun p -> List.map (fun p' -> p'/p) project_dirs) local_include in - let () = ignore (build local_dispatch) in + let () = ignore (build local_dispatch ) in Cmd(S[A cxx_cmd; A "-o" ; P obj; A "-c"; S !cxx_flags; cxx_include_flags; P src]) (* Native compile and link action *) diff --git a/myocamlbuild_config.ml.in b/myocamlbuild_config.ml.in index ebe7675..d8e98c9 100644 --- a/myocamlbuild_config.ml.in +++ b/myocamlbuild_config.ml.in @@ -2,7 +2,7 @@ let ocaml_inline = "1000";; let include_path = "include";; let src_path = "src";; let ocaml_link = [ "dynlink"; "camlp4lib" ];; -let ocamlfind_packages = "unix,ulex,camlp4";; +let ocamlfind_packages = "unix,ulex,expat,camlp4";; let cxx_flags = [ "-fno-PIC"; "-std=c++0x"; "-static" ];; let main_targets = [ "native","src/main.native"; "byte", "src/main.byte" ];; diff --git a/src/OCamlDriver.cpp b/src/OCamlDriver.cpp index 275758a..8eb3a22 100644 --- a/src/OCamlDriver.cpp +++ b/src/OCamlDriver.cpp @@ -17,165 +17,136 @@ #include #include -#include "XMLDocShredder.h" + #include "XMLTree.h" +#include "XMLTreeBuilder.h" #include "Utils.h" +#include "common_stub.hpp" extern "C" { -/* OCaml memory managment */ -#include -#include -#include -#include -#include -#include -#include - //#include "results.h" #include +} + +#define CAMLRAISEMSG(msg) (sxsi_raise_msg((char*) (msg))) + +#define XMLTREE(x) (Obj_val(x)) + +#define HSET(x) (Obj_val(x)) + +#define XMLTREEBUILDER(x) (Obj_val(x)) + -#define CAMLRAISEMSG(msg) (caml_raise_with_string(*cpp_exception,(msg) )) -#define NOT_IMPLEMENTED(s) (caml_failwith(s)) -#define XMLTREE(x) ((XMLTree *)(* (XMLTree**) Data_custom_val(x))) -#define HSET(x) ((TagIdSet*)((* (TagIdSet**) Data_custom_val(x)))) -#define TEXTCOLLECTION(x) #define TREENODEVAL(i) ((treeNode) (Int_val(i))) #define TAGVAL(i) ((TagType) (Int_val(i))) #define XMLTREE_ROOT 0 #define NoAlloc - - static struct custom_operations ops; - static struct custom_operations set_ops; - static value * cpp_exception = NULL; - static bool ops_initialized = false; - +extern "C" { #include #include - - -} - -extern "C" void caml_xml_tree_finalize(value tree){ - delete XMLTREE(tree); - return; -} - -extern "C" void caml_hset_finalize(value hblock){ - delete HSET(hblock); - return; } -extern "C" value caml_init_lib (value unit) { - CAMLparam1(unit); - - struct rlimit rlim; - - if (!ops_initialized){ - getrlimit(RLIMIT_STACK, &rlim); - - if (rlim.rlim_max == RLIM_INFINITY && rlim.rlim_cur != RLIM_INFINITY) { - rlim.rlim_cur = RLIM_INFINITY; - setrlimit(RLIMIT_STACK, &rlim); - }; - ops.identifier = (char*) "XMLTree"; - ops.finalize = caml_xml_tree_finalize; - set_ops.identifier = (char*) "unordered_set"; - set_ops.finalize = caml_hset_finalize; - - cpp_exception = caml_named_value("CPlusPlusError"); - if (cpp_exception == NULL){ - string s = "FATAL: Unregistered exception "; - s += "CPlusPlusError"; - caml_failwith(s.c_str()); - }; - - ops_initialized = true; - - }; - CAMLreturn(Val_unit); - -} +/** XMLTreeBuilder bindings + * + */ -extern "C" value caml_shredder_parse(XMLDocShredder *shredder){ - CAMLparam0(); - CAMLlocal1(doc); - XMLTree * tree; - shredder->processStartDocument(""); - shredder->parse(); - shredder->processEndDocument(); - doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2); - tree = (XMLTree *) shredder->getXMLTree(); - memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*)); - CAMLreturn(doc); - -} - -extern "C" value caml_call_shredder_uri(value uri,value sf, value iet, value dtc, value idtype){ - CAMLparam1(uri); - CAMLlocal1(doc); - char *fn = String_val(uri); - XMLDocShredder * shredder; - TextCollectionBuilder::index_type_t id; - switch (Int_val(idtype)){ +extern "C" value caml_xml_tree_builder_create(value unit) +{ + CAMLparam1(unit); + CAMLlocal1(result); + result = sxsi_alloc_custom(); + Obj_val(result) = new XMLTreeBuilder(); + CAMLreturn(result); +} + +extern "C" value caml_xml_tree_builder_open_document(value vbuilder, + value vet, + value vsrate, + value vdtc, + value vidxtype) +{ + CAMLparam5(vbuilder, vet, vsrate, vdtc, vidxtype); + bool empty_text = Bool_val(vet); + int sample_rate = Int_val(vsrate); + bool disable_tc = Bool_val(vdtc); + TextCollectionBuilder::index_type_t idx_type; + switch (Int_val(vidxtype)){ case 0: - id = TextCollectionBuilder::index_type_default; + idx_type = TextCollectionBuilder::index_type_default; break; case 1: - id = TextCollectionBuilder::index_type_swcsa; + idx_type = TextCollectionBuilder::index_type_swcsa; break; case 2: - id = TextCollectionBuilder::index_type_rlcsa; + idx_type = TextCollectionBuilder::index_type_rlcsa; break; default: - CAMLRAISEMSG(""); + CAMLRAISEMSG("Invalid Index Type"); }; + int res = XMLTREEBUILDER(vbuilder)->OpenDocument(empty_text, + sample_rate, + disable_tc, + idx_type); + if (res == NULLT) + CAMLRAISEMSG("OpenDocument"); - try { - shredder = new XMLDocShredder(fn,Int_val(sf),Bool_val(iet),Bool_val(dtc), id); - doc = caml_shredder_parse(shredder); - delete shredder; - } - catch (const std::exception& e){ CAMLRAISEMSG(e.what()); } - catch (string msg){ CAMLRAISEMSG(msg.c_str()); } - catch (char const * msg){ CAMLRAISEMSG(msg); }; - CAMLreturn (doc); + CAMLreturn (Val_unit); +} +extern "C" value caml_xml_tree_builder_close_document(value vbuilder) +{ + CAMLparam1(vbuilder); + CAMLlocal1(result); + XMLTree * tree = XMLTREEBUILDER(vbuilder)->CloseDocument(); + if (tree == NULL) + CAMLRAISEMSG("CloseDocument"); + result = sxsi_alloc_custom(); + Obj_val(result) = tree; + CAMLreturn (result); } -extern "C" value caml_call_shredder_string(value data,value sf, value iet, value dtc, value idtype){ - CAMLparam1(data); - CAMLlocal1(doc); - XMLDocShredder * shredder; - unsigned int ln = caml_string_length(data); - unsigned char *fn = (unsigned char*) String_val(data); - TextCollectionBuilder::index_type_t id; - switch (Int_val(idtype)){ - case 0: - id = TextCollectionBuilder::index_type_default; - break; - case 1: - id = TextCollectionBuilder::index_type_swcsa; - break; - case 2: - id = TextCollectionBuilder::index_type_rlcsa; - break; - default: - CAMLRAISEMSG(""); - }; - try { +extern "C" value caml_xml_tree_builder_new_open_tag(value vbuilder, value vtag) +{ + CAMLparam2(vbuilder, vtag); + const char * tag = String_val(vtag); + if (XMLTREEBUILDER(vbuilder)->NewOpenTag(std::string(tag)) == NULLT) + CAMLRAISEMSG("NewOpenTag"); - shredder = new XMLDocShredder (fn,ln,Int_val(sf),Bool_val(iet),Bool_val(dtc), id); - doc = caml_shredder_parse(shredder); - delete shredder; - } - catch (const std::exception& e){ CAMLRAISEMSG(e.what()); } - catch (string msg){ CAMLRAISEMSG(msg.c_str()); } - catch (char const * msg){ CAMLRAISEMSG(msg); }; - CAMLreturn(doc); + CAMLreturn (Val_unit); +} + +extern "C" value caml_xml_tree_builder_new_closing_tag(value vbuilder, value vtag) +{ + CAMLparam2(vbuilder, vtag); + const char * tag = String_val(vtag); + if (XMLTREEBUILDER(vbuilder)->NewClosingTag(std::string(tag)) == NULLT) + CAMLRAISEMSG("NewClosingTag"); + + CAMLreturn (Val_unit); } +extern "C" value caml_xml_tree_builder_new_text(value vbuilder, value vtext) +{ + CAMLparam2(vbuilder, vtext); + const char * text = String_val(vtext); + if (XMLTREEBUILDER(vbuilder)->NewText(std::string(text)) == NULLT) + CAMLRAISEMSG("NewText"); + + CAMLreturn (Val_unit); +} + + +/*************************************************************************/ + +/** + * XMLTree bindings + * All of the functions here call the _unsafe version and implement the logics themselves + * (test for NULLT and so on). This avoids one indirection + one call when the tests fails. + */ + + extern "C" value caml_xml_tree_save(value tree,value fd, value name){ CAMLparam3(tree, fd, name); XMLTREE(tree)->Save(Int_val(fd), String_val(name)); @@ -184,31 +155,22 @@ extern "C" value caml_xml_tree_save(value tree,value fd, value name){ extern "C" value caml_xml_tree_load(value fd, value name, value load_tc,value sf){ CAMLparam4(fd, name, load_tc, sf); - CAMLlocal1(doc); + CAMLlocal1(result); XMLTree * tree; try { tree = XMLTree::Load(Int_val(fd),Bool_val(load_tc),Int_val(sf), String_val(name)); - doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2); - memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*)); - CAMLreturn(doc); + result = sxsi_alloc_custom(); + Obj_val(result) = tree; + CAMLreturn(result); } - catch (const xmlpp::internal_error& e){ CAMLRAISEMSG(e.what()); } catch (const std::exception& e){ CAMLRAISEMSG(e.what()); } - catch (string msg){ CAMLRAISEMSG(msg.c_str()); } + catch (std::string msg){ CAMLRAISEMSG(msg.c_str()); } catch (char const * msg){ CAMLRAISEMSG(msg); }; + //never reached + return (Val_unit); } - -/*************************************************************************/ - -/** - * XMLTree bindings - * All of the functions here call the _unsafe version and implement the logics themselves - * (test for NULLT and so on). This avoids one indirection + one call when the tests fails. - */ - - NoAlloc extern "C" value caml_xml_tree_root(value tree){ return (Val_int(XMLTREE_ROOT)); } @@ -454,9 +416,8 @@ NoAlloc extern "C" value caml_unordered_set_length(value hset){ extern "C" value caml_unordered_set_alloc(value unit){ CAMLparam1(unit); CAMLlocal1(hset); - hset = caml_alloc_custom(&set_ops,sizeof(TagIdSet*),1,2); - TagIdSet* ht = new TagIdSet(); - memcpy(Data_custom_val(hset),&ht,sizeof(TagIdSet*)); + hset = sxsi_alloc_custom(); + Obj_val(hset) = new TagIdSet(); CAMLreturn (hset); } @@ -761,7 +722,7 @@ extern "C" value caml_text_collection_empty_text(value tree,value id){ CAMLreturn ( Val_int((XMLTREE(tree))->EmptyText((DocID) Int_val(id)))); } -bool docId_comp(DocID x, DocID y) { return x < y; }; +bool docId_comp(DocID x, DocID y) { return x < y; } /** * Existential queries diff --git a/src/XMLDocShredder.cpp b/src/XMLDocShredder.cpp deleted file mode 100644 index ffe4de7..0000000 --- a/src/XMLDocShredder.cpp +++ /dev/null @@ -1,306 +0,0 @@ -/********************************************************** - * XMLDocShredder.cpp - * --------------------- - * Implementation of the class that receives events from the XML parser and - * invokes corresponding construction methods of the storage interface. - * - * Author: Greg Leighton - * Date: 02/11/08 - * Changes: - * 05/11/08 -- Fixed bug related to parsing empty elements - * -- Set parser properties to automatically resolve - * entity references and load external DTD if present - * -- Modified processEndDocument() by adding a nodeFinished() - * call to the storage interface to close off the - * document node - * - */ - -#include -#include "XMLDocShredder.h" -#include -#include "Utils.h" - -using namespace Glib; - -void XMLDocShredder::doText(){ - - if (!buffer.empty()){ - tb->NewOpenTag(PCDATA_OPEN_TAG); - tb->NewText(buffer); - tb->NewClosingTag(PCDATA_OPEN_TAG); - }; - buffer.clear(); - -} - -void XMLDocShredder::setProperties(){ - /* instruct the parser to expand entity references and report as - * regular PCDATA - */ - reader_->set_parser_property( - TextReader::SubstEntities, true); - - /* instruct parser to read external DTD, if present. This is - * needed to obtain any entity definitions in the DTD - */ - reader_->set_parser_property( - TextReader::LoadDtd, true); - - - /* - */ - reader_->set_parser_property( - TextReader::DefaultAttrs, true); - - - /* but we don't want to do validation since it would slow us down - */ - - - reader_->set_parser_property( - TextReader::Validate, false); - -} -XMLDocShredder::XMLDocShredder(const unsigned char * data, - TextReader::size_type size, - int sf, - bool iet, - bool dtc, - TextCollectionBuilder::index_type_t index_type - ) -{ - tree = NULL; - reader_ = new TextReader(data,size,""); - setProperties(); - tb = new XMLTreeBuilder(); - buffer.clear(); - tb->OpenDocument(iet,sf,dtc, index_type); -} - -XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc, - TextCollectionBuilder::index_type_t index_type - ) -{ - tree = NULL; - reader_ = new TextReader(inFileName); - setProperties(); - tb = new XMLTreeBuilder(); - buffer.clear(); - tb->OpenDocument(iet,sf,dtc,index_type); -} - -XMLDocShredder::~XMLDocShredder() -{ - delete reader_; - reader_ = NULL; - delete tb; - tb = NULL; - -} - - -void XMLDocShredder::processStartElement() -{ - doText(); - // fetch element name; this will be the full qualified name - ustring name = reader_->get_name(); - bool empty = false; - size_t found = name.find_first_of(':'); - if (found == ustring::npos) - tb->NewOpenTag(name); - else - tb->NewOpenTag(name.substr(found+1,name.length() - found - 1)); - - /* We must be really carefull here. calling process attributes moves - the document pointer on the last attribute, hence calling reader_->is_empty - afterwards will yield the wrong result. It is better to call it while we are - on the element and generate a nodeFinished() call at the end */ - empty = reader_->is_empty_element(); - - - // now, process attributes - if (reader_->has_attributes()) - processAttributes(); - - - if (empty) - tb->NewClosingTag(name); - - -} - -void XMLDocShredder::processEndElement() -{ - doText(); - ustring name = reader_->get_name(); - tb->NewClosingTag(name); -} - -void XMLDocShredder::processPCDATA() -{ - // send the content of this PCDATA node to the storage interface as a text node - if (reader_->has_value()) - buffer += reader_->get_value(); - -} - -void XMLDocShredder::processAttributes() -{ - reader_->move_to_first_attribute(); - - string nspaceStr = "xmlns"; - tb->NewOpenTag(ATTRIBUTE_OPEN_TAG); - do - { - ustring name = reader_->get_name(); - ustring value = reader_->get_value(); - - /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute, - * so we have to extract it and build a namespace uri node out of it before - * passing to the storage interface */ - - if ((name.find(nspaceStr.c_str(), 0, 5)) == 0) - { - //TODO - } - - /* otherwise, this is an ordinary attribute, so we construct a new child node of the - * parent element to store the attribute name, possessing a child text node storing the - * attribute value. Then, we close off the attribute node with a call to nodeFinished() - */ - - else - { - string attname = "<@>"+name; - tb->NewOpenTag(attname); - tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG); - tb->NewText(value); - tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG); - tb->NewClosingTag(attname); - } - } - while (reader_->move_to_next_attribute()); - tb->NewClosingTag(ATTRIBUTE_OPEN_TAG); -} - -void XMLDocShredder::processSignificantWhitespace() -{ - if (reader_->has_value()) - buffer += reader_->get_value(); - -} - -void XMLDocShredder::processStartDocument(const string docName) -{ - // tell storage interface to construct the document name - - tb->NewOpenTag(DOCUMENT_OPEN_TAG); - -} - -void XMLDocShredder::processEndDocument() -{ - doText(); - /* tell the storage interface that document parsing has finished, and structures - * can now be written to disk. */ - tb->NewClosingTag(DOCUMENT_OPEN_TAG); - tree = tb->CloseDocument(); - -} - -void XMLDocShredder::processComment() -{ - //storageIfc_->newChild("!" + reader_->get_value()); - //storageIfc_->nodeFinished(); -} - -void XMLDocShredder::processProcessingInstruction() -{ - ustring name = reader_->get_name(); - ustring value = reader_->get_value(); - - /* Create a child node to store the target of the PI, append a text node to it to store - * the PI data, send to the storage interface. Close off the PI node with a call to - * nodeFinished - */ - - // storageIfc_->newChild("?" + name); - // storageIfc_->newText(value); - // storageIfc_->nodeFinished(); -} - -void XMLDocShredder::processDocTypeDeclaration() -{ - /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton - * in case we do want to process it in the future. - */ -} - -void XMLDocShredder::processCDATASection() -{ - /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data - * model. Instead, we simply pass the converted text value to the storage interface as - * a text node attached to the current context node. - */ - if (reader_->has_value()) - buffer+= reader_->get_value(); -} - -void XMLDocShredder::processUnknownNodeType() -{ - cout << "unknown token encountered during parsing" << endl; - throw xmlpp::parse_error("unknown token encountered during parsing"); - -} - -void XMLDocShredder::parse() -{ - while (reader_->read() && (reader_->get_read_state() != TextReader::Error)) - { - switch (reader_->get_node_type()) - { - case TextReader::Element: - processStartElement(); - break; - - case TextReader::Text: - processPCDATA(); - break; - - case TextReader::EndElement: - processEndElement(); - break; - - case TextReader::SignificantWhitespace: - processSignificantWhitespace(); - break; - - case TextReader::Comment: - processComment(); - break; - - case TextReader::DocumentType: - processDocTypeDeclaration(); - break; - - case TextReader::ProcessingInstruction: - processProcessingInstruction(); - break; - - case TextReader::CDATA: - processCDATASection(); - break; - - case TextReader::None: - processUnknownNodeType(); - break; - - default: - int type = reader_->get_node_type(); - cout << " Node type: " << type << endl; - break; - - } - } -} diff --git a/src/XMLDocShredder.h b/src/XMLDocShredder.h deleted file mode 100644 index a12500d..0000000 --- a/src/XMLDocShredder.h +++ /dev/null @@ -1,68 +0,0 @@ -/************************************** - * XMLDocShredder.h - * -------------------- - * Header file for the shredder routine that invokes the XML parser and - * calls the appropriate construction methods of the storage interface in - * correspondence with received parsing events. - * - * Author: Greg Leighton - * Date: 02/11/08 - */ - -#ifndef XMLDOCSHREDDER_H_ -#define XMLDOCSHREDDER_H_ - -#include -#include -#include -#include "XMLTree.h" -#include "XMLTreeBuilder.h" - -using namespace std; -using namespace xmlpp; - - -class XMLDocShredder -{ - void doText(); -public: - XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc, - TextCollectionBuilder::index_type_t index_type - ); - XMLDocShredder(const unsigned char * data, - TextReader::size_type size, - int sf, bool iet, bool dtc, - TextCollectionBuilder::index_type_t index_type - ); - virtual ~XMLDocShredder(); - virtual void processStartElement(); - virtual void processEndElement(); - virtual void processPCDATA(); - virtual void processAttributes(); - virtual void processSignificantWhitespace(); - virtual void processStartDocument(const string docName); - virtual void processEndDocument(); - virtual void processComment(); - virtual void processProcessingInstruction(); - virtual void processDocTypeDeclaration(); - virtual void processUnknownNodeType(); - virtual void processCDATASection(); - virtual void parse(); - - XMLTree * getXMLTree(){ - return tree; - } - - - private: - XMLTreeBuilder * tb; - XMLTree * tree; - TextReader *reader_; - void setProperties(); - bool last_text; - string buffer; - // used to coalece successive text events - // which can occur if we discard pi and comment nodes. -}; - -#endif /*XMLDOCSHREDDER_H_*/ diff --git a/src/common_stub.cpp b/src/common_stub.cpp new file mode 100644 index 0000000..6f56d33 --- /dev/null +++ b/src/common_stub.cpp @@ -0,0 +1,92 @@ +#include +#include + +#include "common_stub.hpp" + +extern "C" { +#include +#include +} + +using std::string; +using std::pair; +using std::unordered_map; +using std::make_pair; + +typedef unordered_map> type_map_t; +static type_map_t * type_map = 0; +static value * cpp_exception = 0; + +static void init_error() +{ + caml_failwith("C++: initialization error"); +} + +static void init_exception() +{ + cpp_exception = caml_named_value("CPlusPlusError"); + if (cpp_exception == 0) + init_error(); +} + +static void init_type_map() +{ + if (type_map == 0) + type_map = new type_map_t(); + if (type_map == 0) + init_error(); +} + +void register_custom_(char* name, + size_t size, + void (*finalize)(value v)) +{ + if (type_map == 0) init_error(); + struct custom_operations * ops = + (struct custom_operations*) calloc(1, sizeof(struct custom_operations)); + ops->identifier = name; + ops->finalize = finalize; + ops->compare = custom_compare_default; + ops->hash = custom_hash_default; + ops->serialize = custom_serialize_default; + ops->deserialize = custom_deserialize_default; + type_map->insert(make_pair(string(name), make_pair(ops, size))); +} + +value alloc_custom_(char* name) +{ + CAMLparam0(); + CAMLlocal1(result); + if (type_map == 0) init_error(); + string key = string(name); + type_map_t::iterator it = type_map->find(key); + if (it == type_map->end()) + result = Val_unit; + else + result = caml_alloc_custom(it->second.first, it->second.second, 1, 2); + + CAMLreturn(result); +} + + +extern "C" value sxsi_cpp_init(value unit) +{ + struct rlimit rlim; + init_exception(); + init_type_map(); + + /* Set the stack space to unlimited */ + getrlimit(RLIMIT_STACK, &rlim); + if (rlim.rlim_max == RLIM_INFINITY && rlim.rlim_cur != RLIM_INFINITY) { + rlim.rlim_cur = RLIM_INFINITY; + setrlimit(RLIMIT_STACK, &rlim); + }; + + return Val_unit; +} + +void sxsi_raise_msg(char * msg) +{ + if (cpp_exception == 0) init_error(); + caml_raise_with_string(*cpp_exception, msg); +} diff --git a/src/common_stub.hpp b/src/common_stub.hpp new file mode 100644 index 0000000..bfacdac --- /dev/null +++ b/src/common_stub.hpp @@ -0,0 +1,58 @@ +#ifndef COMMON_STUB_H_ +#define COMMON_STUB_H_ + +extern "C" { +#define CAML_NAME_SPACE +#include +#include +#include +#include +#include +#include +#include +#include +#include +} +#include +#include + +void register_custom_(char* name, + size_t size, + void (*finalize)(value v)); + +value alloc_custom_(char* name); + + +template void sxsi_finalize_custom(value v) +{ + X * obj = * ((X **) Data_custom_val(v)); + std::cerr << "Finalizing object: " << typeid(X).name() << std::endl; + std::cerr.flush(); + delete obj; +} + +template value sxsi_alloc_custom() +{ + char * name = const_cast(typeid(X).name()); + value v = alloc_custom_(name); + if (v == Val_unit) { + register_custom_(name, sizeof(X*), sxsi_finalize_custom); + v = alloc_custom_(name); + }; + return v; +} + +template X*& Obj_val(value v) +{ + //Cannot use Data_custom_val here, it is not a correct lvalue. :-( + return (X*&) Field(v,1); +} + +void sxsi_raise_msg(char * msg); + +extern "C" { +value sxsi_cpp_init(value unit); +} + + +#endif diff --git a/src/libcamlshredder.clib b/src/libcamlshredder.clib index 02b7c35..5a8dd8b 100644 --- a/src/libcamlshredder.clib +++ b/src/libcamlshredder.clib @@ -1,2 +1,2 @@ OCamlDriver.o -XMLDocShredder.o +common_stub.o diff --git a/src/tree.ml b/src/tree.ml index 97a850c..a8a3a96 100644 --- a/src/tree.ml +++ b/src/tree.ml @@ -8,7 +8,7 @@ INCLUDE "debug.ml" INCLUDE "utils.ml" -external init_lib : unit -> unit = "caml_init_lib" +external init_lib : unit -> unit = "sxsi_cpp_init" exception CPlusPlusError of string @@ -20,6 +20,99 @@ type node = [ `Tree ] Node.t type tree + +module TreeBuilder = +struct + type t + external create : unit -> t = "caml_xml_tree_builder_create" + external open_document : t -> bool -> int -> bool -> int -> unit = "caml_xml_tree_builder_open_document" + external close_document : t -> tree = "caml_xml_tree_builder_close_document" + external open_tag : t -> string -> unit = "caml_xml_tree_builder_new_open_tag" + external close_tag : t -> string -> unit = "caml_xml_tree_builder_new_closing_tag" + external text : t -> string -> unit = "caml_xml_tree_builder_new_text" + + + let do_text b t = + if Buffer.length t > 0 then begin + open_tag b "<$>"; + text b (Buffer.contents t); + close_tag b "<$>"; + Buffer.clear t + end + + let output_attr b name value = + let atname = "<@>" ^ name in + open_tag b atname; + open_tag b "<@$>"; + text b value; + close_tag b "<@$>"; + close_tag b atname + + let start_element_handler b t tag attr_list = + do_text b t; + open_tag b tag; + match attr_list with + [] -> () + | l -> + open_tag b "<@>"; + List.iter (fun (name, value) -> output_attr b name value) l; + close_tag b "<@>" + + + let end_element_handler b t tag = + do_text b t; + close_tag b tag + + let character_data_handler _b t text = + Buffer.add_string t text + + let create_parser () = + let buf = Buffer.create 512 in + let build = create () in + let parser_ = Expat.parser_create ~encoding:None in + let finalize () = + do_text build buf; + close_tag build ""; + close_document build + in + Expat.set_start_element_handler parser_ (start_element_handler build buf); + Expat.set_end_element_handler parser_ (end_element_handler build buf); + Expat.set_character_data_handler parser_ (character_data_handler build buf); + open_document build !Options.index_empty_texts !Options.sample_factor + !Options.disable_text_collection !Options.text_index_type; + open_tag build ""; + parser_, finalize + + let parse_string s = + let parser_, finalizer = create_parser () in + Expat.parse parser_ s; + finalizer () + + let parse_file file = + let in_chan = open_in file in + let buffer = String.create 4096 in + let parser_, finalizer = create_parser () in + let () = + try + while true do + let read = input in_chan buffer 0 4096 in + if read == 0 then raise End_of_file else + Expat.parse_sub parser_ buffer 0 read; + done + + with + | End_of_file -> close_in in_chan + | e -> raise e + in + finalizer () + + + +end + + + + type bit_vector = string external bool_of_int : int -> bool = "%identity" @@ -41,8 +134,10 @@ type t = { +(* external parse_xml_uri : string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_uri" external parse_xml_string : string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_string" +*) external tree_print_xml_fast3 : tree -> [`Tree ] Node.t -> Unix.file_descr -> unit = "caml_xml_tree_print" let print_xml t n fd = tree_print_xml_fast3 t.doc n fd @@ -418,16 +513,9 @@ let node_of_t t = } -let parse f str = - node_of_t - (f str !Options.sample_factor - !Options.index_empty_texts - !Options.disable_text_collection - !Options.text_index_type - ) -let parse_xml_uri str = parse parse_xml_uri str -let parse_xml_string str = parse parse_xml_string str +let parse_xml_uri str = node_of_t (TreeBuilder.parse_file str) +let parse_xml_string str = node_of_t (TreeBuilder.parse_string str) let size t = tree_size t.doc;; @@ -503,6 +591,7 @@ let load ?(sample=64) ?(load_text=true) str = let c, s, d, f = time ~msg:"Loading tag table"(load_table) () in ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET); let xml_tree = tree_load fd str load_text sample in + Printf.eprintf "Root is: %i\n" (Obj.magic (tree_root xml_tree)); let () = Tag.init (Obj.magic xml_tree) in let attributes = Ptset.Int.add Tag.attribute d.(Tag.attribute) in let elements = Ptset.Int.add Tag.document_node -- 2.17.1