Conf.check_prog "ocamlfind" "ocamlfind printconf";;
Conf.check_prog "pkg-config" "pkg-config --version";;
-Conf.check_prog "libxml++-2.6" "pkg-config --exists libxml++-2.6" ;;
+(*Conf.check_prog "libxml++-2.6" "pkg-config --exists libxml++-2.6" ;;*)
Conf.check_prog "ulex" "ocamlfind query ulex";;
-Conf.check "XMLTree" (Conf.absolute) ("%s/src/XMLTree/libXMLTree.a") (Sys.file_exists);;
-
-let _, libxmlI, _ = Conf.exec "pkg-config --cflags-only-I libxml++-2.6";;
-let libxmlI = Conf.explode (libxmlI);;
+Conf.check_prog "expat" "ocamlfind query expat";;
-let _, libxmlL, _ = Conf.exec "pkg-config --libs-only-L libxml++-2.6";;
-let libxmlL = Conf.explode (libxmlL);;
-
-let _, libxmll, _ = Conf.exec "pkg-config --libs-only-l libxml++-2.6";;
-let libxmll = Conf.explode (libxmll);;
+Conf.check "XMLTree" (Conf.absolute) ("%s/src/XMLTree/libXMLTree.a") (Sys.file_exists);;
let libXMLTreeI = [ Conf.absolute "-I%s/src/XMLTree";
Conf.absolute "-I%s/src/XMLTree/libcds/includes";
Conf.def_str "cxx_cmd" "g++";;
-Conf.def_list "cxx_includes" (libxmlI @ libXMLTreeI @ ocamlI);;
-Conf.def_list "cxx_lpaths" (libxmlL @ libXMLTreeL);;
-Conf.def_list "cxx_libs" ( libXMLTreel @ libxmll );;
+Conf.def_list "cxx_includes" (libXMLTreeI @ ocamlI);;
+Conf.def_list "cxx_lpaths" libXMLTreeL;;
+Conf.def_list "cxx_libs" libXMLTreel;;
let src = env "%.cpp" and obj = env "%.o" in
let local_include = Depends.cxx src in
let local_dispatch = List.map (fun p -> List.map (fun p' -> p'/p) project_dirs) local_include in
- let () = ignore (build local_dispatch) in
+ let () = ignore (build local_dispatch ) in
Cmd(S[A cxx_cmd; A "-o" ; P obj; A "-c"; S !cxx_flags; cxx_include_flags; P src])
(* Native compile and link action *)
let include_path = "include";;
let src_path = "src";;
let ocaml_link = [ "dynlink"; "camlp4lib" ];;
-let ocamlfind_packages = "unix,ulex,camlp4";;
+let ocamlfind_packages = "unix,ulex,expat,camlp4";;
let cxx_flags = [ "-fno-PIC"; "-std=c++0x"; "-static" ];;
let main_targets = [ "native","src/main.native";
"byte", "src/main.byte" ];;
#include <unordered_set>
#include <algorithm>
-#include "XMLDocShredder.h"
+
#include "XMLTree.h"
+#include "XMLTreeBuilder.h"
#include "Utils.h"
+#include "common_stub.hpp"
extern "C" {
-/* OCaml memory managment */
-#include <caml/mlvalues.h>
-#include <caml/alloc.h>
-#include <caml/memory.h>
-#include <caml/callback.h>
-#include <caml/fail.h>
-#include <caml/custom.h>
-#include <caml/bigarray.h>
- //#include "results.h"
#include <stdio.h>
+}
+
+#define CAMLRAISEMSG(msg) (sxsi_raise_msg((char*) (msg)))
+
+#define XMLTREE(x) (Obj_val<XMLTree>(x))
+
+#define HSET(x) (Obj_val<TagIdSet>(x))
+
+#define XMLTREEBUILDER(x) (Obj_val<XMLTreeBuilder>(x))
+
-#define CAMLRAISEMSG(msg) (caml_raise_with_string(*cpp_exception,(msg) ))
-#define NOT_IMPLEMENTED(s) (caml_failwith(s))
-#define XMLTREE(x) ((XMLTree *)(* (XMLTree**) Data_custom_val(x)))
-#define HSET(x) ((TagIdSet*)((* (TagIdSet**) Data_custom_val(x))))
-#define TEXTCOLLECTION(x)
#define TREENODEVAL(i) ((treeNode) (Int_val(i)))
#define TAGVAL(i) ((TagType) (Int_val(i)))
#define XMLTREE_ROOT 0
#define NoAlloc
-
- static struct custom_operations ops;
- static struct custom_operations set_ops;
- static value * cpp_exception = NULL;
- static bool ops_initialized = false;
-
+extern "C" {
#include <sys/time.h>
#include <sys/resource.h>
-
-
-}
-
-extern "C" void caml_xml_tree_finalize(value tree){
- delete XMLTREE(tree);
- return;
-}
-
-extern "C" void caml_hset_finalize(value hblock){
- delete HSET(hblock);
- return;
}
-extern "C" value caml_init_lib (value unit) {
- CAMLparam1(unit);
-
- struct rlimit rlim;
-
- if (!ops_initialized){
- getrlimit(RLIMIT_STACK, &rlim);
-
- if (rlim.rlim_max == RLIM_INFINITY && rlim.rlim_cur != RLIM_INFINITY) {
- rlim.rlim_cur = RLIM_INFINITY;
- setrlimit(RLIMIT_STACK, &rlim);
- };
- ops.identifier = (char*) "XMLTree";
- ops.finalize = caml_xml_tree_finalize;
- set_ops.identifier = (char*) "unordered_set";
- set_ops.finalize = caml_hset_finalize;
-
- cpp_exception = caml_named_value("CPlusPlusError");
- if (cpp_exception == NULL){
- string s = "FATAL: Unregistered exception ";
- s += "CPlusPlusError";
- caml_failwith(s.c_str());
- };
-
- ops_initialized = true;
-
- };
- CAMLreturn(Val_unit);
-
-}
+/** XMLTreeBuilder bindings
+ *
+ */
-extern "C" value caml_shredder_parse(XMLDocShredder *shredder){
- CAMLparam0();
- CAMLlocal1(doc);
- XMLTree * tree;
- shredder->processStartDocument("");
- shredder->parse();
- shredder->processEndDocument();
- doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2);
- tree = (XMLTree *) shredder->getXMLTree();
- memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*));
- CAMLreturn(doc);
-
-}
-
-extern "C" value caml_call_shredder_uri(value uri,value sf, value iet, value dtc, value idtype){
- CAMLparam1(uri);
- CAMLlocal1(doc);
- char *fn = String_val(uri);
- XMLDocShredder * shredder;
- TextCollectionBuilder::index_type_t id;
- switch (Int_val(idtype)){
+extern "C" value caml_xml_tree_builder_create(value unit)
+{
+ CAMLparam1(unit);
+ CAMLlocal1(result);
+ result = sxsi_alloc_custom<XMLTreeBuilder>();
+ Obj_val<XMLTreeBuilder>(result) = new XMLTreeBuilder();
+ CAMLreturn(result);
+}
+
+extern "C" value caml_xml_tree_builder_open_document(value vbuilder,
+ value vet,
+ value vsrate,
+ value vdtc,
+ value vidxtype)
+{
+ CAMLparam5(vbuilder, vet, vsrate, vdtc, vidxtype);
+ bool empty_text = Bool_val(vet);
+ int sample_rate = Int_val(vsrate);
+ bool disable_tc = Bool_val(vdtc);
+ TextCollectionBuilder::index_type_t idx_type;
+ switch (Int_val(vidxtype)){
case 0:
- id = TextCollectionBuilder::index_type_default;
+ idx_type = TextCollectionBuilder::index_type_default;
break;
case 1:
- id = TextCollectionBuilder::index_type_swcsa;
+ idx_type = TextCollectionBuilder::index_type_swcsa;
break;
case 2:
- id = TextCollectionBuilder::index_type_rlcsa;
+ idx_type = TextCollectionBuilder::index_type_rlcsa;
break;
default:
- CAMLRAISEMSG("");
+ CAMLRAISEMSG("Invalid Index Type");
};
+ int res = XMLTREEBUILDER(vbuilder)->OpenDocument(empty_text,
+ sample_rate,
+ disable_tc,
+ idx_type);
+ if (res == NULLT)
+ CAMLRAISEMSG("OpenDocument");
- try {
- shredder = new XMLDocShredder(fn,Int_val(sf),Bool_val(iet),Bool_val(dtc), id);
- doc = caml_shredder_parse(shredder);
- delete shredder;
- }
- catch (const std::exception& e){ CAMLRAISEMSG(e.what()); }
- catch (string msg){ CAMLRAISEMSG(msg.c_str()); }
- catch (char const * msg){ CAMLRAISEMSG(msg); };
- CAMLreturn (doc);
+ CAMLreturn (Val_unit);
+}
+extern "C" value caml_xml_tree_builder_close_document(value vbuilder)
+{
+ CAMLparam1(vbuilder);
+ CAMLlocal1(result);
+ XMLTree * tree = XMLTREEBUILDER(vbuilder)->CloseDocument();
+ if (tree == NULL)
+ CAMLRAISEMSG("CloseDocument");
+ result = sxsi_alloc_custom<XMLTree>();
+ Obj_val<XMLTree>(result) = tree;
+ CAMLreturn (result);
}
-extern "C" value caml_call_shredder_string(value data,value sf, value iet, value dtc, value idtype){
- CAMLparam1(data);
- CAMLlocal1(doc);
- XMLDocShredder * shredder;
- unsigned int ln = caml_string_length(data);
- unsigned char *fn = (unsigned char*) String_val(data);
- TextCollectionBuilder::index_type_t id;
- switch (Int_val(idtype)){
- case 0:
- id = TextCollectionBuilder::index_type_default;
- break;
- case 1:
- id = TextCollectionBuilder::index_type_swcsa;
- break;
- case 2:
- id = TextCollectionBuilder::index_type_rlcsa;
- break;
- default:
- CAMLRAISEMSG("");
- };
- try {
+extern "C" value caml_xml_tree_builder_new_open_tag(value vbuilder, value vtag)
+{
+ CAMLparam2(vbuilder, vtag);
+ const char * tag = String_val(vtag);
+ if (XMLTREEBUILDER(vbuilder)->NewOpenTag(std::string(tag)) == NULLT)
+ CAMLRAISEMSG("NewOpenTag");
- shredder = new XMLDocShredder (fn,ln,Int_val(sf),Bool_val(iet),Bool_val(dtc), id);
- doc = caml_shredder_parse(shredder);
- delete shredder;
- }
- catch (const std::exception& e){ CAMLRAISEMSG(e.what()); }
- catch (string msg){ CAMLRAISEMSG(msg.c_str()); }
- catch (char const * msg){ CAMLRAISEMSG(msg); };
- CAMLreturn(doc);
+ CAMLreturn (Val_unit);
+}
+
+extern "C" value caml_xml_tree_builder_new_closing_tag(value vbuilder, value vtag)
+{
+ CAMLparam2(vbuilder, vtag);
+ const char * tag = String_val(vtag);
+ if (XMLTREEBUILDER(vbuilder)->NewClosingTag(std::string(tag)) == NULLT)
+ CAMLRAISEMSG("NewClosingTag");
+
+ CAMLreturn (Val_unit);
}
+extern "C" value caml_xml_tree_builder_new_text(value vbuilder, value vtext)
+{
+ CAMLparam2(vbuilder, vtext);
+ const char * text = String_val(vtext);
+ if (XMLTREEBUILDER(vbuilder)->NewText(std::string(text)) == NULLT)
+ CAMLRAISEMSG("NewText");
+
+ CAMLreturn (Val_unit);
+}
+
+
+/*************************************************************************/
+
+/**
+ * XMLTree bindings
+ * All of the functions here call the _unsafe version and implement the logics themselves
+ * (test for NULLT and so on). This avoids one indirection + one call when the tests fails.
+ */
+
+
extern "C" value caml_xml_tree_save(value tree,value fd, value name){
CAMLparam3(tree, fd, name);
XMLTREE(tree)->Save(Int_val(fd), String_val(name));
extern "C" value caml_xml_tree_load(value fd, value name, value load_tc,value sf){
CAMLparam4(fd, name, load_tc, sf);
- CAMLlocal1(doc);
+ CAMLlocal1(result);
XMLTree * tree;
try {
tree = XMLTree::Load(Int_val(fd),Bool_val(load_tc),Int_val(sf), String_val(name));
- doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2);
- memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*));
- CAMLreturn(doc);
+ result = sxsi_alloc_custom<XMLTree>();
+ Obj_val<XMLTree>(result) = tree;
+ CAMLreturn(result);
}
- catch (const xmlpp::internal_error& e){ CAMLRAISEMSG(e.what()); }
catch (const std::exception& e){ CAMLRAISEMSG(e.what()); }
- catch (string msg){ CAMLRAISEMSG(msg.c_str()); }
+ catch (std::string msg){ CAMLRAISEMSG(msg.c_str()); }
catch (char const * msg){ CAMLRAISEMSG(msg); };
+ //never reached
+ return (Val_unit);
}
-
-/*************************************************************************/
-
-/**
- * XMLTree bindings
- * All of the functions here call the _unsafe version and implement the logics themselves
- * (test for NULLT and so on). This avoids one indirection + one call when the tests fails.
- */
-
-
NoAlloc extern "C" value caml_xml_tree_root(value tree){
return (Val_int(XMLTREE_ROOT));
}
extern "C" value caml_unordered_set_alloc(value unit){
CAMLparam1(unit);
CAMLlocal1(hset);
- hset = caml_alloc_custom(&set_ops,sizeof(TagIdSet*),1,2);
- TagIdSet* ht = new TagIdSet();
- memcpy(Data_custom_val(hset),&ht,sizeof(TagIdSet*));
+ hset = sxsi_alloc_custom<TagIdSet>();
+ Obj_val<TagIdSet>(hset) = new TagIdSet();
CAMLreturn (hset);
}
CAMLreturn ( Val_int((XMLTREE(tree))->EmptyText((DocID) Int_val(id))));
}
-bool docId_comp(DocID x, DocID y) { return x < y; };
+bool docId_comp(DocID x, DocID y) { return x < y; }
/**
* Existential queries
+++ /dev/null
-/**********************************************************
- * XMLDocShredder.cpp
- * ---------------------
- * Implementation of the class that receives events from the XML parser and
- * invokes corresponding construction methods of the storage interface.
- *
- * Author: Greg Leighton
- * Date: 02/11/08
- * Changes:
- * 05/11/08 -- Fixed bug related to parsing empty elements
- * -- Set parser properties to automatically resolve
- * entity references and load external DTD if present
- * -- Modified processEndDocument() by adding a nodeFinished()
- * call to the storage interface to close off the
- * document node
- *
- */
-
-#include <iostream>
-#include "XMLDocShredder.h"
-#include <libxml++/exceptions/parse_error.h>
-#include "Utils.h"
-
-using namespace Glib;
-
-void XMLDocShredder::doText(){
-
- if (!buffer.empty()){
- tb->NewOpenTag(PCDATA_OPEN_TAG);
- tb->NewText(buffer);
- tb->NewClosingTag(PCDATA_OPEN_TAG);
- };
- buffer.clear();
-
-}
-
-void XMLDocShredder::setProperties(){
- /* instruct the parser to expand entity references and report as
- * regular PCDATA
- */
- reader_->set_parser_property(
- TextReader::SubstEntities, true);
-
- /* instruct parser to read external DTD, if present. This is
- * needed to obtain any entity definitions in the DTD
- */
- reader_->set_parser_property(
- TextReader::LoadDtd, true);
-
-
- /*
- */
- reader_->set_parser_property(
- TextReader::DefaultAttrs, true);
-
-
- /* but we don't want to do validation since it would slow us down
- */
-
-
- reader_->set_parser_property(
- TextReader::Validate, false);
-
-}
-XMLDocShredder::XMLDocShredder(const unsigned char * data,
- TextReader::size_type size,
- int sf,
- bool iet,
- bool dtc,
- TextCollectionBuilder::index_type_t index_type
- )
-{
- tree = NULL;
- reader_ = new TextReader(data,size,"");
- setProperties();
- tb = new XMLTreeBuilder();
- buffer.clear();
- tb->OpenDocument(iet,sf,dtc, index_type);
-}
-
-XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc,
- TextCollectionBuilder::index_type_t index_type
- )
-{
- tree = NULL;
- reader_ = new TextReader(inFileName);
- setProperties();
- tb = new XMLTreeBuilder();
- buffer.clear();
- tb->OpenDocument(iet,sf,dtc,index_type);
-}
-
-XMLDocShredder::~XMLDocShredder()
-{
- delete reader_;
- reader_ = NULL;
- delete tb;
- tb = NULL;
-
-}
-
-
-void XMLDocShredder::processStartElement()
-{
- doText();
- // fetch element name; this will be the full qualified name
- ustring name = reader_->get_name();
- bool empty = false;
- size_t found = name.find_first_of(':');
- if (found == ustring::npos)
- tb->NewOpenTag(name);
- else
- tb->NewOpenTag(name.substr(found+1,name.length() - found - 1));
-
- /* We must be really carefull here. calling process attributes moves
- the document pointer on the last attribute, hence calling reader_->is_empty
- afterwards will yield the wrong result. It is better to call it while we are
- on the element and generate a nodeFinished() call at the end */
- empty = reader_->is_empty_element();
-
-
- // now, process attributes
- if (reader_->has_attributes())
- processAttributes();
-
-
- if (empty)
- tb->NewClosingTag(name);
-
-
-}
-
-void XMLDocShredder::processEndElement()
-{
- doText();
- ustring name = reader_->get_name();
- tb->NewClosingTag(name);
-}
-
-void XMLDocShredder::processPCDATA()
-{
- // send the content of this PCDATA node to the storage interface as a text node
- if (reader_->has_value())
- buffer += reader_->get_value();
-
-}
-
-void XMLDocShredder::processAttributes()
-{
- reader_->move_to_first_attribute();
-
- string nspaceStr = "xmlns";
- tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
- do
- {
- ustring name = reader_->get_name();
- ustring value = reader_->get_value();
-
- /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
- * so we have to extract it and build a namespace uri node out of it before
- * passing to the storage interface */
-
- if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
- {
- //TODO
- }
-
- /* otherwise, this is an ordinary attribute, so we construct a new child node of the
- * parent element to store the attribute name, possessing a child text node storing the
- * attribute value. Then, we close off the attribute node with a call to nodeFinished()
- */
-
- else
- {
- string attname = "<@>"+name;
- tb->NewOpenTag(attname);
- tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
- tb->NewText(value);
- tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
- tb->NewClosingTag(attname);
- }
- }
- while (reader_->move_to_next_attribute());
- tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
-}
-
-void XMLDocShredder::processSignificantWhitespace()
-{
- if (reader_->has_value())
- buffer += reader_->get_value();
-
-}
-
-void XMLDocShredder::processStartDocument(const string docName)
-{
- // tell storage interface to construct the document name
-
- tb->NewOpenTag(DOCUMENT_OPEN_TAG);
-
-}
-
-void XMLDocShredder::processEndDocument()
-{
- doText();
- /* tell the storage interface that document parsing has finished, and structures
- * can now be written to disk. */
- tb->NewClosingTag(DOCUMENT_OPEN_TAG);
- tree = tb->CloseDocument();
-
-}
-
-void XMLDocShredder::processComment()
-{
- //storageIfc_->newChild("!" + reader_->get_value());
- //storageIfc_->nodeFinished();
-}
-
-void XMLDocShredder::processProcessingInstruction()
-{
- ustring name = reader_->get_name();
- ustring value = reader_->get_value();
-
- /* Create a child node to store the target of the PI, append a text node to it to store
- * the PI data, send to the storage interface. Close off the PI node with a call to
- * nodeFinished
- */
-
- // storageIfc_->newChild("?" + name);
- // storageIfc_->newText(value);
- // storageIfc_->nodeFinished();
-}
-
-void XMLDocShredder::processDocTypeDeclaration()
-{
- /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton
- * in case we do want to process it in the future.
- */
-}
-
-void XMLDocShredder::processCDATASection()
-{
- /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
- * model. Instead, we simply pass the converted text value to the storage interface as
- * a text node attached to the current context node.
- */
- if (reader_->has_value())
- buffer+= reader_->get_value();
-}
-
-void XMLDocShredder::processUnknownNodeType()
-{
- cout << "unknown token encountered during parsing" << endl;
- throw xmlpp::parse_error("unknown token encountered during parsing");
-
-}
-
-void XMLDocShredder::parse()
-{
- while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
- {
- switch (reader_->get_node_type())
- {
- case TextReader::Element:
- processStartElement();
- break;
-
- case TextReader::Text:
- processPCDATA();
- break;
-
- case TextReader::EndElement:
- processEndElement();
- break;
-
- case TextReader::SignificantWhitespace:
- processSignificantWhitespace();
- break;
-
- case TextReader::Comment:
- processComment();
- break;
-
- case TextReader::DocumentType:
- processDocTypeDeclaration();
- break;
-
- case TextReader::ProcessingInstruction:
- processProcessingInstruction();
- break;
-
- case TextReader::CDATA:
- processCDATASection();
- break;
-
- case TextReader::None:
- processUnknownNodeType();
- break;
-
- default:
- int type = reader_->get_node_type();
- cout << " Node type: " << type << endl;
- break;
-
- }
- }
-}
+++ /dev/null
-/**************************************
- * XMLDocShredder.h
- * --------------------
- * Header file for the shredder routine that invokes the XML parser and
- * calls the appropriate construction methods of the storage interface in
- * correspondence with received parsing events.
- *
- * Author: Greg Leighton
- * Date: 02/11/08
- */
-
-#ifndef XMLDOCSHREDDER_H_
-#define XMLDOCSHREDDER_H_
-
-#include <libxml++/libxml++.h>
-#include <libxml++/parsers/textreader.h>
-#include <string>
-#include "XMLTree.h"
-#include "XMLTreeBuilder.h"
-
-using namespace std;
-using namespace xmlpp;
-
-
-class XMLDocShredder
-{
- void doText();
-public:
- XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc,
- TextCollectionBuilder::index_type_t index_type
- );
- XMLDocShredder(const unsigned char * data,
- TextReader::size_type size,
- int sf, bool iet, bool dtc,
- TextCollectionBuilder::index_type_t index_type
- );
- virtual ~XMLDocShredder();
- virtual void processStartElement();
- virtual void processEndElement();
- virtual void processPCDATA();
- virtual void processAttributes();
- virtual void processSignificantWhitespace();
- virtual void processStartDocument(const string docName);
- virtual void processEndDocument();
- virtual void processComment();
- virtual void processProcessingInstruction();
- virtual void processDocTypeDeclaration();
- virtual void processUnknownNodeType();
- virtual void processCDATASection();
- virtual void parse();
-
- XMLTree * getXMLTree(){
- return tree;
- }
-
-
- private:
- XMLTreeBuilder * tb;
- XMLTree * tree;
- TextReader *reader_;
- void setProperties();
- bool last_text;
- string buffer;
- // used to coalece successive text events
- // which can occur if we discard pi and comment nodes.
-};
-
-#endif /*XMLDOCSHREDDER_H_*/
--- /dev/null
+#include <unordered_map>
+#include <string>
+
+#include "common_stub.hpp"
+
+extern "C" {
+#include <sys/time.h>
+#include <sys/resource.h>
+}
+
+using std::string;
+using std::pair;
+using std::unordered_map;
+using std::make_pair;
+
+typedef unordered_map<string, pair<struct custom_operations*, size_t>> type_map_t;
+static type_map_t * type_map = 0;
+static value * cpp_exception = 0;
+
+static void init_error()
+{
+ caml_failwith("C++: initialization error");
+}
+
+static void init_exception()
+{
+ cpp_exception = caml_named_value("CPlusPlusError");
+ if (cpp_exception == 0)
+ init_error();
+}
+
+static void init_type_map()
+{
+ if (type_map == 0)
+ type_map = new type_map_t();
+ if (type_map == 0)
+ init_error();
+}
+
+void register_custom_(char* name,
+ size_t size,
+ void (*finalize)(value v))
+{
+ if (type_map == 0) init_error();
+ struct custom_operations * ops =
+ (struct custom_operations*) calloc(1, sizeof(struct custom_operations));
+ ops->identifier = name;
+ ops->finalize = finalize;
+ ops->compare = custom_compare_default;
+ ops->hash = custom_hash_default;
+ ops->serialize = custom_serialize_default;
+ ops->deserialize = custom_deserialize_default;
+ type_map->insert(make_pair(string(name), make_pair(ops, size)));
+}
+
+value alloc_custom_(char* name)
+{
+ CAMLparam0();
+ CAMLlocal1(result);
+ if (type_map == 0) init_error();
+ string key = string(name);
+ type_map_t::iterator it = type_map->find(key);
+ if (it == type_map->end())
+ result = Val_unit;
+ else
+ result = caml_alloc_custom(it->second.first, it->second.second, 1, 2);
+
+ CAMLreturn(result);
+}
+
+
+extern "C" value sxsi_cpp_init(value unit)
+{
+ struct rlimit rlim;
+ init_exception();
+ init_type_map();
+
+ /* Set the stack space to unlimited */
+ getrlimit(RLIMIT_STACK, &rlim);
+ if (rlim.rlim_max == RLIM_INFINITY && rlim.rlim_cur != RLIM_INFINITY) {
+ rlim.rlim_cur = RLIM_INFINITY;
+ setrlimit(RLIMIT_STACK, &rlim);
+ };
+
+ return Val_unit;
+}
+
+void sxsi_raise_msg(char * msg)
+{
+ if (cpp_exception == 0) init_error();
+ caml_raise_with_string(*cpp_exception, msg);
+}
--- /dev/null
+#ifndef COMMON_STUB_H_
+#define COMMON_STUB_H_
+
+extern "C" {
+#define CAML_NAME_SPACE
+#include <stdlib.h>
+#include <caml/mlvalues.h>
+#include <caml/alloc.h>
+#include <caml/memory.h>
+#include <caml/callback.h>
+#include <caml/fail.h>
+#include <caml/custom.h>
+#include <caml/bigarray.h>
+#include <stdio.h>
+}
+#include <iostream>
+#include <typeinfo>
+
+void register_custom_(char* name,
+ size_t size,
+ void (*finalize)(value v));
+
+value alloc_custom_(char* name);
+
+
+template <class X> void sxsi_finalize_custom(value v)
+{
+ X * obj = * ((X **) Data_custom_val(v));
+ std::cerr << "Finalizing object: " << typeid(X).name() << std::endl;
+ std::cerr.flush();
+ delete obj;
+}
+
+template <class X> value sxsi_alloc_custom()
+{
+ char * name = const_cast<char*>(typeid(X).name());
+ value v = alloc_custom_(name);
+ if (v == Val_unit) {
+ register_custom_(name, sizeof(X*), sxsi_finalize_custom<X>);
+ v = alloc_custom_(name);
+ };
+ return v;
+}
+
+template <class X> X*& Obj_val(value v)
+{
+ //Cannot use Data_custom_val here, it is not a correct lvalue. :-(
+ return (X*&) Field(v,1);
+}
+
+void sxsi_raise_msg(char * msg);
+
+extern "C" {
+value sxsi_cpp_init(value unit);
+}
+
+
+#endif
OCamlDriver.o
-XMLDocShredder.o
+common_stub.o
INCLUDE "utils.ml"
-external init_lib : unit -> unit = "caml_init_lib"
+external init_lib : unit -> unit = "sxsi_cpp_init"
exception CPlusPlusError of string
type tree
+
+module TreeBuilder =
+struct
+ type t
+ external create : unit -> t = "caml_xml_tree_builder_create"
+ external open_document : t -> bool -> int -> bool -> int -> unit = "caml_xml_tree_builder_open_document"
+ external close_document : t -> tree = "caml_xml_tree_builder_close_document"
+ external open_tag : t -> string -> unit = "caml_xml_tree_builder_new_open_tag"
+ external close_tag : t -> string -> unit = "caml_xml_tree_builder_new_closing_tag"
+ external text : t -> string -> unit = "caml_xml_tree_builder_new_text"
+
+
+ let do_text b t =
+ if Buffer.length t > 0 then begin
+ open_tag b "<$>";
+ text b (Buffer.contents t);
+ close_tag b "<$>";
+ Buffer.clear t
+ end
+
+ let output_attr b name value =
+ let atname = "<@>" ^ name in
+ open_tag b atname;
+ open_tag b "<@$>";
+ text b value;
+ close_tag b "<@$>";
+ close_tag b atname
+
+ let start_element_handler b t tag attr_list =
+ do_text b t;
+ open_tag b tag;
+ match attr_list with
+ [] -> ()
+ | l ->
+ open_tag b "<@>";
+ List.iter (fun (name, value) -> output_attr b name value) l;
+ close_tag b "<@>"
+
+
+ let end_element_handler b t tag =
+ do_text b t;
+ close_tag b tag
+
+ let character_data_handler _b t text =
+ Buffer.add_string t text
+
+ let create_parser () =
+ let buf = Buffer.create 512 in
+ let build = create () in
+ let parser_ = Expat.parser_create ~encoding:None in
+ let finalize () =
+ do_text build buf;
+ close_tag build "";
+ close_document build
+ in
+ Expat.set_start_element_handler parser_ (start_element_handler build buf);
+ Expat.set_end_element_handler parser_ (end_element_handler build buf);
+ Expat.set_character_data_handler parser_ (character_data_handler build buf);
+ open_document build !Options.index_empty_texts !Options.sample_factor
+ !Options.disable_text_collection !Options.text_index_type;
+ open_tag build "";
+ parser_, finalize
+
+ let parse_string s =
+ let parser_, finalizer = create_parser () in
+ Expat.parse parser_ s;
+ finalizer ()
+
+ let parse_file file =
+ let in_chan = open_in file in
+ let buffer = String.create 4096 in
+ let parser_, finalizer = create_parser () in
+ let () =
+ try
+ while true do
+ let read = input in_chan buffer 0 4096 in
+ if read == 0 then raise End_of_file else
+ Expat.parse_sub parser_ buffer 0 read;
+ done
+
+ with
+ | End_of_file -> close_in in_chan
+ | e -> raise e
+ in
+ finalizer ()
+
+
+
+end
+
+
+
+
type bit_vector = string
external bool_of_int : int -> bool = "%identity"
+(*
external parse_xml_uri : string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_uri"
external parse_xml_string : string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_string"
+*)
external tree_print_xml_fast3 : tree -> [`Tree ] Node.t -> Unix.file_descr -> unit = "caml_xml_tree_print"
let print_xml t n fd =
tree_print_xml_fast3 t.doc n fd
}
-let parse f str =
- node_of_t
- (f str !Options.sample_factor
- !Options.index_empty_texts
- !Options.disable_text_collection
- !Options.text_index_type
- )
-let parse_xml_uri str = parse parse_xml_uri str
-let parse_xml_string str = parse parse_xml_string str
+let parse_xml_uri str = node_of_t (TreeBuilder.parse_file str)
+let parse_xml_string str = node_of_t (TreeBuilder.parse_string str)
let size t = tree_size t.doc;;
let c, s, d, f = time ~msg:"Loading tag table"(load_table) () in
ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
let xml_tree = tree_load fd str load_text sample in
+ Printf.eprintf "Root is: %i\n" (Obj.magic (tree_root xml_tree));
let () = Tag.init (Obj.magic xml_tree) in
let attributes = Ptset.Int.add Tag.attribute d.(Tag.attribute) in
let elements = Ptset.Int.add Tag.document_node