Refactoring, 1st tier:
authorkim <kim@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Sat, 3 Dec 2011 21:52:16 +0000 (21:52 +0000)
committerkim <kim@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Sat, 3 Dec 2011 21:52:16 +0000 (21:52 +0000)
- replace libxml++-2.6 by ocaml-expat
- kill XMLDocSchredder.*
- move common OCaml/C++ utility functions in common_stub.{hpp,cpp}

git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/xpathcomp@1179 3cdefd35-fc62-479d-8e8d-bae585ffb9ca

configure
myocamlbuild.ml
myocamlbuild_config.ml.in
src/OCamlDriver.cpp
src/XMLDocShredder.cpp [deleted file]
src/XMLDocShredder.h [deleted file]
src/common_stub.cpp [new file with mode: 0644]
src/common_stub.hpp [new file with mode: 0644]
src/libcamlshredder.clib
src/tree.ml

index c985457..6d86ab3 100755 (executable)
--- a/configure
+++ b/configure
@@ -14,18 +14,11 @@ Conf.check_prog "ocamlbuild" "ocamlbuild -version";;
 Conf.check_prog "ocamlfind" "ocamlfind printconf";;
 Conf.check_prog "pkg-config" "pkg-config --version";;
 
-Conf.check_prog "libxml++-2.6" "pkg-config --exists libxml++-2.6" ;;
+(*Conf.check_prog "libxml++-2.6" "pkg-config --exists libxml++-2.6" ;;*)
 Conf.check_prog "ulex" "ocamlfind query ulex";;
-Conf.check "XMLTree" (Conf.absolute) ("%s/src/XMLTree/libXMLTree.a") (Sys.file_exists);;
-
-let _, libxmlI, _ = Conf.exec "pkg-config --cflags-only-I libxml++-2.6";;
-let libxmlI = Conf.explode (libxmlI);;
+Conf.check_prog "expat" "ocamlfind query expat";;
 
-let _, libxmlL, _ = Conf.exec "pkg-config --libs-only-L libxml++-2.6";;
-let libxmlL = Conf.explode (libxmlL);;
-
-let _, libxmll, _ = Conf.exec "pkg-config --libs-only-l libxml++-2.6";;
-let libxmll = Conf.explode (libxmll);;
+Conf.check "XMLTree" (Conf.absolute) ("%s/src/XMLTree/libXMLTree.a") (Sys.file_exists);;
 
 let libXMLTreeI = [ Conf.absolute "-I%s/src/XMLTree";
                          Conf.absolute "-I%s/src/XMLTree/libcds/includes";
@@ -39,9 +32,9 @@ let ocamlI = [ "-I" ^ ocamlI ];;
 
 Conf.def_str "cxx_cmd" "g++";;
 
-Conf.def_list "cxx_includes" (libxmlI @ libXMLTreeI @ ocamlI);;
-Conf.def_list "cxx_lpaths" (libxmlL @ libXMLTreeL);;
-Conf.def_list "cxx_libs" ( libXMLTreel @ libxmll );;
+Conf.def_list "cxx_includes" (libXMLTreeI @ ocamlI);;
+Conf.def_list "cxx_lpaths" libXMLTreeL;;
+Conf.def_list "cxx_libs" libXMLTreel;;
 
 
 
index 23901cd..41c25b5 100644 (file)
@@ -106,7 +106,7 @@ let cxx_compile env build =
   let src = env "%.cpp" and obj = env "%.o" in
   let local_include = Depends.cxx src in
   let local_dispatch = List.map (fun p -> List.map (fun p' -> p'/p) project_dirs) local_include in
-  let  () = ignore (build local_dispatch) in
+  let  () = ignore (build local_dispatch ) in
   Cmd(S[A cxx_cmd; A "-o" ; P obj; A "-c";  S !cxx_flags; cxx_include_flags; P src])
 
 (* Native compile and link action *)
index ebe7675..d8e98c9 100644 (file)
@@ -2,7 +2,7 @@ let ocaml_inline = "1000";;
 let include_path = "include";;
 let src_path = "src";;
 let ocaml_link = [ "dynlink"; "camlp4lib" ];;
-let ocamlfind_packages = "unix,ulex,camlp4";;
+let ocamlfind_packages = "unix,ulex,expat,camlp4";;
 let cxx_flags = [ "-fno-PIC"; "-std=c++0x"; "-static" ];;
 let main_targets = [ "native","src/main.native";
                     "byte", "src/main.byte" ];;
index 275758a..8eb3a22 100644 (file)
 
 #include <unordered_set>
 #include <algorithm>
-#include "XMLDocShredder.h"
+
 #include "XMLTree.h"
+#include "XMLTreeBuilder.h"
 #include "Utils.h"
+#include "common_stub.hpp"
 
 extern "C" {
-/* OCaml memory managment */
-#include <caml/mlvalues.h>
-#include <caml/alloc.h>
-#include <caml/memory.h>
-#include <caml/callback.h>
-#include <caml/fail.h>
-#include <caml/custom.h>
-#include <caml/bigarray.h>
-  //#include "results.h"
 #include <stdio.h>
+}
+
+#define CAMLRAISEMSG(msg) (sxsi_raise_msg((char*) (msg)))
+
+#define XMLTREE(x) (Obj_val<XMLTree>(x))
+
+#define HSET(x) (Obj_val<TagIdSet>(x))
+
+#define XMLTREEBUILDER(x) (Obj_val<XMLTreeBuilder>(x))
+
 
-#define CAMLRAISEMSG(msg) (caml_raise_with_string(*cpp_exception,(msg) ))
-#define NOT_IMPLEMENTED(s)  (caml_failwith(s))
-#define XMLTREE(x) ((XMLTree *)(* (XMLTree**) Data_custom_val(x)))
-#define HSET(x) ((TagIdSet*)((* (TagIdSet**) Data_custom_val(x))))
-#define TEXTCOLLECTION(x)
 #define TREENODEVAL(i) ((treeNode) (Int_val(i)))
 #define TAGVAL(i) ((TagType) (Int_val(i)))
 #define XMLTREE_ROOT 0
 #define NoAlloc
 
-
-  static struct custom_operations ops;
-  static struct custom_operations set_ops;
-  static value * cpp_exception = NULL;
-  static bool ops_initialized = false;
-
+extern "C" {
 #include <sys/time.h>
 #include <sys/resource.h>
-
-
-}
-
-extern "C" void caml_xml_tree_finalize(value tree){
-  delete XMLTREE(tree);
-  return;
-}
-
-extern "C" void caml_hset_finalize(value hblock){
-  delete HSET(hblock);
-  return;
 }
 
-extern "C"  value caml_init_lib (value unit) {
-  CAMLparam1(unit);
-
-  struct rlimit rlim;
-
-  if (!ops_initialized){
 
-    getrlimit(RLIMIT_STACK, &rlim);
-
-    if (rlim.rlim_max == RLIM_INFINITY && rlim.rlim_cur != RLIM_INFINITY) {
-      rlim.rlim_cur = RLIM_INFINITY;
-      setrlimit(RLIMIT_STACK, &rlim);
-    };
 
-    ops.identifier = (char*) "XMLTree";
-    ops.finalize = caml_xml_tree_finalize;
-    set_ops.identifier = (char*) "unordered_set";
-    set_ops.finalize = caml_hset_finalize;
-
-    cpp_exception = caml_named_value("CPlusPlusError");
-    if (cpp_exception == NULL){
-      string s = "FATAL: Unregistered exception ";
-    s += "CPlusPlusError";
-    caml_failwith(s.c_str());
-    };
-
-    ops_initialized = true;
-
-  };
-  CAMLreturn(Val_unit);
-
-}
+/** XMLTreeBuilder bindings
+ *
+ */
 
-extern "C"  value caml_shredder_parse(XMLDocShredder *shredder){
-  CAMLparam0();
-  CAMLlocal1(doc);
-  XMLTree * tree;
-  shredder->processStartDocument("");
-  shredder->parse();
-  shredder->processEndDocument();
-  doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2);
-  tree = (XMLTree *) shredder->getXMLTree();
-  memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*));
-  CAMLreturn(doc);
-
-}
-
-extern "C"  value caml_call_shredder_uri(value uri,value sf, value iet, value dtc, value idtype){
-  CAMLparam1(uri);
-  CAMLlocal1(doc);
-  char *fn = String_val(uri);
-  XMLDocShredder * shredder;
-  TextCollectionBuilder::index_type_t id;
-  switch (Int_val(idtype)){
+extern "C" value caml_xml_tree_builder_create(value unit)
+{
+  CAMLparam1(unit);
+  CAMLlocal1(result);
+  result = sxsi_alloc_custom<XMLTreeBuilder>();
+  Obj_val<XMLTreeBuilder>(result) = new XMLTreeBuilder();
+  CAMLreturn(result);
+}
+
+extern "C" value caml_xml_tree_builder_open_document(value vbuilder,
+                                                    value vet,
+                                                    value vsrate,
+                                                    value vdtc,
+                                                    value vidxtype)
+{
+  CAMLparam5(vbuilder, vet, vsrate, vdtc, vidxtype);
+  bool empty_text = Bool_val(vet);
+  int sample_rate = Int_val(vsrate);
+  bool disable_tc = Bool_val(vdtc);
+  TextCollectionBuilder::index_type_t idx_type;
+  switch (Int_val(vidxtype)){
   case 0:
-    id = TextCollectionBuilder::index_type_default;
+    idx_type = TextCollectionBuilder::index_type_default;
     break;
   case 1:
-    id = TextCollectionBuilder::index_type_swcsa;
+    idx_type = TextCollectionBuilder::index_type_swcsa;
     break;
   case 2:
-    id = TextCollectionBuilder::index_type_rlcsa;
+    idx_type = TextCollectionBuilder::index_type_rlcsa;
     break;
   default:
-    CAMLRAISEMSG("");
+    CAMLRAISEMSG("Invalid Index Type");
   };
+  int res = XMLTREEBUILDER(vbuilder)->OpenDocument(empty_text,
+                                                  sample_rate,
+                                                  disable_tc,
+                                                  idx_type);
+  if (res == NULLT)
+    CAMLRAISEMSG("OpenDocument");
 
-  try {
-    shredder = new XMLDocShredder(fn,Int_val(sf),Bool_val(iet),Bool_val(dtc), id);
-    doc = caml_shredder_parse(shredder);
-    delete shredder;
-  }
-  catch (const std::exception& e){ CAMLRAISEMSG(e.what()); }
-  catch (string  msg){  CAMLRAISEMSG(msg.c_str()); }
-  catch (char const * msg){ CAMLRAISEMSG(msg);  };
-  CAMLreturn (doc);
+  CAMLreturn (Val_unit);
+}
 
+extern "C" value caml_xml_tree_builder_close_document(value vbuilder)
+{
+  CAMLparam1(vbuilder);
+  CAMLlocal1(result);
+  XMLTree * tree = XMLTREEBUILDER(vbuilder)->CloseDocument();
+  if (tree == NULL)
+    CAMLRAISEMSG("CloseDocument");
+  result = sxsi_alloc_custom<XMLTree>();
+  Obj_val<XMLTree>(result) = tree;
+  CAMLreturn (result);
 }
-extern "C"  value caml_call_shredder_string(value data,value sf, value iet, value dtc, value idtype){
-  CAMLparam1(data);
-  CAMLlocal1(doc);
-  XMLDocShredder * shredder;
-  unsigned int ln = caml_string_length(data);
-  unsigned char *fn = (unsigned char*) String_val(data);
-  TextCollectionBuilder::index_type_t id;
-  switch (Int_val(idtype)){
-  case 0:
-    id = TextCollectionBuilder::index_type_default;
-    break;
-  case 1:
-    id = TextCollectionBuilder::index_type_swcsa;
-    break;
-  case 2:
-    id = TextCollectionBuilder::index_type_rlcsa;
-    break;
-  default:
-    CAMLRAISEMSG("");
-  };
 
-  try {
+extern "C" value caml_xml_tree_builder_new_open_tag(value vbuilder, value vtag)
+{
+  CAMLparam2(vbuilder, vtag);
+  const char * tag = String_val(vtag);
+  if (XMLTREEBUILDER(vbuilder)->NewOpenTag(std::string(tag)) == NULLT)
+    CAMLRAISEMSG("NewOpenTag");
 
-    shredder = new  XMLDocShredder (fn,ln,Int_val(sf),Bool_val(iet),Bool_val(dtc), id);
-    doc = caml_shredder_parse(shredder);
-    delete shredder;
-  }
-  catch (const std::exception& e){ CAMLRAISEMSG(e.what()); }
-  catch (string  msg){  CAMLRAISEMSG(msg.c_str()); }
-  catch (char const * msg){ CAMLRAISEMSG(msg);  };
-  CAMLreturn(doc);
+  CAMLreturn (Val_unit);
+}
+
+extern "C" value caml_xml_tree_builder_new_closing_tag(value vbuilder, value vtag)
+{
+  CAMLparam2(vbuilder, vtag);
+  const char * tag = String_val(vtag);
+  if (XMLTREEBUILDER(vbuilder)->NewClosingTag(std::string(tag)) == NULLT)
+    CAMLRAISEMSG("NewClosingTag");
+
+  CAMLreturn (Val_unit);
 }
 
+extern "C" value caml_xml_tree_builder_new_text(value vbuilder, value vtext)
+{
+  CAMLparam2(vbuilder, vtext);
+  const char * text = String_val(vtext);
+  if (XMLTREEBUILDER(vbuilder)->NewText(std::string(text)) == NULLT)
+    CAMLRAISEMSG("NewText");
+
+  CAMLreturn (Val_unit);
+}
+
+
+/*************************************************************************/
+
+/**
+ *  XMLTree bindings
+ *  All of the functions here call the _unsafe version and implement the logics themselves
+ *  (test for NULLT and so on). This avoids one indirection + one call when the tests fails.
+ */
+
+
 extern "C"  value caml_xml_tree_save(value tree,value fd, value name){
   CAMLparam3(tree, fd, name);
   XMLTREE(tree)->Save(Int_val(fd), String_val(name));
@@ -184,31 +155,22 @@ extern "C"  value caml_xml_tree_save(value tree,value fd, value name){
 
 extern "C"  value caml_xml_tree_load(value fd, value name, value load_tc,value sf){
   CAMLparam4(fd, name, load_tc, sf);
-  CAMLlocal1(doc);
+  CAMLlocal1(result);
   XMLTree * tree;
   try {
     tree = XMLTree::Load(Int_val(fd),Bool_val(load_tc),Int_val(sf), String_val(name));
-    doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2);
-    memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*));
-    CAMLreturn(doc);
+    result = sxsi_alloc_custom<XMLTree>();
+    Obj_val<XMLTree>(result) = tree;
+    CAMLreturn(result);
   }
-  catch (const xmlpp::internal_error& e){ CAMLRAISEMSG(e.what()); }
   catch (const std::exception& e){ CAMLRAISEMSG(e.what()); }
-  catch (string  msg){  CAMLRAISEMSG(msg.c_str()); }
+  catch (std::string msg){  CAMLRAISEMSG(msg.c_str()); }
   catch (char const * msg){ CAMLRAISEMSG(msg);  };
+  //never reached
+  return (Val_unit);
 }
 
 
-
-/*************************************************************************/
-
-/**
- *  XMLTree bindings
- *  All of the functions here call the _unsafe version and implement the logics themselves
- *  (test for NULLT and so on). This avoids one indirection + one call when the tests fails.
- */
-
-
 NoAlloc extern "C"  value caml_xml_tree_root(value tree){
   return (Val_int(XMLTREE_ROOT));
 }
@@ -454,9 +416,8 @@ NoAlloc extern "C"  value caml_unordered_set_length(value hset){
 extern "C"  value caml_unordered_set_alloc(value unit){
   CAMLparam1(unit);
   CAMLlocal1(hset);
-  hset = caml_alloc_custom(&set_ops,sizeof(TagIdSet*),1,2);
-  TagIdSet* ht = new TagIdSet();
-  memcpy(Data_custom_val(hset),&ht,sizeof(TagIdSet*));
+  hset = sxsi_alloc_custom<TagIdSet>();
+  Obj_val<TagIdSet>(hset) = new TagIdSet();
   CAMLreturn (hset);
 }
 
@@ -761,7 +722,7 @@ extern "C"  value caml_text_collection_empty_text(value tree,value id){
   CAMLreturn ( Val_int((XMLTREE(tree))->EmptyText((DocID) Int_val(id))));
 }
 
-bool docId_comp(DocID x, DocID y) { return x < y; };
+bool docId_comp(DocID x, DocID y) { return x < y; }
 
 /**
  * Existential queries
diff --git a/src/XMLDocShredder.cpp b/src/XMLDocShredder.cpp
deleted file mode 100644 (file)
index ffe4de7..0000000
+++ /dev/null
@@ -1,306 +0,0 @@
-/**********************************************************
- * XMLDocShredder.cpp
- * ---------------------
- * Implementation of the class that receives events from the XML parser and 
- * invokes corresponding construction methods of the storage interface.
- * 
- * Author: Greg Leighton
- * Date: 02/11/08
- * Changes:
- *             05/11/08 -- Fixed bug related to parsing empty elements
- *                              -- Set parser properties to automatically resolve
- *                                     entity references and load external DTD if present
- *                              -- Modified processEndDocument() by adding a nodeFinished()
- *                                     call to the storage interface to close off the 
- *                                     document node
- *
- */
-
-#include <iostream>
-#include "XMLDocShredder.h"
-#include <libxml++/exceptions/parse_error.h>
-#include "Utils.h"
-
-using namespace Glib;
-
-void XMLDocShredder::doText(){
-
-  if (!buffer.empty()){
-    tb->NewOpenTag(PCDATA_OPEN_TAG);
-    tb->NewText(buffer);
-    tb->NewClosingTag(PCDATA_OPEN_TAG);
-  };
-  buffer.clear();
-
-}
-
-void XMLDocShredder::setProperties(){
-  /* instruct the parser to expand entity references and report as 
-   * regular PCDATA
-   */ 
-  reader_->set_parser_property(
-                              TextReader::SubstEntities, true);
-               
-  /* instruct parser to read external DTD, if present.  This is 
-        * needed to obtain any entity definitions in the DTD
-        */
-  reader_->set_parser_property(
-                              TextReader::LoadDtd, true);
-  
-  
-  /* 
-   */
-  reader_->set_parser_property(
-                              TextReader::DefaultAttrs, true);
-  
-
-  /* but we don't want to do validation since it would slow us down
-   */
-
-
-  reader_->set_parser_property(
-                              TextReader::Validate, false);
-  
-}
-XMLDocShredder::XMLDocShredder(const unsigned char * data,
-                              TextReader::size_type size,
-                              int sf, 
-                              bool iet, 
-                              bool dtc,
-                              TextCollectionBuilder::index_type_t index_type
-                              )                        
-{
-  tree = NULL;
-  reader_ = new TextReader(data,size,"");
-  setProperties();
-  tb  = new XMLTreeBuilder();
-  buffer.clear();
-  tb->OpenDocument(iet,sf,dtc, index_type);
-}
-
-XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc,
-                              TextCollectionBuilder::index_type_t index_type
-                              )
-{
-  tree = NULL;
-  reader_ = new TextReader(inFileName);
-  setProperties();
-  tb = new XMLTreeBuilder();
-  buffer.clear();
-  tb->OpenDocument(iet,sf,dtc,index_type);
-}
-
-XMLDocShredder::~XMLDocShredder()
-{
-       delete reader_;
-       reader_ = NULL;
-       delete tb;
-       tb = NULL;
-
-}
-
-
-void XMLDocShredder::processStartElement()
-{
-  doText();
-  // fetch element name; this will be the full qualified name
-  ustring name = reader_->get_name();
-  bool empty = false;
-  size_t found = name.find_first_of(':');
-  if (found == ustring::npos)  
-    tb->NewOpenTag(name);
-  else
-    tb->NewOpenTag(name.substr(found+1,name.length() - found - 1));
-  
-  /* We must be really carefull here. calling process attributes moves
-     the document pointer on the last attribute, hence calling reader_->is_empty
-     afterwards will yield the wrong result. It is better to call it while we are
-     on the element and generate a nodeFinished() call at the end */
-  empty = reader_->is_empty_element();
-  
-  
-  // now, process attributes
-  if (reader_->has_attributes())
-    processAttributes();
-  
-  
-  if (empty)
-    tb->NewClosingTag(name);
-  
-  
-}
-
-void XMLDocShredder::processEndElement()
-{
-  doText();
-  ustring name = reader_->get_name();
-  tb->NewClosingTag(name);
-}
-
-void XMLDocShredder::processPCDATA()
-{
-  // send the content of this PCDATA node to the storage interface as a text node
-  if (reader_->has_value())
-    buffer += reader_->get_value();
-
-}
-
-void XMLDocShredder::processAttributes()
-{
-       reader_->move_to_first_attribute();
-               
-       string nspaceStr = "xmlns";
-       tb->NewOpenTag(ATTRIBUTE_OPEN_TAG);
-       do
-         {
-               ustring name = reader_->get_name();
-               ustring value = reader_->get_value();
-               
-               /* the libxml++ TextReader treats the xmlns attribute like an ordinary attribute,
-               * so we have to extract it and build a namespace uri node out of it before
-               * passing to the storage interface */
-               
-               if ((name.find(nspaceStr.c_str(), 0, 5)) == 0)
-               {
-                 //TODO 
-               }
-               
-               /* otherwise, this is an ordinary attribute, so we construct a new child node of the 
-                * parent element to store the attribute name, possessing a child text node storing the 
-                * attribute value.  Then, we close off the attribute node with a call to nodeFinished()
-                */
-                
-               else
-               {
-                 string attname = "<@>"+name;
-                 tb->NewOpenTag(attname);
-                 tb->NewOpenTag(ATTRIBUTE_DATA_OPEN_TAG);
-                 tb->NewText(value);
-                 tb->NewClosingTag(ATTRIBUTE_DATA_OPEN_TAG);
-                 tb->NewClosingTag(attname);
-               }
-       }
-       while (reader_->move_to_next_attribute());
-       tb->NewClosingTag(ATTRIBUTE_OPEN_TAG);
-}
-
-void XMLDocShredder::processSignificantWhitespace()
-{
-  if (reader_->has_value())
-    buffer += reader_->get_value();
-
-}
-
-void XMLDocShredder::processStartDocument(const string docName)
-{
-  // tell storage interface to construct the document name
-
-  tb->NewOpenTag(DOCUMENT_OPEN_TAG);
-  
-}
-
-void XMLDocShredder::processEndDocument()
-{
-  doText();
-  /* tell the storage interface that document parsing has finished, and structures
-   * can now be written to disk. */
-  tb->NewClosingTag(DOCUMENT_OPEN_TAG);
-  tree = tb->CloseDocument();
-
-}
-
-void XMLDocShredder::processComment()
-{
-  //storageIfc_->newChild("!" + reader_->get_value());
-  //storageIfc_->nodeFinished();
-}
-
-void XMLDocShredder::processProcessingInstruction()
-{
-       ustring name = reader_->get_name();
-       ustring value = reader_->get_value();   
-       
-       /* Create a child node to store the target of the PI, append a text node to it to store 
-        * the PI data, send to the storage interface.  Close off the PI node with a call to
-        * nodeFinished
-        */
-       
-       // storageIfc_->newChild("?" + name);
-       // storageIfc_->newText(value);
-       // storageIfc_->nodeFinished();
-}
-
-void XMLDocShredder::processDocTypeDeclaration()
-{
-       /* We currently ignore the DOCTYPE declaration, but we'll provide this method skeleton 
-        * in case we do want to process it in the future.
-       */
-}
-
-void XMLDocShredder::processCDATASection()
-{
-       /* Currently, we don't preserve CDATA sections since they aren't part of the XPath data
-        * model.  Instead, we simply pass the converted text value to the storage interface as 
-        * a text node attached to the current context node.
-        */
-  if (reader_->has_value())
-    buffer+= reader_->get_value();
-}
-
-void XMLDocShredder::processUnknownNodeType()
-{
-       cout << "unknown token encountered during parsing" << endl;
-       throw xmlpp::parse_error("unknown token encountered during parsing");
-               
-}
-
-void XMLDocShredder::parse()
-{      
-       while (reader_->read() && (reader_->get_read_state() != TextReader::Error))
-       {
-               switch (reader_->get_node_type())
-               {
-                       case TextReader::Element:
-                               processStartElement();
-                               break;
-                               
-                       case TextReader::Text:
-                               processPCDATA();
-                               break;
-                               
-                       case TextReader::EndElement:
-                               processEndElement();
-                               break;
-                               
-                       case TextReader::SignificantWhitespace:
-                               processSignificantWhitespace();
-                               break;
-                               
-                       case TextReader::Comment:
-                               processComment();
-                               break;
-                       
-                       case TextReader::DocumentType:
-                               processDocTypeDeclaration();
-                               break;
-                               
-                       case TextReader::ProcessingInstruction:
-                               processProcessingInstruction();
-                               break;
-                       
-                       case TextReader::CDATA:
-                               processCDATASection();
-                               break;
-                       
-                       case TextReader::None:
-                               processUnknownNodeType();
-                               break;
-                               
-                       default:
-                               int type = reader_->get_node_type();
-                               cout << "  Node type: " << type << endl;
-                               break;  
-                       
-               }
-       }                       
-}
diff --git a/src/XMLDocShredder.h b/src/XMLDocShredder.h
deleted file mode 100644 (file)
index a12500d..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-/**************************************
- * XMLDocShredder.h
- * --------------------
- * Header file for the shredder routine that invokes the XML parser and 
- * calls the appropriate construction methods of the storage interface in
- * correspondence with received parsing events.
- * 
- * Author: Greg Leighton
- * Date: 02/11/08
- */
-
-#ifndef XMLDOCSHREDDER_H_
-#define XMLDOCSHREDDER_H_
-
-#include <libxml++/libxml++.h>
-#include <libxml++/parsers/textreader.h>
-#include <string>
-#include "XMLTree.h"
-#include "XMLTreeBuilder.h"
-
-using namespace std;
-using namespace xmlpp;
-
-
-class XMLDocShredder
-{
-  void doText();
-public:
-  XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc,
-                TextCollectionBuilder::index_type_t index_type
-                );
-  XMLDocShredder(const unsigned char * data,
-                TextReader::size_type size,
-                int sf, bool iet, bool dtc,
-                TextCollectionBuilder::index_type_t index_type
-                );
-  virtual ~XMLDocShredder();
-  virtual void processStartElement();
-  virtual void processEndElement();
-  virtual void processPCDATA();
-  virtual void processAttributes();
-  virtual void processSignificantWhitespace();
-  virtual void processStartDocument(const string docName);
-  virtual void processEndDocument();
-  virtual void processComment();
-  virtual void processProcessingInstruction();
-  virtual void processDocTypeDeclaration();
-  virtual void processUnknownNodeType();
-  virtual void processCDATASection();
-  virtual void parse();
-  
-  XMLTree * getXMLTree(){
-    return tree;
-  }
-
-       
- private:
-  XMLTreeBuilder * tb;
-  XMLTree * tree;
-  TextReader *reader_;
-  void setProperties();
-  bool last_text;
-  string buffer; 
-  // used to coalece successive text events
-  // which can occur if we discard pi and comment nodes.
-};
-
-#endif /*XMLDOCSHREDDER_H_*/
diff --git a/src/common_stub.cpp b/src/common_stub.cpp
new file mode 100644 (file)
index 0000000..6f56d33
--- /dev/null
@@ -0,0 +1,92 @@
+#include <unordered_map>
+#include <string>
+
+#include "common_stub.hpp"
+
+extern "C" {
+#include <sys/time.h>
+#include <sys/resource.h>
+}
+
+using std::string;
+using std::pair;
+using std::unordered_map;
+using std::make_pair;
+
+typedef unordered_map<string, pair<struct custom_operations*, size_t>> type_map_t;
+static type_map_t * type_map = 0;
+static value * cpp_exception = 0;
+
+static void init_error()
+{
+  caml_failwith("C++: initialization error");
+}
+
+static void init_exception()
+{
+  cpp_exception = caml_named_value("CPlusPlusError");
+  if (cpp_exception == 0)
+    init_error();
+}
+
+static void init_type_map()
+{
+  if (type_map == 0)
+    type_map = new type_map_t();
+  if (type_map == 0)
+    init_error();
+}
+
+void register_custom_(char* name,
+                     size_t size,
+                     void (*finalize)(value v))
+{
+  if (type_map == 0) init_error();
+  struct custom_operations * ops =
+    (struct custom_operations*) calloc(1, sizeof(struct custom_operations));
+  ops->identifier = name;
+  ops->finalize = finalize;
+  ops->compare = custom_compare_default;
+  ops->hash = custom_hash_default;
+  ops->serialize = custom_serialize_default;
+  ops->deserialize = custom_deserialize_default;
+  type_map->insert(make_pair(string(name), make_pair(ops, size)));
+}
+
+value alloc_custom_(char* name)
+{
+  CAMLparam0();
+  CAMLlocal1(result);
+  if (type_map == 0) init_error();
+  string key = string(name);
+  type_map_t::iterator it = type_map->find(key);
+  if (it == type_map->end())
+    result = Val_unit;
+  else
+    result = caml_alloc_custom(it->second.first, it->second.second, 1, 2);
+
+  CAMLreturn(result);
+}
+
+
+extern "C" value sxsi_cpp_init(value unit)
+{
+  struct rlimit rlim;
+  init_exception();
+  init_type_map();
+
+  /* Set the stack space to unlimited */
+  getrlimit(RLIMIT_STACK, &rlim);
+  if (rlim.rlim_max == RLIM_INFINITY && rlim.rlim_cur != RLIM_INFINITY) {
+    rlim.rlim_cur = RLIM_INFINITY;
+    setrlimit(RLIMIT_STACK, &rlim);
+  };
+
+  return Val_unit;
+}
+
+void sxsi_raise_msg(char * msg)
+{
+  if (cpp_exception == 0) init_error();
+  caml_raise_with_string(*cpp_exception, msg);
+}
diff --git a/src/common_stub.hpp b/src/common_stub.hpp
new file mode 100644 (file)
index 0000000..bfacdac
--- /dev/null
@@ -0,0 +1,58 @@
+#ifndef COMMON_STUB_H_
+#define COMMON_STUB_H_
+
+extern "C" {
+#define CAML_NAME_SPACE
+#include <stdlib.h>
+#include <caml/mlvalues.h>
+#include <caml/alloc.h>
+#include <caml/memory.h>
+#include <caml/callback.h>
+#include <caml/fail.h>
+#include <caml/custom.h>
+#include <caml/bigarray.h>
+#include <stdio.h>
+}
+#include <iostream>
+#include <typeinfo>
+
+void register_custom_(char* name,
+                     size_t size,
+                     void (*finalize)(value v));
+
+value alloc_custom_(char* name);
+
+
+template <class X> void sxsi_finalize_custom(value v)
+{
+  X * obj = * ((X **) Data_custom_val(v));
+  std::cerr << "Finalizing object: " << typeid(X).name() << std::endl;
+  std::cerr.flush();
+  delete obj;
+}
+
+template <class X> value sxsi_alloc_custom()
+{
+  char * name = const_cast<char*>(typeid(X).name());
+  value v = alloc_custom_(name);
+  if (v == Val_unit) {
+    register_custom_(name, sizeof(X*), sxsi_finalize_custom<X>);
+    v = alloc_custom_(name);
+  };
+  return v;
+}
+
+template <class X> X*& Obj_val(value v)
+{
+  //Cannot use Data_custom_val here, it is not a correct lvalue. :-(
+  return (X*&) Field(v,1);
+}
+
+void sxsi_raise_msg(char * msg);
+
+extern "C" {
+value sxsi_cpp_init(value unit);
+}
+
+
+#endif
index 02b7c35..5a8dd8b 100644 (file)
@@ -1,2 +1,2 @@
 OCamlDriver.o
-XMLDocShredder.o
+common_stub.o
index 97a850c..a8a3a96 100644 (file)
@@ -8,7 +8,7 @@ INCLUDE "debug.ml"
 INCLUDE "utils.ml"
 
 
-external init_lib : unit -> unit = "caml_init_lib"
+external init_lib : unit -> unit = "sxsi_cpp_init"
 
 exception CPlusPlusError of string
 
@@ -20,6 +20,99 @@ type node = [ `Tree ] Node.t
 
 type tree
 
+
+module TreeBuilder =
+struct
+  type t
+  external create : unit -> t = "caml_xml_tree_builder_create"
+  external open_document : t -> bool -> int -> bool -> int -> unit = "caml_xml_tree_builder_open_document"
+  external close_document : t -> tree = "caml_xml_tree_builder_close_document"
+  external open_tag : t -> string -> unit = "caml_xml_tree_builder_new_open_tag"
+  external close_tag : t -> string -> unit = "caml_xml_tree_builder_new_closing_tag"
+  external text : t -> string -> unit = "caml_xml_tree_builder_new_text"
+
+
+  let do_text b t =
+    if Buffer.length t > 0 then begin
+      open_tag b "<$>";
+      text b (Buffer.contents t);
+      close_tag b "<$>";
+      Buffer.clear t
+    end
+
+  let output_attr b name value =
+    let atname = "<@>" ^ name in
+    open_tag b atname;
+    open_tag b "<@$>";
+      text b value;
+      close_tag b "<@$>";
+      close_tag b atname
+
+  let start_element_handler b t tag attr_list =
+    do_text b t;
+    open_tag b tag;
+    match attr_list with
+       [] -> ()
+      | l ->
+       open_tag b "<@>";
+       List.iter (fun (name, value) -> output_attr b name value) l;
+         close_tag b "<@>"
+
+
+  let end_element_handler b t tag =
+    do_text b t;
+    close_tag b tag
+
+  let character_data_handler _b t text =
+    Buffer.add_string t text
+
+  let create_parser () =
+    let buf = Buffer.create 512 in
+    let build = create () in
+    let parser_ = Expat.parser_create ~encoding:None in
+    let finalize () =
+      do_text build buf;
+      close_tag build "";
+      close_document build
+    in
+    Expat.set_start_element_handler parser_ (start_element_handler build buf);
+    Expat.set_end_element_handler parser_ (end_element_handler build buf);
+    Expat.set_character_data_handler parser_ (character_data_handler build buf);
+    open_document build !Options.index_empty_texts !Options.sample_factor
+      !Options.disable_text_collection !Options.text_index_type;
+    open_tag build "";
+    parser_, finalize
+
+  let parse_string s =
+    let parser_, finalizer = create_parser () in
+    Expat.parse parser_ s;
+    finalizer ()
+
+  let parse_file file =
+    let in_chan = open_in file in
+    let buffer = String.create 4096 in
+    let parser_, finalizer = create_parser () in
+    let () =
+      try
+       while true do
+         let read = input in_chan buffer 0 4096 in
+         if read == 0 then raise End_of_file else
+            Expat.parse_sub parser_ buffer 0 read;
+         done
+
+      with
+       | End_of_file -> close_in in_chan
+       | e -> raise e
+    in
+      finalizer ()
+
+
+
+end
+
+
+
+
 type bit_vector = string
 
 external bool_of_int : int -> bool = "%identity"
@@ -41,8 +134,10 @@ type t = {
 
 
 
+(*
 external parse_xml_uri : string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_uri"
 external parse_xml_string :  string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_string"
+*)
 external tree_print_xml_fast3 : tree -> [`Tree ] Node.t -> Unix.file_descr -> unit = "caml_xml_tree_print"
 let print_xml t n fd =
   tree_print_xml_fast3 t.doc n fd
@@ -418,16 +513,9 @@ let node_of_t t  =
 
     }
 
-let parse f str =
-  node_of_t
-    (f str !Options.sample_factor
-       !Options.index_empty_texts
-       !Options.disable_text_collection
-       !Options.text_index_type
-    )
 
-let parse_xml_uri str = parse parse_xml_uri str
-let parse_xml_string str =  parse parse_xml_string str
+let parse_xml_uri str = node_of_t (TreeBuilder.parse_file str)
+let parse_xml_string str = node_of_t (TreeBuilder.parse_string str)
 
 let size t = tree_size t.doc;;
 
@@ -503,6 +591,7 @@ let load ?(sample=64) ?(load_text=true) str =
   let c, s, d, f = time ~msg:"Loading tag table"(load_table) () in
   ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
     let xml_tree = tree_load fd str load_text sample in
+    Printf.eprintf "Root is: %i\n" (Obj.magic (tree_root xml_tree));
     let () = Tag.init (Obj.magic xml_tree) in
     let attributes = Ptset.Int.add Tag.attribute d.(Tag.attribute) in
     let elements = Ptset.Int.add Tag.document_node