DEBUG=false
PROFILE=true
+VERBOSE=false
MLSRCS = memory.ml tag.ml tagSet.ml tree.ml automaton.ml ulexer.ml xPath.ml main.ml
MLISRCS = memory.mli automaton.mli tag.mli tagSet.mli tree.mli ulexer.mli xPath.mli
-IXMLTree/libcds/includes \
-IXMLTree/TextCollection
-CXXFLAGS = -O3 -Wall $(INCLUDEDIRS) -fPIC -std=c++0x
+CXXFLAGS = -O3 -Wall $(INCLUDEDIRS) -fPIC
+ifeq ($(VERBOSE),true)
+HIDE=
+else
+HIDE=@
+endif
ifeq ($(DEBUG), true)
CXX = g++ -DDEBUG
SYNT_DEBUG = -ppopt -DDEBUG
else
CXX = g++
-OCAMLOPT = ocamlopt -cc "$(CXX)" -noassert -inline 10000
+OCAMLOPT = ocamlopt -cc "$(CXX)" -noassert -inline 100
endif
ifeq ($(PROFILE), true)
LIBS=-lxml2 -lxml++-2.6 -lglibmm-2.4 -lgobject-2.0 -lglib-2.0 -lsigc-2.0
-all: version libcamlshredder.a $(MLOBJS)
+all: main
- $(OCAMLFIND) $(LINK) -o main -package "$(OCAMLPACKAGES)" $(SYNTAX) -cclib \
+main: libcamlshredder.a $(MLOBJS)
+ @echo [LINK] $@
+ $(HIDE) $(OCAMLFIND) $(LINK) -o main -package "$(OCAMLPACKAGES)" $(SYNTAX) -cclib \
"$(LIBS) ./libcamlshredder.a" $(MLOBJS)
.SUFFIXES: .ml .mli .cmx .cmi .cpp
.PHONY:compute_depend version
.cpp.o:
- $(CXX) $(CXXINCLUDES) -c $(CXXFLAGS) $<
+ @echo [CPP] $@
+ $(HIDE) $(CXX) $(CXXINCLUDES) -c $(CXXFLAGS) $<
.ml.cmx:
- $(OCAMLFIND) $(OCAMLOPT) -package "$(OCAMLPACKAGES)" $(SYNTAX) -c $<
+ @echo [OCAMLOPT] $@
+ $(HIDE) $(OCAMLFIND) $(OCAMLOPT) -package "$(OCAMLPACKAGES)" $(SYNTAX) -c $<
.mli.cmi:
- $(OCAMLFIND) $(OCAMLOPT) -package "$(OCAMLPACKAGES)" $(SYNTAX) -c $<
+ @echo [OCAMLOPT] $@
+ $(HIDE) $(OCAMLFIND) $(OCAMLOPT) -package "$(OCAMLPACKAGES)" $(SYNTAX) -c $<
libcamlshredder.a: $(CXXOBJECTS) XMLTree/XMLTree.a
- mkdir -p .libs/
- cd .libs/ && ar x ../XMLTree/XMLTree.a
- $(OCAMLMKLIB) -o camlshredder -custom $(CXXOBJECTS) ./.libs/*.o $(LIBS)
- rm -rf .libs
+ @echo [LIB] $@
+ $(HIDE) mkdir -p .libs/
+ $(HIDE) cd .libs/ && ar x ../XMLTree/XMLTree.a
+ $(HIDE) $(OCAMLMKLIB) -o camlshredder -custom $(CXXOBJECTS) ./.libs/*.o $(LIBS)
+ $(HIDE) rm -rf .libs
clean:
- rm -f *~ *.cm* *.[oa] *.so main .libs
+ @echo [CLEAN]
+ $(HIDE) rm -f *~ *.cm* *.[oa] *.so main .libs
testSuccint: $(CXXOBJECTS) XMLTree/XMLTree.a
SXSIStorageInterface.o: SXSIStorageInterface.h SXSIStorageInterface.cpp StorageInterface.h
StorageInterface.o: StorageInterface.h
-XMLDocShredder.o: XMLDocShredder.h XMLDocShredder.cpp OCamlStorageInterface.h StorageInterface.h
+XMLDocShredder.o: XMLDocShredder.h XMLDocShredder.cpp SXSIStorageInterface.h StorageInterface.h
OCamlDriver.o: XMLDocShredder.h StorageInterface.h
compute_depend:
- $(OCAMLFIND) $(OCAMLDEP) -package "$(OCAMLPACKAGES)" $(SYNTAX) $(MLSRCS) $(MLISRCS) >depend
-
-
+ @echo [DEP]
+ $(HIDE) $(OCAMLFIND) $(OCAMLDEP) -package "$(OCAMLPACKAGES)" $(SYNTAX) $(MLSRCS) $(MLISRCS) >depend
include depend
#include <caml/fail.h>
} //extern C
-#include "TextCollection/TextCollection.h"
+//#include "TextCollection/TextCollection.h"
#include "XMLDocShredder.h"
#include "XMLTree.h"
#include "Utils.h"
DocID tid;
if (id == NULLT)
return;
- int tag = tree->Tag(id);
+ // int tag = tree->Tag(id);
if (id) {
tid = tree->PrevText(id);
char * data = (char *) (tree->getTextCollection())->GetText(tid);
CAMLreturn(Val_unit);
}
-
-extern "C" CAMLprim value caml_text_collection_get_text(value tc, value id){
- CAMLparam2(tc,id);
-
- const char* txt = (const char*) ((TextCollection*) tc)->GetText((DocID) Int_val(id));
+extern "C" CAMLprim value caml_text_collection_get_text(value tree, value id){
+ CAMLparam2(tree,id);
+ const char* txt = (const char*) (XMLTREE(tree)->GetText((DocID) Int_val(id)));
CAMLreturn (caml_copy_string(txt));
}
-extern "C" CAMLprim value caml_text_collection_empty_text(value tc,value id){
- CAMLparam2(tc,id);
- CAMLreturn ( Val_int(((TextCollection*) tc)->EmptyText((DocID) Int_val(id))));
+extern "C" CAMLprim value caml_text_collection_empty_text(value tree,value id){
+ CAMLparam2(tree,id);
+ CAMLreturn ( Val_int((XMLTREE(tree))->EmptyText((DocID) Int_val(id))));
}
-extern "C" CAMLprim value caml_text_collection_is_contains(value tc,value str){
- CAMLparam2(tc,str);
+extern "C" CAMLprim value caml_text_collection_is_contains(value tree,value str){
+ CAMLparam2(tree,str);
uchar * cstr = (uchar *) String_val(str);
- CAMLreturn ( Val_bool((int) ((TextCollection*) tc)->IsContains(cstr)));
+ CAMLreturn ( Val_bool((int) XMLTREE(tree)->IsContains(cstr)));
}
-extern "C" CAMLprim value caml_text_collection_count_contains(value tc,value str){
- CAMLparam2(tc,str);
+extern "C" CAMLprim value caml_text_collection_count_contains(value tree,value str){
+ CAMLparam2(tree,str);
uchar * cstr = (uchar *) String_val(str);
- CAMLreturn ( Val_int(((TextCollection*) tc)->CountContains(cstr)));
+ CAMLreturn (Val_int((XMLTREE(tree)->CountContains(cstr))));
}
-extern "C" CAMLprim value caml_text_collection_contains(value tc,value str){
- CAMLparam2(tc,str);
+extern "C" CAMLprim value caml_text_collection_contains(value tree,value str){
+ CAMLparam2(tree,str);
CAMLlocal1(resarray);
uchar * cstr = (uchar *) String_val(str);
std::vector<DocID> results;
- results = ((TextCollection*) tc)->Contains(cstr);
+ results = XMLTREE(tree)->Contains(cstr);
resarray = caml_alloc_tuple(results.size());
- for (int i=0; i<results.size();i++){
+ for (unsigned int i=0; i<results.size();i++){
caml_initialize(&Field(resarray,i),Val_int(results[i]));
};
CAMLreturn (resarray);
CAMLreturn (caml_copy_string(tag));
}
extern "C" CAMLprim value caml_xml_tree_tag_id(value tree,value id){
- CAMLparam2(tree,id);
-
- int tag =XMLTREE(tree)->Tag(TREENODEVAL(id));
-
- CAMLreturn (Val_unit);
+ CAMLparam2(tree,id);
+ CAMLreturn (Val_int(XMLTREE(tree)->Tag(TREENODEVAL(id))));
}
extern "C" CAMLprim value caml_xml_tree_nullt(value unit){
CAMLparam1(unit);
+++ /dev/null
-/**************************************
- * OCamlDriver.cpp
- * -------------------
- * The Ocaml Driver which calls the C++ methods and
- * adds a C wrapper interface with OCaml code.
- *
- * Author: Kim Nguyen
- * Date: 19/11/08
- */
-
-/* OCaml memory managment */
-extern "C" {
-#include <caml/mlvalues.h>
-#include <caml/alloc.h>
-#include <caml/memory.h>
-#include <caml/callback.h>
-#include <caml/fail.h>
-} //extern C
-
-#include "XMLDocShredder.h"
-
+++ /dev/null
-/*******************************************
- * OCamlStorageInterface.cpp
- * ------------------------
- *
- *
- * Author: Kim Nguyen
- * Date: 04/11/08
- */
-
-
-#include "OCamlStorageInterface.h"
-#include "Utils.h"
-
-/* see caml/mlvalues.h
- */
-/* tags */
-#define NODE 0
-#define STRING 1
-
-/* fields */
-#define PCDATA 0
-#define ID 0
-#define TAG 1
-#define LEFT 2
-#define RIGHT 3
-#define FATHER 4
-
-#define NIL (Val_unit)
-
-/* The OCaml function which computes the hash value of a tag */
-static value *caml_hash_tag = NULL;
-
-OCamlStorageInterface::OCamlStorageInterface()
-{
- CAMLparam0();
- CAMLlocal4(node,nodeptr,dummytag,father);
- nodeid=1;
- if (caml_hash_tag == NULL) {
- /* First time around, look up by name */
- caml_hash_tag = caml_named_value("caml_hash_tag");
-
- }
-
- dummytag = caml_callback(*caml_hash_tag, caml_copy_string(""));
-
- // Atomic block, initialize must be called for every field
- // Before any other allocation takes place. In particular,
- // One should NOT place the call to caml_callback as an argument to
- // caml_initialize but rather store its result in a variable.
- father = caml_alloc_shr(1,0);
- caml_initialize(&Field(father,0),NIL);
-
- node = caml_alloc_shr(5,0);
- caml_initialize(&Field(node,ID),Val_int(nodeid++));
- caml_initialize(&Field(node,TAG),dummytag);
- caml_initialize(&Field(node,LEFT),NIL);
- caml_initialize(&Field(node,RIGHT),NIL);
- caml_initialize(&Field(node,FATHER),father);
-
- nodeptr = caml_alloc_shr(1,LEFT);
- caml_initialize(&Field(nodeptr,0),node);
-
- stack.push_front(nodeptr);
- caml_register_global_root(&stack.front());
-
- CAMLreturn0;
-}
-
-OCamlStorageInterface::~OCamlStorageInterface()
-{
- caml_remove_global_root(&stack.back());
-}
-
-void OCamlStorageInterface::newChild(string name)
-{
- CAMLparam0();
- CAMLlocal5(nnode,onode,tag,taghash,id);
- CAMLlocal3(father,nnodeptr,onodeptr);
- DPRINT("newChild " << name <<"\n")
- onode = stack.front();
- /* Allocate the new Node(tag,l,r) */
-
- /* Compute the new tag hash and store it in the new block */
- tag = caml_copy_string(name.c_str());
- taghash = caml_callback(*caml_hash_tag, tag);
- id = Val_int(nodeid++);
-
- /* Again, initialization must be atomic */
- father=caml_alloc_shr(1,0);
- caml_initialize(&Field(father,0),onode);
-
- nnode = caml_alloc_shr(5,0);
- caml_initialize(&Field(nnode,ID),id);
- caml_initialize(&Field(nnode,TAG),taghash);
- caml_initialize(&Field(nnode,LEFT),NIL);
- caml_initialize(&Field(nnode,RIGHT),NIL);
- caml_initialize(&Field(nnode,FATHER),father);
-
- nnodeptr = caml_alloc_shr(1,LEFT);
- caml_initialize(&Field(nnodeptr,0),nnode);
-
- switch (Tag_val(onode)){
- case LEFT:
- caml_modify(&Field(Field(onode,0),LEFT),nnodeptr);
- Tag_val(onode) = RIGHT;
- break;
-
- case RIGHT:
- caml_modify(&Field(Field(onode,0),RIGHT),nnodeptr);
- Tag_val(onode) = NODE;
- break;
- };
-
- stack.push_front(nnodeptr);
-
- CAMLreturn0;
-}
-
-
-
-void OCamlStorageInterface::newText(string text)
-{
- CAMLparam0();
- CAMLlocal3(pcdata,snode,node);
- DPRINT("newText " << text <<"\n")
-
- pcdata = caml_copy_string(text.c_str());
- snode = caml_alloc_shr(1,STRING);
- caml_initialize(&(Field(snode,PCDATA)),pcdata);
- node = stack.front();
-
- switch (Tag_val(node)){
- case LEFT:
- caml_modify(&Field(Field(node,0),LEFT),snode);
- Tag_val(node) = RIGHT;
- break;
- case RIGHT:
- caml_modify(&Field(Field(node,0),RIGHT),snode);
- Tag_val(node) = NODE;
- break;
-
-
- };
-
- CAMLreturn0;
-}
-
-
-
-void OCamlStorageInterface::nodeFinished()
-{
-
- CAMLparam0();
- CAMLlocal1(node);
- DPRINT("nodeFinished\n")
- node = stack.front();
-
- switch (Tag_val(node)){
- case LEFT:
- DPRINT("Tagged left\n")
- Tag_val(node) = RIGHT;
- break;
-
- case RIGHT:
- DPRINT("Tagged right\n")
- Tag_val(node) = NODE;
-
- case NODE:
- DPRINT("Under NODE\n");
- while (Tag_val(node) == NODE){
- stack.pop_front();
- node = stack.front();
- };
- break;
- };
-
- CAMLreturn0;
-}
-
- void OCamlStorageInterface::parsingFinished()
-{
- CAMLparam0();
- CAMLlocal1(rnode);
- caml_register_global_root(&document);
- document = stack.front();
-
- stack.pop_front(); /* removes the root */
- rnode = stack.front();
-
- DPRINT ("Stack size is "<< stack.size() <<"\n")
- /* reinitializes the stack */
- caml_modify(&Field(Field(rnode,0),ID),Val_int(nodeid=1));
- caml_modify(&Field(Field(rnode,0),LEFT),NIL);
- caml_modify(&Field(Field(rnode,0),RIGHT),NIL);
- caml_modify(&Field(Field(Field(rnode,0),FATHER),0),NIL);
- Tag_val(rnode) = LEFT;
-
-
- caml_modify(&Field(Field(Field(document,0),FATHER),0),NIL);
- Tag_val(document) = NODE;
-
- CAMLreturn0;
-
-
-}
-value OCamlStorageInterface::getDocument (){
- CAMLparam0();
- CAMLlocal1(doc);
- doc = document;
- caml_remove_global_root(&document);
- CAMLreturn(doc);
-}
-void *OCamlStorageInterface::returnDocument(){
-
- return ((void *) getDocument());
-
-}
+++ /dev/null
-/*****************************************
- * OCamlStorageInterface.h
- * ------------------------
- * Header file for an OCaml Storage Interface
- *
- * Author: Kim Nguyen
- * Date: 04/11/08
- */
-
-#ifndef OCAMLSTORAGEINTERFACE_H_
-#define OCAMLSTORAGEINTERFACE_H_
-
-#include "StorageInterface.h"
-extern "C" {
-/* OCaml memory managment */
-
-#include <caml/mlvalues.h>
-#include <caml/alloc.h>
-#include <caml/memory.h>
-#include <caml/callback.h>
-
-}// extern C
-
-#include <list>
-
-using namespace std;
-
-class OCamlStorageInterface: public StorageInterface
-{
- public:
- OCamlStorageInterface();
- virtual ~OCamlStorageInterface();
- virtual void newChild(string name);
- virtual void newText(string text);
- virtual void nodeFinished();
- virtual void parsingFinished();
- virtual void* returnDocument();
-
- private:
- value getDocument();
- list<value> stack;
- value document;
- int nodeid;
-};
-
-
-#endif /*OCAMLSTORAGEINTERFACE_H_*/
-
#include <libxml++/libxml++.h>
#include <libxml++/parsers/textreader.h>
#include <string>
-#include <unordered_map>
#include "StorageInterface.h"
using namespace std;
using namespace xmlpp;
-/* For Hashmap. Seems fairly well supported */
-using namespace __gnu_cxx;
-typedef pair<int,string> cons_str;
-typedef pair<string,int> cons_int;
class XMLDocShredder
{
let pcdata = T.pcdata
external tag : string -> t = "%identity"
external clear_pool : unit -> unit = "%identity"
+ let init _ = ()
let compare = String.compare
let equal = (=)
let print fmt s = Format.fprintf fmt "%s" s
val attribute : t
val pcdata : t
val tag : string -> t
+val init : string array -> unit
val clear_pool : unit -> unit
val compare : t -> t -> int
val equal : t -> t -> bool
module Text =
struct
- type t (* pointer to the text collection *)
+
(* Todo *)
external nullt : unit -> [`Text ] node = "caml_xml_tree_nullt"
let nil = nullt ()
external tag : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag"
external tag_id : t -> [`Tree ] node -> unit = "caml_xml_tree_tag_id"
- external text_collection : t -> Text.t = "caml_xml_tree_text_collection"
-
let is_last t n = equal nil (next_sibling t n)
external prev_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text"
external is_ancestor : t -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor"
let print_skel t =
- let textcol = text_collection t in
let rec aux id =
if (is_nil id)
then Printf.eprintf "#\n"
(Tag.to_string (tag t id))
(node_xml_id t id)
(int_of_node (prev_text t id))
- (Text.get_text textcol (prev_text t id))
+ (Text.get_text t (prev_text t id))
(int_of_node (my_text t id))
- (Text.get_text textcol (my_text t id))
+ (Text.get_text t (my_text t id))
(int_of_node (next_text t id))
- (Text.get_text textcol (next_text t id));
+ (Text.get_text t (next_text t id));
aux(first_child t id);
aux(next_sibling t id);
end
aux (root t)
let traversal t =
- let textcol = text_collection t in
let rec aux id =
if not (is_nil id)
then
begin
(* ignore (tag t id);
- ignore (Text.get_text textcol (prev_text t id));
+ ignore (Text.get_text t (prev_text t id));
if (is_leaf t id)
- then ignore (Text.get_text textcol (my_text t id));
+ then ignore (Text.get_text t (my_text t id));
if (is_last t id)
- then ignore (Text.get_text textcol (next_text t id)); *)
+ then ignore (Text.get_text t (next_text t id)); *)
aux (first_child t id);
aux (next_sibling t id);
end
type doc = t
- type t = { doc : doc;
- text : Text.t;
+ type t = { doc : doc;
node : descr }
let dump { doc=t } = Tree.print_skel t
open Tree
let node_of_t t = { doc= t;
- text = text_collection t;
node = Node(NC (root t)) }
let equal a b = (compare a b) == 0
let string t = match t.node with
- | String i -> Text.get_text t.text i
+ | String i -> Text.get_text t.doc i
| _ -> assert false
let norm (n : [`Tree ] node ) = if is_nil n then Nil else Node (NC n)
match n.node with
| Node (NC t) when is_leaf n.doc t ->
let txt = my_text n.doc t in
- if Text.is_empty n.text txt
+ if Text.is_empty n.doc txt
then Nil
else Node(SC (txt,Tree.nil))
| Node (NC t) ->
let fs = first_child n.doc t in
let txt = prev_text n.doc fs in
- if Text.is_empty n.text txt
+ if Text.is_empty n.doc txt
then norm fs
else Node (SC (txt, fs))
| Node(SC (i,_)) -> String i
| Node(NC t) ->
let ns = next_sibling n.doc t in
let txt = next_text n.doc t in
- if Text.is_empty n.text txt
+ if Text.is_empty n.doc txt
then norm ns
else Node (SC (txt, ns))
| Nil | String _ -> failwith "next_sibling"
| _ -> false
let contains t s =
- Array.fold_left (fun a i -> DocIdSet.add i a) DocIdSet.empty (Text.contains t.text s)
+ Array.fold_left (fun a i -> DocIdSet.add i a) DocIdSet.empty (Text.contains t.doc s)
let contains_old t s =
let regexp = Str.regexp_string s in
let rec aux n =
match n.node with
| Nil -> ()
- | String i -> () (*ignore(Text.get_text t.text i) *)
+ | String i -> () (*ignore(Text.get_text t.doc i) *)
| Node(_) ->
(* tag_id n; *)
aux (first_child n);
match n.node with
| Node (NC t) when is_leaf_ n.doc t ->
let txt = my_text_ n.doc t in
- if is_empty_ n.text txt
+ if is_empty_ n.doc txt
then Nil
else Node(SC (txt,XML.Tree.nil))
| Node (NC t) ->
let fs = first_child_ n.doc t in
let txt = prev_text_ n.doc fs in
- if is_empty_ n.text txt
+ if is_empty_ n.doc txt
then norm fs
else Node (SC (txt, fs))
| Node(SC (i,_)) -> String i
| Node(NC t) ->
let ns = next_sibling_ n.doc t in
let txt = next_text_ n.doc t in
- if is_empty_ n.text txt
+ if is_empty_ n.doc txt
then norm ns
else Node (SC (txt, ns))
| Nil | String _ -> failwith "next_sibling"