From: kim Date: Wed, 28 Jan 2009 02:42:48 +0000 (+0000) Subject: Added parsing of command line options to set sample factor, disabling storage X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;ds=sidebyside;h=eebef30070a951d852ce5811b289d8131a5300eb;p=SXSI%2Fxpathcomp.git Added parsing of command line options to set sample factor, disabling storage of empty text and disabling use of text collection git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/xpathcomp@82 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/Makefile b/Makefile index 63c78f5..75a3922 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ DEBUG=false PROFILE=true VERBOSE=false -MLSRCS = memory.ml tag.ml tagSet.ml tree.ml automaton.ml ulexer.ml xPath.ml main.ml -MLISRCS = memory.mli automaton.mli tag.mli tagSet.mli tree.mli ulexer.mli xPath.mli +MLSRCS = memory.ml tag.ml tagSet.ml options.ml tree.ml automaton.ml ulexer.ml xPath.ml main.ml +MLISRCS = memory.mli options.mli automaton.mli tag.mli tagSet.mli tree.mli ulexer.mli xPath.mli MLOBJS = $(MLSRCS:.ml=.cmx) MLCINT = $(MLISRCS:.mli=.cmi) diff --git a/OCamlDriver.cpp b/OCamlDriver.cpp index f6764fa..38d660b 100644 --- a/OCamlDriver.cpp +++ b/OCamlDriver.cpp @@ -15,6 +15,9 @@ extern "C" { #include #include #include +#include + + } //extern C //#include "TextCollection/TextCollection.h" @@ -24,21 +27,42 @@ extern "C" { #define CAMLRAISECPP(e) (caml_failwith( ((e).what()))) #define NOT_IMPLEMENTED(s) (caml_failwith(s)) -#define XMLTREE(x) ((XMLTree *)(x)) +#define XMLTREE(x) ((XMLTree *)(* (XMLTree**) Data_custom_val(x))) #define TEXTCOLLECTION(x) #define TREENODEVAL(i) ((treeNode) (Int_val(i))) -extern "C" CAMLprim value caml_call_shredder_uri(value uri){ +extern "C" { + static struct custom_operations ops; + static bool initialized = false; +} +extern "C" void caml_xml_tree_finalize(value tree){ + delete XMLTREE(tree); + return; +} + +extern "C" void caml_init_ops () { + + if (initialized) + return; + ops.identifier = (char*) "XMLTree"; + ops.finalize = caml_xml_tree_finalize; + return; +} + +extern "C" CAMLprim value caml_call_shredder_uri(value uri,value sf, value iet, value dtc){ CAMLparam1(uri); CAMLlocal1(doc); char *fn = String_val(uri); try { - XMLDocShredder shredder(fn); + XMLDocShredder shredder(fn,Int_val(sf),Bool_val(iet),Bool_val(dtc)); + XMLTree * tree; shredder.processStartDocument(fn); shredder.parse(); shredder.processEndDocument(); - doc = (value) shredder.storageIfc_->returnDocument(); - + caml_init_ops(); + doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2); + tree = (XMLTree *) shredder.storageIfc_->returnDocument(); + memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*)); CAMLreturn(doc); } catch (const std::exception& e){ @@ -47,19 +71,22 @@ extern "C" CAMLprim value caml_call_shredder_uri(value uri){ } -extern "C" CAMLprim value caml_call_shredder_string(value data){ +extern "C" CAMLprim value caml_call_shredder_string(value data,value sf, value iet, value dtc){ CAMLparam1(data); CAMLlocal1(doc); unsigned int ln = string_length(data); unsigned char *fn = (unsigned char*) String_val(data); try { - XMLDocShredder shredder(fn,ln); + XMLDocShredder shredder(fn,ln,Int_val(sf),Bool_val(iet),Bool_val(dtc)); + XMLTree* tree; shredder.processStartDocument(""); shredder.parse(); shredder.processEndDocument(); - doc = (value) shredder.storageIfc_->returnDocument(); - + caml_init_ops(); + doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2); + tree = (XMLTree *) shredder.storageIfc_->returnDocument(); + memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*)); CAMLreturn(doc); } catch (const std::exception& e) { @@ -216,6 +243,7 @@ extern "C" CAMLprim value caml_xml_tree_tag(value tree, value id){ CAMLreturn (caml_copy_string(tag)); } + extern "C" CAMLprim value caml_xml_tree_tag_name(value tree, value tagid){ CAMLparam2(tree,tagid); const char* tag; diff --git a/SXSIStorageInterface.cpp b/SXSIStorageInterface.cpp index e5bd7cf..98d15b7 100644 --- a/SXSIStorageInterface.cpp +++ b/SXSIStorageInterface.cpp @@ -12,10 +12,10 @@ #include "Utils.h" -SXSIStorageInterface::SXSIStorageInterface() +SXSIStorageInterface::SXSIStorageInterface(int sf,bool iet,bool dtc) { tree = new XMLTree(); - tree->OpenDocument(false,64); + tree->OpenDocument(iet,sf,dtc); } SXSIStorageInterface::~SXSIStorageInterface() @@ -42,7 +42,6 @@ void SXSIStorageInterface::newText(string text) tree->NewText((unsigned char*) text.c_str()); } } - void SXSIStorageInterface::nodeFinished(string name) diff --git a/SXSIStorageInterface.h b/SXSIStorageInterface.h index a627d3e..807fc79 100644 --- a/SXSIStorageInterface.h +++ b/SXSIStorageInterface.h @@ -19,7 +19,7 @@ using namespace std; class SXSIStorageInterface: public StorageInterface { public: - SXSIStorageInterface(); + SXSIStorageInterface(int sf, bool iet, bool dtc); virtual ~SXSIStorageInterface(); virtual void newChild(string name); virtual void newText(string text); diff --git a/XMLDocShredder.cpp b/XMLDocShredder.cpp index d2e4a75..c048d2e 100644 --- a/XMLDocShredder.cpp +++ b/XMLDocShredder.cpp @@ -53,21 +53,24 @@ void XMLDocShredder::setProperties(){ } XMLDocShredder::XMLDocShredder(const unsigned char * data, - TextReader::size_type size) + TextReader::size_type size, + int sf, + bool iet, + bool dtc) { last_text = false; reader_ = new TextReader(data,size,""); setProperties(); - storageIfc_ = new SXSIStorageInterface(); + storageIfc_ = new SXSIStorageInterface(sf,iet,dtc); buffer = ""; } -XMLDocShredder::XMLDocShredder(const string inFileName) +XMLDocShredder::XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc) { last_text = false; reader_ = new TextReader(inFileName); setProperties(); - storageIfc_ = new SXSIStorageInterface(); + storageIfc_ = new SXSIStorageInterface(sf,iet,dtc); buffer = ""; } diff --git a/XMLDocShredder.h b/XMLDocShredder.h index 437452e..58d4053 100644 --- a/XMLDocShredder.h +++ b/XMLDocShredder.h @@ -24,23 +24,23 @@ using namespace xmlpp; class XMLDocShredder { public: - XMLDocShredder(const string inFileName); - XMLDocShredder(const unsigned char * data, TextReader::size_type size); - virtual ~XMLDocShredder(); - virtual void processStartElement(); - virtual void processEndElement(); - virtual void processPCDATA(); - virtual void processAttributes(); - virtual void processSignificantWhitespace(); - virtual void processStartDocument(const string docName); - virtual void processEndDocument(); - virtual void processComment(); - virtual void processProcessingInstruction(); - virtual void processDocTypeDeclaration(); - virtual void processUnknownNodeType(); - virtual void processCDATASection(); - virtual void parse(); - + XMLDocShredder(const string inFileName,int sf, bool iet, bool dtc); + XMLDocShredder(const unsigned char * data, TextReader::size_type size,int sf, bool iet, bool dtc); + virtual ~XMLDocShredder(); + virtual void processStartElement(); + virtual void processEndElement(); + virtual void processPCDATA(); + virtual void processAttributes(); + virtual void processSignificantWhitespace(); + virtual void processStartDocument(const string docName); + virtual void processEndDocument(); + virtual void processComment(); + virtual void processProcessingInstruction(); + virtual void processDocTypeDeclaration(); + virtual void processUnknownNodeType(); + virtual void processCDATASection(); + virtual void parse(); + StorageInterface *storageIfc_; diff --git a/benchmark/main.ml b/benchmark/main.ml index 98a9519..7f6b3ad 100644 --- a/benchmark/main.ml +++ b/benchmark/main.ml @@ -83,19 +83,19 @@ module CONF : CONFIGURATION = struct let path = "." let result_basename = "test" - let num_runs = 5 - let run_with_output = true + let num_runs = 1 + let run_with_output = false let run_without_output = true end module I = INIT_TESTER (CONF) -module Test = MK (SXSI) (MK (SaxonBXQuery) (I)) - +module TestOld = MK (SXSI) (MK (SaxonBXQuery) (I)) +module Test = MK (SXSI) (I) let l = Test.test_engine [] (make_queryset ["/home/kim/Documents/Work/Code/xpathcomp/tests/tiny.xml"] - ["/descendant::*/descendant::*/descendant::*"]) + ["/child::*"]) ;; diff --git a/depend b/depend index ad5505d..fa95f37 100644 --- a/depend +++ b/depend @@ -4,17 +4,23 @@ tag.cmo: tag.cmi tag.cmx: tag.cmi tagSet.cmo: tag.cmi tagSet.cmi tagSet.cmx: tag.cmx tagSet.cmi -tree.cmo: tag.cmi tree.cmi -tree.cmx: tag.cmx tree.cmi +options.cmo: options.cmi +options.cmx: options.cmi +tree.cmo: tag.cmi options.cmi tree.cmi +tree.cmx: tag.cmx options.cmx tree.cmi automaton.cmo: tree.cmi tagSet.cmi tag.cmi automaton.cmi automaton.cmx: tree.cmx tagSet.cmx tag.cmx automaton.cmi ulexer.cmo: ulexer.cmi ulexer.cmx: ulexer.cmi xPath.cmo: ulexer.cmi tree.cmi tagSet.cmi tag.cmi automaton.cmi xPath.cmi xPath.cmx: ulexer.cmx tree.cmx tagSet.cmx tag.cmx automaton.cmx xPath.cmi -main.cmo: xPath.cmi ulexer.cmi tree.cmi tag.cmi automaton.cmi -main.cmx: xPath.cmx ulexer.cmx tree.cmx tag.cmx automaton.cmx +main.cmo: xPath.cmi ulexer.cmi tree.cmi tag.cmi options.cmi automaton.cmi +main.cmx: xPath.cmx ulexer.cmx tree.cmx tag.cmx options.cmx automaton.cmx +memory.cmi: +options.cmi: automaton.cmi: tree.cmi tagSet.cmi +tag.cmi: tagSet.cmi: tag.cmi tree.cmi: tag.cmi +ulexer.cmi: xPath.cmi: tagSet.cmi automaton.cmi diff --git a/main.ml b/main.ml index ba93328..b342388 100644 --- a/main.ml +++ b/main.ml @@ -61,17 +61,11 @@ let main filename query output = Printf.eprintf "Total time : %fms\n Coherence : %i\n%!" (total_time()) ;; -let argc = Array.length Sys.argv;; -if (argc < 3 || argc >4) -then - (prerr_endline ("usage : " ^ Sys.argv.(0) ^ " \'query\'[ ]"); - exit 1) -;; +Options.parse_cmdline();; -main Sys.argv.(1) Sys.argv.(2) (if argc == 4 then Some Sys.argv.(3) else None) ;; +main !Options.input_file !Options.query !Options.output_file;; Printf.eprintf "\n=================================================\nDEBUGGING\n%!"; Tree.DEBUGTREE.print_stats Format.err_formatter;; - - +Gc.full_major() diff --git a/options.ml b/options.ml new file mode 100644 index 0000000..6d6dc86 --- /dev/null +++ b/options.ml @@ -0,0 +1,25 @@ +let index_empty_texts = ref false +let sample_factor = ref 64 +let disable_text_collection = ref false + +let query = ref "" +let input_file = ref "" +let output_file = ref None + + +let usage_msg = Printf.sprintf "%s 'query' [output]" Sys.argv.(0) + +let anon_fun = let pos = ref 0 in + fun s -> match !pos with + | 0 -> input_file:= s;incr pos + | 1 -> query := s; incr pos + | 2 -> output_file := Some s; incr pos + | _ -> raise (Arg.Bad(s)) + +let spec = [ "-f", Arg.Set_int(sample_factor),"sample factor [default=64]"; + "-i", Arg.Set(index_empty_texts),"index empty texts [default=false]"; + "-d", Arg.Set(disable_text_collection),"Disable text collection[default=false]"; ] + +let parse_cmdline() = Arg.parse spec anon_fun usage_msg + + diff --git a/options.mli b/options.mli new file mode 100644 index 0000000..aabd1fd --- /dev/null +++ b/options.mli @@ -0,0 +1,9 @@ +val parse_cmdline : unit -> unit +val index_empty_texts : bool ref +val sample_factor : int ref +val disable_text_collection : bool ref +val query : string ref +val input_file : string ref +val output_file : string option ref + + diff --git a/tag.ml b/tag.ml index e5c3820..057f09d 100644 --- a/tag.ml +++ b/tag.ml @@ -16,24 +16,29 @@ external register_tag : pool -> string -> t = "caml_xml_tree_register_tag" external tag_name : pool -> t -> string = "caml_xml_tree_tag_name" let nullt = null_tag () -let pcdata = max_int -let attribute = max_int - 1 +(* Defined in XMLTree.cpp *) +let pcdata = 1 +let attribute = 0 -let pool = ref (null_pool ()) +let pool = Weak.create 1 -let init p = pool := p +let init p = Weak.set pool 0 (Some p) + +let get_pool () = match Weak.get pool 0 with + | Some x -> x + | None -> failwith "Tag.ml: Uninitialized Document" let tag s = match s with | "<$>" -> pcdata | "<@>" -> attribute - | _ -> register_tag !pool s + | _ -> register_tag (get_pool()) s let compare = (-) let equal = (==) let to_string t = if t = pcdata then "<$>" else if t = attribute then "<@>" - else tag_name !pool t + else tag_name (get_pool()) t let print ppf t = Format.fprintf ppf "%s" (to_string t) diff --git a/tree.ml b/tree.ml index 1b4ce2e..ecd8e3b 100644 --- a/tree.ml +++ b/tree.ml @@ -44,12 +44,11 @@ struct external int_of_node : 'a node -> int = "%identity" - external parse_xml_uri : string -> t = "caml_call_shredder_uri" - let parse_xml_uri uri = parse_xml_uri uri - - external parse_xml_string : string -> t = "caml_call_shredder_string" - let parse_xml_string uri = parse_xml_string uri + external parse_xml_uri : string -> int -> bool -> bool -> t = "caml_call_shredder_uri" + + external parse_xml_string : string -> int -> bool -> bool -> t = "caml_call_shredder_string" + module Text = struct @@ -178,8 +177,13 @@ struct node = Node(NC (root t)) } - let parse_xml_uri str = node_of_t (parse_xml_uri str) - let parse_xml_string str = node_of_t (parse_xml_string str) + let parse_xml_uri str = node_of_t + (parse_xml_uri str + !Options.sample_factor !Options.index_empty_texts !Options.disable_text_collection) + + let parse_xml_string str = node_of_t + (parse_xml_string str + !Options.sample_factor !Options.index_empty_texts !Options.disable_text_collection) external pool : doc -> Tag.pool = "%identity" @@ -372,6 +376,8 @@ struct aux (first_child n); aux (next_sibling n) in aux t + + let print_stats _ = () end end