From: kim Date: Thu, 29 Jan 2009 08:19:51 +0000 (+0000) Subject: Some more bugfixing for the contains. X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;h=7dea5fd8bedede27d4d601f85630a249bfab420b;p=SXSI%2Fxpathcomp.git Some more bugfixing for the contains. git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/xpathcomp@92 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/OCamlDriver.cpp b/OCamlDriver.cpp index 723f9d9..e3b5f44 100644 --- a/OCamlDriver.cpp +++ b/OCamlDriver.cpp @@ -125,9 +125,12 @@ extern "C" CAMLprim value caml_cpp_traversal(value tree){ } extern "C" CAMLprim value caml_text_collection_get_text(value tree, value id){ - CAMLparam2(tree,id); - const char* txt = (const char*) (XMLTREE(tree)->GetText((DocID) Int_val(id))); - CAMLreturn (caml_copy_string(txt)); + CAMLparam2(tree,id); + CAMLlocal1(str); + uchar* txt = XMLTREE(tree)->GetText((DocID) Int_val(id)); + str = caml_copy_string((const char*)txt); + delete (txt); + CAMLreturn (str); } extern "C" CAMLprim value caml_text_collection_empty_text(value tree,value id){ CAMLparam2(tree,id); @@ -153,7 +156,7 @@ extern "C" CAMLprim value caml_text_collection_contains(value tree,value str){ uchar * cstr = (uchar *) String_val(str); std::vector results; results = XMLTREE(tree)->Contains(cstr); - + //free(cstr); resarray = caml_alloc_tuple(results.size()); for (unsigned int i=0; iParentNode(TREENODEVAL(id)))); + CAMLreturn(Val_int (XMLTREE(tree)->ParentNode((DocID) Int_val(id)))); } extern "C" CAMLprim value caml_xml_tree_is_ancestor(value tree,value id1, value id2) { @@ -236,20 +239,14 @@ extern "C" CAMLprim value caml_xml_tree_node_xml_id(value tree, value id){ CAMLparam2(tree,id); CAMLreturn(Val_int((XMLTREE(tree)->NodeXMLId(TREENODEVAL(id))))); } -extern "C" CAMLprim value caml_xml_tree_tag(value tree, value id){ - CAMLparam2(tree,id); - const char* tag; - tag =(const char*) XMLTREE(tree)->GetTagName(XMLTREE(tree)->Tag(TREENODEVAL(id))); - - CAMLreturn (caml_copy_string(tag)); -} extern "C" CAMLprim value caml_xml_tree_tag_name(value tree, value tagid){ CAMLparam2(tree,tagid); - const char* tag; - tag = (const char*) XMLTREE(tree)->GetTagName((TagType) (Int_val(tagid))); - - CAMLreturn (caml_copy_string(tag)); + CAMLlocal1(str); + char* tag; + tag = (char*) XMLTREE(tree)->GetTagNameByRef((TagType) (Int_val(tagid))); + str = caml_copy_string((const char*) tag); + CAMLreturn (str); } diff --git a/automaton.ml b/automaton.ml index 27c8264..0c815a4 100644 --- a/automaton.ml +++ b/automaton.ml @@ -228,6 +228,7 @@ type t = { initial : SSet.t; (* Statistics *) mutable numbt : int; mutable max_states : int; + contains : (string,Tree.Binary.DocIdSet.t) Hashtbl.t; } let mk () = { initial = SSet.empty; @@ -237,7 +238,9 @@ let mk () = { initial = SSet.empty; ignore = SSet.empty; result = BST.empty; numbt = 0; - max_states = 0 + max_states = 0; + contains = Hashtbl.create 37; + };; let print_tags fmt l = @@ -304,11 +307,11 @@ struct let mem s x = SSet.mem x s - let rec accepting_among ?(strings=None)auto t r = + let rec accepting_among ?(nobrother=false) ?(strings=None) auto t r = if SSet.is_empty r then r else match strings with - | Some valid_strings when (Tree.Binary.DocIdSet.for_all (fun i -> - not (Tree.Binary.string_below t i)) valid_strings ) + | Some valid_strings when Tree.Binary.DocIdSet.for_all (fun i -> not(Tree.Binary.string_below t i) + ) valid_strings -> SSet.empty | _ -> ( @@ -322,7 +325,8 @@ struct match strings with | None -> SSet.inter r auto.final | Some valid_strings when (Tree.Binary.DocIdSet.mem id valid_strings) - -> SSet.inter r auto.final + -> Printf.eprintf "Selecting string '%s'\n%!" (Tree.Binary.get_string t id); + SSet.inter r auto.final | _ -> SSet.empty ) | Tree.Binary.Node(_) -> @@ -349,7 +353,8 @@ struct (fun x->SSet.mem (Transition.dest1 x) s1) Transition.dest2 transitions in - let s2 = accepting_among auto t2 r2 + let s2 = if nobrother then SSet.union (SSet.inter r auto.final) to_ignore + else accepting_among auto t2 r2 in let _,s = filter_map_rev (fun x -> SSet.mem (Transition.dest2 x) s2) @@ -364,7 +369,7 @@ struct let accept ?(strings=None) auto t = auto.result <- BST.empty; - if SSet.is_empty (accepting_among ~strings:strings auto t auto.initial) + if SSet.is_empty (accepting_among ~nobrother:true ~strings:strings auto t auto.initial) then false else true end diff --git a/automaton.mli b/automaton.mli index ddb90ce..af4cf56 100644 --- a/automaton.mli +++ b/automaton.mli @@ -54,13 +54,14 @@ type t = { initial : SSet.t; mutable result : BST.t; mutable numbt : int; mutable max_states : int; + contains : (string,Tree.Binary.DocIdSet.t) Hashtbl.t; } val mk : unit -> t val dump : Format.formatter -> t -> unit module BottomUp : sig - val accepting_among : ?strings:Tree.Binary.DocIdSet.t option -> + val accepting_among : ?nobrother:bool -> ?strings:Tree.Binary.DocIdSet.t option -> t -> Tree.Binary.t -> SSet.t -> SSet.t val accept : ?strings:Tree.Binary.DocIdSet.t option -> t -> Tree.Binary.t -> bool diff --git a/benchmark/config.ml b/benchmark/config.ml index 025b417..01aee85 100644 --- a/benchmark/config.ml +++ b/benchmark/config.ml @@ -1,8 +1,8 @@ (* semi-colon separated list of input documents *) -let documents = [ "../tests/base.xml" ] +let documents = [ "../tests/tiny.srx" ] (* semi-colon separated list of XPath queries *) -let queries = [ "/child::*"; "//*" ] +let queries = [ "/*" ] (* I is the initial configuration @@ -16,12 +16,12 @@ module CONF : CONFIGURATION = struct let path = "." let result_basename = "test" - let num_runs = 5 + let num_runs = 1 let run_with_output = true let run_without_output = true end module I = INIT_TESTER (CONF) -module TEST = MK (SXSI) (MK (SaxonBXQuery) (I)) +module TEST = MK (SXSI) (I) diff --git a/benchmark/main.ml b/benchmark/main.ml index 935528c..ae45a49 100644 --- a/benchmark/main.ml +++ b/benchmark/main.ml @@ -62,7 +62,7 @@ struct let reference = false let time_factor = 1.0 let mk_queryfile b doc q out = () - let mk_cmdline b qout qfile doc q = [ "-d"; doc; q ]@ (if b then [qout] else []) + let mk_cmdline b qout qfile doc q = [ doc; q ]@ (if b then [qout] else []) let parse_rules = [ ( ".*Parsing document :[ \\t]*\\([0-9]+\\.[0-9]*\\)ms.*", [ Input_parsing_time 1]); diff --git a/main.ml b/main.ml index c53edf2..b989329 100644 --- a/main.ml +++ b/main.ml @@ -42,8 +42,8 @@ let main v query output = let auto = time XPath.Compile.compile query in XPath.Ast.print Format.err_formatter query; Format.eprintf "\n%!"; - (* Format.eprintf "Internal rep of the tree is :\n%!"; - Tree.Binary.dump v; *) +(* Format.eprintf "Internal rep of the tree is :\n%!"; + Tree.Binary.dump v; *) Printf.eprintf "Execution time : "; time (fun v -> ignore (TopDown.accept auto v)) v; Printf.eprintf "Number of nodes in the result set : %i\n" (BST.cardinal auto.result); diff --git a/tree.ml b/tree.ml index 9cab2c7..6c3cc1b 100644 --- a/tree.ml +++ b/tree.ml @@ -26,11 +26,16 @@ sig val print_xml_fast : out_channel -> t -> unit val compare : t -> t -> int val equal : t -> t -> bool - module DocIdSet : Set.S with type elt = string_content + module DocIdSet : + sig + include Set.S + end + with type elt = string_content val string_below : t -> string_content -> bool val contains : t -> string -> DocIdSet.t val contains_old : t -> string -> bool val dump : t -> unit + val get_string : t -> string_content -> string end module XML = @@ -56,7 +61,8 @@ struct module Text = struct - + let equal : [`Text] node -> [`Text] node -> bool = equal + (* Todo *) external nullt : unit -> [`Text ] node = "caml_xml_tree_nullt" let nil = nullt () @@ -79,7 +85,7 @@ struct module Tree = struct - + let equal : [`Tree ] node -> [`Tree] node -> bool = equal external serialize : t -> string -> unit = "caml_xml_tree_serialize" external unserialize : string -> t = "caml_xml_tree_unserialize" @@ -120,7 +126,7 @@ struct then Printf.eprintf "#\n" else begin - Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s)\n%!" + Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) parent_doc(my_text)=%i\n%!" (int_of_node id) (Tag.to_string (tag_id t id)) (node_xml_id t id) @@ -129,7 +135,9 @@ struct (int_of_node (my_text t id)) (Text.get_text t (my_text t id)) (int_of_node (next_text t id)) - (Text.get_text t (next_text t id)); + (Text.get_text t (next_text t id)) + (int_of_node(parent_doc t (my_text t id))); + aux(first_child t id); aux(next_sibling t id); end @@ -172,10 +180,12 @@ struct node : descr } let dump { doc=t } = Tree.print_skel t - module DocIdSet = Set.Make (struct type t = string_content - let compare = (-) end) - - + module DocIdSet = struct + include Set.Make (struct type t = string_content + let compare = (-) end) + + end + let get_string t (i:string_content) = Text.get_text t.doc i open Tree let node_of_t t = { doc= t; node = Node(NC (root t)) } @@ -296,11 +306,13 @@ struct | _ -> () *) let string_below t id = - let pid = parent_doc t.doc id in + let strid = parent_doc t.doc id in match t.node with - | Node(NC(i)) -> (is_ancestor t.doc i pid) - | Node(SC(i,_)) -> (is_ancestor t.doc (parent_doc t.doc i) pid) + | Node(NC(i)) -> + (Tree.equal i strid) || (is_ancestor t.doc i strid) + | Node(SC(i,_)) -> Text.equal i id | _ -> false + let contains t s = Array.fold_left (fun a i -> DocIdSet.add i a) DocIdSet.empty (Text.contains t.doc s) @@ -324,7 +336,9 @@ struct let rec loop ?(print_right=true) t = match t.node with | Nil -> () | String (s) -> output_string outc (string t) - | Node _ when Tag.equal (tag t) Tag.pcdata -> loop (left t); loop (right t) + | Node _ when Tag.equal (tag t) Tag.pcdata -> + loop (left t); + if print_right then loop (right t) | Node (_) -> let tg = Tag.to_string (tag t) in diff --git a/tree.mli b/tree.mli index ad244e4..eb13139 100644 --- a/tree.mli +++ b/tree.mli @@ -25,11 +25,15 @@ sig val print_xml_fast : out_channel -> t -> unit val compare : t -> t -> int val equal : t -> t -> bool - module DocIdSet : Set.S with type elt = string_content + module DocIdSet : + sig + include Set.S + end with type elt = string_content val string_below : t -> string_content -> bool val contains : t -> string -> DocIdSet.t val contains_old : t -> string -> bool val dump : t -> unit + val get_string : t -> string_content -> string end module Binary : BINARY diff --git a/xPath.ml b/xPath.ml index 6c88d9a..6dfde87 100644 --- a/xPath.ml +++ b/xPath.ml @@ -215,6 +215,8 @@ module Functions = struct let text t = Tree.Binary.string (Tree.Binary.left t) + + let rec eval_expr tree (e:expr) : value = match e with | `Call (f,args) -> (List.assoc f globals) (List.map (eval_expr tree) args) | `Auto(a) -> `NodeSet(ignore (Automaton.BottomUp.accept a tree); @@ -223,8 +225,17 @@ module Functions = struct begin match args with [ `Auto(a); `String(s) ] -> - let docs = Tree.Binary.contains tree s - in + let docs = try + Hashtbl.find a.Automaton.contains s + with + | Not_found -> + let r = Tree.Binary.contains tree s + in + (* Tree.Binary.DocIdSet.iter (fun id -> + Printf.eprintf "%s matches %s\n%!" (Tree.Binary.get_string tree id) s) r; *) + + Hashtbl.add a.Automaton.contains s r;r + in let _ = Automaton.BottomUp.accept ~strings:(Some docs) a tree in `NodeSet(a.Automaton.result) | _ -> failwith "contains invalid" @@ -303,13 +314,16 @@ module Compile = struct let rec map_dir (d,acc) = function | [] -> acc | s::r -> map_dir ((dir s),(s,d)::acc) r - in let l = match p with - | Absolute p | Relative p -> map_dir (Final,[]) p - | AbsoluteDoS p -> - let l = (map_dir (Final,[]) p) - in ((DescendantOrSelf,TagSet.Xml.node,Expr True),dir (fst(List.hd l)))::l - in ((Child,TagSet.Xml.node,Expr True),dir (fst(List.hd l)))::l - + in + let l = + match p with + | Absolute p + | Relative p -> map_dir (Final,[]) p + | AbsoluteDoS p -> + let l = (map_dir (Final,[]) p) + in ((DescendantOrSelf,TagSet.Xml.node,Expr True),dir (fst(List.hd l)))::l + in ((Child,TagSet.Xml.node,Expr True),dir (fst(List.hd l)))::l + let rec compile_step q dir trs final initial ignore (axis,test,pred) = let q' = State.mk() in