X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=tree.ml;h=7ea6f0337765b432c01add6681acbdfa9adb7eda;hb=ac8c1ac563a2c089f789eed5a03ff5b84a2c4fe0;hp=b1b1d15cb084e050a85d8b11626c998d14ee2935;hpb=609094fe14ca90cd5417ee22de621f76d1d0ec94;p=SXSI%2Fxpathcomp.git diff --git a/tree.ml b/tree.ml index b1b1d15..7ea6f03 100644 --- a/tree.ml +++ b/tree.ml @@ -34,8 +34,8 @@ let equal_node : 'a node -> 'a node -> bool = (==) external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri" external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string" external tree_print_xml_fast3 : tree -> [`Tree ] node -> Unix.file_descr ->unit = "caml_xml_tree_print" -external tree_save : tree -> Unix.file_descr -> unit = "caml_xml_tree_save" -external tree_load : Unix.file_descr -> bool -> int -> tree = "caml_xml_tree_load" +external tree_save : tree -> Unix.file_descr -> string -> unit = "caml_xml_tree_save" +external tree_load : Unix.file_descr -> string -> bool -> int -> tree = "caml_xml_tree_load" external nullt : unit -> 'a node = "caml_xml_tree_nullt" @@ -132,9 +132,13 @@ external benchmark_jump : tree -> Tag.t -> unit = "caml_benchmark_jump" "noalloc let benchmark_jump t s = benchmark_jump t.doc s -external benchmark_fsns : tree -> unit = "caml_benchmark_fsns" "noalloc" +external benchmark_fcns : tree -> int = "caml_benchmark_fcns" "noalloc" -let benchmark_fsns t = benchmark_fsns t.doc +let benchmark_fcns t = benchmark_fcns t.doc + +external benchmark_lcps : tree -> unit = "caml_benchmark_lcps" "noalloc" + +let benchmark_lcps t = benchmark_lcps t.doc @@ -172,25 +176,40 @@ let text_size t = text_size t.doc module MemUnion = Hashtbl.Make (struct type t = Ptset.Int.t*Ptset.Int.t - let equal (x,y) (z,t) = x == z || y == t + let equal (x,y) (z,t) = x == z && y == t let equal a b = equal a b || equal b a let hash (x,y) = (* commutative hash *) - let x = Ptset.Int.uid x - and y = Ptset.Int.uid y + let x = Uid.to_int (Ptset.Int.uid x) + and y = Uid.to_int (Ptset.Int.uid y) in - if x <= y then HASHINT2(x,y) else HASHINT2(y,x) + if x <= y then HASHINT2(x,y) else HASHINT2(y,x) end) module MemAdd = Hashtbl.Make ( struct type t = Tag.t*Ptset.Int.t let equal (x,y) (z,t) = (x == z)&&(y == t) - let hash (x,y) = HASHINT2(x,Ptset.Int.uid y) + let hash (x,y) = HASHINT2(x,Uid.to_int (Ptset.Int.uid y)) end) +module MemUpdate = struct +include Hashtbl.Make ( + struct + type t = Tag.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t + let equal (a1,b1,c1,d1,e1) (a2,b2,c2,d2,e2) = a1==a2 && + b1 == b2 && c1 == c2 && d1 == d2 && e1 == e2 + let hash (a,b,c,d,e) = + HASHINT4(HASHINT2(a,Uid.to_int (Ptset.Int.uid b)), + Uid.to_int (Ptset.Int.uid c), + Uid.to_int (Ptset.Int.uid d), + Uid.to_int (Ptset.Int.uid e)) + end) + +end + let collect_tags tree = let _ = Printf.eprintf "Collecting Tags\n%!" in -(* let h_union = MemUnion.create BIG_H_SIZE in + let h_union = MemUnion.create BIG_H_SIZE in let pt_cup s1 s2 = try MemUnion.find h_union (s1,s2) @@ -205,9 +224,7 @@ let collect_tags tree = with | Not_found -> let r = Ptset.Int.add t s in MemAdd.add h_add (t,s) r;r - in *) - let pt_cup = Ptset.Int.union in - let pt_add = Ptset.Int.add in + in let h = Hashtbl.create BIG_H_SIZE in let update t sc sb ss sa = let schild,sbelow,ssibling,safter = @@ -234,9 +251,9 @@ let collect_tags tree = let _ = loop false (tree_root tree) Ptset.Int.empty in let _ = Printf.eprintf "Finished\n%!" in h - - - + + + let contains_array = ref [| |] let contains_index = Hashtbl.create 4096 @@ -404,7 +421,7 @@ let save t str = (* we need to move the fd to the correct position *) flush out_c; ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET); - tree_save t.doc fd; + tree_save t.doc fd str; close_out out_c ;; @@ -435,7 +452,7 @@ let load ?(sample=64) ?(load_text=true) str = let _ = Printf.eprintf "\nLoading tag table : " in let ntable = time (load_table) () in ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET); - let tree = { doc = tree_load fd load_text sample; + let tree = { doc = tree_load fd str load_text sample; ttable = ntable;} in close_in in_c; tree @@ -472,8 +489,8 @@ let is_binary_ancestor t n1 n2 = let parent t n = tree_parent t.doc n -let first_child t = (); fun n -> tree_first_child t.doc n -let first_element t = (); fun n -> tree_first_element t.doc n +let first_child t = let doc = t.doc in ();fun n -> tree_first_child doc n +let first_element t = let doc = t.doc in (); fun n -> tree_first_element doc n (* these function will be called in two times: first partial application on the tag, then application of the tag and the tree, then application of @@ -486,8 +503,8 @@ let select_child t = fun ts -> let v = ptset_to_vector ts in (); fun n -> tree_select_child t.doc n v -let next_sibling t = (); fun n -> tree_next_sibling t.doc n -let next_element t = (); fun n -> tree_next_element t.doc n +let next_sibling t = let doc = t.doc in (); fun n -> tree_next_sibling doc n +let next_element t = let doc = t.doc in (); fun n -> tree_next_element doc n let tagged_following_sibling t tag = (); fun n -> tree_tagged_following_sibling t.doc n tag @@ -496,7 +513,8 @@ let select_following_sibling t = fun ts -> fun n -> tree_select_following_sibling t.doc n v let next_sibling_below t = (); fun n _ -> tree_next_sibling t.doc n -let next_element_below t = (); fun n _ -> tree_next_element t.doc n +let next_element_below t = (); fun n _ -> tree_next_element t.doc n + let tagged_following_sibling_below t tag = (); fun n _ -> tree_tagged_following_sibling t.doc n tag let select_following_sibling_below t = fun ts -> @@ -752,10 +770,26 @@ let print_xml_fast3 t = tree_print_xml_fast3 t.doc +let stats t = + let tree = t.doc in + let rec loop left node acc_d total_d num_leaves = + if node == nil then + (acc_d+total_d,if left then num_leaves+1 else num_leaves) + else + let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in + loop false (tree_next_sibling tree node) (acc_d) d td + in + let a,b = loop true root 0 0 0 + in + Printf.eprintf "Average depth: %f, number of leaves %i\n%!" ((float_of_int a)/. (float_of_int b)) b +;; - +let test_prefix t s = Array.length (text_prefix t.doc s) +let test_suffix t s = Array.length (text_suffix t.doc s) +let test_contains t s = Array.length (text_contains t.doc s) +let test_equals t s = Array.length (text_equals t.doc s)