From: kim Date: Mon, 7 Feb 2011 17:32:35 +0000 (+0000) Subject: Changed building of tag tables and format. X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;h=1b4d4c7a0537d30e21068f06535c5d3a1af92f88;p=SXSI%2Fxpathcomp.git Changed building of tag tables and format. git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/xpathcomp@954 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/Makefile b/Makefile index 8ecd5d8..5f225f1 100644 --- a/Makefile +++ b/Makefile @@ -64,7 +64,7 @@ PROFILE_FLAGS = -p -S SYNT_PROF = -ppopt -DPROFILE endif SYNT_FLAGS = $(SYNT_DEBUG) $(SYNT_PROF) -OPT_FLAGS = $(DEBUG_FLAGS) $(PROFILE_FLAGS) -nodynlink -fno-PIC +OPT_FLAGS = $(DEBUG_FLAGS) $(PROFILE_FLAGS) -nodynlink -fno-PIC -unsafe OCAMLOPT = ocamlopt -cc "$(CXX)" $(OPT_FLAGS) -ccopt -O3 -ccopt -std=c++0x -noassert -inline $(INLINE) @@ -75,7 +75,7 @@ OCAMLDEP = ocamldep #Ugly but seems difficult with a makefile LINK=$(OCAMLOPT) -linkpkg `ocamlc -version | grep -q "3.1[12].[012]" && echo dynlink.cmxa` camlp4lib.cmxa -SYNTAX= -syntax camlp4o $(PPINCLUDES) -ppopt pa_macro.cmo $(SYNT_FLAGS) +SYNTAX= -syntax camlp4o $(PPINCLUDES) -ppopt -printer -ppopt auto -ppopt pa_macro.cmo $(SYNT_FLAGS) diff --git a/OCamlDriver.cpp b/OCamlDriver.cpp index 0e3af12..39190d6 100644 --- a/OCamlDriver.cpp +++ b/OCamlDriver.cpp @@ -430,6 +430,10 @@ NoAlloc extern "C" value caml_xml_tree_size(value tree){ return (Val_int(XMLTREE(tree)->Size())); } +NoAlloc extern "C" value caml_xml_tree_num_tags(value tree){ + return (Val_int(XMLTREE(tree)->NumTags())); +} + NoAlloc extern "C" value caml_xml_tree_subtree_size(value tree, value node){ return (Val_int(XMLTREE(tree)->SubtreeSize(TREENODEVAL(node)))); } diff --git a/tree.ml b/tree.ml index ee5e76b..4958a82 100644 --- a/tree.ml +++ b/tree.ml @@ -22,7 +22,10 @@ type node_kind = [`Text | `Tree ] type t = { doc : tree; - ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t; + children : Ptset.Int.t array; + siblings : Ptset.Int.t array; + descendants: Ptset.Int.t array; + followings: Ptset.Int.t array; } external inode : 'a node -> int = "%identity" @@ -70,6 +73,7 @@ external text_lessthan : tree -> string -> [`Text ] node array = "caml_text_coll external tree_root : tree -> [`Tree] node = "caml_xml_tree_root" "noalloc" external tree_size : tree -> int = "caml_xml_tree_size" "noalloc" +external tree_num_tags : tree -> int = "caml_xml_tree_num_tags" "noalloc" external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size" "noalloc" external tree_subtree_elements : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_elements" "noalloc" external tree_subtree_tags : tree -> [`Tree] node -> Tag.t -> int = "caml_xml_tree_subtree_elements" "noalloc" @@ -180,87 +184,122 @@ let subtree_size t i = tree_subtree_size t.doc i let subtree_elements t i = tree_subtree_elements t.doc i let text_size t = text_size t.doc -module MemUnion = Hashtbl.Make (struct - type t = Ptset.Int.t*Ptset.Int.t - let equal (x,y) (z,t) = x == z && y == t - let equal a b = equal a b || equal b a - let hash (x,y) = (* commutative hash *) - let x = Uid.to_int (Ptset.Int.uid x) - and y = Uid.to_int (Ptset.Int.uid y) - in - if x <= y then HASHINT2(x,y) else HASHINT2(y,x) - end) - -module MemAdd = Hashtbl.Make ( - struct - type t = Tag.t*Ptset.Int.t - let equal (x,y) (z,t) = (x == z)&&(y == t) - let hash (x,y) = HASHINT2(x,Uid.to_int (Ptset.Int.uid y)) - end) - -module MemUpdate = struct -include Hashtbl.Make ( - struct - type t = Tag.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t - let equal (a1,b1,c1,d1,e1) (a2,b2,c2,d2,e2) = a1==a2 && - b1 == b2 && c1 == c2 && d1 == d2 && e1 == e2 - let hash (a,b,c,d,e) = - HASHINT4(HASHINT2(a,Uid.to_int (Ptset.Int.uid b)), - Uid.to_int (Ptset.Int.uid c), - Uid.to_int (Ptset.Int.uid d), - Uid.to_int (Ptset.Int.uid e)) - end) - -end -let collect_tags tree = - let _ = Printf.eprintf "Collecting Tags\n%!" in - let h_union = MemUnion.create BIG_H_SIZE in - let pt_cup s1 s2 = - try - MemUnion.find h_union (s1,s2) - with - | Not_found -> let s = Ptset.Int.union s1 s2 - in - MemUnion.add h_union (s1,s2) s;s - in - let h_add = MemAdd.create BIG_H_SIZE in - let pt_add t s = - try MemAdd.find h_add (t,s) - with - | Not_found -> let r = Ptset.Int.add t s in - MemAdd.add h_add (t,s) r;r - in - let h = Hashtbl.create BIG_H_SIZE in - let update t sc sb ss sa = - let schild,sbelow,ssibling,safter = - try - Hashtbl.find h t - with - | Not_found -> - (Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty) - in - Hashtbl.replace h t - (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa) +let rec fold_siblings tree f node acc = + if node == nil then acc else fold_siblings tree f (tree_next_sibling tree node) (f node acc) +module TS = + struct + type t = bool array + let create n = Array.create n false + let add e a = a.(e) <- true; a + let merge a b = + for i = 0 to Array.length a - 1 do + a.(i) <- a.(i) || b.(i) + done + let clear a = + for i = 0 to Array.length a - 1 do + a.(i) <- false; + done + + let to_ptset a = + let r = ref Ptset.Int.empty in + for i = 0 to Array.length a - 1 do + r := Ptset.Int.add i !r; + done; + !r + end + + +let collect_children_siblings tree = + let ntags = (tree_num_tags tree) in + let () = Printf.eprintf ">>>length: %i\n%!" ntags in + let table_c = Array.init (tree_num_tags tree) (fun _ -> TS.create ntags) in + let table_n = Array.init (tree_num_tags tree) (fun _ -> TS.create ntags) in + let acc_tag n s = TS.add (tree_tag tree n) s in + let count = ref 0 in + let size = tree_subtree_size tree root in + let tmp = TS.create ntags in + let rec loop node = + if node == nil then () + else + let () = if !count mod 10000 == 0 then + Printf.eprintf "Node %i / %i\n%!" !count size; + in + let () = if !count mod 1000000 == 0 then Gc.compact() in + let () = count := !count + 1 in + let tag = tree_tag tree node in + let () = TS.clear tmp in + let children = + fold_siblings tree + acc_tag + (tree_first_child tree node) tmp + in + let () = TS.merge table_c.(tag) children in + let () = TS.clear tmp in + let siblings = + fold_siblings tree + acc_tag + (tree_next_sibling tree node) tmp + in + TS.merge table_n.(tag) siblings; + loop (tree_first_child tree node); + loop (tree_next_sibling tree node) in - let rec loop right id acc_after = - if id == nil - then Ptset.Int.empty,Ptset.Int.empty,acc_after else - let sibling2,desc2,after2 = loop true (tree_next_sibling tree id) acc_after in - let child1,desc1,after1 = loop false (tree_first_child tree id) after2 in - let tag = tree_tag tree id in - update tag child1 desc1 sibling2 after2; - ( pt_add tag sibling2, - pt_add tag (pt_cup desc1 desc2), - if right then pt_cup after1 (pt_cup desc1 desc2) else acc_after ) + loop root; + ( Array.map TS.to_ptset table_c, + Array.map TS.to_ptset table_n ) + +let collect_children_siblings tree = + let table_c = Array.create (tree_num_tags tree) Ptset.Int.empty in + let table_n = Array.copy table_c in + let rec loop node = + if node == nil then Ptset.Int.empty + else + let children = loop (tree_first_child tree node) in + let tag = tree_tag tree node in + let () = table_c.(tag) <- Ptset.Int.union table_c.(tag) children in + let siblings = loop (tree_next_sibling tree node) in + Ptset.Int.add tag siblings in - let _ = loop false (tree_root tree) Ptset.Int.empty in - let _ = Printf.eprintf "Finished\n%!" in - h + ignore (loop root); + table_c, table_n +let collect_descendants tree = + let table_d = Array.create (tree_num_tags tree) Ptset.Int.empty in + let rec loop node = + if node == nil then Ptset.Int.empty + else + let d1 = loop (tree_first_child tree node) in + let d2 = loop (tree_next_sibling tree node) in + let tag = tree_tag tree node in + table_d.(tag) <- Ptset.Int.union table_d.(tag) d1; + Ptset.Int.add tag (Ptset.Int.union d1 d2) + in + ignore (loop root); + table_d + +let collect_followings tree = + let table_f = Array.create (tree_num_tags tree) Ptset.Int.empty in + let rec loop node acc = + if node == nil then acc else + let f1 = loop (tree_next_sibling tree node) acc in + let f2 = loop (tree_first_child tree node) f1 in + let tag = tree_tag tree node in + table_f.(tag) <- Ptset.Int.union table_f.(tag) f1; + Ptset.Int.add tag (Ptset.Int.union f1 f2) + in + ignore (loop root Ptset.Int.empty); + table_f + +let collect_tags tree = + let c,n = time (collect_children_siblings) tree ~msg:"Collecting child and sibling tags" in + let d = time collect_descendants tree ~msg:"Collecting descendant tags" in + let f = time collect_followings tree ~msg:"Collecting following tags" in + c,n,d,f + let contains_array = ref [| |] let contains_index = Hashtbl.create 4096 let in_array _ i = @@ -349,24 +388,14 @@ let is_root t = t == root let node_of_t t = let _ = Tag.init (Obj.magic t) in - let table = collect_tags t - in (* - let _ = Hashtbl.iter (fun t (c,d,ns,f) -> - Printf.eprintf "Tag %s has:\n" (Tag.to_string t); - Printf.eprintf "Child tags: "; - Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) c; - Printf.eprintf "\nDescendant tags: "; - Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) d; - Printf.eprintf "\nNextSibling tags: "; - Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) ns; - Printf.eprintf "\nFollowing tags: "; - Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) f; - Printf.eprintf "\n\n%!";) table + let c,n,d,f = collect_tags t in - - *) { doc= t; - ttable = table; + children = c; + siblings = n; + descendants = d; + followings = f + } let finalize _ = Printf.eprintf "Release the string list !\n%!" @@ -387,7 +416,7 @@ let size t = tree_size t.doc;; external pool : tree -> Tag.pool = "%identity" let magic_string = "SXSI_INDEX" -let version_string = "2" +let version_string = "3" let pos fd = Unix.lseek fd 0 Unix.SEEK_CUR @@ -414,6 +443,9 @@ let read fd = let _ = really_read fd buffer 0 size in buffer +let save_tag_table channel t = + let t = Array.map (fun s -> Array.of_list (Ptset.Int.elements s)) t in + Marshal.to_channel channel t [] let save t str = let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in @@ -423,13 +455,19 @@ let save t str = output_char out_c '\n'; output_string out_c version_string; output_char out_c '\n'; - Marshal.to_channel out_c t.ttable [ ]; + save_tag_table out_c t.children; + save_tag_table out_c t.siblings; + save_tag_table out_c t.descendants; + save_tag_table out_c t.followings; (* we need to move the fd to the correct position *) flush out_c; ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET); tree_save t.doc fd str; close_out out_c ;; +let load_tag_table channel = + let table : int array array = Marshal.from_channel channel in + Array.map (fun a -> Ptset.Int.from_list (Array.to_list a)) table let load ?(sample=64) ?(load_text=true) str = let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in @@ -438,28 +476,21 @@ let load ?(sample=64) ?(load_text=true) str = let load_table () = (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file"); (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file"); - let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t = - Marshal.from_channel in_c - in - let ntable = Hashtbl.create (Hashtbl.length table) in - Hashtbl.iter (fun k (s1,s2,s3,s4) -> - let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty - and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty - and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty - and ss4 = Ptset.Int.fold (Ptset.Int.add) s4 Ptset.Int.empty - in Hashtbl.add ntable k (ss1,ss2,ss3,ss4) - ) table; - Hashtbl.clear table; - (* The in_channel read a chunk of fd, so we might be after - the start of the XMLTree save file. Reset to the correct - position *) - ntable + let c = load_tag_table in_c in + let s = load_tag_table in_c in + let d = load_tag_table in_c in + let f = load_tag_table in_c in + c,s,d,f in let _ = Printf.eprintf "\nLoading tag table : " in - let ntable = time (load_table) () in + let c,s,d,f = time (load_table) () in ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET); let tree = { doc = tree_load fd str load_text sample; - ttable = ntable;} + children = c; + siblings = s; + descendants = d; + followings = f + } in close_in in_c; tree @@ -716,17 +747,21 @@ let array_find a i j = print_xml_fast outc tree (first_child tree t) else print_xml_fast outc tree t -let tags_children t tag = - let a,_,_,_ = Hashtbl.find t.ttable tag in a -let tags_below t tag = - let _,a,_,_ = Hashtbl.find t.ttable tag in a -let tags_siblings t tag = - let _,_,a,_ = Hashtbl.find t.ttable tag in a -let tags_after t tag = - let _,_,_,a = Hashtbl.find t.ttable tag in a +let tags_children t tag = t.children.(tag) + +let tags_below t tag = t.descendants.(tag) + +let tags_siblings t tag = t.siblings.(tag) + +let tags_after t tag = t.followings.(tag) + -let tags t tag = Hashtbl.find t.ttable tag +let tags t tag = + t.children.(tag), + t.descendants.(tag), + t.siblings.(tag), + t.followings.(tag) let rec binary_parent t n = diff --git a/utils.ml b/utils.ml index 36334f6..b076733 100644 --- a/utils.ml +++ b/utils.ml @@ -86,13 +86,13 @@ let time_mem f x = Printf.eprintf "Final Mem: %s\n\n\n%!" s2; r ;; -let time f ?(count=1) x = +let time f ?(count=1) ?(msg="") x = let rec loop i = let t1 = Unix.gettimeofday () in let r = f x in let t2 = Unix.gettimeofday () in let t = (1000. *. (t2 -. t1)) in - Printf.eprintf "run %i/%i, %fms\n%!" i count t; + Printf.eprintf "%s: run %i/%i, %fms\n%!" msg i count t; if i >= count then (l:= t::!l;r) else loop (i+1) in loop 1