X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=src%2Ftree.ml;h=b7b856641c9075a97e19c5f82bbbc79b31d08c85;hb=7e27afe6fa006ad355237ccc0695c6493ea57929;hp=af2ec3f259596dac27266d988b5eac22f638e4a8;hpb=d6c57f01eabebe2b11e1c701835562c2efc2fd92;p=SXSI%2Fxpathcomp.git diff --git a/src/tree.ml b/src/tree.ml index af2ec3f..b7b8566 100644 --- a/src/tree.ml +++ b/src/tree.ml @@ -5,7 +5,8 @@ (* Distributed under the terms of the LGPL (see LICENCE) *) (******************************************************************************) INCLUDE "debug.ml" - INCLUDE "utils.ml" +INCLUDE "log.ml" +INCLUDE "utils.ml" external init_lib : unit -> unit = "sxsi_cpp_init" @@ -56,6 +57,7 @@ struct external open_tag : t -> string -> unit = "caml_xml_tree_builder_open_tag" external close_tag : t -> string -> unit = "caml_xml_tree_builder_close_tag" external text : t -> string -> unit = "caml_xml_tree_builder_text" + external trim : string -> string = "caml_trim" let is_whitespace s = let rec loop len i = @@ -79,11 +81,12 @@ struct let do_text b t = if Buffer.length t > 0 then begin let s = Buffer.contents t in - begin - open_tag b "<$>"; - text b s; - close_tag b "<$>"; - end; + if (!Config.index_empty_texts) || not (is_whitespace s) then + begin + open_tag b "<$>"; + text b s; + close_tag b "<$>"; + end; Buffer.clear t end @@ -120,17 +123,18 @@ struct let finalize () = do_text build buf; close_tag build ""; - Logger.print Format.err_formatter "Finished parsing@\n"; - Logger.print Format.err_formatter "Starting index construction@\n"; - let r = close_document build in - Logger.print Format.err_formatter "Finished index construction@\n"; + LOG ( __ "parsing" 2 "%s\n" "Finished parsing"); + LOG ( __ "indexing" 2 "%s\n" "Starting index construction"); + let r = close_document build + in + LOG ( __ "indexing" 2 "%s\n" "Finished index construction"); r in Expat.set_start_element_handler parser_ (start_element_handler parser_ build buf); Expat.set_end_element_handler parser_ (end_element_handler parser_ build buf); Expat.set_character_data_handler parser_ (character_data_handler parser_ build buf); - Logger.print Format.err_formatter "Started parsing@\n"; - open_document build !Options.sample_factor !Options.disable_text_collection !Options.text_index_type; + LOG ( __ "parsing" 2 "%s\n" "Started parsing"); + open_document build !Config.sample_factor !Config.disable_text_collection !Config.text_index_type; open_tag build ""; parser_, finalize @@ -143,7 +147,7 @@ struct let in_chan = open_in file in let buffer = String.create 4096 in let parser_, finalizer = create_parser () in - let () = + let parse () = try while true do let read = input in_chan buffer 0 4096 in @@ -155,7 +159,8 @@ struct | End_of_file -> close_in in_chan | e -> raise e in - finalizer () + Utils.time ~msg:"Parsing XML file" parse (); + Utils.time ~msg:"Creating tree and text-collection index" finalizer () end @@ -182,11 +187,16 @@ let bit_vector_create n = let len = if n <= 0 then 0 else (n - 1) / 8 + 1 in String.make len '\000' +type tag_list + +external tag_list_alloc : int -> tag_list = "caml_tag_list_alloc" +external tag_list_set : tag_list -> int -> Tag.t -> unit = "caml_tag_list_set" "noalloc" + type t = { doc : tree; elements: Ptset.Int.t; attributes: Ptset.Int.t; - attribute_array : Tag.t array; + attribute_array : tag_list; children : Ptset.Int.t array; siblings : Ptset.Int.t array; descendants: Ptset.Int.t array; @@ -212,10 +222,6 @@ external nullt : unit -> 'a Node.t = "caml_xml_tree_nullt" let nil : [`Tree ] Node.t = Node.nil let root : [`Tree ] Node.t = Node.null -type tag_list - -external tag_list_alloc : int -> tag_list = "caml_tag_list_alloc" -external tag_list_set : tag_list -> int -> Tag.t -> unit = "caml_tag_list_set" "noalloc" module HPtset = Hashtbl.Make(Ptset.Int) @@ -337,8 +343,25 @@ let subtree_tags t n tag = tree_subtree_tags t.doc n tag external tree_subtree_size : tree -> [`Tree] Node.t -> int = "caml_xml_tree_subtree_size" "noalloc" let subtree_size t n = tree_subtree_size t.doc n +let rec iter_array_tag i a len tree node acc = + if i == len then acc + else + iter_array_tag (i+1) a len tree node + (acc - (tree_subtree_tags tree node a.(i))) + external tree_subtree_elements : tree -> [`Tree] Node.t -> int = "caml_xml_tree_subtree_elements" "noalloc" -let subtree_elements t n = tree_subtree_elements t.doc n + +let subtree_elements t node = + tree_subtree_elements t.doc node +(* +let subtree_elements t node = + let size = tree_subtree_size t.doc node - 1 in + if size <= 0 then 0 + else let size = size - (tree_subtree_tags t.doc node Tag.pcdata) in + if size < 3 then size else + let a = t.attribute_array in + iter_array_tag 0 a (Array.length a) t.doc node size +*) external tree_closing : tree -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_closing" "noalloc" let closing t n = tree_closing t.doc n @@ -350,19 +373,7 @@ external tree_size : tree -> int = "caml_xml_tree_size" "noalloc" let size t = tree_size t.doc -let stats t = - let tree = t.doc in - let rec loop left node acc_d total_d num_leaves = - if node == nil then - (acc_d+total_d,if left then num_leaves+1 else num_leaves) - else - let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in - loop false (tree_next_sibling tree node) (acc_d) d td - in - let a,b = loop true root 0 0 0 - in - Logger.print err_formatter "Average depth: %f, number of leaves %i@\n@?" ((float_of_int a)/. (float_of_int b)) b -;; + module TagS = struct @@ -531,10 +542,10 @@ let is_node t = t != nil let is_root t = t == root let node_of_t t = - Logger.print err_formatter "Initializing tag structure@\n"; + LOG ( __ "indexing" 2 "%s\n" "Initializing tag structure"); let _ = Tag.init (mk_tag_ops t) in - Logger.print err_formatter "Starting tag table construction@\n"; - let f, n, c, d = time collect_labels t ~msg:"Building tag relationship table" in + LOG ( __ "indexing" 2 "%s\n" "Starting tag table construction"); + let f, n, c, d = Utils.time ~msg:"Building tag relationship table" collect_labels t in let c = Array.map TagS.to_ptset c in let n = Array.map TagS.to_ptset n in let f = Array.map TagS.to_ptset f in @@ -547,7 +558,7 @@ let node_of_t t = in { doc= t; attributes = attributes; - attribute_array = Array.of_list (Ptset.Int.elements attributes); + attribute_array = tag_list_of_set attributes; elements = elements; children = c; siblings = n; @@ -563,7 +574,7 @@ let parse_xml_string str = node_of_t (TreeBuilder.parse_string str) let size t = tree_size t.doc;; let magic_string = "SXSI_INDEX" -let version_string = "3" +let version_string = "4" let pos fd = Unix.lseek fd 0 Unix.SEEK_CUR @@ -624,7 +635,7 @@ let load ?(sample=64) ?(load_text=true) str = let _ = set_binary_mode_in in_c true in let load_table () = (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file"); - (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file"); + (let vs = input_line in_c in if vs <> version_string then failwith "Unsupported index format"); let c = load_tag_table in_c in let s = load_tag_table in_c in let d = load_tag_table in_c in @@ -642,7 +653,7 @@ let load ?(sample=64) ?(load_text=true) str = in let tree = { doc = xml_tree; attributes = attributes; - attribute_array = Array.of_list (Ptset.Int.elements attributes); + attribute_array = tag_list_of_set attributes; elements = elements; children = c; siblings = s; @@ -711,6 +722,7 @@ let query_fun = function let _pred_cache = Hashtbl.create 17 ;; let mk_pred query s = + LOG ( __ "bottom-up" 3 "Calling mk_pred for '%s'\n" s); let f = query_fun query in let memo = ref (fun _ _ -> failwith "Undefined") in memo := begin fun tree node -> @@ -723,9 +735,13 @@ let mk_pred query s = in let bv = results.bv in memo := begin fun _ n -> - bit_vector_unsafe_get bv (Node.to_int n) + let r = bit_vector_unsafe_get bv (Node.to_int n) in + LOG( __ "bottom-up" 3 "Running predicate on node %i = %b@\n" (Node.to_int n) r); + r end; - bit_vector_unsafe_get bv (Node.to_int node) + let r = bit_vector_unsafe_get bv (Node.to_int node) in + LOG( __ "bottom-up" 3 "Running predicate on node %i = %b@\n" (Node.to_int node) r); + r end; Predicate.make memo @@ -742,3 +758,54 @@ let full_text_query q t s = let res = (query_fun q) t s true in Hashtbl.replace _pred_cache (q,s) res; res.pos + +let stats tree = + let h = Hashtbl.create 1024 in + let depth = ref 0 in + let numleaves = ref 0 in + let numtexts = ref 0 in + let rec traverse tree t p d = + if is_nil t then + let oldc = + try + Hashtbl.find h p + with Not_found -> 0 + in + Hashtbl.replace h p (oldc + 1); + if d > !depth then depth := d; + incr numleaves + else + let label = tree_tag tree t in + if label == Tag.pcdata || label == Tag.attribute_data then incr numtexts; + iter_siblings tree t (label::p) (d+1) + and iter_siblings tree t p d = + if is_nil t then () else + let fs = tree_first_child tree t in + traverse tree fs p d; + let ns = tree_next_sibling tree t in + iter_siblings tree ns p d + in + traverse tree.doc root [] 0; + let sumdepth = Hashtbl.fold (fun p c acc -> (List.length p) * c + acc) h 0 in + let alltags = Ptset.Int.union tree.elements tree.attributes in + Logger.print err_formatter "Statistics :@\n\ +Average depth: %f@\n\ +Longest path: %i@\n\ +Number of distinct paths: %i@\n\ +Number of nodes: %i@\n\ +Number of leaves: %i@\n\ +Number of pcdata/cdata nodes: %i@\n\ +Number of distinct tags: %i@\n\ +Largest tag id: %i@\n@?" + (float_of_int sumdepth /. float_of_int !numleaves) + !depth + (Hashtbl.length h) + (tree_subtree_size tree.doc root) + !numleaves + !numtexts + (Ptset.Int.cardinal alltags) + (Ptset.Int.max_elt alltags) + + +type tree_pointer = tree +let get_tree_pointer x = x.doc