X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=src%2Ftree.ml;h=f73e347e041919c780b55fd5edccc9068b1004ab;hb=744e0c89a77d0eb82c64a98d8a8a6860af875653;hp=ae256ddfc6baf24c4d381b247e012175fb77a47f;hpb=e4e34e687798285e65df74561915f7721b066631;p=SXSI%2Fxpathcomp.git diff --git a/src/tree.ml b/src/tree.ml index ae256dd..f73e347 100644 --- a/src/tree.ml +++ b/src/tree.ml @@ -84,7 +84,6 @@ struct if (!Options.index_empty_texts) || not (is_whitespace s) then begin open_tag b "<$>"; - Printf.eprintf "Inserting >>%s<<\n" s; text b s; close_tag b "<$>"; end; @@ -348,10 +347,10 @@ let rec iter_array_tag i a len tree node acc = iter_array_tag (i+1) a len tree node (acc - (tree_subtree_tags tree node a.(i))) -external tree_subtree_elements : tree -> [`Tree] Node.t -> tag_list -> int = "caml_xml_tree_subtree_elements" "noalloc" +external tree_subtree_elements : tree -> [`Tree] Node.t -> int = "caml_xml_tree_subtree_elements" "noalloc" let subtree_elements t node = - tree_subtree_elements t.doc node t.attribute_array + tree_subtree_elements t.doc node (* let subtree_elements t node = let size = tree_subtree_size t.doc node - 1 in @@ -372,19 +371,7 @@ external tree_size : tree -> int = "caml_xml_tree_size" "noalloc" let size t = tree_size t.doc -let stats t = - let tree = t.doc in - let rec loop left node acc_d total_d num_leaves = - if node == nil then - (acc_d+total_d,if left then num_leaves+1 else num_leaves) - else - let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in - loop false (tree_next_sibling tree node) (acc_d) d td - in - let a,b = loop true root 0 0 0 - in - Logger.print err_formatter "Average depth: %f, number of leaves %i@\n@?" ((float_of_int a)/. (float_of_int b)) b -;; + module TagS = struct @@ -585,7 +572,7 @@ let parse_xml_string str = node_of_t (TreeBuilder.parse_string str) let size t = tree_size t.doc;; let magic_string = "SXSI_INDEX" -let version_string = "3" +let version_string = "4" let pos fd = Unix.lseek fd 0 Unix.SEEK_CUR @@ -646,7 +633,7 @@ let load ?(sample=64) ?(load_text=true) str = let _ = set_binary_mode_in in_c true in let load_table () = (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file"); - (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file"); + (let vs = input_line in_c in if vs <> version_string then failwith "Unsupported index format"); let c = load_tag_table in_c in let s = load_tag_table in_c in let d = load_tag_table in_c in @@ -769,3 +756,59 @@ let full_text_query q t s = let res = (query_fun q) t s true in Hashtbl.replace _pred_cache (q,s) res; res.pos + +let stats tree = + let h = Hashtbl.create 1024 in + let depth = ref 0 in + let numleaves = ref 0 in + let numtexts = ref 0 in + let rec traverse tree t p d = + if is_nil t then + let oldc = + try + Hashtbl.find h p + with Not_found -> 0 + in + Hashtbl.replace h p (oldc + 1); + if d > !depth then depth := d; + incr numleaves + else + let label = tree_tag tree t in + if label == Tag.pcdata || label == Tag.attribute_data then incr numtexts; + iter_siblings tree t (label::p) (d+1) + and iter_siblings tree t p d = + if is_nil t then () else + let fs = tree_first_child tree t in + traverse tree fs p d; + let ns = tree_next_sibling tree t in + iter_siblings tree ns p d + in + traverse tree.doc root [] 0; + let sumdepth = Hashtbl.fold (fun p c acc -> (List.length p) * c + acc) h 0 in + let alltags = Ptset.Int.union tree.elements tree.attributes in + Logger.print err_formatter "Statistics :@\n\ +Average depth: %f@\n\ +Longest path: %i@\n\ +Number of distinct paths: %i@\n\ +Number of nodes: %i@\n\ +Number of leaves: %i@\n\ +Number of pcdata/cdata nodes: %i@\n\ +Number of distinct tags: %i@\n\ +Largest tag id: %i@\n@?" + (float_of_int sumdepth /. float_of_int !numleaves) + !depth + (Hashtbl.length h) + (tree_subtree_size tree.doc root) + !numleaves + !numtexts + (Ptset.Int.cardinal alltags) + (Ptset.Int.max_elt alltags) + +(* + Logger.print err_formatter "Average depth: %f, number of leaves %i@\n@?" ((float_of_int a)/. (float_of_int b)) b +;; + +*) + +type tree_pointer = tree +let get_tree_pointer x = x.doc