From 0da8c3c7c76ab06d5ccfc6ae52488d7549735059 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Kim=20Nguy=E1=BB=85n?=
Date: Fri, 4 May 2012 16:00:11 +0200
Subject: [PATCH] Add -doc-stats options to print document statistics.

---
Reviewer note: Tree.stats below differs from the original commit. Its
traversal now pushes every node's own tag onto the path; the original
reused the first sibling's tag for all following siblings and never
inspected the tags of non-first siblings, so distinct paths and text-node
counts were wrong. Hunk sizes and the diffstat were updated accordingly;
the blob ids on the "index" lines still refer to the original commit.

 src/main.ml     |  1 +
 src/options.ml  |  3 +++
 src/options.mli |  1 +
 src/tree.ml     | 69 +++++++++++++++++++++++++++++++++++++++----------
 4 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/src/main.ml b/src/main.ml
index 48f2e46..8e0e130 100644
--- a/src/main.ml
+++ b/src/main.ml
@@ -33,6 +33,7 @@ let mk_runtime run auto doc arg count print outfile =
 
 let main v query_string output =
   Tag.init (Tree.tag_operations v);
+  if !Options.docstats then Tree.stats v;
   let query =
     time ~msg:"Parsing query" XPath.parse query_string
   in
diff --git a/src/options.ml b/src/options.ml
index 0c7c92d..1e349a6 100644
--- a/src/options.ml
+++ b/src/options.ml
@@ -19,6 +19,7 @@ let text_index_type = ref 0
 let do_perf = ref false
 let twopass = ref false
 let repeat = ref 1
+let docstats = ref false
 
 let set_index_type = function
   | "default" -> text_index_type := 0
@@ -89,6 +90,8 @@ let spec = Arg.align
 
       "-r", Arg.Set_int(repeat),
       " repeat query execution n time (benchmarking only, default 1)";
+      "-doc-stats", Arg.Set(docstats),
+      " Compute document statistics (performs full traversal)";
       "-v", Arg.Set(verbose),
       " verbose mode";
     ] @
diff --git a/src/options.mli b/src/options.mli
index 8393ead..cfb229d 100644
--- a/src/options.mli
+++ b/src/options.mli
@@ -16,3 +16,4 @@ val text_index_type : int ref
 val do_perf : bool ref
 val twopass : bool ref
 val repeat : int ref
+val docstats : bool ref
diff --git a/src/tree.ml b/src/tree.ml
index ae256dd..ec253bc 100644
--- a/src/tree.ml
+++ b/src/tree.ml
@@ -84,7 +84,6 @@ struct
       if (!Options.index_empty_texts) || not (is_whitespace s) then
         begin
           open_tag b "<$>";
-          Printf.eprintf "Inserting >>%s<<\n" s;
           text b s;
           close_tag b "<$>";
         end;
@@ -372,19 +371,7 @@ external tree_size : tree -> int = "caml_xml_tree_size" "noalloc"
 let size t =
   tree_size t.doc
 
-let stats t =
-  let tree = t.doc in
-  let rec loop left node acc_d total_d num_leaves =
-    if node == nil then
-      (acc_d+total_d,if left then num_leaves+1 else num_leaves)
-    else
-      let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in
-      loop false (tree_next_sibling tree node) (acc_d) d td
-  in
-  let a,b = loop true root 0 0 0
-  in
-  Logger.print err_formatter "Average depth: %f, number of leaves %i@\n@?" ((float_of_int a)/. (float_of_int b)) b
-;;
+
 
 module TagS =
 struct
@@ -769,3 +756,57 @@ let full_text_query q t s =
   let res = (query_fun q) t s true in
   Hashtbl.replace _pred_cache (q,s) res;
   res.pos
+
+(** [stats tree] walks the whole document once and prints summary
+    statistics on [err_formatter]: average and maximal leaf depth, number
+    of distinct root-to-leaf tag paths, number of nodes, of leaves, of
+    pcdata/attribute-data nodes, and of distinct tags. *)
+let stats tree =
+  let h = Hashtbl.create 1024 in
+  let depth = ref 0 in
+  let numleaves = ref 0 in
+  let numtexts = ref 0 in
+  (* Record one leaf reached through the reversed tag path [p] at depth [d]. *)
+  let add_leaf p d =
+    let oldc = try Hashtbl.find h p with Not_found -> 0 in
+    Hashtbl.replace h p (oldc + 1);
+    if d > !depth then depth := d;
+    incr numleaves
+  in
+  (* Visit [t] and each of its following siblings.  [p] is the reversed tag
+     path of their parent and [d] the parent's depth.  Every sibling pushes
+     its own tag before descending, so paths and text counts cover all nodes. *)
+  let rec traverse tree t p d =
+    if not (is_nil t) then begin
+      let label = tree_tag tree t in
+      if label == Tag.pcdata || label == Tag.attribute_data then incr numtexts;
+      let path = label :: p in
+      let fs = tree_first_child tree t in
+      if is_nil fs then add_leaf path (d+1)
+      else traverse tree fs path (d+1);
+      (* Tail call: long sibling lists do not grow the stack. *)
+      traverse tree (tree_next_sibling tree t) p d
+    end
+  in
+  traverse tree.doc root [] 0;
+  let sumdepth = Hashtbl.fold (fun p c acc -> (List.length p) * c + acc) h 0 in
+  (* Guard the division: no leaf is recorded when the root itself is nil. *)
+  let avgdepth =
+    if !numleaves = 0 then 0.
+    else float_of_int sumdepth /. float_of_int !numleaves
+  in
+  Logger.print err_formatter "Statistics :@\n\
+Average depth: %f@\n\
+Longest path: %i@\n\
+Number of distinct paths: %i@\n\
+Number of nodes: %i@\n\
+Number of leaves: %i@\n\
+Number of pcdata/cdata nodes: %i@\n\
+Number of distinct tags: %i@\n@?"
+    avgdepth
+    !depth
+    (Hashtbl.length h)
+    (tree_subtree_size tree.doc root)
+    !numleaves
+    !numtexts
+    (Ptset.Int.cardinal (Ptset.Int.union tree.elements tree.attributes))
-- 
2.17.1