let do_perf = ref false
let twopass = ref false
let repeat = ref 1
+let docstats = ref false (* set by the "-doc-stats" flag: compute document statistics (performs a full traversal) *)
let set_index_type = function
| "default" -> text_index_type := 0
"-r", Arg.Set_int(repeat),
" repeat query execution n time (benchmarking only, default 1)";
+ "-doc-stats", Arg.Set(docstats),
+ " Compute document statistics (performs full traversal)";
"-v", Arg.Set(verbose), " verbose mode"; ] @
if (!Options.index_empty_texts) || not (is_whitespace s) then
begin
open_tag b "<$>";
- Printf.eprintf "Inserting >>%s<<\n" s;
text b s;
close_tag b "<$>";
end;
let size t = tree_size t.doc
-let stats t =
- let tree = t.doc in
- let rec loop left node acc_d total_d num_leaves =
- if node == nil then
- (acc_d+total_d,if left then num_leaves+1 else num_leaves)
- else
- let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in
- loop false (tree_next_sibling tree node) (acc_d) d td
- in
- let a,b = loop true root 0 0 0
- in
- Logger.print err_formatter "Average depth: %f, number of leaves %i@\n@?" ((float_of_int a)/. (float_of_int b)) b
-;;
+
module TagS =
struct
let res = (query_fun q) t s true in
Hashtbl.replace _pred_cache (q,s) res;
res.pos
+
+(* [stats tree] performs one full traversal of [tree.doc] starting at [root]
+   and prints summary statistics on [err_formatter]: average leaf depth,
+   longest root-to-leaf path, number of distinct root-to-leaf tag paths,
+   node count, leaf count, number of pcdata/attribute-data nodes and number
+   of distinct tags.
+
+   A leaf is a node whose first child is nil.  The depth of a leaf is the
+   number of nodes on the path from the root down to and including it.
+
+   NOTE(review): if the document had no node at all the average would be
+   computed as 0/0 and printed as nan; presumably [root] is never nil. *)
+let stats tree =
+  (* Maps a tag path (leaf tag first, root tag last) to the number of leaves
+     reached through that exact path. *)
+  let h = Hashtbl.create 1024 in
+  let depth = ref 0 in
+  let numleaves = ref 0 in
+  let numtexts = ref 0 in
+  (* [traverse doc t p d] visits [t] and then each of its following siblings.
+     [p] is the list of ancestor tags (innermost first) and [d] the number of
+     ancestors.  Every node is looked up with its OWN tag: each sibling
+     contributes its own label to the path and is checked for being a text
+     node, not only the first node of the sibling chain. *)
+  let rec traverse doc t p d =
+    if not (is_nil t) then begin
+      let label = tree_tag doc t in
+      if label == Tag.pcdata || label == Tag.attribute_data then incr numtexts;
+      let path = label :: p in
+      let fs = tree_first_child doc t in
+      if is_nil fs then begin
+        (* [t] is a leaf: record its path and update the running maxima. *)
+        let oldc =
+          try
+            Hashtbl.find h path
+          with Not_found -> 0
+        in
+        Hashtbl.replace h path (oldc + 1);
+        if d + 1 > !depth then depth := d + 1;
+        incr numleaves
+      end else
+        traverse doc fs path (d + 1);
+      (* Siblings share the same ancestors, hence the same [p] and [d]. *)
+      traverse doc (tree_next_sibling doc t) p d
+    end
+  in
+  traverse tree.doc root [] 0;
+  (* The length of a recorded path equals the depth of the leaves counted
+     under it, so this is the sum of all leaf depths. *)
+  let sumdepth = Hashtbl.fold (fun p c acc -> (List.length p) * c + acc) h 0 in
+
+  Logger.print err_formatter "Statistics :@\n\
+Average depth: %f@\n\
+Longest path: %i@\n\
+Number of distinct paths: %i@\n\
+Number of nodes: %i@\n\
+Number of leaves: %i@\n\
+Number of pcdata/cdata nodes: %i@\n\
+Number of distinct tags: %i@\n@?"
+    (float_of_int sumdepth /. float_of_int !numleaves)
+    !depth
+    (Hashtbl.length h)
+    (tree_subtree_size tree.doc root)
+    !numleaves
+    !numtexts
+    (Ptset.Int.cardinal (Ptset.Int.union tree.elements tree.attributes))
+
+(*
+ Logger.print err_formatter "Average depth: %f, number of leaves %i@\n@?" ((float_of_int a)/. (float_of_int b)) b
+;;
+
+*)