let do_text b t =
if Buffer.length t > 0 then begin
let s = Buffer.contents t in
- if (!Options.index_empty_texts) || not (is_whitespace s) then
+ if (!Config.index_empty_texts) || not (is_whitespace s) then
begin
open_tag b "<$>";
- Printf.eprintf "Inserting >>%s<<\n" s;
text b s;
close_tag b "<$>";
end;
let finalize () =
do_text build buf;
close_tag build "";
- Logger.print Format.err_formatter "Finished parsing@\n";
- Logger.print Format.err_formatter "Starting index construction@\n";
- let r = close_document build in
- Logger.print Format.err_formatter "Finished index construction@\n";
+ LOG ( __ "parsing" 2 "%s\n" "Finished parsing");
+ LOG ( __ "indexing" 2 "%s\n" "Starting index construction");
+ let r = close_document build
+ in
+ LOG ( __ "indexing" 2 "%s\n" "Finished index construction");
r
in
Expat.set_start_element_handler parser_ (start_element_handler parser_ build buf);
Expat.set_end_element_handler parser_ (end_element_handler parser_ build buf);
Expat.set_character_data_handler parser_ (character_data_handler parser_ build buf);
- Logger.print Format.err_formatter "Started parsing@\n";
- open_document build !Options.sample_factor !Options.disable_text_collection !Options.text_index_type;
+ LOG ( __ "parsing" 2 "%s\n" "Started parsing");
+ open_document build !Config.sample_factor !Config.disable_text_collection !Config.text_index_type;
open_tag build "";
parser_, finalize
let in_chan = open_in file in
let buffer = String.create 4096 in
let parser_, finalizer = create_parser () in
- let () =
+ let parse () =
try
while true do
let read = input in_chan buffer 0 4096 in
| End_of_file -> close_in in_chan
| e -> raise e
in
- finalizer ()
+ Utils.time ~msg:"Parsing XML file" parse ();
+ Utils.time ~msg:"Creating tree and text-collection index" finalizer ()
end
iter_array_tag (i+1) a len tree node
(acc - (tree_subtree_tags tree node a.(i)))
-external tree_subtree_elements : tree -> [`Tree] Node.t -> tag_list -> int = "caml_xml_tree_subtree_elements" "noalloc"
+external tree_subtree_elements : tree -> [`Tree] Node.t -> int = "caml_xml_tree_subtree_elements" "noalloc"
let subtree_elements t node =
- tree_subtree_elements t.doc node t.attribute_array
+ tree_subtree_elements t.doc node
(*
let subtree_elements t node =
let size = tree_subtree_size t.doc node - 1 in
let size t = tree_size t.doc
-let stats t =
- let tree = t.doc in
- let rec loop left node acc_d total_d num_leaves =
- if node == nil then
- (acc_d+total_d,if left then num_leaves+1 else num_leaves)
- else
- let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in
- loop false (tree_next_sibling tree node) (acc_d) d td
- in
- let a,b = loop true root 0 0 0
- in
- Logger.print err_formatter "Average depth: %f, number of leaves %i@\n@?" ((float_of_int a)/. (float_of_int b)) b
-;;
+
module TagS =
struct
let is_root t = t == root
let node_of_t t =
- Logger.print err_formatter "Initializing tag structure@\n";
+ LOG ( __ "indexing" 2 "%s\n" "Initializing tag structure");
let _ = Tag.init (mk_tag_ops t) in
- Logger.print err_formatter "Starting tag table construction@\n";
- let f, n, c, d = time collect_labels t ~msg:"Building tag relationship table" in
+ LOG ( __ "indexing" 2 "%s\n" "Starting tag table construction");
+ let f, n, c, d = Utils.time ~msg:"Building tag relationship table" collect_labels t in
let c = Array.map TagS.to_ptset c in
let n = Array.map TagS.to_ptset n in
let f = Array.map TagS.to_ptset f in
let size t = tree_size t.doc;;
let magic_string = "SXSI_INDEX"
-let version_string = "3"
+let version_string = "4"
let pos fd =
Unix.lseek fd 0 Unix.SEEK_CUR
let _ = set_binary_mode_in in_c true in
let load_table () =
(let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
- (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
+ (let vs = input_line in_c in if vs <> version_string then failwith "Unsupported index format");
let c = load_tag_table in_c in
let s = load_tag_table in_c in
let d = load_tag_table in_c in
let res = (query_fun q) t s true in
Hashtbl.replace _pred_cache (q,s) res;
res.pos
+
+let stats tree =
+ let h = Hashtbl.create 1024 in
+ let depth = ref 0 in
+ let numleaves = ref 0 in
+ let numtexts = ref 0 in
+ let rec traverse tree t p d =
+ if is_nil t then
+ let oldc =
+ try
+ Hashtbl.find h p
+ with Not_found -> 0
+ in
+ Hashtbl.replace h p (oldc + 1);
+ if d > !depth then depth := d;
+ incr numleaves
+ else
+ let label = tree_tag tree t in
+ if label == Tag.pcdata || label == Tag.attribute_data then incr numtexts;
+ iter_siblings tree t (label::p) (d+1)
+ and iter_siblings tree t p d =
+ if is_nil t then () else
+ let fs = tree_first_child tree t in
+ traverse tree fs p d;
+ let ns = tree_next_sibling tree t in
+ iter_siblings tree ns p d
+ in
+ traverse tree.doc root [] 0;
+ let sumdepth = Hashtbl.fold (fun p c acc -> (List.length p) * c + acc) h 0 in
+ let alltags = Ptset.Int.union tree.elements tree.attributes in
+ Logger.print err_formatter "Statistics :@\n\
+Average depth: %f@\n\
+Longest path: %i@\n\
+Number of distinct paths: %i@\n\
+Number of nodes: %i@\n\
+Number of leaves: %i@\n\
+Number of pcdata/cdata nodes: %i@\n\
+Number of distinct tags: %i@\n\
+Largest tag id: %i@\n@?"
+ (float_of_int sumdepth /. float_of_int !numleaves)
+ !depth
+ (Hashtbl.length h)
+ (tree_subtree_size tree.doc root)
+ !numleaves
+ !numtexts
+ (Ptset.Int.cardinal alltags)
+ (Ptset.Int.max_elt alltags)
+
+
+type tree_pointer = tree
+let get_tree_pointer x = x.doc