(* Distributed under the terms of the LGPL (see LICENCE) *)
(******************************************************************************)
INCLUDE "debug.ml"
- INCLUDE "utils.ml"
+INCLUDE "log.ml"
+INCLUDE "utils.ml"
external init_lib : unit -> unit = "sxsi_cpp_init"
external open_tag : t -> string -> unit = "caml_xml_tree_builder_open_tag"
external close_tag : t -> string -> unit = "caml_xml_tree_builder_close_tag"
external text : t -> string -> unit = "caml_xml_tree_builder_text"
+ external trim : string -> string = "caml_trim"
let is_whitespace s =
let rec loop len i =
let do_text b t =
if Buffer.length t > 0 then begin
let s = Buffer.contents t in
- begin
- open_tag b "<$>";
- text b s;
- close_tag b "<$>";
- end;
+ if (!Options.index_empty_texts) || not (is_whitespace s) then
+ begin
+ open_tag b "<$>";
+ text b s;
+ close_tag b "<$>";
+ end;
Buffer.clear t
end
let len = if n <= 0 then 0 else (n - 1) / 8 + 1 in
String.make len '\000'
+type tag_list
+
+external tag_list_alloc : int -> tag_list = "caml_tag_list_alloc"
+external tag_list_set : tag_list -> int -> Tag.t -> unit = "caml_tag_list_set" "noalloc"
+
type t = {
doc : tree;
elements: Ptset.Int.t;
attributes: Ptset.Int.t;
- attribute_array : Tag.t array;
+ attribute_array : tag_list;
children : Ptset.Int.t array;
siblings : Ptset.Int.t array;
descendants: Ptset.Int.t array;
let nil : [`Tree ] Node.t = Node.nil
let root : [`Tree ] Node.t = Node.null
-type tag_list
-
-external tag_list_alloc : int -> tag_list = "caml_tag_list_alloc"
-external tag_list_set : tag_list -> int -> Tag.t -> unit = "caml_tag_list_set" "noalloc"
module HPtset = Hashtbl.Make(Ptset.Int)
external tree_subtree_size : tree -> [`Tree] Node.t -> int = "caml_xml_tree_subtree_size" "noalloc"
let subtree_size t n = tree_subtree_size t.doc n
+let rec iter_array_tag i a len tree node acc =
+ if i == len then acc
+ else
+ iter_array_tag (i+1) a len tree node
+ (acc - (tree_subtree_tags tree node a.(i)))
+
external tree_subtree_elements : tree -> [`Tree] Node.t -> int = "caml_xml_tree_subtree_elements" "noalloc"
-let subtree_elements t n = tree_subtree_elements t.doc n
+
+let subtree_elements t node =
+ tree_subtree_elements t.doc node
+(*
+let subtree_elements t node =
+ let size = tree_subtree_size t.doc node - 1 in
+ if size <= 0 then 0
+ else let size = size - (tree_subtree_tags t.doc node Tag.pcdata) in
+ if size < 3 then size else
+ let a = t.attribute_array in
+ iter_array_tag 0 a (Array.length a) t.doc node size
+*)
external tree_closing : tree -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_closing" "noalloc"
let closing t n = tree_closing t.doc n
let size t = tree_size t.doc
-let stats t =
- let tree = t.doc in
- let rec loop left node acc_d total_d num_leaves =
- if node == nil then
- (acc_d+total_d,if left then num_leaves+1 else num_leaves)
- else
- let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in
- loop false (tree_next_sibling tree node) (acc_d) d td
- in
- let a,b = loop true root 0 0 0
- in
- Logger.print err_formatter "Average depth: %f, number of leaves %i@\n@?" ((float_of_int a)/. (float_of_int b)) b
-;;
+
module TagS =
struct
in
{ doc= t;
attributes = attributes;
- attribute_array = Array.of_list (Ptset.Int.elements attributes);
+ attribute_array = tag_list_of_set attributes;
elements = elements;
children = c;
siblings = n;
let size t = tree_size t.doc;;
let magic_string = "SXSI_INDEX"
-let version_string = "3"
+let version_string = "4"
let pos fd =
Unix.lseek fd 0 Unix.SEEK_CUR
let _ = set_binary_mode_in in_c true in
let load_table () =
(let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
- (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
+ (let vs = input_line in_c in if vs <> version_string then failwith "Unsupported index format");
let c = load_tag_table in_c in
let s = load_tag_table in_c in
let d = load_tag_table in_c in
in
let tree = { doc = xml_tree;
attributes = attributes;
- attribute_array = Array.of_list (Ptset.Int.elements attributes);
+ attribute_array = tag_list_of_set attributes;
elements = elements;
children = c;
siblings = s;
let _pred_cache = Hashtbl.create 17
;;
let mk_pred query s =
+ LOG ( __ "bottom-up" 3 "Calling mk_pred for '%s'\n" s);
let f = query_fun query in
let memo = ref (fun _ _ -> failwith "Undefined") in
memo := begin fun tree node ->
in
let bv = results.bv in
memo := begin fun _ n ->
- bit_vector_unsafe_get bv (Node.to_int n)
+ let r = bit_vector_unsafe_get bv (Node.to_int n) in
+ LOG( __ "bottom-up" 3 "Running predicate on node %i = %b@\n" (Node.to_int n) r);
+ r
end;
- bit_vector_unsafe_get bv (Node.to_int node)
+ let r = bit_vector_unsafe_get bv (Node.to_int node) in
+ LOG( __ "bottom-up" 3 "Running predicate on node %i = %b@\n" (Node.to_int node) r);
+ r
end;
Predicate.make memo
let res = (query_fun q) t s true in
Hashtbl.replace _pred_cache (q,s) res;
res.pos
+
+let stats tree =
+ let h = Hashtbl.create 1024 in
+ let depth = ref 0 in
+ let numleaves = ref 0 in
+ let numtexts = ref 0 in
+ let rec traverse tree t p d =
+ if is_nil t then
+ let oldc =
+ try
+ Hashtbl.find h p
+ with Not_found -> 0
+ in
+ Hashtbl.replace h p (oldc + 1);
+ if d > !depth then depth := d;
+ incr numleaves
+ else
+ let label = tree_tag tree t in
+ if label == Tag.pcdata || label == Tag.attribute_data then incr numtexts;
+ iter_siblings tree t (label::p) (d+1)
+ and iter_siblings tree t p d =
+ if is_nil t then () else
+ let fs = tree_first_child tree t in
+ traverse tree fs p d;
+ let ns = tree_next_sibling tree t in
+ iter_siblings tree ns p d
+ in
+ traverse tree.doc root [] 0;
+ let sumdepth = Hashtbl.fold (fun p c acc -> (List.length p) * c + acc) h 0 in
+ let alltags = Ptset.Int.union tree.elements tree.attributes in
+ Logger.print err_formatter "Statistics :@\n\
+Average depth: %f@\n\
+Longest path: %i@\n\
+Number of distinct paths: %i@\n\
+Number of nodes: %i@\n\
+Number of leaves: %i@\n\
+Number of pcdata/cdata nodes: %i@\n\
+Number of distinct tags: %i@\n\
+Largest tag id: %i@\n@?"
+ (float_of_int sumdepth /. float_of_int !numleaves)
+ !depth
+ (Hashtbl.length h)
+ (tree_subtree_size tree.doc root)
+ !numleaves
+ !numtexts
+ (Ptset.Int.cardinal alltags)
+ (Ptset.Int.max_elt alltags)
+
+(*
+ Logger.print err_formatter "Average depth: %f, number of leaves %i@\n@?" ((float_of_int a)/. (float_of_int b)) b
+;;
+
+*)
+
+type tree_pointer = tree
+let get_tree_pointer x = x.doc