(* Distributed under the terms of the LGPL (see LICENCE) *)
(******************************************************************************)
INCLUDE "debug.ml"
-INCLUDE "utils.ml"
+ INCLUDE "utils.ml"
external init_lib : unit -> unit = "sxsi_cpp_init"
let event_counter = ref 0 in
(fun parser_ ->
incr event_counter;
- if !event_counter land 0xffffff == 0 then
- Logger.print Format.err_formatter "Current position: %i@\n@?" (Expat.get_current_byte_index parser_))
+ if !event_counter land 0xffffff == 0 then
+ Logger.print Format.err_formatter "Current position: %i@\n@?" (Expat.get_current_byte_index parser_))
let do_text b t =
do_text b t;
open_tag b tag;
match attr_list with
- [] -> ()
- | l ->
- open_tag b "<@>";
- List.iter (fun (name, value) -> output_attr b name value) l;
- close_tag b "<@>"
+ [] -> ()
+ | l ->
+ open_tag b "<@>";
+ List.iter (fun (name, value) -> output_attr b name value) l;
+ close_tag b "<@>"
let end_element_handler parser_ b t tag =
let read = input in_chan buffer 0 4096 in
if read == 0 then raise End_of_file else
Expat.parse_sub parser_ buffer 0 read;
- done
+ done
with
- | End_of_file -> close_in in_chan
- | e -> raise e
+ | End_of_file -> close_in in_chan
+ | e -> raise e
in
- finalizer ()
+ finalizer ()
end
let tag_operations t = mk_tag_ops t.doc
(*
-external parse_xml_uri : string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_uri"
-external parse_xml_string : string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_string"
+ external parse_xml_uri : string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_uri"
+ external parse_xml_string : string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_string"
*)
external tree_print_xml_fast3 : tree -> [`Tree ] Node.t -> Unix.file_descr -> unit = "caml_xml_tree_print"
let print_xml t n fd =
module HPtset = Hashtbl.Make(Ptset.Int)
let vector_htbl = HPtset.create MED_H_SIZE
+let reinit () = HPtset.clear vector_htbl
let tag_list_of_set s =
try
let subtree_elements t node =
let size = tree_subtree_size t.doc node - 1 in
- if size == 0 then 0
- else let size = size - (tree_subtree_tags t.doc node Tag.pcdata) in
- if size < 2 then size else
- let acc = ref size in
- for i = 0 to Array.length t.attribute_array - 1 do
- acc := !acc - tree_subtree_tags t.doc node t.attribute_array.(i)
- done;
- !acc
+ if size == 0 then 0
+ else let size = size - (tree_subtree_tags t.doc node Tag.pcdata) in
+ if size < 2 then size else
+ let acc = ref size in
+ for i = 0 to Array.length t.attribute_array - 1 do
+ acc := !acc - tree_subtree_tags t.doc node t.attribute_array.(i)
+ done;
+ !acc
external tree_closing : tree -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_closing" "noalloc"
let closing t n = tree_closing t.doc n
let tree = t.doc in
let rec loop left node acc_d total_d num_leaves =
if node == nil then
- (acc_d+total_d,if left then num_leaves+1 else num_leaves)
+ (acc_d+total_d,if left then num_leaves+1 else num_leaves)
else
- let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in
- loop false (tree_next_sibling tree node) (acc_d) d td
+ let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in
+ loop false (tree_next_sibling tree node) (acc_d) d td
in
let a,b = loop true root 0 0 0
in
;;
module TagS =
- struct
- include Ptset.Make (
- struct type t = int
- type data = t
- external hash : t -> int = "%identity"
- external uid : t -> Uid.t = "%identity"
- external equal : t -> t -> bool = "%eq"
- external make : t -> int = "%identity"
- external node : t -> int = "%identity"
- external stats : unit -> unit = "%identity"
- end
- )
- let to_ptset s = fold (Ptset.Int.add) s Ptset.Int.empty
- end
+struct
+ include Ptset.Make (
+ struct type t = int
+ type data = t
+ external hash : t -> int = "%identity"
+ external uid : t -> Uid.t = "%identity"
+ external equal : t -> t -> bool = "%eq"
+ external make : t -> int = "%identity"
+ external node : t -> int = "%identity"
+ external stats : unit -> unit = "%identity"
+ external init : unit -> unit = "%identity"
+ end
+ )
+ let to_ptset s = fold (Ptset.Int.add) s Ptset.Int.empty
+end
module TSTSCache =
Hashtbl.Make(struct type t = TagS.t * TagS.t
let equal u v =
let u1,u2 = u
and v1,v2 = v in
- u1 == v1 && u2 == v2
- end)
+ u1 == v1 && u2 == v2
+ end)
module TagTSCache =
Hashtbl.Make(struct type t = Tag.t * TagS.t
let hash (x, y) =
let equal u v =
let u1,u2 = u
and v1,v2 = v in
- u1 == v1 && u2 == v2
- end)
+ u1 == v1 && u2 == v2
+ end)
let add_cache = TagTSCache.create 1023
let union_cache = TSTSCache.create 1023
if y == TagS.empty then false
else
let key = (x, y) in
- try
- TSTSCache.find subset_cache key
- with
- | Not_found ->
- let z = TagS.subset x y in
- TSTSCache.add subset_cache key z; z
+ try
+ TSTSCache.find subset_cache key
+ with
+ | Not_found ->
+ let z = TagS.subset x y in
+ TSTSCache.add subset_cache key z; z
let order ((x, y) as z) =
if x.TagS.Node.id <= y.TagS.Node.id then z
if _subset x y then y
else if _subset y x then x
else
- let key = order (x, y) in
- try
- TSTSCache.find union_cache key
+ let key = order (x, y) in
+ try
+ TSTSCache.find union_cache key
with
- | Not_found ->
- let z = TagS.union x y in
- TSTSCache.add union_cache key z; z
+ | Not_found ->
+ let z = TagS.union x y in
+ TSTSCache.add union_cache key z; z
let _add t s =
let key = (t,s) in
- try
- TagTSCache.find add_cache key
- with
- | Not_found ->
- let z = TagS.add t s in
- TagTSCache.add add_cache key z;z
+ try
+ TagTSCache.find add_cache key
+ with
+ | Not_found ->
+ let z = TagS.add t s in
+ TagTSCache.add add_cache key z;z
let child_sibling_labels tree =
let table_c = Array.create (tree_num_tags tree) TagS.empty in
let siblings = loop (tree_next_sibling tree node) in
let () =
let tn = table_n.(tag) in
- if _subset siblings tn then ()
- else table_n.(tag) <- _union tn siblings
+ if _subset siblings tn then ()
+ else table_n.(tag) <- _union tn siblings
in
- _add tag siblings
+ _add tag siblings
in
- ignore (loop root);
- table_c, table_n
+ ignore (loop root);
+ table_c, table_n
let descendant_labels tree =
let table_d = Array.create (tree_num_tags tree) TagS.empty in
let tag = tree_tag tree node in
let () =
let td = table_d.(tag) in
- if _subset d1 td then ()
- else table_d.(tag) <- _union td d1;
+ if _subset d1 td then ()
+ else table_d.(tag) <- _union td d1;
in
- _add tag (_union d1 d2)
+ _add tag (_union d1 d2)
in
- ignore (loop root);
- table_d
+ ignore (loop root);
+ table_d
let collect_labels tree =
let table_f = Array.create (tree_num_tags tree) TagS.empty in
let tag = tree_tag tree node in
let () =
let tf = table_f.(tag) in
- if _subset followings tf then ()
- else table_f.(tag) <- _union tf followings in
+ if _subset followings tf then ()
+ else table_f.(tag) <- _union tf followings in
let () =
let tn = table_n.(tag) in
- if _subset foll_siblings tn then ()
- else table_n.(tag) <- _union tn foll_siblings in
+ if _subset foll_siblings tn then ()
+ else table_n.(tag) <- _union tn foll_siblings in
let children, n_descendants, n_followings =
loop (tree_last_child tree node) TagS.empty TagS.empty followings
in
let () =
let tc = table_c.(tag) in
- if _subset children tc then ()
- else table_c.(tag) <- _union tc children
+ if _subset children tc then ()
+ else table_c.(tag) <- _union tc children
in
let () =
let td = table_d.(tag) in
- if _subset n_descendants td then ()
- else table_d.(tag) <- _union td n_descendants
+ if _subset n_descendants td then ()
+ else table_d.(tag) <- _union td n_descendants
in
- loop (tree_prev_sibling tree node)
- (_add tag foll_siblings)
- (_add tag (_union n_descendants descendants))
- (_add tag n_followings)
+ loop (tree_prev_sibling tree node)
+ (_add tag foll_siblings)
+ (_add tag (_union n_descendants descendants))
+ (_add tag n_followings)
in
- ignore (loop root TagS.empty TagS.empty TagS.empty);
- table_f, table_n, table_c, table_d
+ ignore (loop root TagS.empty TagS.empty TagS.empty);
+ table_f, table_n, table_c, table_d
let is_nil t = t == nil
(Ptset.Int.remove Tag.pcdata
(Ptset.Int.diff d.(Tag.document_node) attributes))
in
- { doc= t;
- attributes = attributes;
- attribute_array = Array.of_list (Ptset.Int.elements attributes);
- elements = elements;
- children = c;
- siblings = n;
- descendants = d;
- followings = f
+ { doc= t;
+ attributes = attributes;
+ attribute_array = Array.of_list (Ptset.Int.elements attributes);
+ elements = elements;
+ children = c;
+ siblings = n;
+ descendants = d;
+ followings = f
- }
+ }
let parse_xml_uri str = node_of_t (TreeBuilder.parse_file str)
let write fd s =
let sl = String.length s in
let ssl = Printf.sprintf "%020i" sl in
- ignore (Unix.write fd ssl 0 20);
- ignore (Unix.write fd s 0 (String.length s))
+ ignore (Unix.write fd ssl 0 20);
+ ignore (Unix.write fd s 0 (String.length s))
let rec really_read fd buffer start length =
if length <= 0 then () else
match Unix.read fd buffer start length with
- 0 -> raise End_of_file
- | r -> really_read fd buffer (start + r) (length - r);;
+ 0 -> raise End_of_file
+ | r -> really_read fd buffer (start + r) (length - r);;
let read fd =
let buffer = String.create 20 in
let size = int_of_string buffer in
let buffer = String.create size in
let _ = really_read fd buffer 0 size in
- buffer
+ buffer
let save_tag_table channel t =
let t = Array.map (fun s -> Array.of_list (Ptset.Int.elements s)) t in
- Marshal.to_channel channel t []
+ Marshal.to_channel channel t []
let save t str =
let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
let out_c = Unix.out_channel_of_descr fd in
let index_prefix = Filename.chop_suffix str ".srx" in
let _ = set_binary_mode_out out_c true in
- output_string out_c magic_string;
- output_char out_c '\n';
- output_string out_c version_string;
- output_char out_c '\n';
- save_tag_table out_c t.children;
- save_tag_table out_c t.siblings;
- save_tag_table out_c t.descendants;
- save_tag_table out_c t.followings;
+ output_string out_c magic_string;
+ output_char out_c '\n';
+ output_string out_c version_string;
+ output_char out_c '\n';
+ save_tag_table out_c t.children;
+ save_tag_table out_c t.siblings;
+ save_tag_table out_c t.descendants;
+ save_tag_table out_c t.followings;
(* we need to move the fd to the correct position *)
- flush out_c;
- ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
- tree_save t.doc fd index_prefix;
- close_out out_c
+ flush out_c;
+ ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
+ tree_save t.doc fd index_prefix;
+ close_out out_c
;;
let load_tag_table channel =
let table : int array array = Marshal.from_channel channel in
- Array.map (fun a -> Ptset.Int.from_list (Array.to_list a)) table
+ Array.map (fun a -> Ptset.Int.from_list (Array.to_list a)) table
let load ?(sample=64) ?(load_text=true) str =
let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
let s = load_tag_table in_c in
let d = load_tag_table in_c in
let f = load_tag_table in_c in
- c,s,d,f
+ c,s,d,f
in
let c, s, d, f = time ~msg:"Loading tag table"(load_table) () in
ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
- let xml_tree = tree_load fd index_prefix load_text sample in
- let () = Tag.init (Obj.magic xml_tree) in
- let attributes = Ptset.Int.add Tag.attribute d.(Tag.attribute) in
- let elements = Ptset.Int.add Tag.document_node
- (Ptset.Int.remove Tag.pcdata
- (Ptset.Int.diff d.(Tag.document_node) attributes))
- in
- let tree = { doc = xml_tree;
- attributes = attributes;
- attribute_array = Array.of_list (Ptset.Int.elements attributes);
- elements = elements;
- children = c;
- siblings = s;
- descendants = d;
- followings = f
+ let xml_tree = tree_load fd index_prefix load_text sample in
+ let () = Tag.init (Obj.magic xml_tree) in
+ let attributes = Ptset.Int.add Tag.attribute d.(Tag.attribute) in
+ let elements = Ptset.Int.add Tag.document_node
+ (Ptset.Int.remove Tag.pcdata
+ (Ptset.Int.diff d.(Tag.document_node) attributes))
+ in
+ let tree = { doc = xml_tree;
+ attributes = attributes;
+ attribute_array = Array.of_list (Ptset.Int.elements attributes);
+ elements = elements;
+ children = c;
+ siblings = s;
+ descendants = d;
+ followings = f
}
in close_in in_c;
tree
let equal a b = a == b
let nts = function
- -1 -> "Nil"
+-1 -> "Nil"
| i -> Printf.sprintf "Node (%i)" i
let dump_node t = nts (Node.to_int t)
type t = (_t -> node -> bool) ref
let hash t = Hashtbl.hash t
let equal t1 t2 = t1 == t2
-end)
+ end)
let string_of_query query =
- match query with
- | `Prefix -> "starts-with"
- | `Suffix -> "ends-with"
- | `Equals -> "equals"
- | `Contains -> "contains"
+ match query with
+ | `Prefix -> "starts-with"
+ | `Suffix -> "ends-with"
+ | `Equals -> "equals"
+ | `Contains -> "contains"
;;
let query_fun = function
- | `Prefix -> text_prefix
- | `Suffix -> text_suffix
- | `Equals -> text_equals
- | `Contains -> text_contains
+ | `Prefix -> text_prefix
+ | `Suffix -> text_suffix
+ | `Equals -> text_equals
+ | `Contains -> text_contains
;;
let _pred_cache = Hashtbl.create 17
memo := begin fun tree node ->
let results =
try Hashtbl.find _pred_cache (query,s) with
- Not_found ->
- time ~count:1 ~msg:(Printf.sprintf "Computing text query %s(%s)"
- (string_of_query query) s)
- (f tree) s true
+ Not_found ->
+ time ~count:1 ~msg:(Printf.sprintf "Computing text query %s(%s)"
+ (string_of_query query) s)
+ (f tree) s true
in
let bv = results.bv in
memo := begin fun _ n ->