type 'a node = private int
type node_kind = [`Text | `Tree ]
-type t = {
- doc : tree;
+type t = {
+ doc : tree;
ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
}
-external inode : 'a node -> int = "%identity"
-external nodei : int -> 'a node = "%identity"
+external inode : 'a node -> int = "%identity"
+external nodei : int -> 'a node = "%identity"
let compare_node x y = (inode x) - (inode y)
let equal_node : 'a node -> 'a node -> bool = (==)
-
-external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
+
+external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
external tree_print_xml_fast3 : tree -> [`Tree ] node -> Unix.file_descr ->unit = "caml_xml_tree_print"
-external tree_save : tree -> Unix.file_descr -> unit = "caml_xml_tree_save"
-external tree_load : Unix.file_descr -> bool -> int -> tree = "caml_xml_tree_load"
-
+external tree_save : tree -> Unix.file_descr -> string -> unit = "caml_xml_tree_save"
+external tree_load : Unix.file_descr -> string -> bool -> int -> tree = "caml_xml_tree_load"
+
external nullt : unit -> 'a node = "caml_xml_tree_nullt"
let nil : [`Tree ] node = nodei ~-1
let nulldoc : [`Text ] node = nodei ~-1
let root : [`Tree ] node = nodei 0
-external text_get_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
-external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
+external text_get_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
+external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n
-external text_is_prefix : tree -> string -> bool = "caml_text_collection_is_prefix"
-external text_is_suffix : tree -> string -> bool = "caml_text_collection_is_suffix"
-external text_is_equal : tree -> string -> bool = "caml_text_collection_is_equal"
-external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
-external text_is_lessthan : tree -> string -> bool = "caml_text_collection_is_lessthan"
+external text_is_prefix : tree -> string -> bool = "caml_text_collection_is_prefix"
+external text_is_suffix : tree -> string -> bool = "caml_text_collection_is_suffix"
+external text_is_equal : tree -> string -> bool = "caml_text_collection_is_equal"
+external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
+external text_is_lessthan : tree -> string -> bool = "caml_text_collection_is_lessthan"
external text_count : tree -> string -> int = "caml_text_collection_count"
external text_count_prefix : tree -> string -> int = "caml_text_collection_count_prefix"
external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
external text_lessthan : tree -> string -> [`Text ] node array = "caml_text_collection_lessthan"
-
+
external tree_root : tree -> [`Tree] node = "caml_xml_tree_root" "noalloc"
external tree_size : tree -> int = "caml_xml_tree_size" "noalloc"
external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size" "noalloc"
external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc"
external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc"
external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc"
-external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
+external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
type unordered_set
external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
external tree_is_open : tree -> [`Tree] node -> bool = "caml_xml_tree_is_open" "noalloc"
-external benchmark_jump : tree -> Tag.t -> unit = "caml_benchmark_jump" "noalloc"
+external benchmark_jump : tree -> Tag.t -> int = "caml_benchmark_jump" "noalloc"
let benchmark_jump t s = benchmark_jump t.doc s
-external benchmark_fcns : tree -> unit = "caml_benchmark_fcns" "noalloc"
+external benchmark_fcns : tree -> int = "caml_benchmark_fcns" "noalloc"
+external benchmark_fene : tree -> int = "caml_benchmark_fene" "noalloc"
+external benchmark_iter : tree -> int = "caml_benchmark_iter" "noalloc"
let benchmark_fcns t = benchmark_fcns t.doc
+let benchmark_fene t = benchmark_fene t.doc
+
+let benchmark_iter t = benchmark_iter t.doc
+
external benchmark_lcps : tree -> unit = "caml_benchmark_lcps" "noalloc"
let benchmark_lcps t = benchmark_lcps t.doc
let vector_htbl = HPtset.create MED_H_SIZE
let ptset_to_vector s =
- try
+ try
HPtset.find vector_htbl s
with
Not_found ->
let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in
HPtset.add vector_htbl s v; v
-
+
let subtree_size t i = tree_subtree_size t.doc i
let subtree_elements t i = tree_subtree_elements t.doc i
let text_size t = text_size t.doc
-module MemUnion = Hashtbl.Make (struct
+module MemUnion = Hashtbl.Make (struct
type t = Ptset.Int.t*Ptset.Int.t
let equal (x,y) (z,t) = x == z && y == t
let equal a b = equal a b || equal b a
end)
module MemAdd = Hashtbl.Make (
- struct
+ struct
type t = Tag.t*Ptset.Int.t
let equal (x,y) (z,t) = (x == z)&&(y == t)
let hash (x,y) = HASHINT2(x,Uid.to_int (Ptset.Int.uid y))
module MemUpdate = struct
include Hashtbl.Make (
- struct
+ struct
type t = Tag.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t
let equal (a1,b1,c1,d1,e1) (a2,b2,c2,d2,e2) = a1==a2 &&
b1 == b2 && c1 == c2 && d1 == d2 && e1 == e2
- let hash (a,b,c,d,e) =
+ let hash (a,b,c,d,e) =
HASHINT4(HASHINT2(a,Uid.to_int (Ptset.Int.uid b)),
Uid.to_int (Ptset.Int.uid c),
Uid.to_int (Ptset.Int.uid d),
| Not_found -> let s = Ptset.Int.union s1 s2
in
MemUnion.add h_union (s1,s2) s;s
- in
+ in
let h_add = MemAdd.create BIG_H_SIZE in
- let pt_add t s =
+ let pt_add t s =
try MemAdd.find h_add (t,s)
with
| Not_found -> let r = Ptset.Int.add t s in
MemAdd.add h_add (t,s) r;r
- in
+ in
let h = Hashtbl.create BIG_H_SIZE in
- let update t sc sb ss sa =
- let schild,sbelow,ssibling,safter =
+ let update t sc sb ss sa =
+ let schild,sbelow,ssibling,safter =
try
- Hashtbl.find h t
+ Hashtbl.find h t
with
- | Not_found ->
+ | Not_found ->
(Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty)
in
- Hashtbl.replace h t
- (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
+ Hashtbl.replace h t
+ (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
in
- let rec loop right id acc_after =
+ let rec loop right id acc_after =
if id == nil
then Ptset.Int.empty,Ptset.Int.empty,acc_after else
let sibling2,desc2,after2 = loop true (tree_next_sibling tree id) acc_after in
let child1,desc1,after1 = loop false (tree_first_child tree id) after2 in
let tag = tree_tag tree id in
update tag child1 desc1 sibling2 after2;
- ( pt_add tag sibling2,
+ ( pt_add tag sibling2,
pt_add tag (pt_cup desc1 desc2),
if right then pt_cup after1 (pt_cup desc1 desc2) else acc_after )
in
- let _ = loop false (tree_root tree) Ptset.Int.empty in
+ let _ = loop false (tree_root tree) Ptset.Int.empty in
let _ = Printf.eprintf "Finished\n%!" in
h
let contains_array = ref [| |]
-let contains_index = Hashtbl.create 4096
+let contains_index = Hashtbl.create 4096
let in_array _ i =
try
Hashtbl.find contains_index i
with
Not_found -> false
-let init_textfun f t s =
- let a = match f with
- | `CONTAINS -> text_contains t.doc s
- | `STARTSWITH -> text_prefix t.doc s
- | `ENDSWITH -> text_suffix t.doc s
- | `EQUALS -> text_equals t.doc s
+let init_textfun f t s =
+ let a = match f with
+ | `CONTAINS -> text_contains t.doc s
+ | `STARTSWITH -> text_prefix t.doc s
+ | `ENDSWITH -> text_suffix t.doc s
+ | `EQUALS -> text_equals t.doc s
in
(*Array.fast_sort (compare) a; *)
contains_array := a;
Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
-
+
let count_contains t s = text_count_contains t.doc s
let init_naive_contains t s =
let i,j = tree_doc_ids t.doc (tree_root t.doc)
in
let regexp = Str.regexp_string s in
- let matching arg =
+ let matching arg =
try
let _ = Str.search_forward regexp arg 0;
in true
with _ -> false
in
- let rec loop n acc l =
+ let rec loop n acc l =
if n >= j then acc,l
else
let s = text_get_text t.doc n
in
- if matching s
- then loop (nodei ((inode n)+1)) (n::acc) (l+1)
+ if matching s
+ then loop (nodei ((inode n)+1)) (n::acc) (l+1)
else loop (nodei ((inode n)+1)) acc l
in
let acc,l = loop i [] 0 in
else loop (idx+1) x y
in
if a.(0) > j || a.(l-1) < i then nulldoc
- else loop !last_idx i j
-
-let text_below tree t =
+ else loop !last_idx i j
+
+let text_below tree t =
let l = Array.length !contains_array in
let i,j = tree_doc_ids tree.doc t in
let id = if l == 0 then i else (array_find !contains_array i j) in
tree_parent_node tree.doc id
-
+
let text_next tree t root =
let l = Array.length !contains_array in
- let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
- let _,j = tree_doc_ids tree.doc root in
+ let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
+ let _,j = tree_doc_ids tree.doc root in
let id = if l == 0 then if inf > j then nulldoc else inf
else array_find !contains_array inf j
- in
+ in
tree_parent_node tree.doc id
module DocIdSet = struct
include Set.Make (struct type t = [`Text] node
let compare = compare_node end)
-
+
end
let is_nil t = t == nil
let node_of_t t =
let _ = Tag.init (Obj.magic t) in
- let table = collect_tags t
+ let table = collect_tags t
in (*
let _ = Hashtbl.iter (fun t (c,d,ns,f) ->
Printf.eprintf "Tag %s has:\n" (Tag.to_string t);
Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) f;
Printf.eprintf "\n\n%!";) table
in
-
- *)
- { doc= t;
+
+ *)
+ { doc= t;
ttable = table;
}
let parse f str =
node_of_t
- (f str
- !Options.sample_factor
+ (f str
+ !Options.sample_factor
!Options.index_empty_texts
!Options.disable_text_collection)
-
+
let parse_xml_uri str = parse parse_xml_uri str
let parse_xml_string str = parse parse_xml_string str
let size t = tree_size t.doc;;
-
+
external pool : tree -> Tag.pool = "%identity"
let magic_string = "SXSI_INDEX"
let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
-let write fd s =
+let write fd s =
let sl = String.length s in
let ssl = Printf.sprintf "%020i" sl in
ignore (Unix.write fd ssl 0 20);
let buffer = String.create size in
let _ = really_read fd buffer 0 size in
buffer
-
+
let save t str =
let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
(* we need to move the fd to the correct position *)
flush out_c;
ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
- tree_save t.doc fd;
+ tree_save t.doc fd str;
close_out out_c
;;
-let load ?(sample=64) ?(load_text=true) str =
+let load ?(sample=64) ?(load_text=true) str =
let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
let in_c = Unix.in_channel_of_descr fd in
let _ = set_binary_mode_in in_c true in
- let load_table () =
+ let load_table () =
(let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
(let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t =
- Marshal.from_channel in_c
+ Marshal.from_channel in_c
in
let ntable = Hashtbl.create (Hashtbl.length table) in
- Hashtbl.iter (fun k (s1,s2,s3,s4) ->
+ Hashtbl.iter (fun k (s1,s2,s3,s4) ->
let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty
and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty
and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty
let _ = Printf.eprintf "\nLoading tag table : " in
let ntable = time (load_table) () in
ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
- let tree = { doc = tree_load fd load_text sample;
+ let tree = { doc = tree_load fd str load_text sample;
ttable = ntable;}
in close_in in_c;
tree
-
+
let tag_pool t = pool t.doc
-
+
let compare = compare_node
let equal a b = a == b
-
+
let nts = function
-1 -> "Nil"
| i -> Printf.sprintf "Node (%i)" i
-
+
let dump_node t = nts (inode t)
let is_left t n = tree_is_first_child t.doc n
-let is_below_right t n1 n2 =
- tree_is_ancestor t.doc (tree_parent t.doc n1) n2
+let is_below_right t n1 n2 =
+ tree_is_ancestor t.doc (tree_parent t.doc n1) n2
&& not (tree_is_ancestor t.doc n1 n2)
let is_binary_ancestor t n1 n2 =
n2 > n1 && n2 < fin
(* (is_below_right t n1 n2) ||
(tree_is_ancestor t.doc n1 n2) *)
-
+
let parent t n = tree_parent t.doc n
let first_child t = let doc = t.doc in ();fun n -> tree_first_child doc n
let first_element t = let doc = t.doc in (); fun n -> tree_first_element doc n
-
+let first_element t n = tree_first_element t.doc n
(* these function will be called in two times: first partial application
on the tag, then application of the tag and the tree, then application of
the other arguments. We use the trick to let the compiler optimize application
let next_sibling t = let doc = t.doc in (); fun n -> tree_next_sibling doc n
let next_element t = let doc = t.doc in (); fun n -> tree_next_element doc n
+let next_element t n = tree_next_element t.doc n
let tagged_following_sibling t tag = (); fun n -> tree_tagged_following_sibling t.doc n tag
let tagged_following_sibling_below t tag = (); fun n _ -> tree_tagged_following_sibling t.doc n tag
-let select_following_sibling_below t = fun ts ->
+let select_following_sibling_below t = fun ts ->
let v = (ptset_to_vector ts) in ();
fun n _ -> tree_select_following_sibling t.doc n v
let id t n = tree_node_xml_id t.doc n
-
+
let tag t n = if n == nil then Tag.nullt else tree_tag t.doc n
-let tagged_descendant t tag =
- let doc = t.doc in (); fun n -> tree_tagged_descendant doc n tag
+let tagged_descendant t tag =
+ let doc = t.doc in (); fun n -> tree_tagged_descendant doc n tag
-let select_descendant t = fun ts ->
+let select_descendant t = fun ts ->
let v = (ptset_to_vector ts) in ();
fun n -> tree_select_descendant t.doc n v
else loop (idx+1) x y
in
if a.(0) > j || a.(l-1) < i then nil
- else loop !last_idx i j
+ else loop !last_idx i j
let stack = ref []
let init_stack () = stack := []
let push x = stack:= x::!stack
- let peek () = match !stack with
+ let peek () = match !stack with
p::_ -> p
| _ -> failwith "peek"
let pop () = match !stack with
p::r -> stack:=r; p
| _ -> failwith "pop"
- let next t = nodei ( (inode t) + 1 )
- let next2 t = nodei ( (inode t) + 2 )
- let next3 t = nodei ( (inode t) + 3 )
-
+ let next t = nodei ( (inode t) + 1 )
+ let next2 t = nodei ( (inode t) + 2 )
+ let next3 t = nodei ( (inode t) + 3 )
+
let print_xml_fast2 =
let _ = init_stack () in
- let h = Hashtbl.create MED_H_SIZE in
+ let h = Hashtbl.create MED_H_SIZE in
let tag_str t = try Hashtbl.find h t with
Not_found -> let s = Tag.to_string t in
Hashtbl.add h t s;s
in
- let h_att = Hashtbl.create MED_H_SIZE in
+ let h_att = Hashtbl.create MED_H_SIZE in
let att_str t = try Hashtbl.find h_att t with
Not_found -> let s = Tag.to_string t in
let attname = String.sub s 3 ((String.length s) -3) in
if t <= fin then
if tree_is_open tree t then
(* opening tag *)
- if tag == Tag.pcdata then
+ if tag == Tag.pcdata then
begin
output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
loop (next2 t) (* skip closing $ *)
end
else
let tagstr = tag_str tag in
- let _ = output_char outc '<';
+ let _ = output_char outc '<';
output_string outc tagstr in
let t' = next t in
if tree_is_open tree t' then
let _ = push tagstr in
let tag' = tree_tag tree t' in
- if tag' == Tag.attribute then let t'' = loop_attr (next t') 0 in
+ if tag' == Tag.attribute then let t'' = loop_attr (next t') 0 in
output_string outc ">"; loop t'' else (output_string outc ">";loop_tag t' tag')
else (* closing with no content *)
let _ = output_string outc "/>" in
loop (next t);
end
and loop t = loop_tag t (tree_tag tree t)
- and loop_attr t n =
- if tree_is_open tree t then
+ and loop_attr t n =
+ if tree_is_open tree t then
let attname = att_str (tree_tag tree t) in
output_char outc ' ';
output_string outc attname;
in loop t
let print_xml_fast =
- let h = Hashtbl.create MED_H_SIZE in
+ let h = Hashtbl.create MED_H_SIZE in
let tag_str t = try Hashtbl.find h t with
Not_found -> let s = Tag.to_string t in
Hashtbl.add h t s;s
in
- let h_att = Hashtbl.create MED_H_SIZE in
+ let h_att = Hashtbl.create MED_H_SIZE in
let att_str t = try Hashtbl.find h_att t with
Not_found -> let s = Tag.to_string t in
let attname = String.sub s 3 ((String.length s) -3) in
Hashtbl.add h_att t attname;attname
in fun outc tree t ->
- let rec loop ?(print_right=true) t =
- if t != nil
- then
+ let rec loop ?(print_right=true) t =
+ if t != nil
+ then
let tagid = tree_tag tree.doc t in
if tagid==Tag.pcdata
- then
+ then
begin
let tid = tree_my_text_unsafe tree.doc t in
output_string outc (text_get_text tree.doc tid);
end
else
let tagstr = tag_str tagid in
- let l = first_child tree t
- and r = next_sibling tree t
+ let l = first_child tree t
+ and r = next_sibling tree t
in
output_char outc '<';
output_string outc tagstr;
if l == nil then output_string outc "/>"
- else
+ else
if (tag tree l) == Tag.attribute then
begin
loop_attributes (first_child tree l);
if (next_sibling tree l) == nil then output_string outc "/>"
- else
- begin
- output_char outc '>';
+ else
+ begin
+ output_char outc '>';
loop (next_sibling tree l);
output_string outc "</";
output_string outc tagstr;
end
else
begin
- output_char outc '>';
+ output_char outc '>';
loop l;
output_string outc "</";
output_string outc tagstr;
output_char outc '>';
end;
if print_right then loop r
- and loop_attributes a =
+ and loop_attributes a =
if a != nil
then
let attname = att_str (tag tree a) in
loop_attributes (next_sibling tree a)
in
loop ~print_right:false t
-
-
- let print_xml_fast outc tree t =
+
+
+ let print_xml_fast outc tree t =
if (tag tree t) = Tag.document_node then
print_xml_fast outc tree (first_child tree t)
- else print_xml_fast outc tree t
-
-let tags_children t tag =
+ else print_xml_fast outc tree t
+
+let tags_children t tag =
let a,_,_,_ = Hashtbl.find t.ttable tag in a
-let tags_below t tag =
+let tags_below t tag =
let _,a,_,_ = Hashtbl.find t.ttable tag in a
-let tags_siblings t tag =
+let tags_siblings t tag =
let _,_,a,_ = Hashtbl.find t.ttable tag in a
-let tags_after t tag =
+let tags_after t tag =
let _,_,_,a = Hashtbl.find t.ttable tag in a
let tags t tag = Hashtbl.find t.ttable tag
-let rec binary_parent t n =
- let r =
+let rec binary_parent t n =
+ let r =
if tree_is_first_child t.doc n
then tree_parent t.doc n
else tree_prev_sibling t.doc n
let get_text t n =
let tid = tree_my_text t.doc n in
- if tid == nulldoc then "" else
+ if tid == nulldoc then "" else
text_get_text t.doc tid
-let dump_tree fmt tree =
+let dump_tree fmt tree =
let rec loop t n =
if t != nil then
let tag = (tree_tag tree.doc t ) in
let tagstr = Tag.to_string tag in
let tab = String.make n ' ' in
- if tag == Tag.pcdata || tag == Tag.attribute_data
- then
- Format.fprintf fmt "%s<%s>%s</%s>\n"
+ if tag == Tag.pcdata || tag == Tag.attribute_data
+ then
+ Format.fprintf fmt "%s<%s>%s</%s>\n"
tab tagstr (text_get_text tree.doc (tree_my_text tree.doc t)) tagstr
else begin
Format.fprintf fmt "%s<%s>\n" tab tagstr;
loop root 0
;;
-
+
let print_xml_fast3 t = tree_print_xml_fast3 t.doc
-let stats t =
+let stats t =
let tree = t.doc in
- let rec loop left node acc_d total_d num_leaves =
+ let rec loop left node acc_d total_d num_leaves =
if node == nil then
(acc_d+total_d,if left then num_leaves+1 else num_leaves)
else
let test_prefix t s = Array.length (text_prefix t.doc s)
let test_suffix t s = Array.length (text_suffix t.doc s)
-let test_contains t s = Array.length (text_contains t.doc s)
+let test_contains t s = Array.length (text_contains t.doc s)
let test_equals t s = Array.length (text_equals t.doc s)