type 'a node = private int
type node_kind = [`Text | `Tree ]
-type t = {
- doc : tree;
+type t = {
+ doc : tree;
ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
}
-external inode : 'a node -> int = "%identity"
-external nodei : int -> 'a node = "%identity"
+external inode : 'a node -> int = "%identity"
+external nodei : int -> 'a node = "%identity"
let compare_node x y = (inode x) - (inode y)
let equal_node : 'a node -> 'a node -> bool = (==)
-
-external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
+
+external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
external tree_print_xml_fast3 : tree -> [`Tree ] node -> Unix.file_descr ->unit = "caml_xml_tree_print"
external tree_save : tree -> Unix.file_descr -> string -> unit = "caml_xml_tree_save"
external tree_load : Unix.file_descr -> string -> bool -> int -> tree = "caml_xml_tree_load"
-
+
external nullt : unit -> 'a node = "caml_xml_tree_nullt"
let nil : [`Tree ] node = nodei ~-1
let nulldoc : [`Text ] node = nodei ~-1
let root : [`Tree ] node = nodei 0
-external text_get_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
-external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
+external text_get_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
+external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n
-external text_is_prefix : tree -> string -> bool = "caml_text_collection_is_prefix"
-external text_is_suffix : tree -> string -> bool = "caml_text_collection_is_suffix"
-external text_is_equal : tree -> string -> bool = "caml_text_collection_is_equal"
-external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
-external text_is_lessthan : tree -> string -> bool = "caml_text_collection_is_lessthan"
+external text_is_prefix : tree -> string -> bool = "caml_text_collection_is_prefix"
+external text_is_suffix : tree -> string -> bool = "caml_text_collection_is_suffix"
+external text_is_equal : tree -> string -> bool = "caml_text_collection_is_equal"
+external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
+external text_is_lessthan : tree -> string -> bool = "caml_text_collection_is_lessthan"
external text_count : tree -> string -> int = "caml_text_collection_count"
external text_count_prefix : tree -> string -> int = "caml_text_collection_count_prefix"
external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
external text_lessthan : tree -> string -> [`Text ] node array = "caml_text_collection_lessthan"
-
+
external tree_root : tree -> [`Tree] node = "caml_xml_tree_root" "noalloc"
external tree_size : tree -> int = "caml_xml_tree_size" "noalloc"
external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size" "noalloc"
external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc"
external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc"
external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc"
-external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
+external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
type unordered_set
external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
let vector_htbl = HPtset.create MED_H_SIZE
let ptset_to_vector s =
- try
+ try
HPtset.find vector_htbl s
with
Not_found ->
let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in
HPtset.add vector_htbl s v; v
-
+
let subtree_size t i = tree_subtree_size t.doc i
let subtree_elements t i = tree_subtree_elements t.doc i
let text_size t = text_size t.doc
-module MemUnion = Hashtbl.Make (struct
+module MemUnion = Hashtbl.Make (struct
type t = Ptset.Int.t*Ptset.Int.t
let equal (x,y) (z,t) = x == z && y == t
let equal a b = equal a b || equal b a
end)
module MemAdd = Hashtbl.Make (
- struct
+ struct
type t = Tag.t*Ptset.Int.t
let equal (x,y) (z,t) = (x == z)&&(y == t)
let hash (x,y) = HASHINT2(x,Uid.to_int (Ptset.Int.uid y))
module MemUpdate = struct
include Hashtbl.Make (
- struct
+ struct
type t = Tag.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t
let equal (a1,b1,c1,d1,e1) (a2,b2,c2,d2,e2) = a1==a2 &&
b1 == b2 && c1 == c2 && d1 == d2 && e1 == e2
- let hash (a,b,c,d,e) =
+ let hash (a,b,c,d,e) =
HASHINT4(HASHINT2(a,Uid.to_int (Ptset.Int.uid b)),
Uid.to_int (Ptset.Int.uid c),
Uid.to_int (Ptset.Int.uid d),
| Not_found -> let s = Ptset.Int.union s1 s2
in
MemUnion.add h_union (s1,s2) s;s
- in
+ in
let h_add = MemAdd.create BIG_H_SIZE in
- let pt_add t s =
+ let pt_add t s =
try MemAdd.find h_add (t,s)
with
| Not_found -> let r = Ptset.Int.add t s in
MemAdd.add h_add (t,s) r;r
- in
+ in
let h = Hashtbl.create BIG_H_SIZE in
- let update t sc sb ss sa =
- let schild,sbelow,ssibling,safter =
+ let update t sc sb ss sa =
+ let schild,sbelow,ssibling,safter =
try
- Hashtbl.find h t
+ Hashtbl.find h t
with
- | Not_found ->
+ | Not_found ->
(Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty)
in
- Hashtbl.replace h t
- (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
+ Hashtbl.replace h t
+ (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
in
- let rec loop right id acc_after =
+ let rec loop right id acc_after =
if id == nil
then Ptset.Int.empty,Ptset.Int.empty,acc_after else
let sibling2,desc2,after2 = loop true (tree_next_sibling tree id) acc_after in
let child1,desc1,after1 = loop false (tree_first_child tree id) after2 in
let tag = tree_tag tree id in
update tag child1 desc1 sibling2 after2;
- ( pt_add tag sibling2,
+ ( pt_add tag sibling2,
pt_add tag (pt_cup desc1 desc2),
if right then pt_cup after1 (pt_cup desc1 desc2) else acc_after )
in
- let _ = loop false (tree_root tree) Ptset.Int.empty in
+ let _ = loop false (tree_root tree) Ptset.Int.empty in
let _ = Printf.eprintf "Finished\n%!" in
h
let contains_array = ref [| |]
-let contains_index = Hashtbl.create 4096
+let contains_index = Hashtbl.create 4096
let in_array _ i =
try
Hashtbl.find contains_index i
with
Not_found -> false
-let init_textfun f t s =
- let a = match f with
- | `CONTAINS -> text_contains t.doc s
- | `STARTSWITH -> text_prefix t.doc s
- | `ENDSWITH -> text_suffix t.doc s
- | `EQUALS -> text_equals t.doc s
+let init_textfun f t s =
+ let a = match f with
+ | `CONTAINS -> text_contains t.doc s
+ | `STARTSWITH -> text_prefix t.doc s
+ | `ENDSWITH -> text_suffix t.doc s
+ | `EQUALS -> text_equals t.doc s
in
(*Array.fast_sort (compare) a; *)
contains_array := a;
Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
-
+
let count_contains t s = text_count_contains t.doc s
let init_naive_contains t s =
let i,j = tree_doc_ids t.doc (tree_root t.doc)
in
let regexp = Str.regexp_string s in
- let matching arg =
+ let matching arg =
try
let _ = Str.search_forward regexp arg 0;
in true
with _ -> false
in
- let rec loop n acc l =
+ let rec loop n acc l =
if n >= j then acc,l
else
let s = text_get_text t.doc n
in
- if matching s
- then loop (nodei ((inode n)+1)) (n::acc) (l+1)
+ if matching s
+ then loop (nodei ((inode n)+1)) (n::acc) (l+1)
else loop (nodei ((inode n)+1)) acc l
in
let acc,l = loop i [] 0 in
else loop (idx+1) x y
in
if a.(0) > j || a.(l-1) < i then nulldoc
- else loop !last_idx i j
-
-let text_below tree t =
+ else loop !last_idx i j
+
+let text_below tree t =
let l = Array.length !contains_array in
let i,j = tree_doc_ids tree.doc t in
let id = if l == 0 then i else (array_find !contains_array i j) in
tree_parent_node tree.doc id
-
+
let text_next tree t root =
let l = Array.length !contains_array in
- let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
- let _,j = tree_doc_ids tree.doc root in
+ let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
+ let _,j = tree_doc_ids tree.doc root in
let id = if l == 0 then if inf > j then nulldoc else inf
else array_find !contains_array inf j
- in
+ in
tree_parent_node tree.doc id
module DocIdSet = struct
include Set.Make (struct type t = [`Text] node
let compare = compare_node end)
-
+
end
let is_nil t = t == nil
let node_of_t t =
let _ = Tag.init (Obj.magic t) in
- let table = collect_tags t
+ let table = collect_tags t
in (*
let _ = Hashtbl.iter (fun t (c,d,ns,f) ->
Printf.eprintf "Tag %s has:\n" (Tag.to_string t);
Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) f;
Printf.eprintf "\n\n%!";) table
in
-
- *)
- { doc= t;
+
+ *)
+ { doc= t;
ttable = table;
}
let parse f str =
node_of_t
- (f str
- !Options.sample_factor
+ (f str
+ !Options.sample_factor
!Options.index_empty_texts
!Options.disable_text_collection)
-
+
let parse_xml_uri str = parse parse_xml_uri str
let parse_xml_string str = parse parse_xml_string str
let size t = tree_size t.doc;;
-
+
external pool : tree -> Tag.pool = "%identity"
let magic_string = "SXSI_INDEX"
let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
-let write fd s =
+let write fd s =
let sl = String.length s in
let ssl = Printf.sprintf "%020i" sl in
ignore (Unix.write fd ssl 0 20);
let buffer = String.create size in
let _ = really_read fd buffer 0 size in
buffer
-
+
let save t str =
let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
close_out out_c
;;
-let load ?(sample=64) ?(load_text=true) str =
+let load ?(sample=64) ?(load_text=true) str =
let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
let in_c = Unix.in_channel_of_descr fd in
let _ = set_binary_mode_in in_c true in
- let load_table () =
+ let load_table () =
(let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
(let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t =
- Marshal.from_channel in_c
+ Marshal.from_channel in_c
in
let ntable = Hashtbl.create (Hashtbl.length table) in
- Hashtbl.iter (fun k (s1,s2,s3,s4) ->
+ Hashtbl.iter (fun k (s1,s2,s3,s4) ->
let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty
and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty
and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty
ttable = ntable;}
in close_in in_c;
tree
-
+
let tag_pool t = pool t.doc
-
+
let compare = compare_node
let equal a b = a == b
-
+
let nts = function
-1 -> "Nil"
| i -> Printf.sprintf "Node (%i)" i
-
+
let dump_node t = nts (inode t)
let is_left t n = tree_is_first_child t.doc n
-let is_below_right t n1 n2 =
- tree_is_ancestor t.doc (tree_parent t.doc n1) n2
+let is_below_right t n1 n2 =
+ tree_is_ancestor t.doc (tree_parent t.doc n1) n2
&& not (tree_is_ancestor t.doc n1 n2)
let is_binary_ancestor t n1 n2 =
n2 > n1 && n2 < fin
(* (is_below_right t n1 n2) ||
(tree_is_ancestor t.doc n1 n2) *)
-
+
let parent t n = tree_parent t.doc n
let first_child t = let doc = t.doc in ();fun n -> tree_first_child doc n
let tagged_following_sibling_below t tag = (); fun n _ -> tree_tagged_following_sibling t.doc n tag
-let select_following_sibling_below t = fun ts ->
+let select_following_sibling_below t = fun ts ->
let v = (ptset_to_vector ts) in ();
fun n _ -> tree_select_following_sibling t.doc n v
let id t n = tree_node_xml_id t.doc n
-
+
let tag t n = if n == nil then Tag.nullt else tree_tag t.doc n
-let tagged_descendant t tag =
- let doc = t.doc in (); fun n -> tree_tagged_descendant doc n tag
+let tagged_descendant t tag =
+ let doc = t.doc in (); fun n -> tree_tagged_descendant doc n tag
-let select_descendant t = fun ts ->
+let select_descendant t = fun ts ->
let v = (ptset_to_vector ts) in ();
fun n -> tree_select_descendant t.doc n v
else loop (idx+1) x y
in
if a.(0) > j || a.(l-1) < i then nil
- else loop !last_idx i j
+ else loop !last_idx i j
let stack = ref []
let init_stack () = stack := []
let push x = stack:= x::!stack
- let peek () = match !stack with
+ let peek () = match !stack with
p::_ -> p
| _ -> failwith "peek"
let pop () = match !stack with
p::r -> stack:=r; p
| _ -> failwith "pop"
- let next t = nodei ( (inode t) + 1 )
- let next2 t = nodei ( (inode t) + 2 )
- let next3 t = nodei ( (inode t) + 3 )
-
+ let next t = nodei ( (inode t) + 1 )
+ let next2 t = nodei ( (inode t) + 2 )
+ let next3 t = nodei ( (inode t) + 3 )
+
let print_xml_fast2 =
let _ = init_stack () in
- let h = Hashtbl.create MED_H_SIZE in
+ let h = Hashtbl.create MED_H_SIZE in
let tag_str t = try Hashtbl.find h t with
Not_found -> let s = Tag.to_string t in
Hashtbl.add h t s;s
in
- let h_att = Hashtbl.create MED_H_SIZE in
+ let h_att = Hashtbl.create MED_H_SIZE in
let att_str t = try Hashtbl.find h_att t with
Not_found -> let s = Tag.to_string t in
let attname = String.sub s 3 ((String.length s) -3) in
if t <= fin then
if tree_is_open tree t then
(* opening tag *)
- if tag == Tag.pcdata then
+ if tag == Tag.pcdata then
begin
output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
loop (next2 t) (* skip closing $ *)
end
else
let tagstr = tag_str tag in
- let _ = output_char outc '<';
+ let _ = output_char outc '<';
output_string outc tagstr in
let t' = next t in
if tree_is_open tree t' then
let _ = push tagstr in
let tag' = tree_tag tree t' in
- if tag' == Tag.attribute then let t'' = loop_attr (next t') 0 in
+ if tag' == Tag.attribute then let t'' = loop_attr (next t') 0 in
output_string outc ">"; loop t'' else (output_string outc ">";loop_tag t' tag')
else (* closing with no content *)
let _ = output_string outc "/>" in
loop (next t);
end
and loop t = loop_tag t (tree_tag tree t)
- and loop_attr t n =
- if tree_is_open tree t then
+ and loop_attr t n =
+ if tree_is_open tree t then
let attname = att_str (tree_tag tree t) in
output_char outc ' ';
output_string outc attname;
in loop t
let print_xml_fast =
- let h = Hashtbl.create MED_H_SIZE in
+ let h = Hashtbl.create MED_H_SIZE in
let tag_str t = try Hashtbl.find h t with
Not_found -> let s = Tag.to_string t in
Hashtbl.add h t s;s
in
- let h_att = Hashtbl.create MED_H_SIZE in
+ let h_att = Hashtbl.create MED_H_SIZE in
let att_str t = try Hashtbl.find h_att t with
Not_found -> let s = Tag.to_string t in
let attname = String.sub s 3 ((String.length s) -3) in
Hashtbl.add h_att t attname;attname
in fun outc tree t ->
- let rec loop ?(print_right=true) t =
- if t != nil
- then
+ let rec loop ?(print_right=true) t =
+ if t != nil
+ then
let tagid = tree_tag tree.doc t in
if tagid==Tag.pcdata
- then
+ then
begin
let tid = tree_my_text_unsafe tree.doc t in
output_string outc (text_get_text tree.doc tid);
end
else
let tagstr = tag_str tagid in
- let l = first_child tree t
- and r = next_sibling tree t
+ let l = first_child tree t
+ and r = next_sibling tree t
in
output_char outc '<';
output_string outc tagstr;
if l == nil then output_string outc "/>"
- else
+ else
if (tag tree l) == Tag.attribute then
begin
loop_attributes (first_child tree l);
if (next_sibling tree l) == nil then output_string outc "/>"
- else
- begin
- output_char outc '>';
+ else
+ begin
+ output_char outc '>';
loop (next_sibling tree l);
output_string outc "</";
output_string outc tagstr;
end
else
begin
- output_char outc '>';
+ output_char outc '>';
loop l;
output_string outc "</";
output_string outc tagstr;
output_char outc '>';
end;
if print_right then loop r
- and loop_attributes a =
+ and loop_attributes a =
if a != nil
then
let attname = att_str (tag tree a) in
loop_attributes (next_sibling tree a)
in
loop ~print_right:false t
-
-
- let print_xml_fast outc tree t =
+
+
+ let print_xml_fast outc tree t =
if (tag tree t) = Tag.document_node then
print_xml_fast outc tree (first_child tree t)
- else print_xml_fast outc tree t
-
-let tags_children t tag =
+ else print_xml_fast outc tree t
+
+let tags_children t tag =
let a,_,_,_ = Hashtbl.find t.ttable tag in a
-let tags_below t tag =
+let tags_below t tag =
let _,a,_,_ = Hashtbl.find t.ttable tag in a
-let tags_siblings t tag =
+let tags_siblings t tag =
let _,_,a,_ = Hashtbl.find t.ttable tag in a
-let tags_after t tag =
+let tags_after t tag =
let _,_,_,a = Hashtbl.find t.ttable tag in a
let tags t tag = Hashtbl.find t.ttable tag
-let rec binary_parent t n =
- let r =
+let rec binary_parent t n =
+ let r =
if tree_is_first_child t.doc n
then tree_parent t.doc n
else tree_prev_sibling t.doc n
let get_text t n =
let tid = tree_my_text t.doc n in
- if tid == nulldoc then "" else
+ if tid == nulldoc then "" else
text_get_text t.doc tid
-let dump_tree fmt tree =
+let dump_tree fmt tree =
let rec loop t n =
if t != nil then
let tag = (tree_tag tree.doc t ) in
let tagstr = Tag.to_string tag in
let tab = String.make n ' ' in
- if tag == Tag.pcdata || tag == Tag.attribute_data
- then
- Format.fprintf fmt "%s<%s>%s</%s>\n"
+ if tag == Tag.pcdata || tag == Tag.attribute_data
+ then
+ Format.fprintf fmt "%s<%s>%s</%s>\n"
tab tagstr (text_get_text tree.doc (tree_my_text tree.doc t)) tagstr
else begin
Format.fprintf fmt "%s<%s>\n" tab tagstr;
loop root 0
;;
-
+
let print_xml_fast3 t = tree_print_xml_fast3 t.doc
-let stats t =
+let stats t =
let tree = t.doc in
- let rec loop left node acc_d total_d num_leaves =
+ let rec loop left node acc_d total_d num_leaves =
if node == nil then
(acc_d+total_d,if left then num_leaves+1 else num_leaves)
else
let test_prefix t s = Array.length (text_prefix t.doc s)
let test_suffix t s = Array.length (text_suffix t.doc s)
-let test_contains t s = Array.length (text_contains t.doc s)
+let test_contains t s = Array.length (text_contains t.doc s)
let test_equals t s = Array.length (text_equals t.doc s)