type 'a node = private int
type node_kind = [`Text | `Tree ]
-external inode : 'a node -> int = "%identity"
-external nodei : int -> 'a node = "%identity"
+type t = {
+ doc : tree;
+ ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
+}
+
+external inode : 'a node -> int = "%identity"
+external nodei : int -> 'a node = "%identity"
let compare_node x y = (inode x) - (inode y)
let equal_node : 'a node -> 'a node -> bool = (==)
-
-external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
+
+external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
-
-external tree_save : tree -> Unix.file_descr -> unit = "caml_xml_tree_save"
-external tree_load : Unix.file_descr -> tree = "caml_xml_tree_load"
-
+external tree_print_xml_fast3 : tree -> [`Tree ] node -> Unix.file_descr ->unit = "caml_xml_tree_print"
+external tree_save : tree -> Unix.file_descr -> string -> unit = "caml_xml_tree_save"
+external tree_load : Unix.file_descr -> string -> bool -> int -> tree = "caml_xml_tree_load"
+
external nullt : unit -> 'a node = "caml_xml_tree_nullt"
let nil : [`Tree ] node = nodei ~-1
let nulldoc : [`Text ] node = nodei ~-1
let root : [`Tree ] node = nodei 0
-external text_get_tc_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
-
-external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
+external text_get_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
+external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n
-external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
-external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
-external text_count : tree -> string -> int = "caml_text_collection_count"
-external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
-external text_unsorted_contains : tree -> string -> unit = "caml_text_collection_unsorted_contains"
-external text_get_cached_text : tree -> [`Text] node -> string = "caml_text_collection_get_cached_text"
-
-external tree_root : tree -> [`Tree] node = "caml_xml_tree_root"
-external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size"
-
+external text_is_prefix : tree -> string -> bool = "caml_text_collection_is_prefix"
+external text_is_suffix : tree -> string -> bool = "caml_text_collection_is_suffix"
+external text_is_equal : tree -> string -> bool = "caml_text_collection_is_equal"
+external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
+external text_is_lessthan : tree -> string -> bool = "caml_text_collection_is_lessthan"
+
+external text_count : tree -> string -> int = "caml_text_collection_count"
+external text_count_prefix : tree -> string -> int = "caml_text_collection_count_prefix"
+external text_count_suffix : tree -> string -> int = "caml_text_collection_count_suffix"
+external text_count_equal : tree -> string -> int = "caml_text_collection_count_equal"
+external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
+external text_count_lessthan : tree -> string -> int = "caml_text_collection_count_lessthan"
+
+external text_prefix : tree -> string -> [`Text ] node array = "caml_text_collection_prefix"
+external text_suffix : tree -> string -> [`Text ] node array = "caml_text_collection_suffix"
+external text_equals : tree -> string -> [`Text ] node array = "caml_text_collection_equals"
+external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
+external text_lessthan : tree -> string -> [`Text ] node array = "caml_text_collection_lessthan"
+
+
+external tree_root : tree -> [`Tree] node = "caml_xml_tree_root" "noalloc"
+external tree_size : tree -> int = "caml_xml_tree_size" "noalloc"
+external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size" "noalloc"
+external tree_subtree_elements : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_elements" "noalloc"
+external tree_subtree_tags : tree -> [`Tree] node -> Tag.t -> int = "caml_xml_tree_subtree_elements" "noalloc"
+
let tree_is_nil x = equal_node x nil
+external tree_is_leaf : tree -> [`Tree ] node -> bool = "caml_xml_tree_is_leaf" "noalloc"
+external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc"
+external tree_is_child : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_child" "noalloc"
+external tree_is_first_child : tree -> [`Tree ] node -> bool = "caml_xml_tree_is_first_child" "noalloc"
+external tree_num_children : tree -> [`Tree ] node -> int = "caml_xml_tree_num_children" "noalloc"
+external tree_child_number : tree -> [`Tree ] node -> int = "caml_xml_tree_child_number" "noalloc"
+external tree_depth : tree -> [`Tree ] node -> int = "caml_xml_tree_depth" "noalloc"
+external tree_preorder : tree -> [`Tree ] node -> int = "caml_xml_tree_preorder" "noalloc"
+external tree_postorder : tree -> [`Tree ] node -> int = "caml_xml_tree_postorder" "noalloc"
+external tree_tag : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag" "noalloc"
+external tree_doc_ids : tree -> [`Tree ] node -> [`Text] node*[`Text] node = "caml_xml_tree_doc_ids"
external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" "noalloc"
-external tree_parent_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" "noalloc"
-(*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *)
+external tree_child : tree -> [`Tree] node -> int -> [`Tree] node = "caml_xml_tree_child" "noalloc"
external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" "noalloc"
external tree_first_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_element" "noalloc"
-external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
+external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc"
external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc"
external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc"
-external tree_tagged_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_sibling" "noalloc"
-
external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc"
-external tree_is_leaf : tree -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" "noalloc"
-external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc"
-external tree_is_first_child : tree -> [`Tree] node -> bool = "caml_xml_tree_is_first_child" "noalloc"
+external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
+type unordered_set
+external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
+external unordered_set_length : unordered_set -> int = "caml_unordered_set_length"
+external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc"
-external tree_tag_id : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" "noalloc"
-
+external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc"
+external tree_tagged_following_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_following_sibling" "noalloc"
+external tree_select_following_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_following_sibling" "noalloc"
+external tree_tagged_descendant : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_descendant" "noalloc"
+external tree_select_descendant : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_descendant" "noalloc"
+external tree_tagged_following : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_following" "noalloc"
+external tree_tagged_following_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_following_below" "noalloc"
+external tree_select_following_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_following_below" "noalloc"
-let tree_is_last t n = equal_node nil (tree_next_sibling t n)
-
-(*external tree_prev_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" "noalloc" *)
-external tree_my_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" "noalloc"
-(*external tree_next_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" "noalloc" *)
-external tree_doc_ids : tree -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids"
+external tree_tagged_following_before : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_following_before" "noalloc"
+external tree_select_following_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_following_before" "noalloc"
-let text_size tree = inode (snd ( tree_doc_ids tree root ))
+external tree_my_text : tree -> [`Tree ] node -> [`Text] node = "caml_xml_tree_my_text" "noalloc"
+external tree_my_text_unsafe : tree -> [`Tree ] node -> [`Text] node = "caml_xml_tree_my_text_unsafe" "noalloc"
+external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc"
+external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc"
-let text_get_cached_text t (x:[`Text] node) =
- if x == nulldoc then ""
- else
- text_get_cached_text t x
+external tree_parent_node : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_node" "noalloc"
+(*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *)
-external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc"
-external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc"
-external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc"
-external tree_tagged_desc : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" "noalloc"
-external tree_tagged_foll_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below" "noalloc"
-external tree_subtree_tags : tree -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" "noalloc"
+external tree_closing : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_closing" "noalloc"
+external tree_is_open : tree -> [`Tree] node -> bool = "caml_xml_tree_is_open" "noalloc"
-type unordered_set
-external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
-external unordered_set_length : unordered_set -> int = "caml_unordered_set_length"
-external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc"
+external benchmark_jump : tree -> Tag.t -> int = "caml_benchmark_jump" "noalloc"
+
+let benchmark_jump t s = benchmark_jump t.doc s
+
+external benchmark_fcns : tree -> int = "caml_benchmark_fcns" "noalloc"
+external benchmark_fene : tree -> int = "caml_benchmark_fene" "noalloc"
+external benchmark_iter : tree -> int = "caml_benchmark_iter" "noalloc"
+
+let benchmark_fcns t = benchmark_fcns t.doc
+
+let benchmark_fene t = benchmark_fene t.doc
+
+let benchmark_iter t = benchmark_iter t.doc
+
+external benchmark_lcps : tree -> unit = "caml_benchmark_lcps" "noalloc"
+
+let benchmark_lcps t = benchmark_lcps t.doc
+
+
+
+
+
+
+
+let text_size tree = inode (snd ( tree_doc_ids tree root ))
+
+let text_get_text t (x:[`Text] node) =
+ if x == nulldoc then ""
+ else text_get_text t x
+
-external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc"
-external tree_select_foll_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_foll_sibling" "noalloc"
-external tree_select_desc : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_desc" "noalloc"
-external tree_select_foll_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_foll_below" "noalloc"
module HPtset = Hashtbl.Make(Ptset.Int)
let vector_htbl = HPtset.create MED_H_SIZE
let ptset_to_vector s =
- try
+ try
HPtset.find vector_htbl s
with
Not_found ->
let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in
HPtset.add vector_htbl s v; v
-
-type t = {
- doc : tree;
- ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
-}
+
+
let subtree_size t i = tree_subtree_size t.doc i
+let subtree_elements t i = tree_subtree_elements t.doc i
let text_size t = text_size t.doc
-module MemUnion = Hashtbl.Make (struct
+module MemUnion = Hashtbl.Make (struct
type t = Ptset.Int.t*Ptset.Int.t
- let equal (x,y) (z,t) = (Ptset.Int.equal x z)&&(Ptset.Int.equal y t)
+ let equal (x,y) (z,t) = x == z && y == t
let equal a b = equal a b || equal b a
let hash (x,y) = (* commutative hash *)
- let x = Ptset.Int.hash x
- and y = Ptset.Int.hash y
+ let x = Uid.to_int (Ptset.Int.uid x)
+ and y = Uid.to_int (Ptset.Int.uid y)
in
- if x < y then HASHINT2(x,y) else HASHINT2(y,x)
+ if x <= y then HASHINT2(x,y) else HASHINT2(y,x)
end)
module MemAdd = Hashtbl.Make (
- struct
+ struct
type t = Tag.t*Ptset.Int.t
- let equal (x,y) (z,t) = (x == z)&&(Ptset.Int.equal y t)
- let hash (x,y) = HASHINT2(x,Ptset.Int.hash y)
+ let equal (x,y) (z,t) = (x == z)&&(y == t)
+ let hash (x,y) = HASHINT2(x,Uid.to_int (Ptset.Int.uid y))
end)
+module MemUpdate = struct
+include Hashtbl.Make (
+ struct
+ type t = Tag.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t
+ let equal (a1,b1,c1,d1,e1) (a2,b2,c2,d2,e2) = a1==a2 &&
+ b1 == b2 && c1 == c2 && d1 == d2 && e1 == e2
+ let hash (a,b,c,d,e) =
+ HASHINT4(HASHINT2(a,Uid.to_int (Ptset.Int.uid b)),
+ Uid.to_int (Ptset.Int.uid c),
+ Uid.to_int (Ptset.Int.uid d),
+ Uid.to_int (Ptset.Int.uid e))
+ end)
+
+end
+
let collect_tags tree =
+ let _ = Printf.eprintf "Collecting Tags\n%!" in
let h_union = MemUnion.create BIG_H_SIZE in
let pt_cup s1 s2 =
try
| Not_found -> let s = Ptset.Int.union s1 s2
in
MemUnion.add h_union (s1,s2) s;s
- in
+ in
let h_add = MemAdd.create BIG_H_SIZE in
- let pt_add t s =
+ let pt_add t s =
try MemAdd.find h_add (t,s)
with
| Not_found -> let r = Ptset.Int.add t s in
in
let h = Hashtbl.create BIG_H_SIZE in
let update t sc sb ss sa =
- let schild,sbelow,ssibling,safter =
+ let schild,sbelow,ssibling,safter =
try
- Hashtbl.find h t
+ Hashtbl.find h t
with
- | Not_found ->
+ | Not_found ->
(Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty)
in
- Hashtbl.replace h t
+ Hashtbl.replace h t
(pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
in
- let rec loop_right id acc_after =
+ let rec loop right id acc_after =
if id == nil
- then Ptset.Int.empty,Ptset.Int.empty,acc_after
- else
- let sibling2,desc2,after2 = loop_right (tree_next_sibling tree id) acc_after in
- let child1,desc1,after1 = loop_left (tree_first_child tree id) after2 in
- let tag = tree_tag_id tree id in
+ then Ptset.Int.empty,Ptset.Int.empty,acc_after else
+ let sibling2,desc2,after2 = loop true (tree_next_sibling tree id) acc_after in
+ let child1,desc1,after1 = loop false (tree_first_child tree id) after2 in
+ let tag = tree_tag tree id in
update tag child1 desc1 sibling2 after2;
- ( pt_add tag sibling2,
+ ( pt_add tag sibling2,
pt_add tag (pt_cup desc1 desc2),
- pt_cup after1 (pt_cup desc1 desc2) )
- and loop_left id acc_after =
- if id == nil
- then Ptset.Int.empty,Ptset.Int.empty,acc_after
- else
- let sibling2,desc2,after2 = loop_right (tree_next_sibling tree id) acc_after in
- let child1,desc1,after1 = loop_left (tree_first_child tree id) after2 in
- let tag = tree_tag_id tree id in
- update tag child1 desc1 sibling2 after2;
- (pt_add tag sibling2,
- pt_add tag (pt_cup desc1 desc2),
- acc_after )
+ if right then pt_cup after1 (pt_cup desc1 desc2) else acc_after )
in
- let _ = loop_left (tree_root tree) Ptset.Int.empty in h
-
-
-
+ let _ = loop false (tree_root tree) Ptset.Int.empty in
+ let _ = Printf.eprintf "Finished\n%!" in
+ h
+
+
+
let contains_array = ref [| |]
-let contains_index = Hashtbl.create 4096
+let contains_index = Hashtbl.create 4096
let in_array _ i =
try
Hashtbl.find contains_index i
with
Not_found -> false
-let init_contains t s =
- let a = text_contains t.doc s
+let init_textfun f t s =
+ let a = match f with
+ | `CONTAINS -> text_contains t.doc s
+ | `STARTSWITH -> text_prefix t.doc s
+ | `ENDSWITH -> text_suffix t.doc s
+ | `EQUALS -> text_equals t.doc s
in
- Array.fast_sort (compare) a;
+ (*Array.fast_sort (compare) a; *)
contains_array := a;
Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
-
+
let count_contains t s = text_count_contains t.doc s
-let unsorted_contains t s = text_unsorted_contains t.doc s
let init_naive_contains t s =
let i,j = tree_doc_ids t.doc (tree_root t.doc)
in
let regexp = Str.regexp_string s in
- let matching arg =
+ let matching arg =
try
let _ = Str.search_forward regexp arg 0;
in true
with _ -> false
in
- let rec loop n acc l =
+ let rec loop n acc l =
if n >= j then acc,l
else
- let s = text_get_cached_text t.doc n
+ let s = text_get_text t.doc n
in
- if matching s
- then loop (nodei ((inode n)+1)) (n::acc) (l+1)
+ if matching s
+ then loop (nodei ((inode n)+1)) (n::acc) (l+1)
else loop (nodei ((inode n)+1)) acc l
in
let acc,l = loop i [] 0 in
else loop (idx+1) x y
in
if a.(0) > j || a.(l-1) < i then nulldoc
- else loop !last_idx i j
-
-let text_below tree t =
+ else loop !last_idx i j
+
+let text_below tree t =
let l = Array.length !contains_array in
let i,j = tree_doc_ids tree.doc t in
let id = if l == 0 then i else (array_find !contains_array i j) in
- tree_parent_doc tree.doc id
-
+ tree_parent_node tree.doc id
+
let text_next tree t root =
let l = Array.length !contains_array in
- let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
- let _,j = tree_doc_ids tree.doc root in
+ let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
+ let _,j = tree_doc_ids tree.doc root in
let id = if l == 0 then if inf > j then nulldoc else inf
else array_find !contains_array inf j
- in
- tree_parent_doc tree.doc id
+ in
+ tree_parent_node tree.doc id
module DocIdSet = struct
include Set.Make (struct type t = [`Text] node
let compare = compare_node end)
-
+
end
let is_nil t = t == nil
let node_of_t t =
let _ = Tag.init (Obj.magic t) in
- let table = collect_tags t
+ let table = collect_tags t
in (*
let _ = Hashtbl.iter (fun t (c,d,ns,f) ->
Printf.eprintf "Tag %s has:\n" (Tag.to_string t);
Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) f;
Printf.eprintf "\n\n%!";) table
in
-
- *)
- { doc= t;
+
+ *)
+ { doc= t;
ttable = table;
}
let parse f str =
node_of_t
- (f str
- !Options.sample_factor
+ (f str
+ !Options.sample_factor
!Options.index_empty_texts
!Options.disable_text_collection)
-
+
let parse_xml_uri str = parse parse_xml_uri str
let parse_xml_string str = parse parse_xml_string str
-
+let size t = tree_size t.doc;;
+
external pool : tree -> Tag.pool = "%identity"
let magic_string = "SXSI_INDEX"
let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
-let write fd s =
+let write fd s =
let sl = String.length s in
let ssl = Printf.sprintf "%020i" sl in
ignore (Unix.write fd ssl 0 20);
let buffer = String.create size in
let _ = really_read fd buffer 0 size in
buffer
-
+
let save t str =
let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
(* we need to move the fd to the correct position *)
flush out_c;
ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
- tree_save t.doc fd;
+ tree_save t.doc fd str;
close_out out_c
;;
-let load ?(sample=64) str =
+let load ?(sample=64) ?(load_text=true) str =
let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
let in_c = Unix.in_channel_of_descr fd in
let _ = set_binary_mode_in in_c true in
- let load_table () =
+ let load_table () =
(let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
(let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t =
- Marshal.from_channel in_c
+ Marshal.from_channel in_c
in
let ntable = Hashtbl.create (Hashtbl.length table) in
- Hashtbl.iter (fun k (s1,s2,s3,s4) ->
+ Hashtbl.iter (fun k (s1,s2,s3,s4) ->
let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty
and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty
and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty
let _ = Printf.eprintf "\nLoading tag table : " in
let ntable = time (load_table) () in
ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
- let tree = { doc = tree_load fd;
+ let tree = { doc = tree_load fd str load_text sample;
ttable = ntable;}
in close_in in_c;
tree
-
+
let tag_pool t = pool t.doc
-
+
let compare = compare_node
let equal a b = a == b
-
+
let nts = function
-1 -> "Nil"
| i -> Printf.sprintf "Node (%i)" i
-
+
let dump_node t = nts (inode t)
let is_left t n = tree_is_first_child t.doc n
-let is_below_right t n1 n2 =
- tree_is_ancestor t.doc (tree_parent t.doc n1) n2
+
+
+let is_below_right t n1 n2 =
+ tree_is_ancestor t.doc (tree_parent t.doc n1) n2
&& not (tree_is_ancestor t.doc n1 n2)
-
-let parent t n = tree_parent t.doc n
-let first_child t = (); fun n -> tree_first_child t.doc n
-let first_element t = (); fun n -> tree_first_element t.doc n
+let is_binary_ancestor t n1 n2 =
+ let p = tree_parent t.doc n1 in
+ let fin = tree_closing t.doc p in
+ n2 > n1 && n2 < fin
+(* (is_below_right t n1 n2) ||
+ (tree_is_ancestor t.doc n1 n2) *)
+let parent t n = tree_parent t.doc n
+
+let first_child t = let doc = t.doc in ();fun n -> tree_first_child doc n
+let first_element t = let doc = t.doc in (); fun n -> tree_first_element doc n
+let first_element t n = tree_first_element t.doc n
(* these function will be called in two times: first partial application
on the tag, then application of the tag and the tree, then application of
the other arguments. We use the trick to let the compiler optimize application
let v = ptset_to_vector ts in ();
fun n -> tree_select_child t.doc n v
-let next_sibling t = (); fun n -> tree_next_sibling t.doc n
-let next_element t = (); fun n -> tree_next_element t.doc n
+let next_sibling t = let doc = t.doc in (); fun n -> tree_next_sibling doc n
+let next_element t = let doc = t.doc in (); fun n -> tree_next_element doc n
+let next_element t n = tree_next_element t.doc n
-let tagged_sibling t tag = (); fun n -> tree_tagged_sibling t.doc n tag
+let tagged_following_sibling t tag = (); fun n -> tree_tagged_following_sibling t.doc n tag
-let select_sibling t = fun ts ->
+let select_following_sibling t = fun ts ->
let v = (ptset_to_vector ts) in ();
- fun n -> tree_select_foll_sibling t.doc n v
+ fun n -> tree_select_following_sibling t.doc n v
+
+let next_sibling_below t = (); fun n _ -> tree_next_sibling t.doc n
+let next_element_below t = (); fun n _ -> tree_next_element t.doc n
-let next_sibling_ctx t = (); fun n _ -> tree_next_sibling t.doc n
-let next_element_ctx t = (); fun n _ -> tree_next_element t.doc n
-let tagged_sibling_ctx t tag = (); fun n _ -> tree_tagged_sibling t.doc n tag
+let tagged_following_sibling_below t tag = (); fun n _ -> tree_tagged_following_sibling t.doc n tag
-let select_sibling_ctx t = fun ts ->
+let select_following_sibling_below t = fun ts ->
let v = (ptset_to_vector ts) in ();
- fun n _ -> tree_select_foll_sibling t.doc n v
+ fun n _ -> tree_select_following_sibling t.doc n v
let id t n = tree_node_xml_id t.doc n
-
-let tag t n = if n == nil then Tag.nullt else tree_tag_id t.doc n
-let tagged_desc t tag = (); fun n -> tree_tagged_desc t.doc n tag
+let tag t n = if n == nil then Tag.nullt else tree_tag t.doc n
-let select_desc t = fun ts ->
+let tagged_descendant t tag =
+ let doc = t.doc in (); fun n -> tree_tagged_descendant doc n tag
+
+let select_descendant t = fun ts ->
let v = (ptset_to_vector ts) in ();
- fun n -> tree_select_desc t.doc n v
+ fun n -> tree_select_descendant t.doc n v
-let tagged_foll_ctx t tag = (); fun n ctx -> tree_tagged_foll_below t.doc n tag ctx
+let tagged_following_below t tag =
+ let doc = t.doc in
+ (); fun n ctx -> tree_tagged_following_below doc n tag ctx
-let select_foll_ctx t = fun ts ->
+let select_following_below t = fun ts ->
let v = (ptset_to_vector ts) in ();
- fun n ctx -> tree_select_foll_below t.doc n v ctx
+ fun n ctx -> tree_select_following_below t.doc n v ctx
+
+let closing t n = tree_closing t.doc n
+let is_open t n = tree_is_open t.doc n
+let get_text_id t n = tree_my_text t.doc n
let last_idx = ref 0
let array_find a i j =
else loop (idx+1) x y
in
if a.(0) > j || a.(l-1) < i then nil
- else loop !last_idx i j
+ else loop !last_idx i j
let count t s = text_count t.doc s
-
- let print_xml_fast outc tree t =
- let rec loop ?(print_right=true) t =
- if t != nil
- then
- let tagid = tree_tag_id tree.doc t in
+ let stack = ref []
+ let init_stack () = stack := []
+ let push x = stack:= x::!stack
+ let peek () = match !stack with
+ p::_ -> p
+ | _ -> failwith "peek"
+ let pop () = match !stack with
+ p::r -> stack:=r; p
+ | _ -> failwith "pop"
+
+ let next t = nodei ( (inode t) + 1 )
+ let next2 t = nodei ( (inode t) + 2 )
+ let next3 t = nodei ( (inode t) + 3 )
+
+ let print_xml_fast2 =
+ let _ = init_stack () in
+ let h = Hashtbl.create MED_H_SIZE in
+ let tag_str t = try Hashtbl.find h t with
+ Not_found -> let s = Tag.to_string t in
+ Hashtbl.add h t s;s
+ in
+ let h_att = Hashtbl.create MED_H_SIZE in
+ let att_str t = try Hashtbl.find h_att t with
+ Not_found -> let s = Tag.to_string t in
+ let attname = String.sub s 3 ((String.length s) -3) in
+ Hashtbl.add h_att t attname;attname
+ in fun outc tree t ->
+ let tree = tree.doc in
+ let fin = tree_closing tree t in
+ let rec loop_tag t tag =
+ if t <= fin then
+ if tree_is_open tree t then
+ (* opening tag *)
+ if tag == Tag.pcdata then
+ begin
+ output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
+ loop (next2 t) (* skip closing $ *)
+ end
+ else
+ let tagstr = tag_str tag in
+ let _ = output_char outc '<';
+ output_string outc tagstr in
+ let t' = next t in
+ if tree_is_open tree t' then
+ let _ = push tagstr in
+ let tag' = tree_tag tree t' in
+ if tag' == Tag.attribute then let t'' = loop_attr (next t') 0 in
+ output_string outc ">"; loop t'' else (output_string outc ">";loop_tag t' tag')
+ else (* closing with no content *)
+ let _ = output_string outc "/>" in
+ loop (next t')
+ else
+ begin
+ (* closing tag *)
+ output_string outc "</";
+ output_string outc (pop());
+ output_char outc '>';
+ loop (next t);
+ end
+ and loop t = loop_tag t (tree_tag tree t)
+ and loop_attr t n =
+ if tree_is_open tree t then
+ let attname = att_str (tree_tag tree t) in
+ output_char outc ' ';
+ output_string outc attname;
+ output_string outc "=\"";
+ let t = next t in (* open $@ *)
+ output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
+ output_char outc '"';
+ loop_attr (next3 t) (n+1)
+ else
+ next t (* close @ *)
+ in loop t
+
+ let print_xml_fast =
+ let h = Hashtbl.create MED_H_SIZE in
+ let tag_str t = try Hashtbl.find h t with
+ Not_found -> let s = Tag.to_string t in
+ Hashtbl.add h t s;s
+ in
+ let h_att = Hashtbl.create MED_H_SIZE in
+ let att_str t = try Hashtbl.find h_att t with
+ Not_found -> let s = Tag.to_string t in
+ let attname = String.sub s 3 ((String.length s) -3) in
+ Hashtbl.add h_att t attname;attname
+ in fun outc tree t ->
+ let rec loop ?(print_right=true) t =
+ if t != nil
+ then
+ let tagid = tree_tag tree.doc t in
if tagid==Tag.pcdata
- then
+ then
begin
- let tid = tree_my_text tree.doc t in
- output_string outc (text_get_cached_text tree.doc tid);
+ let tid = tree_my_text_unsafe tree.doc t in
+ output_string outc (text_get_text tree.doc tid);
if print_right
then loop (next_sibling tree t);
end
else
- let tagstr = Tag.to_string tagid in
- let l = first_child tree t
- and r = next_sibling tree t
+ let tagstr = tag_str tagid in
+ let l = first_child tree t
+ and r = next_sibling tree t
in
output_char outc '<';
- output_string outc tagstr;
+ output_string outc tagstr;
if l == nil then output_string outc "/>"
- else
+ else
if (tag tree l) == Tag.attribute then
begin
loop_attributes (first_child tree l);
if (next_sibling tree l) == nil then output_string outc "/>"
- else
- begin
- output_char outc '>';
+ else
+ begin
+ output_char outc '>';
loop (next_sibling tree l);
output_string outc "</";
output_string outc tagstr;
end
else
begin
- output_char outc '>';
+ output_char outc '>';
loop l;
output_string outc "</";
output_string outc tagstr;
output_char outc '>';
end;
if print_right then loop r
- and loop_attributes a =
+ and loop_attributes a =
if a != nil
then
- let s = (Tag.to_string (tag tree a)) in
- let attname = String.sub s 3 ((String.length s) -3) in
+ let attname = att_str (tag tree a) in
let fsa = first_child tree a in
- let tid = tree_my_text tree.doc fsa in
+ let tid = tree_my_text_unsafe tree.doc fsa in
output_char outc ' ';
output_string outc attname;
output_string outc "=\"";
- output_string outc (text_get_cached_text tree.doc tid);
+ output_string outc (text_get_text tree.doc tid);
output_char outc '"';
loop_attributes (next_sibling tree a)
in
loop ~print_right:false t
-
-
- let print_xml_fast outc tree t =
+
+
+ let print_xml_fast outc tree t =
if (tag tree t) = Tag.document_node then
print_xml_fast outc tree (first_child tree t)
- else print_xml_fast outc tree t
-
-let tags_children t tag =
+ else print_xml_fast outc tree t
+
+let tags_children t tag =
let a,_,_,_ = Hashtbl.find t.ttable tag in a
-let tags_below t tag =
+let tags_below t tag =
let _,a,_,_ = Hashtbl.find t.ttable tag in a
-let tags_siblings t tag =
+let tags_siblings t tag =
let _,_,a,_ = Hashtbl.find t.ttable tag in a
-let tags_after t tag =
+let tags_after t tag =
let _,_,_,a = Hashtbl.find t.ttable tag in a
let tags t tag = Hashtbl.find t.ttable tag
-let rec binary_parent t n =
- let r =
+let rec binary_parent t n =
+ let r =
if tree_is_first_child t.doc n
then tree_parent t.doc n
else tree_prev_sibling t.doc n
- in if tree_tag_id t.doc r = Tag.pcdata then
+ in if tree_tag t.doc r = Tag.pcdata then
binary_parent t r
else r
let get_text t n =
let tid = tree_my_text t.doc n in
- if tid == nulldoc then "" else
- text_get_cached_text t.doc tid
+ if tid == nulldoc then "" else
+ text_get_text t.doc tid
-let dump_tree fmt tree =
+let dump_tree fmt tree =
let rec loop t n =
if t != nil then
- let tag = (tree_tag_id tree.doc t ) in
+ let tag = (tree_tag tree.doc t ) in
let tagstr = Tag.to_string tag in
let tab = String.make n ' ' in
- if tag == Tag.pcdata || tag == Tag.attribute_data
- then
- Format.fprintf fmt "%s<%s>%s</%s>\n"
- tab tagstr (text_get_cached_text tree.doc (tree_my_text tree.doc t)) tagstr
+ if tag == Tag.pcdata || tag == Tag.attribute_data
+ then
+ Format.fprintf fmt "%s<%s>%s</%s>\n"
+ tab tagstr (text_get_text tree.doc (tree_my_text tree.doc t)) tagstr
else begin
Format.fprintf fmt "%s<%s>\n" tab tagstr;
loop (tree_first_child tree.doc t) (n+2);
loop root 0
;;
-
+
+let print_xml_fast3 t = tree_print_xml_fast3 t.doc
+
+
+
+
+let stats t =
+ let tree = t.doc in
+ let rec loop left node acc_d total_d num_leaves =
+ if node == nil then
+ (acc_d+total_d,if left then num_leaves+1 else num_leaves)
+ else
+ let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in
+ loop false (tree_next_sibling tree node) (acc_d) d td
+ in
+ let a,b = loop true root 0 0 0
+ in
+ Printf.eprintf "Average depth: %f, number of leaves %i\n%!" ((float_of_int a)/. (float_of_int b)) b
+;;
+
+
+
+
+
+
+let test_prefix t s = Array.length (text_prefix t.doc s)
+let test_suffix t s = Array.length (text_suffix t.doc s)
+let test_contains t s = Array.length (text_contains t.doc s)
+let test_equals t s = Array.length (text_equals t.doc s)