type 'a node = private int
type node_kind = [`Text | `Tree ]
-external inode : 'a node -> int = "%identity"
-external nodei : int -> 'a node = "%identity"
+type t = {
+ doc : tree;
+ children : Ptset.Int.t array;
+ siblings : Ptset.Int.t array;
+ descendants: Ptset.Int.t array;
+ followings: Ptset.Int.t array;
+}
+
+external inode : 'a node -> int = "%identity"
+external nodei : int -> 'a node = "%identity"
let compare_node x y = (inode x) - (inode y)
let equal_node : 'a node -> 'a node -> bool = (==)
-
-external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
+
+external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
-
-external tree_save : tree -> Unix.file_descr -> unit = "caml_xml_tree_save"
-external tree_load : Unix.file_descr -> tree = "caml_xml_tree_load"
-
+external tree_print_xml_fast3 : tree -> [`Tree ] node -> Unix.file_descr ->unit = "caml_xml_tree_print"
+external tree_save : tree -> Unix.file_descr -> string -> unit = "caml_xml_tree_save"
+external tree_load : Unix.file_descr -> string -> bool -> int -> tree = "caml_xml_tree_load"
+
external nullt : unit -> 'a node = "caml_xml_tree_nullt"
let nil : [`Tree ] node = nodei ~-1
let nulldoc : [`Text ] node = nodei ~-1
let root : [`Tree ] node = nodei 0
-external text_get_tc_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
-
-external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
+external text_get_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
+external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n
-external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
-external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
-external text_count : tree -> string -> int = "caml_text_collection_count"
-external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
-external text_unsorted_contains : tree -> string -> unit = "caml_text_collection_unsorted_contains"
-external text_get_cached_text : tree -> [`Text] node -> string = "caml_text_collection_get_cached_text"
-
-external tree_root : tree -> [`Tree] node = "caml_xml_tree_root"
-external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size"
-
+external text_is_prefix : tree -> string -> bool = "caml_text_collection_is_prefix"
+external text_is_suffix : tree -> string -> bool = "caml_text_collection_is_suffix"
+external text_is_equal : tree -> string -> bool = "caml_text_collection_is_equal"
+external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
+external text_is_lessthan : tree -> string -> bool = "caml_text_collection_is_lessthan"
+
+external text_count : tree -> string -> int = "caml_text_collection_count"
+external text_count_prefix : tree -> string -> int = "caml_text_collection_count_prefix"
+external text_count_suffix : tree -> string -> int = "caml_text_collection_count_suffix"
+external text_count_equal : tree -> string -> int = "caml_text_collection_count_equal"
+external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
+external text_count_lessthan : tree -> string -> int = "caml_text_collection_count_lessthan"
+
+external text_prefix : tree -> string -> [`Text ] node array = "caml_text_collection_prefix"
+external text_suffix : tree -> string -> [`Text ] node array = "caml_text_collection_suffix"
+external text_equals : tree -> string -> [`Text ] node array = "caml_text_collection_equals"
+external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
+external text_lessthan : tree -> string -> [`Text ] node array = "caml_text_collection_lessthan"
+
+
+external tree_root : tree -> [`Tree] node = "caml_xml_tree_root" "noalloc"
+external tree_size : tree -> int = "caml_xml_tree_size" "noalloc"
+external tree_num_tags : tree -> int = "caml_xml_tree_num_tags" "noalloc"
+external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size" "noalloc"
+external tree_subtree_elements : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_elements" "noalloc"
+external tree_subtree_tags : tree -> [`Tree] node -> Tag.t -> int = "caml_xml_tree_subtree_elements" "noalloc"
+
let tree_is_nil x = equal_node x nil
+external tree_is_leaf : tree -> [`Tree ] node -> bool = "caml_xml_tree_is_leaf" "noalloc"
+external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc"
+external tree_is_child : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_child" "noalloc"
+external tree_is_first_child : tree -> [`Tree ] node -> bool = "caml_xml_tree_is_first_child" "noalloc"
+external tree_num_children : tree -> [`Tree ] node -> int = "caml_xml_tree_num_children" "noalloc"
+external tree_child_number : tree -> [`Tree ] node -> int = "caml_xml_tree_child_number" "noalloc"
+external tree_depth : tree -> [`Tree ] node -> int = "caml_xml_tree_depth" "noalloc"
+external tree_preorder : tree -> [`Tree ] node -> int = "caml_xml_tree_preorder" "noalloc"
+external tree_postorder : tree -> [`Tree ] node -> int = "caml_xml_tree_postorder" "noalloc"
+external tree_tag : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag" "noalloc"
+external tree_doc_ids : tree -> [`Tree ] node -> [`Text] node*[`Text] node = "caml_xml_tree_doc_ids"
external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" "noalloc"
-external tree_parent_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" "noalloc"
-(*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *)
+external tree_child : tree -> [`Tree] node -> int -> [`Tree] node = "caml_xml_tree_child" "noalloc"
external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" "noalloc"
external tree_first_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_element" "noalloc"
-external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
+external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc"
external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc"
external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc"
-external tree_tagged_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_sibling" "noalloc"
-
external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc"
-external tree_is_leaf : tree -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" "noalloc"
-external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc"
-external tree_is_first_child : tree -> [`Tree] node -> bool = "caml_xml_tree_is_first_child" "noalloc"
+external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
+type unordered_set
+external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
+external unordered_set_length : unordered_set -> int = "caml_unordered_set_length"
+external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc"
-external tree_tag_id : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" "noalloc"
-
+external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc"
+external tree_tagged_following_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_following_sibling" "noalloc"
+external tree_select_following_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_following_sibling" "noalloc"
+external tree_tagged_descendant : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_descendant" "noalloc"
+external tree_select_descendant : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_descendant" "noalloc"
+external tree_tagged_following : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_following" "noalloc"
+external tree_tagged_following_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_following_below" "noalloc"
+external tree_select_following_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_following_below" "noalloc"
-let tree_is_last t n = equal_node nil (tree_next_sibling t n)
-
-(*external tree_prev_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" "noalloc" *)
-external tree_my_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" "noalloc"
-(*external tree_next_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" "noalloc" *)
-external tree_doc_ids : tree -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids"
+external tree_tagged_following_before : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_following_before" "noalloc"
+external tree_select_following_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_following_before" "noalloc"
-let text_size tree = inode (snd ( tree_doc_ids tree root ))
+external tree_my_text : tree -> [`Tree ] node -> [`Text] node = "caml_xml_tree_my_text" "noalloc"
+external tree_my_text_unsafe : tree -> [`Tree ] node -> [`Text] node = "caml_xml_tree_my_text_unsafe" "noalloc"
+external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc"
+external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc"
-let text_get_cached_text t (x:[`Text] node) =
- if x == nulldoc then ""
- else
- text_get_cached_text t x
+external tree_parent_node : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_node" "noalloc"
+(*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *)
-external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc"
-external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc"
-external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc"
-external tree_tagged_desc : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" "noalloc"
-external tree_tagged_foll_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below" "noalloc"
-external tree_subtree_tags : tree -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" "noalloc"
+external tree_closing : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_closing" "noalloc"
+external tree_is_open : tree -> [`Tree] node -> bool = "caml_xml_tree_is_open" "noalloc"
-type unordered_set
-external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
-external unordered_set_length : unordered_set -> int = "caml_unordered_set_length"
-external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc"
+external benchmark_jump : tree -> Tag.t -> int = "caml_benchmark_jump" "noalloc"
+
+let benchmark_jump t s = benchmark_jump t.doc s
+
+external benchmark_fcns : tree -> int = "caml_benchmark_fcns" "noalloc"
+external benchmark_fene : tree -> int = "caml_benchmark_fene" "noalloc"
+external benchmark_iter : tree -> int = "caml_benchmark_iter" "noalloc"
+
+let benchmark_fcns t = benchmark_fcns t.doc
+
+let benchmark_fene t = benchmark_fene t.doc
+
+let benchmark_iter t = benchmark_iter t.doc
+
+external benchmark_lcps : tree -> unit = "caml_benchmark_lcps" "noalloc"
+
+let benchmark_lcps t = benchmark_lcps t.doc
+
+
+
+
+
+
+
+let text_size tree = inode (snd ( tree_doc_ids tree root ))
+
+let text_get_text t (x:[`Text] node) =
+ if x == nulldoc then ""
+ else text_get_text t x
+
-external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc"
-external tree_select_foll_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_foll_sibling" "noalloc"
-external tree_select_desc : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_desc" "noalloc"
-external tree_select_foll_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_foll_below" "noalloc"
module HPtset = Hashtbl.Make(Ptset.Int)
let vector_htbl = HPtset.create MED_H_SIZE
let ptset_to_vector s =
- try
+ try
HPtset.find vector_htbl s
with
Not_found ->
let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in
HPtset.add vector_htbl s v; v
-
-type t = {
- doc : tree;
- ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
-}
+
+
let subtree_size t i = tree_subtree_size t.doc i
+let subtree_elements t i = tree_subtree_elements t.doc i
let text_size t = text_size t.doc
-module MemUnion = Hashtbl.Make (struct
- type t = Ptset.Int.t*Ptset.Int.t
- let equal (x,y) (z,t) = (Ptset.Int.equal x z)&&(Ptset.Int.equal y t)
- let equal a b = equal a b || equal b a
- let hash (x,y) = (* commutative hash *)
- let x = Ptset.Int.hash x
- and y = Ptset.Int.hash y
- in
- if x < y then HASHINT2(x,y) else HASHINT2(y,x)
- end)
-
-module MemAdd = Hashtbl.Make (
- struct
- type t = Tag.t*Ptset.Int.t
- let equal (x,y) (z,t) = (x == z)&&(Ptset.Int.equal y t)
- let hash (x,y) = HASHINT2(x,Ptset.Int.hash y)
- end)
-let collect_tags tree =
- let h_union = MemUnion.create BIG_H_SIZE in
- let pt_cup s1 s2 =
- try
- MemUnion.find h_union (s1,s2)
- with
- | Not_found -> let s = Ptset.Int.union s1 s2
- in
- MemUnion.add h_union (s1,s2) s;s
- in
- let h_add = MemAdd.create BIG_H_SIZE in
- let pt_add t s =
- try MemAdd.find h_add (t,s)
- with
- | Not_found -> let r = Ptset.Int.add t s in
- MemAdd.add h_add (t,s) r;r
- in
- let h = Hashtbl.create BIG_H_SIZE in
- let update t sc sb ss sa =
- let schild,sbelow,ssibling,safter =
- try
- Hashtbl.find h t
- with
- | Not_found ->
- (Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty)
- in
- Hashtbl.replace h t
- (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
+let rec fold_siblings tree f node acc =
+ if node == nil then acc else fold_siblings tree f (tree_next_sibling tree node) (f node acc)
+module TS =
+ struct
+ type t = bool array
+ let create n = Array.create n false
+ let add e a = a.(e) <- true; a
+ let merge a b =
+ for i = 0 to Array.length a - 1 do
+ a.(i) <- a.(i) || b.(i)
+ done
+ let clear a =
+ for i = 0 to Array.length a - 1 do
+ a.(i) <- false;
+ done
+
+ let to_ptset a =
+ let r = ref Ptset.Int.empty in
+ for i = 0 to Array.length a - 1 do
+ r := Ptset.Int.add i !r;
+ done;
+ !r
+ end
+
+
+let collect_children_siblings tree =
+ let ntags = (tree_num_tags tree) in
+ let () = Printf.eprintf ">>>length: %i\n%!" ntags in
+ let table_c = Array.init (tree_num_tags tree) (fun _ -> TS.create ntags) in
+ let table_n = Array.init (tree_num_tags tree) (fun _ -> TS.create ntags) in
+ let acc_tag n s = TS.add (tree_tag tree n) s in
+ let count = ref 0 in
+ let size = tree_subtree_size tree root in
+ let tmp = TS.create ntags in
+ let rec loop node =
+ if node == nil then ()
+ else
+ let () = if !count mod 10000 == 0 then
+ Printf.eprintf "Node %i / %i\n%!" !count size;
+ in
+ let () = if !count mod 1000000 == 0 then Gc.compact() in
+ let () = count := !count + 1 in
+ let tag = tree_tag tree node in
+ let () = TS.clear tmp in
+ let children =
+ fold_siblings tree
+ acc_tag
+ (tree_first_child tree node) tmp
+ in
+ let () = TS.merge table_c.(tag) children in
+ let () = TS.clear tmp in
+ let siblings =
+ fold_siblings tree
+ acc_tag
+ (tree_next_sibling tree node) tmp
+ in
+ TS.merge table_n.(tag) siblings;
+ loop (tree_first_child tree node);
+ loop (tree_next_sibling tree node)
in
- let rec loop_right id acc_sibling acc_after=
- if id == nil
- then (acc_sibling,acc_after)
+ loop root;
+ ( Array.map TS.to_ptset table_c,
+ Array.map TS.to_ptset table_n )
+
+let collect_children_siblings tree =
+ let table_c = Array.create (tree_num_tags tree) Ptset.Int.empty in
+ let table_n = Array.copy table_c in
+ let rec loop node =
+ if node == nil then Ptset.Int.empty
else
- let sibling2,after2 = loop_right (tree_next_sibling tree id) acc_sibling acc_after in
- let child1,below1 = loop_left (tree_first_child tree id) after2 in
- let tag = tree_tag_id tree id in
- update tag child1 below1 sibling2 after2;
- (pt_add tag sibling2, (pt_add tag (pt_cup after2 below1)))
- and loop_left id acc_after =
- if id == nil
- then (Ptset.Int.empty,Ptset.Int.empty)
+ let children = loop (tree_first_child tree node) in
+ let tag = tree_tag tree node in
+ let () = table_c.(tag) <- Ptset.Int.union table_c.(tag) children in
+ let siblings = loop (tree_next_sibling tree node) in
+ Ptset.Int.add tag siblings
+ in
+ ignore (loop root);
+ table_c, table_n
+
+
+
+
+let collect_descendants tree =
+ let table_d = Array.create (tree_num_tags tree) Ptset.Int.empty in
+ let rec loop node =
+ if node == nil then Ptset.Int.empty
else
- let sibling2,after2 = loop_right (tree_next_sibling tree id) Ptset.Int.empty acc_after in
- let child1,below1 = loop_left (tree_first_child tree id) after2 in
- let tag = tree_tag_id tree id in
- update tag child1 below1 sibling2 after2;
- (pt_add tag sibling2,(pt_add tag (pt_cup after2 below1)))
+ let d1 = loop (tree_first_child tree node) in
+ let d2 = loop (tree_next_sibling tree node) in
+ let tag = tree_tag tree node in
+ table_d.(tag) <- Ptset.Int.union table_d.(tag) d1;
+ Ptset.Int.add tag (Ptset.Int.union d1 d2)
+ in
+ ignore (loop root);
+ table_d
+
+let collect_followings tree =
+ let table_f = Array.create (tree_num_tags tree) Ptset.Int.empty in
+ let rec loop node acc =
+ if node == nil then acc else
+ let f1 = loop (tree_next_sibling tree node) acc in
+ let f2 = loop (tree_first_child tree node) f1 in
+ let tag = tree_tag tree node in
+ table_f.(tag) <- Ptset.Int.union table_f.(tag) f1;
+ Ptset.Int.add tag (Ptset.Int.union f1 f2)
in
- let _ = loop_left (tree_root tree) Ptset.Int.empty in h
-
-
-
+ ignore (loop root Ptset.Int.empty);
+ table_f
+
+let collect_tags tree =
+ let c,n = time (collect_children_siblings) tree ~msg:"Collecting child and sibling tags" in
+ let d = time collect_descendants tree ~msg:"Collecting descendant tags" in
+ let f = time collect_followings tree ~msg:"Collecting following tags" in
+ c,n,d,f
let contains_array = ref [| |]
-let contains_index = Hashtbl.create 4096
+let contains_index = Hashtbl.create 4096
let in_array _ i =
try
Hashtbl.find contains_index i
with
Not_found -> false
-let init_contains t s =
- let a = text_contains t.doc s
+let init_textfun f t s =
+ let a = match f with
+ | `CONTAINS -> text_contains t.doc s
+ | `STARTSWITH -> text_prefix t.doc s
+ | `ENDSWITH -> text_suffix t.doc s
+ | `EQUALS -> text_equals t.doc s
in
- Array.fast_sort (compare) a;
+ (*Array.fast_sort (compare) a; *)
contains_array := a;
Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
-
+
let count_contains t s = text_count_contains t.doc s
-let unsorted_contains t s = text_unsorted_contains t.doc s
let init_naive_contains t s =
let i,j = tree_doc_ids t.doc (tree_root t.doc)
in
let regexp = Str.regexp_string s in
- let matching arg =
+ let matching arg =
try
let _ = Str.search_forward regexp arg 0;
in true
with _ -> false
in
- let rec loop n acc l =
+ let rec loop n acc l =
if n >= j then acc,l
else
- let s = text_get_cached_text t.doc n
+ let s = text_get_text t.doc n
in
- if matching s
- then loop (nodei ((inode n)+1)) (n::acc) (l+1)
+ if matching s
+ then loop (nodei ((inode n)+1)) (n::acc) (l+1)
else loop (nodei ((inode n)+1)) acc l
in
let acc,l = loop i [] 0 in
else loop (idx+1) x y
in
if a.(0) > j || a.(l-1) < i then nulldoc
- else loop !last_idx i j
-
-let text_below tree t =
+ else loop !last_idx i j
+
+let text_below tree t =
let l = Array.length !contains_array in
let i,j = tree_doc_ids tree.doc t in
let id = if l == 0 then i else (array_find !contains_array i j) in
- tree_parent_doc tree.doc id
-
+ tree_parent_node tree.doc id
+
let text_next tree t root =
let l = Array.length !contains_array in
- let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
- let _,j = tree_doc_ids tree.doc root in
+ let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
+ let _,j = tree_doc_ids tree.doc root in
let id = if l == 0 then if inf > j then nulldoc else inf
else array_find !contains_array inf j
- in
- tree_parent_doc tree.doc id
+ in
+ tree_parent_node tree.doc id
module DocIdSet = struct
include Set.Make (struct type t = [`Text] node
let compare = compare_node end)
-
+
end
let is_nil t = t == nil
let node_of_t t =
let _ = Tag.init (Obj.magic t) in
- let table = collect_tags t
+ let c,n,d,f = collect_tags t
in
- { doc= t;
- ttable = table;
+ { doc= t;
+ children = c;
+ siblings = n;
+ descendants = d;
+ followings = f
+
}
let finalize _ = Printf.eprintf "Release the string list !\n%!"
let parse f str =
node_of_t
- (f str
- !Options.sample_factor
+ (f str
+ !Options.sample_factor
!Options.index_empty_texts
!Options.disable_text_collection)
-
+
let parse_xml_uri str = parse parse_xml_uri str
let parse_xml_string str = parse parse_xml_string str
-
+let size t = tree_size t.doc;;
+
external pool : tree -> Tag.pool = "%identity"
let magic_string = "SXSI_INDEX"
-let version_string = "1"
+let version_string = "3"
let pos fd =
Unix.lseek fd 0 Unix.SEEK_CUR
let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
-let write fd s =
+let write fd s =
let sl = String.length s in
let ssl = Printf.sprintf "%020i" sl in
ignore (Unix.write fd ssl 0 20);
let buffer = String.create size in
let _ = really_read fd buffer 0 size in
buffer
-
+
+let save_tag_table channel t =
+ let t = Array.map (fun s -> Array.of_list (Ptset.Int.elements s)) t in
+ Marshal.to_channel channel t []
let save t str =
let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
output_char out_c '\n';
output_string out_c version_string;
output_char out_c '\n';
- Marshal.to_channel out_c t.ttable [ ];
+ save_tag_table out_c t.children;
+ save_tag_table out_c t.siblings;
+ save_tag_table out_c t.descendants;
+ save_tag_table out_c t.followings;
(* we need to move the fd to the correct position *)
flush out_c;
ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
- tree_save t.doc fd;
+ tree_save t.doc fd str;
close_out out_c
;;
+let load_tag_table channel =
+ let table : int array array = Marshal.from_channel channel in
+ Array.map (fun a -> Ptset.Int.from_list (Array.to_list a)) table
-let load ?(sample=64) str =
+let load ?(sample=64) ?(load_text=true) str =
let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
let in_c = Unix.in_channel_of_descr fd in
let _ = set_binary_mode_in in_c true in
- let load_table () =
+ let load_table () =
(let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
(let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
- let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t =
- Marshal.from_channel in_c
- in
- let ntable = Hashtbl.create (Hashtbl.length table) in
- Hashtbl.iter (fun k (s1,s2,s3,s4) ->
- let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty
- and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty
- and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty
- and ss4 = Ptset.Int.fold (Ptset.Int.add) s4 Ptset.Int.empty
- in Hashtbl.add ntable k (ss1,ss2,ss3,ss4)
- ) table;
- Hashtbl.clear table;
- (* The in_channel read a chunk of fd, so we might be after
- the start of the XMLTree save file. Reset to the correct
- position *)
- ntable
+ let c = load_tag_table in_c in
+ let s = load_tag_table in_c in
+ let d = load_tag_table in_c in
+ let f = load_tag_table in_c in
+ c,s,d,f
in
let _ = Printf.eprintf "\nLoading tag table : " in
- let ntable = time (load_table) () in
+ let c,s,d,f = time (load_table) () in
ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
- let tree = { doc = tree_load fd;
- ttable = ntable;}
+ let tree = { doc = tree_load fd str load_text sample;
+ children = c;
+ siblings = s;
+ descendants = d;
+ followings = f
+ }
in close_in in_c;
tree
-
+
let tag_pool t = pool t.doc
-
+
let compare = compare_node
let equal a b = a == b
-
+
let nts = function
-1 -> "Nil"
| i -> Printf.sprintf "Node (%i)" i
-
+
let dump_node t = nts (inode t)
let is_left t n = tree_is_first_child t.doc n
-let is_below_right t n1 n2 =
- tree_is_ancestor t.doc (tree_parent t.doc n1) n2
+
+
+let is_below_right t n1 n2 =
+ tree_is_ancestor t.doc (tree_parent t.doc n1) n2
&& not (tree_is_ancestor t.doc n1 n2)
-
-let parent t n = tree_parent t.doc n
-let first_child t = (); fun n -> tree_first_child t.doc n
-let first_element t = (); fun n -> tree_first_element t.doc n
+let is_binary_ancestor t n1 n2 =
+ let p = tree_parent t.doc n1 in
+ let fin = tree_closing t.doc p in
+ n2 > n1 && n2 < fin
+(* (is_below_right t n1 n2) ||
+ (tree_is_ancestor t.doc n1 n2) *)
+
+let parent t n = tree_parent t.doc n
+let first_child t = let doc = t.doc in ();fun n -> tree_first_child doc n
+let first_element t = let doc = t.doc in (); fun n -> tree_first_element doc n
+let first_element t n = tree_first_element t.doc n
(* these function will be called in two times: first partial application
on the tag, then application of the tag and the tree, then application of
the other arguments. We use the trick to let the compiler optimize application
let v = ptset_to_vector ts in ();
fun n -> tree_select_child t.doc n v
-let next_sibling t = (); fun n -> tree_next_sibling t.doc n
-let next_element t = (); fun n -> tree_next_element t.doc n
+let next_sibling t = let doc = t.doc in (); fun n -> tree_next_sibling doc n
+let next_element t = let doc = t.doc in (); fun n -> tree_next_element doc n
+let next_element t n = tree_next_element t.doc n
-let tagged_sibling t tag = (); fun n -> tree_tagged_sibling t.doc n tag
+let tagged_following_sibling t tag = (); fun n -> tree_tagged_following_sibling t.doc n tag
-let select_sibling t = fun ts ->
+let select_following_sibling t = fun ts ->
let v = (ptset_to_vector ts) in ();
- fun n -> tree_select_foll_sibling t.doc n v
+ fun n -> tree_select_following_sibling t.doc n v
+
+let next_sibling_below t = (); fun n _ -> tree_next_sibling t.doc n
+let next_element_below t = (); fun n _ -> tree_next_element t.doc n
-let next_sibling_ctx t = (); fun n _ -> tree_next_sibling t.doc n
-let next_element_ctx t = (); fun n _ -> tree_next_element t.doc n
-let tagged_sibling_ctx t tag = (); fun n _ -> tree_tagged_sibling t.doc n tag
+let tagged_following_sibling_below t tag = (); fun n _ -> tree_tagged_following_sibling t.doc n tag
-let select_sibling_ctx t = fun ts ->
+let select_following_sibling_below t = fun ts ->
let v = (ptset_to_vector ts) in ();
- fun n _ -> tree_select_foll_sibling t.doc n v
+ fun n _ -> tree_select_following_sibling t.doc n v
let id t n = tree_node_xml_id t.doc n
-
-let tag t n = if n == nil then Tag.nullt else tree_tag_id t.doc n
-let tagged_desc t tag = (); fun n -> tree_tagged_desc t.doc n tag
+let tag t n = if n == nil then Tag.nullt else tree_tag t.doc n
-let select_desc t = fun ts ->
+let tagged_descendant t tag =
+ let doc = t.doc in (); fun n -> tree_tagged_descendant doc n tag
+
+let select_descendant t = fun ts ->
let v = (ptset_to_vector ts) in ();
- fun n -> tree_select_desc t.doc n v
+ fun n -> tree_select_descendant t.doc n v
-let tagged_foll_ctx t tag = (); fun n ctx -> tree_tagged_foll_below t.doc n tag ctx
+let tagged_following_below t tag =
+ let doc = t.doc in
+ (); fun n ctx -> tree_tagged_following_below doc n tag ctx
-let select_foll_ctx t = fun ts ->
+let select_following_below t = fun ts ->
let v = (ptset_to_vector ts) in ();
- fun n ctx -> tree_select_foll_below t.doc n v ctx
+ fun n ctx -> tree_select_following_below t.doc n v ctx
+
+let closing t n = tree_closing t.doc n
+let is_open t n = tree_is_open t.doc n
+let get_text_id t n = tree_my_text t.doc n
let last_idx = ref 0
let array_find a i j =
else loop (idx+1) x y
in
if a.(0) > j || a.(l-1) < i then nil
- else loop !last_idx i j
+ else loop !last_idx i j
let count t s = text_count t.doc s
-
- let print_xml_fast outc tree t =
- let rec loop ?(print_right=true) t =
- if t != nil
- then
- let tagid = tree_tag_id tree.doc t in
+ let stack = ref []
+ let init_stack () = stack := []
+ let push x = stack:= x::!stack
+ let peek () = match !stack with
+ p::_ -> p
+ | _ -> failwith "peek"
+ let pop () = match !stack with
+ p::r -> stack:=r; p
+ | _ -> failwith "pop"
+
+ let next t = nodei ( (inode t) + 1 )
+ let next2 t = nodei ( (inode t) + 2 )
+ let next3 t = nodei ( (inode t) + 3 )
+
+ let print_xml_fast2 =
+ let _ = init_stack () in
+ let h = Hashtbl.create MED_H_SIZE in
+ let tag_str t = try Hashtbl.find h t with
+ Not_found -> let s = Tag.to_string t in
+ Hashtbl.add h t s;s
+ in
+ let h_att = Hashtbl.create MED_H_SIZE in
+ let att_str t = try Hashtbl.find h_att t with
+ Not_found -> let s = Tag.to_string t in
+ let attname = String.sub s 3 ((String.length s) -3) in
+ Hashtbl.add h_att t attname;attname
+ in fun outc tree t ->
+ let tree = tree.doc in
+ let fin = tree_closing tree t in
+ let rec loop_tag t tag =
+ if t <= fin then
+ if tree_is_open tree t then
+ (* opening tag *)
+ if tag == Tag.pcdata then
+ begin
+ output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
+ loop (next2 t) (* skip closing $ *)
+ end
+ else
+ let tagstr = tag_str tag in
+ let _ = output_char outc '<';
+ output_string outc tagstr in
+ let t' = next t in
+ if tree_is_open tree t' then
+ let _ = push tagstr in
+ let tag' = tree_tag tree t' in
+ if tag' == Tag.attribute then let t'' = loop_attr (next t') 0 in
+ output_string outc ">"; loop t'' else (output_string outc ">";loop_tag t' tag')
+ else (* closing with no content *)
+ let _ = output_string outc "/>" in
+ loop (next t')
+ else
+ begin
+ (* closing tag *)
+ output_string outc "</";
+ output_string outc (pop());
+ output_char outc '>';
+ loop (next t);
+ end
+ and loop t = loop_tag t (tree_tag tree t)
+ and loop_attr t n =
+ if tree_is_open tree t then
+ let attname = att_str (tree_tag tree t) in
+ output_char outc ' ';
+ output_string outc attname;
+ output_string outc "=\"";
+ let t = next t in (* open $@ *)
+ output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
+ output_char outc '"';
+ loop_attr (next3 t) (n+1)
+ else
+ next t (* close @ *)
+ in loop t
+
+ let print_xml_fast =
+ let h = Hashtbl.create MED_H_SIZE in
+ let tag_str t = try Hashtbl.find h t with
+ Not_found -> let s = Tag.to_string t in
+ Hashtbl.add h t s;s
+ in
+ let h_att = Hashtbl.create MED_H_SIZE in
+ let att_str t = try Hashtbl.find h_att t with
+ Not_found -> let s = Tag.to_string t in
+ let attname = String.sub s 3 ((String.length s) -3) in
+ Hashtbl.add h_att t attname;attname
+ in fun outc tree t ->
+ let rec loop ?(print_right=true) t =
+ if t != nil
+ then
+ let tagid = tree_tag tree.doc t in
if tagid==Tag.pcdata
- then
+ then
begin
- let tid = tree_my_text tree.doc t in
- let _ = Printf.eprintf "my_text %i returned %i\n%!" (inode t) (inode tid)
- in
- output_string outc (text_get_cached_text tree.doc tid);
+ let tid = tree_my_text_unsafe tree.doc t in
+ output_string outc (text_get_text tree.doc tid);
if print_right
then loop (next_sibling tree t);
end
else
- let tagstr = Tag.to_string tagid in
- let l = first_child tree t
- and r = next_sibling tree t
+ let tagstr = tag_str tagid in
+ let l = first_child tree t
+ and r = next_sibling tree t
in
output_char outc '<';
- output_string outc tagstr;
+ output_string outc tagstr;
if l == nil then output_string outc "/>"
- else
+ else
if (tag tree l) == Tag.attribute then
begin
loop_attributes (first_child tree l);
if (next_sibling tree l) == nil then output_string outc "/>"
- else
- begin
- output_char outc '>';
+ else
+ begin
+ output_char outc '>';
loop (next_sibling tree l);
output_string outc "</";
output_string outc tagstr;
end
else
begin
- output_char outc '>';
+ output_char outc '>';
loop l;
output_string outc "</";
output_string outc tagstr;
output_char outc '>';
end;
if print_right then loop r
- and loop_attributes a =
+ and loop_attributes a =
if a != nil
then
- let s = (Tag.to_string (tag tree a)) in
- let attname = String.sub s 3 ((String.length s) -3) in
+ let attname = att_str (tag tree a) in
let fsa = first_child tree a in
- let tid = tree_my_text tree.doc fsa in
- let _ = Printf.eprintf "my_text %i returned %i\n%!" (inode fsa) (inode tid)
- in
+ let tid = tree_my_text_unsafe tree.doc fsa in
output_char outc ' ';
output_string outc attname;
output_string outc "=\"";
- output_string outc (text_get_cached_text tree.doc tid);
+ output_string outc (text_get_text tree.doc tid);
output_char outc '"';
loop_attributes (next_sibling tree a)
in
loop ~print_right:false t
-
-
- let print_xml_fast outc tree t =
+
+
+ let print_xml_fast outc tree t =
if (tag tree t) = Tag.document_node then
print_xml_fast outc tree (first_child tree t)
- else print_xml_fast outc tree t
-
-let tags_children t tag =
- let a,_,_,_ = Hashtbl.find t.ttable tag in a
-let tags_below t tag =
- let _,a,_,_ = Hashtbl.find t.ttable tag in a
-let tags_siblings t tag =
- let _,_,a,_ = Hashtbl.find t.ttable tag in a
-let tags_after t tag =
- let _,_,_,a = Hashtbl.find t.ttable tag in a
+ else print_xml_fast outc tree t
+
+let tags_children t tag = t.children.(tag)
+
+let tags_below t tag = t.descendants.(tag)
+
+let tags_siblings t tag = t.siblings.(tag)
+
+let tags_after t tag = t.followings.(tag)
+
-let tags t tag = Hashtbl.find t.ttable tag
+let tags t tag =
+ t.children.(tag),
+ t.descendants.(tag),
+ t.siblings.(tag),
+ t.followings.(tag)
-let rec binary_parent t n =
- let r =
+let rec binary_parent t n =
+ let r =
if tree_is_first_child t.doc n
then tree_parent t.doc n
else tree_prev_sibling t.doc n
- in if tree_tag_id t.doc r = Tag.pcdata then
+ in if tree_tag t.doc r = Tag.pcdata then
binary_parent t r
else r
let get_text t n =
let tid = tree_my_text t.doc n in
- if tid == nulldoc then "" else
- text_get_cached_text t.doc tid
+ if tid == nulldoc then "" else
+ text_get_text t.doc tid
-let dump_tree fmt tree =
+let dump_tree fmt tree =
let rec loop t n =
if t != nil then
- let tag = (tree_tag_id tree.doc t ) in
+ let tag = (tree_tag tree.doc t ) in
let tagstr = Tag.to_string tag in
let tab = String.make n ' ' in
- if tag == Tag.pcdata || tag == Tag.attribute_data
- then
- Format.fprintf fmt "%s<%s>%s</%s>\n"
- tab tagstr (text_get_cached_text tree.doc (tree_my_text tree.doc t)) tagstr
+ if tag == Tag.pcdata || tag == Tag.attribute_data
+ then
+ Format.fprintf fmt "%s<%s>%s</%s>\n"
+ tab tagstr (text_get_text tree.doc (tree_my_text tree.doc t)) tagstr
else begin
Format.fprintf fmt "%s<%s>\n" tab tagstr;
loop (tree_first_child tree.doc t) (n+2);
loop root 0
;;
-
+
+let print_xml_fast3 t = tree_print_xml_fast3 t.doc
+
+
+
+
+let stats t =
+ let tree = t.doc in
+ let rec loop left node acc_d total_d num_leaves =
+ if node == nil then
+ (acc_d+total_d,if left then num_leaves+1 else num_leaves)
+ else
+ let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in
+ loop false (tree_next_sibling tree node) (acc_d) d td
+ in
+ let a,b = loop true root 0 0 0
+ in
+ Printf.eprintf "Average depth: %f, number of leaves %i\n%!" ((float_of_int a)/. (float_of_int b)) b
+;;
+
+
+
+
+
+
+let test_prefix t s = Array.length (text_prefix t.doc s)
+let test_suffix t s = Array.length (text_suffix t.doc s)
+let test_contains t s = Array.length (text_contains t.doc s)
+let test_equals t s = Array.length (text_equals t.doc s)