1 (******************************************************************************)
2 (* SXSI : XPath evaluator *)
3 (* Kim Nguyen (Kim.Nguyen@nicta.com.au) *)
4 (* Copyright NICTA 2008 *)
5 (* Distributed under the terms of the LGPL (see LICENCE) *)
6 (******************************************************************************)
10 external init_lib : unit -> unit = "caml_init_lib"
12 exception CPlusPlusError of string
14 let () = Callback.register_exception "CPlusPlusError" (CPlusPlusError "")
20 type 'a node = private int
21 type node_kind = [`Text | `Tree ]
25 children : Ptset.Int.t array;
26 siblings : Ptset.Int.t array;
27 descendants: Ptset.Int.t array;
28 followings: Ptset.Int.t array;
31 external inode : 'a node -> int = "%identity"
32 external nodei : int -> 'a node = "%identity"
33 let compare_node x y = (inode x) - (inode y)
34 let equal_node : 'a node -> 'a node -> bool = (==)
37 external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
38 external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
39 external tree_print_xml_fast3 : tree -> [`Tree ] node -> Unix.file_descr ->unit = "caml_xml_tree_print"
40 external tree_save : tree -> Unix.file_descr -> string -> unit = "caml_xml_tree_save"
41 external tree_load : Unix.file_descr -> string -> bool -> int -> tree = "caml_xml_tree_load"
43 external nullt : unit -> 'a node = "caml_xml_tree_nullt"
45 let nil : [`Tree ] node = nodei ~-1
46 let nulldoc : [`Text ] node = nodei ~-1
47 let root : [`Tree ] node = nodei 0
49 external text_get_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
50 external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
52 let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n
54 external text_is_prefix : tree -> string -> bool = "caml_text_collection_is_prefix"
55 external text_is_suffix : tree -> string -> bool = "caml_text_collection_is_suffix"
56 external text_is_equal : tree -> string -> bool = "caml_text_collection_is_equal"
57 external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
58 external text_is_lessthan : tree -> string -> bool = "caml_text_collection_is_lessthan"
60 external text_count : tree -> string -> int = "caml_text_collection_count"
61 external text_count_prefix : tree -> string -> int = "caml_text_collection_count_prefix"
62 external text_count_suffix : tree -> string -> int = "caml_text_collection_count_suffix"
63 external text_count_equal : tree -> string -> int = "caml_text_collection_count_equal"
64 external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
65 external text_count_lessthan : tree -> string -> int = "caml_text_collection_count_lessthan"
67 external text_prefix : tree -> string -> [`Text ] node array = "caml_text_collection_prefix"
68 external text_suffix : tree -> string -> [`Text ] node array = "caml_text_collection_suffix"
69 external text_equals : tree -> string -> [`Text ] node array = "caml_text_collection_equals"
70 external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
71 external text_lessthan : tree -> string -> [`Text ] node array = "caml_text_collection_lessthan"
74 external tree_root : tree -> [`Tree] node = "caml_xml_tree_root" "noalloc"
75 external tree_size : tree -> int = "caml_xml_tree_size" "noalloc"
76 external tree_num_tags : tree -> int = "caml_xml_tree_num_tags" "noalloc"
77 external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size" "noalloc"
78 external tree_subtree_elements : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_elements" "noalloc"
79 external tree_subtree_tags : tree -> [`Tree] node -> Tag.t -> int = "caml_xml_tree_subtree_elements" "noalloc"
81 let tree_is_nil x = equal_node x nil
82 external tree_is_leaf : tree -> [`Tree ] node -> bool = "caml_xml_tree_is_leaf" "noalloc"
83 external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc"
84 external tree_is_child : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_child" "noalloc"
85 external tree_is_first_child : tree -> [`Tree ] node -> bool = "caml_xml_tree_is_first_child" "noalloc"
86 external tree_num_children : tree -> [`Tree ] node -> int = "caml_xml_tree_num_children" "noalloc"
87 external tree_child_number : tree -> [`Tree ] node -> int = "caml_xml_tree_child_number" "noalloc"
88 external tree_depth : tree -> [`Tree ] node -> int = "caml_xml_tree_depth" "noalloc"
89 external tree_preorder : tree -> [`Tree ] node -> int = "caml_xml_tree_preorder" "noalloc"
90 external tree_postorder : tree -> [`Tree ] node -> int = "caml_xml_tree_postorder" "noalloc"
91 external tree_tag : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag" "noalloc"
92 external tree_doc_ids : tree -> [`Tree ] node -> [`Text] node*[`Text] node = "caml_xml_tree_doc_ids"
94 external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" "noalloc"
95 external tree_child : tree -> [`Tree] node -> int -> [`Tree] node = "caml_xml_tree_child" "noalloc"
96 external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" "noalloc"
97 external tree_first_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_element" "noalloc"
98 external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc"
99 external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc"
100 external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc"
101 external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc"
102 external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
105 external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
106 external unordered_set_length : unordered_set -> int = "caml_unordered_set_length"
107 external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc"
109 external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc"
110 external tree_tagged_following_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_following_sibling" "noalloc"
111 external tree_select_following_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_following_sibling" "noalloc"
112 external tree_tagged_descendant : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_descendant" "noalloc"
113 external tree_select_descendant : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_descendant" "noalloc"
114 external tree_tagged_following : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_following" "noalloc"
115 external tree_tagged_following_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_following_below" "noalloc"
116 external tree_select_following_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_following_below" "noalloc"
119 external tree_tagged_following_before : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_following_before" "noalloc"
120 external tree_select_following_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_following_before" "noalloc"
122 external tree_my_text : tree -> [`Tree ] node -> [`Text] node = "caml_xml_tree_my_text" "noalloc"
123 external tree_my_text_unsafe : tree -> [`Tree ] node -> [`Text] node = "caml_xml_tree_my_text_unsafe" "noalloc"
124 external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc"
125 external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc"
127 external tree_parent_node : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_node" "noalloc"
129 (*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *)
131 external tree_closing : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_closing" "noalloc"
132 external tree_is_open : tree -> [`Tree] node -> bool = "caml_xml_tree_is_open" "noalloc"
135 external benchmark_jump : tree -> Tag.t -> int = "caml_benchmark_jump" "noalloc"
137 let benchmark_jump t s = benchmark_jump t.doc s
139 external benchmark_fcns : tree -> int = "caml_benchmark_fcns" "noalloc"
140 external benchmark_fene : tree -> int = "caml_benchmark_fene" "noalloc"
141 external benchmark_iter : tree -> int = "caml_benchmark_iter" "noalloc"
143 let benchmark_fcns t = benchmark_fcns t.doc
145 let benchmark_fene t = benchmark_fene t.doc
147 let benchmark_iter t = benchmark_iter t.doc
149 external benchmark_lcps : tree -> unit = "caml_benchmark_lcps" "noalloc"
151 let benchmark_lcps t = benchmark_lcps t.doc
159 let text_size tree = inode (snd ( tree_doc_ids tree root ))
161 let text_get_text t (x:[`Text] node) =
162 if x == nulldoc then ""
163 else text_get_text t x
168 module HPtset = Hashtbl.Make(Ptset.Int)
170 let vector_htbl = HPtset.create MED_H_SIZE
172 let ptset_to_vector s =
174 HPtset.find vector_htbl s
177 let v = unordered_set_alloc (Ptset.Int.cardinal s) in
178 let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in
179 HPtset.add vector_htbl s v; v
183 let subtree_size t i = tree_subtree_size t.doc i
184 let subtree_elements t i = tree_subtree_elements t.doc i
185 let text_size t = text_size t.doc
188 let rec fold_siblings tree f node acc =
189 if node == nil then acc else fold_siblings tree f (tree_next_sibling tree node) (f node acc)
193 let create n = Array.create n false
194 let add e a = a.(e) <- true; a
196 for i = 0 to Array.length a - 1 do
197 a.(i) <- a.(i) || b.(i)
200 for i = 0 to Array.length a - 1 do
205 let r = ref Ptset.Int.empty in
206 for i = 0 to Array.length a - 1 do
207 r := Ptset.Int.add i !r;
213 let collect_children_siblings tree =
214 let ntags = (tree_num_tags tree) in
215 let () = Printf.eprintf ">>>length: %i\n%!" ntags in
216 let table_c = Array.init (tree_num_tags tree) (fun _ -> TS.create ntags) in
217 let table_n = Array.init (tree_num_tags tree) (fun _ -> TS.create ntags) in
218 let acc_tag n s = TS.add (tree_tag tree n) s in
220 let size = tree_subtree_size tree root in
221 let tmp = TS.create ntags in
223 if node == nil then ()
225 let () = if !count mod 10000 == 0 then
226 Printf.eprintf "Node %i / %i\n%!" !count size;
228 let () = if !count mod 1000000 == 0 then Gc.compact() in
229 let () = count := !count + 1 in
230 let tag = tree_tag tree node in
231 let () = TS.clear tmp in
235 (tree_first_child tree node) tmp
237 let () = TS.merge table_c.(tag) children in
238 let () = TS.clear tmp in
242 (tree_next_sibling tree node) tmp
244 TS.merge table_n.(tag) siblings;
245 loop (tree_first_child tree node);
246 loop (tree_next_sibling tree node)
249 ( Array.map TS.to_ptset table_c,
250 Array.map TS.to_ptset table_n )
252 let collect_children_siblings tree =
253 let table_c = Array.create (tree_num_tags tree) Ptset.Int.empty in
254 let table_n = Array.copy table_c in
256 if node == nil then Ptset.Int.empty
258 let children = loop (tree_first_child tree node) in
259 let tag = tree_tag tree node in
260 let () = table_c.(tag) <- Ptset.Int.union table_c.(tag) children in
261 let siblings = loop (tree_next_sibling tree node) in
262 Ptset.Int.add tag siblings
270 let collect_descendants tree =
271 let table_d = Array.create (tree_num_tags tree) Ptset.Int.empty in
273 if node == nil then Ptset.Int.empty
275 let d1 = loop (tree_first_child tree node) in
276 let d2 = loop (tree_next_sibling tree node) in
277 let tag = tree_tag tree node in
278 table_d.(tag) <- Ptset.Int.union table_d.(tag) d1;
279 Ptset.Int.add tag (Ptset.Int.union d1 d2)
284 let collect_followings tree =
285 let table_f = Array.create (tree_num_tags tree) Ptset.Int.empty in
286 let rec loop node acc =
287 if node == nil then acc else
288 let f1 = loop (tree_next_sibling tree node) acc in
289 let f2 = loop (tree_first_child tree node) f1 in
290 let tag = tree_tag tree node in
291 table_f.(tag) <- Ptset.Int.union table_f.(tag) f1;
292 Ptset.Int.add tag (Ptset.Int.union f1 f2)
294 ignore (loop root Ptset.Int.empty);
297 let collect_tags tree =
298 let c,n = time (collect_children_siblings) tree ~msg:"Collecting child and sibling tags" in
299 let d = time collect_descendants tree ~msg:"Collecting descendant tags" in
300 let f = time collect_followings tree ~msg:"Collecting following tags" in
303 let contains_array = ref [| |]
304 let contains_index = Hashtbl.create 4096
307 Hashtbl.find contains_index i
311 let init_textfun f t s =
313 | `CONTAINS -> text_contains t.doc s
314 | `STARTSWITH -> text_prefix t.doc s
315 | `ENDSWITH -> text_suffix t.doc s
316 | `EQUALS -> text_equals t.doc s
318 (*Array.fast_sort (compare) a; *)
320 Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
322 let count_contains t s = text_count_contains t.doc s
324 let init_naive_contains t s =
325 let i,j = tree_doc_ids t.doc (tree_root t.doc)
327 let regexp = Str.regexp_string s in
330 let _ = Str.search_forward regexp arg 0;
334 let rec loop n acc l =
337 let s = text_get_text t.doc n
340 then loop (nodei ((inode n)+1)) (n::acc) (l+1)
341 else loop (nodei ((inode n)+1)) acc l
343 let acc,l = loop i [] 0 in
344 let a = Array.create l nulldoc in
345 let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc
351 let array_find a i j =
352 let l = Array.length a in
353 let rec loop idx x y =
354 if x > y || idx >= l then nulldoc
356 if a.(idx) >= x then if a.(idx) > y then nulldoc else (last_idx := idx;a.(idx))
357 else loop (idx+1) x y
359 if a.(0) > j || a.(l-1) < i then nulldoc
360 else loop !last_idx i j
362 let text_below tree t =
363 let l = Array.length !contains_array in
364 let i,j = tree_doc_ids tree.doc t in
365 let id = if l == 0 then i else (array_find !contains_array i j) in
366 tree_parent_node tree.doc id
368 let text_next tree t root =
369 let l = Array.length !contains_array in
370 let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
371 let _,j = tree_doc_ids tree.doc root in
372 let id = if l == 0 then if inf > j then nulldoc else inf
373 else array_find !contains_array inf j
375 tree_parent_node tree.doc id
379 module DocIdSet = struct
380 include Set.Make (struct type t = [`Text] node
381 let compare = compare_node end)
384 let is_nil t = t == nil
386 let is_node t = t != nil
387 let is_root t = t == root
390 let _ = Tag.init (Obj.magic t) in
391 let c,n,d,f = collect_tags t
401 let finalize _ = Printf.eprintf "Release the string list !\n%!"
407 !Options.sample_factor
408 !Options.index_empty_texts
409 !Options.disable_text_collection)
411 let parse_xml_uri str = parse parse_xml_uri str
412 let parse_xml_string str = parse parse_xml_string str
414 let size t = tree_size t.doc;;
416 external pool : tree -> Tag.pool = "%identity"
418 let magic_string = "SXSI_INDEX"
419 let version_string = "3"
422 Unix.lseek fd 0 Unix.SEEK_CUR
424 let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
427 let sl = String.length s in
428 let ssl = Printf.sprintf "%020i" sl in
429 ignore (Unix.write fd ssl 0 20);
430 ignore (Unix.write fd s 0 (String.length s))
432 let rec really_read fd buffer start length =
433 if length <= 0 then () else
434 match Unix.read fd buffer start length with
435 0 -> raise End_of_file
436 | r -> really_read fd buffer (start + r) (length - r);;
439 let buffer = String.create 20 in
440 let _ = really_read fd buffer 0 20 in
441 let size = int_of_string buffer in
442 let buffer = String.create size in
443 let _ = really_read fd buffer 0 size in
446 let save_tag_table channel t =
447 let t = Array.map (fun s -> Array.of_list (Ptset.Int.elements s)) t in
448 Marshal.to_channel channel t []
451 let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
452 let out_c = Unix.out_channel_of_descr fd in
453 let _ = set_binary_mode_out out_c true in
454 output_string out_c magic_string;
455 output_char out_c '\n';
456 output_string out_c version_string;
457 output_char out_c '\n';
458 save_tag_table out_c t.children;
459 save_tag_table out_c t.siblings;
460 save_tag_table out_c t.descendants;
461 save_tag_table out_c t.followings;
462 (* we need to move the fd to the correct position *)
464 ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
465 tree_save t.doc fd str;
468 let load_tag_table channel =
469 let table : int array array = Marshal.from_channel channel in
470 Array.map (fun a -> Ptset.Int.from_list (Array.to_list a)) table
472 let load ?(sample=64) ?(load_text=true) str =
473 let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
474 let in_c = Unix.in_channel_of_descr fd in
475 let _ = set_binary_mode_in in_c true in
477 (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
478 (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
479 let c = load_tag_table in_c in
480 let s = load_tag_table in_c in
481 let d = load_tag_table in_c in
482 let f = load_tag_table in_c in
485 let _ = Printf.eprintf "\nLoading tag table : " in
486 let c,s,d,f = time (load_table) () in
487 ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
488 let tree = { doc = tree_load fd str load_text sample;
500 let tag_pool t = pool t.doc
502 let compare = compare_node
504 let equal a b = a == b
508 | i -> Printf.sprintf "Node (%i)" i
510 let dump_node t = nts (inode t)
512 let is_left t n = tree_is_first_child t.doc n
516 let is_below_right t n1 n2 =
517 tree_is_ancestor t.doc (tree_parent t.doc n1) n2
518 && not (tree_is_ancestor t.doc n1 n2)
520 let is_binary_ancestor t n1 n2 =
521 let p = tree_parent t.doc n1 in
522 let fin = tree_closing t.doc p in
524 (* (is_below_right t n1 n2) ||
525 (tree_is_ancestor t.doc n1 n2) *)
527 let parent t n = tree_parent t.doc n
529 let first_child t = let doc = t.doc in ();fun n -> tree_first_child doc n
530 let first_element t = let doc = t.doc in (); fun n -> tree_first_element doc n
531 let first_element t n = tree_first_element t.doc n
532 (* these function will be called in two times: first partial application
533 on the tag, then application of the tag and the tree, then application of
534 the other arguments. We use the trick to let the compiler optimize application
537 let tagged_child t tag = () ; fun n -> tree_tagged_child t.doc n tag
539 let select_child t = fun ts ->
540 let v = ptset_to_vector ts in ();
541 fun n -> tree_select_child t.doc n v
543 let next_sibling t = let doc = t.doc in (); fun n -> tree_next_sibling doc n
544 let next_element t = let doc = t.doc in (); fun n -> tree_next_element doc n
545 let next_element t n = tree_next_element t.doc n
547 let tagged_following_sibling t tag = (); fun n -> tree_tagged_following_sibling t.doc n tag
549 let select_following_sibling t = fun ts ->
550 let v = (ptset_to_vector ts) in ();
551 fun n -> tree_select_following_sibling t.doc n v
553 let next_sibling_below t = (); fun n _ -> tree_next_sibling t.doc n
554 let next_element_below t = (); fun n _ -> tree_next_element t.doc n
556 let tagged_following_sibling_below t tag = (); fun n _ -> tree_tagged_following_sibling t.doc n tag
558 let select_following_sibling_below t = fun ts ->
559 let v = (ptset_to_vector ts) in ();
560 fun n _ -> tree_select_following_sibling t.doc n v
562 let id t n = tree_node_xml_id t.doc n
564 let tag t n = if n == nil then Tag.nullt else tree_tag t.doc n
566 let tagged_descendant t tag =
567 let doc = t.doc in (); fun n -> tree_tagged_descendant doc n tag
569 let select_descendant t = fun ts ->
570 let v = (ptset_to_vector ts) in ();
571 fun n -> tree_select_descendant t.doc n v
573 let tagged_following_below t tag =
575 (); fun n ctx -> tree_tagged_following_below doc n tag ctx
577 let select_following_below t = fun ts ->
578 let v = (ptset_to_vector ts) in ();
579 fun n ctx -> tree_select_following_below t.doc n v ctx
581 let closing t n = tree_closing t.doc n
582 let is_open t n = tree_is_open t.doc n
583 let get_text_id t n = tree_my_text t.doc n
586 let array_find a i j =
587 let l = Array.length a in
588 let rec loop idx x y =
589 if x > y || idx >= l then nil
591 if a.(idx) >= x then if a.(idx) > y then nil else (last_idx := idx;a.(idx))
592 else loop (idx+1) x y
594 if a.(0) > j || a.(l-1) < i then nil
595 else loop !last_idx i j
599 let count t s = text_count t.doc s
601 let init_stack () = stack := []
602 let push x = stack:= x::!stack
603 let peek () = match !stack with
605 | _ -> failwith "peek"
606 let pop () = match !stack with
608 | _ -> failwith "pop"
610 let next t = nodei ( (inode t) + 1 )
611 let next2 t = nodei ( (inode t) + 2 )
612 let next3 t = nodei ( (inode t) + 3 )
614 let print_xml_fast2 =
615 let _ = init_stack () in
616 let h = Hashtbl.create MED_H_SIZE in
617 let tag_str t = try Hashtbl.find h t with
618 Not_found -> let s = Tag.to_string t in
621 let h_att = Hashtbl.create MED_H_SIZE in
622 let att_str t = try Hashtbl.find h_att t with
623 Not_found -> let s = Tag.to_string t in
624 let attname = String.sub s 3 ((String.length s) -3) in
625 Hashtbl.add h_att t attname;attname
626 in fun outc tree t ->
627 let tree = tree.doc in
628 let fin = tree_closing tree t in
629 let rec loop_tag t tag =
631 if tree_is_open tree t then
633 if tag == Tag.pcdata then
635 output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
636 loop (next2 t) (* skip closing $ *)
639 let tagstr = tag_str tag in
640 let _ = output_char outc '<';
641 output_string outc tagstr in
643 if tree_is_open tree t' then
644 let _ = push tagstr in
645 let tag' = tree_tag tree t' in
646 if tag' == Tag.attribute then let t'' = loop_attr (next t') 0 in
647 output_string outc ">"; loop t'' else (output_string outc ">";loop_tag t' tag')
648 else (* closing with no content *)
649 let _ = output_string outc "/>" in
654 output_string outc "</";
655 output_string outc (pop());
656 output_char outc '>';
659 and loop t = loop_tag t (tree_tag tree t)
661 if tree_is_open tree t then
662 let attname = att_str (tree_tag tree t) in
663 output_char outc ' ';
664 output_string outc attname;
665 output_string outc "=\"";
666 let t = next t in (* open $@ *)
667 output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
668 output_char outc '"';
669 loop_attr (next3 t) (n+1)
675 let h = Hashtbl.create MED_H_SIZE in
676 let tag_str t = try Hashtbl.find h t with
677 Not_found -> let s = Tag.to_string t in
680 let h_att = Hashtbl.create MED_H_SIZE in
681 let att_str t = try Hashtbl.find h_att t with
682 Not_found -> let s = Tag.to_string t in
683 let attname = String.sub s 3 ((String.length s) -3) in
684 Hashtbl.add h_att t attname;attname
685 in fun outc tree t ->
686 let rec loop ?(print_right=true) t =
689 let tagid = tree_tag tree.doc t in
693 let tid = tree_my_text_unsafe tree.doc t in
694 output_string outc (text_get_text tree.doc tid);
696 then loop (next_sibling tree t);
699 let tagstr = tag_str tagid in
700 let l = first_child tree t
701 and r = next_sibling tree t
703 output_char outc '<';
704 output_string outc tagstr;
705 if l == nil then output_string outc "/>"
707 if (tag tree l) == Tag.attribute then
709 loop_attributes (first_child tree l);
710 if (next_sibling tree l) == nil then output_string outc "/>"
713 output_char outc '>';
714 loop (next_sibling tree l);
715 output_string outc "</";
716 output_string outc tagstr;
717 output_char outc '>';
722 output_char outc '>';
724 output_string outc "</";
725 output_string outc tagstr;
726 output_char outc '>';
728 if print_right then loop r
729 and loop_attributes a =
732 let attname = att_str (tag tree a) in
733 let fsa = first_child tree a in
734 let tid = tree_my_text_unsafe tree.doc fsa in
735 output_char outc ' ';
736 output_string outc attname;
737 output_string outc "=\"";
738 output_string outc (text_get_text tree.doc tid);
739 output_char outc '"';
740 loop_attributes (next_sibling tree a)
742 loop ~print_right:false t
745 let print_xml_fast outc tree t =
746 if (tag tree t) = Tag.document_node then
747 print_xml_fast outc tree (first_child tree t)
748 else print_xml_fast outc tree t
750 let tags_children t tag = t.children.(tag)
752 let tags_below t tag = t.descendants.(tag)
754 let tags_siblings t tag = t.siblings.(tag)
756 let tags_after t tag = t.followings.(tag)
767 let rec binary_parent t n =
769 if tree_is_first_child t.doc n
770 then tree_parent t.doc n
771 else tree_prev_sibling t.doc n
772 in if tree_tag t.doc r = Tag.pcdata then
776 let doc_ids t n = tree_doc_ids t.doc n
778 let subtree_tags t tag = ();
779 fun n -> if n == nil then 0 else
780 tree_subtree_tags t.doc n tag
783 let tid = tree_my_text t.doc n in
784 if tid == nulldoc then "" else
785 text_get_text t.doc tid
788 let dump_tree fmt tree =
791 let tag = (tree_tag tree.doc t ) in
792 let tagstr = Tag.to_string tag in
793 let tab = String.make n ' ' in
795 if tag == Tag.pcdata || tag == Tag.attribute_data
797 Format.fprintf fmt "%s<%s>%s</%s>\n"
798 tab tagstr (text_get_text tree.doc (tree_my_text tree.doc t)) tagstr
800 Format.fprintf fmt "%s<%s>\n" tab tagstr;
801 loop (tree_first_child tree.doc t) (n+2);
802 Format.fprintf fmt "%s</%s>\n%!" tab tagstr;
804 loop (tree_next_sibling tree.doc t) n
810 let print_xml_fast3 t = tree_print_xml_fast3 t.doc
817 let rec loop left node acc_d total_d num_leaves =
819 (acc_d+total_d,if left then num_leaves+1 else num_leaves)
821 let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in
822 loop false (tree_next_sibling tree node) (acc_d) d td
824 let a,b = loop true root 0 0 0
826 Printf.eprintf "Average depth: %f, number of leaves %i\n%!" ((float_of_int a)/. (float_of_int b)) b
834 let test_prefix t s = Array.length (text_prefix t.doc s)
835 let test_suffix t s = Array.length (text_suffix t.doc s)
836 let test_contains t s = Array.length (text_contains t.doc s)
837 let test_equals t s = Array.length (text_equals t.doc s)