1 (******************************************************************************)
2 (* SXSI : XPath evaluator *)
3 (* Kim Nguyen (Kim.Nguyen@nicta.com.au) *)
4 (* Copyright NICTA 2008 *)
5 (* Distributed under the terms of the LGPL (see LICENCE) *)
6 (******************************************************************************)
10 external init_lib : unit -> unit = "caml_init_lib"
12 exception CPlusPlusError of string
14 let () = Callback.register_exception "CPlusPlusError" (CPlusPlusError "")
20 type 'a node = private int
21 type node_kind = [`Text | `Tree ]
25 ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
28 external inode : 'a node -> int = "%identity"
29 external nodei : int -> 'a node = "%identity"
30 let compare_node x y = (inode x) - (inode y)
31 let equal_node : 'a node -> 'a node -> bool = (==)
34 external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
35 external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
36 external tree_print_xml_fast3 : tree -> [`Tree ] node -> Unix.file_descr ->unit = "caml_xml_tree_print"
37 external tree_save : tree -> Unix.file_descr -> unit = "caml_xml_tree_save"
38 external tree_load : Unix.file_descr -> bool -> int -> tree = "caml_xml_tree_load"
40 external nullt : unit -> 'a node = "caml_xml_tree_nullt"
42 let nil : [`Tree ] node = nodei ~-1
43 let nulldoc : [`Text ] node = nodei ~-1
44 let root : [`Tree ] node = nodei 0
46 external text_get_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
47 external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
49 let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n
51 external text_is_prefix : tree -> string -> bool = "caml_text_collection_is_prefix"
52 external text_is_suffix : tree -> string -> bool = "caml_text_collection_is_suffix"
53 external text_is_equal : tree -> string -> bool = "caml_text_collection_is_equal"
54 external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
55 external text_is_lessthan : tree -> string -> bool = "caml_text_collection_is_lessthan"
57 external text_count : tree -> string -> int = "caml_text_collection_count"
58 external text_count_prefix : tree -> string -> int = "caml_text_collection_count_prefix"
59 external text_count_suffix : tree -> string -> int = "caml_text_collection_count_suffix"
60 external text_count_equal : tree -> string -> int = "caml_text_collection_count_equal"
61 external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
62 external text_count_lessthan : tree -> string -> int = "caml_text_collection_count_lessthan"
64 external text_prefix : tree -> string -> [`Text ] node array = "caml_text_collection_prefix"
65 external text_suffix : tree -> string -> [`Text ] node array = "caml_text_collection_suffix"
66 external text_equals : tree -> string -> [`Text ] node array = "caml_text_collection_equals"
67 external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
68 external text_lessthan : tree -> string -> [`Text ] node array = "caml_text_collection_lessthan"
71 external tree_root : tree -> [`Tree] node = "caml_xml_tree_root" "noalloc"
72 external tree_size : tree -> int = "caml_xml_tree_size" "noalloc"
73 external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size" "noalloc"
74 external tree_subtree_elements : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_elements" "noalloc"
75 external tree_subtree_tags : tree -> [`Tree] node -> Tag.t -> int = "caml_xml_tree_subtree_elements" "noalloc"
77 let tree_is_nil x = equal_node x nil
78 external tree_is_leaf : tree -> [`Tree ] node -> bool = "caml_xml_tree_is_leaf" "noalloc"
79 external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc"
80 external tree_is_child : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_child" "noalloc"
81 external tree_is_first_child : tree -> [`Tree ] node -> bool = "caml_xml_tree_is_first_child" "noalloc"
82 external tree_num_children : tree -> [`Tree ] node -> int = "caml_xml_tree_num_children" "noalloc"
83 external tree_child_number : tree -> [`Tree ] node -> int = "caml_xml_tree_child_number" "noalloc"
84 external tree_depth : tree -> [`Tree ] node -> int = "caml_xml_tree_depth" "noalloc"
85 external tree_preorder : tree -> [`Tree ] node -> int = "caml_xml_tree_preorder" "noalloc"
86 external tree_postorder : tree -> [`Tree ] node -> int = "caml_xml_tree_postorder" "noalloc"
87 external tree_tag : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag" "noalloc"
88 external tree_doc_ids : tree -> [`Tree ] node -> [`Text] node*[`Text] node = "caml_xml_tree_doc_ids"
90 external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" "noalloc"
91 external tree_child : tree -> [`Tree] node -> int -> [`Tree] node = "caml_xml_tree_child" "noalloc"
92 external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" "noalloc"
93 external tree_first_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_element" "noalloc"
94 external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc"
95 external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc"
96 external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc"
97 external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc"
98 external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
101 external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
102 external unordered_set_length : unordered_set -> int = "caml_unordered_set_length"
103 external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc"
105 external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc"
106 external tree_tagged_following_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_following_sibling" "noalloc"
107 external tree_select_following_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_following_sibling" "noalloc"
108 external tree_tagged_descendant : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_descendant" "noalloc"
109 external tree_select_descendant : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_descendant" "noalloc"
110 external tree_tagged_following : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_following" "noalloc"
111 external tree_tagged_following_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_following_below" "noalloc"
112 external tree_select_following_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_following_below" "noalloc"
115 external tree_tagged_following_before : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_following_before" "noalloc"
116 external tree_select_following_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_following_before" "noalloc"
118 external tree_my_text : tree -> [`Tree ] node -> [`Text] node = "caml_xml_tree_my_text" "noalloc"
119 external tree_my_text_unsafe : tree -> [`Tree ] node -> [`Text] node = "caml_xml_tree_my_text_unsafe" "noalloc"
120 external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc"
121 external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc"
123 external tree_parent_node : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_node" "noalloc"
125 (*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *)
127 external tree_closing : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_closing" "noalloc"
128 external tree_is_open : tree -> [`Tree] node -> bool = "caml_xml_tree_is_open" "noalloc"
131 external benchmark_jump : tree -> Tag.t -> unit = "caml_benchmark_jump" "noalloc"
133 let benchmark_jump t s = benchmark_jump t.doc s
135 external benchmark_fcns : tree -> unit = "caml_benchmark_fcns" "noalloc"
137 let benchmark_fcns t = benchmark_fcns t.doc
139 external benchmark_lcps : tree -> unit = "caml_benchmark_lcps" "noalloc"
141 let benchmark_lcps t = benchmark_lcps t.doc
149 let text_size tree = inode (snd ( tree_doc_ids tree root ))
151 let text_get_text t (x:[`Text] node) =
152 if x == nulldoc then ""
153 else text_get_text t x
158 module HPtset = Hashtbl.Make(Ptset.Int)
160 let vector_htbl = HPtset.create MED_H_SIZE
162 let ptset_to_vector s =
164 HPtset.find vector_htbl s
167 let v = unordered_set_alloc (Ptset.Int.cardinal s) in
168 let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in
169 HPtset.add vector_htbl s v; v
173 let subtree_size t i = tree_subtree_size t.doc i
174 let subtree_elements t i = tree_subtree_elements t.doc i
175 let text_size t = text_size t.doc
177 module MemUnion = Hashtbl.Make (struct
178 type t = Ptset.Int.t*Ptset.Int.t
179 let equal (x,y) (z,t) = x == z || y == t
180 let equal a b = equal a b || equal b a
181 let hash (x,y) = (* commutative hash *)
182 let x = Uid.to_int (Ptset.Int.uid x)
183 and y = Uid.to_int (Ptset.Int.uid y)
185 if x <= y then HASHINT2(x,y) else HASHINT2(y,x)
188 module MemAdd = Hashtbl.Make (
190 type t = Tag.t*Ptset.Int.t
191 let equal (x,y) (z,t) = (x == z)&&(y == t)
192 let hash (x,y) = HASHINT2(x,Uid.to_int (Ptset.Int.uid y))
195 module MemUpdate = struct
196 include Hashtbl.Make (
198 type t = Tag.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t
199 let equal (a1,b1,c1,d1,e1) (a2,b2,c2,d2,e2) = a1==a2 &&
200 b1 == b2 && c1 == c2 && d1 == d2 && e1 == e2
201 let hash (a,b,c,d,e) =
202 HASHINT4(HASHINT2(a,Uid.to_int (Ptset.Int.uid b)),
203 Uid.to_int (Ptset.Int.uid c),
204 Uid.to_int (Ptset.Int.uid d),
205 Uid.to_int (Ptset.Int.uid e))
210 let collect_tags tree =
211 let _ = Printf.eprintf "Collecting Tags\n%!" in
212 let h_union = MemUnion.create BIG_H_SIZE in
215 MemUnion.find h_union (s1,s2)
217 | Not_found -> let s = Ptset.Int.union s1 s2
219 MemUnion.add h_union (s1,s2) s;s
221 let h_add = MemAdd.create BIG_H_SIZE in
223 try MemAdd.find h_add (t,s)
225 | Not_found -> let r = Ptset.Int.add t s in
226 MemAdd.add h_add (t,s) r;r
228 let h = Hashtbl.create BIG_H_SIZE in
229 let update t sc sb ss sa =
230 let schild,sbelow,ssibling,safter =
235 (Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty)
238 (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
240 let rec loop right id acc_after =
242 then Ptset.Int.empty,Ptset.Int.empty,acc_after else
243 let sibling2,desc2,after2 = loop true (tree_next_sibling tree id) acc_after in
244 let child1,desc1,after1 = loop false (tree_first_child tree id) after2 in
245 let tag = tree_tag tree id in
246 update tag child1 desc1 sibling2 after2;
247 ( pt_add tag sibling2,
248 pt_add tag (pt_cup desc1 desc2),
249 if right then pt_cup after1 (pt_cup desc1 desc2) else acc_after )
251 let _ = loop false (tree_root tree) Ptset.Int.empty in
252 let _ = Printf.eprintf "Finished\n%!" in
258 let contains_array = ref [| |]
259 let contains_index = Hashtbl.create 4096
262 Hashtbl.find contains_index i
266 let init_textfun f t s =
268 | `CONTAINS -> text_contains t.doc s
269 | `STARTSWITH -> text_prefix t.doc s
270 | `ENDSWITH -> text_suffix t.doc s
271 | `EQUALS -> text_equals t.doc s
273 (*Array.fast_sort (compare) a; *)
275 Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
277 let count_contains t s = text_count_contains t.doc s
279 let init_naive_contains t s =
280 let i,j = tree_doc_ids t.doc (tree_root t.doc)
282 let regexp = Str.regexp_string s in
285 let _ = Str.search_forward regexp arg 0;
289 let rec loop n acc l =
292 let s = text_get_text t.doc n
295 then loop (nodei ((inode n)+1)) (n::acc) (l+1)
296 else loop (nodei ((inode n)+1)) acc l
298 let acc,l = loop i [] 0 in
299 let a = Array.create l nulldoc in
300 let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc
306 let array_find a i j =
307 let l = Array.length a in
308 let rec loop idx x y =
309 if x > y || idx >= l then nulldoc
311 if a.(idx) >= x then if a.(idx) > y then nulldoc else (last_idx := idx;a.(idx))
312 else loop (idx+1) x y
314 if a.(0) > j || a.(l-1) < i then nulldoc
315 else loop !last_idx i j
317 let text_below tree t =
318 let l = Array.length !contains_array in
319 let i,j = tree_doc_ids tree.doc t in
320 let id = if l == 0 then i else (array_find !contains_array i j) in
321 tree_parent_node tree.doc id
323 let text_next tree t root =
324 let l = Array.length !contains_array in
325 let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
326 let _,j = tree_doc_ids tree.doc root in
327 let id = if l == 0 then if inf > j then nulldoc else inf
328 else array_find !contains_array inf j
330 tree_parent_node tree.doc id
334 module DocIdSet = struct
335 include Set.Make (struct type t = [`Text] node
336 let compare = compare_node end)
339 let is_nil t = t == nil
341 let is_node t = t != nil
342 let is_root t = t == root
345 let _ = Tag.init (Obj.magic t) in
346 let table = collect_tags t
348 let _ = Hashtbl.iter (fun t (c,d,ns,f) ->
349 Printf.eprintf "Tag %s has:\n" (Tag.to_string t);
350 Printf.eprintf "Child tags: ";
351 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) c;
352 Printf.eprintf "\nDescendant tags: ";
353 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) d;
354 Printf.eprintf "\nNextSibling tags: ";
355 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) ns;
356 Printf.eprintf "\nFollowing tags: ";
357 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) f;
358 Printf.eprintf "\n\n%!";) table
366 let finalize _ = Printf.eprintf "Release the string list !\n%!"
372 !Options.sample_factor
373 !Options.index_empty_texts
374 !Options.disable_text_collection)
376 let parse_xml_uri str = parse parse_xml_uri str
377 let parse_xml_string str = parse parse_xml_string str
379 let size t = tree_size t.doc;;
381 external pool : tree -> Tag.pool = "%identity"
383 let magic_string = "SXSI_INDEX"
384 let version_string = "2"
387 Unix.lseek fd 0 Unix.SEEK_CUR
389 let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
392 let sl = String.length s in
393 let ssl = Printf.sprintf "%020i" sl in
394 ignore (Unix.write fd ssl 0 20);
395 ignore (Unix.write fd s 0 (String.length s))
397 let rec really_read fd buffer start length =
398 if length <= 0 then () else
399 match Unix.read fd buffer start length with
400 0 -> raise End_of_file
401 | r -> really_read fd buffer (start + r) (length - r);;
404 let buffer = String.create 20 in
405 let _ = really_read fd buffer 0 20 in
406 let size = int_of_string buffer in
407 let buffer = String.create size in
408 let _ = really_read fd buffer 0 size in
413 let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
414 let out_c = Unix.out_channel_of_descr fd in
415 let _ = set_binary_mode_out out_c true in
416 output_string out_c magic_string;
417 output_char out_c '\n';
418 output_string out_c version_string;
419 output_char out_c '\n';
420 Marshal.to_channel out_c t.ttable [ ];
421 (* we need to move the fd to the correct position *)
423 ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
428 let load ?(sample=64) ?(load_text=true) str =
429 let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
430 let in_c = Unix.in_channel_of_descr fd in
431 let _ = set_binary_mode_in in_c true in
433 (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
434 (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
435 let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t =
436 Marshal.from_channel in_c
438 let ntable = Hashtbl.create (Hashtbl.length table) in
439 Hashtbl.iter (fun k (s1,s2,s3,s4) ->
440 let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty
441 and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty
442 and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty
443 and ss4 = Ptset.Int.fold (Ptset.Int.add) s4 Ptset.Int.empty
444 in Hashtbl.add ntable k (ss1,ss2,ss3,ss4)
447 (* The in_channel read a chunk of fd, so we might be after
448 the start of the XMLTree save file. Reset to the correct
452 let _ = Printf.eprintf "\nLoading tag table : " in
453 let ntable = time (load_table) () in
454 ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
455 let tree = { doc = tree_load fd load_text sample;
463 let tag_pool t = pool t.doc
465 let compare = compare_node
467 let equal a b = a == b
471 | i -> Printf.sprintf "Node (%i)" i
473 let dump_node t = nts (inode t)
475 let is_left t n = tree_is_first_child t.doc n
479 let is_below_right t n1 n2 =
480 tree_is_ancestor t.doc (tree_parent t.doc n1) n2
481 && not (tree_is_ancestor t.doc n1 n2)
483 let is_binary_ancestor t n1 n2 =
484 let p = tree_parent t.doc n1 in
485 let fin = tree_closing t.doc p in
487 (* (is_below_right t n1 n2) ||
488 (tree_is_ancestor t.doc n1 n2) *)
490 let parent t n = tree_parent t.doc n
492 let first_child t = let doc = t.doc in ();fun n -> tree_first_child doc n
493 let first_element t = let doc = t.doc in (); fun n -> tree_first_element doc n
495 (* these function will be called in two times: first partial application
496 on the tag, then application of the tag and the tree, then application of
497 the other arguments. We use the trick to let the compiler optimize application
500 let tagged_child t tag = () ; fun n -> tree_tagged_child t.doc n tag
502 let select_child t = fun ts ->
503 let v = ptset_to_vector ts in ();
504 fun n -> tree_select_child t.doc n v
506 let next_sibling t = let doc = t.doc in (); fun n -> tree_next_sibling doc n
507 let next_element t = let doc = t.doc in (); fun n -> tree_next_element doc n
509 let tagged_following_sibling t tag = (); fun n -> tree_tagged_following_sibling t.doc n tag
511 let select_following_sibling t = fun ts ->
512 let v = (ptset_to_vector ts) in ();
513 fun n -> tree_select_following_sibling t.doc n v
515 let next_sibling_below t = (); fun n _ -> tree_next_sibling t.doc n
516 let next_element_below t = (); fun n _ -> tree_next_element t.doc n
517 let tagged_following_sibling_below t tag = (); fun n _ -> tree_tagged_following_sibling t.doc n tag
519 let select_following_sibling_below t = fun ts ->
520 let v = (ptset_to_vector ts) in ();
521 fun n _ -> tree_select_following_sibling t.doc n v
523 let id t n = tree_node_xml_id t.doc n
525 let tag t n = if n == nil then Tag.nullt else tree_tag t.doc n
527 let tagged_descendant t tag =
528 let doc = t.doc in (); fun n -> tree_tagged_descendant doc n tag
530 let select_descendant t = fun ts ->
531 let v = (ptset_to_vector ts) in ();
532 fun n -> tree_select_descendant t.doc n v
534 let tagged_following_below t tag =
536 (); fun n ctx -> tree_tagged_following_below doc n tag ctx
538 let select_following_below t = fun ts ->
539 let v = (ptset_to_vector ts) in ();
540 fun n ctx -> tree_select_following_below t.doc n v ctx
542 let closing t n = tree_closing t.doc n
543 let is_open t n = tree_is_open t.doc n
544 let get_text_id t n = tree_my_text t.doc n
547 let array_find a i j =
548 let l = Array.length a in
549 let rec loop idx x y =
550 if x > y || idx >= l then nil
552 if a.(idx) >= x then if a.(idx) > y then nil else (last_idx := idx;a.(idx))
553 else loop (idx+1) x y
555 if a.(0) > j || a.(l-1) < i then nil
556 else loop !last_idx i j
560 let count t s = text_count t.doc s
562 let init_stack () = stack := []
563 let push x = stack:= x::!stack
564 let peek () = match !stack with
566 | _ -> failwith "peek"
567 let pop () = match !stack with
569 | _ -> failwith "pop"
571 let next t = nodei ( (inode t) + 1 )
572 let next2 t = nodei ( (inode t) + 2 )
573 let next3 t = nodei ( (inode t) + 3 )
575 let print_xml_fast2 =
576 let _ = init_stack () in
577 let h = Hashtbl.create MED_H_SIZE in
578 let tag_str t = try Hashtbl.find h t with
579 Not_found -> let s = Tag.to_string t in
582 let h_att = Hashtbl.create MED_H_SIZE in
583 let att_str t = try Hashtbl.find h_att t with
584 Not_found -> let s = Tag.to_string t in
585 let attname = String.sub s 3 ((String.length s) -3) in
586 Hashtbl.add h_att t attname;attname
587 in fun outc tree t ->
588 let tree = tree.doc in
589 let fin = tree_closing tree t in
590 let rec loop_tag t tag =
592 if tree_is_open tree t then
594 if tag == Tag.pcdata then
596 output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
597 loop (next2 t) (* skip closing $ *)
600 let tagstr = tag_str tag in
601 let _ = output_char outc '<';
602 output_string outc tagstr in
604 if tree_is_open tree t' then
605 let _ = push tagstr in
606 let tag' = tree_tag tree t' in
607 if tag' == Tag.attribute then let t'' = loop_attr (next t') 0 in
608 output_string outc ">"; loop t'' else (output_string outc ">";loop_tag t' tag')
609 else (* closing with no content *)
610 let _ = output_string outc "/>" in
615 output_string outc "</";
616 output_string outc (pop());
617 output_char outc '>';
620 and loop t = loop_tag t (tree_tag tree t)
622 if tree_is_open tree t then
623 let attname = att_str (tree_tag tree t) in
624 output_char outc ' ';
625 output_string outc attname;
626 output_string outc "=\"";
627 let t = next t in (* open $@ *)
628 output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
629 output_char outc '"';
630 loop_attr (next3 t) (n+1)
636 let h = Hashtbl.create MED_H_SIZE in
637 let tag_str t = try Hashtbl.find h t with
638 Not_found -> let s = Tag.to_string t in
641 let h_att = Hashtbl.create MED_H_SIZE in
642 let att_str t = try Hashtbl.find h_att t with
643 Not_found -> let s = Tag.to_string t in
644 let attname = String.sub s 3 ((String.length s) -3) in
645 Hashtbl.add h_att t attname;attname
646 in fun outc tree t ->
647 let rec loop ?(print_right=true) t =
650 let tagid = tree_tag tree.doc t in
654 let tid = tree_my_text_unsafe tree.doc t in
655 output_string outc (text_get_text tree.doc tid);
657 then loop (next_sibling tree t);
660 let tagstr = tag_str tagid in
661 let l = first_child tree t
662 and r = next_sibling tree t
664 output_char outc '<';
665 output_string outc tagstr;
666 if l == nil then output_string outc "/>"
668 if (tag tree l) == Tag.attribute then
670 loop_attributes (first_child tree l);
671 if (next_sibling tree l) == nil then output_string outc "/>"
674 output_char outc '>';
675 loop (next_sibling tree l);
676 output_string outc "</";
677 output_string outc tagstr;
678 output_char outc '>';
683 output_char outc '>';
685 output_string outc "</";
686 output_string outc tagstr;
687 output_char outc '>';
689 if print_right then loop r
690 and loop_attributes a =
693 let attname = att_str (tag tree a) in
694 let fsa = first_child tree a in
695 let tid = tree_my_text_unsafe tree.doc fsa in
696 output_char outc ' ';
697 output_string outc attname;
698 output_string outc "=\"";
699 output_string outc (text_get_text tree.doc tid);
700 output_char outc '"';
701 loop_attributes (next_sibling tree a)
703 loop ~print_right:false t
706 let print_xml_fast outc tree t =
707 if (tag tree t) = Tag.document_node then
708 print_xml_fast outc tree (first_child tree t)
709 else print_xml_fast outc tree t
711 let tags_children t tag =
712 let a,_,_,_ = Hashtbl.find t.ttable tag in a
713 let tags_below t tag =
714 let _,a,_,_ = Hashtbl.find t.ttable tag in a
715 let tags_siblings t tag =
716 let _,_,a,_ = Hashtbl.find t.ttable tag in a
717 let tags_after t tag =
718 let _,_,_,a = Hashtbl.find t.ttable tag in a
721 let tags t tag = Hashtbl.find t.ttable tag
724 let rec binary_parent t n =
726 if tree_is_first_child t.doc n
727 then tree_parent t.doc n
728 else tree_prev_sibling t.doc n
729 in if tree_tag t.doc r = Tag.pcdata then
733 let doc_ids t n = tree_doc_ids t.doc n
735 let subtree_tags t tag = ();
736 fun n -> if n == nil then 0 else
737 tree_subtree_tags t.doc n tag
740 let tid = tree_my_text t.doc n in
741 if tid == nulldoc then "" else
742 text_get_text t.doc tid
745 let dump_tree fmt tree =
748 let tag = (tree_tag tree.doc t ) in
749 let tagstr = Tag.to_string tag in
750 let tab = String.make n ' ' in
752 if tag == Tag.pcdata || tag == Tag.attribute_data
754 Format.fprintf fmt "%s<%s>%s</%s>\n"
755 tab tagstr (text_get_text tree.doc (tree_my_text tree.doc t)) tagstr
757 Format.fprintf fmt "%s<%s>\n" tab tagstr;
758 loop (tree_first_child tree.doc t) (n+2);
759 Format.fprintf fmt "%s</%s>\n%!" tab tagstr;
761 loop (tree_next_sibling tree.doc t) n
767 let print_xml_fast3 t = tree_print_xml_fast3 t.doc
774 let rec loop left node acc_d total_d num_leaves =
776 (acc_d+total_d,if left then num_leaves+1 else num_leaves)
778 let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in
779 loop false (tree_next_sibling tree node) (acc_d) d td
781 let a,b = loop true root 0 0 0
783 Printf.eprintf "Average depth: %f, number of leaves %i\n%!" ((float_of_int a)/. (float_of_int b)) b