1 (******************************************************************************)
2 (* SXSI : XPath evaluator *)
3 (* Kim Nguyen (Kim.Nguyen@nicta.com.au) *)
4 (* Copyright NICTA 2008 *)
5 (* Distributed under the terms of the LGPL (see LICENCE) *)
6 (******************************************************************************)
10 external init_lib : unit -> unit = "caml_init_lib"
12 exception CPlusPlusError of string
14 let () = Callback.register_exception "CPlusPlusError" (CPlusPlusError "")
20 type 'a node = private int
21 type node_kind = [`Text | `Tree ]
25 ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
28 external inode : 'a node -> int = "%identity"
29 external nodei : int -> 'a node = "%identity"
30 let compare_node x y = (inode x) - (inode y)
31 let equal_node : 'a node -> 'a node -> bool = (==)
34 external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
35 external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
36 external tree_print_xml_fast3 : tree -> [`Tree ] node -> Unix.file_descr ->unit = "caml_xml_tree_print"
37 external tree_save : tree -> Unix.file_descr -> string -> unit = "caml_xml_tree_save"
38 external tree_load : Unix.file_descr -> string -> bool -> int -> tree = "caml_xml_tree_load"
40 external nullt : unit -> 'a node = "caml_xml_tree_nullt"
42 let nil : [`Tree ] node = nodei ~-1
43 let nulldoc : [`Text ] node = nodei ~-1
44 let root : [`Tree ] node = nodei 0
46 external text_get_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
47 external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
49 let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n
51 external text_is_prefix : tree -> string -> bool = "caml_text_collection_is_prefix"
52 external text_is_suffix : tree -> string -> bool = "caml_text_collection_is_suffix"
53 external text_is_equal : tree -> string -> bool = "caml_text_collection_is_equal"
54 external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
55 external text_is_lessthan : tree -> string -> bool = "caml_text_collection_is_lessthan"
57 external text_count : tree -> string -> int = "caml_text_collection_count"
58 external text_count_prefix : tree -> string -> int = "caml_text_collection_count_prefix"
59 external text_count_suffix : tree -> string -> int = "caml_text_collection_count_suffix"
60 external text_count_equal : tree -> string -> int = "caml_text_collection_count_equal"
61 external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
62 external text_count_lessthan : tree -> string -> int = "caml_text_collection_count_lessthan"
64 external text_prefix : tree -> string -> [`Text ] node array = "caml_text_collection_prefix"
65 external text_suffix : tree -> string -> [`Text ] node array = "caml_text_collection_suffix"
66 external text_equals : tree -> string -> [`Text ] node array = "caml_text_collection_equals"
67 external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
68 external text_lessthan : tree -> string -> [`Text ] node array = "caml_text_collection_lessthan"
71 external tree_root : tree -> [`Tree] node = "caml_xml_tree_root" "noalloc"
72 external tree_size : tree -> int = "caml_xml_tree_size" "noalloc"
73 external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size" "noalloc"
74 external tree_subtree_elements : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_elements" "noalloc"
75 external tree_subtree_tags : tree -> [`Tree] node -> Tag.t -> int = "caml_xml_tree_subtree_elements" "noalloc"
77 let tree_is_nil x = equal_node x nil
78 external tree_is_leaf : tree -> [`Tree ] node -> bool = "caml_xml_tree_is_leaf" "noalloc"
79 external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc"
80 external tree_is_child : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_child" "noalloc"
81 external tree_is_first_child : tree -> [`Tree ] node -> bool = "caml_xml_tree_is_first_child" "noalloc"
82 external tree_num_children : tree -> [`Tree ] node -> int = "caml_xml_tree_num_children" "noalloc"
83 external tree_child_number : tree -> [`Tree ] node -> int = "caml_xml_tree_child_number" "noalloc"
84 external tree_depth : tree -> [`Tree ] node -> int = "caml_xml_tree_depth" "noalloc"
85 external tree_preorder : tree -> [`Tree ] node -> int = "caml_xml_tree_preorder" "noalloc"
86 external tree_postorder : tree -> [`Tree ] node -> int = "caml_xml_tree_postorder" "noalloc"
87 external tree_tag : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag" "noalloc"
88 external tree_doc_ids : tree -> [`Tree ] node -> [`Text] node*[`Text] node = "caml_xml_tree_doc_ids"
90 external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" "noalloc"
91 external tree_child : tree -> [`Tree] node -> int -> [`Tree] node = "caml_xml_tree_child" "noalloc"
92 external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" "noalloc"
93 external tree_first_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_element" "noalloc"
94 external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc"
95 external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc"
96 external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc"
97 external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc"
98 external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
101 external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
102 external unordered_set_length : unordered_set -> int = "caml_unordered_set_length"
103 external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc"
105 external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc"
106 external tree_tagged_following_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_following_sibling" "noalloc"
107 external tree_select_following_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_following_sibling" "noalloc"
108 external tree_tagged_descendant : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_descendant" "noalloc"
109 external tree_select_descendant : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_descendant" "noalloc"
110 external tree_tagged_following : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_following" "noalloc"
111 external tree_tagged_following_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_following_below" "noalloc"
112 external tree_select_following_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_following_below" "noalloc"
115 external tree_tagged_following_before : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_following_before" "noalloc"
116 external tree_select_following_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_following_before" "noalloc"
118 external tree_my_text : tree -> [`Tree ] node -> [`Text] node = "caml_xml_tree_my_text" "noalloc"
119 external tree_my_text_unsafe : tree -> [`Tree ] node -> [`Text] node = "caml_xml_tree_my_text_unsafe" "noalloc"
120 external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc"
121 external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc"
123 external tree_parent_node : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_node" "noalloc"
125 (*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *)
127 external tree_closing : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_closing" "noalloc"
128 external tree_is_open : tree -> [`Tree] node -> bool = "caml_xml_tree_is_open" "noalloc"
131 external benchmark_jump : tree -> Tag.t -> unit = "caml_benchmark_jump" "noalloc"
133 let benchmark_jump t s = benchmark_jump t.doc s
135 external benchmark_fcns : tree -> unit = "caml_benchmark_fcns" "noalloc"
137 let benchmark_fcns t = benchmark_fcns t.doc
139 external benchmark_lcps : tree -> unit = "caml_benchmark_lcps" "noalloc"
141 let benchmark_lcps t = benchmark_lcps t.doc
149 let text_size tree = inode (snd ( tree_doc_ids tree root ))
151 let text_get_text t (x:[`Text] node) =
152 if x == nulldoc then ""
153 else text_get_text t x
158 module HPtset = Hashtbl.Make(Ptset.Int)
160 let vector_htbl = HPtset.create MED_H_SIZE
162 let ptset_to_vector s =
164 HPtset.find vector_htbl s
167 let v = unordered_set_alloc (Ptset.Int.cardinal s) in
168 let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in
169 HPtset.add vector_htbl s v; v
173 let subtree_size t i = tree_subtree_size t.doc i
174 let subtree_elements t i = tree_subtree_elements t.doc i
175 let text_size t = text_size t.doc
177 module MemUnion = Hashtbl.Make (struct
178 type t = Ptset.Int.t*Ptset.Int.t
179 let equal (x,y) (z,t) = x == z && y == t
180 let equal a b = equal a b || equal b a
181 let hash (x,y) = (* commutative hash *)
182 let x = Uid.to_int (Ptset.Int.uid x)
183 and y = Uid.to_int (Ptset.Int.uid y)
185 if x <= y then HASHINT2(x,y) else HASHINT2(y,x)
188 module MemAdd = Hashtbl.Make (
190 type t = Tag.t*Ptset.Int.t
191 let equal (x,y) (z,t) = (x == z)&&(y == t)
192 let hash (x,y) = HASHINT2(x,Uid.to_int (Ptset.Int.uid y))
195 module MemUpdate = struct
196 include Hashtbl.Make (
198 type t = Tag.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t
199 let equal (a1,b1,c1,d1,e1) (a2,b2,c2,d2,e2) = a1==a2 &&
200 b1 == b2 && c1 == c2 && d1 == d2 && e1 == e2
201 let hash (a,b,c,d,e) =
202 HASHINT4(HASHINT2(a,Uid.to_int (Ptset.Int.uid b)),
203 Uid.to_int (Ptset.Int.uid c),
204 Uid.to_int (Ptset.Int.uid d),
205 Uid.to_int (Ptset.Int.uid e))
210 let collect_tags tree =
211 let _ = Printf.eprintf "Collecting Tags\n%!" in
212 let h_union = MemUnion.create BIG_H_SIZE in
215 MemUnion.find h_union (s1,s2)
217 | Not_found -> let s = Ptset.Int.union s1 s2
219 MemUnion.add h_union (s1,s2) s;s
221 let h_add = MemAdd.create BIG_H_SIZE in
223 try MemAdd.find h_add (t,s)
225 | Not_found -> let r = Ptset.Int.add t s in
226 MemAdd.add h_add (t,s) r;r
228 let h = Hashtbl.create BIG_H_SIZE in
229 let update t sc sb ss sa =
230 let schild,sbelow,ssibling,safter =
235 (Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty)
238 (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
240 let rec loop right id acc_after =
242 then Ptset.Int.empty,Ptset.Int.empty,acc_after else
243 let sibling2,desc2,after2 = loop true (tree_next_sibling tree id) acc_after in
244 let child1,desc1,after1 = loop false (tree_first_child tree id) after2 in
245 let tag = tree_tag tree id in
246 update tag child1 desc1 sibling2 after2;
247 ( pt_add tag sibling2,
248 pt_add tag (pt_cup desc1 desc2),
249 if right then pt_cup after1 (pt_cup desc1 desc2) else acc_after )
251 let _ = loop false (tree_root tree) Ptset.Int.empty in
252 let _ = Printf.eprintf "Finished\n%!" in
258 let contains_array = ref [| |]
259 let contains_index = Hashtbl.create 4096
262 Hashtbl.find contains_index i
266 let init_textfun f t s =
268 | `CONTAINS -> text_contains t.doc s
269 | `STARTSWITH -> text_prefix t.doc s
270 | `ENDSWITH -> text_suffix t.doc s
271 | `EQUALS -> text_equals t.doc s
273 (*Array.fast_sort (compare) a; *)
275 Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
277 let count_contains t s = text_count_contains t.doc s
279 let init_naive_contains t s =
280 let i,j = tree_doc_ids t.doc (tree_root t.doc)
282 let regexp = Str.regexp_string s in
285 let _ = Str.search_forward regexp arg 0;
289 let rec loop n acc l =
292 let s = text_get_text t.doc n
295 then loop (nodei ((inode n)+1)) (n::acc) (l+1)
296 else loop (nodei ((inode n)+1)) acc l
298 let acc,l = loop i [] 0 in
299 let a = Array.create l nulldoc in
300 let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc
306 let array_find a i j =
307 let l = Array.length a in
308 let rec loop idx x y =
309 if x > y || idx >= l then nulldoc
311 if a.(idx) >= x then if a.(idx) > y then nulldoc else (last_idx := idx;a.(idx))
312 else loop (idx+1) x y
314 if a.(0) > j || a.(l-1) < i then nulldoc
315 else loop !last_idx i j
317 let text_below tree t =
318 let l = Array.length !contains_array in
319 let i,j = tree_doc_ids tree.doc t in
320 let id = if l == 0 then i else (array_find !contains_array i j) in
321 tree_parent_node tree.doc id
323 let text_next tree t root =
324 let l = Array.length !contains_array in
325 let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
326 let _,j = tree_doc_ids tree.doc root in
327 let id = if l == 0 then if inf > j then nulldoc else inf
328 else array_find !contains_array inf j
330 tree_parent_node tree.doc id
334 module DocIdSet = struct
335 include Set.Make (struct type t = [`Text] node
336 let compare = compare_node end)
339 let is_nil t = t == nil
341 let is_node t = t != nil
342 let is_root t = t == root
345 let _ = Tag.init (Obj.magic t) in
346 let table = collect_tags t
348 let _ = Hashtbl.iter (fun t (c,d,ns,f) ->
349 Printf.eprintf "Tag %s has:\n" (Tag.to_string t);
350 Printf.eprintf "Child tags: ";
351 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) c;
352 Printf.eprintf "\nDescendant tags: ";
353 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) d;
354 Printf.eprintf "\nNextSibling tags: ";
355 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) ns;
356 Printf.eprintf "\nFollowing tags: ";
357 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) f;
358 Printf.eprintf "\n\n%!";) table
366 let finalize _ = Printf.eprintf "Release the string list !\n%!"
372 !Options.sample_factor
373 !Options.index_empty_texts
374 !Options.disable_text_collection)
376 let parse_xml_uri str = parse parse_xml_uri str
377 let parse_xml_string str = parse parse_xml_string str
379 let size t = tree_size t.doc;;
381 external pool : tree -> Tag.pool = "%identity"
383 let magic_string = "SXSI_INDEX"
384 let version_string = "2"
387 Unix.lseek fd 0 Unix.SEEK_CUR
389 let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
392 let sl = String.length s in
393 let ssl = Printf.sprintf "%020i" sl in
394 ignore (Unix.write fd ssl 0 20);
395 ignore (Unix.write fd s 0 (String.length s))
397 let rec really_read fd buffer start length =
398 if length <= 0 then () else
399 match Unix.read fd buffer start length with
400 0 -> raise End_of_file
401 | r -> really_read fd buffer (start + r) (length - r);;
404 let buffer = String.create 20 in
405 let _ = really_read fd buffer 0 20 in
406 let size = int_of_string buffer in
407 let buffer = String.create size in
408 let _ = really_read fd buffer 0 size in
413 let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
414 let out_c = Unix.out_channel_of_descr fd in
415 let _ = set_binary_mode_out out_c true in
416 output_string out_c magic_string;
417 output_char out_c '\n';
418 output_string out_c version_string;
419 output_char out_c '\n';
420 Marshal.to_channel out_c t.ttable [ ];
421 (* we need to move the fd to the correct position *)
423 ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
424 tree_save t.doc fd str;
428 let load ?(sample=64) ?(load_text=true) str =
429 let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
430 let in_c = Unix.in_channel_of_descr fd in
431 let _ = set_binary_mode_in in_c true in
433 (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
434 (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
435 let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t =
436 Marshal.from_channel in_c
438 let ntable = Hashtbl.create (Hashtbl.length table) in
439 Hashtbl.iter (fun k (s1,s2,s3,s4) ->
440 let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty
441 and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty
442 and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty
443 and ss4 = Ptset.Int.fold (Ptset.Int.add) s4 Ptset.Int.empty
444 in Hashtbl.add ntable k (ss1,ss2,ss3,ss4)
447 (* The in_channel read a chunk of fd, so we might be after
448 the start of the XMLTree save file. Reset to the correct
452 let _ = Printf.eprintf "\nLoading tag table : " in
453 let ntable = time (load_table) () in
454 ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
455 let tree = { doc = tree_load fd str load_text sample;
463 let tag_pool t = pool t.doc
465 let compare = compare_node
467 let equal a b = a == b
471 | i -> Printf.sprintf "Node (%i)" i
473 let dump_node t = nts (inode t)
475 let is_left t n = tree_is_first_child t.doc n
479 let is_below_right t n1 n2 =
480 tree_is_ancestor t.doc (tree_parent t.doc n1) n2
481 && not (tree_is_ancestor t.doc n1 n2)
483 let is_binary_ancestor t n1 n2 =
484 let p = tree_parent t.doc n1 in
485 let fin = tree_closing t.doc p in
487 (* (is_below_right t n1 n2) ||
488 (tree_is_ancestor t.doc n1 n2) *)
490 let parent t n = tree_parent t.doc n
492 let first_child t = let doc = t.doc in ();fun n -> tree_first_child doc n
493 let first_element t = let doc = t.doc in (); fun n -> tree_first_element doc n
495 (* these function will be called in two times: first partial application
496 on the tag, then application of the tag and the tree, then application of
497 the other arguments. We use the trick to let the compiler optimize application
500 let tagged_child t tag = () ; fun n -> tree_tagged_child t.doc n tag
502 let select_child t = fun ts ->
503 let v = ptset_to_vector ts in ();
504 fun n -> tree_select_child t.doc n v
506 let next_sibling t = let doc = t.doc in (); fun n -> tree_next_sibling doc n
507 let next_element t = let doc = t.doc in (); fun n -> tree_next_element doc n
509 let tagged_following_sibling t tag = (); fun n -> tree_tagged_following_sibling t.doc n tag
511 let select_following_sibling t = fun ts ->
512 let v = (ptset_to_vector ts) in ();
513 fun n -> tree_select_following_sibling t.doc n v
515 let next_sibling_below t = (); fun n _ -> tree_next_sibling t.doc n
516 let next_element_below t = (); fun n _ -> tree_next_element t.doc n
518 let tagged_following_sibling_below t tag = (); fun n _ -> tree_tagged_following_sibling t.doc n tag
520 let select_following_sibling_below t = fun ts ->
521 let v = (ptset_to_vector ts) in ();
522 fun n _ -> tree_select_following_sibling t.doc n v
524 let id t n = tree_node_xml_id t.doc n
526 let tag t n = if n == nil then Tag.nullt else tree_tag t.doc n
528 let tagged_descendant t tag =
529 let doc = t.doc in (); fun n -> tree_tagged_descendant doc n tag
531 let select_descendant t = fun ts ->
532 let v = (ptset_to_vector ts) in ();
533 fun n -> tree_select_descendant t.doc n v
535 let tagged_following_below t tag =
537 (); fun n ctx -> tree_tagged_following_below doc n tag ctx
539 let select_following_below t = fun ts ->
540 let v = (ptset_to_vector ts) in ();
541 fun n ctx -> tree_select_following_below t.doc n v ctx
543 let closing t n = tree_closing t.doc n
544 let is_open t n = tree_is_open t.doc n
545 let get_text_id t n = tree_my_text t.doc n
548 let array_find a i j =
549 let l = Array.length a in
550 let rec loop idx x y =
551 if x > y || idx >= l then nil
553 if a.(idx) >= x then if a.(idx) > y then nil else (last_idx := idx;a.(idx))
554 else loop (idx+1) x y
556 if a.(0) > j || a.(l-1) < i then nil
557 else loop !last_idx i j
561 let count t s = text_count t.doc s
563 let init_stack () = stack := []
564 let push x = stack:= x::!stack
565 let peek () = match !stack with
567 | _ -> failwith "peek"
568 let pop () = match !stack with
570 | _ -> failwith "pop"
572 let next t = nodei ( (inode t) + 1 )
573 let next2 t = nodei ( (inode t) + 2 )
574 let next3 t = nodei ( (inode t) + 3 )
576 let print_xml_fast2 =
577 let _ = init_stack () in
578 let h = Hashtbl.create MED_H_SIZE in
579 let tag_str t = try Hashtbl.find h t with
580 Not_found -> let s = Tag.to_string t in
583 let h_att = Hashtbl.create MED_H_SIZE in
584 let att_str t = try Hashtbl.find h_att t with
585 Not_found -> let s = Tag.to_string t in
586 let attname = String.sub s 3 ((String.length s) -3) in
587 Hashtbl.add h_att t attname;attname
588 in fun outc tree t ->
589 let tree = tree.doc in
590 let fin = tree_closing tree t in
591 let rec loop_tag t tag =
593 if tree_is_open tree t then
595 if tag == Tag.pcdata then
597 output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
598 loop (next2 t) (* skip closing $ *)
601 let tagstr = tag_str tag in
602 let _ = output_char outc '<';
603 output_string outc tagstr in
605 if tree_is_open tree t' then
606 let _ = push tagstr in
607 let tag' = tree_tag tree t' in
608 if tag' == Tag.attribute then let t'' = loop_attr (next t') 0 in
609 output_string outc ">"; loop t'' else (output_string outc ">";loop_tag t' tag')
610 else (* closing with no content *)
611 let _ = output_string outc "/>" in
616 output_string outc "</";
617 output_string outc (pop());
618 output_char outc '>';
621 and loop t = loop_tag t (tree_tag tree t)
623 if tree_is_open tree t then
624 let attname = att_str (tree_tag tree t) in
625 output_char outc ' ';
626 output_string outc attname;
627 output_string outc "=\"";
628 let t = next t in (* open $@ *)
629 output_string outc (text_get_text tree (tree_my_text_unsafe tree t));
630 output_char outc '"';
631 loop_attr (next3 t) (n+1)
637 let h = Hashtbl.create MED_H_SIZE in
638 let tag_str t = try Hashtbl.find h t with
639 Not_found -> let s = Tag.to_string t in
642 let h_att = Hashtbl.create MED_H_SIZE in
643 let att_str t = try Hashtbl.find h_att t with
644 Not_found -> let s = Tag.to_string t in
645 let attname = String.sub s 3 ((String.length s) -3) in
646 Hashtbl.add h_att t attname;attname
647 in fun outc tree t ->
648 let rec loop ?(print_right=true) t =
651 let tagid = tree_tag tree.doc t in
655 let tid = tree_my_text_unsafe tree.doc t in
656 output_string outc (text_get_text tree.doc tid);
658 then loop (next_sibling tree t);
661 let tagstr = tag_str tagid in
662 let l = first_child tree t
663 and r = next_sibling tree t
665 output_char outc '<';
666 output_string outc tagstr;
667 if l == nil then output_string outc "/>"
669 if (tag tree l) == Tag.attribute then
671 loop_attributes (first_child tree l);
672 if (next_sibling tree l) == nil then output_string outc "/>"
675 output_char outc '>';
676 loop (next_sibling tree l);
677 output_string outc "</";
678 output_string outc tagstr;
679 output_char outc '>';
684 output_char outc '>';
686 output_string outc "</";
687 output_string outc tagstr;
688 output_char outc '>';
690 if print_right then loop r
691 and loop_attributes a =
694 let attname = att_str (tag tree a) in
695 let fsa = first_child tree a in
696 let tid = tree_my_text_unsafe tree.doc fsa in
697 output_char outc ' ';
698 output_string outc attname;
699 output_string outc "=\"";
700 output_string outc (text_get_text tree.doc tid);
701 output_char outc '"';
702 loop_attributes (next_sibling tree a)
704 loop ~print_right:false t
707 let print_xml_fast outc tree t =
708 if (tag tree t) = Tag.document_node then
709 print_xml_fast outc tree (first_child tree t)
710 else print_xml_fast outc tree t
712 let tags_children t tag =
713 let a,_,_,_ = Hashtbl.find t.ttable tag in a
714 let tags_below t tag =
715 let _,a,_,_ = Hashtbl.find t.ttable tag in a
716 let tags_siblings t tag =
717 let _,_,a,_ = Hashtbl.find t.ttable tag in a
718 let tags_after t tag =
719 let _,_,_,a = Hashtbl.find t.ttable tag in a
722 let tags t tag = Hashtbl.find t.ttable tag
725 let rec binary_parent t n =
727 if tree_is_first_child t.doc n
728 then tree_parent t.doc n
729 else tree_prev_sibling t.doc n
730 in if tree_tag t.doc r = Tag.pcdata then
734 let doc_ids t n = tree_doc_ids t.doc n
736 let subtree_tags t tag = ();
737 fun n -> if n == nil then 0 else
738 tree_subtree_tags t.doc n tag
741 let tid = tree_my_text t.doc n in
742 if tid == nulldoc then "" else
743 text_get_text t.doc tid
746 let dump_tree fmt tree =
749 let tag = (tree_tag tree.doc t ) in
750 let tagstr = Tag.to_string tag in
751 let tab = String.make n ' ' in
753 if tag == Tag.pcdata || tag == Tag.attribute_data
755 Format.fprintf fmt "%s<%s>%s</%s>\n"
756 tab tagstr (text_get_text tree.doc (tree_my_text tree.doc t)) tagstr
758 Format.fprintf fmt "%s<%s>\n" tab tagstr;
759 loop (tree_first_child tree.doc t) (n+2);
760 Format.fprintf fmt "%s</%s>\n%!" tab tagstr;
762 loop (tree_next_sibling tree.doc t) n
768 let print_xml_fast3 t = tree_print_xml_fast3 t.doc
775 let rec loop left node acc_d total_d num_leaves =
777 (acc_d+total_d,if left then num_leaves+1 else num_leaves)
779 let d,td = loop true (tree_first_child tree node) (acc_d+1) total_d num_leaves in
780 loop false (tree_next_sibling tree node) (acc_d) d td
782 let a,b = loop true root 0 0 0
784 Printf.eprintf "Average depth: %f, number of leaves %i\n%!" ((float_of_int a)/. (float_of_int b)) b
792 let test_prefix t s = Array.length (text_prefix t.doc s)
793 let test_suffix t s = Array.length (text_suffix t.doc s)
794 let test_contains t s = Array.length (text_contains t.doc s)
795 let test_equals t s = Array.length (text_equals t.doc s)