1 (******************************************************************************)
2 (* SXSI : XPath evaluator *)
3 (* Kim Nguyen (Kim.Nguyen@nicta.com.au) *)
4 (* Copyright NICTA 2008 *)
5 (* Distributed under the terms of the LGPL (see LICENCE) *)
6 (******************************************************************************)
11 type node_kind = [`Text | `Tree ]
13 let compare_node : 'a node -> 'a node -> int = (-)
14 let equal_node : 'a node -> 'a node -> bool = (==)
16 (* abstract type, values are pointers to a XMLTree C++ object *)
18 external int_of_node : 'a node -> int = "%identity"
20 external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
21 external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
23 external save_tree : tree -> string -> unit = "caml_xml_tree_save"
24 external load_tree : string -> int -> tree = "caml_xml_tree_load"
26 external nullt : unit -> 'a node = "caml_xml_tree_nullt"
28 let nil : 'a node = -1
29 let root : [`Tree ] node = 0
31 external text_get_tc_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
33 external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
35 let text_is_empty t n =
36 (equal_node nil n) || text_is_empty t n
40 external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
41 external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
42 external text_count : tree -> string -> int = "caml_text_collection_count"
43 external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
44 external text_unsorted_contains : tree -> string -> unit = "caml_text_collection_unsorted_contains"
45 external text_get_cached_text : tree -> [`Text] node -> string = "caml_text_collection_get_cached_text"
48 external tree_serialize : tree -> string -> unit = "caml_xml_tree_serialize"
50 external tree_unserialize : string -> tree = "caml_xml_tree_unserialize"
52 external tree_root : tree -> [`Tree] node = "caml_xml_tree_root"
54 let tree_is_nil x = equal_node x nil
56 external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent"
57 external tree_parent_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc"
58 external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc"
59 external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child"
60 external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child"
61 external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling"
62 external tree_tagged_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_sibling"
64 external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling"
65 external tree_is_leaf : tree -> [`Tree] node -> bool = "caml_xml_tree_is_leaf"
66 external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child"
67 external tree_is_first_child : tree -> [`Tree] node -> bool = "caml_xml_tree_is_first_child"
69 (* external tag : tree -> [`Tree ] node -> T = "caml_xml_tree_tag"*)
70 external tree_tag_id : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id"
73 let tree_is_last t n = equal_node nil (tree_next_sibling t n)
75 external tree_prev_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text"
77 external tree_my_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text"
78 external tree_next_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text"
79 external tree_doc_ids : tree -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids"
81 let text_size tree = int_of_node (snd ( tree_doc_ids tree (Obj.magic 0) ))
83 let text_get_cached_text t x =
86 text_get_cached_text t x
89 external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id"
90 external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id"
91 external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor"
92 external tree_tagged_desc : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc"
93 external tree_tagged_foll_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below"
94 external tree_subtree_tags : tree -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags"
99 external int_vector_alloc : int -> int_vector = "caml_int_vector_alloc"
100 external int_vector_length : int_vector -> int = "caml_int_vector_length"
101 external int_vector_set : int_vector -> int -> int -> unit = "caml_int_vector_set"
103 external tree_select_child : tree -> [`Tree ] node -> int_vector -> [`Tree] node = "caml_xml_tree_select_child"
104 external tree_select_foll_sibling : tree -> [`Tree ] node -> int_vector -> [`Tree] node = "caml_xml_tree_select_foll_sibling"
105 external tree_select_desc : tree -> [`Tree ] node -> int_vector -> [`Tree] node = "caml_xml_tree_select_desc"
106 external tree_select_foll_below : tree -> [`Tree ] node -> int_vector -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_foll_below"
109 module HPtset = Hashtbl.Make(Ptset.Int)
111 let vector_htbl = HPtset.create MED_H_SIZE
113 let ptset_to_vector s =
115 HPtset.find vector_htbl s
118 let v = int_vector_alloc (Ptset.Int.cardinal s) in
119 let _ = Ptset.Int.fold (fun e i -> int_vector_set v i e;i+1) s 0 in
120 HPtset.add vector_htbl s v; v
125 ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
128 let text_size t = text_size t.doc
130 module MemUnion = Hashtbl.Make (struct
131 type t = Ptset.Int.t*Ptset.Int.t
132 let equal (x,y) (z,t) = (Ptset.Int.equal x z)&&(Ptset.Int.equal y t)
133 let equal a b = equal a b || equal b a
134 let hash (x,y) = (* commutative hash *)
135 let x = Ptset.Int.hash x
136 and y = Ptset.Int.hash y
138 if x < y then HASHINT2(x,y) else HASHINT2(y,x)
141 let collect_tags tree =
142 let h_union = MemUnion.create BIG_H_SIZE in
145 MemUnion.find h_union (s1,s2)
147 | Not_found -> let s = Ptset.Int.union s1 s2
149 MemUnion.add h_union (s1,s2) s;s
151 let h_add = Hashtbl.create BIG_H_SIZE in
153 let k = HASHINT2(Tag.hash t,Ptset.Int.hash s) in
157 | Not_found -> let r = Ptset.Int.add t s in
158 Hashtbl.add h_add k r;r
160 let h = Hashtbl.create BIG_H_SIZE in
167 (Ptset.Int.empty,Ptset.Int.empty)
169 Hashtbl.replace h t (pt_cup sbelow sb, pt_cup safter sa)
171 let rec loop id acc =
173 then (Ptset.Int.empty,acc)
175 let below2,after2 = loop (tree_next_sibling tree id) acc in
176 let below1,after1 = loop (tree_first_child tree id) after2 in
177 let tag = tree_tag_id tree id in
178 update tag below1 after2;
179 pt_add tag (pt_cup below1 below2), (pt_add tag after1)
181 let _ = loop (tree_root tree) Ptset.Int.empty in h
187 let contains_array = ref [| |]
188 let contains_index = Hashtbl.create 4096
191 Hashtbl.find contains_index i
195 let init_contains t s =
196 let a = text_contains t.doc s
198 Array.fast_sort (compare) a;
200 Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
202 let count_contains t s = text_count_contains t.doc s
203 let unsorted_contains t s = text_unsorted_contains t.doc s
205 let init_naive_contains t s =
206 let i,j = tree_doc_ids t.doc (tree_root t.doc)
208 let regexp = Str.regexp_string s in
211 let _ = Str.search_forward regexp arg 0;
215 let rec loop n acc l =
218 let s = text_get_cached_text t.doc n
221 then loop (n+1) (n::acc) (l+1)
222 else loop (n+1) acc l
224 let acc,l = loop i [] 0 in
225 let a = Array.create l nil in
226 let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc
232 module DocIdSet = struct
233 include Set.Make (struct type t = [`Text] node
234 let compare = compare_node end)
237 let is_nil t = t == nil
239 let is_node t = t != nil
240 let is_root t = t == root
243 let _ = Tag.init (Obj.magic t) in
244 let table = collect_tags t
250 let finalize _ = Printf.eprintf "Release the string list !\n%!"
256 !Options.sample_factor
257 !Options.index_empty_texts
258 !Options.disable_text_collection)
260 let parse_xml_uri str = parse parse_xml_uri str
261 let parse_xml_string str = parse parse_xml_string str
264 external pool : tree -> Tag.pool = "%identity"
266 let save t str = (save_tree t.doc str)
269 let load ?(sample=64) str =
270 node_of_t (load_tree str sample)
275 let tag_pool t = pool t.doc
277 let compare a b = a - b
279 let equal a b = a == b
283 | i -> Printf.sprintf "Node (%i)" i
285 let dump_node t = nts t
288 let is_left t n = tree_is_first_child t.doc n
290 let is_below_right t n1 n2 = tree_is_ancestor t.doc (tree_parent t.doc n1) n2
292 let parent t n = tree_parent t.doc n
294 let first_child t = (); fun n -> tree_first_child t.doc n
296 (* these function will be called in two times: first partial application
297 on the tag, then application of the tag and the tree, then application of
298 the other arguments. We use the trick to let the compiler optimize application
301 let tagged_child t tag = () ; fun n -> tree_tagged_child t.doc n tag
303 let select_child t = fun ts ->
304 let v = ptset_to_vector ts in ();
305 fun n -> tree_select_child t.doc n v
307 let next_sibling t = (); fun n -> tree_next_sibling t.doc n
308 let tagged_sibling t tag = (); fun n -> tree_tagged_sibling t.doc n tag
310 let select_sibling t = fun ts ->
311 let v = (ptset_to_vector ts) in ();
312 fun n -> tree_select_foll_sibling t.doc n v
314 let next_sibling_ctx t = (); fun n _ -> tree_next_sibling t.doc n
315 let tagged_sibling_ctx t tag = (); fun n _ -> tree_tagged_sibling t.doc n tag
317 let select_sibling_ctx t = fun ts ->
318 let v = (ptset_to_vector ts) in ();
319 fun n _ -> tree_select_foll_sibling t.doc n v
321 let id t n = tree_node_xml_id t.doc n
323 let tag t n = if n == nil then Tag.nullt else tree_tag_id t.doc n
325 let tagged_desc t tag = (); fun n -> tree_tagged_desc t.doc n tag
327 let select_desc t = fun ts ->
328 let v = (ptset_to_vector ts) in ();
329 fun n -> tree_select_desc t.doc n v
331 let tagged_foll_ctx t tag = (); fun n ctx -> tree_tagged_foll_below t.doc n tag ctx
333 let select_foll_ctx t = fun ts ->
334 let v = (ptset_to_vector ts) in ();
335 fun n ctx -> tree_select_foll_below t.doc n v ctx
338 let array_find a i j =
339 let l = Array.length a in
340 let rec loop idx x y =
341 if x > y || idx >= l then nil
343 if a.(idx) >= x then if a.(idx) > y then nil else (last_idx := idx;a.(idx))
344 else loop (idx+1) x y
346 if a.(0) > j || a.(l-1) < i then nil
347 else loop !last_idx i j
351 let count t s = text_count t.doc s
353 let print_xml_fast outc tree t =
354 let rec loop ?(print_right=true) t =
357 let tagid = tree_tag_id tree.doc t in
359 then output_string outc (text_get_cached_text tree.doc t);
361 then loop (next_sibling tree t)
364 let tagstr = Tag.to_string tagid in
365 let l = first_child tree t
366 and r = next_sibling tree t
368 output_char outc '<';
369 output_string outc tagstr;
370 if l == nil then output_string outc "/>"
372 if (tag tree l) == Tag.attribute then
374 loop_attributes (first_child tree l);
375 if (next_sibling tree l) == nil then output_string outc "/>"
378 output_char outc '>';
379 loop (next_sibling tree l);
380 output_string outc "</";
381 output_string outc tagstr;
382 output_char outc '>';
387 output_char outc '>';
389 output_string outc "</";
390 output_string outc tagstr;
391 output_char outc '>';
393 if print_right then loop r
394 and loop_attributes a =
395 let s = (Tag.to_string (tag tree a)) in
396 let attname = String.sub s 3 ((String.length s) -3) in
397 output_char outc ' ';
398 output_string outc attname;
399 output_string outc "=\"";
400 output_string outc (text_get_cached_text tree.doc
401 (tree_my_text tree.doc (first_child tree a)));
402 output_char outc '"';
403 loop_attributes (next_sibling tree a)
405 loop ~print_right:false t
408 let print_xml_fast outc tree t =
409 if (tag tree t) = Tag.document_node then
410 print_xml_fast outc tree (first_child tree t)
411 else print_xml_fast outc tree t
413 let tags_below t tag =
414 fst(Hashtbl.find t.ttable tag)
416 let tags_after t tag =
417 snd(Hashtbl.find t.ttable tag)
419 let tags t tag = Hashtbl.find t.ttable tag
422 let binary_parent t n =
423 if tree_is_first_child t.doc n
424 then tree_parent t.doc n
425 else tree_prev_sibling t.doc n
427 let doc_ids t n = tree_doc_ids t.doc n
429 let subtree_tags t tag = ();
430 fun n -> if n == nil then 0 else
431 tree_subtree_tags t.doc n tag
434 let tid = tree_my_text t.doc n in
435 if tid == nil then "" else
436 text_get_cached_text t.doc tid
439 let dump_tree fmt tree =
442 let tag = (tree_tag_id tree.doc t ) in
443 let tagstr = Tag.to_string tag in
444 let tab = String.make n ' ' in
446 if tag == Tag.pcdata || tag == Tag.attribute_data
448 Format.fprintf fmt "%s<%s>%s</%s>\n"
449 tab tagstr (text_get_cached_text tree.doc (tree_my_text tree.doc t)) tagstr
451 Format.fprintf fmt "%s<%s>\n" tab tagstr;
452 loop (tree_first_child tree.doc t) (n+2);
453 Format.fprintf fmt "%s</%s>\n%!" tab tagstr;
455 loop (tree_next_sibling tree.doc t) n