1 (******************************************************************************)
2 (* SXSI : XPath evaluator *)
3 (* Kim Nguyen (Kim.Nguyen@nicta.com.au) *)
4 (* Copyright NICTA 2008 *)
5 (* Distributed under the terms of the LGPL (see LICENCE) *)
6 (******************************************************************************)
10 external init_lib : unit -> unit = "caml_init_lib"
12 exception CPlusPlusError of string
14 let () = Callback.register_exception "CPlusPlusError" (CPlusPlusError "")
20 type 'a node = private int
21 type node_kind = [`Text | `Tree ]
23 external inode : 'a node -> int = "%identity"
24 external nodei : int -> 'a node = "%identity"
25 let compare_node x y = (inode x) - (inode y)
26 let equal_node : 'a node -> 'a node -> bool = (==)
29 external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
30 external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
32 external tree_save : tree -> Unix.file_descr -> unit = "caml_xml_tree_save"
33 external tree_load : Unix.file_descr -> tree = "caml_xml_tree_load"
35 external nullt : unit -> 'a node = "caml_xml_tree_nullt"
37 let nil : [`Tree ] node = nodei ~-1
38 let nulldoc : [`Text ] node = nodei ~-1
39 let root : [`Tree ] node = nodei 0
41 external text_get_tc_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
43 external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
45 let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n
47 external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
48 external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
49 external text_count : tree -> string -> int = "caml_text_collection_count"
50 external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
51 external text_unsorted_contains : tree -> string -> unit = "caml_text_collection_unsorted_contains"
52 external text_get_cached_text : tree -> [`Text] node -> string = "caml_text_collection_get_cached_text"
54 external tree_root : tree -> [`Tree] node = "caml_xml_tree_root"
55 external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size"
57 let tree_is_nil x = equal_node x nil
59 external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" "noalloc"
60 external tree_parent_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" "noalloc"
61 (*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *)
62 external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" "noalloc"
63 external tree_first_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_element" "noalloc"
64 external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
65 external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc"
66 external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc"
67 external tree_tagged_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_sibling" "noalloc"
69 external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc"
70 external tree_is_leaf : tree -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" "noalloc"
71 external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc"
72 external tree_is_first_child : tree -> [`Tree] node -> bool = "caml_xml_tree_is_first_child" "noalloc"
75 external tree_tag_id : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" "noalloc"
78 let tree_is_last t n = equal_node nil (tree_next_sibling t n)
80 (*external tree_prev_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" "noalloc" *)
82 external tree_my_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" "noalloc"
83 (*external tree_next_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" "noalloc" *)
84 external tree_doc_ids : tree -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids"
86 let text_size tree = inode (snd ( tree_doc_ids tree root ))
88 let text_get_cached_text t (x:[`Text] node) =
89 if x == nulldoc then ""
91 text_get_cached_text t x
94 external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc"
95 external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc"
96 external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc"
97 external tree_tagged_desc : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" "noalloc"
98 external tree_tagged_foll_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below" "noalloc"
99 external tree_subtree_tags : tree -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" "noalloc"
103 external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
104 external unordered_set_length : unordered_set -> int = "caml_unordered_set_length"
105 external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc"
107 external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc"
108 external tree_select_foll_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_foll_sibling" "noalloc"
109 external tree_select_desc : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_desc" "noalloc"
110 external tree_select_foll_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_foll_below" "noalloc"
113 module HPtset = Hashtbl.Make(Ptset.Int)
115 let vector_htbl = HPtset.create MED_H_SIZE
117 let ptset_to_vector s =
119 HPtset.find vector_htbl s
122 let v = unordered_set_alloc (Ptset.Int.cardinal s) in
123 let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in
124 HPtset.add vector_htbl s v; v
129 ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
131 let subtree_size t i = tree_subtree_size t.doc i
132 let text_size t = text_size t.doc
134 module MemUnion = Hashtbl.Make (struct
135 type t = Ptset.Int.t*Ptset.Int.t
136 let equal (x,y) (z,t) = (Ptset.Int.equal x z)&&(Ptset.Int.equal y t)
137 let equal a b = equal a b || equal b a
138 let hash (x,y) = (* commutative hash *)
139 let x = Ptset.Int.hash x
140 and y = Ptset.Int.hash y
142 if x < y then HASHINT2(x,y) else HASHINT2(y,x)
145 module MemAdd = Hashtbl.Make (
147 type t = Tag.t*Ptset.Int.t
148 let equal (x,y) (z,t) = (x == z)&&(Ptset.Int.equal y t)
149 let hash (x,y) = HASHINT2(x,Ptset.Int.hash y)
152 let collect_tags tree =
153 let h_union = MemUnion.create BIG_H_SIZE in
156 MemUnion.find h_union (s1,s2)
158 | Not_found -> let s = Ptset.Int.union s1 s2
160 MemUnion.add h_union (s1,s2) s;s
162 let h_add = MemAdd.create BIG_H_SIZE in
164 try MemAdd.find h_add (t,s)
166 | Not_found -> let r = Ptset.Int.add t s in
167 MemAdd.add h_add (t,s) r;r
169 let h = Hashtbl.create BIG_H_SIZE in
170 let update t sc sb ss sa =
171 let schild,sbelow,ssibling,safter =
176 (Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty)
179 (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
181 let rec loop_right id acc_sibling acc_after=
183 then (acc_sibling,acc_after)
185 let sibling2,after2 = loop_right (tree_next_sibling tree id) acc_sibling acc_after in
186 let child1,below1 = loop_left (tree_first_child tree id) after2 in
187 let tag = tree_tag_id tree id in
188 update tag child1 below1 sibling2 after2;
189 (pt_add tag sibling2, (pt_add tag (pt_cup after2 below1)))
190 and loop_left id acc_after =
192 then (Ptset.Int.empty,Ptset.Int.empty)
194 let sibling2,after2 = loop_right (tree_next_sibling tree id) Ptset.Int.empty acc_after in
195 let child1,below1 = loop_left (tree_first_child tree id) after2 in
196 let tag = tree_tag_id tree id in
197 update tag child1 below1 sibling2 after2;
198 (pt_add tag sibling2,(pt_add tag (pt_cup after2 below1)))
200 let _ = loop_left (tree_root tree) Ptset.Int.empty in h
205 let contains_array = ref [| |]
206 let contains_index = Hashtbl.create 4096
209 Hashtbl.find contains_index i
213 let init_contains t s =
214 let a = text_contains t.doc s
216 Array.fast_sort (compare) a;
218 Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
220 let count_contains t s = text_count_contains t.doc s
221 let unsorted_contains t s = text_unsorted_contains t.doc s
223 let init_naive_contains t s =
224 let i,j = tree_doc_ids t.doc (tree_root t.doc)
226 let regexp = Str.regexp_string s in
229 let _ = Str.search_forward regexp arg 0;
233 let rec loop n acc l =
236 let s = text_get_cached_text t.doc n
239 then loop (nodei ((inode n)+1)) (n::acc) (l+1)
240 else loop (nodei ((inode n)+1)) acc l
242 let acc,l = loop i [] 0 in
243 let a = Array.create l nulldoc in
244 let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc
250 let array_find a i j =
251 let l = Array.length a in
252 let rec loop idx x y =
253 if x > y || idx >= l then nulldoc
255 if a.(idx) >= x then if a.(idx) > y then nulldoc else (last_idx := idx;a.(idx))
256 else loop (idx+1) x y
258 if a.(0) > j || a.(l-1) < i then nulldoc
259 else loop !last_idx i j
261 let text_below tree t =
262 let l = Array.length !contains_array in
263 let i,j = tree_doc_ids tree.doc t in
264 let id = if l == 0 then i else (array_find !contains_array i j) in
265 tree_parent_doc tree.doc id
267 let text_next tree t root =
268 let l = Array.length !contains_array in
269 let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
270 let _,j = tree_doc_ids tree.doc root in
271 let id = if l == 0 then if inf > j then nulldoc else inf
272 else array_find !contains_array inf j
274 tree_parent_doc tree.doc id
278 module DocIdSet = struct
279 include Set.Make (struct type t = [`Text] node
280 let compare = compare_node end)
283 let is_nil t = t == nil
285 let is_node t = t != nil
286 let is_root t = t == root
289 let _ = Tag.init (Obj.magic t) in
290 let table = collect_tags t
296 let finalize _ = Printf.eprintf "Release the string list !\n%!"
302 !Options.sample_factor
303 !Options.index_empty_texts
304 !Options.disable_text_collection)
306 let parse_xml_uri str = parse parse_xml_uri str
307 let parse_xml_string str = parse parse_xml_string str
310 external pool : tree -> Tag.pool = "%identity"
312 let magic_string = "SXSI_INDEX"
313 let version_string = "1"
316 Unix.lseek fd 0 Unix.SEEK_CUR
318 let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
321 let sl = String.length s in
322 let ssl = Printf.sprintf "%020i" sl in
323 ignore (Unix.write fd ssl 0 20);
324 ignore (Unix.write fd s 0 (String.length s))
326 let rec really_read fd buffer start length =
327 if length <= 0 then () else
328 match Unix.read fd buffer start length with
329 0 -> raise End_of_file
330 | r -> really_read fd buffer (start + r) (length - r);;
333 let buffer = String.create 20 in
334 let _ = really_read fd buffer 0 20 in
335 let size = int_of_string buffer in
336 let buffer = String.create size in
337 let _ = really_read fd buffer 0 size in
342 let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
343 let out_c = Unix.out_channel_of_descr fd in
344 let _ = set_binary_mode_out out_c true in
345 output_string out_c magic_string;
346 output_char out_c '\n';
347 output_string out_c version_string;
348 output_char out_c '\n';
349 Marshal.to_channel out_c t.ttable [ ];
350 (* we need to move the fd to the correct position *)
352 ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
357 let load ?(sample=64) str =
358 let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
359 let in_c = Unix.in_channel_of_descr fd in
360 let _ = set_binary_mode_in in_c true in
361 (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
362 (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
363 let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t =
364 Marshal.from_channel in_c
366 let ntable = Hashtbl.create (Hashtbl.length table) in
367 Hashtbl.iter (fun k (s1,s2,s3,s4) ->
368 let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty
369 and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty
370 and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty
371 and ss4 = Ptset.Int.fold (Ptset.Int.add) s4 Ptset.Int.empty
372 in Hashtbl.add ntable k (ss1,ss2,ss3,ss4)
375 (* The in_channel read a chunk of fd, so we might be after
376 the start of the XMLTree save file. Reset to the correct
378 ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
379 let tree = { doc = tree_load fd;
387 let tag_pool t = pool t.doc
389 let compare = compare_node
391 let equal a b = a == b
395 | i -> Printf.sprintf "Node (%i)" i
397 let dump_node t = nts (inode t)
399 let is_left t n = tree_is_first_child t.doc n
401 let is_below_right t n1 n2 =
402 tree_is_ancestor t.doc (tree_parent t.doc n1) n2
403 && not (tree_is_ancestor t.doc n1 n2)
405 let parent t n = tree_parent t.doc n
407 let first_child t = (); fun n -> tree_first_child t.doc n
408 let first_element t = (); fun n -> tree_first_element t.doc n
410 (* these function will be called in two times: first partial application
411 on the tag, then application of the tag and the tree, then application of
412 the other arguments. We use the trick to let the compiler optimize application
415 let tagged_child t tag = () ; fun n -> tree_tagged_child t.doc n tag
417 let select_child t = fun ts ->
418 let v = ptset_to_vector ts in ();
419 fun n -> tree_select_child t.doc n v
421 let next_sibling t = (); fun n -> tree_next_sibling t.doc n
422 let next_element t = (); fun n -> tree_next_element t.doc n
424 let tagged_sibling t tag = (); fun n -> tree_tagged_sibling t.doc n tag
426 let select_sibling t = fun ts ->
427 let v = (ptset_to_vector ts) in ();
428 fun n -> tree_select_foll_sibling t.doc n v
430 let next_sibling_ctx t = (); fun n _ -> tree_next_sibling t.doc n
431 let next_element_ctx t = (); fun n _ -> tree_next_element t.doc n
432 let tagged_sibling_ctx t tag = (); fun n _ -> tree_tagged_sibling t.doc n tag
434 let select_sibling_ctx t = fun ts ->
435 let v = (ptset_to_vector ts) in ();
436 fun n _ -> tree_select_foll_sibling t.doc n v
438 let id t n = tree_node_xml_id t.doc n
440 let tag t n = if n == nil then Tag.nullt else tree_tag_id t.doc n
442 let tagged_desc t tag = (); fun n -> tree_tagged_desc t.doc n tag
444 let select_desc t = fun ts ->
445 let v = (ptset_to_vector ts) in ();
446 fun n -> tree_select_desc t.doc n v
448 let tagged_foll_ctx t tag = (); fun n ctx -> tree_tagged_foll_below t.doc n tag ctx
450 let select_foll_ctx t = fun ts ->
451 let v = (ptset_to_vector ts) in ();
452 fun n ctx -> tree_select_foll_below t.doc n v ctx
455 let array_find a i j =
456 let l = Array.length a in
457 let rec loop idx x y =
458 if x > y || idx >= l then nil
460 if a.(idx) >= x then if a.(idx) > y then nil else (last_idx := idx;a.(idx))
461 else loop (idx+1) x y
463 if a.(0) > j || a.(l-1) < i then nil
464 else loop !last_idx i j
468 let count t s = text_count t.doc s
470 let print_xml_fast outc tree t =
471 let rec loop ?(print_right=true) t =
474 let tagid = tree_tag_id tree.doc t in
478 let tid = tree_my_text tree.doc t in
479 let _ = Printf.eprintf "my_text %i returned %i\n%!" (inode t) (inode tid)
481 output_string outc (text_get_cached_text tree.doc tid);
483 then loop (next_sibling tree t);
486 let tagstr = Tag.to_string tagid in
487 let l = first_child tree t
488 and r = next_sibling tree t
490 output_char outc '<';
491 output_string outc tagstr;
492 if l == nil then output_string outc "/>"
494 if (tag tree l) == Tag.attribute then
496 loop_attributes (first_child tree l);
497 if (next_sibling tree l) == nil then output_string outc "/>"
500 output_char outc '>';
501 loop (next_sibling tree l);
502 output_string outc "</";
503 output_string outc tagstr;
504 output_char outc '>';
509 output_char outc '>';
511 output_string outc "</";
512 output_string outc tagstr;
513 output_char outc '>';
515 if print_right then loop r
516 and loop_attributes a =
519 let s = (Tag.to_string (tag tree a)) in
520 let attname = String.sub s 3 ((String.length s) -3) in
521 let fsa = first_child tree a in
522 let tid = tree_my_text tree.doc fsa in
523 let _ = Printf.eprintf "my_text %i returned %i\n%!" (inode fsa) (inode tid)
525 output_char outc ' ';
526 output_string outc attname;
527 output_string outc "=\"";
528 output_string outc (text_get_cached_text tree.doc tid);
529 output_char outc '"';
530 loop_attributes (next_sibling tree a)
532 loop ~print_right:false t
535 let print_xml_fast outc tree t =
536 if (tag tree t) = Tag.document_node then
537 print_xml_fast outc tree (first_child tree t)
538 else print_xml_fast outc tree t
540 let tags_children t tag =
541 let a,_,_,_ = Hashtbl.find t.ttable tag in a
542 let tags_below t tag =
543 let _,a,_,_ = Hashtbl.find t.ttable tag in a
544 let tags_siblings t tag =
545 let _,_,a,_ = Hashtbl.find t.ttable tag in a
546 let tags_after t tag =
547 let _,_,_,a = Hashtbl.find t.ttable tag in a
550 let tags t tag = Hashtbl.find t.ttable tag
553 let rec binary_parent t n =
555 if tree_is_first_child t.doc n
556 then tree_parent t.doc n
557 else tree_prev_sibling t.doc n
558 in if tree_tag_id t.doc r = Tag.pcdata then
562 let doc_ids t n = tree_doc_ids t.doc n
564 let subtree_tags t tag = ();
565 fun n -> if n == nil then 0 else
566 tree_subtree_tags t.doc n tag
569 let tid = tree_my_text t.doc n in
570 if tid == nulldoc then "" else
571 text_get_cached_text t.doc tid
574 let dump_tree fmt tree =
577 let tag = (tree_tag_id tree.doc t ) in
578 let tagstr = Tag.to_string tag in
579 let tab = String.make n ' ' in
581 if tag == Tag.pcdata || tag == Tag.attribute_data
583 Format.fprintf fmt "%s<%s>%s</%s>\n"
584 tab tagstr (text_get_cached_text tree.doc (tree_my_text tree.doc t)) tagstr
586 Format.fprintf fmt "%s<%s>\n" tab tagstr;
587 loop (tree_first_child tree.doc t) (n+2);
588 Format.fprintf fmt "%s</%s>\n%!" tab tagstr;
590 loop (tree_next_sibling tree.doc t) n