1 (******************************************************************************)
2 (* SXSI : XPath evaluator *)
3 (* Kim Nguyen (Kim.Nguyen@nicta.com.au) *)
4 (* Copyright NICTA 2008 *)
5 (* Distributed under the terms of the LGPL (see LICENCE) *)
6 (******************************************************************************)
10 external init_lib : unit -> unit = "caml_init_lib"
12 exception CPlusPlusError of string
14 let () = Callback.register_exception "CPlusPlusError" (CPlusPlusError "")
20 type 'a node = private int
21 type node_kind = [`Text | `Tree ]
23 external inode : 'a node -> int = "%identity"
24 external nodei : int -> 'a node = "%identity"
25 let compare_node x y = (inode x) - (inode y)
26 let equal_node : 'a node -> 'a node -> bool = (==)
29 external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
30 external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
32 external tree_save : tree -> Unix.file_descr -> unit = "caml_xml_tree_save"
33 external tree_load : Unix.file_descr -> tree = "caml_xml_tree_load"
35 external nullt : unit -> 'a node = "caml_xml_tree_nullt"
37 let nil : [`Tree ] node = nodei ~-1
38 let nulldoc : [`Text ] node = nodei ~-1
39 let root : [`Tree ] node = nodei 0
41 external text_get_tc_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
43 external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
45 let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n
47 external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
48 external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
49 external text_count : tree -> string -> int = "caml_text_collection_count"
50 external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
51 external text_unsorted_contains : tree -> string -> unit = "caml_text_collection_unsorted_contains"
52 external text_get_cached_text : tree -> [`Text] node -> string = "caml_text_collection_get_cached_text"
54 external tree_root : tree -> [`Tree] node = "caml_xml_tree_root"
55 external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size"
57 let tree_is_nil x = equal_node x nil
59 external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" "noalloc"
60 external tree_parent_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" "noalloc"
61 (*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *)
62 external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" "noalloc"
63 external tree_first_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_element" "noalloc"
64 external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
65 external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc"
66 external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc"
67 external tree_tagged_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_sibling" "noalloc"
69 external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc"
70 external tree_is_leaf : tree -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" "noalloc"
71 external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc"
72 external tree_is_first_child : tree -> [`Tree] node -> bool = "caml_xml_tree_is_first_child" "noalloc"
75 external tree_tag_id : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" "noalloc"
78 let tree_is_last t n = equal_node nil (tree_next_sibling t n)
80 (*external tree_prev_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" "noalloc" *)
82 external tree_my_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" "noalloc"
83 (*external tree_next_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" "noalloc" *)
84 external tree_doc_ids : tree -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids"
86 let text_size tree = inode (snd ( tree_doc_ids tree root ))
88 let text_get_cached_text t (x:[`Text] node) =
89 if x == nulldoc then ""
91 text_get_cached_text t x
94 external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc"
95 external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc"
96 external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc"
97 external tree_tagged_desc : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" "noalloc"
98 external tree_tagged_foll_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below" "noalloc"
99 external tree_subtree_tags : tree -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" "noalloc"
103 external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
104 external unordered_set_length : unordered_set -> int = "caml_unordered_set_length"
105 external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc"
107 external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc"
108 external tree_select_foll_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_foll_sibling" "noalloc"
109 external tree_select_desc : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_desc" "noalloc"
110 external tree_select_foll_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_foll_below" "noalloc"
113 module HPtset = Hashtbl.Make(Ptset.Int)
115 let vector_htbl = HPtset.create MED_H_SIZE
117 let ptset_to_vector s =
119 HPtset.find vector_htbl s
122 let v = unordered_set_alloc (Ptset.Int.cardinal s) in
123 let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in
124 HPtset.add vector_htbl s v; v
129 ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
131 let subtree_size t i = tree_subtree_size t.doc i
132 let text_size t = text_size t.doc
134 module MemUnion = Hashtbl.Make (struct
135 type t = Ptset.Int.t*Ptset.Int.t
136 let equal (x,y) (z,t) = (Ptset.Int.equal x z)&&(Ptset.Int.equal y t)
137 let equal a b = equal a b || equal b a
138 let hash (x,y) = (* commutative hash *)
139 let x = Ptset.Int.hash x
140 and y = Ptset.Int.hash y
142 if x < y then HASHINT2(x,y) else HASHINT2(y,x)
145 module MemAdd = Hashtbl.Make (
147 type t = Tag.t*Ptset.Int.t
148 let equal (x,y) (z,t) = (x == z)&&(Ptset.Int.equal y t)
149 let hash (x,y) = HASHINT2(x,Ptset.Int.hash y)
152 let collect_tags tree =
153 let h_union = MemUnion.create BIG_H_SIZE in
156 MemUnion.find h_union (s1,s2)
158 | Not_found -> let s = Ptset.Int.union s1 s2
160 MemUnion.add h_union (s1,s2) s;s
162 let h_add = MemAdd.create BIG_H_SIZE in
164 try MemAdd.find h_add (t,s)
166 | Not_found -> let r = Ptset.Int.add t s in
167 MemAdd.add h_add (t,s) r;r
169 let h = Hashtbl.create BIG_H_SIZE in
170 let update t sc sb ss sa =
171 let schild,sbelow,ssibling,safter =
176 (Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty)
179 (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
181 let rec loop_right id acc_after =
183 then Ptset.Int.empty,Ptset.Int.empty,acc_after
185 let sibling2,desc2,after2 = loop_right (tree_next_sibling tree id) acc_after in
186 let child1,desc1,after1 = loop_left (tree_first_child tree id) after2 in
187 let tag = tree_tag_id tree id in
188 update tag child1 desc1 sibling2 after2;
189 ( pt_add tag sibling2,
190 pt_add tag (pt_cup desc1 desc2),
191 pt_cup after1 (pt_cup desc1 desc2) )
192 and loop_left id acc_after =
194 then Ptset.Int.empty,Ptset.Int.empty,acc_after
196 let sibling2,desc2,after2 = loop_right (tree_next_sibling tree id) acc_after in
197 let child1,desc1,after1 = loop_left (tree_first_child tree id) after2 in
198 let tag = tree_tag_id tree id in
199 update tag child1 desc1 sibling2 after2;
200 (pt_add tag sibling2,
201 pt_add tag (pt_cup desc1 desc2),
204 let _ = loop_left (tree_root tree) Ptset.Int.empty in h
209 let contains_array = ref [| |]
210 let contains_index = Hashtbl.create 4096
213 Hashtbl.find contains_index i
217 let init_contains t s =
218 let a = text_contains t.doc s
220 Array.fast_sort (compare) a;
222 Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
224 let count_contains t s = text_count_contains t.doc s
225 let unsorted_contains t s = text_unsorted_contains t.doc s
227 let init_naive_contains t s =
228 let i,j = tree_doc_ids t.doc (tree_root t.doc)
230 let regexp = Str.regexp_string s in
233 let _ = Str.search_forward regexp arg 0;
237 let rec loop n acc l =
240 let s = text_get_cached_text t.doc n
243 then loop (nodei ((inode n)+1)) (n::acc) (l+1)
244 else loop (nodei ((inode n)+1)) acc l
246 let acc,l = loop i [] 0 in
247 let a = Array.create l nulldoc in
248 let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc
254 let array_find a i j =
255 let l = Array.length a in
256 let rec loop idx x y =
257 if x > y || idx >= l then nulldoc
259 if a.(idx) >= x then if a.(idx) > y then nulldoc else (last_idx := idx;a.(idx))
260 else loop (idx+1) x y
262 if a.(0) > j || a.(l-1) < i then nulldoc
263 else loop !last_idx i j
265 let text_below tree t =
266 let l = Array.length !contains_array in
267 let i,j = tree_doc_ids tree.doc t in
268 let id = if l == 0 then i else (array_find !contains_array i j) in
269 tree_parent_doc tree.doc id
271 let text_next tree t root =
272 let l = Array.length !contains_array in
273 let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
274 let _,j = tree_doc_ids tree.doc root in
275 let id = if l == 0 then if inf > j then nulldoc else inf
276 else array_find !contains_array inf j
278 tree_parent_doc tree.doc id
282 module DocIdSet = struct
283 include Set.Make (struct type t = [`Text] node
284 let compare = compare_node end)
287 let is_nil t = t == nil
289 let is_node t = t != nil
290 let is_root t = t == root
293 let _ = Tag.init (Obj.magic t) in
294 let table = collect_tags t
296 let _ = Hashtbl.iter (fun t (c,d,ns,f) ->
297 Printf.eprintf "Tag %s has:\n" (Tag.to_string t);
298 Printf.eprintf "Child tags: ";
299 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) c;
300 Printf.eprintf "\nDescendant tags: ";
301 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) d;
302 Printf.eprintf "\nNextSibling tags: ";
303 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) ns;
304 Printf.eprintf "\nFollowing tags: ";
305 Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) f;
306 Printf.eprintf "\n\n%!";) table
314 let finalize _ = Printf.eprintf "Release the string list !\n%!"
320 !Options.sample_factor
321 !Options.index_empty_texts
322 !Options.disable_text_collection)
324 let parse_xml_uri str = parse parse_xml_uri str
325 let parse_xml_string str = parse parse_xml_string str
328 external pool : tree -> Tag.pool = "%identity"
330 let magic_string = "SXSI_INDEX"
331 let version_string = "2"
334 Unix.lseek fd 0 Unix.SEEK_CUR
336 let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
339 let sl = String.length s in
340 let ssl = Printf.sprintf "%020i" sl in
341 ignore (Unix.write fd ssl 0 20);
342 ignore (Unix.write fd s 0 (String.length s))
344 let rec really_read fd buffer start length =
345 if length <= 0 then () else
346 match Unix.read fd buffer start length with
347 0 -> raise End_of_file
348 | r -> really_read fd buffer (start + r) (length - r);;
351 let buffer = String.create 20 in
352 let _ = really_read fd buffer 0 20 in
353 let size = int_of_string buffer in
354 let buffer = String.create size in
355 let _ = really_read fd buffer 0 size in
360 let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
361 let out_c = Unix.out_channel_of_descr fd in
362 let _ = set_binary_mode_out out_c true in
363 output_string out_c magic_string;
364 output_char out_c '\n';
365 output_string out_c version_string;
366 output_char out_c '\n';
367 Marshal.to_channel out_c t.ttable [ ];
368 (* we need to move the fd to the correct position *)
370 ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
375 let load ?(sample=64) str =
376 let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
377 let in_c = Unix.in_channel_of_descr fd in
378 let _ = set_binary_mode_in in_c true in
380 (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
381 (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
382 let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t =
383 Marshal.from_channel in_c
385 let ntable = Hashtbl.create (Hashtbl.length table) in
386 Hashtbl.iter (fun k (s1,s2,s3,s4) ->
387 let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty
388 and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty
389 and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty
390 and ss4 = Ptset.Int.fold (Ptset.Int.add) s4 Ptset.Int.empty
391 in Hashtbl.add ntable k (ss1,ss2,ss3,ss4)
394 (* The in_channel read a chunk of fd, so we might be after
395 the start of the XMLTree save file. Reset to the correct
399 let _ = Printf.eprintf "\nLoading tag table : " in
400 let ntable = time (load_table) () in
401 ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
402 let tree = { doc = tree_load fd;
410 let tag_pool t = pool t.doc
412 let compare = compare_node
414 let equal a b = a == b
418 | i -> Printf.sprintf "Node (%i)" i
420 let dump_node t = nts (inode t)
422 let is_left t n = tree_is_first_child t.doc n
424 let is_below_right t n1 n2 =
425 tree_is_ancestor t.doc (tree_parent t.doc n1) n2
426 && not (tree_is_ancestor t.doc n1 n2)
428 let parent t n = tree_parent t.doc n
430 let first_child t = (); fun n -> tree_first_child t.doc n
431 let first_element t = (); fun n -> tree_first_element t.doc n
433 (* these function will be called in two times: first partial application
434 on the tag, then application of the tag and the tree, then application of
435 the other arguments. We use the trick to let the compiler optimize application
438 let tagged_child t tag = () ; fun n -> tree_tagged_child t.doc n tag
440 let select_child t = fun ts ->
441 let v = ptset_to_vector ts in ();
442 fun n -> tree_select_child t.doc n v
444 let next_sibling t = (); fun n -> tree_next_sibling t.doc n
445 let next_element t = (); fun n -> tree_next_element t.doc n
447 let tagged_sibling t tag = (); fun n -> tree_tagged_sibling t.doc n tag
449 let select_sibling t = fun ts ->
450 let v = (ptset_to_vector ts) in ();
451 fun n -> tree_select_foll_sibling t.doc n v
453 let next_sibling_ctx t = (); fun n _ -> tree_next_sibling t.doc n
454 let next_element_ctx t = (); fun n _ -> tree_next_element t.doc n
455 let tagged_sibling_ctx t tag = (); fun n _ -> tree_tagged_sibling t.doc n tag
457 let select_sibling_ctx t = fun ts ->
458 let v = (ptset_to_vector ts) in ();
459 fun n _ -> tree_select_foll_sibling t.doc n v
461 let id t n = tree_node_xml_id t.doc n
463 let tag t n = if n == nil then Tag.nullt else tree_tag_id t.doc n
465 let tagged_desc t tag = (); fun n -> tree_tagged_desc t.doc n tag
467 let select_desc t = fun ts ->
468 let v = (ptset_to_vector ts) in ();
469 fun n -> tree_select_desc t.doc n v
471 let tagged_foll_ctx t tag = (); fun n ctx -> tree_tagged_foll_below t.doc n tag ctx
473 let select_foll_ctx t = fun ts ->
474 let v = (ptset_to_vector ts) in ();
475 fun n ctx -> tree_select_foll_below t.doc n v ctx
478 let array_find a i j =
479 let l = Array.length a in
480 let rec loop idx x y =
481 if x > y || idx >= l then nil
483 if a.(idx) >= x then if a.(idx) > y then nil else (last_idx := idx;a.(idx))
484 else loop (idx+1) x y
486 if a.(0) > j || a.(l-1) < i then nil
487 else loop !last_idx i j
491 let count t s = text_count t.doc s
493 let print_xml_fast outc tree t =
494 let rec loop ?(print_right=true) t =
497 let tagid = tree_tag_id tree.doc t in
501 let tid = tree_my_text tree.doc t in
502 output_string outc (text_get_cached_text tree.doc tid);
504 then loop (next_sibling tree t);
507 let tagstr = Tag.to_string tagid in
508 let l = first_child tree t
509 and r = next_sibling tree t
511 output_char outc '<';
512 output_string outc tagstr;
513 if l == nil then output_string outc "/>"
515 if (tag tree l) == Tag.attribute then
517 loop_attributes (first_child tree l);
518 if (next_sibling tree l) == nil then output_string outc "/>"
521 output_char outc '>';
522 loop (next_sibling tree l);
523 output_string outc "</";
524 output_string outc tagstr;
525 output_char outc '>';
530 output_char outc '>';
532 output_string outc "</";
533 output_string outc tagstr;
534 output_char outc '>';
536 if print_right then loop r
537 and loop_attributes a =
540 let s = (Tag.to_string (tag tree a)) in
541 let attname = String.sub s 3 ((String.length s) -3) in
542 let fsa = first_child tree a in
543 let tid = tree_my_text tree.doc fsa in
544 output_char outc ' ';
545 output_string outc attname;
546 output_string outc "=\"";
547 output_string outc (text_get_cached_text tree.doc tid);
548 output_char outc '"';
549 loop_attributes (next_sibling tree a)
551 loop ~print_right:false t
554 let print_xml_fast outc tree t =
555 if (tag tree t) = Tag.document_node then
556 print_xml_fast outc tree (first_child tree t)
557 else print_xml_fast outc tree t
559 let tags_children t tag =
560 let a,_,_,_ = Hashtbl.find t.ttable tag in a
561 let tags_below t tag =
562 let _,a,_,_ = Hashtbl.find t.ttable tag in a
563 let tags_siblings t tag =
564 let _,_,a,_ = Hashtbl.find t.ttable tag in a
565 let tags_after t tag =
566 let _,_,_,a = Hashtbl.find t.ttable tag in a
569 let tags t tag = Hashtbl.find t.ttable tag
572 let rec binary_parent t n =
574 if tree_is_first_child t.doc n
575 then tree_parent t.doc n
576 else tree_prev_sibling t.doc n
577 in if tree_tag_id t.doc r = Tag.pcdata then
581 let doc_ids t n = tree_doc_ids t.doc n
583 let subtree_tags t tag = ();
584 fun n -> if n == nil then 0 else
585 tree_subtree_tags t.doc n tag
588 let tid = tree_my_text t.doc n in
589 if tid == nulldoc then "" else
590 text_get_cached_text t.doc tid
593 let dump_tree fmt tree =
596 let tag = (tree_tag_id tree.doc t ) in
597 let tagstr = Tag.to_string tag in
598 let tab = String.make n ' ' in
600 if tag == Tag.pcdata || tag == Tag.attribute_data
602 Format.fprintf fmt "%s<%s>%s</%s>\n"
603 tab tagstr (text_get_cached_text tree.doc (tree_my_text tree.doc t)) tagstr
605 Format.fprintf fmt "%s<%s>\n" tab tagstr;
606 loop (tree_first_child tree.doc t) (n+2);
607 Format.fprintf fmt "%s</%s>\n%!" tab tagstr;
609 loop (tree_next_sibling tree.doc t) n