1 (******************************************************************************)
2 (* SXSI : XPath evaluator *)
3 (* Kim Nguyen (Kim.Nguyen@nicta.com.au) *)
4 (* Copyright NICTA 2008 *)
5 (* Distributed under the terms of the LGPL (see LICENCE) *)
6 (******************************************************************************)
10 external init_lib : unit -> unit = "caml_init_lib"
12 exception CPlusPlusError of string
14 let () = Callback.register_exception "CPlusPlusError" (CPlusPlusError "")
20 type 'a node = private int
21 type node_kind = [`Text | `Tree ]
23 external inode : 'a node -> int = "%identity"
24 external nodei : int -> 'a node = "%identity"
25 let compare_node x y = (inode x) - (inode y)
26 let equal_node : 'a node -> 'a node -> bool = (==)
29 external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
30 external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
32 external tree_save : tree -> Unix.file_descr -> unit = "caml_xml_tree_save"
33 external tree_load : Unix.file_descr -> tree = "caml_xml_tree_load"
35 external nullt : unit -> 'a node = "caml_xml_tree_nullt"
37 let nil : [`Tree ] node = nodei ~-1
38 let nulldoc : [`Text ] node = nodei ~-1
39 let root : [`Tree ] node = nodei 0
41 external text_get_tc_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
43 external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
45 let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n
47 external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
48 external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
49 external text_count : tree -> string -> int = "caml_text_collection_count"
50 external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
51 external text_unsorted_contains : tree -> string -> unit = "caml_text_collection_unsorted_contains"
52 external text_get_cached_text : tree -> [`Text] node -> string = "caml_text_collection_get_cached_text"
54 external tree_root : tree -> [`Tree] node = "caml_xml_tree_root"
55 external tree_subtree_size : tree -> [`Tree] node -> int = "caml_xml_tree_subtree_size"
57 let tree_is_nil x = equal_node x nil
59 external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" "noalloc"
60 external tree_parent_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" "noalloc"
61 (*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *)
62 external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" "noalloc"
63 external tree_first_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_element" "noalloc"
64 external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
65 external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc"
66 external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc"
67 external tree_tagged_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_sibling" "noalloc"
69 external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc"
70 external tree_is_leaf : tree -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" "noalloc"
71 external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc"
72 external tree_is_first_child : tree -> [`Tree] node -> bool = "caml_xml_tree_is_first_child" "noalloc"
75 external tree_tag_id : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" "noalloc"
78 let tree_is_last t n = equal_node nil (tree_next_sibling t n)
80 (*external tree_prev_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" "noalloc" *)
82 external tree_my_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" "noalloc"
83 (*external tree_next_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" "noalloc" *)
84 external tree_doc_ids : tree -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids"
86 let text_size tree = inode (snd ( tree_doc_ids tree root ))
88 let text_get_cached_text t (x:[`Text] node) =
89 if x == nulldoc then ""
91 text_get_cached_text t x
94 external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc"
95 external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc"
96 external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc"
97 external tree_tagged_desc : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" "noalloc"
98 external tree_tagged_foll_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below" "noalloc"
99 external tree_subtree_tags : tree -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" "noalloc"
103 external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
104 external unordered_set_length : unordered_set -> int = "caml_unordered_set_length"
105 external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc"
107 external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc"
108 external tree_select_foll_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_foll_sibling" "noalloc"
109 external tree_select_desc : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_desc" "noalloc"
110 external tree_select_foll_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_foll_below" "noalloc"
113 module HPtset = Hashtbl.Make(Ptset.Int)
115 let vector_htbl = HPtset.create MED_H_SIZE
117 let ptset_to_vector s =
119 HPtset.find vector_htbl s
122 let v = unordered_set_alloc (Ptset.Int.cardinal s) in
123 let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in
124 HPtset.add vector_htbl s v; v
129 ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
131 let subtree_size t i = tree_subtree_size t.doc i
132 let text_size t = text_size t.doc
134 module MemUnion = Hashtbl.Make (struct
135 type t = Ptset.Int.t*Ptset.Int.t
136 let equal (x,y) (z,t) = (Ptset.Int.equal x z)&&(Ptset.Int.equal y t)
137 let equal a b = equal a b || equal b a
138 let hash (x,y) = (* commutative hash *)
139 let x = Ptset.Int.hash x
140 and y = Ptset.Int.hash y
142 if x < y then HASHINT2(x,y) else HASHINT2(y,x)
145 module MemAdd = Hashtbl.Make (
147 type t = Tag.t*Ptset.Int.t
148 let equal (x,y) (z,t) = (x == z)&&(Ptset.Int.equal y t)
149 let hash (x,y) = HASHINT2(x,Ptset.Int.hash y)
152 let collect_tags tree =
153 let h_union = MemUnion.create BIG_H_SIZE in
156 MemUnion.find h_union (s1,s2)
158 | Not_found -> let s = Ptset.Int.union s1 s2
160 MemUnion.add h_union (s1,s2) s;s
162 let h_add = MemAdd.create BIG_H_SIZE in
164 try MemAdd.find h_add (t,s)
166 | Not_found -> let r = Ptset.Int.add t s in
167 MemAdd.add h_add (t,s) r;r
169 let h = Hashtbl.create BIG_H_SIZE in
170 let update t sc sb ss sa =
171 let schild,sbelow,ssibling,safter =
176 (Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty)
179 (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
181 let rec loop_right id acc_sibling acc_after=
183 then (acc_sibling,acc_after)
185 let sibling2,after2 = loop_right (tree_next_sibling tree id) acc_sibling acc_after in
186 let child1,below1 = loop_left (tree_first_child tree id) after2 in
187 let tag = tree_tag_id tree id in
188 update tag child1 below1 sibling2 after2;
189 (pt_add tag sibling2, (pt_add tag (pt_cup after2 below1)))
190 and loop_left id acc_after =
192 then (Ptset.Int.empty,Ptset.Int.empty)
194 let sibling2,after2 = loop_right (tree_next_sibling tree id) Ptset.Int.empty acc_after in
195 let child1,below1 = loop_left (tree_first_child tree id) after2 in
196 let tag = tree_tag_id tree id in
197 update tag child1 below1 sibling2 after2;
198 (pt_add tag sibling2,(pt_add tag (pt_cup after2 below1)))
200 let _ = loop_left (tree_root tree) Ptset.Int.empty in h
205 let contains_array = ref [| |]
206 let contains_index = Hashtbl.create 4096
209 Hashtbl.find contains_index i
213 let init_contains t s =
214 let a = text_contains t.doc s
216 Array.fast_sort (compare) a;
218 Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
220 let count_contains t s = text_count_contains t.doc s
221 let unsorted_contains t s = text_unsorted_contains t.doc s
223 let init_naive_contains t s =
224 let i,j = tree_doc_ids t.doc (tree_root t.doc)
226 let regexp = Str.regexp_string s in
229 let _ = Str.search_forward regexp arg 0;
233 let rec loop n acc l =
236 let s = text_get_cached_text t.doc n
239 then loop (nodei ((inode n)+1)) (n::acc) (l+1)
240 else loop (nodei ((inode n)+1)) acc l
242 let acc,l = loop i [] 0 in
243 let a = Array.create l nulldoc in
244 let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc
250 let array_find a i j =
251 let l = Array.length a in
252 let rec loop idx x y =
253 if x > y || idx >= l then nulldoc
255 if a.(idx) >= x then if a.(idx) > y then nulldoc else (last_idx := idx;a.(idx))
256 else loop (idx+1) x y
258 if a.(0) > j || a.(l-1) < i then nulldoc
259 else loop !last_idx i j
261 let text_below tree t =
262 let l = Array.length !contains_array in
263 let i,j = tree_doc_ids tree.doc t in
264 let id = if l == 0 then i else (array_find !contains_array i j) in
265 tree_parent_doc tree.doc id
267 let text_next tree t root =
268 let l = Array.length !contains_array in
269 let inf = nodei((inode(snd(tree_doc_ids tree.doc t)))+1) in
270 let _,j = tree_doc_ids tree.doc root in
271 let id = if l == 0 then if inf > j then nulldoc else inf
272 else array_find !contains_array inf j
274 tree_parent_doc tree.doc id
278 module DocIdSet = struct
279 include Set.Make (struct type t = [`Text] node
280 let compare = compare_node end)
283 let is_nil t = t == nil
285 let is_node t = t != nil
286 let is_root t = t == root
289 let _ = Tag.init (Obj.magic t) in
290 let table = collect_tags t
296 let finalize _ = Printf.eprintf "Release the string list !\n%!"
302 !Options.sample_factor
303 !Options.index_empty_texts
304 !Options.disable_text_collection)
306 let parse_xml_uri str = parse parse_xml_uri str
307 let parse_xml_string str = parse parse_xml_string str
310 external pool : tree -> Tag.pool = "%identity"
312 let magic_string = "SXSI_INDEX"
313 let version_string = "1"
316 Unix.lseek fd 0 Unix.SEEK_CUR
318 let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
321 let sl = String.length s in
322 let ssl = Printf.sprintf "%020i" sl in
323 ignore (Unix.write fd ssl 0 20);
324 ignore (Unix.write fd s 0 (String.length s))
326 let rec really_read fd buffer start length =
327 if length <= 0 then () else
328 match Unix.read fd buffer start length with
329 0 -> raise End_of_file
330 | r -> really_read fd buffer (start + r) (length - r);;
333 let buffer = String.create 20 in
334 let _ = really_read fd buffer 0 20 in
335 let size = int_of_string buffer in
336 let buffer = String.create size in
337 let _ = really_read fd buffer 0 size in
342 let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
343 let out_c = Unix.out_channel_of_descr fd in
344 let _ = set_binary_mode_out out_c true in
345 output_string out_c magic_string;
346 output_char out_c '\n';
347 output_string out_c version_string;
348 output_char out_c '\n';
349 Marshal.to_channel out_c t.ttable [ ];
350 (* we need to move the fd to the correct position *)
352 ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
357 let load ?(sample=64) str =
358 let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
359 let in_c = Unix.in_channel_of_descr fd in
360 let _ = set_binary_mode_in in_c true in
362 (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
363 (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
364 let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t =
365 Marshal.from_channel in_c
367 let ntable = Hashtbl.create (Hashtbl.length table) in
368 Hashtbl.iter (fun k (s1,s2,s3,s4) ->
369 let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty
370 and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty
371 and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty
372 and ss4 = Ptset.Int.fold (Ptset.Int.add) s4 Ptset.Int.empty
373 in Hashtbl.add ntable k (ss1,ss2,ss3,ss4)
376 (* The in_channel read a chunk of fd, so we might be after
377 the start of the XMLTree save file. Reset to the correct
381 let _ = Printf.eprintf "\nLoading tag table : " in
382 let ntable = time (load_table) () in
383 ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
384 let tree = { doc = tree_load fd;
392 let tag_pool t = pool t.doc
394 let compare = compare_node
396 let equal a b = a == b
400 | i -> Printf.sprintf "Node (%i)" i
402 let dump_node t = nts (inode t)
404 let is_left t n = tree_is_first_child t.doc n
406 let is_below_right t n1 n2 =
407 tree_is_ancestor t.doc (tree_parent t.doc n1) n2
408 && not (tree_is_ancestor t.doc n1 n2)
410 let parent t n = tree_parent t.doc n
412 let first_child t = (); fun n -> tree_first_child t.doc n
413 let first_element t = (); fun n -> tree_first_element t.doc n
415 (* these function will be called in two times: first partial application
416 on the tag, then application of the tag and the tree, then application of
417 the other arguments. We use the trick to let the compiler optimize application
420 let tagged_child t tag = () ; fun n -> tree_tagged_child t.doc n tag
422 let select_child t = fun ts ->
423 let v = ptset_to_vector ts in ();
424 fun n -> tree_select_child t.doc n v
426 let next_sibling t = (); fun n -> tree_next_sibling t.doc n
427 let next_element t = (); fun n -> tree_next_element t.doc n
429 let tagged_sibling t tag = (); fun n -> tree_tagged_sibling t.doc n tag
431 let select_sibling t = fun ts ->
432 let v = (ptset_to_vector ts) in ();
433 fun n -> tree_select_foll_sibling t.doc n v
435 let next_sibling_ctx t = (); fun n _ -> tree_next_sibling t.doc n
436 let next_element_ctx t = (); fun n _ -> tree_next_element t.doc n
437 let tagged_sibling_ctx t tag = (); fun n _ -> tree_tagged_sibling t.doc n tag
439 let select_sibling_ctx t = fun ts ->
440 let v = (ptset_to_vector ts) in ();
441 fun n _ -> tree_select_foll_sibling t.doc n v
443 let id t n = tree_node_xml_id t.doc n
445 let tag t n = if n == nil then Tag.nullt else tree_tag_id t.doc n
447 let tagged_desc t tag = (); fun n -> tree_tagged_desc t.doc n tag
449 let select_desc t = fun ts ->
450 let v = (ptset_to_vector ts) in ();
451 fun n -> tree_select_desc t.doc n v
453 let tagged_foll_ctx t tag = (); fun n ctx -> tree_tagged_foll_below t.doc n tag ctx
455 let select_foll_ctx t = fun ts ->
456 let v = (ptset_to_vector ts) in ();
457 fun n ctx -> tree_select_foll_below t.doc n v ctx
460 let array_find a i j =
461 let l = Array.length a in
462 let rec loop idx x y =
463 if x > y || idx >= l then nil
465 if a.(idx) >= x then if a.(idx) > y then nil else (last_idx := idx;a.(idx))
466 else loop (idx+1) x y
468 if a.(0) > j || a.(l-1) < i then nil
469 else loop !last_idx i j
473 let count t s = text_count t.doc s
475 let print_xml_fast outc tree t =
476 let rec loop ?(print_right=true) t =
479 let tagid = tree_tag_id tree.doc t in
483 let tid = tree_my_text tree.doc t in
484 let _ = Printf.eprintf "my_text %i returned %i\n%!" (inode t) (inode tid)
486 output_string outc (text_get_cached_text tree.doc tid);
488 then loop (next_sibling tree t);
491 let tagstr = Tag.to_string tagid in
492 let l = first_child tree t
493 and r = next_sibling tree t
495 output_char outc '<';
496 output_string outc tagstr;
497 if l == nil then output_string outc "/>"
499 if (tag tree l) == Tag.attribute then
501 loop_attributes (first_child tree l);
502 if (next_sibling tree l) == nil then output_string outc "/>"
505 output_char outc '>';
506 loop (next_sibling tree l);
507 output_string outc "</";
508 output_string outc tagstr;
509 output_char outc '>';
514 output_char outc '>';
516 output_string outc "</";
517 output_string outc tagstr;
518 output_char outc '>';
520 if print_right then loop r
521 and loop_attributes a =
524 let s = (Tag.to_string (tag tree a)) in
525 let attname = String.sub s 3 ((String.length s) -3) in
526 let fsa = first_child tree a in
527 let tid = tree_my_text tree.doc fsa in
528 let _ = Printf.eprintf "my_text %i returned %i\n%!" (inode fsa) (inode tid)
530 output_char outc ' ';
531 output_string outc attname;
532 output_string outc "=\"";
533 output_string outc (text_get_cached_text tree.doc tid);
534 output_char outc '"';
535 loop_attributes (next_sibling tree a)
537 loop ~print_right:false t
540 let print_xml_fast outc tree t =
541 if (tag tree t) = Tag.document_node then
542 print_xml_fast outc tree (first_child tree t)
543 else print_xml_fast outc tree t
545 let tags_children t tag =
546 let a,_,_,_ = Hashtbl.find t.ttable tag in a
547 let tags_below t tag =
548 let _,a,_,_ = Hashtbl.find t.ttable tag in a
549 let tags_siblings t tag =
550 let _,_,a,_ = Hashtbl.find t.ttable tag in a
551 let tags_after t tag =
552 let _,_,_,a = Hashtbl.find t.ttable tag in a
555 let tags t tag = Hashtbl.find t.ttable tag
558 let rec binary_parent t n =
560 if tree_is_first_child t.doc n
561 then tree_parent t.doc n
562 else tree_prev_sibling t.doc n
563 in if tree_tag_id t.doc r = Tag.pcdata then
567 let doc_ids t n = tree_doc_ids t.doc n
569 let subtree_tags t tag = ();
570 fun n -> if n == nil then 0 else
571 tree_subtree_tags t.doc n tag
574 let tid = tree_my_text t.doc n in
575 if tid == nulldoc then "" else
576 text_get_cached_text t.doc tid
579 let dump_tree fmt tree =
582 let tag = (tree_tag_id tree.doc t ) in
583 let tagstr = Tag.to_string tag in
584 let tab = String.make n ' ' in
586 if tag == Tag.pcdata || tag == Tag.attribute_data
588 Format.fprintf fmt "%s<%s>%s</%s>\n"
589 tab tagstr (text_get_cached_text tree.doc (tree_my_text tree.doc t)) tagstr
591 Format.fprintf fmt "%s<%s>\n" tab tagstr;
592 loop (tree_first_child tree.doc t) (n+2);
593 Format.fprintf fmt "%s</%s>\n%!" tab tagstr;
595 loop (tree_next_sibling tree.doc t) n