1 (******************************************************************************)
2 (* SXSI : XPath evaluator *)
3 (* Kim Nguyen (Kim.Nguyen@nicta.com.au) *)
4 (* Copyright NICTA 2008 *)
5 (* Distributed under the terms of the LGPL (see LICENCE) *)
6 (******************************************************************************)
10 external init_lib : unit -> unit = "caml_init_lib"
12 exception CPlusPlusError of string
14 let () = Callback.register_exception "CPlusPlusError" (CPlusPlusError "")
20 type 'a node = private int
21 type node_kind = [`Text | `Tree ]
23 external inode : 'a node -> int = "%identity"
24 external nodei : int -> 'a node = "%identity"
25 let compare_node x y = (inode x) - (inode y)
26 let equal_node : 'a node -> 'a node -> bool = (==)
29 external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri"
30 external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
32 external tree_save : tree -> Unix.file_descr -> unit = "caml_xml_tree_save"
33 external tree_load : Unix.file_descr -> tree = "caml_xml_tree_load"
35 external nullt : unit -> 'a node = "caml_xml_tree_nullt"
37 let nil : [`Tree ] node = nodei ~-1
38 let nulldoc : [`Text ] node = nodei ~-1
39 let root : [`Tree ] node = nodei 0
41 external text_get_tc_text : tree -> [`Text] node -> string = "caml_text_collection_get_text"
43 external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text"
45 let text_is_empty t n = (equal_node nulldoc n) || text_is_empty t n
47 external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains"
48 external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
49 external text_count : tree -> string -> int = "caml_text_collection_count"
50 external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
51 external text_unsorted_contains : tree -> string -> unit = "caml_text_collection_unsorted_contains"
52 external text_get_cached_text : tree -> [`Text] node -> string = "caml_text_collection_get_cached_text"
54 external tree_root : tree -> [`Tree] node = "caml_xml_tree_root"
56 let tree_is_nil x = equal_node x nil
58 external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" "noalloc"
59 external tree_parent_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" "noalloc"
60 (*external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" "noalloc" *)
61 external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" "noalloc"
62 external tree_first_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_element" "noalloc"
63 external tree_tagged_child : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_child" "noalloc"
64 external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" "noalloc"
65 external tree_next_element : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_element" "noalloc"
66 external tree_tagged_sibling : tree -> [`Tree] node -> Tag.t -> [`Tree] node = "caml_xml_tree_tagged_sibling" "noalloc"
68 external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" "noalloc"
69 external tree_is_leaf : tree -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" "noalloc"
70 external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" "noalloc"
71 external tree_is_first_child : tree -> [`Tree] node -> bool = "caml_xml_tree_is_first_child" "noalloc"
74 external tree_tag_id : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" "noalloc"
77 let tree_is_last t n = equal_node nil (tree_next_sibling t n)
79 (*external tree_prev_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" "noalloc" *)
81 external tree_my_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" "noalloc"
82 (*external tree_next_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" "noalloc" *)
83 external tree_doc_ids : tree -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids"
85 let text_size tree = inode (snd ( tree_doc_ids tree root ))
87 let text_get_cached_text t (x:[`Text] node) =
88 if x == nulldoc then ""
90 text_get_cached_text t x
93 external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" "noalloc"
94 external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" "noalloc"
95 external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" "noalloc"
96 external tree_tagged_desc : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" "noalloc"
97 external tree_tagged_foll_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below" "noalloc"
98 external tree_subtree_tags : tree -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" "noalloc"
102 external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc"
103 external unordered_set_length : unordered_set -> int = "caml_unordered_set_length"
104 external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc"
106 external tree_select_child : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_child" "noalloc"
107 external tree_select_foll_sibling : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_foll_sibling" "noalloc"
108 external tree_select_desc : tree -> [`Tree ] node -> unordered_set -> [`Tree] node = "caml_xml_tree_select_desc" "noalloc"
109 external tree_select_foll_below : tree -> [`Tree ] node -> unordered_set -> [`Tree] node -> [`Tree] node = "caml_xml_tree_select_foll_below" "noalloc"
112 module HPtset = Hashtbl.Make(Ptset.Int)
114 let vector_htbl = HPtset.create MED_H_SIZE
116 let ptset_to_vector s =
118 HPtset.find vector_htbl s
121 let v = unordered_set_alloc (Ptset.Int.cardinal s) in
122 let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in
123 HPtset.add vector_htbl s v; v
128 ttable : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t;
131 let text_size t = text_size t.doc
133 module MemUnion = Hashtbl.Make (struct
134 type t = Ptset.Int.t*Ptset.Int.t
135 let equal (x,y) (z,t) = (Ptset.Int.equal x z)&&(Ptset.Int.equal y t)
136 let equal a b = equal a b || equal b a
137 let hash (x,y) = (* commutative hash *)
138 let x = Ptset.Int.hash x
139 and y = Ptset.Int.hash y
141 if x < y then HASHINT2(x,y) else HASHINT2(y,x)
144 module MemAdd = Hashtbl.Make (
146 type t = Tag.t*Ptset.Int.t
147 let equal (x,y) (z,t) = (x == z)&&(Ptset.Int.equal y t)
148 let hash (x,y) = HASHINT2(x,Ptset.Int.hash y)
151 let collect_tags tree =
152 let h_union = MemUnion.create BIG_H_SIZE in
155 MemUnion.find h_union (s1,s2)
157 | Not_found -> let s = Ptset.Int.union s1 s2
159 MemUnion.add h_union (s1,s2) s;s
161 let h_add = MemAdd.create BIG_H_SIZE in
163 try MemAdd.find h_add (t,s)
165 | Not_found -> let r = Ptset.Int.add t s in
166 MemAdd.add h_add (t,s) r;r
168 let h = Hashtbl.create BIG_H_SIZE in
169 let update t sc sb ss sa =
170 let schild,sbelow,ssibling,safter =
175 (Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty,Ptset.Int.empty)
178 (pt_cup sc schild,pt_cup sbelow sb, pt_cup ssibling ss, pt_cup safter sa)
180 let rec loop_right id acc_sibling acc_after=
182 then (acc_sibling,acc_after)
184 let sibling2,after2 = loop_right (tree_next_sibling tree id) acc_sibling acc_after in
185 let child1,below1 = loop_left (tree_first_child tree id) after2 in
186 let tag = tree_tag_id tree id in
187 update tag child1 below1 sibling2 after2;
188 (pt_add tag sibling2, (pt_add tag (pt_cup after2 below1)))
189 and loop_left id acc_after =
191 then (Ptset.Int.empty,Ptset.Int.empty)
193 let sibling2,after2 = loop_right (tree_next_sibling tree id) Ptset.Int.empty acc_after in
194 let child1,below1 = loop_left (tree_first_child tree id) after2 in
195 let tag = tree_tag_id tree id in
196 update tag child1 below1 sibling2 after2;
197 (pt_add tag sibling2,(pt_add tag (pt_cup after2 below1)))
199 let _ = loop_left (tree_root tree) Ptset.Int.empty in h
204 let contains_array = ref [| |]
205 let contains_index = Hashtbl.create 4096
208 Hashtbl.find contains_index i
212 let init_contains t s =
213 let a = text_contains t.doc s
215 Array.fast_sort (compare) a;
217 Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
219 let count_contains t s = text_count_contains t.doc s
220 let unsorted_contains t s = text_unsorted_contains t.doc s
222 let init_naive_contains t s =
223 let i,j = tree_doc_ids t.doc (tree_root t.doc)
225 let regexp = Str.regexp_string s in
228 let _ = Str.search_forward regexp arg 0;
232 let rec loop n acc l =
235 let s = text_get_cached_text t.doc n
238 then loop (nodei ((inode n)+1)) (n::acc) (l+1)
239 else loop (nodei ((inode n)+1)) acc l
241 let acc,l = loop i [] 0 in
242 let a = Array.create l nulldoc in
243 let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc
249 module DocIdSet = struct
250 include Set.Make (struct type t = [`Text] node
251 let compare = compare_node end)
254 let is_nil t = t == nil
256 let is_node t = t != nil
257 let is_root t = t == root
260 let _ = Tag.init (Obj.magic t) in
261 let table = collect_tags t
267 let finalize _ = Printf.eprintf "Release the string list !\n%!"
273 !Options.sample_factor
274 !Options.index_empty_texts
275 !Options.disable_text_collection)
277 let parse_xml_uri str = parse parse_xml_uri str
278 let parse_xml_string str = parse parse_xml_string str
281 external pool : tree -> Tag.pool = "%identity"
283 let magic_string = "SXSI_INDEX"
284 let version_string = "1"
287 Unix.lseek fd 0 Unix.SEEK_CUR
289 let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
292 let sl = String.length s in
293 let ssl = Printf.sprintf "%020i" sl in
294 ignore (Unix.write fd ssl 0 20);
295 ignore (Unix.write fd s 0 (String.length s))
297 let rec really_read fd buffer start length =
298 if length <= 0 then () else
299 match Unix.read fd buffer start length with
300 0 -> raise End_of_file
301 | r -> really_read fd buffer (start + r) (length - r);;
304 let buffer = String.create 20 in
305 let _ = really_read fd buffer 0 20 in
306 let size = int_of_string buffer in
307 let buffer = String.create size in
308 let _ = really_read fd buffer 0 size in
313 let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
314 let out_c = Unix.out_channel_of_descr fd in
315 let _ = set_binary_mode_out out_c true in
316 output_string out_c magic_string;
317 output_char out_c '\n';
318 output_string out_c version_string;
319 output_char out_c '\n';
320 Marshal.to_channel out_c t.ttable [ ];
321 (* we need to move the fd to the correct position *)
323 ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
328 let load ?(sample=64) str =
329 let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
330 let in_c = Unix.in_channel_of_descr fd in
331 let _ = set_binary_mode_in in_c true in
332 (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
333 (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
334 let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t =
335 Marshal.from_channel in_c
337 let ntable = Hashtbl.create (Hashtbl.length table) in
338 Hashtbl.iter (fun k (s1,s2,s3,s4) ->
339 let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty
340 and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty
341 and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty
342 and ss4 = Ptset.Int.fold (Ptset.Int.add) s4 Ptset.Int.empty
343 in Hashtbl.add ntable k (ss1,ss2,ss3,ss4)
346 (* The in_channel read a chunk of fd, so we might be after
347 the start of the XMLTree save file. Reset to the correct
349 ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
350 let tree = { doc = tree_load fd;
358 let tag_pool t = pool t.doc
360 let compare = compare_node
362 let equal a b = a == b
366 | i -> Printf.sprintf "Node (%i)" i
368 let dump_node t = nts (inode t)
371 let is_left t n = tree_is_first_child t.doc n
373 let is_below_right t n1 n2 = tree_is_ancestor t.doc (tree_parent t.doc n1) n2
375 let parent t n = tree_parent t.doc n
377 let first_child t = (); fun n -> tree_first_child t.doc n
378 let first_element t = (); fun n -> tree_first_element t.doc n
380 (* these function will be called in two times: first partial application
381 on the tag, then application of the tag and the tree, then application of
382 the other arguments. We use the trick to let the compiler optimize application
385 let tagged_child t tag = () ; fun n -> tree_tagged_child t.doc n tag
387 let select_child t = fun ts ->
388 let v = ptset_to_vector ts in ();
389 fun n -> tree_select_child t.doc n v
391 let next_sibling t = (); fun n -> tree_next_sibling t.doc n
392 let next_element t = (); fun n -> tree_next_element t.doc n
394 let tagged_sibling t tag = (); fun n -> tree_tagged_sibling t.doc n tag
396 let select_sibling t = fun ts ->
397 let v = (ptset_to_vector ts) in ();
398 fun n -> tree_select_foll_sibling t.doc n v
400 let next_sibling_ctx t = (); fun n _ -> tree_next_sibling t.doc n
401 let next_element_ctx t = (); fun n _ -> tree_next_element t.doc n
402 let tagged_sibling_ctx t tag = (); fun n _ -> tree_tagged_sibling t.doc n tag
404 let select_sibling_ctx t = fun ts ->
405 let v = (ptset_to_vector ts) in ();
406 fun n _ -> tree_select_foll_sibling t.doc n v
408 let id t n = tree_node_xml_id t.doc n
410 let tag t n = if n == nil then Tag.nullt else tree_tag_id t.doc n
412 let tagged_desc t tag = (); fun n -> tree_tagged_desc t.doc n tag
414 let select_desc t = fun ts ->
415 let v = (ptset_to_vector ts) in ();
416 fun n -> tree_select_desc t.doc n v
418 let tagged_foll_ctx t tag = (); fun n ctx -> tree_tagged_foll_below t.doc n tag ctx
420 let select_foll_ctx t = fun ts ->
421 let v = (ptset_to_vector ts) in ();
422 fun n ctx -> tree_select_foll_below t.doc n v ctx
425 let array_find a i j =
426 let l = Array.length a in
427 let rec loop idx x y =
428 if x > y || idx >= l then nil
430 if a.(idx) >= x then if a.(idx) > y then nil else (last_idx := idx;a.(idx))
431 else loop (idx+1) x y
433 if a.(0) > j || a.(l-1) < i then nil
434 else loop !last_idx i j
438 let count t s = text_count t.doc s
440 let print_xml_fast outc tree t =
441 let rec loop ?(print_right=true) t =
444 let tagid = tree_tag_id tree.doc t in
448 let tid = tree_my_text tree.doc t in
449 let _ = Printf.eprintf "my_text %i returned %i\n%!" (inode t) (inode tid)
451 output_string outc (text_get_cached_text tree.doc tid);
453 then loop (next_sibling tree t);
456 let tagstr = Tag.to_string tagid in
457 let l = first_child tree t
458 and r = next_sibling tree t
460 output_char outc '<';
461 output_string outc tagstr;
462 if l == nil then output_string outc "/>"
464 if (tag tree l) == Tag.attribute then
466 loop_attributes (first_child tree l);
467 if (next_sibling tree l) == nil then output_string outc "/>"
470 output_char outc '>';
471 loop (next_sibling tree l);
472 output_string outc "</";
473 output_string outc tagstr;
474 output_char outc '>';
479 output_char outc '>';
481 output_string outc "</";
482 output_string outc tagstr;
483 output_char outc '>';
485 if print_right then loop r
486 and loop_attributes a =
489 let s = (Tag.to_string (tag tree a)) in
490 let attname = String.sub s 3 ((String.length s) -3) in
491 let fsa = first_child tree a in
492 let tid = tree_my_text tree.doc fsa in
493 let _ = Printf.eprintf "my_text %i returned %i\n%!" (inode fsa) (inode tid)
495 output_char outc ' ';
496 output_string outc attname;
497 output_string outc "=\"";
498 output_string outc (text_get_cached_text tree.doc tid);
499 output_char outc '"';
500 loop_attributes (next_sibling tree a)
502 loop ~print_right:false t
505 let print_xml_fast outc tree t =
506 if (tag tree t) = Tag.document_node then
507 print_xml_fast outc tree (first_child tree t)
508 else print_xml_fast outc tree t
510 let tags_children t tag =
511 let a,_,_,_ = Hashtbl.find t.ttable tag in a
512 let tags_below t tag =
513 let _,a,_,_ = Hashtbl.find t.ttable tag in a
514 let tags_siblings t tag =
515 let _,_,a,_ = Hashtbl.find t.ttable tag in a
516 let tags_after t tag =
517 let _,_,_,a = Hashtbl.find t.ttable tag in a
520 let tags t tag = Hashtbl.find t.ttable tag
523 let binary_parent t n =
524 if tree_is_first_child t.doc n
525 then tree_parent t.doc n
526 else tree_prev_sibling t.doc n
528 let doc_ids t n = tree_doc_ids t.doc n
530 let subtree_tags t tag = ();
531 fun n -> if n == nil then 0 else
532 tree_subtree_tags t.doc n tag
535 let tid = tree_my_text t.doc n in
536 if tid == nulldoc then "" else
537 text_get_cached_text t.doc tid
540 let dump_tree fmt tree =
543 let tag = (tree_tag_id tree.doc t ) in
544 let tagstr = Tag.to_string tag in
545 let tab = String.make n ' ' in
547 if tag == Tag.pcdata || tag == Tag.attribute_data
549 Format.fprintf fmt "%s<%s>%s</%s>\n"
550 tab tagstr (text_get_cached_text tree.doc (tree_my_text tree.doc t)) tagstr
552 Format.fprintf fmt "%s<%s>\n" tab tagstr;
553 loop (tree_first_child tree.doc t) (n+2);
554 Format.fprintf fmt "%s</%s>\n%!" tab tagstr;
556 loop (tree_next_sibling tree.doc t) n