X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=src%2Ftree.ml;h=fae8ebaea4dac30eea66c0b4b73db01f99acd189;hb=4309f9456521bffcab5ff79abe1ed826744c3a57;hp=b80178061b8ab778339c1dcff13e671d8b72065b;hpb=813b239795aac1844eb233dab7f8f98d8dba845e;p=SXSI%2Fxpathcomp.git diff --git a/src/tree.ml b/src/tree.ml index b801780..fae8eba 100644 --- a/src/tree.ml +++ b/src/tree.ml @@ -7,7 +7,6 @@ INCLUDE "debug.ml" INCLUDE "utils.ml" - external init_lib : unit -> unit = "sxsi_cpp_init" exception CPlusPlusError of string @@ -20,23 +19,71 @@ type node = [ `Tree ] Node.t type tree +external register_tag : tree -> string -> Tag.t = "caml_xml_tree_register_tag" + +external tag_name : tree -> Tag.t -> string = "caml_xml_tree_get_tag_name" + +let tag t = (); fun s -> + match s with + | "<$>" -> Tag.pcdata + | "<@>" -> Tag.attribute + | "" -> Tag.document_node + | "<@$>" -> Tag.attribute_data + | _ -> register_tag t s + +let to_string d = (); + fun t -> + if t == Tag.pcdata then "<$>" + else if t == Tag.attribute_data then "<@$>" + else if t == Tag.attribute then "<@>" + else if t == Tag.nullt then "" + else tag_name d t + +let translate x = x + +let mk_tag_ops t = { + Tag.tag = tag t; + Tag.to_string = to_string t; + Tag.translate = translate +} module TreeBuilder = struct type t external create : unit -> t = "caml_xml_tree_builder_create" - external open_document : t -> bool -> int -> bool -> int -> unit = "caml_xml_tree_builder_open_document" + external open_document : t -> int -> bool -> int -> unit = "caml_xml_tree_builder_open_document" external close_document : t -> tree = "caml_xml_tree_builder_close_document" - external open_tag : t -> string -> unit = "caml_xml_tree_builder_new_open_tag" - external close_tag : t -> string -> unit = "caml_xml_tree_builder_new_closing_tag" - external text : t -> string -> unit = "caml_xml_tree_builder_new_text" + external open_tag : t -> string -> unit = "caml_xml_tree_builder_open_tag" + external close_tag : t -> string -> unit = "caml_xml_tree_builder_close_tag" + external text : t -> string -> unit = "caml_xml_tree_builder_text" + + let is_whitespace s = + let rec loop len i = + if i < len then + let c = s.[i] in + (c == '\n' || c == '\t' || c == ' ') && loop len (i+1) + else + true + in + loop (String.length s) 0 + + + let display_count = + let event_counter = ref 0 in + (fun parser_ -> + incr event_counter; + if !event_counter land 0xffffff == 0 then + Logger.print Format.err_formatter "Current position: %i@\n@?" (Expat.get_current_byte_index parser_)) let do_text b t = if Buffer.length t > 0 then begin - open_tag b "<$>"; - text b (Buffer.contents t); - close_tag b "<$>"; + let s = Buffer.contents t in + begin + open_tag b "<$>"; + text b s; + close_tag b "<$>"; + end; Buffer.clear t end @@ -44,11 +91,11 @@ struct let atname = "<@>" ^ name in open_tag b atname; open_tag b "<@$>"; - text b value; - close_tag b "<@$>"; - close_tag b atname + text b value; + close_tag b "<@$>"; + close_tag b atname - let start_element_handler b t tag attr_list = + let start_element_handler parser_ b t tag attr_list = do_text b t; open_tag b tag; match attr_list with @@ -56,14 +103,14 @@ struct | l -> open_tag b "<@>"; List.iter (fun (name, value) -> output_attr b name value) l; - close_tag b "<@>" + close_tag b "<@>" - let end_element_handler b t tag = + let end_element_handler parser_ b t tag = do_text b t; close_tag b tag - let character_data_handler _b t text = + let character_data_handler parser_ _ t text = Buffer.add_string t text let create_parser () = @@ -73,13 +120,17 @@ struct let finalize () = do_text build buf; close_tag build ""; - close_document build + Logger.print Format.err_formatter "Finished parsing@\n"; + Logger.print Format.err_formatter "Starting index construction@\n"; + let r = close_document build in + Logger.print Format.err_formatter "Finished index construction@\n"; + r in - Expat.set_start_element_handler parser_ (start_element_handler build buf); - Expat.set_end_element_handler parser_ (end_element_handler build buf); - Expat.set_character_data_handler parser_ (character_data_handler build buf); - open_document build !Options.index_empty_texts !Options.sample_factor - !Options.disable_text_collection !Options.text_index_type; + Expat.set_start_element_handler parser_ (start_element_handler parser_ build buf); + Expat.set_end_element_handler parser_ (end_element_handler parser_ build buf); + Expat.set_character_data_handler parser_ (character_data_handler parser_ build buf); + Logger.print Format.err_formatter "Started parsing@\n"; + open_document build !Options.sample_factor !Options.disable_text_collection !Options.text_index_type; open_tag build ""; parser_, finalize @@ -106,8 +157,6 @@ struct in finalizer () - - end @@ -116,10 +165,22 @@ end type bit_vector = string external bool_of_int : int -> bool = "%identity" +external int_of_bool : bool -> int = "%identity" let bit_vector_unsafe_get v i = bool_of_int (((Char.code (String.unsafe_get v (i lsr 3))) lsr (i land 7)) land 1) +let chr (c:int) : char = Obj.magic (c land 0xff) +let bit_vector_unsafe_set v i b = + let j = i lsr 3 in + let c = Char.code v.[j] in + let bit = int_of_bool b in + let mask = bit lsl (i land 7) in + if b then v.[j] <- chr (c lor mask) else v.[j] <- (chr (c land (lnot mask))) + +let bit_vector_create n = + let len = if n <= 0 then 0 else (n - 1) / 8 + 1 in + String.make len '\000' type t = { doc : tree; @@ -133,7 +194,7 @@ type t = { } - +let tag_operations t = mk_tag_ops t.doc (* external parse_xml_uri : string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_uri" external parse_xml_string : string -> int -> bool -> bool -> int -> tree = "caml_call_shredder_string" @@ -151,26 +212,25 @@ external nullt : unit -> 'a Node.t = "caml_xml_tree_nullt" let nil : [`Tree ] Node.t = Node.nil let root : [`Tree ] Node.t = Node.null -type unordered_set +type tag_list -external unordered_set_alloc : int -> unordered_set = "caml_unordered_set_alloc" -external unordered_set_length : unordered_set -> int = "caml_unordered_set_length" -external unordered_set_insert : unordered_set -> int -> unit = "caml_unordered_set_set" "noalloc" +external tag_list_alloc : int -> tag_list = "caml_tag_list_alloc" +external tag_list_set : tag_list -> int -> Tag.t -> unit = "caml_tag_list_set" "noalloc" module HPtset = Hashtbl.Make(Ptset.Int) let vector_htbl = HPtset.create MED_H_SIZE -let unordered_set_of_set s = +let tag_list_of_set s = try HPtset.find vector_htbl s with - Not_found -> - let v = unordered_set_alloc (Ptset.Int.cardinal s) in - let _ = Ptset.Int.iter (fun e -> unordered_set_insert v e) s in - HPtset.add vector_htbl s v; v - -let ptset_to_vector = unordered_set_of_set + Not_found -> + let v = tag_list_alloc (Ptset.Int.cardinal s + 1) in + let i = ref 0 in + let () = Ptset.Int.iter (fun e -> tag_list_set v !i e; incr i) s in + let () = tag_list_set v !i Tag.nullt in + HPtset.add vector_htbl s v; v (** tree interface *) @@ -186,25 +246,26 @@ let first_element t n = tree_first_element t.doc n external tree_tagged_child : tree -> [`Tree] Node.t -> Tag.t -> [`Tree] Node.t = "caml_xml_tree_tagged_child" "noalloc" let tagged_child t n tag = tree_tagged_child t.doc n tag -external tree_select_child : tree -> [`Tree ] Node.t -> unordered_set -> [`Tree] Node.t = "caml_xml_tree_select_child" "noalloc" +external tree_select_child : tree -> [`Tree ] Node.t -> tag_list -> [`Tree] Node.t = "caml_xml_tree_select_child" "noalloc" let select_child t n tag_set = tree_select_child t.doc n tag_set external tree_last_child : tree -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_last_child" "noalloc" let last_child t n = tree_last_child t.doc n - external tree_next_sibling : tree -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_next_sibling" "noalloc" let next_sibling t n = tree_next_sibling t.doc n external tree_next_element : tree -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_next_element" "noalloc" let next_element t n = tree_next_element t.doc n -external tree_tagged_following_sibling : tree -> [`Tree] Node.t -> Tag.t -> [`Tree] Node.t = "caml_xml_tree_tagged_following_sibling" "noalloc" -let tagged_following_sibling t n tag = tree_tagged_following_sibling t.doc n tag -external tree_select_following_sibling : tree -> [`Tree ] Node.t -> unordered_set -> [`Tree] Node.t = "caml_xml_tree_select_following_sibling" "noalloc" -let select_following_sibling t n tag_set = tree_select_following_sibling t.doc n tag_set +external tree_tagged_sibling : tree -> [`Tree] Node.t -> Tag.t -> [`Tree] Node.t = "caml_xml_tree_tagged_sibling" "noalloc" +let tagged_sibling t n tag = tree_tagged_sibling t.doc n tag + + +external tree_select_sibling : tree -> [`Tree ] Node.t -> tag_list -> [`Tree] Node.t = "caml_xml_tree_select_sibling" "noalloc" +let select_sibling t n tag_set = tree_select_sibling t.doc n tag_set external tree_prev_sibling : tree -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_prev_sibling" "noalloc" let prev_sibling t n = tree_prev_sibling t.doc n @@ -217,23 +278,18 @@ let tagged_descendant t n tag = tree_tagged_descendant t.doc n tag external tree_tagged_next : tree -> [`Tree ] Node.t -> Tag.t -> [`Tree ] Node.t = "caml_xml_tree_tagged_next" "noalloc" let tagged_next t n tag = tree_tagged_next t.doc n tag -external tree_select_descendant : tree -> [`Tree ] Node.t -> unordered_set -> [`Tree] Node.t = "caml_xml_tree_select_descendant" "noalloc" +external tree_select_descendant : tree -> [`Tree ] Node.t -> tag_list -> [`Tree] Node.t = "caml_xml_tree_select_descendant" "noalloc" let select_descendant t n tag_set = tree_select_descendant t.doc n tag_set external tree_tagged_following_before : tree -> [`Tree ] Node.t -> Tag.t -> [`Tree ] Node.t -> [`Tree ] Node.t = "caml_xml_tree_tagged_following_before" "noalloc" let tagged_following_before t n tag ctx = tree_tagged_following_before t.doc n tag ctx -external tree_select_following_before : tree -> [`Tree ] Node.t -> unordered_set -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_select_following_before" "noalloc" +external tree_select_following_before : tree -> [`Tree ] Node.t -> tag_list -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_select_following_before" "noalloc" let select_following_before t n tag_set ctx = tree_select_following_before t.doc n tag_set ctx external tree_parent : tree -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_parent" "noalloc" let parent t n = tree_parent t.doc n -external tree_binary_parent : tree -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_binary_parent" - "noalloc" -let binary_parent t n = tree_binary_parent t.doc n - - external tree_tag : tree -> [`Tree] Node.t -> Tag.t = "caml_xml_tree_tag" "noalloc" let tag t n = tree_tag t.doc n @@ -257,31 +313,22 @@ let tags t tag = open Format let dump_tag_table t = - eprintf "Child tags:\n%!"; - Array.iteri - (fun tag set -> eprintf "%s: %a\n%!" - (Tag.to_string tag) TagSet.print (TagSet.inj_positive set)) - t.children; - eprintf "-----------------------------\n%!"; - eprintf "Descendant tags:\n%!"; - Array.iteri - (fun tag set -> eprintf "%s: %a\n%!" - (Tag.to_string tag) TagSet.print (TagSet.inj_positive set)) - t.descendants; - eprintf "-----------------------------\n%!"; - eprintf "Sibling tags:\n%!"; - Array.iteri - (fun tag set -> eprintf "%s: %a\n%!" - (Tag.to_string tag) TagSet.print (TagSet.inj_positive set)) - t.siblings; - eprintf "-----------------------------\n%!"; - eprintf "Following tags:\n%!"; - Array.iteri - (fun tag set -> eprintf "%s: %a\n%!" - (Tag.to_string tag) TagSet.print (TagSet.inj_positive set)) - t.followings; - eprintf "-----------------------------\n%!" - + let tag = ref 0 in + let printer ppf set = + Logger.print ppf "%s: %a" + (Tag.to_string !tag) TagSet.print (TagSet.inj_positive set); + incr tag + in + let set_printer msg set = + tag := 0; + Logger.print err_formatter "%s :@\n" msg; + Pretty.pp_print_array ~sep:pp_force_newline printer err_formatter set; + Logger.print err_formatter "-----------------------------@\n"; + in + set_printer "Child tags" t.children; + set_printer "Descendant tags" t.descendants; + set_printer "Sibling tags" t.siblings; + set_printer "Following tags" t.followings external tree_subtree_tags : tree -> [`Tree] Node.t -> Tag.t -> int = "caml_xml_tree_subtree_tags" "noalloc" let subtree_tags t n tag = tree_subtree_tags t.doc n tag @@ -321,7 +368,7 @@ let stats t = in let a,b = loop true root 0 0 0 in - Printf.eprintf "Average depth: %f, number of leaves %i\n%!" ((float_of_int a)/. (float_of_int b)) b + Logger.print err_formatter "Average depth: %f, number of leaves %i@\n@?" ((float_of_int a)/. (float_of_int b)) b ;; module TagS = @@ -490,7 +537,9 @@ let is_node t = t != nil let is_root t = t == root let node_of_t t = - let _ = Tag.init (Obj.magic t) in + Logger.print err_formatter "Initializing tag structure@\n"; + let _ = Tag.init (mk_tag_ops t) in + Logger.print err_formatter "Starting tag table construction@\n"; let f, n, c, d = time collect_labels t ~msg:"Building tag relationship table" in let c = Array.map TagS.to_ptset c in let n = Array.map TagS.to_ptset n in @@ -519,15 +568,13 @@ let parse_xml_string str = node_of_t (TreeBuilder.parse_string str) let size t = tree_size t.doc;; -external pool : tree -> Tag.pool = "%identity" - let magic_string = "SXSI_INDEX" let version_string = "3" let pos fd = Unix.lseek fd 0 Unix.SEEK_CUR -let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd) +let pr_pos fd = Logger.print err_formatter "At position %i@\n" (pos fd) let write fd s = let sl = String.length s in @@ -556,6 +603,7 @@ let save_tag_table channel t = let save t str = let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in let out_c = Unix.out_channel_of_descr fd in + let index_prefix = Filename.chop_suffix str ".srx" in let _ = set_binary_mode_out out_c true in output_string out_c magic_string; output_char out_c '\n'; @@ -568,7 +616,7 @@ let save t str = (* we need to move the fd to the correct position *) flush out_c; ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET); - tree_save t.doc fd str; + tree_save t.doc fd index_prefix; close_out out_c ;; let load_tag_table channel = @@ -578,6 +626,7 @@ let load_tag_table channel = let load ?(sample=64) ?(load_text=true) str = let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in let in_c = Unix.in_channel_of_descr fd in + let index_prefix = Filename.chop_suffix str ".srx" in let _ = set_binary_mode_in in_c true in let load_table () = (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file"); @@ -590,7 +639,7 @@ let load ?(sample=64) ?(load_text=true) str = in let c, s, d, f = time ~msg:"Loading tag table"(load_table) () in ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET); - let xml_tree = tree_load fd str load_text sample in + let xml_tree = tree_load fd index_prefix load_text sample in let () = Tag.init (Obj.magic xml_tree) in let attributes = Ptset.Int.add Tag.attribute d.(Tag.attribute) in let elements = Ptset.Int.add Tag.document_node @@ -612,8 +661,6 @@ let load ?(sample=64) ?(load_text=true) str = -let tag_pool t = pool t.doc - let equal a b = a == b let nts = function @@ -631,17 +678,17 @@ type query_result = { bv : bit_vector; external tree_flush : tree -> Unix.file_descr -> unit = "caml_xml_tree_flush" let flush t fd = tree_flush t.doc fd -external text_prefix : tree -> string -> query_result = "caml_text_collection_prefix_bv" -let text_prefix t s = text_prefix t.doc s +external text_prefix : tree -> string -> bool -> query_result = "caml_text_collection_prefix_bv" +let text_prefix t s b = text_prefix t.doc s b -external text_suffix : tree -> string -> query_result = "caml_text_collection_suffix_bv" -let text_suffix t s = text_suffix t.doc s +external text_suffix : tree -> string -> bool -> query_result = "caml_text_collection_suffix_bv" +let text_suffix t s b = text_suffix t.doc s b -external text_equals : tree -> string -> query_result = "caml_text_collection_equals_bv" -let text_equals t s = text_equals t.doc s +external text_equals : tree -> string -> bool -> query_result = "caml_text_collection_equals_bv" +let text_equals t s b = text_equals t.doc s b -external text_contains : tree -> string -> query_result = "caml_text_collection_contains_bv" -let text_contains t s = text_contains t.doc s +external text_contains : tree -> string -> bool -> query_result = "caml_text_collection_contains_bv" +let text_contains t s b = text_contains t.doc s b module Predicate = Hcons.Make ( @@ -670,7 +717,7 @@ let query_fun = function let _pred_cache = Hashtbl.create 17 ;; let mk_pred query s = - let f = query_fun query in + let f = query_fun query in let memo = ref (fun _ _ -> failwith "Undefined") in memo := begin fun tree node -> let results = @@ -678,32 +725,26 @@ let mk_pred query s = Not_found -> time ~count:1 ~msg:(Printf.sprintf "Computing text query %s(%s)" (string_of_query query) s) - (f tree) s + (f tree) s true in let bv = results.bv in memo := begin fun _ n -> - let b = - bit_vector_unsafe_get bv (Node.to_int n) - in - D_TRACE_(Printf.eprintf "Result of memoized call to query %s is %b for node %i\n" s b (Node.to_int n)); - b + bit_vector_unsafe_get bv (Node.to_int n) end; - let b = bit_vector_unsafe_get bv (Node.to_int node) in - D_TRACE_(Printf.eprintf "Result is %b for node %i\n" b (Node.to_int node)); - b + bit_vector_unsafe_get bv (Node.to_int node) end; Predicate.make memo -let full_text_prefix t s = (text_prefix t s).pos +let full_text_prefix t s = (text_prefix t s true).pos -let full_text_suffix t s = (text_suffix t s).pos +let full_text_suffix t s = (text_suffix t s true).pos -let full_text_equals t s = (text_equals t s).pos +let full_text_equals t s = (text_equals t s true).pos -let full_text_contains t s = (text_contains t s).pos +let full_text_contains t s = (text_contains t s true).pos let full_text_query q t s = - let res = (query_fun q) t s in + let res = (query_fun q) t s true in Hashtbl.replace _pred_cache (q,s) res; res.pos