From 071e25c710e9a48116f66ddf51bfdca25e20502b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Kim=20Nguy=E1=BB=85n?= Date: Wed, 22 Feb 2012 16:01:05 +0100 Subject: [PATCH] Finish porting to the Grammar2 API --- src/OCamlDriver.cpp | 2 - src/grammar2.ml | 51 ++++++++------- src/grammar2.mli | 37 +++++++++++ src/main.ml | 150 +++++++++++++++++++++++--------------------- src/runtime.ml | 75 +++++++++++----------- src/runtime.mli | 2 +- 6 files changed, 177 insertions(+), 140 deletions(-) diff --git a/src/OCamlDriver.cpp b/src/OCamlDriver.cpp index 146a0df..ab145ec 100644 --- a/src/OCamlDriver.cpp +++ b/src/OCamlDriver.cpp @@ -1070,7 +1070,6 @@ extern "C" value caml_bitmap_resize(value bitmap, value nsize) CAMLparam2(bitmap, nsize); size_t bits = Long_val(nsize); size_t bytes = (bits / (8 * sizeof(unsigned int)) + 1 ) * sizeof(unsigned int); - fprintf(stderr, "Growing to: %lu bytes\n", (bits / (8 * sizeof(unsigned int)) + 1 ) * sizeof(unsigned int)); unsigned int * buffer = (unsigned int*) realloc((void *) bitmap, bytes); if (buffer == NULL) CAMLRAISEMSG("BP: cannot reallocate memory"); @@ -1148,7 +1147,6 @@ extern "C" value caml_bp_save(value b, value file) int f1 = Int_val(file); int f2 = dup(f1); FILE * fd = fdopen(f2, "a"); - fprintf(stderr, "Writing %i %p bytes\n", ((B->n+D-1)/D)*8, B ); fflush(stderr); if (fd == NULL) CAMLRAISEMSG("Error saving bp file"); diff --git a/src/grammar2.ml b/src/grammar2.ml index 0c509bf..d2d497e 100644 --- a/src/grammar2.ml +++ b/src/grammar2.ml @@ -200,16 +200,16 @@ struct loop 1 l in Hashtbl.iter (fun lhs rhs -> - let Node( head, _ ) = lhs in + let Node( head, args ) = lhs in let Node( tag1, params) = rhs in let tag2, pos2 = pos_id2 params in let id1 = h_find ~msg:"7" tag_to_id tag1 - and id2 = h_find ~msg:"8" tag_to_id tag2 - in + and id2 = h_find ~msg:"8" tag_to_id tag2 in let rule_ = id2 lsl 27 in let rule_ = (rule_ lor id1) lsl 2 in let rule_ = (rule_ lor pos2) lsl 2 in - let rule_ = rule_ lor (List.length params) in + let rule_ = (rule_ lor (List.length params)) lsl 2 in + let rule_ = rule_ lor (List.length args) in r_array.((h_find ~msg:"9" tag_to_id head) - rules_offset ) <- rule_ ) rules; (*let l = Array.length renum_tags in *) @@ -258,23 +258,14 @@ let save g f = let load f = let cin = open_in f in - let pr_pos () = - Printf.eprintf "Position: %i kiB\n" (pos_in cin / 1024) - in let read () = Marshal.from_channel cin in if read () != _GRAMMAR_MAGIC then failwith "Invalid grammar file"; if read () != _GRAMMAR_VERSION then failwith "Deprecated grammar format"; - pr_pos(); let tags : int array = read () in - pr_pos(); let rules : int array = read () in - pr_pos(); let rules_offset : int = read () in - pr_pos(); let tag_to_id : (string, int) Hashtbl.t = read () in - pr_pos(); let tag_of_id : string array = read () in - pr_pos(); let fd = Unix.descr_of_in_channel cin in let pos = pos_in cin in ignore(Unix.lseek fd pos Unix.SEEK_SET); @@ -303,6 +294,11 @@ type t_symbol = t_type Node.t type tn_symbol = [ any_type ] Node.t +type partial = + Leaf of node + | Node of tn_symbol * partial array + + let is_nil (t : t_symbol) = (Node.to_int t) == 4 @@ -312,7 +308,7 @@ let nil_symbol : t_symbol = let translate_tag _ t = if t == 4 then ~-1 else t let to_string t tag = if tag < Array.length t.tag_of_id then t.tag_of_id.(Tag.to_int tag) - else "" + else "" let register_tag t s = try Hashtbl.find t.tag_to_id s with @@ -325,14 +321,16 @@ let tag_operations t = { } let start_root : node = Node.of_int 0 -let start_tag t (idx : node) = - t.tags.(Bp.preorder_rank t.start (Node.to_int idx)) +let start_tag g (idx : node) : [ t_symbol = "%identity" -external non_terminal : [< any_type ] Node.t -> t_symbol = "%identity" +external non_terminal : [< any_type ] Node.t -> n_symbol = "%identity" let tag (n : t_symbol) : Tag.t = Obj.magic n @@ -351,14 +349,19 @@ let get_rule g (r : n_symbol) : rhs = let get_id1 (r : rhs) : tn_symbol = Node.of_int( - ((Node.to_int r) lsr 4) land 0x7ffffff) + ((Node.to_int r) lsr 6) land 0x7ffffff) let get_id2 (r : rhs) : tn_symbol = - Node.of_int((Node.to_int r) lsr 31) + Node.of_int((Node.to_int r) lsr 33) + +let get_rank (r : rhs) : int = + (Node.to_int r) land 0b11 -let get_param_pos (r : rhs) : int = +let get_id1_rank (r : rhs) : int = ((Node.to_int r) lsr 2) land 0b11 -let num_params (r : rhs) : int = - (Node.to_int r) land 0b11 +let get_id2_pos (r : rhs) : int = + ((Node.to_int r) lsr 4) land 0b11 +let get_id2_rank (r : rhs) : int = + get_rank r + 1 - get_id1_rank r diff --git a/src/grammar2.mli b/src/grammar2.mli index 8d84e4e..a775780 100644 --- a/src/grammar2.mli +++ b/src/grammar2.mli @@ -1,4 +1,41 @@ type t + +type node = [ `Start ] Node.t + +type n_type = [ `NonTerminal ] +type t_type = [ `Terminal ] +type r_type = [ `Rule ] +type any_type = [ n_type | t_type ] +type rhs = [ r_type ] Node.t + +type n_symbol = n_type Node.t +type t_symbol = t_type Node.t +type tn_symbol = [ any_type ] Node.t + + +type partial = + Leaf of node + | Node of tn_symbol * partial array + + val parse : string -> t val save : t -> string -> unit val load : string -> t +val tag_operations : t -> Tag.operations + +val start_tag : t -> node -> tn_symbol +val is_terminal : t -> [< any_type ] Node.t -> bool +val is_non_terminal : t -> [< any_type ] Node.t -> bool +external terminal : [< any_type ] Node.t -> t_symbol = "%identity" +external non_terminal : [< any_type ] Node.t -> n_symbol = "%identity" +val nil_symbol : t_symbol +val tag : t_symbol -> Tag.t +val start_first_child : t -> node -> node +val start_next_sibling : t -> node -> node +val get_rule : t -> n_symbol -> rhs +val get_rank : rhs -> int +val get_id1_rank : rhs -> int +val get_id2_rank : rhs -> int +val get_id2_pos : rhs -> int +val get_id1 : rhs -> tn_symbol +val get_id2 : rhs -> tn_symbol diff --git a/src/main.ml b/src/main.ml index bfbdff9..fa16737 100644 --- a/src/main.ml +++ b/src/main.ml @@ -14,7 +14,7 @@ let () = init_timer();; let default_gc = Gc.get() let tuned_gc = { default_gc with Gc.minor_heap_size = 32*1024*1024; - Gc.major_heap_increment = 8*1024*1024; + Gc.major_heap_increment = 8*1024*1024; Gc.max_overhead = 1000000; Gc.space_overhead = 100; } @@ -89,88 +89,92 @@ let main v query_string output = let () = Options.parse_cmdline() ;; +let _ = + try + Printexc.record_backtrace true; -let document = - if Filename.check_suffix !Options.input_file ".g" then - let g = Grammar2.parse !Options.input_file in - let () = Grammar2.save g "/tmp/test.g" in - let g = Grammar2.load "/tmp/test.g" in - ignore g; - exit 3 - - else if Filename.check_suffix !Options.input_file ".g.bin" then - let g = time ~msg:"Loading grammar" (Grammar.load !Options.input_file) true in - begin + let document = + if Filename.check_suffix !Options.input_file ".g.bin" || + Filename.check_suffix !Options.input_file ".g" + then + let is_index = Filename.check_suffix !Options.input_file ".g.bin" in + let g = + if is_index then + time ~msg:"Loading grammar" (Grammar2.load) !Options.input_file + else + let g = time ~msg:"Parsing grammar file" Grammar2.parse !Options.input_file in + if !Options.save_file <> "" then + time ~msg:"Saving index" (Grammar2.save g) !Options.save_file; + g + in + begin (* Todo Factorise with main *) - Tag.init (Grammar.tag_operations g); - let query = - time ~msg:"Parsing query" XPath.parse !Options.query - in - if !Options.verbose then begin - Printf.eprintf "Parsed query:\n%!"; - XPath.Ast.print Format.err_formatter query; - Format.fprintf Format.err_formatter "\n%!" - end; - let auto, bu_info = - time ~msg:"Compiling query" (Compile.compile) query - in - if !Options.verbose then Ata.print Format.err_formatter auto; - Gc.full_major(); - Gc.compact(); - Gc.set (tuned_gc); - let runtime = - let module R = ResJIT.Count in - let module M = Runtime.Make(R) in + Tag.init (Grammar2.tag_operations g); + let query = + time ~msg:"Parsing query" XPath.parse !Options.query + in + if !Options.verbose then begin + Printf.eprintf "Parsed query:\n%!"; + XPath.Ast.print Format.err_formatter query; + Format.fprintf Format.err_formatter "\n%!" + end; + let auto, bu_info = + time ~msg:"Compiling query" (Compile.compile) query + in + if !Options.verbose then Ata.print Format.err_formatter auto; + Gc.full_major(); + Gc.compact(); + Gc.set (tuned_gc); + let runtime = + let module R = ResJIT.Count in + let module M = Runtime.Make(R) in (* mk_runtime run auto doc arg count print outfile *) - mk_runtime M.grammar_run auto (Obj.magic g) () R.NS.length (Obj.magic R.NS.serialize) None - in - runtime (); - exit 0 - end - else if Filename.check_suffix !Options.input_file ".srx" - then - time - ~msg:"Loading file" - (Tree.load - ~sample:!Options.sample_factor - ~load_text:true) - !Options.input_file - else - let v = - time - ~msg:"Parsing document" - (Tree.parse_xml_uri) - !Options.input_file - in - let () = - if !Options.save_file <> "" + mk_runtime M.grammar_run auto (Obj.magic g) () R.NS.length (Obj.magic R.NS.serialize) None + in + runtime (); + exit 0 + end + else if Filename.check_suffix !Options.input_file ".srx" then time - ~msg:"Writing file to disk" - (Tree.save v) - !Options.save_file; + ~msg:"Loading file" + (Tree.load + ~sample:!Options.sample_factor + ~load_text:true) + !Options.input_file + else + let v = + time + ~msg:"Parsing document" + (Tree.parse_xml_uri) + !Options.input_file + in + let () = + if !Options.save_file <> "" + then + time + ~msg:"Writing file to disk" + (Tree.save v) + !Options.save_file; + in + v in - v -in - try - (*Printexc.record_backtrace true; *) main document !Options.query !Options.output_file; if !Options.verbose then Printf.eprintf "Maximum resident set size: %s\n" (read_procmem()); Gc.full_major(); Profile.summary Format.err_formatter with - | Ulexer.Loc.Exc_located ((x,y),e) -> - Printf.eprintf "character %i-%i %s\n" x y (Printexc.to_string e); - exit 1 - - | e -> - output_string stderr "\n"; - flush stderr; - Printexc.print_backtrace stderr; - Printf.eprintf "FATAL ERROR: %s\n%!" (Printexc.to_string e); - output_string stderr "\n"; - flush stderr; -(* Ptset.Int.stats(); *) - exit 2 + | Ulexer.Loc.Exc_located ((x,y),e) -> + Printf.eprintf "character %i-%i %s\n" x y (Printexc.to_string e); + exit 1 + + | e -> + output_string stderr "\n"; + flush stderr; + Printexc.print_backtrace stderr; + Printf.eprintf "FATAL ERROR: %s\n%!" (Printexc.to_string e); + output_string stderr "\n"; + flush stderr; + exit 2 diff --git a/src/runtime.ml b/src/runtime.ml index 9836952..782293f 100644 --- a/src/runtime.ml +++ b/src/runtime.ml @@ -8,7 +8,7 @@ module type S = sig type result_set val top_down_run : Ata.t -> Tree.t -> Tree.node -> result_set val bottom_up_run : Ata.t -> Tree.t -> Compile.text_query * string -> result_set - val grammar_run : Ata.t -> Grammar.t -> unit -> result_set + val grammar_run : Ata.t -> Grammar2.t -> unit -> result_set end @@ -425,6 +425,7 @@ DEFINE LOOP_TAG (t, states, tag, ctx) = ( (* Grammar run *) + external is_young : 'a array -> bool = "caml_custom_is_young" "noalloc" external blit : 'a array -> int -> 'a array -> int -> int -> unit = "caml_custom_array_blit" module M = Map.Make(struct type t = Grammar.n_symbol let compare = compare end) @@ -446,9 +447,7 @@ DEFINE LOOP_TAG (t, states, tag, ctx) = ( let grammar_run auto g () = - let start_symbol = Node.of_int 0 in - let dummy_leaf = Grammar.Leaf (Node.nil) in - let nil_symbol = Grammar.nil_symbol g in + let dummy_leaf = Grammar2.Leaf (Node.nil) in let res_len = (StateSet.max_elt auto.states) + 1 in let empty_slot = Array.create res_len U.NS.empty in let nil_res = auto.bottom_states, empty_slot in @@ -491,65 +490,63 @@ DEFINE LOOP_TAG (t, states, tag, ctx) = ( if idx < Node.null then nil_res else if StateSet.is_empty states then empty_res else begin - let symbol = Grammar.get_symbol_at g start_symbol idx in - if Grammar.is_terminal symbol then - let symbol = Grammar.terminal symbol in - if symbol == nil_symbol then nil_res else - let tag = Grammar.tag symbol in + let symbol = Grammar2.start_tag g idx in + if Grammar2.is_terminal g symbol then + let symbol = Grammar2.terminal symbol in + if symbol == Grammar2.nil_symbol then nil_res else + let tag = Grammar2.tag symbol in let lst, rst, trans = get_trans tag states in - let fs = Grammar.start_first_child g idx in + let fs = Grammar2.start_first_child g idx in let s1, slot1 = start_loop fs lst in - let s2, slot2 = start_loop (Grammar.start_next_sibling g fs) rst in + let s2, slot2 = start_loop (Grammar2.start_next_sibling g fs) rst in let opcode = L3JIT.find cache3 trans s1 s2 in if opcode == L3JIT.dummy then (L3JIT.cache_apply cache3 auto trans s1 s2) empty_slot slot1 slot2 (Obj.magic ()) (Obj.magic ()) else opcode empty_slot slot1 slot2 (Obj.magic ()) (Obj.magic()) else - let nt = Grammar.non_terminal symbol in - let nparam = Grammar.num_params nt in - let child = ref (Grammar.first_child g start_symbol idx) in + let nt = Grammar2.non_terminal symbol in + let rhs = Grammar2.get_rule g nt in + let nparam = Grammar2.get_rank rhs in + let child = ref (Grammar2.start_first_child g idx) in for i = 0 to nparam - 1 do let c = !child in - parameters.(i) <- Grammar.Leaf c; - child := Grammar.next_sibling g start_symbol c; + parameters.(i) <- Grammar2.Leaf c; + child := Grammar2.start_next_sibling g c; done; rule_loop nt states parameters end - and rule_loop (t : Grammar.n_symbol) states a_param = + and rule_loop (t : Grammar2.n_symbol) states a_param = incr rule_counter; -(* log_symbol (t); *) if !rule_counter land (4095) == 0 then begin Gc.minor() end; - let id1 = Grammar.get_id1 g t in - let id2 = Grammar.get_id2 g t in - let param_pos = Grammar.get_param_pos t in - let nparam1 = Grammar.num_children id1 in - let nparam2 = - if Grammar.is_terminal id2 && nil_symbol == (Grammar.terminal id2) then 0 - else Grammar.num_children id2 - in + let rhs = Grammar2.get_rule g t in + let id1 = Grammar2.get_id1 rhs in + let id2 = Grammar2.get_id2 rhs in + let param_pos = Grammar2.get_id2_pos rhs in + let nparam1 = Grammar2.get_id1_rank rhs in + let nparam2 = Grammar2.get_id2_rank rhs in let a_param2 = if nparam2 == 0 then [||] else Array.create nparam2 dummy_leaf in let i = param_pos - 2 in let ip1 = i + 1 in let offset2d = i+2 in let offset2s = i+nparam2 + 1 in blit a_param 0 parameters_tmp 0 (i+1); - parameters_tmp.(ip1) <- Grammar.Node(id2, a_param2); (* id2( ... ) *) + parameters_tmp.(ip1) <- Grammar2.Node(id2, a_param2); (* id2( ... ) *) blit a_param offset2s parameters_tmp offset2d (nparam1 - i - 2); blit a_param ip1 a_param2 0 nparam2; blit parameters_tmp 0 parameters 0 nparam1; - if Grammar.is_non_terminal id1 then - let id1 = Grammar.non_terminal id1 in + if Grammar2.is_non_terminal g id1 then + let id1 = Grammar2.non_terminal id1 in rule_loop id1 states parameters else - let id1 = Grammar.terminal id1 in + let id1 = Grammar2.terminal id1 in terminal_loop id1 states parameters - and terminal_loop (symbol : Grammar.t_symbol) states a_param = - if symbol == nil_symbol then nil_res else begin + and terminal_loop (symbol : Grammar2.t_symbol) states a_param = + if symbol == Grammar2.nil_symbol then nil_res else begin (* todo factor in from start_loop *) - let tag = Grammar.tag symbol in + let tag = Grammar2.tag symbol in let lst, rst, trans = get_trans tag states in let next = a_param.(1) in let s1, slot1 = partial_loop a_param.(0) lst in @@ -560,19 +557,17 @@ DEFINE LOOP_TAG (t, states, tag, ctx) = ( else opcode empty_slot slot1 slot2 (Obj.magic()) (Obj.magic()) - (* End: TODO refactor *) - end and partial_loop l states = match l with - | Grammar.Leaf id -> start_loop id states - | Grammar.Node (id, a_param) -> - let is_term = Grammar.is_terminal id in + | Grammar2.Leaf id -> start_loop id states + | Grammar2.Node (id, a_param) -> + let is_term = Grammar2.is_terminal g id in if is_term then - terminal_loop (Grammar.terminal id) states a_param + terminal_loop (Grammar2.terminal id) states a_param else - rule_loop (Grammar.non_terminal id) states a_param + rule_loop (Grammar2.non_terminal id) states a_param in let _, slot = start_loop (Node.null) auto.init in diff --git a/src/runtime.mli b/src/runtime.mli index eec5980..0ce04ef 100644 --- a/src/runtime.mli +++ b/src/runtime.mli @@ -2,7 +2,7 @@ module type S = sig type result_set val top_down_run : Ata.t -> Tree.t -> Tree.node -> result_set val bottom_up_run : Ata.t -> Tree.t -> Compile.text_query * string -> result_set - val grammar_run : Ata.t -> Grammar.t -> unit -> result_set + val grammar_run : Ata.t -> Grammar2.t -> unit -> result_set end module Make (U : ResJIT.S) : S with type result_set = U.NS.t -- 2.17.1