(* In-memory representation of a grammar.  The comments on the fields describe
   how Parse.parse (below) fills them in. *)
type t = {
  start : Bp.t;
  (* Built by Bp.create from a bitmap that receives 1 for every opened tag and
     0 for every closed tag of the big tree. *)
  tags : int array;
  (* Renumbered tag id of every opened tag of the big tree, in input order. *)
  rules : int array;
  (* One packed rule per non-terminal, stored at index
     (non-terminal id - rules_offset); see get_id1, get_id2 and get_conf. *)
  rules_offset : int;
  (* Renumbered id of START plus one. *)
  tag_to_id : (string, int) Hashtbl.t;
  (* Name to id, restricted to ids strictly below rules_offset. *)
  tag_of_id : string array }
  (* Id to name, for the first rules_offset ids. *)

(* Textual grammar reader.  All functions share the module-level mutable
   state declared below (buffer, tag_info, tag_of_id, current_id), so they
   are neither reentrant nor safe to interleave. *)
module Parse = struct

  (* Scratch buffer in which parse_tree accumulates the current tag name. *)
  let buffer = Buffer.create 512

  (* [parse_tree cin open_tag close_tag] reads characters from [cin] and
     reports the tree structure through callbacks: [open_tag name flag] when
     a tag starts and [close_tag ()] when it ends.  [flag] is true exactly
     when the first character of the name is a capital A.
     Reading stops at a newline or at a greater-than sign.
     Raises Failure on any unexpected character; End_of_file from
     input_char is not caught here. *)
  let parse_tree cin open_tag close_tag =
    (* State: between tags. *)
    let rec loop () =
      let c = input_char cin in
      match c with
        '\n'| '>' -> ()
      | ' ' | ',' | '-' -> loop ()
      | 'a'..'z' | 'B'..'Z' | '0'..'9' | '_' ->
        Buffer.clear buffer; Buffer.add_char buffer c; loop_tag false
      | 'A' ->
        Buffer.clear buffer; Buffer.add_char buffer c; loop_tag true
      | ')' -> close_tag (); loop ()
      | _ -> failwith ("Invalid character: " ^ (String.make 1 c))
    (* State: inside a tag name; [t] is the flag chosen by [loop]. *)
    and loop_tag t =
      let c = input_char cin in
      match c with
      | 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' ->
        Buffer.add_char buffer c; loop_tag t
      | '(' ->
        (* The tag has children: open it and keep reading. *)
        let s = Buffer.contents buffer in
        open_tag s t; Buffer.clear buffer; loop ()
      | ' ' -> loop_tag t
      | ',' | '-' ->
        (* The tag is a leaf: open and close it immediately. *)
        let s = Buffer.contents buffer in
        open_tag s t; close_tag (); Buffer.clear buffer; loop ()
      | ')' ->
        (* The tag is a leaf and also the last child: close it, then close
           its parent. *)
        let s = Buffer.contents buffer in
        open_tag s t; Buffer.clear buffer; close_tag (); close_tag (); loop ()
      | _ -> failwith ("Invalid character: " ^ (String.make 1 c))
    in
    loop ()

  (* Tag name -> (id, count, is_non_terminal). *)
  let tag_info = Hashtbl.create 1023

  (* Tag id -> tag name (ids as allocated by init and add_tag, before the
     renumbering done in parse). *)
  let tag_of_id = Hashtbl.create 1023

  (* Last id handed out; add_tag increments it before use. *)
  let current_id = ref 4

  (* Resets the shared tables and registers the five predefined tags with
     ids 0 to 4, each with count -1 and the non-terminal flag cleared. *)
  let init() =
    Hashtbl.clear tag_info;
    Hashtbl.clear tag_of_id;
    current_id := 4;
    Hashtbl.add tag_info "_ROOT" (0, ~-1, false);
    Hashtbl.add tag_info "_A" (1, ~-1, false);
    Hashtbl.add tag_info "_T" (2, ~-1, false);
    Hashtbl.add tag_info "_AT" (3, ~-1, false);
    Hashtbl.add tag_info "_" (4, ~-1, false);
    Hashtbl.add tag_of_id 0 "_ROOT";
    Hashtbl.add tag_of_id 1 "_A";
    Hashtbl.add tag_of_id 2 "_T";
    Hashtbl.add tag_of_id 3 "_AT";
    Hashtbl.add tag_of_id 4 "_"

  (* [add_tag s nterm] records one more use of the tag named [s] and returns
     its updated (id, count, is_non_terminal) entry.
     For an unknown name a fresh id is allocated, the count starts at 0 and
     the flag is [nterm], forced to true for the name START.  For a known
     name the stored flag is kept and the [nterm] argument is ignored. *)
  let add_tag s nterm =
    let id, count, nterm =
      try Hashtbl.find tag_info s
      with Not_found ->
        incr current_id;
        let id = !current_id in
        Hashtbl.add tag_of_id id s;
        (!current_id, ~-1, nterm || s = "START")
    in
    let r = id, count+1, nterm in
    Hashtbl.replace tag_info s r;
    r

  (* Explicit tree used for the small trees (rule left- and right-hand
     sides): a tag name and the ordered list of children. *)
  type tree = Node of string * tree list

  (* [parse_small_tree cin] parses one tree with parse_tree and returns it as
     a [tree] value.  Every tag except y0 and y1 is registered through
     add_tag.
     Raises End_of_file when the parse does not leave exactly one tree under
     the sentinel (in addition to End_of_file coming from input_char). *)
  let
parse_small_tree cin =
    (* The bottom of the stack is a sentinel node with an empty name. *)
    let stack = ref [ Node("", []) ] in
    let open_tag s isnterm =
      if s <> "y0" && s <> "y1" then ignore(add_tag s isnterm);
      stack := Node(s, []) :: !stack
    in
    (* Pops the finished node and attaches it to its parent.  Children are
       accumulated in reverse, hence the List.rev when a node is closed. *)
    let close_tag () =
      match !stack with
        Node(t1, l1) :: Node(t2, l2) :: r ->
        stack := Node(t2, Node(t1, List.rev l1)::l2) :: r
      | _ -> assert false
    in
    parse_tree cin open_tag close_tag;
    match !stack with
      [ Node(_, [ l ]) ] -> l
    | _ -> raise End_of_file

  (* [parse_big_tree cin] parses one tree without materialising it: every
     opened tag pushes 1 on a bitmap and its id on an IntArray, every closed
     tag pushes 0 on the bitmap.  Returns (Bp.create bitmap, IntArray.pack
     ids). *)
  let parse_big_tree cin =
    let bv = Bp.bitmap_create () in
    let tags = IntArray.create () in
    let open_tag s isnterm =
      let id, _, _ = add_tag s isnterm in
      Bp.bitmap_push_back bv 1;
      IntArray.push_back tags id
    in
    let close_tag () = Bp.bitmap_push_back bv 0 in
    parse_tree cin open_tag close_tag;
    Bp.create bv, IntArray.pack tags

  (* Reads one character from [cin] and discards it. *)
  let eat_char cin = ignore (input_char cin)

  (* [h_find ~msg h i] is Hashtbl.find h i, except that on Not_found it first
     prints [msg] and the key on stderr when the key is an integer or a
     string (detected with Obj), then re-raises Not_found. *)
  let h_find ?(msg="") h i =
    try Hashtbl.find h i
    with Not_found ->
      let r = Obj.repr i in
      if Obj.is_int r then
        Printf.eprintf "Not_found (%s): %i\n%!" msg (Obj.magic i);
      if Obj.tag r = Obj.string_tag then
        Printf.eprintf "Not_found (%s): %s\n%!"
msg (Obj.magic i);
      raise Not_found
  ;;

  (* [parse cin] reads a whole grammar from [cin] and returns it as a [t].
     Steps, in order:
     - reset the tag tables;
     - parse and discard a first small tree (this registers its tags);
     - parse the big tree into a bitmap and a tag array;
     - read (lhs, rhs) pairs of small trees until End_of_file;
     - renumber all tags;
     - pack every rule into one integer.
     Raises Failure on malformed input (from parse_tree) and Not_found from
     h_find if a name is missing from the tables. *)
  let parse cin =
    let rules = Hashtbl.create 1023 in
    init ();
    (* START *)
    ignore (parse_small_tree cin);
    (* > *)
    (* ignore (input_char cin); *)
    let bv, tags = parse_big_tree cin in
    let () =
      try
        while true do
          let lhs = parse_small_tree cin in
          let rhs = parse_small_tree cin in
          Hashtbl.add rules lhs rhs
        done;
      with End_of_file -> ()
    in
    (* First, re-order the tags *)
    (* old_new_mapping starts as the names in allocation order; after the
       sort, the position of a name is its new id.  Order: the five
       predefined ids (0 to 4) first, by id; then the other terminals, by
       id; then the non-terminals, START first and the rest by decreasing
       count. *)
    let old_new_mapping =
      Array.init (Hashtbl.length tag_of_id)
        (fun i -> h_find ~msg:"1" tag_of_id i)
    in
    Array.fast_sort (fun tag1 tag2 ->
        let t1, count1, isnterm1 = h_find ~msg:"2" tag_info tag1
        and t2, count2, isnterm2 = h_find ~msg:"3" tag_info tag2 in
        if t1 <= 4 && t2 <= 4 then compare t1 t2
        else if t1 <= 4 then -1
        else if t2 <= 4 then 1
        else if (not isnterm1) && (not isnterm2) then compare t1 t2
        else if isnterm1 && isnterm2 then
          match tag1, tag2 with
            "START", "START" -> 0
          | "START", _ -> ~-1
          | _, "START" -> 1
          | _ -> compare count2 count1
        else if isnterm2 then -1
        else 1) old_new_mapping;
    let tag_to_id = Hashtbl.create 503 in
    Array.iteri (fun i s -> Hashtbl.add tag_to_id s i) old_new_mapping;
    (* Translate every tag of the big tree: old id -> name -> new id. *)
    let renum_tags = Array.copy tags in
    for i = 0 to Array.length tags - 1 do
      renum_tags.(i) <-
        h_find ~msg:"4" tag_to_id (h_find ~msg:"5" tag_of_id (tags.(i)))
    done;
    (* NOTE(review): Array.create is deprecated in the standard library in
       favour of Array.make - confirm against the compiler version in use. *)
    let r_array = Array.create (Hashtbl.length rules) 0 in
    (* START sorts first among the non-terminals, so every other
       non-terminal has an id of at least rules_offset. *)
    let rules_offset = h_find ~msg:"6" tag_to_id "START" + 1 in
    (* Returns the name and 1-based position of the first child that is not
       a parameter (y0 or y1); fails an assertion if there is none. *)
    let pos_id2 l =
      let rec loop i l =
        match l with
          [] -> assert false
        | Node(tag, children) :: ll ->
          if tag <> "y0" && tag <> "y1" then tag, i
          else loop (i+1) ll
      in
      loop 1 l
    in
    Hashtbl.iter (fun lhs rhs ->
        let Node( head, args ) = lhs in
        let Node( tag1, params) = rhs in
        let tag2, pos2 = pos_id2 params in
        let id1 = h_find ~msg:"7" tag_to_id tag1
        and id2 = h_find ~msg:"8" tag_to_id tag2 in
        (* Shape of the rule, numbered like the constructors of type conf
           declared further down in this file. *)
        let conf =
          if List.length args = 0 then 0
          else if List.length args = 1 then
            if List.length params = 1 then 1
            else if pos2 = 1 then 2
            else 3
          else (* 2 parameters *)
          if List.length params = 1 then 4
          else if pos2 = 1 then 5
          else 6
        in
        (* Packed layout, low to high bits: conf on 3 bits, id1 on the next
           27 bits, id2 above bit 30.
           NOTE(review): id1 is not masked here, so an id that does not fit
           in 27 bits would spill into id2. *)
        let rule_ = id2 lsl 27 in
        let rule_ = (rule_ lor id1) lsl 3 in
        let rule_ = rule_ lor conf in
        r_array.((h_find ~msg:"9" tag_to_id head) - rules_offset ) <- rule_
      ) rules;
    (*let l = Array.length renum_tags in *)
    (*let tag32 = Array32.create l 0 in for i = 0 to l - 1 do Array32.set tag32 i (renum_tags.(i) land 0x7ffffff); done; *)
    (* Remove the non-terminal names from the hash tables *)
    let tag_to_id2 = Hashtbl.create 31 in
    Hashtbl.iter
      (fun s i -> if i < rules_offset then Hashtbl.add tag_to_id2 s i)
      tag_to_id;
    { start = bv;
      tags = renum_tags;
      rules = r_array;
      rules_offset = rules_offset;
      tag_to_id = tag_to_id2;
      tag_of_id = Array.sub old_new_mapping 0 rules_offset }
end

(* [parse file] opens [file], reads a grammar with Parse.parse and closes the
   channel.
   NOTE(review): the channel is not closed if Parse.parse raises. *)
let parse file =
  let cin = open_in file in
  let g = Parse.parse cin in
  close_in cin;
  g

(* First two values written by save and checked by load. *)
let _GRAMMAR_MAGIC = 0xaabbcc
let _GRAMMAR_VERSION = 3

(* [save g f] writes [g] to the file [f]: the magic number, the version and
   every field except [start] are marshalled in sequence; the channel is then
   flushed and [start] is written by Bp.save directly on the underlying file
   descriptor.
   NOTE(review): the channel is not closed if a write raises. *)
let save g f =
  let cout = open_out f in
  let write a = Marshal.to_channel cout a [ ] in
  write _GRAMMAR_MAGIC;
  write _GRAMMAR_VERSION;
  write g.tags;
  write g.rules;
  write g.rules_offset;
  write g.tag_to_id;
  write g.tag_of_id;
  (* Flush before handing the raw descriptor to Bp.save. *)
  flush cout;
  let fd = Unix.descr_of_out_channel cout in
  Bp.save g.start fd;
  close_out cout

(* [load f] reads back a grammar written by save, in the same field order,
   and prints its size on stderr.
   Raises Failure when the magic number or the version does not match.
   NOTE(review): the channel is not closed on those failures, and the
   unmarshalled values are trusted to have the annotated types. *)
let load f =
  let cin = open_in f in
  let read () = Marshal.from_channel cin in
  (* The != operator is physical inequality; both operands are integers
     here. *)
  if read () != _GRAMMAR_MAGIC then failwith "Invalid grammar file";
  if read () != _GRAMMAR_VERSION then failwith "Deprecated grammar format";
  let tags : int array = read () in
  let rules : int array = read () in
  let rules_offset : int = read () in
  let tag_to_id : (string, int) Hashtbl.t = read () in
  let tag_of_id : string array = read () in
  let fd = Unix.descr_of_in_channel cin in
  (* Move the descriptor to the position reached by the channel before
     Bp.load reads from it - presumably because the channel may have read
     ahead into its own buffer. *)
  let pos = pos_in cin in
  ignore(Unix.lseek fd pos Unix.SEEK_SET);
  let bp = Bp.load fd in
  close_in cin;
  let g = { start = bp;
            tags = tags;
            rules = rules;
            rules_offset = rules_offset;
            tag_to_id = tag_to_id;
            tag_of_id = tag_of_id; } in
  Printf.eprintf "Grammar size:%i kb\n%!"
((Ocaml.size_b g + Bp.alloc_stats ())/1024);
  g

(* Node.t values distinguished at the type level by a polymorphic-variant
   index: positions in the start tree, rule right-hand sides, and
   terminal or non-terminal symbols. *)
type node = [ `Start ] Node.t
type n_type = [ `NonTerminal ]
type t_type = [ `Terminal ]
type r_type = [ `Rule ]
type any_type = [ n_type | t_type ]
type rhs = [ r_type ] Node.t
type n_symbol = n_type Node.t
type t_symbol = t_type Node.t
type tn_symbol = [ any_type ] Node.t

(* Partially applied symbol.  NOTE(review): the only visible uses are in the
   disabled code at the end of this file, where Node0, Node1 and Node2 carry
   a symbol applied to zero, one or two arguments. *)
type 'a partial =
  | Cache of 'a
  | Leaf of int*int * StateSet.t array * node
  | Node0 of tn_symbol (* No parameters *)
  | Node1 of tn_symbol * 'a partial
  | Node2 of tn_symbol * 'a partial * 'a partial

(* Id 4 is the id registered for the underscore tag by Parse.init; the
   renumbering in Parse.parse keeps ids 0 to 4 unchanged. *)
let is_nil (t : t_symbol) = (Node.to_int t) == 4
let nil_symbol : t_symbol = (Node.of_int 4)

(* Maps tag 4 to -1 and leaves every other tag unchanged; the first argument
   is ignored. *)
let translate_tag _ t = if t == 4 then ~-1 else t

(* [to_string t tag] is the name stored for [tag] in [t.tag_of_id], or the
   empty string when [tag] is not below the length of that array. *)
let to_string t tag =
  if tag < Array.length t.tag_of_id then t.tag_of_id.(Tag.to_int tag)
  else ""

(* [register_tag t s] is the id of the name [s], or 4 when [s] is unknown.
   Nothing is added to the table. *)
let register_tag t s = try Hashtbl.find t.tag_to_id s with Not_found -> 4

(* Bundles the three functions above into a record with the fields Tag.tag,
   Tag.to_string and Tag.translate. *)
let tag_operations t = {
  Tag.tag = (fun s -> register_tag t s);
  Tag.to_string = (fun s -> to_string t s);
  Tag.translate = (fun s -> translate_tag t s);
}

(* Node number 0 of the start tree. *)
let start_root : node = Node.of_int 0

(* NOTE(review): the source text looks truncated from the return type
   annotation of start_tag up to the tail of another definition: the bracket
   below is never closed, and is_non_terminal (used by is_terminal) as well as
   start_first_child and start_next_sibling (used by the disabled code at the
   end of the file) have no visible definition.  Restore this span from
   version control; it is reproduced here exactly as found. *)
let start_tag g (idx : node) : [= t.rules_offset

(* A symbol is terminal when it is not a non-terminal. *)
let is_terminal t (n : [< any_type ] Node.t) = not(is_non_terminal t n)

(* Identity functions that only change the static type of a symbol; no check
   is performed. *)
external terminal : [< any_type ] Node.t -> t_symbol = "%identity"
external non_terminal : [< any_type ] Node.t -> n_symbol = "%identity"

(* Unchecked cast (Obj.magic) of a terminal symbol to a Tag.t. *)
let tag (n : t_symbol) : Tag.t = Obj.magic n

(* [get_rule g r] is the packed rule of the non-terminal [r]. *)
let get_rule g (r : n_symbol) : rhs =
  Node.of_int (g.rules.((Node.to_int r) - g.rules_offset))

(* Decoders for the packed rule built in Parse.parse: id1 is the 27 bits
   above the 3 low bits, id2 is everything from bit 30 upwards. *)
let get_id1 (r : rhs) : tn_symbol =
  Node.of_int(((Node.to_int r) lsr 3) land 0x7ffffff)
let get_id2 (r : rhs) : tn_symbol = Node.of_int((Node.to_int r) lsr 30)

(* Shape of a rule right-hand side.  In the comments below, B is the symbol
   id1, C is the symbol id2, and y0, y1 are the parameters of the rule. *)
type conf =
  | C0 (* B(C) *)
  | C1 (* B(C(y0)) *)
  | C2 (* B(C, y0) *)
  | C3 (* B(y0, C) *)
  | C4 (* B(C(y0, y1)) *)
  | C5 (* B(C(y0), y1) *)
  | C6 (* B(y0, C(y1)) *)

(* Reads the 3 low bits of the packed rule as a conf.  This goes through
   Obj.magic and so relies on constant constructors being represented by
   their index in declaration order; the value 7 has no constructor. *)
let get_conf (r : rhs) : conf = (Obj.magic ((Node.to_int r) land 0b111))

(* Number of parameters of the rule: 0, 1 or 2. *)
let get_rank (r : rhs) : int =
  match get_conf r with
  | C0 -> 0
  | C1 | C2 | C3 -> 1
  | C4 | C5 | C6 -> 2

(* Number of children of B in the right-hand side. *)
let get_id1_rank (r : rhs) : int =
  match get_conf r with
  | C0 | C1 | C4 -> 1
| _ -> 2

(* 1-based position of C among the children of B. *)
let get_id2_pos (r : rhs) : int =
  match get_conf r with
  | C0 | C1 |C2 | C4 | C5 -> 1
  | _ -> 2

(* Number of children of C in the right-hand side. *)
let get_id2_rank (r : rhs) : int =
  match get_conf r with
  | C0 | C2 | C3 -> 0
  | C1 | C5 | C6 -> 1
  | C4 -> 2

(* True when [tag] is above 4 and its name starts with the character 2.
   NOTE(review): to_string returns the empty string for a tag outside the
   name table, in which case the index 0 access raises Invalid_argument. *)
let is_attribute g tag = tag > 4 && (to_string g tag).[0] == '2'

(* Placeholder argument: a Leaf with -1 counters, no state sets and
   Node.nil. *)
let dummy_param : 'a partial = Leaf (~-1,~-1, [||], Node.nil)

(* Disabled code, kept as found apart from line breaks. *)
(*
let rec start_skip g idx count =
  if idx < Node.null then count else
  let symbol = start_tag g idx in
  if is_terminal g symbol then
    let symbol = terminal symbol in
    if symbol == nil_symbol then count else
    let count = count+1 in
    let fs = start_first_child g idx in
    let countl = start_skip g fs count in
    start_skip g fs countl
  else
    let nt = non_terminal symbol in
    let rhs = get_rule g nt in
    let nparam = get_rank rhs in
    match nparam with
    | 0 -> rule_skip g nt dummy_param dummy_param count
    | 1 -> rule_skip g nt (Leaf(0,StateSet.empty, Node.nil,start_first_child g idx)) dummy_param count
    | 2 ->
      let fc = start_first_child g idx in
      let ns = start_next_sibling g fc in
      rule_skip g nt (Leaf (0,[||],fc)) (Leaf (1,[||],ns)) count
    | _ -> assert false

and rule_skip g t y0 y1 count =
  let rhs = get_rule g t in
  let id1 = get_id1 rhs in
  let id2 = get_id2 rhs in
  let conf = get_conf rhs in
  if is_non_terminal g id1 then
    let id1 = non_terminal id1 in
    match conf with
    | C0 ->rule_skip g id1 (Node0 id2) dummy_param count
    | C1 -> rule_skip g id1 (Node1(id2,y0)) dummy_param count
    | C2 -> rule_skip g id1 (Node0 id2) y0 count
    | C3 -> rule_skip g id1 y0 (Node0 id2) count
    | C4 -> rule_skip g id1 (Node2(id2, y0, y1)) dummy_param count
    | C5 -> rule_skip g id1 (Node1(id2, y0)) y1 count
    | C6 -> rule_skip g id1 y0 (Node1(id2, y1)) count
  else
    let id1 = terminal id1 in
    match conf with
    | C0 | C1 -> assert false
    | C2 -> terminal_skip g id1 (Node0 id2) y0 count
    | C3 -> terminal_skip g id1 y0 (Node0 id2) count
    | C4 -> assert false
    | C5 -> terminal_skip g id1 (Node1(id2, y0)) y1 count
    | C6 -> terminal_skip g id1 y0 (Node1(id2, y1)) count

and terminal_skip g (symbol : t_symbol) y0
y1 count =
  if symbol == nil_symbol then count else
  let count = count + 1 in
  let countl = partial_skip g y0 count in
  partial_skip g y1 countl

and partial_skip g l count =
  match l with
  | Cache _ -> assert false
  | Leaf (_,_,_, id) -> start_skip g id count
  | Node0 id ->
    if (terminal id) == nil_symbol then count
    else rule_skip g (non_terminal id) dummy_param dummy_param count
  | Node1 (id, y0) -> rule_skip g (non_terminal id) y0 dummy_param count
  | Node2 (id, y0, y1) ->
    if is_terminal g id then terminal_skip g (terminal id) y0 y1 count
    else rule_skip g (non_terminal id) y0 y1 count

let dispatch_param0 conf id2 y0 y1 =
  match conf with
  | C0 -> Node0 id2
  | C1 -> Node1(id2,y0)
  | C2 -> Node0 id2
  | C3 -> Node0 id2
  | C4 -> Node2(id2, y0, y1)
  | C5 -> Node1(id2, y0)
  | C6 -> y0

let dispatch_param1 conf id2 y0 y1 =
  match conf with
  | C0 -> dummy_param
  | C1 -> dummy_param
  | C2 -> y0
  | C3 -> Node0 id2
  | C4 -> dummy_param
  | C5 -> y1
  | C6 -> Node1(id2, y1)
*)