From 3dc9065cb7e4b38bf25e6fb50017efa5b11de4ff Mon Sep 17 00:00:00 2001 From: =?utf8?q?Kim=20Nguy=E1=BB=85n?= Date: Wed, 22 Feb 2012 10:11:29 +0100 Subject: [PATCH] New version of the Grammar data-structure --- src/IntArray.ml | 45 +++++++ src/IntArray.mli | 10 ++ src/bp.ml | 39 ++++++ src/bp.mli | 15 +++ src/grammar2.ml | 342 +++++++++++++++++++++++++++++++++++++++++++++++ src/grammar2.mli | 4 + 6 files changed, 455 insertions(+) create mode 100644 src/IntArray.ml create mode 100644 src/IntArray.mli create mode 100644 src/bp.ml create mode 100644 src/bp.mli create mode 100644 src/grammar2.ml create mode 100644 src/grammar2.mli diff --git a/src/IntArray.ml b/src/IntArray.ml new file mode 100644 index 0000000..d5998df --- /dev/null +++ b/src/IntArray.ml @@ -0,0 +1,45 @@ +type t = { + mutable data : int array; + mutable length : int; + mutable capacity : int; +} + + +let length a = a.length + +let create () = + { data = Array.create 512 0; + length = 0; + capacity = 512 + } + +let resize a = + let ncap = a.capacity lsl 1 in + let b = Array.create ncap 0 in + for i = 0 to a.capacity - 1 do + b.(i) <- a.data.(i) + done; + a.data <- b; + a.capacity <- ncap + +let get a i = + if i >= 0 && i < a.length then a.data.(i) + else failwith "Invalid array access" + +let set a i v = + if i >= 0 && i < a.length then a.data.(i) <- v + else failwith "Invalid array access" + +let unsafe_get a i = a.data.(i) +let unsafe_set a i v = a.data.(i) <- v + + +let push_back a v = + if a.length >= a.capacity then resize a; + a.data.(a.length) <- v; + a.length <- a.length + 1 + +let pack a = + let b = Array.create a.length 0 in + Array.blit a.data 0 b 0 a.length; + b diff --git a/src/IntArray.mli b/src/IntArray.mli new file mode 100644 index 0000000..07f82ca --- /dev/null +++ b/src/IntArray.mli @@ -0,0 +1,10 @@ +type t + +val create : unit -> t +val get : t -> int -> int +val set : t -> int -> int -> unit +val unsafe_get : t -> int -> int +val unsafe_set : t -> int -> int -> unit +val length : t -> int +val push_back : t -> int -> unit +val pack : t -> int array diff --git a/src/bp.ml b/src/bp.ml new file mode 100644 index 0000000..e230931 --- /dev/null +++ b/src/bp.ml @@ -0,0 +1,39 @@ +type cbitmap + +external cbitmap_create : int -> cbitmap = "caml_bitmap_create" +external cbitmap_setbit : cbitmap -> int -> int -> unit = "caml_bitmap_setbit" +external cbitmap_resize : cbitmap -> int -> cbitmap = "caml_bitmap_resize" + + +type bitmap = { mutable data : cbitmap; + mutable size : int; + mutable capacity : int; (* in bits *) + } +let bitmap_increment = 4096 + +let bitmap_create () = + { + data = cbitmap_create bitmap_increment; + size = 0; + capacity = bitmap_increment; + } + +let bitmap_push_back b i = + if b.size >= b.capacity then begin + let ncap = b.capacity + bitmap_increment in + b.data <- cbitmap_resize b.data ncap; + b.capacity <- ncap; + end; + cbitmap_setbit b.data b.size i; + b.size <- b.size + 1 + + +type t +external bp_construct : cbitmap -> int -> t = "caml_bp_construct" +external save : t -> Unix.file_descr -> unit = "caml_bp_save" +external load : Unix.file_descr -> t = "caml_bp_load" +external first_child : t -> int -> int = "caml_bp_first_child" "noalloc" +external next_sibling : t -> int -> int = "caml_bp_next_sibling" "noalloc" +external preorder_rank : t -> int -> int = "caml_bp_preorder_rank" "noalloc" + +let create bm = bp_construct bm.data bm.size diff --git a/src/bp.mli b/src/bp.mli new file mode 100644 index 0000000..7053493 --- /dev/null +++ b/src/bp.mli @@ -0,0 +1,15 @@ +type bitmap + +val bitmap_create : unit -> bitmap +val bitmap_push_back : bitmap -> int -> unit + + +type t + +val create : bitmap -> t +external save : t -> Unix.file_descr -> unit = "caml_bp_save" +external load : Unix.file_descr -> t = "caml_bp_load" +external first_child : t -> int -> int = "caml_bp_first_child" "noalloc" +external next_sibling : t -> int -> int = "caml_bp_next_sibling" "noalloc" +external preorder_rank : t -> int -> int = "caml_bp_preorder_rank" "noalloc" + diff --git a/src/grammar2.ml b/src/grammar2.ml new file mode 100644 index 0000000..82cf036 --- /dev/null +++ b/src/grammar2.ml @@ -0,0 +1,342 @@ +type t = { + start : Bp.t; + tags : int array; + rules : int array; + rules_offset : int; + tag_to_id : (string, int) Hashtbl.t; + tag_of_id : string array +} + + + +module Parse = +struct + + let buffer = Buffer.create 512 + + let parse_tree cin open_tag close_tag = + let rec loop () = + let c = input_char cin in + match c with + '\n'| '>' -> () + | ' ' | ',' | '-' -> loop () + | 'a'..'z' | 'B'..'Z' | '0'..'9' | '_' -> + Buffer.clear buffer; + Buffer.add_char buffer c; + loop_tag false + + | 'A' -> Buffer.clear buffer; + Buffer.add_char buffer c; + loop_tag true + | ')' -> close_tag (); loop () + | _ -> failwith ("Invalid character: " ^ (String.make 1 c)) + + and loop_tag t = + let c = input_char cin in + match c with + | 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' -> + Buffer.add_char buffer c; + loop_tag t + | '(' -> let s = Buffer.contents buffer in + open_tag s t; + Buffer.clear buffer; + loop () + | ' ' -> loop_tag t + | ',' | '-' -> let s = Buffer.contents buffer in + open_tag s t; + close_tag (); + Buffer.clear buffer; + loop () + | ')' -> let s = Buffer.contents buffer in + open_tag s t; + Buffer.clear buffer; + close_tag (); + close_tag (); + loop () + | _ -> failwith ("Invalid character: " ^ (String.make 1 c)) + in + loop () + + + let tag_info = Hashtbl.create 1023 + let tag_of_id = Hashtbl.create 1023 + let current_id = ref 4 + let init() = + Hashtbl.clear tag_info; + Hashtbl.clear tag_of_id; + current_id := 4; + Hashtbl.add tag_info "_ROOT" (0, ~-1, false); + Hashtbl.add tag_info "_A" (1, ~-1, false); + Hashtbl.add tag_info "_T" (2, ~-1, false); + Hashtbl.add tag_info "_AT" (3, ~-1, false); + Hashtbl.add tag_info "_" (4, ~-1, false); + Hashtbl.add tag_of_id 0 "_ROOT"; + Hashtbl.add tag_of_id 1 "_A"; + Hashtbl.add tag_of_id 2 "_T"; + Hashtbl.add tag_of_id 3 "_AT"; + Hashtbl.add tag_of_id 4 "_" + + + let add_tag s nterm = + let id, count, nterm = + try Hashtbl.find tag_info s with + Not_found -> + incr current_id; + let id = !current_id in + Hashtbl.add tag_of_id id s; + (!current_id, ~-1, nterm || s = "START") + in + let r = id, count+1, nterm in + Hashtbl.replace tag_info s r; + r + + + type tree = Node of string * tree list + + let parse_small_tree cin = + let stack = ref [ Node("", []) ] in + let open_tag s isnterm = + if s <> "y0" && s <> "y1" then ignore(add_tag s isnterm); + stack := Node(s, []) :: !stack + in + let close_tag () = + match !stack with + Node(t1, l1) :: Node(t2, l2) :: r -> + stack := Node(t2, Node(t1, List.rev l1)::l2) :: r + | _ -> assert false + in + parse_tree cin open_tag close_tag; + match !stack with + [ Node(_, [ l ]) ] -> l + | _ -> raise End_of_file + + let parse_big_tree cin = + let bv = Bp.bitmap_create () in + let tags = IntArray.create () in + let open_tag s isnterm = + let id, _, _ = add_tag s isnterm in + Bp.bitmap_push_back bv 1; + IntArray.push_back tags id + in + let close_tag () = + Bp.bitmap_push_back bv 0 + in + parse_tree cin open_tag close_tag; + Bp.create bv, IntArray.pack tags + + let eat_char cin = ignore (input_char cin) + + let h_find ?(msg="") h i = + try + Hashtbl.find h i + with + Not_found -> + let r = Obj.repr i in + if Obj.is_int r then Printf.eprintf "Not_found (%s): %i\n%!" msg (Obj.magic i); + if Obj.tag r = Obj.string_tag then Printf.eprintf "Not_found (%s): %s\n%!" msg (Obj.magic i); + raise Not_found + ;; + + let parse cin = + let rules = Hashtbl.create 1023 in + init (); + (* START *) + ignore (parse_small_tree cin); + (* > *) + (* ignore (input_char cin); *) + let bv, tags = parse_big_tree cin in + let () = + try + while true do + let lhs = parse_small_tree cin in + let rhs = parse_small_tree cin in + Hashtbl.add rules lhs rhs + done; + with End_of_file -> () + in + (* First, re-order the tags *) + let old_new_mapping = + Array.init (Hashtbl.length tag_of_id) + (fun i -> h_find ~msg:"1" tag_of_id i) + in + Array.fast_sort (fun tag1 tag2 -> + let t1, count1, isnterm1 = + h_find ~msg:"2" tag_info tag1 + and t2, count2, isnterm2 = + h_find ~msg:"3" tag_info tag2 + in + if t1 <= 4 && t2 <= 4 then compare t1 t2 + else if t1 <= 4 then -1 + else if t2 <= 4 then 1 + else + if (not isnterm1) && (not isnterm2) then compare t1 t2 + else if isnterm1 && isnterm2 then + match tag1, tag2 with + "START", "START" -> 0 + | "START", _ -> ~-1 + | _, "START" -> 1 + | _ -> compare count2 count1 + else if isnterm2 then -1 + else 1) old_new_mapping; + let tag_to_id = Hashtbl.create 503 in + Array.iteri (fun i s -> + Hashtbl.add tag_to_id s i) old_new_mapping; + let renum_tags = Array.copy tags in + for i = 0 to Array.length tags - 1 do + renum_tags.(i) <- + h_find ~msg:"4" tag_to_id (h_find ~msg:"5" tag_of_id (tags.(i))) + done; + let r_array = Array.create (Hashtbl.length rules) 0 in + let rules_offset = h_find ~msg:"6" tag_to_id "START" + 1 in + let pos_id2 l = + let rec loop i l = + match l with + [] -> assert false + | Node(tag, children) :: ll -> + if tag <> "y0" && tag <> "y1" then + tag, i + else loop (i+1) ll + in + loop 1 l + in + Hashtbl.iter (fun lhs rhs -> + let Node( head, _ ) = lhs in + let Node( tag1, params) = rhs in + let tag2, pos2 = pos_id2 params in + let id1 = h_find ~msg:"7" tag_to_id tag1 + and id2 = h_find ~msg:"8" tag_to_id tag2 + in + let rule_ = id2 lsl 27 in + let rule_ = (rule_ lor id1) lsl 2 in + let rule_ = (rule_ lor pos2) lsl 2 in + let rule_ = rule_ lor (List.length params) in + r_array.((h_find ~msg:"9" tag_to_id head) - rules_offset ) <- rule_ + ) rules; + let l = Array.length renum_tags in + let tag32 = Array32.create l 0 in + for i = 0 to l - 1 do + Array32.set tag32 i (renum_tags.(i) land 0x7ffffff); + done; + (* Remove the non-terminal names from the hash tables *) + let tag_to_id2 = Hashtbl.create 31 in + Hashtbl.iter (fun s i -> if i < rules_offset then Hashtbl.add tag_to_id2 s i) + tag_to_id; + { start = bv; + tags = tag32; + rules = renum_tags; + rules_offset = rules_offset; + tag_to_id = tag_to_id2; + tag_of_id = Array.sub old_new_mapping 0 rules_offset + } + +end + +let parse file = + let cin = open_in file in + let g = Parse.parse cin in + close_in cin; + g + +let _GRAMMAR_MAGIC = 0xaabbcc +let _GRAMMAR_VERSION = 2 + +let save g f = + let cout = open_out f in + let write a = Marshal.to_channel cout a [ ] + in + write _GRAMMAR_MAGIC; + write _GRAMMAR_VERSION; + write g.tags; + write g.rules; + write g.rules_offset; + write g.tag_to_id; + write g.tag_of_id; + flush cout; + let fd = Unix.descr_of_out_channel cout in + Bp.save g.start fd; + close_out cout + +let load f = + let cin = open_in f in + let pr_pos () = + Printf.eprintf "Position: %i kiB\n" (pos_in cin / 1024) + in + let read () = Marshal.from_channel cin in + if read () != _GRAMMAR_MAGIC then failwith "Invalid grammar file"; + if read () != _GRAMMAR_VERSION then failwith "Deprecated grammar format"; + pr_pos(); + let tags : int array = read () in + pr_pos(); + let rules : int array = read () in + pr_pos(); + let rules_offset : int = read () in + pr_pos(); + let tag_to_id : (string, int) Hashtbl.t = read () in + pr_pos(); + let tag_of_id : string array = read () in + pr_pos(); + let fd = Unix.descr_of_in_channel cin in + let pos = pos_in cin in + ignore(Unix.lseek fd pos Unix.SEEK_SET); + let bp = Bp.load fd in + close_in cin; + { + start = bp; + tags = tags; + rules = rules; + rules_offset = rules_offset; + tag_to_id = tag_to_id; + tag_of_id = tag_of_id; + } + + +type node = [ `Grammar ] Node.t + +type p_type = [ `Parameter ] +type n_type = [ `NonTerminal ] +type t_type = [ `Terminal ] +type any_type = [ p_type | n_type | t_type ] +type symbol = [ any_type ] Node.t + +type p_symbol = p_type Node.t +type n_symbol = n_type Node.t +type t_symbol = t_type Node.t +type tn_symbol = [ n_type | t_type ] Node.t + + +let is_nil : (t:t_symbol) = + (Node.to_int t) == 4 + +let nil_symbol : t_symbol = + (Node.of_int 4) + +let translate_tag _ t = if t == 4 then ~-1 else t +let to_string t tag = tag_of_id.(Tag.to_int tag) +let register_tag t tag = + try Hashtbl.find t.tag_to_id (Tag.to_int tag) with + Not_found -> 4 + +let tag_operations t = { + Tag.tag = (fun s -> register_tag t s); + Tag.to_string = (fun s -> to_string t s); + Tag.translate = (fun s -> translate_tag t s); +} + + +let rhs_tag t idx = + t.tags.(Bp.preorder_rank t.start idx) + +let rhs_first_child t idx = + Bp.first_child t.start idx + +let rhs_next_sibling t idx = + Bp.next_sibling t.start idx + +let is_non_terminal t (n : [< any_type ] Node.t) = + let n = Node.to_int n in + n >= t.rules_offset + +let is_terminal t (n : [< any_type ] Node.t) = not(is_terminal t n) + +let tag (n : t_symbol) : Tag.t = Obj.magic n + diff --git a/src/grammar2.mli b/src/grammar2.mli new file mode 100644 index 0000000..8d84e4e --- /dev/null +++ b/src/grammar2.mli @@ -0,0 +1,4 @@ +type t +val parse : string -> t +val save : t -> string -> unit +val load : string -> t -- 2.17.1