From ddd758716b1cd691c8748d2e86c179e803b1d3af Mon Sep 17 00:00:00 2001 From: =?utf8?q?Kim=20Nguy=E1=BB=85n?= Date: Tue, 29 Jan 2013 00:22:13 +0100 Subject: [PATCH] Rewrite parser using ocamlyacc to remove further dependencies on camlp4. --- src/parser.mly | 127 ++++++++++++++++++++++++++ src/test.ml | 9 +- src/ulexer.ml | 238 ++++++++++++++----------------------------------- src/ulexer.mli | 21 ----- src/uparser.ml | 5 ++ src/xPath.ml | 18 ++-- src/xPath.mli | 21 +++-- 7 files changed, 219 insertions(+), 220 deletions(-) create mode 100644 src/parser.mly delete mode 100644 src/ulexer.mli create mode 100644 src/uparser.ml diff --git a/src/parser.mly b/src/parser.mly new file mode 100644 index 0000000..39b2187 --- /dev/null +++ b/src/parser.mly @@ -0,0 +1,127 @@ +%{ + + open XPath.Ast + let f () = () +%} + +%token TAG +%token STRING +%token INT +%token FLOAT +%token AXIS +%token RB LB LP RP +%token SLASH SLASHSLASH COLONCOLON STAR PIPE +%token EQ NEQ LT GT LTE GTE OR AND ADD SUB DIV MOD +%token NODE TEXT +%token COMMA +%token EOF + +%left OR +%left AND +%left EQ NEQ +%left LT GT LTE GTE +%left ADD SUB +%left MOD DIV STAR +%nonassoc uminus + +%start xpath +%type xpath + + +%% +xpath: +path EOF { $1 } +; + +path: + path_rev { List.rev $1 } +; + +path_rev: + simple_path { [ $1 ] } +| path_rev PIPE simple_path { $3 :: $1 } +; + + +simple_path: + absolute_path { Absolute (List.rev $1) } +| relative_path { Relative (List.rev $1) } +; + +absolute_path: + SLASH relative_path { $2 } +| SLASHSLASH relative_path { (DescendantOrSelf, node, []) :: $2 } +; + +relative_path: + step { [ $1 ] } +| relative_path SLASH step { $3 :: $1 } +| relative_path SLASHSLASH step { $3 + :: (DescendantOrSelf, node, []) + :: $1 } +; + +step: + axis_test pred_list { let a, b = $1 in a, b, $2 } +; + +axis_test: + AXIS COLONCOLON test { $1, $3 } +| test { Child, $1 } +| AXIS { + let _ = Format.flush_str_formatter () in + let () = Format.fprintf Format.str_formatter "%a" XPath.Ast.print_axis $1 in + let a = Format.flush_str_formatter () in + Child, QNameSet.singleton (QName.of_string a) +} +; + +test: + NODE { node } +| TEXT { text } +| STAR { star } +| TAG { QNameSet.singleton(QName.of_string $1) } +; + +pred_list: + pred_list_rev { List.rev $1 } +; + +pred_list_rev: + { [] } +| pred_list LB expr RB { $3 :: $1 } +; + +expr: + INT { Number(`Int($1)) } +| FLOAT { Number(`Float($1)) } +| STRING { String $1 } +| SUB expr %prec uminus { Unop(Neg, $2) } +| expr AND expr { Binop($1, And, $3) } +| expr OR expr { Binop($1, Or, $3) } +| expr ADD expr { Binop($1, Add, $3) } +| expr SUB expr { Binop($1, Sub, $3) } +| expr STAR expr { Binop($1, Mult, $3) } +| expr DIV expr { Binop($1, Div, $3) } +| expr MOD expr { Binop($1, Mod, $3) } +| expr EQ expr { Binop($1, Eq, $3) } +| expr NEQ expr { Binop($1, Neq, $3) } +| expr LT expr { Binop($1, Lt, $3) } +| expr LTE expr { Binop($1, Lte, $3) } +| expr GT expr { Binop($1, Gt, $3) } +| expr GTE expr { Binop($1, Gte, $3) } +| TAG LP arg_list RP { Fun_call(QName.of_string $1, $3) } +| LP expr RP { $2 } +| path { Path $1 } +; + +arg_list: + { [] } +| arg_list1 { List.rev $1 } +; + +arg_list1: + expr { [ $1 ] } +| arg_list1 COMMA expr { $3 :: $1 } +; + diff --git a/src/test.ml b/src/test.ml index b8f2be8..daabd09 100644 --- a/src/test.ml +++ b/src/test.ml @@ -30,19 +30,14 @@ let doc = close_in fd; d - let query = let arg2 = Sys.argv.(2) in - if arg2 = "-f" - then let fq = open_in Sys.argv.(3) in - let q = XPath.parse_file fq in - close_in fq; q - else XPath.parse_string arg2 + Uparser.xpath Ulexer.token (Ulexing.from_latin1_string arg2) open Format let () = - fprintf err_formatter "Query: %a\n%!" XPath.Ast.print query; + fprintf err_formatter "Query: %a\n%!" XPath.Ast.print_path query; fprintf err_formatter "Document:\n%!"; Tree.print_xml stderr doc (Tree.root doc); exit 0 diff --git a/src/ulexer.ml b/src/ulexer.ml index 157e7d2..a27d55d 100644 --- a/src/ulexer.ml +++ b/src/ulexer.ml @@ -4,129 +4,7 @@ (* Copyright NICTA 2008 *) (* Distributed under the terms of the LGPL (see LICENCE) *) (******************************************************************************) -open Camlp4.PreCast - -module Loc = struct - type t = int * int - - let mk _ = (0,0) - let ghost = (-1,-1) - - let of_lexing_position _ = assert false - let to_ocaml_location _ = assert false - let of_ocaml_location _ = assert false - let of_lexbuf _ = assert false - let of_tuple _ = assert false - let to_tuple _ = assert false - - let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2) - let join (x1, _) = (x1, x1) - let move _ _ _ = assert false - let shift _ _ = assert false - let move_line _ _ = assert false - let file_name _ = assert false - let start_line _ = assert false - let stop_line _ = assert false - let start_bol _ = assert false - let stop_bol _ = assert false - let start_off = fst - let stop_off = snd - let start_pos _ = assert false - let stop_pos _ = assert false - let is_ghost _ = assert false - let ghostify _ = assert false - let set_file_name _ = assert false - let strictly_before _ = assert false - let make_absolute _ = assert false - let print _ = assert false - let dump _ = assert false - let to_string _ = assert false - exception Exc_located of t * exn - let raise loc exn = - match exn with - | Exc_located _ -> raise exn - | _ -> raise (Exc_located (loc, exn)) - let name = ref "_loc" -end - -type token = - | TAG of string - | STRING of string - | INT of int - | KWD of string - | ATT of string - | EOI - -module Token = struct - open Format - module Loc = Loc - type t = token - type token = t - - let sf = Printf.sprintf - - let to_string = - function - | TAG s -> sf "TAG <%s>" s - | STRING s -> sf "STRING \"%s\"" s - | KWD s -> sf "KWD %s" s - | INT i -> sf "INT %i" i - | ATT s -> sf "ATT %s" s - | EOI -> sf "EOI" - - let print ppf x = pp_print_string ppf (to_string x) - - let match_keyword kwd = - function - | KWD kwd' when kwd = kwd' -> true - | _ -> false - - let extract_string = - function - | KWD s | STRING s | TAG s | ATT s -> s - | INT i -> string_of_int i - | tok -> - invalid_arg ("Cannot extract a string from this token: "^ - to_string tok) - - module Error = struct - type t = string - exception E of string - let print = pp_print_string - let to_string x = x - end - - module Filter = struct - type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter - - type t = - { is_kwd : string -> bool; - mutable filter : token_filter } - - let mk is_kwd = - { is_kwd = is_kwd; - filter = (fun s -> s) } - - let filter x = - let f tok loc = - let tok' = tok in - (tok', loc) - in - let rec filter = - parser - | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >] - | [< >] -> [< >] - in - fun strm -> x.filter (filter strm) - - let define_filter x f = x.filter <- f x.filter - - let keyword_added _ _ _ = () - let keyword_removed _ _ = () - end - -end -module Error = Camlp4.Struct.EmptyError +open Parser module L = Ulexing @@ -136,10 +14,10 @@ let error i j s = raise (Error (i,j,s)) (***********************************************************) (* Buffer for string literals *) - + let string_buff = Buffer.create 1024 -let store_lexeme lexbuf = +let store_lexeme lexbuf = Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf) let store_ascii = Buffer.add_char string_buff let store_code = Utf8.store string_buff @@ -156,13 +34,13 @@ let get_stored_string () = let illegal lexbuf = error (L.lexeme_start lexbuf) - (L.lexeme_end lexbuf) + (L.lexeme_end lexbuf) "Illegal character" let return lexbuf tok = (tok, L.loc lexbuf) let return_loc i j tok = (tok, (i,j)) -let regexp ncname_char = +let regexp ncname_char = xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\." let hexa_digit = function @@ -172,36 +50,80 @@ let hexa_digit = function | _ -> -1 let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+) +let regexp digit = ['0'-'9'] +let regexp float = '-'? digit+ ('.' digit+ (['e''E'] digit+)?)? let parse_char lexbuf base i = - let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in + let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in let r = ref 0 in for i = 0 to String.length s - 1 do let c = hexa_digit s.[i] in - if (c >= base) || (c < 0) then + if (c >= base) || (c < 0) then error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit"; r := !r * base + c; done; !r +let keyword_or_tag s = + try + List.assoc s [ + "self", AXIS XPath.Ast.Self; + "descendant", AXIS XPath.Ast.Descendant; + "child", AXIS XPath.Ast.Child; + "descendant-or-self", AXIS XPath.Ast.DescendantOrSelf; + "attribute", AXIS XPath.Ast.Attribute; + "following-sibling", AXIS XPath.Ast.FollowingSibling; + "preceding-sibling", AXIS XPath.Ast.PrecedingSibling; + "parent", AXIS XPath.Ast.Parent; + "ancestor", AXIS XPath.Ast.Ancestor; + "ancestor-or-self", AXIS XPath.Ast.AncestorOrSelf; + "preceding", AXIS XPath.Ast.Preceding; + "following", AXIS XPath.Ast.Following; + "and", AND; + "or" , OR; + "div", DIV; + "mod", MOD; + ] + with + _ -> TAG s + + let rec token = lexer | [' ' '\t' '\n'] -> token lexbuf - | "and" | "not" | "or" | "text()" | "node()" - | "self" | "descendant" | "child" | "descendant-or-self" - | "attribute" | "following-sibling" | "preceding-sibling" - | "parent" | "ancestor" | "ancestor-or-self" | "preceding" | "following" - | "(" |")" | "," | "::" | "/" | "//" | "[" | "]" | "*" | "." | ".." | "@" - -> return lexbuf (KWD (L.utf8_lexeme lexbuf)) - | ncname -> return lexbuf (TAG(L.utf8_lexeme lexbuf)) - | '-'? ['0'-'9']+ -> let i = INT (int_of_string(L.utf8_lexeme lexbuf)) in return lexbuf i + | "*" -> STAR + | "/" -> SLASH + | "//" -> SLASHSLASH + | "::" -> COLONCOLON + | "(" -> LP + | ")" -> RP + | "[" -> LB + | "]" -> RB + | "," -> COMMA + | "|" -> PIPE + | "+" -> ADD + | "-" -> SUB + | "<" -> LT + | "<=" -> LTE + | ">" -> GT + | ">=" -> GTE + | "=" -> EQ + | "!=" -> NEQ + | "node()" -> NODE + | "text()" -> TEXT + | ncname -> keyword_or_tag (L.utf8_lexeme lexbuf) + | float -> + let s = L.utf8_lexeme lexbuf in + (try + INT (int_of_string s) + with + _ -> FLOAT (float_of_string s)) | '"' | "'" -> - let start = L.lexeme_start lexbuf in let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in string (L.lexeme_start lexbuf) double_quote lexbuf; let s = get_stored_string () in - return_loc start (L.lexeme_end lexbuf) (STRING s) + STRING s - | eof -> return lexbuf EOI + | eof -> EOF | _ -> illegal lexbuf and string start double = lexer @@ -211,11 +133,11 @@ and string start double = lexer | '\\' ['\\' '"' '\''] -> store_ascii (L.latin1_lexeme_char lexbuf 1); string start double lexbuf - | "\\n" -> + | "\\n" -> store_ascii '\n'; string start double lexbuf - | "\\t" -> + | "\\t" -> store_ascii '\t'; string start double lexbuf - | "\\r" -> + | "\\r" -> store_ascii '\r'; string start double lexbuf | '\\' ['0'-'9']+ ';' -> store_code (parse_char lexbuf 10 1); @@ -232,33 +154,3 @@ and string start double = lexer string start double lexbuf - -(***********************************************************) - -let enc = ref L.Latin1 -let lexbuf = ref None -let last_tok = ref (KWD "DUMMY") - -let raise_clean e = - clear_buff (); - raise e - -let mk () _loc cs = - let lb = L.from_var_enc_stream enc cs in - lexbuf := Some lb; - let next _ = - let tok, loc = - try token lb - with - | Ulexing.Error -> - raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb, - "Unexpected character")) - | Ulexing.InvalidCodepoint i -> - raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb, - "Code point invalid for the current encoding")) - | e -> raise_clean e - in - last_tok := tok; - Some (tok, loc) - in - Stream.from next diff --git a/src/ulexer.mli b/src/ulexer.mli deleted file mode 100644 index 6fdac83..0000000 --- a/src/ulexer.mli +++ /dev/null @@ -1,21 +0,0 @@ -(******************************************************************************) -(* SXSI : XPath evaluator *) -(* Kim Nguyen (Kim.Nguyen@nicta.com.au) *) -(* Copyright NICTA 2008 *) -(* Distributed under the terms of the LGPL (see LICENCE) *) -(******************************************************************************) -open Camlp4.Sig - -exception Error of int*int*string -type token = - TAG of string - | STRING of string - | INT of int - | KWD of string - | ATT of string - | EOI -module Loc : Loc with type t = int * int -module Token : Token with module Loc = Loc and type t = token -module Error : Error - -val mk : unit -> (Loc.t -> char Stream.t -> (Token.t * Loc.t) Stream.t) diff --git a/src/uparser.ml b/src/uparser.ml new file mode 100644 index 0000000..74fef9d --- /dev/null +++ b/src/uparser.ml @@ -0,0 +1,5 @@ + +let xpath (f : Ulexing.lexbuf -> Parser.token) (l: Ulexing.lexbuf) = + Parser.xpath (fun _ -> f l) (Lexing.from_string "!!dummy!!") + + diff --git a/src/xPath.ml b/src/xPath.ml index 0e71caf..aab6422 100644 --- a/src/xPath.ml +++ b/src/xPath.ml @@ -21,7 +21,7 @@ struct and axis = Self | Attribute | Child | Descendant | DescendantOrSelf | FollowingSibling | Parent | Ancestor | AncestorOrSelf | PrecedingSibling | Preceding | Following - and test = Simple of QNameSet.t + and test = QNameSet.t and binop = Eq | Neq | Lt | Gt | Lte | Gte | Or | And | Add | Sub | Mult | Div | Mod and unop = Neg @@ -45,9 +45,6 @@ struct QName.document; QName.cdata_section; QName.comment]) - let t_text = Simple text - let t_node = Simple node - let t_star = Simple star @@ -122,15 +119,14 @@ struct and print_test fmt ts = try pp fmt "%s" (List.assoc ts - [ t_text,"text()"; - t_node,"node()"; - t_star, "*" ] ) + [ text,"text()"; + node,"node()"; + star, "*" ] ) with Not_found -> pp fmt "%s" - (match ts with - Simple t -> if QNameSet.is_finite t - then QName.to_string (QNameSet.choose t) - else "" + (if QNameSet.is_finite ts + then QName.to_string (QNameSet.choose ts) + else "" ) and print_expr fmt = function diff --git a/src/xPath.mli b/src/xPath.mli index e80a88a..1e4d26d 100644 --- a/src/xPath.mli +++ b/src/xPath.mli @@ -12,10 +12,6 @@ (* ../LICENSE. *) (* *) (***********************************************************************) - - -(** Abstract syntax tree of XPath queries *) - module Ast : sig type path = single_path list @@ -24,7 +20,7 @@ sig and axis = Self | Attribute | Child | Descendant | DescendantOrSelf | FollowingSibling | Parent | Ancestor | AncestorOrSelf | PrecedingSibling | Preceding | Following - and test = Simple of QNameSet.t + and test = QNameSet.t and binop = Eq | Neq | Lt | Gt | Lte | Gte | Or | And | Add | Sub | Mult | Div | Mod and unop = Neg @@ -35,7 +31,16 @@ sig | Path of path | Binop of expr * binop * expr | Unop of unop * expr - - type t = path -end + val text : QNameSet.t + val node : QNameSet.t + val star : QNameSet.t + val print_binop : Format.formatter -> binop -> unit + val print_unop : Format.formatter -> unop -> unit + val print_path : Format.formatter -> path -> unit + val print_single_path : Format.formatter -> single_path -> unit + val print_step : Format.formatter -> step -> unit + val print_axis : Format.formatter -> axis -> unit + val print_test : Format.formatter -> test -> unit + val print_expr : Format.formatter -> expr -> unit + end -- 2.17.1