From ddd758716b1cd691c8748d2e86c179e803b1d3af Mon Sep 17 00:00:00 2001
From: =?utf8?q?Kim=20Nguy=E1=BB=85n?= <kn@lri.fr>
Date: Tue, 29 Jan 2013 00:22:13 +0100
Subject: [PATCH] Rewrite parser using ocamlyacc to remove further dependencies
 on camlp4.

---
 src/parser.mly | 127 ++++++++++++++++++++++++++
 src/test.ml    |   9 +-
 src/ulexer.ml  | 238 ++++++++++++++-----------------------------------
 src/ulexer.mli |  21 -----
 src/uparser.ml |   5 ++
 src/xPath.ml   |  18 ++--
 src/xPath.mli  |  21 +++--
 7 files changed, 219 insertions(+), 220 deletions(-)
 create mode 100644 src/parser.mly
 delete mode 100644 src/ulexer.mli
 create mode 100644 src/uparser.ml
diff --git a/src/parser.mly b/src/parser.mly
new file mode 100644
index 0000000..39b2187
--- /dev/null
+++ b/src/parser.mly
@@ -0,0 +1,127 @@
+%{
+
+  open XPath.Ast
+  let f () = ()
+%}
+
+%token <string> TAG
+%token <string> STRING
+%token <int>  INT
+%token <float> FLOAT
+%token <XPath.Ast.axis> AXIS
+%token RB LB LP RP
+%token SLASH SLASHSLASH COLONCOLON STAR PIPE
+%token EQ NEQ LT GT LTE GTE OR AND ADD SUB DIV MOD
+%token NODE TEXT
+%token COMMA
+%token EOF
+
+%left OR
+%left AND
+%left EQ NEQ
+%left LT GT LTE GTE
+%left ADD SUB
+%left MOD DIV STAR
+%nonassoc uminus
+
+%start xpath
+%type <XPath.Ast.path> xpath
+
+
+%%
+xpath:
+path EOF          { $1 }
+;
+
+path:
+  path_rev { List.rev $1 }
+;
+
+path_rev:
+  simple_path     { [ $1 ] }
+| path_rev PIPE simple_path { $3 :: $1 }
+;
+
+
+simple_path:
+   absolute_path  { Absolute  (List.rev $1) }
+|  relative_path  { Relative  (List.rev $1) }
+;
+
+absolute_path:
+  SLASH relative_path { $2 }
+| SLASHSLASH relative_path { (DescendantOrSelf, node, []) :: $2 }
+;
+
+relative_path:
+  step { [ $1 ] }
+| relative_path SLASH step { $3 :: $1 }
+| relative_path SLASHSLASH step { $3
+                                  :: (DescendantOrSelf, node, [])
+                                  :: $1 }
+;
+
+step:
+  axis_test pred_list    { let a, b = $1 in a, b, $2 }
+;
+
+axis_test:
+  AXIS COLONCOLON test  { $1, $3 }
+| test                  { Child, $1 }
+| AXIS            {
+  let _ = Format.flush_str_formatter () in
+  let () = Format.fprintf Format.str_formatter "%a" XPath.Ast.print_axis $1 in
+  let a = Format.flush_str_formatter () in
+  Child, QNameSet.singleton (QName.of_string a)
+}
+;
+
+test:
+  NODE                { node }
+| TEXT                { text }
+| STAR                { star }
+| TAG                 { QNameSet.singleton(QName.of_string $1) }
+;
+
+pred_list:
+  pred_list_rev             { List.rev $1 }
+;
+
+pred_list_rev:
+             { [] }
+| pred_list LB expr RB   { $3 :: $1 }
+;
+
+expr:
+  INT                       { Number(`Int($1)) }
+| FLOAT                     { Number(`Float($1)) }
+| STRING                    { String $1 }
+| SUB expr     %prec uminus { Unop(Neg, $2) }
+| expr AND expr             { Binop($1, And, $3) }
+| expr OR expr              { Binop($1, Or, $3) }
+| expr ADD expr             { Binop($1, Add, $3) }
+| expr SUB expr             { Binop($1, Sub, $3) }
+| expr STAR expr            { Binop($1, Mult, $3) }
+| expr DIV expr             { Binop($1, Div, $3) }
+| expr MOD expr             { Binop($1, Mod, $3) }
+| expr EQ expr              { Binop($1, Eq, $3) }
+| expr NEQ expr             { Binop($1, Neq, $3) }
+| expr LT expr              { Binop($1, Lt, $3) }
+| expr LTE expr             { Binop($1, Lte, $3) }
+| expr GT expr              { Binop($1, Gt, $3) }
+| expr GTE expr             { Binop($1, Gte, $3) }
+| TAG LP arg_list RP        { Fun_call(QName.of_string $1, $3) }
+| LP expr RP                { $2 }
+| path                      { Path $1 }
+;
+
+arg_list:
+                            { [] }
+| arg_list1                 { List.rev $1 }
+;
+
+arg_list1:
+  expr                     { [ $1 ] }
+| arg_list1 COMMA expr     { $3 :: $1 }
+;
+
diff --git a/src/test.ml b/src/test.ml
index b8f2be8..daabd09 100644
--- a/src/test.ml
+++ b/src/test.ml
@@ -30,19 +30,14 @@ let doc =
   close_in fd; d
 
 
-
 let query = 
   let arg2 = Sys.argv.(2) in
-  if arg2 = "-f"
-  then  let fq = open_in Sys.argv.(3) in
-	let q = XPath.parse_file fq in
-	close_in fq; q
-  else XPath.parse_string arg2
+  Uparser.xpath Ulexer.token (Ulexing.from_latin1_string arg2)
 
 open Format
 
 let () =
-  fprintf err_formatter "Query: %a\n%!" XPath.Ast.print query;
+  fprintf err_formatter "Query: %a\n%!" XPath.Ast.print_path query;
   fprintf err_formatter "Document:\n%!";
   Tree.print_xml stderr doc (Tree.root doc);
   exit 0
diff --git a/src/ulexer.ml b/src/ulexer.ml
index 157e7d2..a27d55d 100644
--- a/src/ulexer.ml
+++ b/src/ulexer.ml
@@ -4,129 +4,7 @@
 (*  Copyright NICTA 2008                                                      *)
 (*  Distributed under the terms of the LGPL (see LICENCE)                     *)
 (******************************************************************************)
-open Camlp4.PreCast
-
-module Loc = struct
-  type t = int * int
-
-  let mk _ = (0,0)
-  let ghost = (-1,-1)
-
-  let of_lexing_position _ = assert false
-  let to_ocaml_location _ = assert false
-  let of_ocaml_location _ = assert false
-  let of_lexbuf _ = assert false
-  let of_tuple _ = assert false
-  let to_tuple _ = assert false
-
-  let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
-  let join (x1, _) = (x1, x1)
-  let move _ _ _ = assert false
-  let shift _ _ = assert false
-  let move_line _ _ = assert false
-  let file_name  _ = assert false
-  let start_line _ = assert false
-  let stop_line  _ = assert false
-  let start_bol  _ = assert false
-  let stop_bol   _ = assert false
-  let start_off  = fst
-  let stop_off   = snd
-  let start_pos  _ = assert false
-  let stop_pos   _ = assert false
-  let is_ghost   _ = assert false
-  let ghostify   _ = assert false
-  let set_file_name _ = assert false
-  let strictly_before _ = assert false
-  let make_absolute _ = assert false
-  let print _ = assert false
-  let dump  _ = assert false
-  let to_string _ = assert false
-  exception Exc_located of t * exn
-  let raise loc exn =
-    match exn with 
-    | Exc_located _ -> raise exn
-    | _ -> raise (Exc_located (loc, exn))
-  let name = ref "_loc"
-end
-
-type token =
-  | TAG of string
-  | STRING of string
-  | INT of int
-  | KWD of string
-  | ATT of string
-  | EOI
-
-module Token = struct
-  open Format
-  module Loc = Loc
-  type t = token
-  type token = t
-
-  let sf = Printf.sprintf
-
-  let to_string =
-    function
-      | TAG s -> sf "TAG <%s>" s
-      | STRING s -> sf "STRING \"%s\"" s
-      | KWD s -> sf "KWD %s" s
-      | INT i  -> sf "INT %i" i
-      | ATT s -> sf "ATT %s" s
-      | EOI    -> sf "EOI"
-
-  let print ppf x = pp_print_string ppf (to_string x)
-
-  let match_keyword kwd =
-    function
-    | KWD kwd' when kwd = kwd' -> true
-    | _ -> false
-
-  let extract_string =
-    function
-      | KWD s | STRING s | TAG s | ATT s  -> s
-      | INT i -> string_of_int i
-      | tok ->
-          invalid_arg ("Cannot extract a string from this token: "^
-			 to_string tok)
-
-  module Error = struct
-    type t = string
-    exception E of string
-    let print = pp_print_string
-    let to_string x = x
-  end
-
-  module Filter = struct
-    type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter
-
-    type t =
-      { is_kwd : string -> bool;
-        mutable filter : token_filter }
-
-    let mk is_kwd =
-      { is_kwd = is_kwd;
-        filter = (fun s -> s) }
-
-    let filter x =
-      let f tok loc =
-        let tok' = tok in
-        (tok', loc)
-      in
-      let rec filter =
-        parser
-        | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >]
-        | [< >] -> [< >]
-      in
-      fun strm -> x.filter (filter strm)
-
-    let define_filter x f = x.filter <- f x.filter
-
-    let keyword_added _ _ _ = ()
-    let keyword_removed _ _ = ()
-  end
-
-end
-module Error = Camlp4.Struct.EmptyError
+open Parser
 
 module L = Ulexing
 
@@ -136,10 +14,10 @@ let error i j s = raise (Error (i,j,s))
 
 (***********************************************************)
 (* Buffer for string literals *)
-  
+
 let string_buff = Buffer.create 1024
 
-let store_lexeme lexbuf = 
+let store_lexeme lexbuf =
   Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
 let store_ascii = Buffer.add_char string_buff
 let store_code  = Utf8.store string_buff
@@ -156,13 +34,13 @@ let get_stored_string () =
 let illegal lexbuf =
   error
     (L.lexeme_start lexbuf)
-    (L.lexeme_end lexbuf) 
+    (L.lexeme_end lexbuf)
     "Illegal character"
 
 let return lexbuf tok = (tok, L.loc lexbuf)
 let return_loc i j tok = (tok, (i,j))
 
-let regexp ncname_char = 
+let regexp ncname_char =
   xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
 
 let hexa_digit = function
@@ -172,36 +50,80 @@ let hexa_digit = function
   | _ -> -1
 
 let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
+let regexp digit = ['0'-'9']
+let regexp float = '-'? digit+ ('.' digit+ (['e''E'] digit+)?)?
 
 let parse_char lexbuf base i =
-  let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in 
+  let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
   let r = ref 0 in
   for i = 0 to String.length s - 1 do
     let c = hexa_digit s.[i] in
-    if (c >= base) || (c < 0) then 
+    if (c >= base) || (c < 0) then
       error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
     r := !r * base + c;
   done;
   !r
 
+let keyword_or_tag s =
+     try
+       List.assoc s [
+         "self", AXIS XPath.Ast.Self;
+         "descendant", AXIS XPath.Ast.Descendant;
+         "child", AXIS XPath.Ast.Child;
+         "descendant-or-self", AXIS XPath.Ast.DescendantOrSelf;
+         "attribute", AXIS XPath.Ast.Attribute;
+         "following-sibling", AXIS XPath.Ast.FollowingSibling;
+         "preceding-sibling", AXIS XPath.Ast.PrecedingSibling;
+         "parent", AXIS XPath.Ast.Parent;
+         "ancestor", AXIS XPath.Ast.Ancestor;
+         "ancestor-or-self", AXIS XPath.Ast.AncestorOrSelf;
+         "preceding", AXIS XPath.Ast.Preceding;
+         "following", AXIS XPath.Ast.Following;
+         "and", AND;
+         "or" , OR;
+         "div", DIV;
+         "mod", MOD;
+       ]
+     with
+       _ -> TAG s
+
+
 let rec token = lexer
  | [' ' '\t' '\n'] -> token lexbuf
- | "and" | "not" | "or"  | "text()" | "node()" 
- | "self" | "descendant" | "child" | "descendant-or-self" 
- | "attribute" | "following-sibling" | "preceding-sibling"
- | "parent" | "ancestor" | "ancestor-or-self" | "preceding" | "following"
- | "(" |")" | "," | "::" | "/" | "//" | "[" | "]" | "*" | "."  | ".." | "@"
-     -> return lexbuf (KWD (L.utf8_lexeme lexbuf))
- | ncname -> return lexbuf (TAG(L.utf8_lexeme lexbuf))
- | '-'? ['0'-'9']+ -> let i =  INT (int_of_string(L.utf8_lexeme lexbuf)) in return lexbuf i
+ | "*" -> STAR
+ | "/" -> SLASH
+ | "//" -> SLASHSLASH
+ | "::" -> COLONCOLON
+ | "("  -> LP
+ | ")"  -> RP
+ | "["  -> LB
+ | "]"  -> RB
+ | ","  -> COMMA
+ | "|"  -> PIPE
+ | "+"  -> ADD
+ | "-"  -> SUB
+ | "<"  -> LT
+ | "<=" -> LTE
+ | ">"  -> GT
+ | ">=" -> GTE
+ | "="  -> EQ
+ | "!=" -> NEQ
+ | "node()" -> NODE
+ | "text()" -> TEXT
+ | ncname -> keyword_or_tag (L.utf8_lexeme lexbuf)
+ | float ->
+     let s = L.utf8_lexeme lexbuf in
+     (try
+       INT (int_of_string s)
+     with
+       _ -> FLOAT (float_of_string s))
  | '"' | "'" ->
-     let start = L.lexeme_start lexbuf in
      let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in
      string (L.lexeme_start lexbuf) double_quote lexbuf;
      let s = get_stored_string () in
-       return_loc start (L.lexeme_end lexbuf) (STRING s)
+     STRING s
 
- | eof -> return lexbuf EOI
+ | eof -> EOF
  | _ -> illegal lexbuf
 
 and string start double = lexer
@@ -211,11 +133,11 @@ and string start double = lexer
   | '\\' ['\\' '"' '\''] ->
       store_ascii (L.latin1_lexeme_char lexbuf 1);
       string start double lexbuf
-  | "\\n" -> 
+  | "\\n" ->
       store_ascii '\n'; string start double lexbuf
-  | "\\t" -> 
+  | "\\t" ->
       store_ascii '\t'; string start double lexbuf
-  | "\\r" -> 
+  | "\\r" ->
       store_ascii '\r'; string start double lexbuf
   | '\\' ['0'-'9']+ ';' ->
       store_code (parse_char lexbuf 10 1);
@@ -232,33 +154,3 @@ and string start double = lexer
       string start double lexbuf
 
 
-
-(***********************************************************)
-
-let enc = ref L.Latin1
-let lexbuf = ref None
-let last_tok = ref (KWD "DUMMY")
-
-let raise_clean e =
-  clear_buff ();
-  raise e
-
-let mk () _loc cs =
-  let lb = L.from_var_enc_stream enc cs in
-  lexbuf := Some lb;
-  let next _ =
-    let tok, loc = 
-      try token lb
-      with
-        | Ulexing.Error -> 
-            raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
-                          "Unexpected character"))
-        | Ulexing.InvalidCodepoint i ->
-            raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
-                          "Code point invalid for the current encoding"))
-        | e -> raise_clean e
-    in
-    last_tok := tok;
-    Some (tok, loc)
-  in
-  Stream.from next
diff --git a/src/ulexer.mli b/src/ulexer.mli
deleted file mode 100644
index 6fdac83..0000000
--- a/src/ulexer.mli
+++ /dev/null
@@ -1,21 +0,0 @@
-(******************************************************************************)
-(*  SXSI : XPath evaluator                                                    *)
-(*  Kim Nguyen (Kim.Nguyen@nicta.com.au)                                      *)
-(*  Copyright NICTA 2008                                                      *)
-(*  Distributed under the terms of the LGPL (see LICENCE)                     *)
-(******************************************************************************)
-open Camlp4.Sig
-
-exception Error of int*int*string
-type token =
-      TAG of string
-    | STRING of string
-    | INT of int
-    | KWD of string
-    | ATT of string
-    | EOI 
-module Loc   : Loc with type t = int * int
-module Token : Token with module Loc = Loc and type t = token
-module Error : Error
-
-val mk : unit -> (Loc.t -> char Stream.t -> (Token.t * Loc.t) Stream.t)
diff --git a/src/uparser.ml b/src/uparser.ml
new file mode 100644
index 0000000..74fef9d
--- /dev/null
+++ b/src/uparser.ml
@@ -0,0 +1,5 @@
+
+let xpath (f : Ulexing.lexbuf -> Parser.token) (l: Ulexing.lexbuf) =
+  Parser.xpath (fun _ -> f l) (Lexing.from_string "!!dummy!!")
+
+  
diff --git a/src/xPath.ml b/src/xPath.ml
index 0e71caf..aab6422 100644
--- a/src/xPath.ml
+++ b/src/xPath.ml
@@ -21,7 +21,7 @@ struct
   and axis = Self | Attribute | Child | Descendant | DescendantOrSelf | FollowingSibling
              | Parent | Ancestor | AncestorOrSelf | PrecedingSibling | Preceding | Following
 
-  and test = Simple of QNameSet.t
+  and test = QNameSet.t
 
   and binop = Eq | Neq | Lt | Gt | Lte | Gte | Or | And | Add | Sub | Mult | Div | Mod
   and unop =  Neg
@@ -45,9 +45,6 @@ struct
                          QName.document;
                          QName.cdata_section;
                          QName.comment])
-  let t_text = Simple text
-  let t_node = Simple node
-  let t_star = Simple star
 
 
 
@@ -122,15 +119,14 @@ struct
   and print_test fmt ts =
     try
       pp fmt "%s" (List.assoc ts
-                     [ t_text,"text()";
-                       t_node,"node()";
-                       t_star, "*" ] )
+                     [ text,"text()";
+                       node,"node()";
+                       star, "*" ] )
     with
       Not_found -> pp fmt "%s"
-        (match ts with
-          Simple t -> if QNameSet.is_finite t
-            then QName.to_string (QNameSet.choose t)
-            else "<INFINITE>"
+        (if QNameSet.is_finite ts
+         then QName.to_string (QNameSet.choose ts)
+         else "<INFINITE>"
         )
 
   and print_expr fmt = function
diff --git a/src/xPath.mli b/src/xPath.mli
index e80a88a..1e4d26d 100644
--- a/src/xPath.mli
+++ b/src/xPath.mli
@@ -12,10 +12,6 @@
 (*  ../LICENSE.                                                        *)
 (*                                                                     *)
 (***********************************************************************)
-
-
-(** Abstract syntax tree of XPath queries *)
-
 module Ast :
 sig
   type path = single_path list
@@ -24,7 +20,7 @@ sig
   and axis = Self | Attribute | Child | Descendant | DescendantOrSelf | FollowingSibling
              | Parent | Ancestor | AncestorOrSelf | PrecedingSibling | Preceding | Following
 
-  and test = Simple of QNameSet.t
+  and test = QNameSet.t
 
   and binop = Eq | Neq | Lt | Gt | Lte | Gte | Or | And | Add | Sub | Mult | Div | Mod
   and unop =  Neg
@@ -35,7 +31,16 @@ sig
     | Path of path
     | Binop of expr * binop * expr
     | Unop of unop * expr
-
-
   type t = path
-end
+  val text : QNameSet.t
+  val node : QNameSet.t
+  val star : QNameSet.t
+  val print_binop : Format.formatter -> binop -> unit
+  val print_unop : Format.formatter -> unop -> unit
+  val print_path : Format.formatter -> path -> unit
+  val print_single_path : Format.formatter -> single_path -> unit
+  val print_step : Format.formatter -> step -> unit
+  val print_axis : Format.formatter -> axis -> unit
+  val print_test : Format.formatter -> test -> unit
+  val print_expr : Format.formatter -> expr -> unit
+  end
-- 
2.17.1