src/ulexer.ml

   1 (******************************************************************************)
   2 (*  SXSI : XPath evaluator                                                    *)
   3 (*  Kim Nguyen (Kim.Nguyen@nicta.com.au)                                      *)
   4 (*  Copyright NICTA 2008                                                      *)
   5 (*  Distributed under the terms of the LGPL (see LICENCE)                     *)
   6 (******************************************************************************)
   7 open Camlp4.PreCast
   8
   9 module Loc = struct
  10   type t = int * int
  11
  12   let mk _ = (0,0)
  13   let ghost = (-1,-1)
  14
  15   let of_lexing_position _ = assert false
  16   let to_ocaml_location _ = assert false
  17   let of_ocaml_location _ = assert false
  18   let of_lexbuf _ = assert false
  19   let of_tuple _ = assert false
  20   let to_tuple _ = assert false
  21
  22   let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
  23   let join (x1, _) = (x1, x1)
  24   let move _ _ _ = assert false
  25   let shift _ _ = assert false
  26   let move_line _ _ = assert false
  27   let file_name  _ = assert false
  28   let start_line _ = assert false
  29   let stop_line  _ = assert false
  30   let start_bol  _ = assert false
  31   let stop_bol   _ = assert false
  32   let start_off  = fst
  33   let stop_off   = snd
  34   let start_pos  _ = assert false
  35   let stop_pos   _ = assert false
  36   let is_ghost   _ = assert false
  37   let ghostify   _ = assert false
  38   let set_file_name _ = assert false
  39   let strictly_before _ = assert false
  40   let make_absolute _ = assert false
  41   let print _ = assert false
  42   let dump  _ = assert false
  43   let to_string _ = assert false
  44   exception Exc_located of t * exn
  45   let raise loc exn =
  46     match exn with
  47     | Exc_located _ -> raise exn
  48     | _ -> raise (Exc_located (loc, exn))
  49   let name = ref "_loc"
  50 end
  51
  52 type token =
  53   | TAG of string
  54   | STRING of string
  55   | INT of int
  56   | KWD of string
  57   | ATT of string
  58   | EOI
  59
  60 module Token = struct
  61   open Format
  62   module Loc = Loc
  63   type t = token
  64   type token = t
  65
  66   let sf = Printf.sprintf
  67
  68   let to_string =
  69     function
  70       | TAG s -> sf "TAG <%s>" s
  71       | STRING s -> sf "STRING \"%s\"" s
  72       | KWD s -> sf "KWD %s" s
  73       | INT i  -> sf "INT %i" i
  74       | ATT s -> sf "ATT %s" s
  75       | EOI    -> sf "EOI"
  76
  77   let print ppf x = pp_print_string ppf (to_string x)
  78
  79   let match_keyword kwd =
  80     function
  81     | KWD kwd' when kwd = kwd' -> true
  82     | _ -> false
  83
  84   let extract_string =
  85     function
  86       | KWD s | STRING s | TAG s | ATT s  -> s
  87       | INT i -> string_of_int i
  88       | tok ->
  89           invalid_arg ("Cannot extract a string from this token: "^
  90                          to_string tok)
  91
  92   module Error = struct
  93     type t = string
  94     exception E of string
  95     let print = pp_print_string
  96     let to_string x = x
  97   end
  98
  99   module Filter = struct
 100     type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter
 101
 102     type t =
 103       { is_kwd : string -> bool;
 104         mutable filter : token_filter }
 105
 106     let mk is_kwd =
 107       { is_kwd = is_kwd;
 108         filter = (fun s -> s) }
 109
 110     let filter x =
 111       let f tok loc =
 112         let tok' = tok in
 113         (tok', loc)
 114       in
 115       let rec filter =
 116         parser
 117         | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >]
 118         | [< >] -> [< >]
 119       in
 120       fun strm -> x.filter (filter strm)
 121
 122     let define_filter x f = x.filter <- f x.filter
 123
 124     let keyword_added _ _ _ = ()
 125     let keyword_removed _ _ = ()
 126   end
 127
 128 end
 129 module Error = Camlp4.Struct.EmptyError
 130
 131 module L = Ulexing
 132
 133 exception Error of int * int * string
 134
 135 let error i j s = raise (Error (i,j,s))
 136
 137 (***********************************************************)
 138 (* Buffer for string literals *)
 139
 140 let string_buff = Buffer.create 1024
 141
 142 let store_lexeme lexbuf =
 143   Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
 144 let store_ascii = Buffer.add_char string_buff
 145 let store_code  = Utf8.store string_buff
 146 let clear_buff () = Buffer.clear string_buff
 147 let get_stored_string () =
 148   let s = Buffer.contents string_buff in
 149   clear_buff ();
 150   Buffer.clear string_buff;
 151   s
 152
 153 (***********************************************************)
 154 (* Lexer *)
 155
 156 let illegal lexbuf =
 157   error
 158     (L.lexeme_start lexbuf)
 159     (L.lexeme_end lexbuf)
 160     "Illegal character"
 161
 162 let return lexbuf tok = (tok, L.loc lexbuf)
 163 let return_loc i j tok = (tok, (i,j))
 164
 165 let regexp ncname_char =
 166   xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
 167
 168 let hexa_digit = function
 169   | '0'..'9' as c -> (Char.code c) - (Char.code '0')
 170   | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
 171   | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
 172   | _ -> -1
 173
 174 let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
 175
 176 let parse_char lexbuf base i =
 177   let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
 178   let r = ref 0 in
 179   for i = 0 to String.length s - 1 do
 180     let c = hexa_digit s.[i] in
 181     if (c >= base) || (c < 0) then
 182       error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
 183     r := !r * base + c;
 184   done;
 185   !r
 186
 187 let rec token = lexer
 188  | [' ' '\t' '\n'] -> token lexbuf
 189  | "and" | "not" | "or"  | "text()" | "node()"
 190  | "self" | "descendant" | "child" | "descendant-or-self"
 191  | "attribute" | "following-sibling" | "preceding-sibling"
 192  | "parent" | "ancestor" | "ancestor-or-self" | "preceding" | "following"
 193  | "(" |")" | "," | "::" | "/" | "//" | "[" | "]" | "*" | "."  | ".." | "@"
 194      -> return lexbuf (KWD (L.utf8_lexeme lexbuf))
 195  | ncname -> return lexbuf (TAG(L.utf8_lexeme lexbuf))
 196  | '-'? ['0'-'9']+ -> let i =  INT (int_of_string(L.utf8_lexeme lexbuf)) in return lexbuf i
 197  | '"' | "'" ->
 198      let start = L.lexeme_start lexbuf in
 199      let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in
 200      string (L.lexeme_start lexbuf) double_quote lexbuf;
 201      let s = get_stored_string () in
 202        return_loc start (L.lexeme_end lexbuf) (STRING s)
 203
 204  | eof -> return lexbuf EOI
 205  | _ -> illegal lexbuf
 206
 207 and string start double = lexer
 208   | '"' | "'" ->
 209       let d = L.latin1_lexeme_char lexbuf 0 = '"' in
 210       if d != double then (store_lexeme lexbuf; string start double lexbuf)
 211   | '\\' ['\\' '"' '\''] ->
 212       store_ascii (L.latin1_lexeme_char lexbuf 1);
 213       string start double lexbuf
 214   | "\\n" ->
 215       store_ascii '\n'; string start double lexbuf
 216   | "\\t" ->
 217       store_ascii '\t'; string start double lexbuf
 218   | "\\r" ->
 219       store_ascii '\r'; string start double lexbuf
 220   | '\\' ['0'-'9']+ ';' ->
 221       store_code (parse_char lexbuf 10 1);
 222       string start double lexbuf
 223   | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
 224       store_code (parse_char lexbuf 16 2);
 225       string start double lexbuf
 226   | '\\' ->
 227       illegal lexbuf;
 228   | eof ->
 229       error start (start+1) "Unterminated string"
 230   | _ ->
 231       store_lexeme lexbuf;
 232       string start double lexbuf
 233
 234
 235
 236 (***********************************************************)
 237
 238 let enc = ref L.Latin1
 239 let lexbuf = ref None
 240 let last_tok = ref (KWD "DUMMY")
 241
 242 let raise_clean e =
 243   clear_buff ();
 244   raise e
 245
 246 let mk () _loc cs =
 247   let lb = L.from_var_enc_stream enc cs in
 248   lexbuf := Some lb;
 249   let next _ =
 250     let tok, loc =
 251       try token lb
 252       with
 253         | Ulexing.Error ->
 254             raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
 255                           "Unexpected character"))
 256         | Ulexing.InvalidCodepoint i ->
 257             raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
 258                           "Code point invalid for the current encoding"))
 259         | e -> raise_clean e
 260     in
 261     last_tok := tok;
 262     Some (tok, loc)
 263   in
 264   Stream.from next