src/xpath/ulexer.ml

   1 (***********************************************************************)
   2 (*                                                                     *)
   3 (*                               TAToo                                 *)
   4 (*                                                                     *)
   5 (*                     Kim Nguyen, LRI UMR8623                         *)
   6 (*                   Université Paris-Sud & CNRS                       *)
   7 (*                                                                     *)
   8 (*  Copyright 2010-2013 Université Paris-Sud and Centre National de la *)
   9 (*  Recherche Scientifique. All rights reserved.  This file is         *)
  10 (*  distributed under the terms of the GNU Lesser General Public       *)
  11 (*  License, with the special exception on linking described in file   *)
  12 (*  ../LICENSE.                                                        *)
  13 (*                                                                     *)
  14 (***********************************************************************)
  15
  16 open Xpath_internal_parser
  17
  18 module L = Ulexing
  19
  20 exception Error of int * int * string
  21
  22 let error i j s = raise (Error (i,j,s))
  23
  24 (***********************************************************)
  25 (* Buffer for string literals *)
  26
  27 let string_buff = Buffer.create 1024
  28
  29 let store_lexeme lexbuf =
  30   Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
  31 let store_ascii = Buffer.add_char string_buff
  32 let store_code  = Utf8.store string_buff
  33 let clear_buff () = Buffer.clear string_buff
  34 let get_stored_string () =
  35   let s = Buffer.contents string_buff in
  36   clear_buff ();
  37   Buffer.clear string_buff;
  38   s
  39
  40 (***********************************************************)
  41 (* Lexer *)
  42
  43 let illegal lexbuf =
  44   error
  45     (L.lexeme_start lexbuf)
  46     (L.lexeme_end lexbuf)
  47     "Illegal character"
  48
  49 let return lexbuf tok = (tok, L.loc lexbuf)
  50 let return_loc i j tok = (tok, (i,j))
  51
  52 let regexp ncname_char =
  53   xml_letter | xml_digit | [ '-' '_' '.'] | xml_combining_char | xml_extender
  54
  55 let hexa_digit = function
  56   | '0'..'9' as c -> (Char.code c) - (Char.code '0')
  57   | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
  58   | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
  59   | _ -> -1
  60
  61 let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
  62 let regexp digit = ['0'-'9']
  63 let regexp float = '-'? digit+ ('.' digit+ (['e''E'] digit+)?)?
  64
  65 let parse_char lexbuf base i =
  66   let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
  67   let r = ref 0 in
  68   for i = 0 to String.length s - 1 do
  69     let c = hexa_digit s.[i] in
  70     if (c >= base) || (c < 0) then
  71       error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
  72     r := !r * base + c;
  73   done;
  74   !r
  75
  76 let keyword_or_tag s =
  77      try
  78        List.assoc s [
  79          "self", AXIS Ast.Self;
  80          "descendant", AXIS (Ast.Descendant false);
  81          "child", AXIS Ast.Child;
  82          "descendant-or-self", AXIS (Ast.Descendant true);
  83          "attribute", AXIS Ast.Attribute;
  84          "following-sibling", AXIS Ast.FollowingSibling;
  85          "preceding-sibling", AXIS Ast.PrecedingSibling;
  86          "parent", AXIS Ast.Parent;
  87          "ancestor", AXIS (Ast.Ancestor false);
  88          "ancestor-or-self", AXIS (Ast.Ancestor true);
  89          "preceding", AXIS Ast.Preceding;
  90          "following", AXIS Ast.Following;
  91          "and", AND;
  92          "or" , OR;
  93          "div", DIV;
  94          "mod", MOD;
  95        ]
  96      with
  97        _ -> TAG s
  98
  99
 100 let rec token = lexer
 101  | [' ' '\t' '\n'] -> token lexbuf
 102  | "*" -> STAR
 103  | "/" -> SLASH
 104  | "//" -> SLASHSLASH
 105  | "::" -> COLONCOLON
 106  | "("  -> LP
 107  | ")"  -> RP
 108  | "["  -> LB
 109  | "]"  -> RB
 110  | ","  -> COMMA
 111  | "|"  -> PIPE
 112  | "+"  -> ADD
 113  | "-"  -> SUB
 114  | "<"  -> LT
 115  | "<=" -> LTE
 116  | ">"  -> GT
 117  | ">=" -> GTE
 118  | "="  -> EQ
 119  | "!=" -> NEQ
 120  | ".." -> DOTDOT
 121  | "."  -> DOT
 122  | "node()" -> NODE
 123  | "text()" -> TEXT
 124  | "comment()" -> COMMENT
 125  | '@' ncname -> ATTNAME (L.utf8_lexeme lexbuf)
 126  | "processing-instruction()" -> PI ""
 127  | "processing-instruction('"ncname"')"
 128  | "processing-instruction(\""ncname"\")"->
 129      let s = L.utf8_lexeme lexbuf in
 130      PI (String.sub s 24 (String.length s - 26))
 131  | ncname -> keyword_or_tag (L.utf8_lexeme lexbuf)
 132  | float ->
 133      let s = L.utf8_lexeme lexbuf in
 134      (try
 135        INT (int_of_string s)
 136      with
 137        _ -> FLOAT (float_of_string s))
 138  | '"' | "'" ->
 139      let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in
 140      string (L.lexeme_start lexbuf) double_quote lexbuf;
 141      let s = get_stored_string () in
 142      STRING s
 143
 144  | eof -> EOF
 145  | _ -> illegal lexbuf
 146
 147 and string start double = lexer
 148   | '"' | "'" ->
 149       let d = L.latin1_lexeme_char lexbuf 0 = '"' in
 150       if d != double then (store_lexeme lexbuf; string start double lexbuf)
 151   | '\\' ['\\' '"' '\''] ->
 152       store_ascii (L.latin1_lexeme_char lexbuf 1);
 153       string start double lexbuf
 154   | "\\n" ->
 155       store_ascii '\n'; string start double lexbuf
 156   | "\\t" ->
 157       store_ascii '\t'; string start double lexbuf
 158   | "\\r" ->
 159       store_ascii '\r'; string start double lexbuf
 160   | '\\' ['0'-'9']+ ';' ->
 161       store_code (parse_char lexbuf 10 1);
 162       string start double lexbuf
 163   | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
 164       store_code (parse_char lexbuf 16 2);
 165       string start double lexbuf
 166   | '\\' ->
 167       illegal lexbuf;
 168   | eof ->
 169       error start (start+1) "Unterminated string"
 170   | _ ->
 171       store_lexeme lexbuf;
 172       string start double lexbuf