src/ulexer.ml

   1 (******************************************************************************)
   2 (*  SXSI : XPath evaluator                                                    *)
   3 (*  Kim Nguyen (Kim.Nguyen@nicta.com.au)                                      *)
   4 (*  Copyright NICTA 2008                                                      *)
   5 (*  Distributed under the terms of the LGPL (see LICENCE)                     *)
   6 (******************************************************************************)
   7 open Parser
   8
   9 module L = Ulexing
  10
  11 exception Error of int * int * string
  12
  13 let error i j s = raise (Error (i,j,s))
  14
  15 (***********************************************************)
  16 (* Buffer for string literals *)
  17
  18 let string_buff = Buffer.create 1024
  19
  20 let store_lexeme lexbuf =
  21   Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
  22 let store_ascii = Buffer.add_char string_buff
  23 let store_code  = Utf8.store string_buff
  24 let clear_buff () = Buffer.clear string_buff
  25 let get_stored_string () =
  26   let s = Buffer.contents string_buff in
  27   clear_buff ();
  28   Buffer.clear string_buff;
  29   s
  30
  31 (***********************************************************)
  32 (* Lexer *)
  33
  34 let illegal lexbuf =
  35   error
  36     (L.lexeme_start lexbuf)
  37     (L.lexeme_end lexbuf)
  38     "Illegal character"
  39
  40 let return lexbuf tok = (tok, L.loc lexbuf)
  41 let return_loc i j tok = (tok, (i,j))
  42
  43 let regexp ncname_char =
  44   xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
  45
  46 let hexa_digit = function
  47   | '0'..'9' as c -> (Char.code c) - (Char.code '0')
  48   | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
  49   | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
  50   | _ -> -1
  51
  52 let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
  53 let regexp digit = ['0'-'9']
  54 let regexp float = '-'? digit+ ('.' digit+ (['e''E'] digit+)?)?
  55
  56 let parse_char lexbuf base i =
  57   let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
  58   let r = ref 0 in
  59   for i = 0 to String.length s - 1 do
  60     let c = hexa_digit s.[i] in
  61     if (c >= base) || (c < 0) then
  62       error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
  63     r := !r * base + c;
  64   done;
  65   !r
  66
  67 let keyword_or_tag s =
  68      try
  69        List.assoc s [
  70          "self", AXIS XPath.Ast.Self;
  71          "descendant", AXIS XPath.Ast.Descendant;
  72          "child", AXIS XPath.Ast.Child;
  73          "descendant-or-self", AXIS XPath.Ast.DescendantOrSelf;
  74          "attribute", AXIS XPath.Ast.Attribute;
  75          "following-sibling", AXIS XPath.Ast.FollowingSibling;
  76          "preceding-sibling", AXIS XPath.Ast.PrecedingSibling;
  77          "parent", AXIS XPath.Ast.Parent;
  78          "ancestor", AXIS XPath.Ast.Ancestor;
  79          "ancestor-or-self", AXIS XPath.Ast.AncestorOrSelf;
  80          "preceding", AXIS XPath.Ast.Preceding;
  81          "following", AXIS XPath.Ast.Following;
  82          "and", AND;
  83          "or" , OR;
  84          "div", DIV;
  85          "mod", MOD;
  86        ]
  87      with
  88        _ -> TAG s
  89
  90
  91 let rec token = lexer
  92  | [' ' '\t' '\n'] -> token lexbuf
  93  | "*" -> STAR
  94  | "/" -> SLASH
  95  | "//" -> SLASHSLASH
  96  | "::" -> COLONCOLON
  97  | "("  -> LP
  98  | ")"  -> RP
  99  | "["  -> LB
 100  | "]"  -> RB
 101  | ","  -> COMMA
 102  | "|"  -> PIPE
 103  | "+"  -> ADD
 104  | "-"  -> SUB
 105  | "<"  -> LT
 106  | "<=" -> LTE
 107  | ">"  -> GT
 108  | ">=" -> GTE
 109  | "="  -> EQ
 110  | "!=" -> NEQ
 111  | "node()" -> NODE
 112  | "text()" -> TEXT
 113  | ncname -> keyword_or_tag (L.utf8_lexeme lexbuf)
 114  | float ->
 115      let s = L.utf8_lexeme lexbuf in
 116      (try
 117        INT (int_of_string s)
 118      with
 119        _ -> FLOAT (float_of_string s))
 120  | '"' | "'" ->
 121      let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in
 122      string (L.lexeme_start lexbuf) double_quote lexbuf;
 123      let s = get_stored_string () in
 124      STRING s
 125
 126  | eof -> EOF
 127  | _ -> illegal lexbuf
 128
 129 and string start double = lexer
 130   | '"' | "'" ->
 131       let d = L.latin1_lexeme_char lexbuf 0 = '"' in
 132       if d != double then (store_lexeme lexbuf; string start double lexbuf)
 133   | '\\' ['\\' '"' '\''] ->
 134       store_ascii (L.latin1_lexeme_char lexbuf 1);
 135       string start double lexbuf
 136   | "\\n" ->
 137       store_ascii '\n'; string start double lexbuf
 138   | "\\t" ->
 139       store_ascii '\t'; string start double lexbuf
 140   | "\\r" ->
 141       store_ascii '\r'; string start double lexbuf
 142   | '\\' ['0'-'9']+ ';' ->
 143       store_code (parse_char lexbuf 10 1);
 144       string start double lexbuf
 145   | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
 146       store_code (parse_char lexbuf 16 2);
 147       string start double lexbuf
 148   | '\\' ->
 149       illegal lexbuf;
 150   | eof ->
 151       error start (start+1) "Unterminated string"
 152   | _ ->
 153       store_lexeme lexbuf;
 154       string start double lexbuf
 155
 156