X-Git-Url: http://git.nguyen.vg/gitweb/?p=tatoo.git;a=blobdiff_plain;f=src%2Fxpath%2Fulexer.ml;fp=src%2Fxpath%2Fulexer.ml;h=aa66179b20e2bdb23b30a3e874660f59634b001c;hp=0000000000000000000000000000000000000000;hb=30bc0bb1291426e5e26eb2dee1ffc41e4c246349;hpb=d9c0e4863807eaf472e875a4bad35cfefe985c95 diff --git a/src/xpath/ulexer.ml b/src/xpath/ulexer.ml new file mode 100644 index 0000000..aa66179 --- /dev/null +++ b/src/xpath/ulexer.ml @@ -0,0 +1,169 @@ +(***********************************************************************) +(* *) +(* TAToo *) +(* *) +(* Kim Nguyen, LRI UMR8623 *) +(* Université Paris-Sud & CNRS *) +(* *) +(* Copyright 2010-2013 Université Paris-Sud and Centre National de la *) +(* Recherche Scientifique. All rights reserved. This file is *) +(* distributed under the terms of the GNU Lesser General Public *) +(* License, with the special exception on linking described in file *) +(* ../LICENSE. *) +(* *) +(***********************************************************************) + +(* + Time-stamp: +*) + +open Xpath_internal_parser + +module L = Ulexing + +exception Error of int * int * string + +let error i j s = raise (Error (i,j,s)) + +(***********************************************************) +(* Buffer for string literals *) + +let string_buff = Buffer.create 1024 + +let store_lexeme lexbuf = + Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf) +let store_ascii = Buffer.add_char string_buff +let store_code = Utf8.store string_buff +let clear_buff () = Buffer.clear string_buff +let get_stored_string () = + let s = Buffer.contents string_buff in + clear_buff (); + Buffer.clear string_buff; + s + +(***********************************************************) +(* Lexer *) + +let illegal lexbuf = + error + (L.lexeme_start lexbuf) + (L.lexeme_end lexbuf) + "Illegal character" + +let return lexbuf tok = (tok, L.loc lexbuf) +let return_loc i j tok = (tok, (i,j)) + +let regexp ncname_char = + xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\." + +let hexa_digit = function + | '0'..'9' as c -> (Char.code c) - (Char.code '0') + | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10 + | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10 + | _ -> -1 + +let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+) +let regexp digit = ['0'-'9'] +let regexp float = '-'? digit+ ('.' digit+ (['e''E'] digit+)?)? + +let parse_char lexbuf base i = + let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in + let r = ref 0 in + for i = 0 to String.length s - 1 do + let c = hexa_digit s.[i] in + if (c >= base) || (c < 0) then + error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit"; + r := !r * base + c; + done; + !r + +let keyword_or_tag s = + try + List.assoc s [ + "self", AXIS Ast.Self; + "descendant", AXIS Ast.Descendant; + "child", AXIS Ast.Child; + "descendant-or-self", AXIS Ast.DescendantOrSelf; + "attribute", AXIS Ast.Attribute; + "following-sibling", AXIS Ast.FollowingSibling; + "preceding-sibling", AXIS Ast.PrecedingSibling; + "parent", AXIS Ast.Parent; + "ancestor", AXIS Ast.Ancestor; + "ancestor-or-self", AXIS Ast.AncestorOrSelf; + "preceding", AXIS Ast.Preceding; + "following", AXIS Ast.Following; + "and", AND; + "or" , OR; + "div", DIV; + "mod", MOD; + ] + with + _ -> TAG s + + +let rec token = lexer + | [' ' '\t' '\n'] -> token lexbuf + | "*" -> STAR + | "/" -> SLASH + | "//" -> SLASHSLASH + | "::" -> COLONCOLON + | "(" -> LP + | ")" -> RP + | "[" -> LB + | "]" -> RB + | "," -> COMMA + | "|" -> PIPE + | "+" -> ADD + | "-" -> SUB + | "<" -> LT + | "<=" -> LTE + | ">" -> GT + | ">=" -> GTE + | "=" -> EQ + | "!=" -> NEQ + | "node()" -> NODE + | "text()" -> TEXT + | ncname -> keyword_or_tag (L.utf8_lexeme lexbuf) + | float -> + let s = L.utf8_lexeme lexbuf in + (try + INT (int_of_string s) + with + _ -> FLOAT (float_of_string s)) + | '"' | "'" -> + let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in + string (L.lexeme_start lexbuf) double_quote lexbuf; + let s = get_stored_string () in + STRING s + + | eof -> EOF + | _ -> illegal lexbuf + +and string start double = lexer + | '"' | "'" -> + let d = L.latin1_lexeme_char lexbuf 0 = '"' in + if d != double then (store_lexeme lexbuf; string start double lexbuf) + | '\\' ['\\' '"' '\''] -> + store_ascii (L.latin1_lexeme_char lexbuf 1); + string start double lexbuf + | "\\n" -> + store_ascii '\n'; string start double lexbuf + | "\\t" -> + store_ascii '\t'; string start double lexbuf + | "\\r" -> + store_ascii '\r'; string start double lexbuf + | '\\' ['0'-'9']+ ';' -> + store_code (parse_char lexbuf 10 1); + string start double lexbuf + | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' -> + store_code (parse_char lexbuf 16 2); + string start double lexbuf + | '\\' -> + illegal lexbuf; + | eof -> + error start (start+1) "Unterminated string" + | _ -> + store_lexeme lexbuf; + string start double lexbuf + +