(***********************************************************************) (* *) (* TAToo *) (* *) (* Kim Nguyen, LRI UMR8623 *) (* Université Paris-Sud & CNRS *) (* *) (* Copyright 2010-2013 Université Paris-Sud and Centre National de la *) (* Recherche Scientifique. All rights reserved. This file is *) (* distributed under the terms of the GNU Lesser General Public *) (* License, with the special exception on linking described in file *) (* ../LICENSE. *) (* *) (***********************************************************************) open Xpath_internal_parser module L = Ulexing exception Error of int * int * string let error i j s = raise (Error (i,j,s)) (***********************************************************) (* Buffer for string literals *) let string_buff = Buffer.create 1024 let store_lexeme lexbuf = Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf) let store_ascii = Buffer.add_char string_buff let store_code = Utf8.store string_buff let clear_buff () = Buffer.clear string_buff let get_stored_string () = let s = Buffer.contents string_buff in clear_buff (); Buffer.clear string_buff; s (***********************************************************) (* Lexer *) let illegal lexbuf = error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "Illegal character" let return lexbuf tok = (tok, L.loc lexbuf) let return_loc i j tok = (tok, (i,j)) let regexp ncname_char = xml_letter | xml_digit | [ '-' '_' '.'] | xml_combining_char | xml_extender let hexa_digit = function | '0'..'9' as c -> (Char.code c) - (Char.code '0') | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10 | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10 | _ -> -1 let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+) let regexp digit = ['0'-'9'] let regexp float = '-'? digit+ ('.' digit+ (['e''E'] digit+)?)? let parse_char lexbuf base i = let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in let r = ref 0 in for i = 0 to String.length s - 1 do let c = hexa_digit s.[i] in if (c >= base) || (c < 0) then error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit"; r := !r * base + c; done; !r let keyword_or_tag s = try List.assoc s [ "self", AXIS Ast.Self; "descendant", AXIS (Ast.Descendant false); "child", AXIS Ast.Child; "descendant-or-self", AXIS (Ast.Descendant true); "attribute", AXIS Ast.Attribute; "following-sibling", AXIS Ast.FollowingSibling; "preceding-sibling", AXIS Ast.PrecedingSibling; "parent", AXIS Ast.Parent; "ancestor", AXIS (Ast.Ancestor false); "ancestor-or-self", AXIS (Ast.Ancestor true); "preceding", AXIS Ast.Preceding; "following", AXIS Ast.Following; "and", AND; "or" , OR; "div", DIV; "mod", MOD; ] with _ -> TAG s let rec token = lexer | [' ' '\t' '\n'] -> token lexbuf | "*" -> STAR | "/" -> SLASH | "//" -> SLASHSLASH | "::" -> COLONCOLON | "(" -> LP | ")" -> RP | "[" -> LB | "]" -> RB | "," -> COMMA | "|" -> PIPE | "+" -> ADD | "-" -> SUB | "<" -> LT | "<=" -> LTE | ">" -> GT | ">=" -> GTE | "=" -> EQ | "!=" -> NEQ | ".." -> DOTDOT | "." -> DOT | "node()" -> NODE | "text()" -> TEXT | "comment()" -> COMMENT | '@' ncname -> let l = L.utf8_lexeme lexbuf in ATTNAME (String.sub l 1 (String.length l - 1)) | "processing-instruction()" -> PI "" | "processing-instruction('"ncname"')" | "processing-instruction(\""ncname"\")"-> let s = L.utf8_lexeme lexbuf in PI (String.sub s 24 (String.length s - 26)) | ncname -> keyword_or_tag (L.utf8_lexeme lexbuf) | float -> let s = L.utf8_lexeme lexbuf in (try INT (int_of_string s) with _ -> FLOAT (float_of_string s)) | '"' | "'" -> let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in string (L.lexeme_start lexbuf) double_quote lexbuf; let s = get_stored_string () in STRING s | eof -> EOF | _ -> illegal lexbuf and string start double = lexer | '"' | "'" -> let d = L.latin1_lexeme_char lexbuf 0 = '"' in if d != double then (store_lexeme lexbuf; string start double lexbuf) | '\\' ['\\' '"' '\''] -> store_ascii (L.latin1_lexeme_char lexbuf 1); string start double lexbuf | "\\n" -> store_ascii '\n'; string start double lexbuf | "\\t" -> store_ascii '\t'; string start double lexbuf | "\\r" -> store_ascii '\r'; string start double lexbuf | '\\' ['0'-'9']+ ';' -> store_code (parse_char lexbuf 10 1); string start double lexbuf | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' -> store_code (parse_char lexbuf 16 2); string start double lexbuf | '\\' -> illegal lexbuf; | eof -> error start (start+1) "Unterminated string" | _ -> store_lexeme lexbuf; string start double lexbuf