X-Git-Url: http://git.nguyen.vg/gitweb/?p=tatoo.git;a=blobdiff_plain;f=src%2Fulexer.ml;h=a27d55db49fe418317845d2c017b9d805fd78d6e;hp=157e7d2521e6b13edc0a2f6fc8b8eb3322451204;hb=ddd758716b1cd691c8748d2e86c179e803b1d3af;hpb=c951f1d4b8f4264acb0b5910dc544ad3a6ceebab diff --git a/src/ulexer.ml b/src/ulexer.ml index 157e7d2..a27d55d 100644 --- a/src/ulexer.ml +++ b/src/ulexer.ml @@ -4,129 +4,7 @@ (* Copyright NICTA 2008 *) (* Distributed under the terms of the LGPL (see LICENCE) *) (******************************************************************************) -open Camlp4.PreCast - -module Loc = struct - type t = int * int - - let mk _ = (0,0) - let ghost = (-1,-1) - - let of_lexing_position _ = assert false - let to_ocaml_location _ = assert false - let of_ocaml_location _ = assert false - let of_lexbuf _ = assert false - let of_tuple _ = assert false - let to_tuple _ = assert false - - let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2) - let join (x1, _) = (x1, x1) - let move _ _ _ = assert false - let shift _ _ = assert false - let move_line _ _ = assert false - let file_name _ = assert false - let start_line _ = assert false - let stop_line _ = assert false - let start_bol _ = assert false - let stop_bol _ = assert false - let start_off = fst - let stop_off = snd - let start_pos _ = assert false - let stop_pos _ = assert false - let is_ghost _ = assert false - let ghostify _ = assert false - let set_file_name _ = assert false - let strictly_before _ = assert false - let make_absolute _ = assert false - let print _ = assert false - let dump _ = assert false - let to_string _ = assert false - exception Exc_located of t * exn - let raise loc exn = - match exn with - | Exc_located _ -> raise exn - | _ -> raise (Exc_located (loc, exn)) - let name = ref "_loc" -end - -type token = - | TAG of string - | STRING of string - | INT of int - | KWD of string - | ATT of string - | EOI - -module Token = struct - open Format - module Loc = Loc - type t = token - type token = t - - let sf = Printf.sprintf - - let to_string = - function - | TAG s -> sf "TAG <%s>" s - | STRING s -> sf "STRING \"%s\"" s - | KWD s -> sf "KWD %s" s - | INT i -> sf "INT %i" i - | ATT s -> sf "ATT %s" s - | EOI -> sf "EOI" - - let print ppf x = pp_print_string ppf (to_string x) - - let match_keyword kwd = - function - | KWD kwd' when kwd = kwd' -> true - | _ -> false - - let extract_string = - function - | KWD s | STRING s | TAG s | ATT s -> s - | INT i -> string_of_int i - | tok -> - invalid_arg ("Cannot extract a string from this token: "^ - to_string tok) - - module Error = struct - type t = string - exception E of string - let print = pp_print_string - let to_string x = x - end - - module Filter = struct - type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter - - type t = - { is_kwd : string -> bool; - mutable filter : token_filter } - - let mk is_kwd = - { is_kwd = is_kwd; - filter = (fun s -> s) } - - let filter x = - let f tok loc = - let tok' = tok in - (tok', loc) - in - let rec filter = - parser - | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >] - | [< >] -> [< >] - in - fun strm -> x.filter (filter strm) - - let define_filter x f = x.filter <- f x.filter - - let keyword_added _ _ _ = () - let keyword_removed _ _ = () - end - -end -module Error = Camlp4.Struct.EmptyError +open Parser module L = Ulexing @@ -136,10 +14,10 @@ let error i j s = raise (Error (i,j,s)) (***********************************************************) (* Buffer for string literals *) - + let string_buff = Buffer.create 1024 -let store_lexeme lexbuf = +let store_lexeme lexbuf = Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf) let store_ascii = Buffer.add_char string_buff let store_code = Utf8.store string_buff @@ -156,13 +34,13 @@ let get_stored_string () = let illegal lexbuf = error (L.lexeme_start lexbuf) - (L.lexeme_end lexbuf) + (L.lexeme_end lexbuf) "Illegal character" let return lexbuf tok = (tok, L.loc lexbuf) let return_loc i j tok = (tok, (i,j)) -let regexp ncname_char = +let regexp ncname_char = xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\." let hexa_digit = function @@ -172,36 +50,80 @@ let hexa_digit = function | _ -> -1 let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+) +let regexp digit = ['0'-'9'] +let regexp float = '-'? digit+ ('.' digit+ (['e''E'] digit+)?)? let parse_char lexbuf base i = - let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in + let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in let r = ref 0 in for i = 0 to String.length s - 1 do let c = hexa_digit s.[i] in - if (c >= base) || (c < 0) then + if (c >= base) || (c < 0) then error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit"; r := !r * base + c; done; !r +let keyword_or_tag s = + try + List.assoc s [ + "self", AXIS XPath.Ast.Self; + "descendant", AXIS XPath.Ast.Descendant; + "child", AXIS XPath.Ast.Child; + "descendant-or-self", AXIS XPath.Ast.DescendantOrSelf; + "attribute", AXIS XPath.Ast.Attribute; + "following-sibling", AXIS XPath.Ast.FollowingSibling; + "preceding-sibling", AXIS XPath.Ast.PrecedingSibling; + "parent", AXIS XPath.Ast.Parent; + "ancestor", AXIS XPath.Ast.Ancestor; + "ancestor-or-self", AXIS XPath.Ast.AncestorOrSelf; + "preceding", AXIS XPath.Ast.Preceding; + "following", AXIS XPath.Ast.Following; + "and", AND; + "or" , OR; + "div", DIV; + "mod", MOD; + ] + with + _ -> TAG s + + let rec token = lexer | [' ' '\t' '\n'] -> token lexbuf - | "and" | "not" | "or" | "text()" | "node()" - | "self" | "descendant" | "child" | "descendant-or-self" - | "attribute" | "following-sibling" | "preceding-sibling" - | "parent" | "ancestor" | "ancestor-or-self" | "preceding" | "following" - | "(" |")" | "," | "::" | "/" | "//" | "[" | "]" | "*" | "." | ".." | "@" - -> return lexbuf (KWD (L.utf8_lexeme lexbuf)) - | ncname -> return lexbuf (TAG(L.utf8_lexeme lexbuf)) - | '-'? ['0'-'9']+ -> let i = INT (int_of_string(L.utf8_lexeme lexbuf)) in return lexbuf i + | "*" -> STAR + | "/" -> SLASH + | "//" -> SLASHSLASH + | "::" -> COLONCOLON + | "(" -> LP + | ")" -> RP + | "[" -> LB + | "]" -> RB + | "," -> COMMA + | "|" -> PIPE + | "+" -> ADD + | "-" -> SUB + | "<" -> LT + | "<=" -> LTE + | ">" -> GT + | ">=" -> GTE + | "=" -> EQ + | "!=" -> NEQ + | "node()" -> NODE + | "text()" -> TEXT + | ncname -> keyword_or_tag (L.utf8_lexeme lexbuf) + | float -> + let s = L.utf8_lexeme lexbuf in + (try + INT (int_of_string s) + with + _ -> FLOAT (float_of_string s)) | '"' | "'" -> - let start = L.lexeme_start lexbuf in let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in string (L.lexeme_start lexbuf) double_quote lexbuf; let s = get_stored_string () in - return_loc start (L.lexeme_end lexbuf) (STRING s) + STRING s - | eof -> return lexbuf EOI + | eof -> EOF | _ -> illegal lexbuf and string start double = lexer @@ -211,11 +133,11 @@ and string start double = lexer | '\\' ['\\' '"' '\''] -> store_ascii (L.latin1_lexeme_char lexbuf 1); string start double lexbuf - | "\\n" -> + | "\\n" -> store_ascii '\n'; string start double lexbuf - | "\\t" -> + | "\\t" -> store_ascii '\t'; string start double lexbuf - | "\\r" -> + | "\\r" -> store_ascii '\r'; string start double lexbuf | '\\' ['0'-'9']+ ';' -> store_code (parse_char lexbuf 10 1); @@ -232,33 +154,3 @@ and string start double = lexer string start double lexbuf - -(***********************************************************) - -let enc = ref L.Latin1 -let lexbuf = ref None -let last_tok = ref (KWD "DUMMY") - -let raise_clean e = - clear_buff (); - raise e - -let mk () _loc cs = - let lb = L.from_var_enc_stream enc cs in - lexbuf := Some lb; - let next _ = - let tok, loc = - try token lb - with - | Ulexing.Error -> - raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb, - "Unexpected character")) - | Ulexing.InvalidCodepoint i -> - raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb, - "Code point invalid for the current encoding")) - | e -> raise_clean e - in - last_tok := tok; - Some (tok, loc) - in - Stream.from next