X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=src%2Fulexer.ml;fp=src%2Fulexer.ml;h=157e7d2521e6b13edc0a2f6fc8b8eb3322451204;hb=cba2938d929fd5119b1491686ddc224d5af618c6;hp=0000000000000000000000000000000000000000;hpb=0cf8def92c8c6e708ec333b13dbe46decf554d81;p=tatoo.git diff --git a/src/ulexer.ml b/src/ulexer.ml new file mode 100644 index 0000000..157e7d2 --- /dev/null +++ b/src/ulexer.ml @@ -0,0 +1,264 @@ +(******************************************************************************) +(* SXSI : XPath evaluator *) +(* Kim Nguyen (Kim.Nguyen@nicta.com.au) *) +(* Copyright NICTA 2008 *) +(* Distributed under the terms of the LGPL (see LICENCE) *) +(******************************************************************************) +open Camlp4.PreCast + +module Loc = struct + type t = int * int + + let mk _ = (0,0) + let ghost = (-1,-1) + + let of_lexing_position _ = assert false + let to_ocaml_location _ = assert false + let of_ocaml_location _ = assert false + let of_lexbuf _ = assert false + let of_tuple _ = assert false + let to_tuple _ = assert false + + let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2) + let join (x1, _) = (x1, x1) + let move _ _ _ = assert false + let shift _ _ = assert false + let move_line _ _ = assert false + let file_name _ = assert false + let start_line _ = assert false + let stop_line _ = assert false + let start_bol _ = assert false + let stop_bol _ = assert false + let start_off = fst + let stop_off = snd + let start_pos _ = assert false + let stop_pos _ = assert false + let is_ghost _ = assert false + let ghostify _ = assert false + let set_file_name _ = assert false + let strictly_before _ = assert false + let make_absolute _ = assert false + let print _ = assert false + let dump _ = assert false + let to_string _ = assert false + exception Exc_located of t * exn + let raise loc exn = + match exn with + | Exc_located _ -> raise exn + | _ -> raise (Exc_located (loc, exn)) + let name = ref "_loc" +end + +type token = + | TAG of string + | STRING of string + | INT of int + | KWD of string + | ATT of string + | EOI + +module Token = struct + open Format + module Loc = Loc + type t = token + type token = t + + let sf = Printf.sprintf + + let to_string = + function + | TAG s -> sf "TAG <%s>" s + | STRING s -> sf "STRING \"%s\"" s + | KWD s -> sf "KWD %s" s + | INT i -> sf "INT %i" i + | ATT s -> sf "ATT %s" s + | EOI -> sf "EOI" + + let print ppf x = pp_print_string ppf (to_string x) + + let match_keyword kwd = + function + | KWD kwd' when kwd = kwd' -> true + | _ -> false + + let extract_string = + function + | KWD s | STRING s | TAG s | ATT s -> s + | INT i -> string_of_int i + | tok -> + invalid_arg ("Cannot extract a string from this token: "^ + to_string tok) + + module Error = struct + type t = string + exception E of string + let print = pp_print_string + let to_string x = x + end + + module Filter = struct + type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter + + type t = + { is_kwd : string -> bool; + mutable filter : token_filter } + + let mk is_kwd = + { is_kwd = is_kwd; + filter = (fun s -> s) } + + let filter x = + let f tok loc = + let tok' = tok in + (tok', loc) + in + let rec filter = + parser + | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >] + | [< >] -> [< >] + in + fun strm -> x.filter (filter strm) + + let define_filter x f = x.filter <- f x.filter + + let keyword_added _ _ _ = () + let keyword_removed _ _ = () + end + +end +module Error = Camlp4.Struct.EmptyError + +module L = Ulexing + +exception Error of int * int * string + +let error i j s = raise (Error (i,j,s)) + +(***********************************************************) +(* Buffer for string literals *) + +let string_buff = Buffer.create 1024 + +let store_lexeme lexbuf = + Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf) +let store_ascii = Buffer.add_char string_buff +let store_code = Utf8.store string_buff +let clear_buff () = Buffer.clear string_buff +let get_stored_string () = + let s = Buffer.contents string_buff in + clear_buff (); + Buffer.clear string_buff; + s + +(***********************************************************) +(* Lexer *) + +let illegal lexbuf = + error + (L.lexeme_start lexbuf) + (L.lexeme_end lexbuf) + "Illegal character" + +let return lexbuf tok = (tok, L.loc lexbuf) +let return_loc i j tok = (tok, (i,j)) + +let regexp ncname_char = + xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\." + +let hexa_digit = function + | '0'..'9' as c -> (Char.code c) - (Char.code '0') + | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10 + | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10 + | _ -> -1 + +let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+) + +let parse_char lexbuf base i = + let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in + let r = ref 0 in + for i = 0 to String.length s - 1 do + let c = hexa_digit s.[i] in + if (c >= base) || (c < 0) then + error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit"; + r := !r * base + c; + done; + !r + +let rec token = lexer + | [' ' '\t' '\n'] -> token lexbuf + | "and" | "not" | "or" | "text()" | "node()" + | "self" | "descendant" | "child" | "descendant-or-self" + | "attribute" | "following-sibling" | "preceding-sibling" + | "parent" | "ancestor" | "ancestor-or-self" | "preceding" | "following" + | "(" |")" | "," | "::" | "/" | "//" | "[" | "]" | "*" | "." | ".." | "@" + -> return lexbuf (KWD (L.utf8_lexeme lexbuf)) + | ncname -> return lexbuf (TAG(L.utf8_lexeme lexbuf)) + | '-'? ['0'-'9']+ -> let i = INT (int_of_string(L.utf8_lexeme lexbuf)) in return lexbuf i + | '"' | "'" -> + let start = L.lexeme_start lexbuf in + let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in + string (L.lexeme_start lexbuf) double_quote lexbuf; + let s = get_stored_string () in + return_loc start (L.lexeme_end lexbuf) (STRING s) + + | eof -> return lexbuf EOI + | _ -> illegal lexbuf + +and string start double = lexer + | '"' | "'" -> + let d = L.latin1_lexeme_char lexbuf 0 = '"' in + if d != double then (store_lexeme lexbuf; string start double lexbuf) + | '\\' ['\\' '"' '\''] -> + store_ascii (L.latin1_lexeme_char lexbuf 1); + string start double lexbuf + | "\\n" -> + store_ascii '\n'; string start double lexbuf + | "\\t" -> + store_ascii '\t'; string start double lexbuf + | "\\r" -> + store_ascii '\r'; string start double lexbuf + | '\\' ['0'-'9']+ ';' -> + store_code (parse_char lexbuf 10 1); + string start double lexbuf + | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' -> + store_code (parse_char lexbuf 16 2); + string start double lexbuf + | '\\' -> + illegal lexbuf; + | eof -> + error start (start+1) "Unterminated string" + | _ -> + store_lexeme lexbuf; + string start double lexbuf + + + +(***********************************************************) + +let enc = ref L.Latin1 +let lexbuf = ref None +let last_tok = ref (KWD "DUMMY") + +let raise_clean e = + clear_buff (); + raise e + +let mk () _loc cs = + let lb = L.from_var_enc_stream enc cs in + lexbuf := Some lb; + let next _ = + let tok, loc = + try token lb + with + | Ulexing.Error -> + raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb, + "Unexpected character")) + | Ulexing.InvalidCodepoint i -> + raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb, + "Code point invalid for the current encoding")) + | e -> raise_clean e + in + last_tok := tok; + Some (tok, loc) + in + Stream.from next