(******************************************************************************) (* SXSI : XPath evaluator *) (* Kim Nguyen (Kim.Nguyen@nicta.com.au) *) (* Copyright NICTA 2008 *) (* Distributed under the terms of the LGPL (see LICENCE) *) (******************************************************************************) open Camlp4.PreCast module Loc = struct type t = int * int let mk _ = (0,0) let ghost = (-1,-1) let of_lexing_position _ = assert false let to_ocaml_location _ = assert false let of_ocaml_location _ = assert false let of_lexbuf _ = assert false let of_tuple _ = assert false let to_tuple _ = assert false let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2) let join (x1, _) = (x1, x1) let move _ _ _ = assert false let shift _ _ = assert false let move_line _ _ = assert false let file_name _ = assert false let start_line _ = assert false let stop_line _ = assert false let start_bol _ = assert false let stop_bol _ = assert false let start_off = fst let stop_off = snd let start_pos _ = assert false let stop_pos _ = assert false let is_ghost _ = assert false let ghostify _ = assert false let set_file_name _ = assert false let strictly_before _ = assert false let make_absolute _ = assert false let print _ = assert false let dump _ = assert false let to_string _ = assert false exception Exc_located of t * exn let raise loc exn = match exn with | Exc_located _ -> raise exn | _ -> raise (Exc_located (loc, exn)) let name = ref "_loc" end type token = | TAG of string | STRING of string | INT of int | KWD of string | ATT of string | EOI module Token = struct open Format module Loc = Loc type t = token type token = t let sf = Printf.sprintf let to_string = function | TAG s -> sf "TAG <%s>" s | STRING s -> sf "STRING \"%s\"" s | KWD s -> sf "KWD %s" s | INT i -> sf "INT %i" i | ATT s -> sf "ATT %s" s | EOI -> sf "EOI" let print ppf x = pp_print_string ppf (to_string x) let match_keyword kwd = function | KWD kwd' when kwd = kwd' -> true | _ -> false let extract_string = function | KWD s | STRING s | TAG s | ATT s -> s | INT i -> string_of_int i | tok -> invalid_arg ("Cannot extract a string from this token: "^ to_string tok) module Error = struct type t = string exception E of string let print = pp_print_string let to_string x = x end module Filter = struct type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter type t = { is_kwd : string -> bool; mutable filter : token_filter } let mk is_kwd = { is_kwd = is_kwd; filter = (fun s -> s) } let filter x = let f tok loc = let tok' = tok in (tok', loc) in let rec filter = parser | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >] | [< >] -> [< >] in fun strm -> x.filter (filter strm) let define_filter x f = x.filter <- f x.filter let keyword_added _ _ _ = () let keyword_removed _ _ = () end end module Error = Camlp4.Struct.EmptyError module L = Ulexing exception Error of int * int * string let error i j s = raise (Error (i,j,s)) (***********************************************************) (* Buffer for string literals *) let string_buff = Buffer.create 1024 let store_lexeme lexbuf = Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf) let store_ascii = Buffer.add_char string_buff let store_code = Utf8.store string_buff let clear_buff () = Buffer.clear string_buff let get_stored_string () = let s = Buffer.contents string_buff in clear_buff (); Buffer.clear string_buff; s (***********************************************************) (* Lexer *) let illegal lexbuf = error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "Illegal character" let return lexbuf tok = (tok, L.loc lexbuf) let return_loc i j tok = (tok, (i,j)) let regexp ncname_char = xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\." let hexa_digit = function | '0'..'9' as c -> (Char.code c) - (Char.code '0') | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10 | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10 | _ -> -1 let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+) let parse_char lexbuf base i = let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in let r = ref 0 in for i = 0 to String.length s - 1 do let c = hexa_digit s.[i] in if (c >= base) || (c < 0) then error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit"; r := !r * base + c; done; !r let rec token = lexer | [' ' '\t' '\n'] -> token lexbuf | "and" | "not" | "or" | "text()" | "node()" | "self" | "descendant" | "child" | "descendant-or-self" | "attribute" | "following-sibling" | "preceding-sibling" | "parent" | "ancestor" | "ancestor-or-self" | "preceding" | "following" | "(" |")" | "," | "::" | "/" | "//" | "[" | "]" | "*" | "." | ".." | "@" -> return lexbuf (KWD (L.utf8_lexeme lexbuf)) | ncname -> return lexbuf (TAG(L.utf8_lexeme lexbuf)) | '-'? ['0'-'9']+ -> let i = INT (int_of_string(L.utf8_lexeme lexbuf)) in return lexbuf i | '"' | "'" -> let start = L.lexeme_start lexbuf in let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in string (L.lexeme_start lexbuf) double_quote lexbuf; let s = get_stored_string () in return_loc start (L.lexeme_end lexbuf) (STRING s) | eof -> return lexbuf EOI | _ -> illegal lexbuf and string start double = lexer | '"' | "'" -> let d = L.latin1_lexeme_char lexbuf 0 = '"' in if d != double then (store_lexeme lexbuf; string start double lexbuf) | '\\' ['\\' '"' '\''] -> store_ascii (L.latin1_lexeme_char lexbuf 1); string start double lexbuf | "\\n" -> store_ascii '\n'; string start double lexbuf | "\\t" -> store_ascii '\t'; string start double lexbuf | "\\r" -> store_ascii '\r'; string start double lexbuf | '\\' ['0'-'9']+ ';' -> store_code (parse_char lexbuf 10 1); string start double lexbuf | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' -> store_code (parse_char lexbuf 16 2); string start double lexbuf | '\\' -> illegal lexbuf; | eof -> error start (start+1) "Unterminated string" | _ -> store_lexeme lexbuf; string start double lexbuf (***********************************************************) let enc = ref L.Latin1 let lexbuf = ref None let last_tok = ref (KWD "DUMMY") let raise_clean e = clear_buff (); raise e let mk () _loc cs = let lb = L.from_var_enc_stream enc cs in lexbuf := Some lb; let next _ = let tok, loc = try token lb with | Ulexing.Error -> raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb, "Unexpected character")) | Ulexing.InvalidCodepoint i -> raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb, "Code point invalid for the current encoding")) | e -> raise_clean e in last_tok := tok; Some (tok, loc) in Stream.from next