--- /dev/null
+(******************************************************************************)
+(* SXSI : XPath evaluator *)
+(* Kim Nguyen (Kim.Nguyen@nicta.com.au) *)
+(* Copyright NICTA 2008 *)
+(* Distributed under the terms of the LGPL (see LICENCE) *)
+(******************************************************************************)
+open Camlp4.PreCast
+
+module Loc = struct
+ type t = int * int
+
+ let mk _ = (0,0)
+ let ghost = (-1,-1)
+
+ let of_lexing_position _ = assert false
+ let to_ocaml_location _ = assert false
+ let of_ocaml_location _ = assert false
+ let of_lexbuf _ = assert false
+ let of_tuple _ = assert false
+ let to_tuple _ = assert false
+
+ let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
+ let join (x1, _) = (x1, x1)
+ let move _ _ _ = assert false
+ let shift _ _ = assert false
+ let move_line _ _ = assert false
+ let file_name _ = assert false
+ let start_line _ = assert false
+ let stop_line _ = assert false
+ let start_bol _ = assert false
+ let stop_bol _ = assert false
+ let start_off = fst
+ let stop_off = snd
+ let start_pos _ = assert false
+ let stop_pos _ = assert false
+ let is_ghost _ = assert false
+ let ghostify _ = assert false
+ let set_file_name _ = assert false
+ let strictly_before _ = assert false
+ let make_absolute _ = assert false
+ let print _ = assert false
+ let dump _ = assert false
+ let to_string _ = assert false
+ exception Exc_located of t * exn
+ let raise loc exn =
+ match exn with
+ | Exc_located _ -> raise exn
+ | _ -> raise (Exc_located (loc, exn))
+ let name = ref "_loc"
+end
+
+type token =
+ | TAG of string
+ | STRING of string
+ | INT of int
+ | KWD of string
+ | ATT of string
+ | EOI
+
+module Token = struct
+ open Format
+ module Loc = Loc
+ type t = token
+ type token = t
+
+ let sf = Printf.sprintf
+
+ let to_string =
+ function
+ | TAG s -> sf "TAG <%s>" s
+ | STRING s -> sf "STRING \"%s\"" s
+ | KWD s -> sf "KWD %s" s
+ | INT i -> sf "INT %i" i
+ | ATT s -> sf "ATT %s" s
+ | EOI -> sf "EOI"
+
+ let print ppf x = pp_print_string ppf (to_string x)
+
+ let match_keyword kwd =
+ function
+ | KWD kwd' when kwd = kwd' -> true
+ | _ -> false
+
+ let extract_string =
+ function
+ | KWD s | STRING s | TAG s | ATT s -> s
+ | INT i -> string_of_int i
+ | tok ->
+ invalid_arg ("Cannot extract a string from this token: "^
+ to_string tok)
+
+ module Error = struct
+ type t = string
+ exception E of string
+ let print = pp_print_string
+ let to_string x = x
+ end
+
+ module Filter = struct
+ type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter
+
+ type t =
+ { is_kwd : string -> bool;
+ mutable filter : token_filter }
+
+ let mk is_kwd =
+ { is_kwd = is_kwd;
+ filter = (fun s -> s) }
+
+ let filter x =
+ let f tok loc =
+ let tok' = tok in
+ (tok', loc)
+ in
+ let rec filter =
+ parser
+ | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >]
+ | [< >] -> [< >]
+ in
+ fun strm -> x.filter (filter strm)
+
+ let define_filter x f = x.filter <- f x.filter
+
+ let keyword_added _ _ _ = ()
+ let keyword_removed _ _ = ()
+ end
+
+end
+module Error = Camlp4.Struct.EmptyError
+
+module L = Ulexing
+
+exception Error of int * int * string
+
+let error i j s = raise (Error (i,j,s))
+
+(***********************************************************)
+(* Buffer for string literals *)
+
+let string_buff = Buffer.create 1024
+
+let store_lexeme lexbuf =
+ Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
+let store_ascii = Buffer.add_char string_buff
+let store_code = Utf8.store string_buff
+let clear_buff () = Buffer.clear string_buff
+let get_stored_string () =
+ let s = Buffer.contents string_buff in
+ clear_buff ();
+ Buffer.clear string_buff;
+ s
+
+(***********************************************************)
+(* Lexer *)
+
+let illegal lexbuf =
+ error
+ (L.lexeme_start lexbuf)
+ (L.lexeme_end lexbuf)
+ "Illegal character"
+
+let return lexbuf tok = (tok, L.loc lexbuf)
+let return_loc i j tok = (tok, (i,j))
+
+let regexp ncname_char =
+ xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
+
+let hexa_digit = function
+ | '0'..'9' as c -> (Char.code c) - (Char.code '0')
+ | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
+ | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
+ | _ -> -1
+
+let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
+
+let parse_char lexbuf base i =
+ let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
+ let r = ref 0 in
+ for i = 0 to String.length s - 1 do
+ let c = hexa_digit s.[i] in
+ if (c >= base) || (c < 0) then
+ error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
+ r := !r * base + c;
+ done;
+ !r
+
+let rec token = lexer
+ | [' ' '\t'] -> token lexbuf
+ | "and" | "not" | "or" | "text" | "node"
+ | "self" | "descendant" | "child" | "descendant-or-self"
+ | "attribute" | "following-sibling" | "preceding-sibling"
+ | "parent" | "ancestor" | "ancestor-or-self" | "preceding" | "following"
+ | "(" |")" | "," | "::" | "/" | "//" | "[" | "]" | "*" | "." | ".." | "@"
+ -> return lexbuf (KWD (L.utf8_lexeme lexbuf))
+ | ncname -> return lexbuf (TAG(L.utf8_lexeme lexbuf))
+ | '-'? ['0'-'9']+ -> let i = INT (int_of_string(L.utf8_lexeme lexbuf)) in return lexbuf i
+ | '"' | "'" ->
+ let start = L.lexeme_start lexbuf in
+ let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in
+ string (L.lexeme_start lexbuf) double_quote lexbuf;
+ let s = get_stored_string () in
+ return_loc start (L.lexeme_end lexbuf) (STRING s)
+
+ | eof -> return lexbuf EOI
+ | _ -> illegal lexbuf
+
+and string start double = lexer
+ | '"' | "'" ->
+ let d = L.latin1_lexeme_char lexbuf 0 = '"' in
+ if d != double then (store_lexeme lexbuf; string start double lexbuf)
+ | '\\' ['\\' '"' '\''] ->
+ store_ascii (L.latin1_lexeme_char lexbuf 1);
+ string start double lexbuf
+ | "\\n" ->
+ store_ascii '\n'; string start double lexbuf
+ | "\\t" ->
+ store_ascii '\t'; string start double lexbuf
+ | "\\r" ->
+ store_ascii '\r'; string start double lexbuf
+ | '\\' ['0'-'9']+ ';' ->
+ store_code (parse_char lexbuf 10 1);
+ string start double lexbuf
+ | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
+ store_code (parse_char lexbuf 16 2);
+ string start double lexbuf
+ | '\\' ->
+ illegal lexbuf;
+ | eof ->
+ error start (start+1) "Unterminated string"
+ | _ ->
+ store_lexeme lexbuf;
+ string start double lexbuf
+
+
+
+(***********************************************************)
+
+let enc = ref L.Latin1
+let lexbuf = ref None
+let last_tok = ref (KWD "DUMMY")
+
+let raise_clean e =
+ clear_buff ();
+ raise e
+
+let mk () _loc cs =
+ let lb = L.from_var_enc_stream enc cs in
+ lexbuf := Some lb;
+ let next _ =
+ let tok, loc =
+ try token lb
+ with
+ | Ulexing.Error ->
+ raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
+ "Unexpected character"))
+ | Ulexing.InvalidCodepoint i ->
+ raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
+ "Code point invalid for the current encoding"))
+ | e -> raise_clean e
+ in
+ last_tok := tok;
+ Some (tok, loc)
+ in
+ Stream.from next