(******************************************************************************)
(*  SXSI : XPath evaluator                                                    *)
(*  Kim Nguyen (Kim.Nguyen@nicta.com.au)                                      *)
(*  Copyright NICTA 2008                                                      *)
(*  Distributed under the terms of the LGPL (see LICENCE)                     *)
(******************************************************************************)
open Camlp4.PreCast

module Loc = struct
  type t = int * int

  let mk _ = (0,0)
  let ghost = (-1,-1)

  let of_lexing_position _ = assert false
  let to_ocaml_location _ = assert false
  let of_ocaml_location _ = assert false
  let of_lexbuf _ = assert false
  let of_tuple _ = assert false
  let to_tuple _ = assert false

  let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
  let join (x1, _) = (x1, x1)
  let move _ _ _ = assert false
  let shift _ _ = assert false
  let move_line _ _ = assert false
  let file_name  _ = assert false
  let start_line _ = assert false
  let stop_line  _ = assert false
  let start_bol  _ = assert false
  let stop_bol   _ = assert false
  let start_off  = fst
  let stop_off   = snd
  let start_pos  _ = assert false
  let stop_pos   _ = assert false
  let is_ghost   _ = assert false
  let ghostify   _ = assert false
  let set_file_name _ = assert false
  let strictly_before _ = assert false
  let make_absolute _ = assert false
  let print _ = assert false
  let dump  _ = assert false
  let to_string _ = assert false
  exception Exc_located of t * exn
  let raise loc exn =
    match exn with 
    | Exc_located _ -> raise exn
    | _ -> raise (Exc_located (loc, exn))
  let name = ref "_loc"
end

type token =
  | TAG of string
  | STRING of string
  | INT of int
  | KWD of string
  | ATT of string
  | EOI

module Token = struct
  open Format
  module Loc = Loc
  type t = token
  type token = t

  let sf = Printf.sprintf

  let to_string =
    function
      | TAG s -> sf "TAG <%s>" s
      | STRING s -> sf "STRING \"%s\"" s
      | KWD s -> sf "KWD %s" s
      | INT i  -> sf "INT %i" i
      | ATT s -> sf "ATT %s" s
      | EOI    -> sf "EOI"

  let print ppf x = pp_print_string ppf (to_string x)

  let match_keyword kwd =
    function
    | KWD kwd' when kwd = kwd' -> true
    | _ -> false

  let extract_string =
    function
      | KWD s | STRING s | TAG s | ATT s  -> s
      | INT i -> string_of_int i
      | tok ->
          invalid_arg ("Cannot extract a string from this token: "^
			 to_string tok)

  module Error = struct
    type t = string
    exception E of string
    let print = pp_print_string
    let to_string x = x
  end

  module Filter = struct
    type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter

    type t =
      { is_kwd : string -> bool;
        mutable filter : token_filter }

    let mk is_kwd =
      { is_kwd = is_kwd;
        filter = (fun s -> s) }

    let filter x =
      let f tok loc =
        let tok' = tok in
        (tok', loc)
      in
      let rec filter =
        parser
        | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >]
        | [< >] -> [< >]
      in
      fun strm -> x.filter (filter strm)

    let define_filter x f = x.filter <- f x.filter

    let keyword_added _ _ _ = ()
    let keyword_removed _ _ = ()
  end

end
module Error = Camlp4.Struct.EmptyError

module L = Ulexing

exception Error of int * int * string

let error i j s = raise (Error (i,j,s))

(***********************************************************)
(* Buffer for string literals *)
  
let string_buff = Buffer.create 1024

let store_lexeme lexbuf = 
  Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
let store_ascii = Buffer.add_char string_buff
let store_code  = Utf8.store string_buff
let clear_buff () = Buffer.clear string_buff
let get_stored_string () =
  let s = Buffer.contents string_buff in
  clear_buff ();
  Buffer.clear string_buff;
  s

(***********************************************************)
(* Lexer *)

let illegal lexbuf =
  error
    (L.lexeme_start lexbuf)
    (L.lexeme_end lexbuf) 
    "Illegal character"

let return lexbuf tok = (tok, L.loc lexbuf)
let return_loc i j tok = (tok, (i,j))

let regexp ncname_char = 
  xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."

let hexa_digit = function
  | '0'..'9' as c -> (Char.code c) - (Char.code '0')
  | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
  | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
  | _ -> -1

let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)

let parse_char lexbuf base i =
  let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in 
  let r = ref 0 in
  for i = 0 to String.length s - 1 do
    let c = hexa_digit s.[i] in
    if (c >= base) || (c < 0) then 
      error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
    r := !r * base + c;
  done;
  !r

let rec token = lexer
 | [' ' '\t' '\n'] -> token lexbuf
 | "and" | "not" | "or"  | "text()" | "node()" 
 | "self" | "descendant" | "child" | "descendant-or-self" 
 | "attribute" | "following-sibling" | "preceding-sibling"
 | "parent" | "ancestor" | "ancestor-or-self" | "preceding" | "following"
 | "(" |")" | "," | "::" | "/" | "//" | "[" | "]" | "*" | "."  | ".." | "@"
     -> return lexbuf (KWD (L.utf8_lexeme lexbuf))
 | ncname -> return lexbuf (TAG(L.utf8_lexeme lexbuf))
 | '-'? ['0'-'9']+ -> let i =  INT (int_of_string(L.utf8_lexeme lexbuf)) in return lexbuf i
 | '"' | "'" ->
     let start = L.lexeme_start lexbuf in
     let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in
     string (L.lexeme_start lexbuf) double_quote lexbuf;
     let s = get_stored_string () in
       return_loc start (L.lexeme_end lexbuf) (STRING s)

 | eof -> return lexbuf EOI
 | _ -> illegal lexbuf

and string start double = lexer
  | '"' | "'" ->
      let d = L.latin1_lexeme_char lexbuf 0 = '"' in
      if d != double then (store_lexeme lexbuf; string start double lexbuf)
  | '\\' ['\\' '"' '\''] ->
      store_ascii (L.latin1_lexeme_char lexbuf 1);
      string start double lexbuf
  | "\\n" -> 
      store_ascii '\n'; string start double lexbuf
  | "\\t" -> 
      store_ascii '\t'; string start double lexbuf
  | "\\r" -> 
      store_ascii '\r'; string start double lexbuf
  | '\\' ['0'-'9']+ ';' ->
      store_code (parse_char lexbuf 10 1);
      string start double lexbuf
  | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
      store_code (parse_char lexbuf 16 2);
      string start double lexbuf
  | '\\' ->
      illegal lexbuf;
  | eof ->
      error start (start+1) "Unterminated string"
  | _ ->
      store_lexeme lexbuf;
      string start double lexbuf


(***********************************************************)

let enc = ref L.Latin1
let lexbuf = ref None
let last_tok = ref (KWD "DUMMY")

let raise_clean e =
  clear_buff ();
  raise e

let mk () _loc cs =
  let lb = L.from_var_enc_stream enc cs in
  lexbuf := Some lb;
  let next _ =
    let tok, loc = 
      try token lb
      with
        | Ulexing.Error -> 
            raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
                          "Unexpected character"))
        | Ulexing.InvalidCodepoint i ->
            raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
                          "Code point invalid for the current encoding"))
        | e -> raise_clean e
    in
    last_tok := tok;
    Some (tok, loc)
  in
  Stream.from next