1 (******************************************************************************)
2 (* SXSI : XPath evaluator *)
3 (* Kim Nguyen (Kim.Nguyen@nicta.com.au) *)
4 (* Copyright NICTA 2008 *)
5 (* Distributed under the terms of the LGPL (see LICENCE) *)
6 (******************************************************************************)
15 let of_lexing_position _ = assert false
16 let to_ocaml_location _ = assert false
17 let of_ocaml_location _ = assert false
18 let of_lexbuf _ = assert false
19 let of_tuple _ = assert false
20 let to_tuple _ = assert false
22 let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
23 let join (x1, _) = (x1, x1)
24 let move _ _ _ = assert false
25 let shift _ _ = assert false
26 let move_line _ _ = assert false
27 let file_name _ = assert false
28 let start_line _ = assert false
29 let stop_line _ = assert false
30 let start_bol _ = assert false
31 let stop_bol _ = assert false
34 let start_pos _ = assert false
35 let stop_pos _ = assert false
36 let is_ghost _ = assert false
37 let ghostify _ = assert false
38 let set_file_name _ = assert false
39 let strictly_before _ = assert false
40 let make_absolute _ = assert false
41 let print _ = assert false
42 let dump _ = assert false
43 let to_string _ = assert false
44 exception Exc_located of t * exn
47 | Exc_located _ -> raise exn
48 | _ -> raise (Exc_located (loc, exn))
66 let sf = Printf.sprintf
70 | TAG s -> sf "TAG <%s>" s
71 | STRING s -> sf "STRING \"%s\"" s
72 | KWD s -> sf "KWD %s" s
73 | INT i -> sf "INT %i" i
74 | ATT s -> sf "ATT %s" s
77 let print ppf x = pp_print_string ppf (to_string x)
79 let match_keyword kwd =
81 | KWD kwd' when kwd = kwd' -> true
86 | KWD s | STRING s | TAG s | ATT s -> s
87 | INT i -> string_of_int i
89 invalid_arg ("Cannot extract a string from this token: "^
95 let print = pp_print_string
99 module Filter = struct
100 type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter
103 { is_kwd : string -> bool;
104 mutable filter : token_filter }
108 filter = (fun s -> s) }
117 | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >]
120 fun strm -> x.filter (filter strm)
122 let define_filter x f = x.filter <- f x.filter
124 let keyword_added _ _ _ = ()
125 let keyword_removed _ _ = ()
129 module Error = Camlp4.Struct.EmptyError
133 exception Error of int * int * string
135 let error i j s = raise (Error (i,j,s))
137 (***********************************************************)
138 (* Buffer for string literals *)
140 let string_buff = Buffer.create 1024
142 let store_lexeme lexbuf =
143 Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
144 let store_ascii = Buffer.add_char string_buff
145 let store_code = Utf8.store string_buff
146 let clear_buff () = Buffer.clear string_buff
147 let get_stored_string () =
148 let s = Buffer.contents string_buff in
150 Buffer.clear string_buff;
153 (***********************************************************)
158 (L.lexeme_start lexbuf)
159 (L.lexeme_end lexbuf)
162 let return lexbuf tok = (tok, L.loc lexbuf)
163 let return_loc i j tok = (tok, (i,j))
165 let regexp ncname_char =
166 xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
168 let hexa_digit = function
169 | '0'..'9' as c -> (Char.code c) - (Char.code '0')
170 | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
171 | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
175 let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
177 let parse_char lexbuf base i =
178 let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
180 for i = 0 to String.length s - 1 do
181 let c = hexa_digit s.[i] in
182 if (c >= base) || (c < 0) then
183 error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
188 let rec token = lexer
189 | [' ' '\t'] -> token lexbuf
190 | "text()" | "node()" | "and" | "not" | "or"
191 | "contains" | "contains_full"
192 | "self" | "descendant" | "child" | "descendant-or-self"
193 | "attribute" | "following-sibling" | "preceding-sibling"
194 | "parent" | "ancestor" | "ancestor-or-self" | "preceding" | "following"
195 | "(" |")" | "," | "::" | "/" | "//" | "[" | "]" | "*" | "." | ".."
196 -> return lexbuf (KWD (L.utf8_lexeme lexbuf))
197 | ncname -> return lexbuf (TAG(L.utf8_lexeme lexbuf))
198 | '@' (ncname|'*') ->
199 let s = L.utf8_sub_lexeme lexbuf 1
200 (L.lexeme_length lexbuf - 1)
201 in return lexbuf (ATT(s))
202 | '-'? ['0'-'9']+ -> let i = INT (int_of_string(L.utf8_lexeme lexbuf)) in return lexbuf i
204 let start = L.lexeme_start lexbuf in
205 let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in
206 string (L.lexeme_start lexbuf) double_quote lexbuf;
207 let s = get_stored_string () in
208 return_loc start (L.lexeme_end lexbuf) (STRING s)
210 | eof -> return lexbuf EOI
211 | _ -> illegal lexbuf
213 and string start double = lexer
215 let d = L.latin1_lexeme_char lexbuf 0 = '"' in
216 if d != double then (store_lexeme lexbuf; string start double lexbuf)
217 | '\\' ['\\' '"' '\''] ->
218 store_ascii (L.latin1_lexeme_char lexbuf 1);
219 string start double lexbuf
221 store_ascii '\n'; string start double lexbuf
223 store_ascii '\t'; string start double lexbuf
225 store_ascii '\r'; string start double lexbuf
226 | '\\' ['0'-'9']+ ';' ->
227 store_code (parse_char lexbuf 10 1);
228 string start double lexbuf
229 | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
230 store_code (parse_char lexbuf 16 2);
231 string start double lexbuf
235 error start (start+1) "Unterminated string"
238 string start double lexbuf
242 (***********************************************************)
244 let enc = ref L.Latin1
245 let lexbuf = ref None
246 let last_tok = ref (KWD "DUMMY")
253 let lb = L.from_var_enc_stream enc cs in
260 raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
261 "Unexpected character"))
262 | Ulexing.InvalidCodepoint i ->
263 raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
264 "Code point invalid for the current encoding"))