From: Kim Nguyễn Date: Tue, 5 Feb 2013 14:03:32 +0000 (+0100) Subject: Refactor xpath parser and ast in a submodule. X-Git-Tag: v0.1~187 X-Git-Url: http://git.nguyen.vg/gitweb/?p=tatoo.git;a=commitdiff_plain;h=9c0b145d050a5981010435f54848dc862782709c Refactor xpath parser and ast in a submodule. --- diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..83aeb8f --- /dev/null +++ b/Makefile @@ -0,0 +1,10 @@ +SOURCE_DIR=src,src/XPath +TARGET=main.otarget + + +all: + ocamlbuild -Is $(SOURCE_DIR) $(TARGET) + + +clean: + ocamlbuild -clean diff --git a/_tags b/_tags new file mode 100644 index 0000000..cf9b611 --- /dev/null +++ b/_tags @@ -0,0 +1 @@ +: for-pack(XPath) diff --git a/main.itarget b/main.itarget new file mode 100644 index 0000000..8171f3b --- /dev/null +++ b/main.itarget @@ -0,0 +1 @@ +src/test.native diff --git a/src/XPath/ast.ml b/src/XPath/ast.ml new file mode 100644 index 0000000..a097024 --- /dev/null +++ b/src/XPath/ast.ml @@ -0,0 +1,164 @@ +(***********************************************************************) +(* *) +(* TAToo *) +(* *) +(* Kim Nguyen, LRI UMR8623 *) +(* Université Paris-Sud & CNRS *) +(* *) +(* Copyright 2010-2012 Université Paris-Sud and Centre National de la *) +(* Recherche Scientifique. All rights reserved. This file is *) +(* distributed under the terms of the GNU Lesser General Public *) +(* License, with the special exception on linking described in file *) +(* ../LICENSE. *) +(* *) +(***********************************************************************) + +(* + Time-stamp: +*) + + +type path = single_path list +and single_path = Absolute of step list | Relative of step list +and step = axis * test * expr list +and axis = Self | Attribute | Child | Descendant | DescendantOrSelf | FollowingSibling + | Parent | Ancestor | AncestorOrSelf | PrecedingSibling | Preceding | Following + +and test = QNameSet.t + +and binop = Eq | Neq | Lt | Gt | Lte | Gte | Or | And | Add | Sub | Mult | Div | Mod +and unop = Neg +and expr = + | Number of [ `Int of int | `Float of float ] + | String of string + | Fun_call of QName.t * expr list + | Path of path + | Binop of expr * binop * expr + | Unop of unop * expr + + +type t = path + + +let text = QNameSet.singleton QName.text +let node = QNameSet.any +let star = + QNameSet.complement ( + QNameSet.from_list [ QName.text; + QName.document; + QName.cdata_section; + QName.comment]) + + + +let pp fmt e = Format.fprintf fmt e + +let prio e = + match e with + | Unop (Neg, _) -> 11 + | Path _ -> 10 + | Number _ | String _ | Fun_call _ -> 9 + | Binop (_,op,_) -> begin match op with + | Lt | Lte | Gt | Gte -> 7 + | Neq | Eq -> 6 + | And -> 5 + | Or -> 4 + | Mult | Div | Mod -> 3 + | Add | Sub -> 2 + end + +let print_binop fmt o = + pp fmt "%s" begin match o with + | Eq -> "=" + | Neq -> "!=" + | Lt -> "<" + | Gt -> ">" + | Lte -> "<=" + | Gte -> ">=" + | Or -> "or" + | And -> "and" + | Add -> "+" + | Sub -> "-" + | Mult -> "*" + | Div -> "div" + | Mod -> "mod" + end +let print_unop fmt o = + pp fmt "%s" begin match o with + | Neg -> "-" + end + +let rec print_path fmt p = + Pretty.print_list ~sep:" | " print_single_path fmt p + +and print_single_path fmt p = + let l = match p with + | Absolute l -> pp fmt "/"; l + | Relative l -> l + in + Pretty.print_list ~sep:"/" print_step fmt l + +and print_step fmt (axis, test, expr) = + pp fmt "%a::%a" print_axis axis print_test test; + match expr with + [] -> () + | l -> pp fmt "[ "; + Pretty.print_list ~sep:" ][ " print_expr fmt l; + pp fmt " ]" + +and print_axis fmt a = pp fmt "%s" begin + match a with + Self -> "self" + | Child -> "child" + | Descendant -> "descendant" + | DescendantOrSelf -> "descendant-or-self" + | FollowingSibling -> "following-sibling" + | Attribute -> "attribute" + | Ancestor -> "ancestor" + | AncestorOrSelf -> "ancestor-or-self" + | PrecedingSibling -> "preceding-sibling" + | Parent -> "parent" + | Preceding -> "preceding" + | Following -> "following" +end + +and print_test fmt ts = + try + pp fmt "%s" (List.assoc ts + [ text,"text()"; + node,"node()"; + star, "*" ] ) + with + Not_found -> pp fmt "%s" + (if QNameSet.is_finite ts + then QName.to_string (QNameSet.choose ts) + else "" + ) + +and print_expr fmt = function +| Number (`Int(i)) -> pp fmt "%i" i +| Number (`Float(f)) -> pp fmt "%f" f +| String s -> pp fmt "'%S'" s +| Fun_call (n, args) -> + pp fmt "%a(" QName.print n; + Pretty.print_list ~sep:", " print_expr fmt args; + pp fmt ")" +| Path p -> print_path fmt p +| Binop (e1, op, e2) as e -> + let pe = prio e in + let need_par1 = prio e1 < pe in + if need_par1 then pp fmt "("; + pp fmt "%a" print_expr e1; + if need_par1 then pp fmt ")"; + pp fmt " %a " print_binop op; + let need_par2 = prio e2 < pe in + if need_par2 then pp fmt "("; + pp fmt "%a" print_expr e2; + if need_par2 then pp fmt ")" +| Unop (op, e0) as e -> + let need_par0 = prio e0 < prio e in + print_unop fmt op; + if need_par0 then pp fmt "("; + print_expr fmt e0; + if need_par0 then pp fmt ")" + diff --git a/src/XPath/ast.mli b/src/XPath/ast.mli new file mode 100644 index 0000000..a2b0579 --- /dev/null +++ b/src/XPath/ast.mli @@ -0,0 +1,49 @@ +(***********************************************************************) +(* *) +(* TAToo *) +(* *) +(* Kim Nguyen, LRI UMR8623 *) +(* Université Paris-Sud & CNRS *) +(* *) +(* Copyright 2010-2013 Université Paris-Sud and Centre National de la *) +(* Recherche Scientifique. All rights reserved. This file is *) +(* distributed under the terms of the GNU Lesser General Public *) +(* License, with the special exception on linking described in file *) +(* ../LICENSE. *) +(* *) +(***********************************************************************) + +(* + Time-stamp: +*) + +type path = single_path list +and single_path = Absolute of step list | Relative of step list +and step = axis * test * expr list +and axis = Self | Attribute | Child | Descendant | DescendantOrSelf | FollowingSibling + | Parent | Ancestor | AncestorOrSelf | PrecedingSibling | Preceding | Following + +and test = QNameSet.t + +and binop = Eq | Neq | Lt | Gt | Lte | Gte | Or | And | Add | Sub | Mult | Div | Mod +and unop = Neg +and expr = + | Number of [ `Int of int | `Float of float ] + | String of string + | Fun_call of QName.t * expr list + | Path of path + | Binop of expr * binop * expr + | Unop of unop * expr +type t = path +val text : QNameSet.t +val node : QNameSet.t +val star : QNameSet.t +val print_binop : Format.formatter -> binop -> unit +val print_unop : Format.formatter -> unop -> unit +val print_path : Format.formatter -> path -> unit +val print_single_path : Format.formatter -> single_path -> unit +val print_step : Format.formatter -> step -> unit +val print_axis : Format.formatter -> axis -> unit +val print_test : Format.formatter -> test -> unit +val print_expr : Format.formatter -> expr -> unit + diff --git a/src/XPath/parser.ml b/src/XPath/parser.ml new file mode 100644 index 0000000..69e76e1 --- /dev/null +++ b/src/XPath/parser.ml @@ -0,0 +1,24 @@ +(***********************************************************************) +(* *) +(* TAToo *) +(* *) +(* Kim Nguyen, LRI UMR8623 *) +(* Université Paris-Sud & CNRS *) +(* *) +(* Copyright 2010-2013 Université Paris-Sud and Centre National de la *) +(* Recherche Scientifique. All rights reserved. This file is *) +(* distributed under the terms of the GNU Lesser General Public *) +(* License, with the special exception on linking described in file *) +(* ../LICENSE. *) +(* *) +(***********************************************************************) + +(* + Time-stamp: +*) + +include Xpath_internal_parser + +let parse (l : Ulexing.lexbuf) = + xpath (fun _ -> Ulexer.token l) (Lexing.from_string "!!dummy!!") + diff --git a/src/XPath/ulexer.ml b/src/XPath/ulexer.ml new file mode 100644 index 0000000..aa66179 --- /dev/null +++ b/src/XPath/ulexer.ml @@ -0,0 +1,169 @@ +(***********************************************************************) +(* *) +(* TAToo *) +(* *) +(* Kim Nguyen, LRI UMR8623 *) +(* Université Paris-Sud & CNRS *) +(* *) +(* Copyright 2010-2013 Université Paris-Sud and Centre National de la *) +(* Recherche Scientifique. All rights reserved. This file is *) +(* distributed under the terms of the GNU Lesser General Public *) +(* License, with the special exception on linking described in file *) +(* ../LICENSE. *) +(* *) +(***********************************************************************) + +(* + Time-stamp: +*) + +open Xpath_internal_parser + +module L = Ulexing + +exception Error of int * int * string + +let error i j s = raise (Error (i,j,s)) + +(***********************************************************) +(* Buffer for string literals *) + +let string_buff = Buffer.create 1024 + +let store_lexeme lexbuf = + Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf) +let store_ascii = Buffer.add_char string_buff +let store_code = Utf8.store string_buff +let clear_buff () = Buffer.clear string_buff +let get_stored_string () = + let s = Buffer.contents string_buff in + clear_buff (); + Buffer.clear string_buff; + s + +(***********************************************************) +(* Lexer *) + +let illegal lexbuf = + error + (L.lexeme_start lexbuf) + (L.lexeme_end lexbuf) + "Illegal character" + +let return lexbuf tok = (tok, L.loc lexbuf) +let return_loc i j tok = (tok, (i,j)) + +let regexp ncname_char = + xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\." + +let hexa_digit = function + | '0'..'9' as c -> (Char.code c) - (Char.code '0') + | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10 + | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10 + | _ -> -1 + +let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+) +let regexp digit = ['0'-'9'] +let regexp float = '-'? digit+ ('.' digit+ (['e''E'] digit+)?)? + +let parse_char lexbuf base i = + let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in + let r = ref 0 in + for i = 0 to String.length s - 1 do + let c = hexa_digit s.[i] in + if (c >= base) || (c < 0) then + error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit"; + r := !r * base + c; + done; + !r + +let keyword_or_tag s = + try + List.assoc s [ + "self", AXIS Ast.Self; + "descendant", AXIS Ast.Descendant; + "child", AXIS Ast.Child; + "descendant-or-self", AXIS Ast.DescendantOrSelf; + "attribute", AXIS Ast.Attribute; + "following-sibling", AXIS Ast.FollowingSibling; + "preceding-sibling", AXIS Ast.PrecedingSibling; + "parent", AXIS Ast.Parent; + "ancestor", AXIS Ast.Ancestor; + "ancestor-or-self", AXIS Ast.AncestorOrSelf; + "preceding", AXIS Ast.Preceding; + "following", AXIS Ast.Following; + "and", AND; + "or" , OR; + "div", DIV; + "mod", MOD; + ] + with + _ -> TAG s + + +let rec token = lexer + | [' ' '\t' '\n'] -> token lexbuf + | "*" -> STAR + | "/" -> SLASH + | "//" -> SLASHSLASH + | "::" -> COLONCOLON + | "(" -> LP + | ")" -> RP + | "[" -> LB + | "]" -> RB + | "," -> COMMA + | "|" -> PIPE + | "+" -> ADD + | "-" -> SUB + | "<" -> LT + | "<=" -> LTE + | ">" -> GT + | ">=" -> GTE + | "=" -> EQ + | "!=" -> NEQ + | "node()" -> NODE + | "text()" -> TEXT + | ncname -> keyword_or_tag (L.utf8_lexeme lexbuf) + | float -> + let s = L.utf8_lexeme lexbuf in + (try + INT (int_of_string s) + with + _ -> FLOAT (float_of_string s)) + | '"' | "'" -> + let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in + string (L.lexeme_start lexbuf) double_quote lexbuf; + let s = get_stored_string () in + STRING s + + | eof -> EOF + | _ -> illegal lexbuf + +and string start double = lexer + | '"' | "'" -> + let d = L.latin1_lexeme_char lexbuf 0 = '"' in + if d != double then (store_lexeme lexbuf; string start double lexbuf) + | '\\' ['\\' '"' '\''] -> + store_ascii (L.latin1_lexeme_char lexbuf 1); + string start double lexbuf + | "\\n" -> + store_ascii '\n'; string start double lexbuf + | "\\t" -> + store_ascii '\t'; string start double lexbuf + | "\\r" -> + store_ascii '\r'; string start double lexbuf + | '\\' ['0'-'9']+ ';' -> + store_code (parse_char lexbuf 10 1); + string start double lexbuf + | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' -> + store_code (parse_char lexbuf 16 2); + string start double lexbuf + | '\\' -> + illegal lexbuf; + | eof -> + error start (start+1) "Unterminated string" + | _ -> + store_lexeme lexbuf; + string start double lexbuf + + diff --git a/src/XPath/xpath_internal_parser.mly b/src/XPath/xpath_internal_parser.mly new file mode 100644 index 0000000..cd32ebb --- /dev/null +++ b/src/XPath/xpath_internal_parser.mly @@ -0,0 +1,145 @@ +%{ +(***********************************************************************) +(* *) +(* TAToo *) +(* *) +(* Kim Nguyen, LRI UMR8623 *) +(* Université Paris-Sud & CNRS *) +(* *) +(* Copyright 2010-2012 Université Paris-Sud and Centre National de la *) +(* Recherche Scientifique. All rights reserved. This file is *) +(* distributed under the terms of the GNU Lesser General Public *) +(* License, with the special exception on linking described in file *) +(* ../LICENSE. *) +(* *) +(***********************************************************************) + +(* + Time-stamp: +*) + + open Ast + let f () = () +%} + +%token TAG +%token STRING +%token INT +%token FLOAT +%token AXIS +%token RB LB LP RP +%token SLASH SLASHSLASH COLONCOLON STAR PIPE +%token EQ NEQ LT GT LTE GTE OR AND ADD SUB DIV MOD +%token NODE TEXT +%token COMMA +%token EOF + +%left OR +%left AND +%left EQ NEQ +%left LT GT LTE GTE +%left ADD SUB +%left MOD DIV STAR +%nonassoc uminus + +%start xpath +%type xpath + + +%% +xpath: +path EOF { $1 } +; + +path: + path_rev { List.rev $1 } +; + +path_rev: + simple_path { [ $1 ] } +| path_rev PIPE simple_path { $3 :: $1 } +; + + +simple_path: + absolute_path { Absolute (List.rev $1) } +| relative_path { Relative (List.rev $1) } +; + +absolute_path: + SLASH relative_path { $2 } +| SLASHSLASH relative_path { (DescendantOrSelf, node, []) :: $2 } +; + +relative_path: + step { [ $1 ] } +| relative_path SLASH step { $3 :: $1 } +| relative_path SLASHSLASH step { $3 + :: (DescendantOrSelf, node, []) + :: $1 } +; + +step: + axis_test pred_list { let a, b = $1 in a, b, $2 } +; + +axis_test: + AXIS COLONCOLON test { $1, $3 } +| test { Child, $1 } +| AXIS { + let _ = Format.flush_str_formatter () in + let () = Format.fprintf Format.str_formatter "%a" Ast.print_axis $1 in + let a = Format.flush_str_formatter () in + Child, QNameSet.singleton (QName.of_string a) +} +; + +test: + NODE { node } +| TEXT { text } +| STAR { star } +| TAG { QNameSet.singleton(QName.of_string $1) } +; + +pred_list: + pred_list_rev { List.rev $1 } +; + +pred_list_rev: + { [] } +| pred_list LB expr RB { $3 :: $1 } +; + +expr: + INT { Number(`Int($1)) } +| FLOAT { Number(`Float($1)) } +| STRING { String $1 } +| SUB expr %prec uminus { Unop(Neg, $2) } +| expr AND expr { Binop($1, And, $3) } +| expr OR expr { Binop($1, Or, $3) } +| expr ADD expr { Binop($1, Add, $3) } +| expr SUB expr { Binop($1, Sub, $3) } +| expr STAR expr { Binop($1, Mult, $3) } +| expr DIV expr { Binop($1, Div, $3) } +| expr MOD expr { Binop($1, Mod, $3) } +| expr EQ expr { Binop($1, Eq, $3) } +| expr NEQ expr { Binop($1, Neq, $3) } +| expr LT expr { Binop($1, Lt, $3) } +| expr LTE expr { Binop($1, Lte, $3) } +| expr GT expr { Binop($1, Gt, $3) } +| expr GTE expr { Binop($1, Gte, $3) } +| TAG LP arg_list RP { Fun_call(QName.of_string $1, $3) } +| LP expr RP { $2 } +| path { Path $1 } +; + +arg_list: + { [] } +| arg_list1 { List.rev $1 } +; + +arg_list1: + expr { [ $1 ] } +| arg_list1 COMMA expr { $3 :: $1 } +; + diff --git a/src/parser.mly b/src/parser.mly deleted file mode 100644 index 4d100ef..0000000 --- a/src/parser.mly +++ /dev/null @@ -1,145 +0,0 @@ -%{ -(***********************************************************************) -(* *) -(* TAToo *) -(* *) -(* Kim Nguyen, LRI UMR8623 *) -(* Université Paris-Sud & CNRS *) -(* *) -(* Copyright 2010-2012 Université Paris-Sud and Centre National de la *) -(* Recherche Scientifique. All rights reserved. This file is *) -(* distributed under the terms of the GNU Lesser General Public *) -(* License, with the special exception on linking described in file *) -(* ../LICENSE. *) -(* *) -(***********************************************************************) - -(* - Time-stamp: -*) - - open XPath.Ast - let f () = () -%} - -%token TAG -%token STRING -%token INT -%token FLOAT -%token AXIS -%token RB LB LP RP -%token SLASH SLASHSLASH COLONCOLON STAR PIPE -%token EQ NEQ LT GT LTE GTE OR AND ADD SUB DIV MOD -%token NODE TEXT -%token COMMA -%token EOF - -%left OR -%left AND -%left EQ NEQ -%left LT GT LTE GTE -%left ADD SUB -%left MOD DIV STAR -%nonassoc uminus - -%start xpath -%type xpath - - -%% -xpath: -path EOF { $1 } -; - -path: - path_rev { List.rev $1 } -; - -path_rev: - simple_path { [ $1 ] } -| path_rev PIPE simple_path { $3 :: $1 } -; - - -simple_path: - absolute_path { Absolute (List.rev $1) } -| relative_path { Relative (List.rev $1) } -; - -absolute_path: - SLASH relative_path { $2 } -| SLASHSLASH relative_path { (DescendantOrSelf, node, []) :: $2 } -; - -relative_path: - step { [ $1 ] } -| relative_path SLASH step { $3 :: $1 } -| relative_path SLASHSLASH step { $3 - :: (DescendantOrSelf, node, []) - :: $1 } -; - -step: - axis_test pred_list { let a, b = $1 in a, b, $2 } -; - -axis_test: - AXIS COLONCOLON test { $1, $3 } -| test { Child, $1 } -| AXIS { - let _ = Format.flush_str_formatter () in - let () = Format.fprintf Format.str_formatter "%a" XPath.Ast.print_axis $1 in - let a = Format.flush_str_formatter () in - Child, QNameSet.singleton (QName.of_string a) -} -; - -test: - NODE { node } -| TEXT { text } -| STAR { star } -| TAG { QNameSet.singleton(QName.of_string $1) } -; - -pred_list: - pred_list_rev { List.rev $1 } -; - -pred_list_rev: - { [] } -| pred_list LB expr RB { $3 :: $1 } -; - -expr: - INT { Number(`Int($1)) } -| FLOAT { Number(`Float($1)) } -| STRING { String $1 } -| SUB expr %prec uminus { Unop(Neg, $2) } -| expr AND expr { Binop($1, And, $3) } -| expr OR expr { Binop($1, Or, $3) } -| expr ADD expr { Binop($1, Add, $3) } -| expr SUB expr { Binop($1, Sub, $3) } -| expr STAR expr { Binop($1, Mult, $3) } -| expr DIV expr { Binop($1, Div, $3) } -| expr MOD expr { Binop($1, Mod, $3) } -| expr EQ expr { Binop($1, Eq, $3) } -| expr NEQ expr { Binop($1, Neq, $3) } -| expr LT expr { Binop($1, Lt, $3) } -| expr LTE expr { Binop($1, Lte, $3) } -| expr GT expr { Binop($1, Gt, $3) } -| expr GTE expr { Binop($1, Gte, $3) } -| TAG LP arg_list RP { Fun_call(QName.of_string $1, $3) } -| LP expr RP { $2 } -| path { Path $1 } -; - -arg_list: - { [] } -| arg_list1 { List.rev $1 } -; - -arg_list1: - expr { [ $1 ] } -| arg_list1 COMMA expr { $3 :: $1 } -; - diff --git a/src/test.ml b/src/test.ml index 0aaea6b..dec8918 100644 --- a/src/test.ml +++ b/src/test.ml @@ -14,7 +14,7 @@ (***********************************************************************) (* - Time-stamp: + Time-stamp: *) (** use: xml_file "XPath querie" @@ -33,9 +33,9 @@ let doc = close_in fd; d -let query = +let query = let arg2 = Sys.argv.(2) in - Uparser.xpath Ulexer.token (Ulexing.from_latin1_string arg2) + XPath.Parser.parse (Ulexing.from_latin1_string arg2) open Format diff --git a/src/ulexer.ml b/src/ulexer.ml deleted file mode 100644 index 4951023..0000000 --- a/src/ulexer.ml +++ /dev/null @@ -1,169 +0,0 @@ -(***********************************************************************) -(* *) -(* TAToo *) -(* *) -(* Kim Nguyen, LRI UMR8623 *) -(* Université Paris-Sud & CNRS *) -(* *) -(* Copyright 2010-2013 Université Paris-Sud and Centre National de la *) -(* Recherche Scientifique. All rights reserved. This file is *) -(* distributed under the terms of the GNU Lesser General Public *) -(* License, with the special exception on linking described in file *) -(* ../LICENSE. *) -(* *) -(***********************************************************************) - -(* - Time-stamp: -*) - -open Parser - -module L = Ulexing - -exception Error of int * int * string - -let error i j s = raise (Error (i,j,s)) - -(***********************************************************) -(* Buffer for string literals *) - -let string_buff = Buffer.create 1024 - -let store_lexeme lexbuf = - Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf) -let store_ascii = Buffer.add_char string_buff -let store_code = Utf8.store string_buff -let clear_buff () = Buffer.clear string_buff -let get_stored_string () = - let s = Buffer.contents string_buff in - clear_buff (); - Buffer.clear string_buff; - s - -(***********************************************************) -(* Lexer *) - -let illegal lexbuf = - error - (L.lexeme_start lexbuf) - (L.lexeme_end lexbuf) - "Illegal character" - -let return lexbuf tok = (tok, L.loc lexbuf) -let return_loc i j tok = (tok, (i,j)) - -let regexp ncname_char = - xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\." - -let hexa_digit = function - | '0'..'9' as c -> (Char.code c) - (Char.code '0') - | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10 - | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10 - | _ -> -1 - -let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+) -let regexp digit = ['0'-'9'] -let regexp float = '-'? digit+ ('.' digit+ (['e''E'] digit+)?)? - -let parse_char lexbuf base i = - let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in - let r = ref 0 in - for i = 0 to String.length s - 1 do - let c = hexa_digit s.[i] in - if (c >= base) || (c < 0) then - error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit"; - r := !r * base + c; - done; - !r - -let keyword_or_tag s = - try - List.assoc s [ - "self", AXIS XPath.Ast.Self; - "descendant", AXIS XPath.Ast.Descendant; - "child", AXIS XPath.Ast.Child; - "descendant-or-self", AXIS XPath.Ast.DescendantOrSelf; - "attribute", AXIS XPath.Ast.Attribute; - "following-sibling", AXIS XPath.Ast.FollowingSibling; - "preceding-sibling", AXIS XPath.Ast.PrecedingSibling; - "parent", AXIS XPath.Ast.Parent; - "ancestor", AXIS XPath.Ast.Ancestor; - "ancestor-or-self", AXIS XPath.Ast.AncestorOrSelf; - "preceding", AXIS XPath.Ast.Preceding; - "following", AXIS XPath.Ast.Following; - "and", AND; - "or" , OR; - "div", DIV; - "mod", MOD; - ] - with - _ -> TAG s - - -let rec token = lexer - | [' ' '\t' '\n'] -> token lexbuf - | "*" -> STAR - | "/" -> SLASH - | "//" -> SLASHSLASH - | "::" -> COLONCOLON - | "(" -> LP - | ")" -> RP - | "[" -> LB - | "]" -> RB - | "," -> COMMA - | "|" -> PIPE - | "+" -> ADD - | "-" -> SUB - | "<" -> LT - | "<=" -> LTE - | ">" -> GT - | ">=" -> GTE - | "=" -> EQ - | "!=" -> NEQ - | "node()" -> NODE - | "text()" -> TEXT - | ncname -> keyword_or_tag (L.utf8_lexeme lexbuf) - | float -> - let s = L.utf8_lexeme lexbuf in - (try - INT (int_of_string s) - with - _ -> FLOAT (float_of_string s)) - | '"' | "'" -> - let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in - string (L.lexeme_start lexbuf) double_quote lexbuf; - let s = get_stored_string () in - STRING s - - | eof -> EOF - | _ -> illegal lexbuf - -and string start double = lexer - | '"' | "'" -> - let d = L.latin1_lexeme_char lexbuf 0 = '"' in - if d != double then (store_lexeme lexbuf; string start double lexbuf) - | '\\' ['\\' '"' '\''] -> - store_ascii (L.latin1_lexeme_char lexbuf 1); - string start double lexbuf - | "\\n" -> - store_ascii '\n'; string start double lexbuf - | "\\t" -> - store_ascii '\t'; string start double lexbuf - | "\\r" -> - store_ascii '\r'; string start double lexbuf - | '\\' ['0'-'9']+ ';' -> - store_code (parse_char lexbuf 10 1); - string start double lexbuf - | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' -> - store_code (parse_char lexbuf 16 2); - string start double lexbuf - | '\\' -> - illegal lexbuf; - | eof -> - error start (start+1) "Unterminated string" - | _ -> - store_lexeme lexbuf; - string start double lexbuf - - diff --git a/src/uparser.ml b/src/uparser.ml deleted file mode 100644 index c5aeb1a..0000000 --- a/src/uparser.ml +++ /dev/null @@ -1,23 +0,0 @@ -(***********************************************************************) -(* *) -(* TAToo *) -(* *) -(* Kim Nguyen, LRI UMR8623 *) -(* Université Paris-Sud & CNRS *) -(* *) -(* Copyright 2010-2013 Université Paris-Sud and Centre National de la *) -(* Recherche Scientifique. All rights reserved. This file is *) -(* distributed under the terms of the GNU Lesser General Public *) -(* License, with the special exception on linking described in file *) -(* ../LICENSE. *) -(* *) -(***********************************************************************) - -(* - Time-stamp: -*) - -let xpath (f : Ulexing.lexbuf -> Parser.token) (l: Ulexing.lexbuf) = - Parser.xpath (fun _ -> f l) (Lexing.from_string "!!dummy!!") - - diff --git a/src/xPath.ml b/src/xPath.ml deleted file mode 100644 index a825d84..0000000 --- a/src/xPath.ml +++ /dev/null @@ -1,164 +0,0 @@ -(***********************************************************************) -(* *) -(* TAToo *) -(* *) -(* Kim Nguyen, LRI UMR8623 *) -(* Université Paris-Sud & CNRS *) -(* *) -(* Copyright 2010-2012 Université Paris-Sud and Centre National de la *) -(* Recherche Scientifique. All rights reserved. This file is *) -(* distributed under the terms of the GNU Lesser General Public *) -(* License, with the special exception on linking described in file *) -(* ../LICENSE. *) -(* *) -(***********************************************************************) - -(* - Time-stamp: -*) - -module Ast = -struct - - type path = single_path list - and single_path = Absolute of step list | Relative of step list - and step = axis * test * expr list - and axis = Self | Attribute | Child | Descendant | DescendantOrSelf | FollowingSibling - | Parent | Ancestor | AncestorOrSelf | PrecedingSibling | Preceding | Following - - and test = QNameSet.t - - and binop = Eq | Neq | Lt | Gt | Lte | Gte | Or | And | Add | Sub | Mult | Div | Mod - and unop = Neg - and expr = - | Number of [ `Int of int | `Float of float ] - | String of string - | Fun_call of QName.t * expr list - | Path of path - | Binop of expr * binop * expr - | Unop of unop * expr - - - type t = path - - - let text = QNameSet.singleton QName.text - let node = QNameSet.any - let star = - QNameSet.complement ( - QNameSet.from_list [ QName.text; - QName.document; - QName.cdata_section; - QName.comment]) - - - - let pp fmt e = Format.fprintf fmt e - - let prio e = match e with - | Unop (Neg, _) -> 11 - | Path _ -> 10 - | Number _ | String _ | Fun_call _ -> 9 - | Binop (_,op,_) -> begin match op with - | Lt | Lte | Gt | Gte -> 7 - | Neq | Eq -> 6 - | And -> 5 - | Or -> 4 - | Mult | Div | Mod -> 3 - | Add | Sub -> 2 - end - - let print_binop fmt o = - pp fmt "%s" begin match o with - | Eq -> "=" - | Neq -> "!=" - | Lt -> "<" - | Gt -> ">" - | Lte -> "<=" - | Gte -> ">=" - | Or -> "or" - | And -> "and" - | Add -> "+" - | Sub -> "-" - | Mult -> "*" - | Div -> "div" - | Mod -> "mod" - end - let print_unop fmt o = - pp fmt "%s" begin match o with - | Neg -> "-" - end - - let rec print_path fmt p = - Pretty.print_list ~sep:" | " print_single_path fmt p - - and print_single_path fmt p = - let l = match p with - | Absolute l -> pp fmt "/"; l - | Relative l -> l - in - Pretty.print_list ~sep:"/" print_step fmt l - - and print_step fmt (axis, test, expr) = - pp fmt "%a::%a" print_axis axis print_test test; - match expr with - [] -> () - | l -> pp fmt "[ "; - Pretty.print_list ~sep:" ][ " print_expr fmt l; - pp fmt " ]" - - and print_axis fmt a = pp fmt "%s" (match a with - Self -> "self" - | Child -> "child" - | Descendant -> "descendant" - | DescendantOrSelf -> "descendant-or-self" - | FollowingSibling -> "following-sibling" - | Attribute -> "attribute" - | Ancestor -> "ancestor" - | AncestorOrSelf -> "ancestor-or-self" - | PrecedingSibling -> "preceding-sibling" - | Parent -> "parent" - | Preceding -> "preceding" - | Following -> "following" - ) - and print_test fmt ts = - try - pp fmt "%s" (List.assoc ts - [ text,"text()"; - node,"node()"; - star, "*" ] ) - with - Not_found -> pp fmt "%s" - (if QNameSet.is_finite ts - then QName.to_string (QNameSet.choose ts) - else "" - ) - - and print_expr fmt = function - | Number (`Int(i)) -> pp fmt "%i" i - | Number (`Float(f)) -> pp fmt "%f" f - | String s -> pp fmt "'%S'" s - | Fun_call (n, args) -> - pp fmt "%a(" QName.print n; - Pretty.print_list ~sep:", " print_expr fmt args; - pp fmt ")" - | Path p -> print_path fmt p - | Binop (e1, op, e2) as e -> - let pe = prio e in - let need_par1 = prio e1 < pe in - if need_par1 then pp fmt "("; - pp fmt "%a" print_expr e1; - if need_par1 then pp fmt ")"; - pp fmt " %a " print_binop op; - let need_par2 = prio e2 < pe in - if need_par2 then pp fmt "("; - pp fmt "%a" print_expr e2; - if need_par2 then pp fmt ")" - | Unop (op, e0) as e -> - let need_par0 = prio e0 < prio e in - print_unop fmt op; - if need_par0 then pp fmt "("; - print_expr fmt e0; - if need_par0 then pp fmt ")" - -end diff --git a/src/xPath.mli b/src/xPath.mli deleted file mode 100644 index 76a6ab3..0000000 --- a/src/xPath.mli +++ /dev/null @@ -1,53 +0,0 @@ -(***********************************************************************) -(* *) -(* TAToo *) -(* *) -(* Kim Nguyen, LRI UMR8623 *) -(* Université Paris-Sud & CNRS *) -(* *) -(* Copyright 2010-2013 Université Paris-Sud and Centre National de la *) -(* Recherche Scientifique. All rights reserved. This file is *) -(* distributed under the terms of the GNU Lesser General Public *) -(* License, with the special exception on linking described in file *) -(* ../LICENSE. *) -(* *) -(***********************************************************************) - -(* - Time-stamp: -*) - - - -module Ast : -sig - type path = single_path list - and single_path = Absolute of step list | Relative of step list - and step = axis * test * expr list - and axis = Self | Attribute | Child | Descendant | DescendantOrSelf | FollowingSibling - | Parent | Ancestor | AncestorOrSelf | PrecedingSibling | Preceding | Following - - and test = QNameSet.t - - and binop = Eq | Neq | Lt | Gt | Lte | Gte | Or | And | Add | Sub | Mult | Div | Mod - and unop = Neg - and expr = - | Number of [ `Int of int | `Float of float ] - | String of string - | Fun_call of QName.t * expr list - | Path of path - | Binop of expr * binop * expr - | Unop of unop * expr - type t = path - val text : QNameSet.t - val node : QNameSet.t - val star : QNameSet.t - val print_binop : Format.formatter -> binop -> unit - val print_unop : Format.formatter -> unop -> unit - val print_path : Format.formatter -> path -> unit - val print_single_path : Format.formatter -> single_path -> unit - val print_step : Format.formatter -> step -> unit - val print_axis : Format.formatter -> axis -> unit - val print_test : Format.formatter -> test -> unit - val print_expr : Format.formatter -> expr -> unit - end diff --git a/src/xPath.mlpack b/src/xPath.mlpack new file mode 100644 index 0000000..e062c25 --- /dev/null +++ b/src/xPath.mlpack @@ -0,0 +1,4 @@ +Ast +Parser +Ulexer +Xpath_internal_parser