From 7489c542a7b7357a1c2bbc436d1d77c601833d3b Mon Sep 17 00:00:00 2001 From: kim Date: Mon, 23 Mar 2009 04:00:59 +0000 Subject: [PATCH] Merged -correctxpath branch git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/xpathcomp@269 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- Makefile | 4 +- OCamlDriver.cpp | 10 +- ata.ml | 776 +++++++++++++-------------- ata.mli | 75 ++- main.ml | 25 +- ptset.ml | 66 +-- ptset.mli | 2 + tree.ml | 1345 +++++++++++++++++------------------------------ tree.mli | 115 ++-- unit_test.ml | 124 +---- xPath.ml | 19 +- 11 files changed, 1036 insertions(+), 1525 deletions(-) diff --git a/Makefile b/Makefile index 38b2b26..b9f6155 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -INLINE=10 +INLINE=1000 DEBUG=false PROFILE=true VERBOSE=false @@ -48,7 +48,7 @@ OCAMLOPT = ocamlopt -g -cc "$(CXX)" SYNT_DEBUG = -ppopt -DDEBUG else CXX = g++ -OCAMLOPT = ocamlopt -g -unsafe -cc "$(CXX)" -ccopt -O3 -ccopt -std=c++0x -noassert -inline $(INLINE) +OCAMLOPT = ocamlopt -nodynlink -cc "$(CXX)" -ccopt -O3 -ccopt -std=c++0x -noassert -inline $(INLINE) endif ifeq ($(PROFILE), true) SYNT_PROF = $(SYNT_DEBUG) -ppopt -DPROFILE diff --git a/OCamlDriver.cpp b/OCamlDriver.cpp index ed9afe1..8603c11 100644 --- a/OCamlDriver.cpp +++ b/OCamlDriver.cpp @@ -377,7 +377,7 @@ extern "C" CAMLprim value caml_int_vector_set(value vec, value i, value v){ #define VECT(x) ((int*) (x)) -extern "C" CAMLprim value caml_xml_tree_tagged_below(value tree, value node, value ctags, value dtags){ +extern "C" CAMLprim value caml_xml_tree_select_below(value tree, value node, value ctags, value dtags){ CAMLparam4(tree,node,ctags,dtags); CAMLreturn (Val_int ( @@ -388,7 +388,7 @@ extern "C" CAMLprim value caml_xml_tree_tagged_below(value tree, value node, val VECT(dtags)[0])))); } -extern "C" CAMLprim value caml_xml_tree_tagged_next(value tree, value node, value ctags, value ftags,value root){ +extern "C" CAMLprim value caml_xml_tree_select_next(value tree, value node, value ctags, value ftags,value root){ CAMLparam5(tree,node,ctags,ftags,root); CAMLreturn (Val_int ( (XMLTREE(tree)->TaggedNext(TREENODEVAL(node), @@ -399,7 +399,7 @@ extern "C" CAMLprim value caml_xml_tree_tagged_next(value tree, value node, valu TREENODEVAL(root))))); } -extern "C" CAMLprim value caml_xml_tree_tagged_desc_only(value tree, value node,value dtags){ +extern "C" CAMLprim value caml_xml_tree_select_desc_only(value tree, value node,value dtags){ CAMLparam3(tree,node,dtags); CAMLreturn (Val_int ( @@ -408,7 +408,7 @@ extern "C" CAMLprim value caml_xml_tree_tagged_desc_only(value tree, value node, VECT(dtags)[0])))); } -extern "C" CAMLprim value caml_xml_tree_tagged_foll_only(value tree, value node, value ftags,value root){ +extern "C" CAMLprim value caml_xml_tree_select_foll_only(value tree, value node, value ftags,value root){ CAMLparam4(tree,node,ftags,root); CAMLreturn (Val_int ( (XMLTREE(tree)->TaggedFollOnly(TREENODEVAL(node), @@ -417,7 +417,7 @@ extern "C" CAMLprim value caml_xml_tree_tagged_foll_only(value tree, value node, TREENODEVAL(root))))); } -extern "C" CAMLprim value caml_xml_tree_tagged_desc_or_foll_only(value tree, value node, value ftags,value root){ +extern "C" CAMLprim value caml_xml_tree_select_desc_or_foll_only(value tree, value node, value ftags,value root){ CAMLparam4(tree,node,ftags,root); CAMLreturn (Val_int ( (XMLTREE(tree)->TaggedDescOrFollOnly(TREENODEVAL(node), diff --git a/ata.ml b/ata.ml index fc29e98..1ba9c40 100644 --- a/ata.ml +++ b/ata.ml @@ -1,12 +1,51 @@ (* Todo refactor and remove this alias *) INCLUDE "debug.ml" -module Tree = Tree.Binary - - let gen_id = let id = ref (-1) in fun () -> incr id;!id + module TS = + struct + type t = Nil | Cons of Tree.t * t | Concat of t*t + let empty = Nil + + let cons e t = Cons(e,t) + let concat t1 t2 = Concat (t1,t2) + let append e t = Concat(t,Cons(e,Nil)) + + let fold f l acc = + let rec loop acc = function + | Nil -> acc + | Cons(e,t) -> loop (f e acc) t + | Concat(t1,t2) -> loop (loop acc t1) t2 + in + loop acc l + + let length l = fold (fun _ x -> x+1) l 0 + + + let iter f l = + let rec loop = function + | Nil -> () + | Cons(e,t) -> let _ = f e in loop t + | Concat(t1,t2) -> let _ = loop t1 in loop t2 + in loop l + + end + + + +let h_union = Hashtbl.create 4097 + +let pt_cup s1 s2 = + let h = (Ptset.hash s1)*(Ptset.hash s2) - ((Ptset.hash s2)+(Ptset.hash s1)) in + try + Hashtbl.find h_union h + with + | Not_found -> let s = Ptset.union s1 s2 + in + Hashtbl.add h_union h s;s + module State = struct @@ -18,13 +57,7 @@ let mk_state = State.mk type state = State.t -type predicate = [ `Left of (Tree.t -> bool) | `Right of (Tree.t -> bool) | - `True - ] -let eval_pred t = - function `True -> true - | `Left f | `Right f -> f t type formula_expr = | False | True @@ -35,7 +68,7 @@ and formula = { fid: int; fkey : int; pos : formula_expr; neg : formula; - st : (Ptset.t*Ptset.t)*(Ptset.t*Ptset.t); + st : (Ptset.t*Ptset.t*Ptset.t)*(Ptset.t*Ptset.t*Ptset.t); size: int; } @@ -45,9 +78,9 @@ external int_bool : bool -> int = "%identity" let hash_node_form t = match t with | False -> 0 | True -> 1 - | And(f1,f2) -> (2+17*f1.fkey + 37*f2.fkey) land max_int - | Or(f1,f2) -> (3+101*f1.fkey + 253*f2.fkey) land max_int - | Atom(v,b,s) -> ((hash_const_variant v) + (3846*(int_bool b) +257) + (s lsl 13 - s)) land max_int + | And(f1,f2) -> (2+17*f1.fkey + 37*f2.fkey) (*land max_int *) + | Or(f1,f2) -> (3+101*f1.fkey + 253*f2.fkey) (*land max_int *) + | Atom(v,b,s) -> ((hash_const_variant v) + (3846*(int_bool b) +257) + (s lsl 13 - s)) (*land max_int *) module FormNode = @@ -70,12 +103,12 @@ module WH = Weak.Make(FormNode) let f_pool = WH.create 107 -let empty_pair = Ptset.empty,Ptset.empty -let empty_quad = empty_pair,empty_pair +let empty_triple = Ptset.empty,Ptset.empty,Ptset.empty +let empty_hex = empty_triple,empty_triple let true_,false_ = - let rec t = { fid = 1; pos = True; fkey=1; neg = f ; st = empty_quad; size =1; } - and f = { fid = 0; pos = False; fkey=0; neg = t; st = empty_quad; size = 1; } + let rec t = { fid = 1; pos = True; fkey=1; neg = f ; st = empty_hex; size =1; } + and f = { fid = 0; pos = False; fkey=0; neg = t; st = empty_hex; size = 1; } in WH.add f_pool f; WH.add f_pool t; @@ -107,21 +140,21 @@ let cons pos neg s1 s2 size1 size2 = let atom_ d p s = let si = Ptset.singleton s in let ss = match d with - | `Left -> (si,Ptset.empty),empty_pair - | `Right -> empty_pair,(si,Ptset.empty) - | `LLeft -> (Ptset.empty,si),empty_pair - | `RRight -> empty_pair,(Ptset.empty,si) + | `Left -> (si,Ptset.empty,si),empty_triple + | `Right -> empty_triple,(si,Ptset.empty,si) + | `LLeft -> (Ptset.empty,si,si),empty_triple + | `RRight -> empty_triple,(Ptset.empty,si,si) in fst (cons (Atom(d,p,s)) (Atom(d,not p,s)) ss ss 1 1) -let union_quad ((l1,ll1),(r1,rr1)) ((l2,ll2),(r2,rr2)) = - (Ptset.union l1 l2 ,Ptset.union ll1 ll2), - (Ptset.union r1 r2 ,Ptset.union rr1 rr2) +let union_hex ((l1,ll1,lll1),(r1,rr1,rrr1)) ((l2,ll2,lll2),(r2,rr2,rrr2)) = + (pt_cup l1 l2 ,pt_cup ll1 ll2,pt_cup lll1 lll2), + (pt_cup r1 r2 ,pt_cup rr1 rr2,pt_cup rrr1 rrr2) let merge_states f1 f2 = let sp = - union_quad f1.st f2.st + union_hex f1.st f2.st and sn = - union_quad f1.neg.st f2.neg.st + union_hex f1.neg.st f2.neg.st in sp,sn @@ -161,16 +194,181 @@ let and_ f1 f2 = let not_ f = f.neg +let k_hash (s,t) = ((Ptset.hash s)) lsl 31 lxor (Tag.hash t) module HTagSetKey = struct type t = Ptset.t*Tag.t - let int_hash key = key lsl 31 lor (key lsl 8) let equal (s1,s2) (t1,t2) = (s2 == t2) && Ptset.equal s1 t1 - let hash (s,t) = int_hash (Ptset.hash s) lxor ( int_hash (Tag.hash t)) + let hash = k_hash +end + +module HTagSet = +struct + type key = Ptset.t*Tag.t + let equal (s1,s2) (t1,t2) = (s2 == t2) && Ptset.equal s1 t1 + let hash (s,t) = ((Ptset.hash s)) lsl 31 lxor (Tag.hash t) + +type 'a t = + { mutable size: int; (* number of elements *) + mutable data: (key,'a) bucketlist array } (* the buckets *) + +and ('a, 'b) bucketlist = + Empty + | Cons of 'a * 'b * ('a, 'b) bucketlist + +let create initial_size = + let s = min (max 1 initial_size) Sys.max_array_length in + { size = 0; data = Array.make s Empty } + +let clear h = + for i = 0 to Array.length h.data - 1 do + h.data.(i) <- Empty + done; + h.size <- 0 + +let copy h = + { size = h.size; + data = Array.copy h.data } + +let length h = h.size + +let resize tbl = + let odata = tbl.data in + let osize = Array.length odata in + let nsize = min (2 * osize + 1) Sys.max_array_length in + if nsize <> osize then begin + let ndata = Array.create nsize Empty in + let rec insert_bucket = function + Empty -> () + | Cons(key, data, rest) -> + insert_bucket rest; (* preserve original order of elements *) + let nidx = (hash key) mod nsize in + ndata.(nidx) <- Cons(key, data, ndata.(nidx)) in + for i = 0 to osize - 1 do + insert_bucket odata.(i) + done; + tbl.data <- ndata; + end + +let add h key info = + let i = (hash key) mod (Array.length h.data) in + let bucket = Cons(key, info, h.data.(i)) in + h.data.(i) <- bucket; + h.size <- succ h.size; + if h.size > Array.length h.data lsl 1 then resize h + +let remove h key = + let rec remove_bucket = function + Empty -> + Empty + | Cons(k, i, next) -> + if equal k key + then begin h.size <- pred h.size; next end + else Cons(k, i, remove_bucket next) in + let i = (hash key) mod (Array.length h.data) in + h.data.(i) <- remove_bucket h.data.(i) + +let rec find_rec key = function + Empty -> + raise Not_found + | Cons(k, d, rest) -> + if equal key k then d else find_rec key rest + +let find h key = + match h.data.((hash key) mod (Array.length h.data)) with + Empty -> raise Not_found + | Cons(k1, d1, rest1) -> + if equal key k1 then d1 else + match rest1 with + Empty -> raise Not_found + | Cons(k2, d2, rest2) -> + if equal key k2 then d2 else + match rest2 with + Empty -> raise Not_found + | Cons(k3, d3, rest3) -> + if equal key k3 then d3 else find_rec key rest3 + +let find_all h key = + let rec find_in_bucket = function + Empty -> + [] + | Cons(k, d, rest) -> + if equal k key + then d :: find_in_bucket rest + else find_in_bucket rest in + find_in_bucket h.data.((hash key) mod (Array.length h.data)) + +let replace h key info = + let rec replace_bucket = function + Empty -> + raise Not_found + | Cons(k, i, next) -> + if equal k key + then Cons(k, info, next) + else Cons(k, i, replace_bucket next) in + let i = (hash key) mod (Array.length h.data) in + let l = h.data.(i) in + try + h.data.(i) <- replace_bucket l + with Not_found -> + h.data.(i) <- Cons(key, info, l); + h.size <- succ h.size; + if h.size > Array.length h.data lsl 1 then resize h + +let mem h key = + let rec mem_in_bucket = function + | Empty -> + false + | Cons(k, d, rest) -> + equal k key || mem_in_bucket rest in + mem_in_bucket h.data.((hash key) mod (Array.length h.data)) + +let iter f h = + let rec do_bucket = function + Empty -> + () + | Cons(k, d, rest) -> + f k d; do_bucket rest in + let d = h.data in + for i = 0 to Array.length d - 1 do + do_bucket d.(i) + done + +let fold f h init = + let rec do_bucket b accu = + match b with + Empty -> + accu + | Cons(k, d, rest) -> + do_bucket rest (f k d accu) in + let d = h.data in + let accu = ref init in + for i = 0 to Array.length d - 1 do + accu := do_bucket d.(i) !accu + done; + !accu + + end -module HTagSet = Hashtbl.Make(HTagSetKey) + + + + + + + + + + + + +type dispatch = { first : Tree.t -> Tree.t; + flabel : string; + next : Tree.t -> Tree.t -> Tree.t; + nlabel : string; + } type t = { id : int; mutable states : Ptset.t; @@ -178,11 +376,9 @@ type t = { mutable final : Ptset.t; universal : Ptset.t; (* Transitions of the Alternating automaton *) - phi : (state,(TagSet.t*(bool*formula*predicate)) list) Hashtbl.t; - delta : (state*Tag.t, (bool*formula*predicate)) Hashtbl.t; -(* delta : (state,(bool*formula*predicate) TagMap.t) Hashtbl.t; *) - sigma : (bool*formula*(predicate list*predicate list)*bool) HTagSet.t; - } + phi : (state,(TagSet.t*(bool*formula*bool)) list) Hashtbl.t; + sigma : (dispatch*bool*formula) HTagSet.t; +} module Pair (X : Set.OrderedType) (Y : Set.OrderedType) = struct @@ -196,7 +392,7 @@ type t = { module PL = Set.Make (Pair (Ptset) (Ptset)) - let pr_st ppf l = Format.fprintf ppf "{"; + let pr_st ppf l = Format.fprintf ppf "{"; begin match l with | [] -> () @@ -296,12 +492,12 @@ type t = { Format.fprintf ppf "\n")l; Format.fprintf ppf "NFA transitions :\n------------------------------\n"; - HTagSet.iter (fun (qs,t) (b,f,_,_) -> + HTagSet.iter (fun (qs,t) (disp,b,f) -> pr_st ppf (Ptset.elements qs); Format.fprintf ppf ",%s %s " (Tag.to_string t) (if b then "=>" else "->"); pr_frm ppf f; Format.fprintf ppf "(fid=%i) left=" f.fid; - let (l,ll),(r,rr) = f.st in + let (l,ll,_),(r,rr,_) = f.st in pr_st ppf (Ptset.elements l); Format.fprintf ppf ", "; pr_st ppf (Ptset.elements ll); @@ -309,15 +505,15 @@ type t = { pr_st ppf (Ptset.elements r); Format.fprintf ppf ", "; pr_st ppf (Ptset.elements rr); - Format.fprintf ppf "\n"; + Format.fprintf ppf ", first=%s, next=%s\n" disp.flabel disp.nlabel; ) a.sigma; - Format.fprintf ppf "=======================================\n" + Format.fprintf ppf "=======================================\n%!" module Transitions = struct - type t = state*TagSet.t*bool*formula*predicate + type t = state*TagSet.t*bool*formula*bool let ( ?< ) x = x - let ( >< ) state (l,b) = state,(l,b,`True) - let ( ><@ ) state (l,b,p) = state,(l,b,p) + let ( >< ) state (l,b) = state,(l,b,false) + let ( ><@ ) state (l,b) = state,(l,b,true) let ( >=> ) (state,(label,mark,pred)) form = (state,label,mark,form,pred) let ( +| ) f1 f2 = or_ f1 f2 let ( *& ) f1 f2 = and_ f1 f2 @@ -330,85 +526,26 @@ type t = { let equal_trans (q1,t1,m1,f1,_) (q2,t2,m2,f2,_) = (q1 == q2) && (TagSet.equal t1 t2) && (m1 == m2) && (equal_form f1 f2) - module TS = - struct - type node = Nil | Cons of Tree.t * node | Concat of node*node - and t = { node : node; size : int } - let node n s = { node=n; size = s } - - let empty = node Nil 0 - - let cons e t = node (Cons(e,t.node)) (t.size+1) - let concat t1 t2 = node (Concat (t1.node,t2.node)) (t1.size+t2.size) - let append = cons -(* let append e t = node (Concat(t.node,Cons(e,Nil))) (t.size+1) *) - - let to_list_rev t = - let rec aux acc l rest = - match l with - | Nil -> begin - match rest with - | Nil -> acc - | Cons(e,t) -> aux (e::acc) t Nil - | Concat(t1,t2) -> aux acc t1 t2 - end - | Cons(e,r) -> aux (e::acc) r rest - | Concat(t1,t2) -> aux acc t1 (Concat(t2,rest)) - in - aux [] t.node Nil - - let length = function { size = s } -> s - - let iter f { node = n } = - let rec loop = function - | Nil -> () - | Cons(e,n) -> let _ = f e in loop n - | Concat(n1,n2) -> let _ = loop n1 in loop n2 - in loop n - let rev_iter f { node = n } = - let rec loop = function - | Nil -> () - | Cons(e,n) -> let _ = loop n in f e - | Concat(n1,n2) -> let _ = loop n2 in loop n1 - in loop n - - - let find f { node = n } = - let rec loop = function - | Nil -> raise Not_found - | Cons(e,n) -> if f e then e else loop n - | Concat(n1,n2) -> try - loop n1 - with - | Not_found -> loop n2 - in - loop n - - end -(* - module BottomUpJumpNew = struct - -*) - module HFEval = Hashtbl.Make( - struct - type t = int*Ptset.t*Ptset.t - let equal (a,b,c) (d,e,f) = - a==d && (Ptset.equal b e) && (Ptset.equal c f) - let hash (a,b,c) = - a+17*(Ptset.hash b) + 31*(Ptset.hash c) - end) - - let hfeval = HFEval.create 4097 - + module HFEval = Hashtbl.Make( + struct + type t = int*Ptset.t*Ptset.t + let equal (a,b,c) (d,e,f) = + a==d && (Ptset.equal b e) && (Ptset.equal c f) + let hash (a,b,c) = + a+17*(Ptset.hash b) + 31*(Ptset.hash c) + end) + + let hfeval = HFEval.create 4097 + let eval_form_bool f s1 s2 = let rec eval f = match f.pos with - | Atom((`Left|`LLeft),b,q) -> if b == (Ptset.mem q s1) then (true,true,false) else false,false,false - | Atom((`Right|`RRight),b,q) -> if b == (Ptset.mem q s2) then (true,false,true) else false,false,false - (* test some inlining *) + (* test some inlining *) | True -> true,true,true | False -> false,false,false + | Atom((`Left|`LLeft),b,q) -> if b == (Ptset.mem q s1) then (true,true,false) else false,false,false + | Atom(_,b,q) -> if b == (Ptset.mem q s2) then (true,false,true) else false,false,false | _ -> try HFEval.find hfeval (f.fid,s1,s2) @@ -453,7 +590,30 @@ type t = { | `Right _ -> l1,p::l2 | _ -> l1,l2 + + + + let tags_of_state a q = Hashtbl.fold + (fun p l acc -> + if p == q then + List.fold_left + (fun acc (ts,(_,_,aux)) -> + if aux then acc else + TagSet.cup ts acc) acc l + else acc) a.phi TagSet.empty + + + let tags a qs = + let ts = Ptset.fold (fun q acc -> TagSet.cup acc (tags_of_state a q)) qs TagSet.empty + in + if TagSet.is_finite ts + then `Positive(TagSet.positive ts) + else `Negative(TagSet.negative ts) + + + + let merge_trans t a tag q acc = List.fold_left (fun (accf,accm,acchtrue) (ts,(m,f,pred)) -> if TagSet.mem tag ts @@ -472,15 +632,26 @@ type t = { (or_ tmpf accf,accm||m,acchtrue||hastrue) else (accf,accm,acchtrue) ) acc (try Hashtbl.find a.phi q with Not_found -> []) + + let inter_text a b = + match b with + | `Positive s -> let r = Ptset.inter a s in (r,Ptset.mem Tag.pcdata r, true) + | `Negative s -> (Ptset.empty, not (Ptset.mem Tag.pcdata s), false) + + let mk_nil_ctx x _ = Tree.mk_nil x + let next_sibling_ctx x _ = Tree.next_sibling x + let r_ignore _ x = x + + let get_trans t a tag r = - try - let mark,f,predl,has_true = + try + let dispatch,mark,f = HTagSet.find a.sigma (r,tag) - in f.st,f,mark,has_true,r + in f.st,dispatch,f,mark,r with - Not_found -> - let f,mark,has_true,accq = + Not_found -> + let f,mark,_,accq = Ptset.fold (fun q (accf,accm,acchtrue,accq) -> let naccf,naccm,nacctrue = merge_trans t a tag q (accf,accm,acchtrue ) @@ -490,290 +661,119 @@ type t = { ) r (false_,false,false,Ptset.empty) in - HTagSet.add a.sigma (accq,tag) (mark,f,([],[]),has_true); - f.st,f,mark,has_true,accq - - let h_union = Hashtbl.create 4097 - - let pt_cup s1 s2 = - let h = (Ptset.hash s1,Ptset.hash s2) in - try - Hashtbl.find h_union h - with - | Not_found -> let s = Ptset.union s1 s2 - in - Hashtbl.add h_union h s;s - - - - let tags_of_state a q = Hashtbl.fold - (fun p l acc -> - if p == q then - List.fold_left - (fun acc (ts,_) -> - pt_cup (TagSet.positive ts) acc) acc l - else acc) a.phi Ptset.empty - - let h_tags_states = Hashtbl.create 4096 - - - - - let tags a qs = - try - Hashtbl.find h_tags_states (Ptset.hash qs) - with - | Not_found -> - let l = Ptset.fold (fun q acc -> pt_cup acc (tags_of_state a q)) qs Ptset.empty + let (ls,lls,_),(rs,rrs,_) = f.st in + let tb,ta = + Tree.tags t tag + in + let tl,htlt,lfin = inter_text tb (tags a ls) + and tll,htllt,llfin = inter_text tb (tags a lls) + and tr,htrt,rfin = inter_text ta (tags a rs) + and trr,htrrt,rrfin = inter_text ta (tags a rrs) in - Hashtbl.add h_tags_states (Ptset.hash qs) l;l - - let time cpt acc f x = - let t1 = Unix.gettimeofday () in - let r = f x in - let t2 = Unix.gettimeofday () in - let t = (1000. *.(t2 -. t1)) in - acc:=!acc+.t; - incr cpt; - r - - - let h_time = Hashtbl.create 4096 - let calls = ref 0 - - let rtime s f x = - - let cpt,atime = - try - Hashtbl.find h_time s - with - | _ -> (ref 0, ref 0.) - in - let r = time cpt atime f x - in - Hashtbl.replace h_time s (cpt,atime); - r - - let rec accepting_among_time a t r ctx = - incr calls; - let orig = r in - let rest = Ptset.inter r a.final in - let r = Ptset.diff r rest in - if Ptset.is_empty r then rest,TS.empty else - if Tree.is_node t - then - let among,result,form = - let ((ls,lls),(rs,rrs)),formula,mark,has_true,r' = - let tag = rtime "Tree.tag" Tree.tag t in - rtime "get_trans" (get_trans t a tag) r - in - let tl = rtime "tags" (tags a) ls - and tr = rtime "tags" (tags a) rs - and tll = rtime "tags" (tags a) lls - and trr = rtime "tags" (tags a) rrs - in - let first = - if Ptset.mem Tag.pcdata (pt_cup tl tll) - then - rtime "Tree.text_below" (Tree.text_below) t - else - let etl = Ptset.is_empty tl - and etll = Ptset.is_empty tll - in - if etl && etll - then Tree.mk_nil t + let first,flabel = + if (llfin && lfin) then (* no stars *) + (if htlt || htllt then (Tree.text_below, "#text_below") + else + let etl = Ptset.is_empty tl + and etll = Ptset.is_empty tll + in + if (etl && etll) + then (Tree.mk_nil, "#mk_nil") + else + if etl then + if Ptset.is_singleton tll + then (Tree.tagged_desc (Ptset.choose tll), "#tagged_desc") + else (Tree.select_desc_only tll, "#select_desc_only") + else if etll then (Tree.node_child,"#node_child") + else (Tree.select_below tl tll,"#select_below")) + else (* stars or node() *) + if htlt||htllt then (Tree.first_child,"#first_child") + else (Tree.node_child,"#node_child") + and next,nlabel = + if (rrfin && rfin) then (* no stars *) + ( if htrt || htrrt + then (Tree.text_next, "#text_next") else - if etl then rtime "Tree.tagged_desc_only" (Tree.tagged_desc_only t) tll - else if etll then rtime "Tree.first_child" (Tree.first_child) t - else (* add child only *) - rtime "Tree.tagged_below" (Tree.tagged_below t tl) tll - and next = - if Ptset.mem Tag.pcdata (pt_cup tr trr) - then - rtime "Tree.text_next" (Tree.text_next t) ctx - else - let etr = Ptset.is_empty tr - and etrr = Ptset.is_empty trr - in - if etr && etrr - then Tree.mk_nil t - else - if etr then rtime "Tree.tagged_foll_only" (Tree.tagged_foll_only t trr) ctx - else if etrr then rtime "Tree.next_sibling" (Tree.next_sibling) t - else (* add ns only *) - rtime "Tree.tagged_next" (Tree.tagged_next t tr trr) ctx - - in - let s1,res1 = accepting_among_time a first (pt_cup ls lls) t - and s2,res2 = accepting_among_time a next (pt_cup rs rrs) ctx - in - let rb,rb1,rb2 = rtime "eval_form_bool" (eval_form_bool formula s1) s2 in - if rb - then - let res1 = if rb1 then res1 else TS.empty - and res2 = if rb2 then res2 else TS.empty - in r', rtime "TS.concat" (TS.concat res2) (if mark then rtime "TS.append" (TS.append t) res1 else res1),formula - else Ptset.empty,TS.empty,formula - + let etr = Ptset.is_empty tr + and etrr = Ptset.is_empty trr + in + if etr && etrr + then (mk_nil_ctx, "#mk_nil_ctx") + else + if etr then + if Ptset.is_singleton trr + then (Tree.tagged_foll_below (Ptset.choose trr),"#tagged_foll_below") + else (Tree.select_foll_only trr,"#select_foll_only") + else if etrr then (Tree.node_sibling_ctx,"#node_sibling_ctx") + else + (Tree.select_next tr trr,"#select_next") ) + + else if htrt || htrrt then (Tree.next_sibling_ctx,"#next_sibling_ctx") + else (Tree.node_sibling_ctx,"#node_sibling_ctx") + in + let dispatch = { first = first; flabel = flabel; next = next; nlabel = nlabel} in - - among,result - - else orig,TS.empty - - - let run_time a t = - let st,res = accepting_among_time a t a.init t in - let _ = Printf.eprintf "\n Timings\n"; - let total_time = Hashtbl.fold (fun fname ({ contents=cpt }, {contents=atime}) (total_time) -> - Printf.eprintf "%s\t %i calls, %f ms accumulated time, %f ms mean time\n" - fname cpt atime (atime /. (float_of_int cpt)); - total_time +. atime ) h_time 0. - in - Printf.eprintf "total calls %i, total monitored time %f ms\n%!" !calls total_time - in - if Ptset.is_empty (st) then TS.empty else res - - - - let rec accepting_among a t r ctx = - let orig = r in - let rest = Ptset.inter r a.final in - let r = Ptset.diff r rest in - if Ptset.is_empty r then rest,TS.empty else - if Tree.is_node t - then - let among,result,form = - let ((ls,lls),(rs,rrs)),formula,mark,has_true,r' = - let tag = Tree.tag t in - get_trans t a tag r - in - let tl = tags a ls - and tr = tags a rs - and tll = tags a lls - and trr = tags a rrs - in - let first = - if Ptset.mem Tag.pcdata (pt_cup tl tll) - then - Tree.text_below t - else - let etl = Ptset.is_empty tl - and etll = Ptset.is_empty tll - in - if etl && etll - then Tree.mk_nil t - else - if etl then Tree.tagged_desc_only t tll - else if etll then Tree.first_child t - else (* add child only *) - Tree.tagged_below t tl tll - and next = - if Ptset.mem Tag.pcdata (pt_cup tr trr) - then - Tree.text_next t ctx - else - let etr = Ptset.is_empty tr - and etrr = Ptset.is_empty trr - in - if etr && etrr - then Tree.mk_nil t - else - if etr then Tree.tagged_foll_only t trr ctx - else if etrr then Tree.next_sibling t - else (* add ns only *) - Tree.tagged_next t tr trr ctx - - in - let s1,res1 = accepting_among a first (pt_cup ls lls) t - and s2,res2 = accepting_among a next (pt_cup rs rrs) ctx - in - let rb,rb1,rb2 = eval_form_bool formula s1 s2 in - if rb - then - let res1 = if rb1 then res1 else TS.empty - and res2 = if rb2 then res2 else TS.empty - in r', TS.concat res2 (if mark then TS.append t res1 else res1),formula - else Ptset.empty,TS.empty,formula - - in - among,result + HTagSet.add a.sigma (accq,tag) (dispatch,mark,f); + f.st,dispatch,f,mark,accq - else orig,TS.empty - - - let run a t = - let st,res = accepting_among a t a.init t in - if Ptset.is_empty (st) then TS.empty else res + let rec accepting_among a t orig ctx = + let rest = Ptset.inter orig a.universal in + let r = Ptset.diff orig rest in + if Ptset.is_empty r then rest,0,TS.empty else + if Tree.is_nil t + then orig,0,TS.empty + else + let ((_,_,llls),(_,_,rrrs)),dispatch,formula,mark,r' = + get_trans t a (Tree.tag t) r + in + let s1,n1,res1 = accepting_among a (dispatch.first t) llls t in + let s2,n2,res2 = accepting_among a (dispatch.next t ctx) rrrs ctx in + let rb,rb1,rb2 = eval_form_bool formula s1 s2 in + if rb + then + let n1,res1 = if rb1 then n1,res1 else 0,TS.empty + and n2,res2 = if rb2 then n2,res2 else 0,TS.empty + in + if mark + then r',1+n1+n2,TS.Cons(t,(TS.Concat(res1,res2))) + else r',n1+n2,TS.Concat(res1,res2) + else Ptset.empty,0,TS.empty - let rec accepting_among_count a t r ctx = - let orig = r in - let rest = Ptset.inter r a.final in - let r = Ptset.diff r rest in + + let rec accepting_among_count a t orig ctx = + let rest = Ptset.inter orig a.universal in + let r = Ptset.diff orig rest in if Ptset.is_empty r then rest,0 else if Tree.is_node t then - let ((ls,lls),(rs,rrs)),formula,mark,has_true,r' = - let tag = Tree.tag t in - get_trans t a tag r - in - let tl = tags a ls - and tr = tags a rs - and tll = tags a lls - and trr = tags a rrs - in - let first = - if Ptset.mem Tag.pcdata (pt_cup tl tll) - then - Tree.text_below t - else - let etl = Ptset.is_empty tl - and etll = Ptset.is_empty tll - in - if etl && etll - then Tree.mk_nil t - else - if etl then Tree.tagged_desc_only t tll - else if etll then Tree.first_child t - else (* add child only *) - Tree.tagged_below t tl tll - and next = - if Ptset.mem Tag.pcdata (pt_cup tr trr) - then - Tree.text_next t ctx - else - let etr = Ptset.is_empty tr - and etrr = Ptset.is_empty trr - in - if etr && etrr - then Tree.mk_nil t - else - if etr then Tree.tagged_foll_only t trr ctx - else if etrr then Tree.next_sibling t - else (* add ns only *) - Tree.tagged_next t tr trr ctx - + let ((_,_,llls),(_,_,rrrs)),dispatch,formula,mark,r' = + get_trans t a (Tree.tag t) r in - let s1,res1 = accepting_among_count a first (pt_cup ls lls) t - and s2,res2 = accepting_among_count a next (pt_cup rs rrs) ctx + let s1,res1 = accepting_among_count a (dispatch.first t) llls t + and s2,res2 = accepting_among_count a (dispatch.next t ctx) rrrs ctx in let rb,rb1,rb2 = eval_form_bool formula s1 s2 in - if rb - then - let res1 = if rb1 then res1 else 0 - and res2 = if rb2 then res2 else 0 - in r', res2 + (if mark then 1 + res1 else res1) - else Ptset.empty,0 - - - + if rb + then + let res1 = if rb1 then res1 else 0 + and res2 = if rb2 then res2 else 0 + in r', if mark then 1+res1+res2 else res1+res2 + else Ptset.empty,0 else orig,0 - + + let run a t = + let st,n,res = accepting_among a t a.init t in + if Ptset.is_empty (st) then TS.empty,0 else res,n + + + let run_count a t = let st,res = accepting_among_count a t a.init t in - if Ptset.is_empty (st) then 0 else res + if Ptset.is_empty (st) then 0 else res + + let run_time _ _ = failwith "blah" diff --git a/ata.mli b/ata.mli index bf9368f..cd6610b 100644 --- a/ata.mli +++ b/ata.mli @@ -1,26 +1,23 @@ -(* module Ptset : sig - include Set.S with type elt = int - val from_list : elt list -> t - end - *) +module TS : sig + type t + val empty : t + val cons : Tree.t -> t -> t + val append : Tree.t -> t -> t + val concat : t -> t -> t + val length : t -> int + val iter : (Tree.t -> unit) -> t -> unit +end type state = int val mk_state : unit -> state -type predicate = [ `Left of (Tree.Binary.t -> bool) | `Right of (Tree.Binary.t -> bool) | - `True - ] - - -val eval_pred : Tree.Binary.t -> predicate -> bool - type formula_expr = False | True | Or of formula * formula | And of formula * formula | Atom of ([ `Left | `Right | `LLeft | `RRight ] * bool * state) -and formula = { fid : int; fkey : int; pos : formula_expr; neg : formula; st : (Ptset.t*Ptset.t)*(Ptset.t*Ptset.t); size: int;} +and formula = { fid : int; fkey : int; pos : formula_expr; neg : formula; st : (Ptset.t*Ptset.t*Ptset.t)*(Ptset.t*Ptset.t*Ptset.t); size: int;} val true_ : formula val false_ : formula val atom_ : [`Left | `Right | `LLeft | `RRight ] -> bool -> state -> formula @@ -33,27 +30,30 @@ val pr_frm : Format.formatter -> formula -> unit module HTagSet : Hashtbl.S with type key = Ptset.t*Tag.t -type t = { - id : int; - mutable states : Ptset.t; - init : Ptset.t; - mutable final : Ptset.t; - universal : Ptset.t; - phi : (state,(TagSet.t*(bool*formula*predicate)) list) Hashtbl.t; - delta : (state*Tag.t, (bool*formula*predicate)) Hashtbl.t; -(* delta : (state,(bool*formula*predicate) TagMap.t) Hashtbl.t; *) - sigma : (bool*formula*(predicate list*predicate list)*bool) HTagSet.t; - +type dispatch = { first : Tree.t -> Tree.t; + flabel : string; + next : Tree.t -> Tree.t -> Tree.t; + nlabel : string; + } +type t = { + id : int; + mutable states : Ptset.t; + init : Ptset.t; + mutable final : Ptset.t; + universal : Ptset.t; + (* Transitions of the Alternating automaton *) + phi : (state,(TagSet.t*(bool*formula*bool)) list) Hashtbl.t; + sigma : (dispatch*bool*formula) HTagSet.t; } val dump : Format.formatter -> t -> unit module Transitions : sig -type t = state*TagSet.t*bool*formula*predicate +type t = state*TagSet.t*bool*formula*bool (* Doing this avoid the parenthesis *) val ( ?< ) : state -> state -val ( >< ) : state -> TagSet.t*bool -> state*(TagSet.t*bool*predicate) -val ( ><@ ) : state -> TagSet.t*bool*predicate -> state*(TagSet.t*bool*predicate) -val ( >=> ) : state*(TagSet.t*bool*predicate) -> formula -> t +val ( >< ) : state -> TagSet.t*bool -> state*(TagSet.t*bool*bool) +val ( ><@ ) : state -> TagSet.t*bool -> state*(TagSet.t*bool*bool) +val ( >=> ) : state*(TagSet.t*bool*bool) -> formula -> t val ( +| ) : formula -> formula -> formula val ( *& ) : formula -> formula -> formula val ( ** ) : [`Left | `Right | `LLeft | `RRight ] -> state -> formula @@ -62,23 +62,12 @@ end type transition = Transitions.t val equal_trans : transition -> transition -> bool -module TS : sig - type t - val empty : t - val cons : Tree.Binary.t -> t -> t - val append : Tree.Binary.t -> t -> t - val concat : t -> t -> t - val to_list_rev : t -> Tree.Binary.t list - val length : t -> int - val iter : (Tree.Binary.t -> unit) -> t -> unit - val rev_iter : (Tree.Binary.t -> unit) -> t -> unit - val find : (Tree.Binary.t -> bool) -> t -> Tree.Binary.t -end + (*module BottomUpJumpNew : sig *) - val run : t -> Tree.Binary.t -> TS.t - val run_count : t -> Tree.Binary.t -> int - val run_time : t -> Tree.Binary.t -> TS.t + val run : t -> Tree.t -> TS.t*int + val run_count : t -> Tree.t -> int + val run_time : t -> Tree.t -> TS.t*int (*end *) diff --git a/main.ml b/main.ml index ab043b7..a7250a0 100644 --- a/main.ml +++ b/main.ml @@ -22,7 +22,7 @@ let total_time () = List.fold_left (+.) 0. !l;; let main v query output = - let _ = Tag.init (Tree.Binary.tag_pool v) in + let _ = Tag.init (Tree.tag_pool v) in Printf.eprintf "Parsing query : "; let query = try time @@ -44,16 +44,16 @@ let main v query output = let _ = match contains with None -> () | Some s -> - let r = Tree.Binary.count v s + let r = Tree.count v s in Printf.eprintf "Global count is %i, using " r; if r < 60000 then begin Printf.eprintf "TextCollection contains\nCalling global contains : "; - time (Tree.Binary.init_contains v) s + time (Tree.init_contains v) s end else begin Printf.eprintf "Naive contains\nCalling global contains : "; - time (Tree.Binary.init_naive_contains v) s + time (Tree.init_naive_contains v) s end in Printf.eprintf "Execution time %s : " (if !Options.count_only then "(counting only)" else ""); @@ -63,9 +63,9 @@ let main v query output = let _ = Printf.eprintf "Number of nodes in the result set : %i\n%!" r in () else - (* let _ = Gc.set ({ Gc.get() with Gc.max_overhead = 1000000; Gc.space_overhead = 100 }) in *) - let result = time (if !Options.time then run_time auto else run auto) v in - Printf.eprintf "Number of nodes in the result set : %i\n" (TS.length result); +(* let _ = Gc.set ({ Gc.get() with Gc.max_overhead = 1000000; Gc.space_overhead = 100 }) in *) + let result,rcount = time (if !Options.time then run_time auto else run auto) v in + Printf.eprintf "Number of nodes in the result set : %i\n" rcount; Printf.eprintf "\n%!"; begin match output with @@ -75,11 +75,12 @@ let main v query output = time( fun () -> let oc = open_out f in output_string oc "\n"; - TS.rev_iter (fun t -> output_string oc "----------\n"; - Tree.Binary.print_xml_fast oc t; + TS.iter (fun t -> output_string oc "----------\n"; + Tree.print_xml_fast oc t; output_char oc '\n') result) (); end; end; + let _ = Ata.dump Format.err_formatter auto in Printf.eprintf "Total running time : %fms\n%!" (total_time()) ;; @@ -90,19 +91,19 @@ let v = then begin Printf.eprintf "Loading from file : "; - time (Tree.Binary.load ~sample:!Options.sample_factor ) + time (Tree.load ~sample:!Options.sample_factor ) (Filename.chop_suffix !Options.input_file ".srx"); end else let v = - time (fun () -> let v = Tree.Binary.parse_xml_uri !Options.input_file; + time (fun () -> let v = Tree.parse_xml_uri !Options.input_file; in Printf.eprintf "Parsing document : %!";v ) () in if !Options.save_file <> "" then begin Printf.eprintf "Writing file to disk : "; - time (Tree.Binary.save v) !Options.save_file; + time (Tree.save v) !Options.save_file; end; v in diff --git a/ptset.ml b/ptset.ml index 4b2c845..e16cc2c 100644 --- a/ptset.ml +++ b/ptset.ml @@ -45,13 +45,7 @@ module Node = end module WH =Weak.Make(Node) -(* struct - include Hashtbl.Make(Node) - let merge h v = - if mem h v then v - else (add h v v;v) -end -*) + let pool = WH.create 4093 (* Neat trick thanks to Alain Frisch ! *) @@ -89,6 +83,9 @@ let branch_ne = function let zero_bit k m = (k land m) == 0 let singleton k = leaf k +let is_singleton n = + match n.node with Leaf _ -> true + | _ -> false let rec mem k n = match n.node with | Empty -> false @@ -186,35 +183,38 @@ let rec min_elt n = match n.node with let compare a b = if a == b then 0 else a.id - b.id + let h_merge = Hashtbl.create 4097 + let com_hash x y = (x*y - (x+y)) land max_int let rec merge s t = if (equal s t) (* This is cheap thanks to hash-consing *) then s else - match s.node,t.node with - | Empty, _ -> t - | _, Empty -> s - | Leaf k, _ -> add k t - | _, Leaf k -> add k s - | Branch (p,m,s0,s1), Branch (q,n,t0,t1) -> - if m == n && match_prefix q p m then - branch p m (merge s0 t0) (merge s1 t1) - else if m > n && match_prefix q p m then - if zero_bit q m then - branch p m (merge s0 t) s1 - else - branch p m s0 (merge s1 t) - else if m < n && match_prefix p q n then - if zero_bit p n then - branch q n (merge s t0) t1 - else - branch q n t0 (merge s t1) + match s.node,t.node with + | Empty, _ -> t + | _, Empty -> s + | Leaf k, _ -> add k t + | _, Leaf k -> add k s + | Branch (p,m,s0,s1), Branch (q,n,t0,t1) -> + if m == n && match_prefix q p m then + branch p m (merge s0 t0) (merge s1 t1) + else if m > n && match_prefix q p m then + if zero_bit q m then + branch p m (merge s0 t) s1 + else + branch p m s0 (merge s1 t) + else if m < n && match_prefix p q n then + if zero_bit p n then + branch q n (merge s t0) t1 else - (* The prefixes disagree. *) - join p s q t - - - + branch q n t0 (merge s t1) + else + (* The prefixes disagree. *) + join p s q t + + + + let rec subset s1 s2 = (equal s1 s2) || match (s1.node,s2.node) with | Empty, _ -> true @@ -232,8 +232,10 @@ let rec min_elt n = match n.node with else false - let union s t = - merge s t + + + + let union s1 s2 = merge s1 s2 let rec inter s1 s2 = if equal s1 s2 diff --git a/ptset.mli b/ptset.mli index 8a25ffc..47c28ba 100644 --- a/ptset.mli +++ b/ptset.mli @@ -84,6 +84,8 @@ val max_elt : t -> int intersection. *) val intersect : t -> t -> bool +val is_singleton : t -> bool + val hash : t -> int val from_list : int list -> t diff --git a/tree.ml b/tree.ml index 487a057..4cf4047 100644 --- a/tree.ml +++ b/tree.ml @@ -4,526 +4,340 @@ (* Copyright NICTA 2008 *) (* Distributed under the terms of the LGPL (see LICENCE) *) (******************************************************************************) -INCLUDE "debug.ml" -module type BINARY = -sig - type node_content - type string_content - type descr = Nil | Node of node_content |String of string_content - type t - val parse_xml_uri : string -> t - val parse_xml_string : string -> t - val save : t -> string -> unit - val load : ?sample:int -> string -> t - val tag_pool : t -> Tag.pool - val string : t -> string - val descr : t -> descr - val is_node : t -> bool - val left : t -> t - val right : t -> t - val first_child : t -> t - val next_sibling : t -> t - val parent : t -> t - val root : t -> t - val is_root : t -> bool - val id : t -> int - val tag : t -> Tag.t - val print_xml_fast : out_channel -> t -> unit - val compare : t -> t -> int - val equal : t -> t -> bool - module DocIdSet : - sig - include Set.S - end - with type elt = string_content - val string_below : t -> string_content -> bool - val contains : t -> string -> DocIdSet.t - val contains_old : t -> string -> DocIdSet.t - val contains_iter : t -> string -> DocIdSet.t - val count_contains : t -> string -> int - val count : t -> string -> int - val dump : t -> unit - val get_string : t -> string_content -> string - val has_tagged_desc : t -> Tag.t -> bool - val has_tagged_foll : t -> Tag.t -> bool - val tagged_desc : t -> Tag.t -> t - val tagged_foll : t -> Tag.t -> t - val tagged_below : t -> Ptset.t -> Ptset.t -> t - val tagged_next : t -> Ptset.t -> Ptset.t -> t -> t - val tagged_desc_only : t -> Ptset.t -> t - val tagged_foll_only : t -> Ptset.t -> t -> t - val text_below : t -> t - val text_next : t -> t -> t - val init_tagged_next : t -> Tag.t -> unit - val subtree_tags : t -> Tag.t -> int - val is_left : t -> bool - val print_id : Format.formatter -> t -> unit - val test_xml_tree : Format.formatter -> Ptset.t -> t -> unit - val init_contains : t -> string -> unit - val init_naive_contains : t -> string -> unit - val mk_nil : t -> t - val test_jump : t -> Tag.t -> unit - val time_xml_tree : t -> Tag.t -> int list - val time_xml_tree2 : t -> Tag.t -> int list -end - -module XML = -struct - - type t - type 'a node = int - type node_kind = [`Text | `Tree ] +(*INCLUDE "debug.ml" *) - let compare : 'a node -> 'a node -> int = (-) - let equal : 'a node -> 'a node -> bool = (==) - - (* abstract type, values are pointers to a XMLTree C++ object *) +type tree +type 'a node = int +type node_kind = [`Text | `Tree ] - external int_of_node : 'a node -> int = "%identity" - - external parse_xml_uri : string -> int -> bool -> bool -> t = "caml_call_shredder_uri" - external parse_xml_string : string -> int -> bool -> bool -> t = "caml_call_shredder_string" - - external save_tree : t -> string -> unit = "caml_xml_tree_save" - external load_tree : string -> int -> t = "caml_xml_tree_load" - - - module Text = - struct - let equal : [`Text] node -> [`Text] node -> bool = equal - - (* Todo *) - external nullt : unit -> [`Text ] node = "caml_xml_tree_nullt" - let nil = nullt () - external get_text : t -> [`Text] node -> string = "caml_text_collection_get_text" - -(* let get_text t n = - if equal nil n then "" - else get_text t n -*) +let compare_node : 'a node -> 'a node -> int = (-) +let equal_node : 'a node -> 'a node -> bool = (==) + +(* abstract type, values are pointers to a XMLTree C++ object *) + +external int_of_node : 'a node -> int = "%identity" + +external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shredder_uri" +external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string" + +external save_tree : tree -> string -> unit = "caml_xml_tree_save" +external load_tree : string -> int -> tree = "caml_xml_tree_load" + +external nullt : unit -> 'a node = "caml_xml_tree_nullt" + +let nil : 'a node = Obj.magic (-1) + +external text_get_text : tree -> [`Text] node -> string = "caml_text_collection_get_text" - external is_empty : t -> [`Text ] node -> bool = "caml_text_collection_empty_text" - - let is_empty t n = - (equal nil n) || is_empty t n - - external get_cached_text : t -> [`Text ] node -> string = "caml_text_collection_get_cached_text" - +external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text" - let get_text t n = - if equal nil n then "" - else get_cached_text t n +let text_is_empty t n = + (equal_node nil n) || text_is_empty t n + +external get_cached_text : tree -> [`Text ] node -> string = "caml_text_collection_get_cached_text" + + +let text_get_text t n = + if equal_node nil n then "" + else get_cached_text t n - external size : t -> int = "caml_text_collection_size" - external is_contains : t -> string -> bool = "caml_text_collection_is_contains" - external count_contains : t -> string -> int = "caml_text_collection_count_contains" - external count : t -> string -> int = "caml_text_collection_count" - external contains : t -> string -> [`Text ] node array = "caml_text_collection_contains" - end +external text_size : tree -> int = "caml_text_collection_size" +external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains" +external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains" +external text_count : tree -> string -> int = "caml_text_collection_count" +external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains" - module Tree = - struct +external tree_serialize : tree -> string -> unit = "caml_xml_tree_serialize" - let equal : [`Tree ] node -> [`Tree] node -> bool = equal - external serialize : t -> string -> unit = "caml_xml_tree_serialize" - external unserialize : string -> t = "caml_xml_tree_unserialize" +external tree_unserialize : string -> tree = "caml_xml_tree_unserialize" - external root : t -> [`Tree] node = "caml_xml_tree_root" - external nullt : unit -> [`Tree ] node = "caml_xml_tree_nullt" - - let nil = nullt () - let is_nil x = equal x nil +external tree_root : tree -> [`Tree] node = "caml_xml_tree_root" - external parent : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" - external parent_doc : t -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" - external prev_doc : t -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" - external first_child : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" - +let tree_is_nil x = equal_node x nil - - external next_sibling : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" - external prev_sibling : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" - external is_leaf : t -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" +external tree_parent : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" +external tree_parent_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" +external tree_prev_doc : tree -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" +external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" +external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" +external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" +external tree_is_leaf : tree -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" -(* external tag : t -> [`Tree ] node -> T = "caml_xml_tree_tag"*) - external tag_id : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" - -(* - let tag_hash = Array.make 6_000_000 (Tag.nullt) +(* external tag : tree -> [`Tree ] node -> T = "caml_xml_tree_tag"*) +external tree_tag_id : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" - let tag_id t id = - let tag = tag_hash.(int_of_node id) - in - if tag != Tag.nullt then tag - else - let tag = tag_id t id in - (tag_hash.(int_of_node id) <- tag; tag) -*) - let is_last t n = equal nil (next_sibling t n) +let tree_is_last t n = equal_node nil (tree_next_sibling t n) - external prev_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" - - - external my_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" - external next_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" - external doc_ids : t -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids" - external text_xml_id : t -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" - external node_xml_id : t -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" - external is_ancestor : t -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" - external tagged_desc : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" - external tagged_foll : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_foll" - external subtree_tags : t -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" - external tagged_below : t -> [`Tree ] node -> Ptset.int_vector -> Ptset.int_vector -> [`Tree ] node = "caml_xml_tree_tagged_below" - external tagged_desc_only : t -> [`Tree ] node -> Ptset.int_vector -> [`Tree ] node = "caml_xml_tree_tagged_desc_only" - external tagged_next : t -> [`Tree ] node -> Ptset.int_vector -> Ptset.int_vector -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_next" - external tagged_foll_only : t -> [`Tree ] node -> Ptset.int_vector -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_only" - external tagged_desc_or_foll_only : t -> [`Tree ] node -> Ptset.int_vector -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_only" - external tagged_foll_below : t -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below" - - let test_jump tree tag = - let rec loop id ctx = - if id != nil - then - let first = tagged_desc tree id tag - and next = tagged_desc tree id tag - in - loop first id; - loop next ctx - in - loop (root tree) (root tree) +external tree_prev_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" + +external tree_my_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" +external tree_next_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" +external tree_doc_ids : tree -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids" +external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" +external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" +external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" +external tree_tagged_desc : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" +external tree_tagged_foll_below : tree -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below" +external tree_subtree_tags : tree -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" +external tree_select_below : tree -> [`Tree ] node -> Ptset.int_vector -> Ptset.int_vector -> [`Tree ] node = "caml_xml_tree_select_below" +external tree_select_desc_only : tree -> [`Tree ] node -> Ptset.int_vector -> [`Tree ] node = "caml_xml_tree_select_desc_only" +external tree_select_next : tree -> [`Tree ] node -> Ptset.int_vector -> Ptset.int_vector -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_select_next" +external tree_select_foll_only : tree -> [`Tree ] node -> Ptset.int_vector -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_select_foll_only" +external tree_select_desc_or_foll_only : tree -> [`Tree ] node -> Ptset.int_vector -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_select_foll_only" + +type descr = + | Nil + | Node of [`Tree] node + | Text of [`Text] node * [`Tree] node + +type t = { doc : tree; + node : descr; + ttable : (Tag.t,(Ptset.t*Ptset.t)) Hashtbl.t; + } + - - let test_xml_tree ppf tags v = - let pr x = Format.fprintf ppf x in - let rec aux id = - if (is_nil id) - then () - else - begin - pr "Node %i, (Tag) %i='%s' (GetTagName), NodeXMLId (Preorder)=%i\n%!" - (int_of_node id) - (tag_id v id) - (Tag.to_string (tag_id v id)) - (node_xml_id v id); - pr "DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) ParentDoc(my_text)=%i PrevDoc(next_text)=%i\n%!" - (int_of_node (prev_text v id)) - (Text.get_text v (prev_text v id)) - (int_of_node (my_text v id)) - (Text.get_text v (my_text v id)) - (int_of_node (next_text v id)) - (Text.get_text v (next_text v id)) - (int_of_node(parent_doc v (my_text v id))) - (int_of_node(prev_doc v (next_text v id))); - let i1,i2 = doc_ids v id in - pr "Testing DocIds below (%i,%i)*\n%!" - (int_of_node i1) (int_of_node i2); - pr "Testing Tagged*\n%!"; - Ptset.iter (fun t -> - let str = Tag.to_string t in - if Tag.pcdata <> t - then begin - pr "Tag: %s : \n%!" str; - pr "TaggedDesc = %i%!, " (tagged_desc v id t); - pr "TaggedFoll = %i\n%!" (tagged_foll v id t); - pr "SubtreeTags = %i\n%!" (subtree_tags v id t); - end) tags; - pr "----------------------------\n"; - aux(first_child v id); - aux(next_sibling v id); - end - in - aux (root v) - - let rrrr = ref 0 - - let time_xml_tree v tag = - - let rec aux id acc = - incr rrrr; - if (is_nil id) - then acc - else begin - let acc = - if tag == (tag_id v id) - then - id::acc - else acc - in - aux (next_sibling v id) (aux (first_child v id) acc); - end - in - let r = aux (root v) [] in - Printf.eprintf "%i\n%!" !rrrr;r - - let rrrr2 = ref 0 - let time_xml_tree2 v tag = - let rec aux id acc ctx= - incr rrrr2; - if (is_nil id) - then acc - else begin - let acc = - if tag == (tag_id v id) - then - id::acc - else acc - in - aux (tagged_foll_below v id tag ctx) (aux (tagged_desc v id tag) acc id) ctx; - end - in - let r = aux (root v) [] (root v) in - Printf.eprintf "%i\n%!" !rrrr2; r +let update h t sb sa = + let sbelow,safter = + try + Hashtbl.find h t + with + | Not_found -> Ptset.empty,Ptset.empty + in + Hashtbl.replace h t (Ptset.union sbelow sb, Ptset.union safter sa) +let collect_tags tree = + let h = Hashtbl.create 511 in + let rec loop id acc = + if equal_node id nil + then (Ptset.singleton Tag.pcdata, Ptset.add Tag.pcdata acc) + else + let below2,after2 = loop (tree_next_sibling tree id) acc in + let below1,after1 = loop (tree_first_child tree id) after2 in + let tag = tree_tag_id tree id in + update h tag below1 after2; + Ptset.add tag (Ptset.union below1 below2), (Ptset.add tag after1) + in + let b,a = loop (tree_root tree) Ptset.empty in + update h Tag.pcdata b a; + h - let print_skel t = - let rec aux id = - if (is_nil id) - then Printf.eprintf "#\n" - else - begin - Printf.eprintf "Node %i has tag '%i=%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) parent_doc(my_text)=%i\n%!" - (int_of_node id) - (tag_id t id) - (Tag.to_string (tag_id t id)) - (node_xml_id t id) - (int_of_node (prev_text t id)) - (Text.get_text t (prev_text t id)) - (int_of_node (my_text t id)) - (Text.get_text t (my_text t id)) - (int_of_node (next_text t id)) - (Text.get_text t (next_text t id)) - (int_of_node(parent_doc t (my_text t id))); - - aux(first_child t id); - aux(next_sibling t id); - end - in - aux (root t) - - let traversal t = - let rec aux id = - if not (is_nil id) - then - begin - (* ignore (tag t id); - ignore (Text.get_text t (prev_text t id)); - if (is_leaf t id) - then ignore (Text.get_text t (my_text t id)); - if (is_last t id) - then ignore (Text.get_text t (next_text t id)); *) - aux (first_child t id); - aux (next_sibling t id); - end - in - aux (root t) - - end - +let contains_array = ref [| |] + +let init_contains t s = + let a = text_contains t.doc s + in + Array.fast_sort (compare) a; + contains_array := a - module Binary = struct - - type node_content = - NC of [`Tree ] node - | SC of [`Text ] node * [`Tree ] node - type string_content = [ `Text ] node - type descr = - | Nil - | Node of node_content - | String of string_content - - type doc = t - - type t = { doc : doc; - node : descr } - - let dump { doc=t } = Tree.print_skel t - let test_xml_tree ppf tags { doc=t } = Tree.test_xml_tree ppf tags t - let time_xml_tree { doc=t } tag = Tree.time_xml_tree t tag - let time_xml_tree2 { doc=t } tag = Tree.time_xml_tree2 t tag - let test_jump { doc=t } tag = Tree.test_jump t tag - let contains_array = ref [| |] - - let init_contains t s = - let a = Text.contains t.doc s +let init_naive_contains t s = + let i,j = tree_doc_ids t.doc (tree_root t.doc) + in + let regexp = Str.regexp_string s in + let matching arg = + try + let _ = Str.search_forward regexp arg 0; + in true + with _ -> false + in + let rec loop n acc l = + if n >= j then acc,l + else + let s = text_get_text t.doc n in - Array.fast_sort (compare) a; - contains_array := a + if matching s + then loop (n+1) (n::acc) (l+1) + else loop (n+1) acc l + in + let acc,l = loop i [] 0 in + let a = Array.create l nil in + let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc + in + contains_array := a - let init_naive_contains t s = - let i,j = Tree.doc_ids t.doc (Tree.root t.doc) - in - let regexp = Str.regexp_string s in - let matching arg = - try - let _ = Str.search_forward regexp arg 0; - in true - with _ -> false - in - let rec loop n acc l = - if n >= j then acc,l - else - let s = (*Printf.eprintf "%i \n%!" n;*)Text.get_cached_text t.doc n - in - if matching s - then loop (n+1) (n::acc) (l+1) - else loop (n+1) acc l - in - let acc,l = loop i [] 0 in - let a = Array.create l Text.nil in - let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc - in - contains_array := a - - - - module DocIdSet = struct - include Set.Make (struct type t = string_content - let compare = (-) end) - - end - let is_node = function { node=Node(_) } -> true | _ -> false - - let get_string t (i:string_content) = Text.get_text t.doc i - open Tree - let node_of_t t = { doc= t; - node = Node(NC (root t)) } - - - let parse_xml_uri str = node_of_t - (MM((parse_xml_uri str - !Options.sample_factor - !Options.index_empty_texts - !Options.disable_text_collection),__LOCATION__)) - let parse_xml_string str = node_of_t - (MM((parse_xml_string str - !Options.sample_factor - !Options.index_empty_texts - !Options.disable_text_collection),__LOCATION__)) +module DocIdSet = struct + include Set.Make (struct type t = [`Text] node + let compare = compare_node end) + +end +let is_nil t = match t.node with + | Nil -> true + | Node(i) -> equal_node i nil + | _ -> false - let save t str = save_tree t.doc str +let is_node t = +match t.node with + | Node(i) -> not(equal_node i nil) + | _ -> false - let load ?(sample=64) str = node_of_t (load_tree str sample) +let node_of_t t = + let _ = Tag.init (Obj.magic t) in + let table = collect_tags t + in +(* + let _ = Hashtbl.iter (fun t (sb,sa) -> + Printf.eprintf "'%s' -> { " (Tag.to_string t); + Ptset.iter (fun i -> Printf.eprintf "'%s' " (Tag.to_string i)) sb; + Printf.eprintf "}\n { "; + Ptset.iter (fun i -> Printf.eprintf "'%s' " (Tag.to_string i)) sa; + Printf.eprintf "} \n----------------------------------\n"; + ) table in +*) + { doc= t; + node = Node(tree_root t); + ttable = table; + } + + +let parse_xml_uri str = node_of_t + (parse_xml_uri str + !Options.sample_factor + !Options.index_empty_texts + !Options.disable_text_collection) + +let parse_xml_string str = node_of_t + (parse_xml_string str + !Options.sample_factor + !Options.index_empty_texts + !Options.disable_text_collection) + +external pool : tree -> Tag.pool = "%identity" +let save t str = save_tree t.doc str +let load ?(sample=64) str = + node_of_t (load_tree str sample) + - external pool : doc -> Tag.pool = "%identity" - let tag_pool t = pool t.doc - let compare a b = match a.node,b.node with - | Node(NC i),Node(NC j) -> compare i j - | _, Node(NC( _ )) -> 1 - | Node(SC (i,_)),Node(SC (j,_)) -> compare i j - | Node(NC( _ )),Node(SC (_,_)) -> -1 - | _, Node(SC (_,_)) -> 1 - | String i, String j -> compare i j - | Node _ , String _ -> -1 - | _ , String _ -> 1 - | Nil, Nil -> 0 - | _,Nil -> -1 - let equal a b = (compare a b) == 0 +let tag_pool t = pool t.doc + +let compare a b = match a.node,b.node with + | Nil, Nil -> 0 + | Nil,_ -> 1 + | _ , Nil -> -1 + | Node(i),Node(j) -> compare_node i j + | Text(i,_), Text(j,_) -> compare_node i j + | Node(i), Text(_,j) -> compare_node i j + | Text(_,i), Node(j) -> compare_node i j + +let equal a b = (compare a b) == 0 + + +let norm (n : [`Tree ] node ) = if tree_is_nil n then Nil else Node (n) + +let nts = function + Nil -> "Nil" + | Text (i,j) -> Printf.sprintf "Text (%i, %i)" i j + | Node (i) -> Printf.sprintf "Node (%i)" i + +let mk_nil t = { t with node = Nil } +let root n = { n with node = norm (tree_root n.doc) } - let string t = match t.node with - | String i -> Text.get_text t.doc i - | _ -> assert false - - let norm (n : [`Tree ] node ) = if is_nil n then Nil else Node (NC n) - - let descr t = t.node - - let nts = function - Nil -> "Nil" - | String i -> Printf.sprintf "String %i" i - | Node (NC t) -> Printf.sprintf "Node (NC %i)" (int_of_node t) - | Node (SC (t,i)) -> Printf.sprintf "Node (SC (%i,%i))" (int_of_node t) (int_of_node i) - - let mk_nil t = { t with node = Nil } - let root n = { n with node = norm (Tree.root n.doc) } - let is_root n = match n.node with - | Node(NC t) -> (int_of_node t) == 0 - | _ -> false - - let parent n = - if is_root n then { n with node=Nil} - else - let node' = - match n.node with - | Node(NC t) -> - let txt = prev_text n.doc t in - if Text.is_empty n.doc txt then - let ps = Tree.prev_sibling n.doc t in - if is_nil ps - then - Node(NC (Tree.parent n.doc t)) - else Node(NC ps) - else - Node(SC (txt,t)) - | Node(SC(i,t)) -> - let ps = Tree.prev_sibling n.doc t in - if is_nil ps - then Node (NC(parent_doc n.doc i)) - else Node(NC ps) - | _ -> failwith "parent" - in - { n with node = node' } - - let first_child n = - let node' = - match n.node with - | Node (NC t) when is_leaf n.doc t -> - let txt = my_text n.doc t in - if Text.is_empty n.doc txt +let is_root n = match n.node with + | Node(t) -> (int_of_node t) == 0 + | _ -> false + +let parent n = + let node' = + match n.node with (* inlined parent *) + | Node(t) when (int_of_node t)== 0 -> Nil + | Node(t) -> + let txt = tree_prev_text n.doc t in + if text_is_empty n.doc txt then + let ps = tree_prev_sibling n.doc t in + if tree_is_nil ps + then + Node(tree_parent n.doc t) + else Node(ps) + else + Text(txt,t) + | Text(i,t) -> + let ps = tree_prev_doc n.doc i in + if tree_is_nil ps + then Node (tree_parent_doc n.doc i) + else Node(ps) + | _ -> failwith "parent" + in + { n with node = node' } + +let node_child n = + match n.node with + | Node i -> { n with node= norm(tree_first_child n.doc i) } + | _ -> { n with node = Nil } + +let node_sibling n = + match n.node with + | Node i -> { n with node= norm(tree_next_sibling n.doc i) } + | _ -> { n with node = Nil } + +let node_sibling_ctx n _ = + match n.node with + | Node i -> { n with node= norm(tree_next_sibling n.doc i) } + | _ -> { n with node = Nil } + + +let first_child n = + let node' = + match n.node with + | Node (t) -> + let fs = tree_first_child n.doc t in + if equal_node nil fs + then + let txt = tree_my_text n.doc t in + if equal_node nil txt then Nil - else Node(SC (txt,Tree.nil)) - | Node (NC t) -> - let fs = first_child n.doc t in - let txt = prev_text n.doc fs in - if Text.is_empty n.doc txt - then norm fs - else Node (SC (txt, fs)) - | Node(SC (i,_)) -> String i - | Nil | String _ -> failwith "first_child" - in - { n with node = node'} - - let next_sibling n = - let node' = - match n.node with - | Node (SC (_,ns)) -> norm ns - | Node(NC t) -> - let ns = next_sibling n.doc t in - let txt = next_text n.doc t in - if Text.is_empty n.doc txt - then norm ns - else Node (SC (txt, ns)) - | Nil | String _ -> failwith "next_sibling" - in - { n with node = node'} + else Text(txt,nil) + else + let txt = tree_prev_text n.doc fs in + if equal_node nil txt + then Node(fs) + else Text(txt, fs) + | Text(_,_) -> Nil + | Nil -> failwith "first_child" + in + { n with node = node'} + +let next_sibling n = + let node' = + match n.node with + | Text (_,ns) -> norm ns + | Node(t) -> + let ns = tree_next_sibling n.doc t in + let txt = tree_next_text n.doc t in + if equal_node nil txt + then norm ns + else Text(txt, ns) + | Nil -> failwith "next_sibling" + in + { n with node = node'} +let next_sibling_ctx n _ = next_sibling n - let left = first_child - let right = next_sibling +let left = first_child +let right = next_sibling - let id = - function { doc=d; node=Node(NC n)} -> node_xml_id d n - | { doc=d; node=Node(SC (i,_) )} -> text_xml_id d i - | _ -> -1 (* - Format.fprintf Format.err_formatter "Failure id on %s\n%!" (nts x.node); - failwith "id" *) - - let tag = - function { node=Node(SC _) } -> Tag.pcdata - | { doc=d; node=Node(NC n)} -> tag_id d n - | _ -> failwith "tag" +let id t = + match t.node with + | Node(n) -> tree_node_xml_id t.doc n + | Text(i,_) -> tree_text_xml_id t.doc i + | _ -> -1 + +let tag t = + match t.node with + | Text(_) -> Tag.pcdata + | Node(n) -> tree_tag_id t.doc n + | _ -> failwith "tag" +(* let string_below t id = let strid = parent_doc t.doc id in match t.node with @@ -555,137 +369,173 @@ struct | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_desc d n tag) } | _ -> { t with node=Nil } - - let tagged_next t tb tf s = - match s with - | { node = Node (NC(below)) } -> begin - match t with - | { doc = d; node=Node(NC n) } -> - { t with node= norm (tagged_next d n (Ptset.to_int_vector tb) (Ptset.to_int_vector tf) below) } - | { doc = d; node=Node(SC (i,n) ) } when is_nil n -> - let p = parent_doc d i in - { t with node= norm (tagged_next d p (Ptset.to_int_vector tb) (Ptset.to_int_vector tf) below) } - | { doc = d; node=Node(SC (_,n) ) } -> - if Ptset.mem (tag_id d n) (Ptset.union tb tf) - then { t with node=Node(NC(n)) } - else - let vb = Ptset.to_int_vector tb in - let vf = Ptset.to_int_vector tf in - let node = - let dsc = tagged_below d n vb vf in - if is_nil dsc - then tagged_next d n vb vf below - else dsc - in - { t with node = norm node } - | _ -> {t with node=Nil } - end - - | _ -> {t with node=Nil } - - let tagged_foll_only t tf s = - match s with - | { node = Node (NC(below)) } -> begin - match t with - | { doc = d; node=Node(NC n) } -> - { t with node= norm (tagged_foll_only d n (Ptset.to_int_vector tf) below) } - | { doc = d; node=Node(SC (i,n) ) } when is_nil n -> - let p = parent_doc d i in - { t with node= norm (tagged_foll_only d p (Ptset.to_int_vector tf) below) } - | { doc = d; node=Node(SC (_,n) ) } -> - if Ptset.mem (tag_id d n) tf - then { t with node=Node(NC(n)) } - else - let vf = Ptset.to_int_vector tf in - let node = - let dsc = tagged_desc_only d n vf in - if is_nil dsc - then tagged_foll_only d n vf below - else dsc - in - { t with node = norm node } - | _ -> {t with node=Nil } - end - - | _ -> {t with node=Nil } - +*) +let select_next tb tf t s = + match s.node with + | Node (below) -> begin + match t.node with + | Node( n) -> + { t with node = norm (tree_select_next t.doc n (Ptset.to_int_vector tb) (Ptset.to_int_vector tf) below) } + | Text (i,n) when equal_node nil n -> + let p = tree_parent_doc t.doc i in + { t with node = norm (tree_select_next t.doc p (Ptset.to_int_vector tb) (Ptset.to_int_vector tf) below) } + | Text(_,n) -> + if Ptset.mem (tree_tag_id t.doc n) (Ptset.union tb tf) + then { t with node=Node(n) } + else + let vb = Ptset.to_int_vector tb in + let vf = Ptset.to_int_vector tf in + let node = + let dsc = tree_select_below t.doc n vb vf in + if equal_node nil dsc + then tree_select_next t.doc n vb vf below + else dsc + in + { t with node = norm node } + | _ -> {t with node = Nil } + end + + | _ -> { t with node = Nil } - let tagged_below t tc td = - match t with - | { doc = d; node=Node(NC n) } -> - let vc = Ptset.to_int_vector tc - in - let vd = Ptset.to_int_vector td - in - { t with node= norm(tagged_below d n vc vd) } - | _ -> { t with node=Nil } + - let tagged_desc_only t td = - match t with - | { doc = d; node=Node(NC n) } -> - let vd = Ptset.to_int_vector td - in - { t with node= norm(tagged_desc_only d n vd) } - | _ -> { t with node=Nil } + let select_foll_only tf t s = + match s.node with + | Node (below) -> + begin + match t.node with + | Node(n) -> + { t with node= norm (tree_select_foll_only t.doc n (Ptset.to_int_vector tf) below) } + | Text(i,n) when equal_node nil n -> + let p = tree_parent_doc t.doc i in + { t with node= norm (tree_select_foll_only t.doc p (Ptset.to_int_vector tf) below) } + | Text(_,n) -> + if Ptset.mem (tree_tag_id t.doc n) tf + then { t with node=Node(n) } + else + let vf = Ptset.to_int_vector tf in + let node = + let dsc = tree_select_desc_only t.doc n vf in + if tree_is_nil dsc + then tree_select_foll_only t.doc n vf below + else dsc + in + { t with node = norm node } + | _ -> { t with node = Nil } + end + | _ -> {t with node=Nil } + +let select_below tc td t= + match t.node with + | Node( n) -> + let vc = Ptset.to_int_vector tc + in + let vd = Ptset.to_int_vector td + in + { t with node= norm(tree_select_below t.doc n vc vd) } + | _ -> { t with node=Nil } - let last_idx = ref 0 - let array_find a i j = - let l = Array.length a in - let rec loop idx x y = - if x > y || idx >= l then Text.nil + +let select_desc_only td t = + match t.node with + | Node(n) -> + let vd = Ptset.to_int_vector td + in + { t with node = norm(tree_select_desc_only t.doc n vd) } + | _ -> { t with node = Nil } + + +let tagged_desc tag t = + match t.node with + | Node(n) -> + { t with node = norm(tree_tagged_desc t.doc n tag) } + | _ -> { t with node = Nil } + + +let tagged_foll_below tag t s = + match s.node with + | Node (below) -> + begin + match t.node with + | Node(n) -> + { t with node= norm (tree_tagged_foll_below t.doc n tag below) } + | Text(i,n) when equal_node nil n -> + let p = tree_prev_doc t.doc i in + { t with node= norm (tree_tagged_foll_below t.doc p tag below) } + | Text(_,n) -> + if (tree_tag_id t.doc n) == tag + then { t with node=Node(n) } + else + let node = + let dsc = tree_tagged_desc t.doc n tag in + if tree_is_nil dsc + then tree_tagged_foll_below t.doc n tag below + else dsc + in + { t with node = norm node } + | _ -> { t with node = Nil } + end + | _ -> {t with node=Nil } + + +let last_idx = ref 0 +let array_find a i j = + let l = Array.length a in + let rec loop idx x y = + if x > y || idx >= l then nil else - if a.(idx) >= x then if a.(idx) > y then Text.nil else (last_idx := idx;a.(idx)) + if a.(idx) >= x then if a.(idx) > y then nil else (last_idx := idx;a.(idx)) else loop (idx+1) x y - in - if a.(0) > j || a.(l-1) < i then Text.nil - else loop !last_idx i j - + in + if a.(0) > j || a.(l-1) < i then nil + else loop !last_idx i j + + - let text_below t = - let l = Array.length !contains_array in - if l = 0 then { t with node=Nil } - else - match t with - | { doc = d; node=Node(NC n) } -> - let i,j = doc_ids t.doc n in - let id = array_find !contains_array i j - in - if id == Text.nil then - { t with node=Nil } - else - {t with node = Node(SC(id, Tree.next_sibling d (Tree.prev_doc d id))) } - | _ -> { t with node=Nil } - - let text_next t root = - let l = Array.length !contains_array in - if l = 0 then { t with node=Nil } - else - let inf = match t with - | { doc =d; node = Node(NC n) } -> snd(doc_ids d n)+1 - | { node = Node(SC(i,_)) } -> i+1 - | _ -> assert false - in - match root with - | { doc = d; node= Node (NC n) } -> - let _,j = doc_ids t.doc n in - - let id = array_find !contains_array inf j - in - if id == Text.nil then { doc = d; node= Nil } - else - {doc = d; node = Node(SC(id,Tree.next_sibling d (Tree.prev_doc d id))) } - | _ -> { t with node=Nil} +let text_below t = + let l = Array.length !contains_array in + if l = 0 then { t with node=Nil } + else + match t.node with + | Node(n) -> + let i,j = tree_doc_ids t.doc n in + let id = array_find !contains_array i j + in + if id == nil then + { t with node=Nil } + else + { t with node = Text(id, tree_next_sibling t.doc (tree_prev_doc t.doc id)) } + | _ -> { t with node = Nil } + +let text_next t root = + let l = Array.length !contains_array in + if l = 0 then { t with node=Nil } + else + let inf = match t.node with + | Node(n) -> snd(tree_doc_ids t.doc n)+1 + | Text(i,_) -> i+1 + | _ -> assert false + in + match root.node with + | Node (n) -> + let _,j = tree_doc_ids t.doc n in + let id = array_find !contains_array inf j + in + if id == nil then { t with node= Nil } + else + { t with node = Text(id,tree_next_sibling t.doc (tree_prev_doc t.doc id)) } + | _ -> { t with node = Nil} - +(* let subtree_tags t tag = match t with { doc = d; node = Node(NC n) } -> subtree_tags d n tag | _ -> 0 - let tagged_desc_array = ref [| |] + let select_desc_array = ref [| |] let idx = ref 0 let init_tagged_next t tagid = @@ -773,8 +623,10 @@ struct let count_contains t s = Text.count_contains t.doc s - let count t s = Text.count t.doc s +*) + let count t s = text_count t.doc s +(* let is_left t = if is_root t then false else @@ -782,50 +634,47 @@ struct else let u = left (parent t) in (id t) == (id u) - - let print_xml_fast outc t = - let rec loop ?(print_right=true) t = match t.node with - | Nil -> () - | String (s) -> output_string outc (Text.get_text t.doc s) - | Node _ when Tag.equal (tag t) Tag.pcdata -> - loop (left t); - if print_right then loop (right t) - - | Node (_) -> - let tg = Tag.to_string (tag t) in - let l = left t - and r = right t - in - output_char outc '<'; - output_string outc tg; - ( match l.node with - Nil -> output_string outc "/>" - | String _ -> assert false - | Node(_) when Tag.equal (tag l) Tag.attribute -> - (loop_attributes (left l); - match (right l).node with - | Nil -> output_string outc "/>" - | _ -> - output_char outc '>'; - loop (right l); - output_string outc "' ) - | _ -> - output_char outc '>'; - loop l; - output_string outc "' - );if print_right then loop r - and loop_attributes a = - +*) + let print_xml_fast outc t = + let rec loop ?(print_right=true) t = + match t.node with + | Nil -> () + | Text(i,n) -> output_string outc (text_get_text t.doc i); + if print_right + then loop (left t) + | Node (n) -> + let tg = Tag.to_string (tag t) in + let l = left t + and r = right t + in + output_char outc '<'; + output_string outc tg; + ( match l.node with + Nil -> output_string outc "/>" + | Node(_) when Tag.equal (tag l) Tag.attribute -> + (loop_attributes (left l); + match (right l).node with + | Nil -> output_string outc "/>" + | _ -> + output_char outc '>'; + loop (right l); + output_string outc "' ) + | _ -> + output_char outc '>'; + loop l; + output_string outc "' + );if print_right then loop r + and loop_attributes a = match a.node with | Node(_) -> let value = match (left a).node with - | Nil -> "" - | _ -> string (left(left a)) + | Text(i,_) -> text_get_text a.doc i + | _ -> assert false in output_char outc ' '; output_string outc (Tag.to_string (tag a)); @@ -833,227 +682,21 @@ struct output_string outc value; output_char outc '"'; loop_attributes (right a) - | _ -> () - in + | _ -> () + in loop ~print_right:false t - - - let print_xml_fast outc t = - if Tag.to_string (tag t) = "" then - print_xml_fast outc (first_child t) - else print_xml_fast outc t - - - - - - let traversal t = Tree.traversal t.doc - let full_traversal t = - let rec aux n = - match n.node with - | Nil -> () - | String i -> () (*ignore(Text.get_text t.doc i) *) - | Node(_) -> - (* tag_id n; *) - aux (first_child n); - aux (next_sibling n) - in aux t - - let print_stats _ = () - end - -end - - - -IFDEF DEBUG -THEN -module DEBUGTREE - = struct - - let _timings = Hashtbl.create 107 - - - let time _ref f arg = - let t1 = Unix.gettimeofday () in - let r = f arg in - let t2 = Unix.gettimeofday () in - let t = (1000. *.(t2 -. t1)) in - - let (time,count) = try - Hashtbl.find _timings _ref - with - | Not_found -> 0.,0 - in - let time = time+. t - and count = count + 1 - in - Hashtbl.replace _timings _ref (time,count);r - - include XML.Binary - - - let first_child_ doc node = - time ("XMLTree.FirstChild()") (XML.Tree.first_child doc) node - let next_sibling_ doc node = - time ("XMLTree.NextSibling()") (XML.Tree.next_sibling doc) node - - let is_empty_ text node = - time ("TextCollection.IsEmpty()") (XML.Text.is_empty text) node - - let prev_text_ doc node = - time ("XMLTree.PrevText()") (XML.Tree.prev_text doc) node - - let my_text_ doc node = - time ("XMLTree.MyText()") (XML.Tree.my_text doc) node - - let next_text_ doc node = - time ("XMLTree.NextText()") (XML.Tree.next_text doc) node - - let is_leaf_ doc node = - time ("XMLTree.IsLeaf()") (XML.Tree.is_leaf doc ) node - - let node_xml_id_ doc node = - time ("XMLTree.NodeXMLId()") (XML.Tree.node_xml_id doc ) node - - let text_xml_id_ doc node = - time ("XMLTree.TextXMLId()") (XML.Tree.text_xml_id doc ) node - - - let first_child n = - let node' = - match n.node with - | Node (NC t) when is_leaf_ n.doc t -> - let txt = my_text_ n.doc t in - if is_empty_ n.doc txt - then Nil - else Node(SC (txt,XML.Tree.nil)) - | Node (NC t) -> - let fs = first_child_ n.doc t in - let txt = prev_text_ n.doc fs in - if is_empty_ n.doc txt - then norm fs - else Node (SC (txt, fs)) - | Node(SC (i,_)) -> String i - | Nil | String _ -> failwith "first_child" - in - { n with node = node'} - - let next_sibling n = - let node' = - match n.node with - | Node (SC (_,ns)) -> norm ns - | Node(NC t) -> - let ns = next_sibling_ n.doc t in - let txt = - if XML.Tree.is_nil ns then - next_text_ n.doc t - else prev_text_ n.doc ns - in - if is_empty_ n.doc txt - then norm ns - else Node (SC (txt, ns)) - | Nil | String _ -> failwith "next_sibling" - in - { n with node = node'} - - let id = - function { doc=d; node=Node(NC n)} -> node_xml_id_ d n - | { doc=d; node=Node(SC (i,_) )} -> text_xml_id_ d i - | _ -> failwith "id" - - (* Wrapper around critical function *) - let string t = time ("TextCollection.GetText()") (string) t - let left = first_child - let right = next_sibling - let tag t = time ("XMLTree.GetTag()") (tag) t - - let print_stats ppf = - let total_time,total_calls = - Hashtbl.fold (fun _ (t,c) (tacc,cacc) -> - tacc+. t, cacc + c) _timings (0.,0) - - in - Format.fprintf ppf - "Timing : Function Name, number of calls,%% of total calls, mean time, total time, %% of total time\n%!"; - Hashtbl.iter (fun name (time,count) -> - Format.fprintf ppf "%-27s% 8d\t% 4.2f%%\t% 4.6f ms\t% 4.6f ms\t%04.2f%%\n%!" - name - count - (100. *. (float_of_int count)/.(float_of_int total_calls)) - (time /. (float_of_int count)) - time - (100. *. time /. total_time)) _timings; - Format.fprintf ppf "-------------------------------------------------------------------\n"; - Format.fprintf ppf "%-27s% 8d\t% 4.0f%%\t########## ms\t% 4.6f ms\t% 4.0f%%\n%!" - "Total" total_calls 100. total_time 100. - - - let print_xml_fast outc t = - let rec loop ?(print_right=true) t = match t.node with - | Nil -> () - | String (s) -> output_string outc (string t) - | Node _ when Tag.equal (tag t) Tag.pcdata -> loop (left t); loop (right t) - - | Node (_) -> - let tg = Tag.to_string (tag t) in - let l = left t - and r = right t - in - output_char outc '<'; - output_string outc tg; - ( match l.node with - Nil -> output_string outc "/>" - | String _ -> assert false - | Node(_) when Tag.equal (tag l) Tag.attribute -> - (loop_attributes (left l); - match (right l).node with - | Nil -> output_string outc "/>" - | _ -> - output_char outc '>'; - loop (right l); - output_string outc "' ) - | _ -> - output_char outc '>'; - loop l; - output_string outc "' - );if print_right then loop r - and loop_attributes a = - - match a.node with - | Node(_) -> - let value = - match (left a).node with - | Nil -> "" - | _ -> string (left(left a)) - in - output_char outc ' '; - output_string outc (Tag.to_string (tag a)); - output_string outc "=\""; - output_string outc value; - output_char outc '"'; - loop_attributes (right a) - | _ -> () - in - loop ~print_right:false t - - + let print_xml_fast outc t = if Tag.to_string (tag t) = "" then print_xml_fast outc (first_child t) else print_xml_fast outc t - +let tags_below t tag = + fst(Hashtbl.find t.ttable tag) -end +let tags_after t tag = + snd(Hashtbl.find t.ttable tag) -module Binary = DEBUGTREE -ELSE -module Binary = XML.Binary -END (* IFDEF DEBUG *) +let tags t tag = Hashtbl.find t.ttable tag diff --git a/tree.mli b/tree.mli index 79321c2..d0a4f5a 100644 --- a/tree.mli +++ b/tree.mli @@ -1,76 +1,39 @@ -(******************************************************************************) -(* SXSI : XPath evaluator *) -(* Kim Nguyen (Kim.Nguyen@nicta.com.au) *) -(* Copyright NICTA 2008 *) -(* Distributed under the terms of the LGPL (see LICENCE) *) -(******************************************************************************) -module type BINARY = -sig - type node_content - type string_content - type descr = Nil| Node of node_content | String of string_content - type t - val parse_xml_uri : string -> t - val parse_xml_string : string -> t - val save : t -> string -> unit - val load : ?sample:int -> string -> t - val tag_pool : t -> Tag.pool - val string : t -> string - val descr : t -> descr - val is_node : t -> bool - val left : t -> t - val right : t -> t - val first_child : t -> t - val next_sibling : t -> t - val parent : t -> t - val root : t -> t - val is_root : t -> bool - val id : t -> int - val tag : t -> Tag.t - val print_xml_fast : out_channel -> t -> unit - val compare : t -> t -> int - val equal : t -> t -> bool - module DocIdSet : - sig - include Set.S - end with type elt = string_content - val string_below : t -> string_content -> bool - val contains : t -> string -> DocIdSet.t - val contains_old : t -> string -> DocIdSet.t - val contains_iter : t -> string -> DocIdSet.t - val count_contains : t -> string -> int - val count : t -> string -> int - val dump : t -> unit - val get_string : t -> string_content -> string - val has_tagged_desc : t -> Tag.t -> bool - val has_tagged_foll : t -> Tag.t -> bool - val tagged_desc : t -> Tag.t -> t - val tagged_foll : t -> Tag.t -> t - val tagged_below : t -> Ptset.t -> Ptset.t -> t - val tagged_next : t -> Ptset.t -> Ptset.t -> t -> t - val tagged_desc_only : t -> Ptset.t -> t - val tagged_foll_only : t -> Ptset.t -> t -> t - val text_below : t -> t - val text_next : t -> t -> t - val init_tagged_next : t -> Tag.t -> unit - val subtree_tags : t -> Tag.t -> int - val is_left : t -> bool - val print_id : Format.formatter -> t -> unit - val test_xml_tree : Format.formatter -> Ptset.t -> t -> unit - val init_contains : t -> string -> unit - val init_naive_contains : t -> string -> unit - val mk_nil : t -> t - val test_jump : t -> Tag.t -> unit - val time_xml_tree : t -> Tag.t -> int list - val time_xml_tree2 : t -> Tag.t -> int list -end - -module Binary : BINARY - -IFDEF DEBUG -THEN -module DEBUGTREE : sig - include BINARY - val print_stats : Format.formatter -> unit -end -ENDIF +type t +val init_contains : t -> string -> unit +val init_naive_contains : t -> string -> unit +val is_nil : t -> bool +val is_node : t -> bool +val parse_xml_uri : string -> t +val parse_xml_string : string -> t +val save : t -> string -> unit +val load : ?sample:int -> string -> t +val tag_pool : t -> Tag.pool +val compare : t -> t -> int +val equal : t -> t -> bool +val mk_nil : t -> t +val root : t -> t +val is_root : t -> bool +val parent : t -> t +val first_child : t -> t +val next_sibling : t -> t +val next_sibling_ctx : t -> t -> t +val left : t -> t +val right : t -> t +val id : t -> int +val tag : t -> Tag.t +val text_below : t -> t +val text_next : t -> t -> t +val tagged_desc : Tag.t -> t -> t +val tagged_foll_below : Tag.t -> t -> t -> t +val select_desc_only : Ptset.t -> t -> t +val select_foll_only : Ptset.t -> t -> t -> t +val select_below : Ptset.t -> Ptset.t -> t -> t +val select_next : Ptset.t -> Ptset.t -> t -> t -> t +val count : t -> string -> int +val print_xml_fast : out_channel -> t -> unit +val node_child : t -> t +val node_sibling : t -> t +val node_sibling_ctx : t -> t -> t +val tags_below : t -> Tag.t -> Ptset.t +val tags_after : t -> Tag.t -> Ptset.t +val tags : t -> Tag.t -> Ptset.t*Ptset.t diff --git a/unit_test.ml b/unit_test.ml index ff8d573..1f7e732 100644 --- a/unit_test.ml +++ b/unit_test.ml @@ -5,19 +5,6 @@ (* Distributed under the terms of the LGPL (see LICENCE) *) (******************************************************************************) -let collect_tags v = - let rec aux acc v = - if Tree.Binary.is_node v - then - let tag = Tree.Binary.tag v - in - let acc = aux (Ptset.add tag acc) (Tree.Binary.first_child v) - in - aux (Ptset.add tag acc) (Tree.Binary.next_sibling v) - else acc - in - aux Ptset.empty v -;; if Array.length (Sys.argv) <> 2 @@ -30,109 +17,34 @@ then let doc = try - Tree.Binary.load Sys.argv.(1) + Tree.load Sys.argv.(1) with | _ -> ( try - Tree.Binary.parse_xml_uri Sys.argv.(1) + Tree.parse_xml_uri Sys.argv.(1) with | _ ->( Printf.printf "Error parsing document\n"; exit 2)) ;; -let _ = Tag.init (Tree.Binary.tag_pool doc) -;; -(* - let tags = (collect_tags doc) - ;; -(* -let _ = Tree.Binary.test_xml_tree Format.std_formatter tags doc -;; -let _ = Printf.printf "Testing //a with jumping\n" -;; -*) -let rec test_a dir t acc ctx = - if Tree.Binary.is_node t - then - let acc = - if (Tree.Binary.tag t) == (Tag.tag "a") - then Ata.TS.cons t acc - else acc - in - let first = Tree.Binary.tagged_below t Ptset.empty (Ptset.singleton (Tag.tag "a")) - and next = Tree.Binary.tagged_next t Ptset.empty (Ptset.singleton (Tag.tag "a")) ctx - in - let _ = - Printf.printf "t is :"; - Tree.Binary.print_xml_fast stdout t; - Printf.printf " called from %s of " (if dir then "below" else "next"); - Tree.Binary.print_xml_fast stdout ctx; - if (Tree.Binary.is_node next) - then begin - Printf.printf ", Next a is %!"; - Tree.Binary.print_xml_fast stdout next; - end - else - Printf.printf ", Next a is empty!"; - print_newline(); - in - test_a false next (test_a true first acc t) t - else acc -;; -let rec test_text dir t acc ctx = - if Tree.Binary.is_node t - then - let acc = - if (Tree.Binary.tag t) == (Tag.pcdata) - then Ata.TS.cons t acc - else acc - in - let first = Tree.Binary.text_below t - and next = Tree.Binary.text_next t ctx - in - (* - let _ = - Printf.printf "t is :"; - Tree.Binary.print_xml_fast stdout t; - Printf.printf " called from %s of " (if dir then "below" else "next"); - Tree.Binary.print_xml_fast stdout ctx; - if (Tree.Binary.is_node first) - then begin - Printf.printf "First (text) is %!"; - Tree.Binary.print_xml_fast stdout first; - end - else - Printf.printf "First (text) is empty!"; - if (Tree.Binary.is_node next) - then begin - Printf.printf ", Next (text) is %!"; - Tree.Binary.print_xml_fast stdout next; - end - else - Printf.printf ", Next (text) is empty!"; - print_newline(); - in *) - test_text false next (test_text true first acc t) ctx - else acc +let full_traversal tree = + let rec loop t = + if Tree.is_node t + then + begin + (*ignore (Tree.tag t); *) + loop (Tree.node_child t); + loop (Tree.node_sibling t); + end + in loop tree ;; -(* -let r = test_a true doc Ata.TS.empty doc;; -(* -let _ = Printf.printf "==> %i nodes\n" (Ata.TS.length r) -let _ = Ata.TS.iter (fun t -> Tree.Binary.print_xml_fast stdout t; print_newline();) r + -*) -let _ = Tree.Binary.init_contains doc "car" +let _ = Tag.init (Tree.tag_pool doc) -let r = test_text true doc Ata.TS.empty doc -let _ = Printf.printf "==> %i nodes\n" (Ata.TS.length r) -(* let _ = Ata.TS.iter (fun t -> Tree.Binary.print_xml_fast stdout t; print_newline();) r *) -;; - -*) *) let time f x = let t1 = Unix.gettimeofday () in let r = f x in @@ -141,8 +53,6 @@ let time f x = Printf.eprintf " %fms\n%!" t ; r ;; -let _ = Printf.eprintf "Timing full //keyword ... " -let x = List.length (time (Tree.Binary.time_xml_tree doc) (Tag.tag "keyword")) -let _ = Printf.eprintf "Timing jump //keyword ... " -let y = List.length (time (Tree.Binary.time_xml_tree2 doc) (Tag.tag "keyword")) -let _ = Printf.eprintf "coherant : %b\n" (x=y) +let _ = Printf.eprintf "Timing traversal ... ";; +let _ = time (full_traversal) doc +;; diff --git a/xPath.ml b/xPath.ml index de9c056..4d83634 100644 --- a/xPath.ml +++ b/xPath.ml @@ -229,6 +229,7 @@ type config = { st_root : Ata.state; (* state matching the root element (initial tr_aux : (Ata.state,int*(Ata.transition list)) Hashtbl.t; mutable entry_points : (Tag.t*Ptset.t) list; mutable contains : string option; + mutable univ_states : Ata.state list; } let dummy_conf = { st_root = -1; st_univ = -1; @@ -240,6 +241,7 @@ let dummy_conf = { st_root = -1; tr_aux = Hashtbl.create 0; entry_points = []; contains = None; + univ_states = []; } @@ -288,7 +290,7 @@ let or_self conf old_dst q_src q_dst dir test pred mark = (if mark then replace old_dst f else f) *& pred *& (if mark then Ata.true_ else (_l dir) ** q_dst), - `True)::acc) + false)::acc) l l in Hashtbl.replace conf.tr q_src (num,l2) with Not_found -> () @@ -340,13 +342,12 @@ let rec compile_step ?(existential=false) conf q_src dir ctx_path nrec step num let _ = if axis=Descendant then add_trans num conf.tr_aux ( ?< q_src><@ ((if ex||nrec then TagSet.diff TagSet.star test - else TagSet.star),false, - `True )>=> `LLeft ** q_src ) + else TagSet.star),false)>=> `LLeft ** q_src ) in let t3 = ?< q_src><@ ((if ex then TagSet.diff TagSet.any test - else TagSet.any), false, `True )>=> - if ex then ( Ata.atom_ `Left false q_src) *& right ** q_src + else TagSet.any), false)>=> + if ex then right ** q_src else (if axis=Descendant then `RRight else `Right) ** q_src in let _ = add_trans num conf.tr_aux t3 @@ -468,7 +469,7 @@ and compile_expr conf states q_src idx ctx_path dir e qdst = let _ = match annot_path with | (((Parent|Ancestor|AncestorOrSelf),_,_),_)::_ -> conf.final_state <- Ptset.add qdst conf.final_state | _ -> () - in + in let _ = conf.univ_states <- a_dst::conf.univ_states in (a_st,anc_st,par_st,pre_st,idx, ((ret_dir) ** q)) | True -> states,Ptset.empty,Ptset.empty,Ptset.empty,idx,Ata.true_ | False -> states,Ptset.empty,Ptset.empty,Ptset.empty,idx,Ata.false_ @@ -500,7 +501,8 @@ let compile path = tr = Hashtbl.create 5; tr_aux = Hashtbl.create 5; entry_points = []; - contains = None + contains = None; + univ_states = []; } in let q0 = Ata.mk_state() in @@ -545,9 +547,8 @@ let compile path = Ata.states = if has_backward then Ptset.add config.st_from_root a_st else a_st; Ata.init = Ptset.singleton config.st_root; Ata.final = Ptset.union anc_st config.final_state; - Ata.universal = Ptset.singleton a_dst; + Ata.universal = Ptset.add a_dst (Ptset.from_list config.univ_states); Ata.phi = phi; - Ata.delta = Hashtbl.create 17; Ata.sigma = Ata.HTagSet.create 17; },config.entry_points,!contains -- 2.17.1