From f98a8d98d86941a885f492d5cc134e34989c198a Mon Sep 17 00:00:00 2001 From: kim Date: Fri, 17 Apr 2009 00:07:56 +0000 Subject: [PATCH] bottom up run works for text nodes git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/xpathcomp@324 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- Makefile | 2 +- OCamlDriver.cpp | 50 +- SXSIStorageInterface.cpp | 2 +- SXSIStorageInterface.h | 10 + ata.ml | 1205 ++++++++++++++++++++++++++++++-------- ata.mli | 63 +- compile.sh | 15 - main.ml | 60 +- myTimeXMLTree.cpp | 339 ----------- options.ml | 2 + options.mli | 1 + ptset.mli | 1 - testXMLTree.cpp | 43 -- tests/test.xml | 2 +- timeXMLTree.cpp | 2 +- tree.ml | 223 +++++-- tree.mli | 11 + xPath.ml | 7 +- xPath.mli | 2 +- 19 files changed, 1279 insertions(+), 761 deletions(-) delete mode 100755 compile.sh delete mode 100644 myTimeXMLTree.cpp delete mode 100644 testXMLTree.cpp diff --git a/Makefile b/Makefile index bcd3f92..26f4e45 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,7 @@ OCAMLOPT = ocamlopt -g -cc "$(CXX)" SYNT_DEBUG = -ppopt -DDEBUG else CXX = g++ -OCAMLOPT = ocamlopt -cc "$(CXX)" -ccopt -O3 -ccopt -std=c++0x -noassert -inline $(INLINE) +OCAMLOPT = ocamlopt -cc "$(CXX)" -ccopt -O3 -ccopt -std=c++0x -noassert -inline $(INLINE) endif ifeq ($(PROFILE), true) SYNT_PROF = $(SYNT_DEBUG) -ppopt -DPROFILE diff --git a/OCamlDriver.cpp b/OCamlDriver.cpp index 8603c11..78d3e15 100644 --- a/OCamlDriver.cpp +++ b/OCamlDriver.cpp @@ -16,9 +16,28 @@ extern "C" { #include #include #include + + +#include +#include +#include +#include + + struct tms t1; + struct tms t2; + double ticks = (double) sysconf(_SC_CLK_TCK)/1000; + void start_clock() { + times (&t1); + } + + + double stop_clock() { + times (&t2); + return (t2.tms_utime-t1.tms_utime)/ticks; + } +} //extern C -} //extern C //#include "TextCollection/TextCollection.h" #include "XMLDocShredder.h" @@ -146,14 +165,6 @@ extern "C" CAMLprim value caml_text_collection_get_cached_text(value tree, value CAMLreturn (str); } -extern "C" CAMLprim value caml_text_collection_size(value tree){ - CAMLparam1(tree); - // CAMLreturn (Val_int( XMLTREE(tree)->CachedText.size())); - NOT_IMPLEMENTED("text_collection_size"); - CAMLreturn (Val_unit); -} - - extern "C" CAMLprim value caml_text_collection_empty_text(value tree,value id){ CAMLparam2(tree,id); @@ -194,6 +205,16 @@ extern "C" CAMLprim value caml_text_collection_contains(value tree,value str){ }; CAMLreturn (resarray); } +extern "C" CAMLprim value caml_text_collection_unsorted_contains(value tree,value str){ + CAMLparam2(tree,str); + uchar * cstr = (uchar *) String_val(str); + std::vector results; + start_clock(); + results = XMLTREE(tree)->Contains(cstr); + double d = stop_clock(); + std::cerr << "Internal timing " << d <<" ms\n"; + CAMLreturn (Val_unit); +} extern "C" CAMLprim value caml_xml_tree_root(value tree){ @@ -240,6 +261,15 @@ extern "C" CAMLprim value caml_xml_tree_unserialize(value filename){ CAMLreturn(Val_unit); } +extern "C" CAMLprim value caml_xml_tree_last_child(value tree, value id){ + CAMLparam2(tree,id); + CAMLreturn(Val_int (XMLTREE(tree)->LastChild(TREENODEVAL(id)))); +} + +extern "C" CAMLprim value caml_xml_tree_is_first_child(value tree, value id){ + CAMLparam2(tree,id); + CAMLreturn(Val_bool (XMLTREE(tree)->IsFirstChild(TREENODEVAL(id)))); +} extern "C" CAMLprim value caml_xml_tree_first_child(value tree, value id){ CAMLparam2(tree,id); @@ -274,9 +304,7 @@ extern "C" CAMLprim value caml_xml_tree_next_sibling(value tree, value id){ extern "C" CAMLprim value caml_xml_tree_prev_text(value tree, value id){ CAMLparam2(tree,id); - CAMLlocal1(res); CAMLreturn(Val_int((XMLTREE(tree)->PrevText(TREENODEVAL(id))))); - CAMLreturn(res); } extern "C" CAMLprim value caml_xml_tree_next_text(value tree, value id){ CAMLparam2(tree,id); diff --git a/SXSIStorageInterface.cpp b/SXSIStorageInterface.cpp index d770d7e..88ed42f 100644 --- a/SXSIStorageInterface.cpp +++ b/SXSIStorageInterface.cpp @@ -11,7 +11,6 @@ #include "SXSIStorageInterface.h" #include "Utils.h" - SXSIStorageInterface::SXSIStorageInterface(int sf,bool iet,bool dtc) { tree = new XMLTree(); @@ -64,6 +63,7 @@ void *SXSIStorageInterface::returnDocument(){ return ((void *) tree); } + void SXSIStorageInterface::printStats(){ std::cerr << "Parsing stats : \n"; std::cerr << _new_child << " calls to newOpenTag/newClosingTag\n"; diff --git a/SXSIStorageInterface.h b/SXSIStorageInterface.h index 807fc79..943a000 100644 --- a/SXSIStorageInterface.h +++ b/SXSIStorageInterface.h @@ -12,7 +12,16 @@ #include "XMLTree.h" #include "StorageInterface.h" +extern "C" { +#include +#include +#include +#include +#include +#include + +} //extern C using namespace std; @@ -29,6 +38,7 @@ class SXSIStorageInterface: public StorageInterface virtual void printStats(); private: + XMLTree* tree; int _new_text; int _new_empty_text; diff --git a/ata.ml b/ata.ml index 425fad6..bba3203 100644 --- a/ata.ml +++ b/ata.ml @@ -1,63 +1,28 @@ -(* Todo refactor and remove this alias *) INCLUDE "debug.ml" +INCLUDE "utils.ml" + +let cpt_trans = ref 0 +let miss_trans = ref 0 +let cpt_eval = ref 0 +let miss_eval = ref 0 + let gen_id = let id = ref (-1) in fun () -> incr id;!id - module TS = - struct - type t = Nil - | Sing of Tree.t - | Cons of Tree.t*t - | ConsCat of Tree.t * t * t - | Concat of t*t - let empty = Nil - - let cons e t = Cons(e,t) - let concat t1 t2 = Concat(t1,t2) - let append e t = Concat(t,Sing(e)) - - - - - let fold f l acc = - let rec loop acc = function - | Nil -> acc - | Sing e -> f e acc - | Cons (e,t) -> loop (f e acc) t - | ConsCat (e,t1,t2) -> loop (loop (f e acc) t1) t2 - | Concat (t1,t2) -> loop (loop acc t1) t2 - in - loop acc l - - let length l = fold (fun _ x -> x+1) l 0 - - - let iter f l = - let rec loop = function - | Nil -> () - | Sing e -> f e - | Cons (e,t) -> f e; loop t - | ConsCat(e,t1,t2) -> - f e; loop t1; loop t2 - | Concat(t1,t2) -> loop t1;loop t2 - in loop l - - end - - - let h_union = Hashtbl.create 4097 -let pt_cup s1 s2 = - let h = (Ptset.hash s1)*(Ptset.hash s2) - ((Ptset.hash s2)+(Ptset.hash s1)) in +let pt_cup s1 s2 = + (* special case, since this is a union we want hash(s1,s2) = hash(s2,s1) *) + let x = Ptset.hash s1 + and y = Ptset.hash s2 in + let h = if x < y then HASHINT2(x,y) else HASHINT2(y,x) in try Hashtbl.find h_union h with - | Not_found -> let s = Ptset.union s1 s2 - in - Hashtbl.add h_union h s;s - + | Not_found -> let s = Ptset.union s1 s2 + in + Hashtbl.add h_union h s;s module State = struct @@ -92,7 +57,8 @@ let hash_node_form t = match t with | True -> 1 | And(f1,f2) -> (2+17*f1.fkey + 37*f2.fkey) (*land max_int *) | Or(f1,f2) -> (3+101*f1.fkey + 253*f2.fkey) (*land max_int *) - | Atom(v,b,s) -> ((hash_const_variant v) + (3846*(vb b) +257) + (s lsl 13 - s)) (*land max_int *) + | Atom(v,b,s) -> HASHINT3(hash_const_variant v,(3846*(vb b) +257),s) + module FormNode = @@ -206,9 +172,9 @@ let and_ f1 f2 = let not_ f = f.neg -let k_hash (s,t) = ((Ptset.hash s)) lsl 31 lxor (Tag.hash t) +let k_hash (s,t) = HASHINT2(Ptset.hash s,Tag.hash t) -module HTagSetKey = +module HTagSetKey = struct type t = Ptset.t*Tag.t let equal (s1,s2) (t1,t2) = (s2 == t2) && Ptset.equal s1 t1 @@ -217,25 +183,16 @@ end module HTagSet = Hashtbl.Make(HTagSetKey) -type dispatch = { first : Tree.t -> Tree.t; - flabel : string; - next : Tree.t -> Tree.t -> Tree.t; - nlabel : string; - consres : Tree.t -> TS.t -> TS.t -> bool -> bool -> TS.t - } - -type formlist = Nil | Cons of state*formula*int*formlist +type skiplist = Nothing | All + | Zero of skiplist + | One of skiplist | Two of skiplist | Three of skiplist + | Four of skiplist | Five of skiplist | Six of skiplist + | Seven of skiplist | Eight of skiplist | Nine of skiplist -let f_hash (h,s,t) = h * 41+((Ptset.hash s) lsl 10 ) lxor (Ptset.hash t)*4097 -module HFormlistKey = -struct - type t = int*Ptset.t*Ptset.t - let equal (h1,s1,t1) (h2,s2,t2) = h1==h2 && s1 == s2 && t1 == t2 - let hash = f_hash -end -module HFormlist = Hashtbl.Make (HFormlistKey) + +type formlist = Nil | Cons of state*formula*int*bool*formlist -type t = { +type 'a t = { id : int; mutable states : Ptset.t; init : Ptset.t; @@ -244,9 +201,9 @@ type t = { starstate : Ptset.t option; (* Transitions of the Alternating automaton *) phi : (state,(TagSet.t*(bool*formula*bool)) list) Hashtbl.t; - sigma : (dispatch*bool*formlist*Ptset.t*Ptset.t) HTagSet.t; + sigma : (int,('a t -> Tree.t -> Tree.t -> Ptset.t*'a)) Hashtbl.t; } - + module Pair (X : Set.OrderedType) (Y : Set.OrderedType) = struct type t = X.t*Y.t @@ -289,50 +246,6 @@ type t = { | `LLeft -> "⇓₁" | `RRight -> "⇓₂") s - let dnf_hash = Hashtbl.create 17 - - let rec dnf_aux f = match f.pos with - | False -> PL.empty - | True -> PL.singleton (Ptset.empty,Ptset.empty) - | Atom((`Left|`LLeft),_,s) -> PL.singleton (Ptset.singleton s,Ptset.empty) - | Atom((`Right|`RRight),_,s) -> PL.singleton (Ptset.empty,Ptset.singleton s) - | Or(f1,f2) -> PL.union (dnf f1) (dnf f2) - | And(f1,f2) -> - let pl1 = dnf f1 - and pl2 = dnf f2 - in - PL.fold (fun (s1,s2) acc -> - PL.fold ( fun (s1', s2') acc' -> - (PL.add - ((Ptset.union s1 s1'), - (Ptset.union s2 s2')) acc') ) - pl2 acc ) - pl1 PL.empty - - and dnf f = - try - Hashtbl.find dnf_hash f.fid - with - Not_found -> - let d = dnf_aux f in - Hashtbl.add dnf_hash f.fid d;d - - - let can_top_down f = - let nf = dnf f in - if (PL.cardinal nf > 3)then None - else match PL.elements nf with - | [(s1,s2); (t1,t2); (u1,u2)] when - Ptset.is_empty s1 && Ptset.is_empty s2 && Ptset.is_empty t1 && Ptset.is_empty u2 - -> Some(true,t2,u1) - | [(t1,t2); (u1,u2)] when Ptset.is_empty t1 && Ptset.is_empty u2 - -> Some(false,t2,u1) - | _ -> None - - - let equal_form f1 f2 = - (f1.fid == f2.fid) || (FormNode.equal f1 f2) || (PL.equal (dnf f1) (dnf f2)) - let dump ppf a = Format.fprintf ppf "Automaton (%i) :\n" a.id; Format.fprintf ppf "States : "; pr_st ppf (Ptset.elements a.states); @@ -400,7 +313,7 @@ type t = { type transition = Transitions.t let equal_trans (q1,t1,m1,f1,_) (q2,t2,m2,f2,_) = - (q1 == q2) && (TagSet.equal t1 t2) && (m1 == m2) && (equal_form f1 f2) + (q1 == q2) && (TagSet.equal t1 t2) && (m1 == m2) (*&& (equal_form f1 f2) *) module HFEval = Hashtbl.Make( @@ -409,12 +322,13 @@ type t = { let equal (a,b,c) (d,e,f) = a==d && (Ptset.equal b e) && (Ptset.equal c f) let hash (a,b,c) = - a+17*(Ptset.hash b) + 31*(Ptset.hash c) + HASHINT3(a,Ptset.hash b,Ptset.hash c) end) - let hfeval = HFEval.create 4097 - + + + let hfeval = HFEval.create 4097 let eval_form_bool f s1 s2 = let rec eval f = match f.pos with (* test some inlining *) @@ -459,35 +373,42 @@ type t = { in eval f - let h_formlist = HFormlist.create 511 - let form_list_fold_left f acc fl = let rec loop acc fl = match fl with | Nil -> acc - | Cons(s,frm,h,fll) -> loop (f acc s frm h) fll + | Cons(s,frm,h,m,fll) -> loop (f acc s frm h m) fll in loop acc fl - - let rec eval_formlist s1 s2 = function - | Nil -> Ptset.empty,false,false,false - | Cons(q,f,h,fl) -> - let k = (h,s1,s2) + let h_formlist = Hashtbl.create 4096 + let rec eval_formlist ?(memo=true) s1 s2 fl = + match fl with + | Nil -> Ptset.empty,false,false,false,false + | Cons(q,f,h,mark,fll) -> + let k = (h,Ptset.hash s1,Ptset.hash s2,mark) in - try HFormlist.find h_formlist k + + try + if memo then Hashtbl.find h_formlist k + else (raise Not_found) with - Not_found -> - let s,b',b1',b2' = eval_formlist s1 s2 fl in - let b,b1,b2 = eval_form_bool f s1 s2 in - let r = if b then (Ptset.add q s, b'||b, b1'||b1,b2'||b2) - else s,b',b1',b2' - in - HFormlist.add h_formlist k r;r - - - - + Not_found -> + let s,b',b1',b2',amark = eval_formlist (~memo:memo) s1 s2 fll in + let b,b1,b2 = eval_form_bool f s1 s2 in + let r = if b then (Ptset.add q s, b, b1'||b1,b2'||b2,mark||amark) + else s,b',b1',b2',amark + in +(* Format.fprintf Format.err_formatter "\nEvaluating formula (%i) %i %s" h q (if mark then "=>" else "->"); + pr_frm (Format.err_formatter) f; + Format.fprintf Format.err_formatter " in context "; + pr_st Format.err_formatter (Ptset.elements s1); + Format.fprintf Format.err_formatter ", "; + pr_st Format.err_formatter (Ptset.elements s2); + Format.fprintf Format.err_formatter " result is %b\n%!" b; *) + (Hashtbl.add h_formlist k r;r) + + let tags_of_state a q = Hashtbl.fold (fun p l acc -> @@ -506,72 +427,49 @@ type t = { if TagSet.is_finite ts then `Positive(TagSet.positive ts) else `Negative(TagSet.negative ts) - - - - let cons_res e s1 s2 b1 b2 = - if b1&&b2 then - if s2 == TS.Nil && s1 == TS.Nil - then TS.Sing e - else if s1 == TS.Nil - then TS.Cons (e,s2) - else if s2 == TS.Nil - then TS.Cons (e,s1) - else TS.ConsCat(e,s1,s2) - else if not(b1 || b2) - then TS.Sing e - else if b1 then if s1 == TS.Nil then TS.Sing e else TS.Cons(e,s1) - else if s2 = TS.Nil then TS.Sing e else TS.Cons(e,s2) - - let cat_res _ s1 s2 b1 b2 = - if b1&&b2 then if s1 == TS.Nil && s2 == TS.Nil then TS.Nil - else - if s1 == TS.Nil - then s2 - else - if s2 == TS.Nil then s1 else TS.Concat(s1,s2) - else if not(b1 || b2) - then TS.Nil - else if b1 then s1 - else s2 - - - - let merge_trans t a tag q acc = - List.fold_left (fun (accf,accm,acchtrue,acchash) (ts,(m,f,pred)) -> - if TagSet.mem tag ts - then - let acchash = acchash+31*f.fid+42*q in - (Cons(q,f,acchash,accf),accm||m,acchtrue||(is_true f),acchash) - else (accf,accm,acchtrue,acchash) - ) acc (try Hashtbl.find a.phi q with Not_found -> []) - + let inter_text a b = match b with | `Positive s -> let r = Ptset.inter a s in (r,Ptset.mem Tag.pcdata r, true) - | `Negative s -> (Ptset.empty, not (Ptset.mem Tag.pcdata s), false) + | `Negative s -> let r = Ptset.diff a s in (r, Ptset.mem Tag.pcdata r, false) let mk_nil_ctx x _ = Tree.mk_nil x let next_sibling_ctx x _ = Tree.next_sibling x let r_ignore _ x = x + + let set_get_tag r t = r := (fun _ -> t) + (* + + let merge_trans t a tag q acc = + List.fold_left (fun (accf,acchash,idx) (ts,(m,f,pred)) -> + if TagSet.mem tag ts + then + let acchash = HASHINT3(acchash,f.fid,q) in + (Cons(q,f,acchash,idx,m,accf),acchash,idx+1) + else (accf,acchash,idx) + ) acc (try Hashtbl.find a.phi q with Not_found -> []) - let get_trans t a tag r = + + + let cast_cont :'b -> ('a t -> Tree.t -> Tree.t -> Ptset.t*'a) = + Obj.magic + + let get_trans conti t a tag r = try - HTagSet.find a.sigma (r,tag) + Hashtbl.find a.sigma (HASHINT2(Ptset.hash r,Tag.hash tag)) with - Not_found -> - let fl,mark,_,_,accq = - Ptset.fold (fun q (accf,accm,acchtrue,acchash,accq) -> - let naccf,naccm,nacctrue,acchash = - merge_trans t a tag q (accf,accm,acchtrue,acchash ) + Not_found -> + let fl,_,accq,_ = + Ptset.fold (fun q (accf,acchash,accq,aidx) -> + let naccf,acchash,naidx = + merge_trans t a tag q (accf,acchash,aidx ) in - (* if is_false naccf then (naccf,naccm,nacctrue,accq) - else *) (naccf,naccm,nacctrue,acchash,Ptset.add q accq) + (naccf,acchash,Ptset.add q accq,naidx) ) - r (Nil,false,false,17,Ptset.empty) + r (Nil,17,Ptset.empty,0) in let (ls,lls,llls),(rs,rrs,rrrs) = - form_list_fold_left (fun ((a1,b1,c1),(a2,b2,c2)) _ f _ -> + form_list_fold_left (fun ((a1,b1,c1),(a2,b2,c2)) _ f _ _ _ -> let (x1,y1,z1),(x2,y2,z2) = f.st in ((Ptset.union x1 a1),(Ptset.union y1 b1),(Ptset.union c1 z1)), ((Ptset.union x2 a2),(Ptset.union y2 b2),(Ptset.union c2 z2))) @@ -586,21 +484,8 @@ type t = { and tll,htllt,llfin = inter_text tb (tags a lls) and tr,htrt,rfin = inter_text ta (tags a rs) and trr,htrrt,rrfin = inter_text ta (tags a rrs) - in(* - let _ = - Format.fprintf Format.err_formatter "Tag %s, right_states " (Tag.to_string tag); - pr_st Format.err_formatter (Ptset.elements rs); - Format.fprintf Format.err_formatter " tags = "; - Ptset.iter (fun t -> Format.fprintf Format.err_formatter "%s " - (Tag.to_string t)) tr; - Format.fprintf Format.err_formatter ", next_states "; - pr_st Format.err_formatter (Ptset.elements rrs); - Format.fprintf Format.err_formatter " tags = "; - Ptset.iter (fun t -> Format.fprintf Format.err_formatter "%s " - (Tag.to_string t)) trr; - Format.fprintf Format.err_formatter "\n%!"; - - in*) + in + let get_tag = ref Tree.tag in let first,flabel = if (llfin && lfin) then (* no stars *) (if htlt || htllt then (Tree.text_below, "#text_below") @@ -613,7 +498,10 @@ type t = { else if etl then if Ptset.is_singleton tll - then (Tree.tagged_desc (Ptset.choose tll), "#tagged_desc") + then begin + set_get_tag get_tag (Ptset.choose tll); + (Tree.tagged_desc (Ptset.choose tll), "#tagged_desc") + end else (Tree.select_desc_only tll, "#select_desc_only") else if etll then (Tree.node_child,"#node_child") else (Tree.select_below tl tll,"#select_below")) @@ -633,7 +521,10 @@ type t = { else if etr then if Ptset.is_singleton trr - then (Tree.tagged_foll_below (Ptset.choose trr),"#tagged_foll_below") + then begin + set_get_tag get_tag (Ptset.choose trr); + (Tree.tagged_foll_below (Ptset.choose trr),"#tagged_foll_below") + end else (Tree.select_foll_only trr,"#select_foll_only") else if etrr then (Tree.node_sibling_ctx,"#node_sibling_ctx") else @@ -642,45 +533,41 @@ type t = { else if htrt || htrrt then (Tree.next_sibling_ctx,"#next_sibling_ctx") else (Tree.node_sibling_ctx,"#node_sibling_ctx") in - let dispatch = { first = first; flabel = flabel; next = next; nlabel = nlabel; - consres = if mark then cons_res else cat_res } - in - HTagSet.add a.sigma (accq,tag) (dispatch,mark,fl,llls,rrrs); - dispatch,mark,fl,llls,rrrs + let cont = let flist = fl in + fun a t res ctx -> + let s1,res1 = conti a (first t) llls res t + and s2,res2 = conti a (next t ctx) rrrs res ctx in + let r',rb,rb1,rb2,mark,idxl = eval_formlist s1 s2 flist + in + r',(vb rb)*((vb mark) + (vb rb1)*res1 + (vb rb2)*res2) + in + Hashtbl.add a.sigma (HASHINT2(Ptset.hash r,Tag.hash tag)) (cast_cont cont); + (cast_cont cont) - +(* let rec accepting_among a t r ctx = if Tree.is_nil t || Ptset.is_empty r then Ptset.empty,0,TS.Nil else let dispatch,mark,flist,llls,rrrs = - get_trans t a (Tree.tag t) r + get_trans (fun _ _ _ _ -> failwith "toto") t a (Tree.tag t) r in let s1,n1,res1 = accepting_among a (dispatch.first t) llls t in let s2,n2,res2 = accepting_among a (dispatch.next t ctx) rrrs ctx in let r',rb,rb1,rb2 = eval_formlist s1 s2 flist in r',(vb rb)*((vb mark) + (vb rb1)* n1 + (vb rb2)*n2),if rb then dispatch.consres t res1 res2 rb1 rb2 - else TS.Nil + else TS.Nil *) - let run a t = + let run a t = assert false (* let st,n,res = accepting_among a t a.init t in - if Ptset.is_empty (st) then TS.empty,0 else res,n - - - let rec accepting_among_count_no_star a t r ctx = - if Tree.is_nil t||Ptset.is_empty r then Ptset.empty,0 else - let dispatch,mark,flist,llls,rrrs = - get_trans t a (Tree.tag t) r - in - let s1,res1 = accepting_among_count_no_star a (dispatch.first t) llls t - and s2,res2 = accepting_among_count_no_star a (dispatch.next t ctx) rrrs ctx - in - let r',rb,rb1,rb2 = eval_formlist s1 s2 flist - in - r',(vb rb)*((vb mark) + (vb rb1)*res1 + (vb rb2)*res2) - - - + if Ptset.is_empty (st) then TS.empty,0 else res,n *) + + let rec accepting_among_count_no_star a t r ctx = + if Tree.is_nil t then Ptset.empty,0 else + (get_trans (accepting_among_count_no_star) t a (Tree.tag t) r) + a t ctx + +(* let rec accepting_among_count_star a t n = if Tree.is_nil t then n else if (Tree.tag t == Tag.attribute) @@ -693,8 +580,8 @@ type t = { else if Tree.is_nil t||Ptset.is_empty r then Ptset.empty,0 else let dispatch,mark,flist,llls,rrrs = - get_trans t a (Tree.tag t) r - in + get_trans (fun _ _ _ _ -> failwith "toto") t a (Tree.tag t) r + in let s1,res1 = accepting_among_count_may_star starstate a (dispatch.first t) llls t and s2,res2 = accepting_among_count_may_star starstate a (dispatch.next t ctx) rrrs ctx in @@ -702,21 +589,855 @@ type t = { in r',(vb rb)*((vb mark) + (vb rb1)*res1 + (vb rb2)*res2) - +*) let run_count a t = let st,res = match a.starstate with - | None -> accepting_among_count_no_star a t a.init t - | Some s -> accepting_among_count_may_star s a t a.init t + | None -> accepting_among_count_no_star a t a.init t + | Some s -> assert false (*accepting_among_count_may_star s a t a.init t *) in if Ptset.is_empty (st) then 0 else res let run_time _ _ = failwith "blah" + + + module RealBottomUp = struct + + (* decrease number of arguments *) + let ton t = if Tree.is_nil t then "##" + else Tag.to_string (Tree.tag t) + ;; + let ion t = Tree.dump_node t + let memo = Hashtbl.create 4097 + let rlist = ref [] + + let cpt = ref 0;; + let rec run a t res r root rinit next targettag r0 first tomark = + incr cpt; + let res = (vb tomark) + res in + let newr,newres = if first then + accepting_among_count_no_star a t r t + else r, res + in + let r,res = if Ptset.is_empty newr then r,0 else newr,newres in + if Tree.equal t root then + if Ptset.intersect r rinit then (r,res,next) + else (Ptset.empty,0,next) + else + let tag = Tree.tag t in + let parent = Tree.binary_parent t in + let parent_tag = Tree.tag parent in + let left = Tree.is_left t in + let r',mark = + try Hashtbl.find memo (r,parent_tag,left) with + | Not_found -> + let pair = + Hashtbl.fold + (fun q l acc -> + List.fold_left + (fun (aq,am) (ts,(mark,form,_)) -> + if TagSet.mem parent_tag ts then + let (value,_,_) = if left then + eval_form_bool form r Ptset.empty + else + eval_form_bool form Ptset.empty r + in +(* let _ = if value then begin + Format.fprintf Format.err_formatter "Can take transition (%i,%s)%s%!" + q (Tag.to_string parent_tag) + (if mark then "=>" else "->"); + pr_frm Format.err_formatter form; + Format.fprintf Format.err_formatter "%! %s(" (if left then "left" else "right"); + pr_st Format.err_formatter (Ptset.elements r); + Format.fprintf Format.err_formatter ")\n%!" end; + in *) + if value then (Ptset.add q aq, mark||am) + else (aq,am) + else (aq,am)) + acc l + ) a.phi (Ptset.empty,false) + in Hashtbl.add memo (r,parent_tag,left) pair;pair + in + if Ptset.is_empty r' then Ptset.empty,0,next + else + if Tree.is_below_right t next then + let rn,resn,nextofnext = run a next 0 r0 t r (Tree.tagged_next next targettag) targettag r0 true false + in + let rn,resn = if Ptset.is_empty rn then Ptset.empty,0 else rn,resn in + run a (parent) (resn+res) r' root rinit nextofnext targettag r0 false false + else + run a (parent) (res) r' root rinit next targettag r0 false (mark&&left) + + + + let accept_count a t tag initset = + let tree1 = Tree.tagged_lowest t tag in + let tree2 = Tree.tagged_next tree1 tag in + let c,b,_ =run a tree1 0 initset t a.init tree2 tag initset true false + in Printf.eprintf "%i\n%!" !cpt; + if Ptset.is_empty c then 0 else b + + end *) +(* + module RealBottomUp2 = struct + module Formlist = + struct + type t = formlist + let nil : t = Nil + let cons q f i m l = Cons(q,f,i,m,l) + let hash = function Nil -> 0 | Cons(_,_,i,_,_) -> max_int land i + let pr fmt l = + let rec loop = function + | Nil -> () + | Cons(q,f,_,m,l) -> + Format.fprintf fmt "%i %s" q (if m then "=>" else "->"); + pr_frm fmt f; + Format.fprintf fmt "\n%!"; + loop l + in + loop l + end + + type ptset_list = Nil | Cons of Ptset.t*int*ptset_list + let hpl l = match l with + | Nil -> 0 + | Cons (_,i,_) -> i + + let cons s l = Cons (s,(Ptset.hash s) + 65599 * (hpl l), l) + + let rec empty_size n = + if n == 0 then Nil + else cons Ptset.empty (empty_size (n-1)) + + let fold_pl f l acc = + let rec loop l acc = match l with + Nil -> acc + | Cons(s,h,pl) -> loop pl (f s h acc) + in + loop l acc + let map_pl f l = + let rec loop = + function Nil -> Nil + | Cons(s,h,ll) -> cons (f s) (loop ll) + in loop l + + let rev_pl l = + let rec loop acc l = match l with + | Nil -> acc + | Cons(s,_,ll) -> loop (cons s acc) ll + in + loop Nil l + + let rev_map_pl f l = + let rec loop acc l = + match l with + | Nil -> acc + | Cons(s,_,ll) -> loop (cons (f s) acc) ll + in + loop Nil l + + let merge_int _ rb rb1 rb2 mark _ res1 res2 = + if rb then (vb mark) + ((vb rb1)*res1) + ((vb rb2)*res2) + else 0 + + let td_trans = Hashtbl.create 4096 + + let choose_jump tagset qtags1 qtagsn a f_nil f_text f_t1 f_s1 f_tn f_sn f_notext = + let tags1,hastext1,fin1 = inter_text tagset (tags a qtags1) in + let tagsn,hastextn,finn = inter_text tagset (tags a qtagsn) in +(* Format.fprintf Format.err_formatter "Tags below states "; + pr_st Format.err_formatter (Ptset.elements qtags1); + Format.fprintf Format.err_formatter " are { "; + Ptset.iter (fun t -> Format.fprintf Format.err_formatter "%s " (Tag.to_string t)) tags1; + Format.fprintf Format.err_formatter "}, %b,%b\n%!" hastext1 fin1; + + Format.fprintf Format.err_formatter "Tags below states "; + pr_st Format.err_formatter (Ptset.elements qtagsn); + Format.fprintf Format.err_formatter " are { "; + Ptset.iter (fun t -> Format.fprintf Format.err_formatter "%s " (Tag.to_string t)) tagsn; + Format.fprintf Format.err_formatter "}, %b,%b\n%!" hastextn finn; +*) + if (hastext1||hastextn) then f_text (* jumping to text nodes doesn't work really well *) + else if (Ptset.is_empty tags1) && (Ptset.is_empty tagsn) then f_nil + else if (Ptset.is_empty tagsn) then + if (Ptset.is_singleton tags1) then f_t1 (Ptset.choose tags1) (* TaggedChild/Sibling *) + else f_s1 tags1 (* SelectChild/Sibling *) + else if (Ptset.is_empty tags1) then + if (Ptset.is_singleton tagsn) then f_tn (Ptset.choose tagsn) (* TaggedDesc/Following *) + else f_sn tagsn (* SelectDesc/Following *) + else f_notext + + let choose_jump_down a b c d = + choose_jump a b c d + (Tree.mk_nil) + (Tree.text_below ) + (fun _ -> Tree.node_child ) (* !! no tagged_child in Tree.ml *) + (fun _ -> Tree.node_child ) (* !! no select_child in Tree.ml *) + (Tree.tagged_desc) + (fun _ -> Tree.node_child ) (* !! no select_desc *) + (Tree.node_child) + + let choose_jump_next a b c d = + choose_jump a b c d + (fun t _ -> Tree.mk_nil t) + (Tree.text_next) + (fun _ -> Tree.node_sibling_ctx) (* !! no tagged_sibling in Tree.ml *) + (fun _ -> Tree.node_sibling_ctx) (* !! no select_child in Tree.ml *) + (Tree.tagged_foll_below) + (fun _ -> Tree.node_sibling_ctx) (* !! no select_foll *) + (Tree.node_sibling_ctx) + + module type RS = sig + type t + type elt + val empty : t + val cons : elt -> t -> t + val concat : t -> t -> t + end + + + let get_trans slist tag a t = + try + Hashtbl.find td_trans (tag,hpl slist) + with + | Not_found -> + let fl_list,llist,rlist,ca,da,sa,fa = + fold_pl + (fun set _ (fll_acc,lllacc,rllacc,ca,da,sa,fa) -> (* For each set *) + let fl,ll,rr,ca,da,sa,fa = + Ptset.fold + (fun q acc -> + fst ( + List.fold_left + (fun (((fl_acc,ll_acc,rl_acc,c_acc,d_acc,s_acc,f_acc),h_acc) as acc) + (ts,(m,f,_)) -> + if (TagSet.mem tag ts) + then + let (child,desc,below),(sibl,foll,after) = f.st in + ((Formlist.cons q f h_acc m fl_acc, + Ptset.union ll_acc below, + Ptset.union rl_acc after, + Ptset.union child c_acc, + Ptset.union desc d_acc, + Ptset.union sibl s_acc, + Ptset.union foll f_acc), + HASHINT3(h_acc,f.fid,HASHINT2(q,vb m))) + else acc ) (acc,0) ( + try Hashtbl.find a.phi q + with + Not_found -> Printf.eprintf "Looking for state %i, doesn't exist!!!\n%!" + q;[] + )) + + ) set (Formlist.nil,Ptset.empty,Ptset.empty,ca,da,sa,fa) + in fl::fll_acc, cons ll lllacc, cons rr rllacc,ca,da,sa,fa) + slist ([],Nil,Nil,Ptset.empty,Ptset.empty,Ptset.empty,Ptset.empty) + in + (* Logic to chose the first and next function *) + let tags_below,tags_after = Tree.tags t tag in + let first = choose_jump_down tags_below ca da a + and next = choose_jump_next tags_after sa fa a in + let v = (fl_list,llist,rlist,first,next) in + Hashtbl.add td_trans (tag, hpl slist) v; v + + + let top_down ?(noright=false) a merge null t slist ctx slot_size = + let pempty = empty_size slot_size in + + let eval_fold2_slist fll sl1 sl2 res1 res2 t = + let res = Array.copy res1 in + let rec fold l1 l2 fll i aq = match l1,l2,fll with + | Cons(s1,_,ll1), Cons(s2, _ ,ll2),fl::fll -> + let r',rb,rb1,rb2,mark = eval_formlist s1 s2 fl in + let _ = res.(i) <- merge null rb rb1 rb2 mark t res1.(i) res2.(i) + in +(* let _ = Format.fprintf Format.err_formatter "(%b,%b,%b,%b) Result was %i %i, now %i\n%!" + rb rb1 rb2 mark (Obj.magic res1.(i)) (Obj.magic res2.(i)) (Obj.magic res.(i)); + in *) + + fold ll1 ll2 fll (i+1) (cons r' aq) + | Nil, Nil,[] -> aq,res + | _ -> assert false + in + fold sl1 sl2 fll 0 Nil + in + let rec loop t slist ctx = + if Tree.is_nil t then (pempty,Array.make slot_size null) + else + let tag = Tree.tag t in + let fl_list,llist,rlist,first,next = get_trans slist tag a t in + let sl1,res1 = loop (first t) llist t in + let sl2,res2 = if noright then (pempty,Array.make slot_size null) + else loop (next t ctx) rlist ctx in + eval_fold2_slist fl_list sl1 sl2 res1 res2 t + in + loop t slist ctx + + let run_top_down_count a t = + let init = cons a.init Nil in + let _,res = top_down a (fun _ rb rb1 rb2 mark t res1 res2 -> + (vb rb)*( (vb mark) + (vb rb1)*res1 + (vb rb2)*res2)) + 0 t init t 1 + in res.(0) + ;; + + let run_top_down a t = + let init = cons a.init Nil in + let _,res = + top_down a (fun null rb rb1 rb2 mark t res1 res2 -> + if rb then + TS.concat + (TS.concat (if mark then TS.Sing(t) else null) + (if rb1 then res1 else null)) + (if rb2 then res2 else null) + else null) + TS.Nil t init t 1 + in res.(0) + ;; + + + end +*) + module type ResultSet = + sig + type t + val empty : t + val cons : Tree.t -> t -> t + val concat : t -> t -> t + val iter : (Tree.t -> unit) -> t -> unit + val fold : (Tree.t -> 'a -> 'a) -> t -> 'a -> 'a + val map : (Tree.t -> Tree.t) -> t -> t + val length : t -> int + end + + module Integer : ResultSet = + struct + type t = int + let empty = 0 + let cons _ x = x+1 + let concat x y = x + y + let iter _ _ = failwith "iter not implemented" + let fold _ _ _ = failwith "fold not implemented" + let map _ _ = failwith "map not implemented" + let length x = x + end + + module IdSet : ResultSet = + struct + type node = Nil + | Cons of Tree.t * node + | Concat of node*node + + and t = { node : node; + length : int } + + let empty = { node = Nil; length = 0 } + + let cons e t = { node = Cons(e,t.node); length = t.length+1 } + let concat t1 t2 = { node = Concat(t1.node,t2.node); length = t1.length+t2.length } + let append e t = { node = Concat(t.node,Cons(e,Nil)); length = t.length+1 } + + + let fold f l acc = + let rec loop acc t = match t with + | Nil -> acc + | Cons (e,t) -> loop (f e acc) t + | Concat (t1,t2) -> loop (loop acc t1) t2 + in + loop acc l.node + + let length l = l.length + + + let iter f l = + let rec loop = function + | Nil -> () + | Cons (e,t) -> f e; loop t + | Concat(t1,t2) -> loop t1;loop t2 + in loop l.node + + let map f l = + let rec loop = function + | Nil -> Nil + | Cons(e,t) -> Cons(f e, loop t) + | Concat(t1,t2) -> Concat(loop t1,loop t2) + in + { l with node = loop l.node } + + end + module Run (RS : ResultSet) = + struct + module Formlist = + struct + type t = formlist + let nil : t = Nil + let cons q f i m l = Cons(q,f,i,m,l) + let hash = function Nil -> 0 | Cons(_,_,i,_,_) -> max_int land i + let pr fmt l = + let rec loop = function + | Nil -> () + | Cons(q,f,_,m,l) -> + Format.fprintf fmt "%i %s" q (if m then "=>" else "->"); + pr_frm fmt f; + Format.fprintf fmt "\n%!"; + loop l + in + loop l + end + + type ptset_list = Nil | Cons of Ptset.t*int*ptset_list + let hpl l = match l with + | Nil -> 0 + | Cons (_,i,_) -> i + let cons s l = Cons (s,(Ptset.hash s) + 65599 * (hpl l), l) + + let rec empty_size n = + if n == 0 then Nil + else cons Ptset.empty (empty_size (n-1)) + + let fold_pl f l acc = + let rec loop l acc = match l with + Nil -> acc + | Cons(s,h,pl) -> loop pl (f s h acc) + in + loop l acc + let map_pl f l = + let rec loop = + function Nil -> Nil + | Cons(s,h,ll) -> cons (f s) (loop ll) + in loop l + + let rev_pl l = + let rec loop acc l = match l with + | Nil -> acc + | Cons(s,_,ll) -> loop (cons s acc) ll + in + loop Nil l -(* - end + let rev_map_pl f l = + let rec loop acc l = + match l with + | Nil -> acc + | Cons(s,_,ll) -> loop (cons (f s) acc) ll + in + loop Nil l + + let td_trans = Hashtbl.create 4096 + + + let choose_jump tagset qtags1 qtagsn a f_nil f_text f_t1 f_s1 f_tn f_sn f_notext = + let tags1,hastext1,fin1 = inter_text tagset (tags a qtags1) in + let tagsn,hastextn,finn = inter_text tagset (tags a qtagsn) in +(* Format.fprintf Format.err_formatter "Tags below states "; + pr_st Format.err_formatter (Ptset.elements qtags1); + Format.fprintf Format.err_formatter " are { "; + Ptset.iter (fun t -> Format.fprintf Format.err_formatter "%s " (Tag.to_string t)) tags1; + Format.fprintf Format.err_formatter "}, %b,%b\n%!" hastext1 fin1; + + Format.fprintf Format.err_formatter "Tags below states "; + pr_st Format.err_formatter (Ptset.elements qtagsn); + Format.fprintf Format.err_formatter " are { "; + Ptset.iter (fun t -> Format.fprintf Format.err_formatter "%s " (Tag.to_string t)) tagsn; + Format.fprintf Format.err_formatter "}, %b,%b\n%!" hastextn finn; *) + if (hastext1||hastextn) then f_text (* jumping to text nodes doesn't work really well *) + else if (Ptset.is_empty tags1) && (Ptset.is_empty tagsn) then f_nil + else if (Ptset.is_empty tagsn) then + if (Ptset.is_singleton tags1) then f_t1 (Ptset.choose tags1) (* TaggedChild/Sibling *) + else f_s1 tags1 (* SelectChild/Sibling *) + else if (Ptset.is_empty tags1) then + if (Ptset.is_singleton tagsn) then f_tn (Ptset.choose tagsn) (* TaggedDesc/Following *) + else f_sn tagsn (* SelectDesc/Following *) + else f_notext + + let choose_jump_down a b c d = + choose_jump a b c d + (Tree.mk_nil) + (Tree.text_below) + (*fun x -> let i,j = Tree.doc_ids x in + let res = Tree.text_below x in + Printf.printf "Calling text_below %s (tag=%s), docids= (%i,%i), res=%s\n" + (Tree.dump_node x) (Tag.to_string (Tree.tag x)) i j (Tree.dump_node res); + res*) + (fun _ -> Tree.node_child ) (* !! no tagged_child in Tree.ml *) + (fun _ -> Tree.node_child ) (* !! no select_child in Tree.ml *) + (Tree.tagged_desc) + (fun _ -> Tree.node_child ) (* !! no select_desc *) + (Tree.node_child) + + let choose_jump_next a b c d = + choose_jump a b c d + (fun t _ -> Tree.mk_nil t) + (Tree.text_next) + (*fun x y -> let i,j = Tree.doc_ids x in + let res = Tree.text_next x y in + Printf.printf "Calling text_next %s (tag=%s) ctx=%s, docids= (%i,%i), res=%s\n" + (Tree.dump_node x) (Tag.to_string (Tree.tag x)) (Tree.dump_node y) i j (Tree.dump_node res); + res*) + + (fun _ -> Tree.node_sibling_ctx) (* !! no tagged_sibling in Tree.ml *) + (fun _ -> Tree.node_sibling_ctx) (* !! no select_child in Tree.ml *) + (Tree.tagged_foll_below) + (fun _ -> Tree.node_sibling_ctx) (* !! no select_foll *) + (Tree.node_sibling_ctx) + + + let get_trans slist tag a t = + try + Hashtbl.find td_trans (tag,hpl slist) + with + | Not_found -> + let fl_list,llist,rlist,ca,da,sa,fa = + fold_pl + (fun set _ (fll_acc,lllacc,rllacc,ca,da,sa,fa) -> (* For each set *) + let fl,ll,rr,ca,da,sa,fa = + Ptset.fold + (fun q acc -> + fst ( + List.fold_left + (fun (((fl_acc,ll_acc,rl_acc,c_acc,d_acc,s_acc,f_acc),h_acc) as acc) + (ts,(m,f,_)) -> + if (TagSet.mem tag ts) + then + let (child,desc,below),(sibl,foll,after) = f.st in + ((Formlist.cons q f h_acc m fl_acc, + Ptset.union ll_acc below, + Ptset.union rl_acc after, + Ptset.union child c_acc, + Ptset.union desc d_acc, + Ptset.union sibl s_acc, + Ptset.union foll f_acc), + HASHINT3(h_acc,f.fid,HASHINT2(q,vb m))) + else acc ) (acc,0) ( + try Hashtbl.find a.phi q + with + Not_found -> Printf.eprintf "Looking for state %i, doesn't exist!!!\n%!" + q;[] + )) + + ) set (Formlist.nil,Ptset.empty,Ptset.empty,ca,da,sa,fa) + in fl::fll_acc, cons ll lllacc, cons rr rllacc,ca,da,sa,fa) + slist ([],Nil,Nil,Ptset.empty,Ptset.empty,Ptset.empty,Ptset.empty) + in + (* Logic to chose the first and next function *) + let tags_below,tags_after = Tree.tags t tag in + let first = choose_jump_down tags_below ca da a + and next = choose_jump_next tags_after sa fa a in + let v = (fl_list,llist,rlist,first,next) in + Hashtbl.add td_trans (tag, hpl slist) v; v + + let merge rb rb1 rb2 mark t res1 res2 = + if rb + then + let res1 = if rb1 then res1 else RS.empty + and res2 = if rb2 then res2 else RS.empty + in + if mark then RS.cons t (RS.concat res1 res2) + else RS.concat res1 res2 + else RS.empty + + let top_down ?(noright=false) a t slist ctx slot_size = + let pempty = empty_size slot_size in + let eval_fold2_slist fll sl1 sl2 res1 res2 t = + let res = Array.copy res1 in + let rec fold l1 l2 fll i aq = match l1,l2,fll with + | Cons(s1,_,ll1), Cons(s2, _ ,ll2),fl::fll -> + let r',rb,rb1,rb2,mark = eval_formlist s1 s2 fl in + let _ = res.(i) <- merge rb rb1 rb2 mark t res1.(i) res2.(i) + in + fold ll1 ll2 fll (i+1) (cons r' aq) + | Nil, Nil,[] -> aq,res + | _ -> assert false + in + fold sl1 sl2 fll 0 Nil + in + let null_result() = (pempty,Array.make slot_size RS.empty) in + let rec loop t slist ctx = + if Tree.is_nil t then null_result() + else + let tag = Tree.tag t in + let fl_list,llist,rlist,first,next = get_trans slist tag a t in + let sl1,res1 = loop (first t) llist t in + let sl2,res2 = if noright then null_result() + else loop (next t ctx) rlist ctx in + eval_fold2_slist fl_list sl1 sl2 res1 res2 t + in + let loop_no_right t slist ctx = + if Tree.is_nil t then null_result() + else + let tag = Tree.tag t in + let fl_list,llist,rlist,first,next = get_trans slist tag a t in + let sl1,res1 = loop (first t) llist t in + let sl2,res2 = null_result() in + eval_fold2_slist fl_list sl1 sl2 res1 res2 t + in + (if noright then loop_no_right else loop) t slist ctx + + let run_top_down a t = + let init = cons a.init Nil in + let _,res = top_down a t init t 1 + in res.(0) + ;; + + module Configuration = + struct + module Ptss = Set.Make(Ptset) + module IMap = Map.Make(Ptset) + type t = { hash : int; + sets : Ptss.t; + results : RS.t IMap.t } + let empty = { hash = 0; + sets = Ptss.empty; + results = IMap.empty; + } + let is_empty c = Ptss.is_empty c.sets + let add c s r = + if Ptss.mem s c.sets then + { c with results = IMap.add s (RS.concat r (IMap.find s c.results)) c.results} + else + { hash = HASHINT2(c.hash,Ptset.hash s); + sets = Ptss.add s c.sets; + results = IMap.add s r c.results + } + + let pr fmt c = Format.fprintf fmt "{"; + Ptss.iter (fun s -> pr_st fmt (Ptset.elements s); + Format.fprintf fmt " ") c.sets; + Format.fprintf fmt "}\n%!"; + IMap.iter (fun k d -> + pr_st fmt (Ptset.elements k); + Format.fprintf fmt "-> %i\n" (RS.length d)) c.results; + Format.fprintf fmt "\n%!" + + let merge c1 c2 = + let acc1 = IMap.fold (fun s r acc -> + IMap.add s + (try + RS.concat r (IMap.find s acc) + with + | Not_found -> r) acc) c1.results IMap.empty + in + let imap = + IMap.fold (fun s r acc -> + IMap.add s + (try + RS.concat r (IMap.find s acc) + with + | Not_found -> r) acc) c2.results acc1 + in + let h,s = + Ptss.fold + (fun s (ah,ass) -> (HASHINT2(ah,Ptset.hash s), + Ptss.add s ass)) + (Ptss.union c1.sets c2.sets) (0,Ptss.empty) + in + { hash = h; + sets =s; + results = imap } + + end + let fmt = Format.err_formatter + let pr x = Format.fprintf fmt x + let h_fold = Hashtbl.create 511 + + let fold_f_conf t slist fl_list conf dir= + let rec loop sl fl acc = + match sl,fl with + |Nil,[] -> acc + | Cons(s,hs,sll), formlist::fll -> + let r',rb,rb1,rb2,mark = + try + Hashtbl.find h_fold (hs,Formlist.hash formlist,dir) + with + Not_found -> let res = + if dir then eval_formlist ~memo:false s Ptset.empty formlist + else eval_formlist ~memo:false Ptset.empty s formlist + in (Hashtbl.add h_fold (hs,Formlist.hash formlist,dir) res;res) + in(* + let _ = pr "Evaluating on set (%s) with tree %s=%s" + (if dir then "left" else "right") + (Tag.to_string (Tree.tag t)) + (Tree.dump_node t) ; + pr_st fmt (Ptset.elements s); + pr ", formualae (with hash %i): \n" (Formlist.hash formlist); + Formlist.pr fmt formlist; + pr "result is "; + pr_st fmt (Ptset.elements r'); + pr " %b %b %b %b \n%!" rb rb1 rb2 mark ; + in *) + if rb && ((dir&&rb1)|| ((not dir) && rb2)) + then + let acc = + let old_r = + try Configuration.IMap.find s conf.Configuration.results + with Not_found -> RS.empty + in + Configuration.add acc r' (if mark then RS.cons t old_r else old_r) + in + loop sll fll acc + else loop sll fll acc + | _ -> assert false + in + loop slist fl_list Configuration.empty + + let h_trans = Hashtbl.create 4096 + + let get_up_trans slist ptag a tree = + let key = (HASHINT2(hpl slist,Tag.hash ptag)) in + try + Hashtbl.find h_trans key + with + | Not_found -> + let f_list,_ = + Hashtbl.fold (fun q l acc -> + List.fold_left (fun (fl_acc,h_acc) (ts,(m,f,_)) -> + if TagSet.mem ptag ts + then (Formlist.cons q f h_acc m fl_acc, + HASHINT3(h_acc,f.fid,q)) + else (fl_acc,h_acc)) + acc l) + a.phi (Formlist.nil,0) + in + let res = fold_pl (fun _ _ acc -> f_list::acc) slist [] + in + (Hashtbl.add h_trans key res;res) + + + let rec bottom_up a tree conf next jump_fun root dotd init accu = + if (not dotd) && (Configuration.is_empty conf ) then + (* let _ = pr "Returning early from %s, with accu %i, next is %s\n%!" + (Tree.dump_node tree) (Obj.magic accu) (Tree.dump_node next) + in *) + accu,conf,next + else +(* let _ = + pr "Going bottom up for tree with tag %s configuration is" + (if Tree.is_nil tree then "###" else Tag.to_string (Tree.tag tree)); + Configuration.pr fmt conf + in *) + let below_right = Tree.is_below_right tree next in +(* let _ = Format.fprintf Format.err_formatter "below_right %s %s = %b\n%!" + (Tree.dump_node tree) (Tree.dump_node next) below_right + in *) + let accu,rightconf,next_of_next = + if below_right then (* jump to the next *) +(* let _ = pr "Jumping to %s\n%!" (Tree.dump_node next) in *) + bottom_up a next conf (jump_fun next) jump_fun (Tree.next_sibling tree) true init accu + else accu,Configuration.empty,next + in +(* let _ = if below_right then pr "Returning from jump to next\n" in *) + let sub = + if dotd then + if below_right then (* only recurse on the left subtree *) + (* let _ = pr "Topdown on subtree\n%!" in *) + prepare_topdown a tree true + else +(* let _ = pr "Topdown on whole tree\n%!" in *) + prepare_topdown a tree false + else conf + in + let conf,next = + (Configuration.merge rightconf sub, next_of_next) + in + if Tree.equal tree root then +(* let _ = pr "Stopping at root, configuration after topdown is:" ; + Configuration.pr fmt conf; + pr "\n%!" + in *) accu,conf,next + else + let parent = Tree.binary_parent tree in + let ptag = Tree.tag parent in + let dir = Tree.is_left tree in + let slist = Configuration.Ptss.fold (fun e a -> cons e a) conf.Configuration.sets Nil in + let fl_list = get_up_trans slist ptag a parent in + let slist = rev_pl (slist) in +(* let _ = pr "Current conf is : %i " (Tree.id tree); + Configuration.pr fmt conf; + pr "\n" + in *) + let newconf = fold_f_conf parent slist fl_list conf dir in +(* let _ = pr "New conf before pruning is (dir=%b):" dir; + Configuration.pr fmt newconf ; + pr "accu is %i\n" (RS.length accu); + in *) + let accu,newconf = Configuration.IMap.fold (fun s res (ar,nc) -> + if Ptset.intersect s init then + ( RS.concat res ar ,nc) + else (ar,Configuration.add nc s res)) + (newconf.Configuration.results) (accu,Configuration.empty) + in +(* let _ = pr "New conf after pruning is (dir=%b):" dir; + Configuration.pr fmt newconf ; + pr "accu is %i\n" (RS.length accu); + in *) + bottom_up a parent newconf next jump_fun root false init accu + + and prepare_topdown a t noright = +(* pr "Going top down on tree with tag %s\n%!" + (if Tree.is_nil t then "###" else (Tag.to_string(Tree.tag t))); *) + let r = cons a.states Nil in + let set,res = top_down (~noright:noright) a t r t 1 in + let set = match set with + | Cons(x,_,Nil) ->x + | _ -> assert false + in +(* pr "Result of topdown run is %!"; + pr_st fmt (Ptset.elements set); + pr ", number is %i\n%!" (RS.length res.(0)); *) + Configuration.add Configuration.empty set res.(0) + + + + let run_bottom_up_contains a t = + let trlist = Hashtbl.find a.phi (Ptset.choose a.init) + in + let init = List.fold_left + (fun acc (_,(_,f,_)) -> + Ptset.union acc (let (_,_,l) = fst (f.st) in l)) + Ptset.empty trlist + in + let tree1 = Tree.text_below t in + let jump_fun = fun tree -> Tree.text_next tree t in + let tree2 = jump_fun tree1 in + let rec loop tree next acc = +(* let _ = pr "\n_________________________\nNew iteration\n" in *) +(* let _ = pr "Jumping to %s\n%!" (Tree.dump_node tree) in *) + let acc,conf,next_of_next = bottom_up a tree + Configuration.empty next jump_fun (Tree.root tree) true init acc + in + (* let _ = pr "End of first iteration, conf is:\n%!"; + Configuration.pr fmt conf + in *) + let acc = Configuration.IMap.fold + ( fun s res acc -> if Ptset.intersect init s + then RS.concat res acc else acc) conf.Configuration.results acc + in + if Tree.is_nil next_of_next (*|| Tree.equal next next_of_next *)then + acc + else loop next_of_next (jump_fun next_of_next) acc + in + loop tree1 tree2 RS.empty + + + + + + + + + + + + + + end + + let top_down_count a t = let module RI = Run(Integer) in Integer.length (RI.run_top_down a t) + let top_down a t = let module RI = Run(IdSet) in (RI.run_top_down a t) + let bottom_up_count_contains a t = let module RI = Run(Integer) in Integer.length (RI.run_bottom_up_contains a t) + let bottom_up_count a t = failwith "not implemented" + diff --git a/ata.mli b/ata.mli index 4ec2f59..c56c22b 100644 --- a/ata.mli +++ b/ata.mli @@ -1,13 +1,3 @@ -module TS : sig - type t - val empty : t - val cons : Tree.t -> t -> t - val append : Tree.t -> t -> t - val concat : t -> t -> t - val length : t -> int - val iter : (Tree.t -> unit) -> t -> unit -end - type state = int val mk_state : unit -> state @@ -30,27 +20,20 @@ val pr_frm : Format.formatter -> formula -> unit module HTagSet : Hashtbl.S with type key = Ptset.t*Tag.t -type dispatch = { first : Tree.t -> Tree.t; - flabel : string; - next : Tree.t -> Tree.t -> Tree.t; - nlabel : string; - consres : Tree.t -> TS.t -> TS.t -> bool -> bool -> TS.t; - } -type formlist = Nil | Cons of state*formula*int*formlist - -type t = { - id : int; - mutable states : Ptset.t; - init : Ptset.t; - mutable final : Ptset.t; - universal : Ptset.t; - starstate : Ptset.t option; - (* Transitions of the Alternating automaton *) - phi : (state,(TagSet.t*(bool*formula*bool)) list) Hashtbl.t; - sigma : (dispatch*bool*formlist*Ptset.t*Ptset.t) HTagSet.t; +type 'a t = { + id : int; + mutable states : Ptset.t; + init : Ptset.t; + mutable final : Ptset.t; + universal : Ptset.t; + starstate : Ptset.t option; + (* Transitions of the Alternating automaton *) + phi : (state,(TagSet.t*(bool*formula*bool)) list) Hashtbl.t; + sigma : (int,('a t -> Tree.t -> Tree.t -> Ptset.t*'a)) Hashtbl.t; } -val dump : Format.formatter -> t -> unit + +val dump : Format.formatter -> 'a t -> unit module Transitions : sig type t = state*TagSet.t*bool*formula*bool @@ -68,11 +51,21 @@ type transition = Transitions.t val equal_trans : transition -> transition -> bool + module type ResultSet = + sig + type t + val empty : t + val cons : Tree.t -> t -> t + val concat : t -> t -> t + val iter : (Tree.t -> unit) -> t -> unit + val fold : (Tree.t -> 'a -> 'a) -> t -> 'a -> 'a + val map : (Tree.t -> Tree.t) -> t -> t + val length : t -> int + end -(*module BottomUpJumpNew : -sig *) - val run : t -> Tree.t -> TS.t*int - val run_count : t -> Tree.t -> int - val run_time : t -> Tree.t -> TS.t*int -(*end *) + module IdSet : ResultSet + val top_down_count : 'a t -> Tree.t -> int + val top_down : 'a t -> Tree.t -> IdSet.t + val bottom_up_count_contains : 'a t -> Tree.t -> int + val bottom_up_count : 'a t -> Tree.t -> Tag.t -> int diff --git a/compile.sh b/compile.sh deleted file mode 100755 index 2dda53b..0000000 --- a/compile.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -cd XMLTree/libcds -make clean -make -cd ../TextCollection -make clean -make -cd .. -make clean -make -cd .. -make clean -make -make timeXMLTree diff --git a/main.ml b/main.ml index 985a5b2..c8ed40e 100644 --- a/main.ml +++ b/main.ml @@ -13,7 +13,7 @@ let time f x = let t1 = Unix.gettimeofday () in let r = f x in let t2 = Unix.gettimeofday () in - let t = (1000. *.(t2 -. t1)) in + let t = (1000. *. (t2 -. t1)) in l:= t::!l; Printf.eprintf " %fms\n%!" t ; r @@ -35,41 +35,55 @@ let main v query output = in XPath.Ast.print Format.err_formatter query; Format.fprintf Format.err_formatter "\n%!"; -(* Printf.eprintf "Dummy iteration : "; - time (fill_hashtag) v; - Printf.eprintf "Dummy iteration (tag access cached) : "; - time (fill_hashtag) v; -*) Printf.eprintf "Compiling query : "; let auto,ltags,contains = time XPath.Compile.compile query in let _ = Ata.dump Format.err_formatter auto in let _ = Printf.eprintf "%!" in - let _ = match contains with - None -> () + + let do_contains = match contains with + None -> false | Some s -> let r = Tree.count v s in + Printf.eprintf "%i documents in the TextCollection\n" (Tree.text_size v); Printf.eprintf "Global count is %i, using " r; - if r < !Options.tc_threshold then begin - Printf.eprintf "TextCollection contains\nCalling global contains : "; - time (Tree.init_contains v) s + if r < !Options.tc_threshold then begin + Printf.eprintf "TextCollection contains\nTiming call to raw global contains (1st time): "; + time (Tree.unsorted_contains v) s; + Printf.eprintf "Calling global contains : "; + time (Tree.init_contains v) s; + Printf.eprintf "Timing call to global count contains : "; + let r = time (Tree.count_contains v) s + in + Printf.eprintf " number of matching nodes %i \n%!" r; + Printf.eprintf "Timing call to raw global contains (2nd time): "; + time (Tree.unsorted_contains v) s; end else begin Printf.eprintf "Naive contains\nCalling global contains : "; time (Tree.init_naive_contains v) s - end + end;true in - Printf.eprintf "Execution time %s : " (if !Options.count_only then "(counting only)" else ""); + Printf.eprintf "Execution time %s : " + (if !Options.count_only then "(counting only)" else if !Options.backward then "(bottomup)" else ""); begin let _ = Gc.full_major();Gc.compact() in let _ = Gc.set (disabled_gc) in - if !Options.count_only then - let r = time ( run_count auto )v in + if !Options.backward then + let tag,set = List.hd ltags in + let r = if do_contains + then time (bottom_up_count_contains auto) v + else time (bottom_up_count auto v) tag in + let _ = Printf.eprintf "Number of nodes in the result set : %i\n%!" r + in () + else + if !Options.count_only then + let r = time ( top_down_count auto ) v in (* not clean *) let _ = Printf.eprintf "Number of nodes in the result set : %i\n%!" r in () else - - let result,rcount = time (if !Options.time then run_time auto else run auto) v in + let result = time (top_down auto) v in + let rcount = IdSet.length result in Printf.eprintf "Number of nodes in the result set : %i\n" rcount; Printf.eprintf "\n%!"; begin @@ -80,9 +94,11 @@ let main v query output = time( fun () -> let oc = open_out f in output_string oc "\n"; - TS.iter (fun t -> output_string oc "----------\n"; - Tree.print_xml_fast oc t; - output_char oc '\n') result) (); + IdSet.iter (fun t -> + Tree.print_xml_fast oc t; + output_char oc '\n'; + output_string oc "----------\n"; + ) result) (); end; end; let _ = Gc.set enabled_gc in @@ -98,8 +114,8 @@ let v = begin Printf.eprintf "Loading from file : "; time (Tree.load ~sample:!Options.sample_factor ) - (Filename.chop_suffix !Options.input_file ".srx"); - end + (Filename.chop_suffix !Options.input_file ".srx"); + end else let v = time (fun () -> let v = Tree.parse_xml_uri !Options.input_file; diff --git a/myTimeXMLTree.cpp b/myTimeXMLTree.cpp deleted file mode 100644 index e7b97b2..0000000 --- a/myTimeXMLTree.cpp +++ /dev/null @@ -1,339 +0,0 @@ -#include "XMLDocShredder.h" -#include "XMLTree.h" -#include "Utils.h" -#include -#include -#include -#include -#include - -using namespace std; - -/* Time meassuring */ -double ticks= (double)sysconf(_SC_CLK_TCK); -struct tms t1,t2; - -void start_clock() { - times (&t1); -} - - -double stop_clock() { - times (&t2); - return (t2.tms_utime-t1.tms_utime)/ticks; -} - - -/* end Time meassuring */ - -void printStats(double time, string fname, uint queries) { - cout.width(15); - cout << std::left << fname; - cout << " : "; - cout.width(8); - cout << std::right << queries << " calls, "; - cout.width(8); - cout << std::right << time << "ms, mean: "; - cout.width(8); - cout << std::right << time/queries << endl; -} - - -#define STATS1(fname,vect) {\ - start_clock(); \ - uint q = 0; \ - while(q<(vect).size()) \ - { \ - acc += tree->fname((vect)[q]); \ - q++; \ - } \ - double t = 1000.0*stop_clock(); \ - printStats(t,#fname,(vect).size()); \ - } - -#define STATS1p(fname,vect) {\ - start_clock(); \ - uint q = 0; \ - while(q<(vect).size()) \ - { \ - acc += tree->fname((vect)[q]).min; \ - q++; \ - } \ - double t = 1000.0*stop_clock(); \ - printStats(t,#fname,(vect).size()); \ - } - -#define STATS2(fname,vect) {\ - start_clock(); \ - uint q = 0; \ - while(q<(vect).size()) \ - { \ - acc += tree->fname((vect)[q].first,(vect)[q].second); \ - q++; \ - } \ - double t = 1000.0*stop_clock(); \ - printStats(t,#fname,(vect).size()); \ - cout.flush();\ - } - -TagType target_tag = -1; -vector treenodeQueries; -vector > treenodetagQueries; -vector docidQueries; - -uint acc = 0; - -void runQueries(XMLTree * tree) { - STATS1(Tag,treenodeQueries); - STATS1(Parent,treenodeQueries); - STATS1p(DocIds,treenodeQueries); - STATS1(MyText,treenodeQueries); - STATS1(PrevText,treenodeQueries); - STATS1(NextText,treenodeQueries); - STATS1(FirstChild,treenodeQueries); - STATS1(NextSibling,treenodeQueries); - STATS1(ParentNode,docidQueries); - STATS1(PrevNode,docidQueries); - STATS2(TaggedDesc,treenodetagQueries); - STATS2(TaggedFoll,treenodetagQueries); -} - - -void fill_queries(XMLTree * tree, treeNode node,unsigned char* targettagname) { - treeNode res1,res2; - TagType tag; - DocID id1,id2,id3; - queue q; - q.push(node); - while(!q.empty()) { - node = q.front(); - q.pop(); - if (node != NULLT) { - tag = tree->Tag(node); - if (target_tag == -1) { - const unsigned char * tagname; - tagname = tree->GetTagNameByRef(tag); - if (strcmp( (char*) tagname, (char*) targettagname) == 0) - target_tag = tag; - } - treenodeQueries.push_back(node); - treenodetagQueries.push_back(pair(node,tag)); - id1 = tree->MyText(node); - id2 = tree->PrevText(node); - id3 = tree->NextText(node); - id1 = max(id1, max(id2,id3)); - docidQueries.push_back(id1); - res1 = tree->FirstChild(node); - res2 = tree->NextSibling(node); - q.push(res1); - q.push(res2); - } - } -} - - -vector traversalQueries; -uint cFullTraversal = 0; - -void traversal_time(XMLTree * tree) { - start_clock(); - uint q = 0; - while(qFirstChild(node); - acc += tree->NextSibling(node); - q++; - } - double t = 1000.0*stop_clock(); - printStats(t,"FullTraversal",traversalQueries.size()); -} - - -unsigned int traversal(XMLTree *tree,treeNode node) { - uint ret = 0; - TagType tag; - queue q; - q.push(node); - while(!q.empty()) { - node = q.front(); - q.pop(); - if (node != NULLT) { - cFullTraversal++; - tag = tree->Tag(node); - if (tag == target_tag) - ret++; - treeNode t1 = tree->FirstChild(node); - q.push(tree->FirstChild(node)); - treeNode t2 = tree->NextSibling(node); - q.push(tree->NextSibling(node)); - if(t1!=NULLT) - traversalQueries.push_back(t1); - if(t2!=NULLT) - traversalQueries.push_back(t2); - } - } - return ret; -} - - -vector > jumpQueries; -uint cJumpTraversal = 0; - -void jump_time(XMLTree * tree) { - start_clock(); - uint q = 0; - while(qTaggedDesc(node,target_tag); - acc += tree->TaggedFollBelow(node,target_tag,root); - q++; - } - double t = 1000.0*stop_clock(); - printStats(t,"JumpTraversal",jumpQueries.size()); -} - - -/* This simulates the run function of the jumping automata*/ -unsigned int jump_traversal(XMLTree* tree, treeNode node,treeNode root) { - uint ret = 0; - TagType tag; - queue > q; - q.push(pair(node,root)); - while(!q.empty()) { - pair p = q.front(); - q.pop(); - node = p.first; - root = p.second; - if (node != NULLT) { - cJumpTraversal++; - tag = tree->Tag(node); - if (tag == target_tag) - ret++; - pair p1(tree->TaggedDesc(node,target_tag),node); - pair p2(tree->TaggedFollBelow(node,target_tag,root),root); - if(p1.first!=NULLT) - jumpQueries.push_back(p1); - if(p2.first!=NULLT) - jumpQueries.push_back(p2); - q.push(p1); - q.push(p2); - } - } - return ret; -} - - -int usage(char ** argv) { - std::cout << "usage : " << argv[0] << " [-d] [-s] file.{xml,.srx} tagname\n"; - return 1; -} - - -int main(int argc, char ** argv) { - unsigned int count1,count2; - unsigned char * tagname; - string arg,filename,ext; - bool disable_tc = false; - bool save = false; - bool srx; - XMLTree * tree; - - int i = 1; - if ( i >= argc) - return usage(argv); - - arg = argv[i]; - if (arg.compare("-d") == 0) { - disable_tc = true; - i++; - if ( i >= argc) - return usage(argv); - arg = argv[i]; - } - - if (arg.compare("-s") == 0) { - save = true; - i++; - if ( i >= argc) - return usage(argv); - arg = argv[i]; - } - - // The filename - if (arg.size() < 4) - return usage(argv); - - ext=(arg.substr(arg.size()-4,4)); - if (ext.compare(".srx") == 0) { - // must truncate - filename = arg.substr(0,arg.size()-4); - srx = true; - } - else if (ext.compare(".xml")==0) { - filename = arg; - srx = false; - } - else - return usage(argv); - i++; - - if (i >= argc) - return usage(argv); - - tagname = (unsigned char*) argv[i]; - - if (srx) - // The samplerate is not taken into account for loading anymore - tree = XMLTree::Load((unsigned char*) filename.c_str(),64); - else { - try - { - //filename, sampling factor, index empty texts, disable tc - XMLDocShredder shredder(filename.c_str(),64,false,disable_tc); - shredder.processStartDocument(""); - shredder.parse(); - shredder.processEndDocument(); - tree = (XMLTree *) shredder.storageIfc_->returnDocument(); - if (save) { - filename = filename.substr(0,filename.size()-4).append(".srx"); - struct stat stats; - int exists = stat(filename.c_str(),&stats); - if(exists == 0) { - std::cout << "Warning : indexed file " << filename << " exists, not overwriting\n"; - } - else { - tree->Save((unsigned char*) filename.substr(0,filename.size()-4).c_str()); - } - - } - } - catch (const std::exception& e) { - cout << "Error during parsing : " << e.what() << "\n"; - return 2; - } - } - - fill_queries(tree,tree->Root(),tagname); - runQueries(tree); - - if (target_tag == -1) { - cout << "Warning: tag " << tagname << " was not found in the document!\n" - << "Warning: not timing traversal and jumping functions\n"; - return 0; - } - - count1 = traversal(tree,tree->Root()); - count2 = jump_traversal(tree,tree->Root(),tree->Root()); - - cout << endl << endl; - cout << "Full traversal found " << count1 << " '" << tagname << "' nodes, " - << cFullTraversal << " function calls." << endl; - traversal_time(tree); - cout << endl << endl; - cout << "Jump traversal found " << count2 << " '" << tagname << "' nodes, " - << cJumpTraversal << " function calls." << endl; - jump_time(tree); - - return 0; -} diff --git a/options.ml b/options.ml index 43f665f..6698921 100644 --- a/options.ml +++ b/options.ml @@ -9,6 +9,7 @@ let output_file = ref None let save_file = ref "" let count_only = ref false let time = ref false +let backward = ref false let usage_msg = Printf.sprintf "%s 'query' [output]" Sys.argv.(0) @@ -28,6 +29,7 @@ let spec = [ "-c", Arg.Set(count_only), "counting only (don't materialize the re "-i", Arg.Set(index_empty_texts), "index empty texts [default=false]"; "-d", Arg.Set(disable_text_collection), "disable text collection[default=false]"; "-s", Arg.Set_string(save_file), "save the intermediate representation into file.srx"; + "-b", Arg.Set(backward), "real bottom up run"; ] let parse_cmdline() = diff --git a/options.mli b/options.mli index a7d870d..3d9eda2 100644 --- a/options.mli +++ b/options.mli @@ -9,3 +9,4 @@ val output_file : string option ref val save_file : string ref val time : bool ref val tc_threshold : int ref +val backward : bool ref diff --git a/ptset.mli b/ptset.mli index 47c28ba..0d29da8 100644 --- a/ptset.mli +++ b/ptset.mli @@ -92,4 +92,3 @@ val from_list : int list -> t type int_vector val to_int_vector : t -> int_vector - diff --git a/testXMLTree.cpp b/testXMLTree.cpp deleted file mode 100644 index 3e98c8b..0000000 --- a/testXMLTree.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include "XMLDocShredder.h" -#include "XMLTree.h" -#include "Utils.h" - - -void print_structure(XMLTree* tree, treeNode x){ - DocID text; - if (x != NULLT){ - std::cout << "Par idx: " << x << ", preorder: " << - tree->NodeXMLId(x) << ", tag='" << tree->GetTagName(tree->Tag(x)) - << "'\n"; - text = tree->PrevText(x); - std::cout << "PrevText(" << x << ")= " << text - << ", value='" << ((text == NULLT) ? "" : (const char*)tree->GetText(text)) - << "'\n"; - text = tree->MyText(x); - std::cout << "MyText(" << x << ")= " << text - << ", value='" << ((text == NULLT) ? "" : (const char*)tree->GetText(text)) - << "'\n"; - text = tree->NextText(x); - std::cout << "NextText(" << x << ")= " << text - << ", value='" << ((text == NULLT) ? "" : (const char*)tree->GetText(text)) - << "'\n"; - print_structure(tree,tree->FirstChild(x)); - print_structure(tree,tree->NextSibling(x)); - }; -} - -int main(int argc, char** argv){ - XMLTree * tree; - if (argc != 2){ - std::cout << "Usage " << argv[0] << " filename.xml" << std::endl; - return 1; - }; - - XMLDocShredder shredder(argv[1],64,false,false); - shredder.processStartDocument(""); - shredder.parse(); - shredder.processEndDocument(); - tree = (XMLTree *) shredder.storageIfc_->returnDocument(); - print_structure(tree,tree->Root()); - return 0; -} diff --git a/tests/test.xml b/tests/test.xml index 51167ad..f695c0c 100644 --- a/tests/test.xml +++ b/tests/test.xml @@ -1,2 +1,2 @@ -T0T1T2T3T4T5T6 +1423 diff --git a/timeXMLTree.cpp b/timeXMLTree.cpp index 399c919..49fd56e 100644 --- a/timeXMLTree.cpp +++ b/timeXMLTree.cpp @@ -315,7 +315,7 @@ int main(int argc, char ** argv){ STARTTIMER(); count1 = time_traversal(tree,tree->Root()); STOPTIMER(FullTraversal); - STARTTIMER(); + count2 = time_jump(tree,tree->Root(),tree->Root()); STOPTIMER(JumpTraversal); diff --git a/tree.ml b/tree.ml index 74903de..889dd98 100644 --- a/tree.ml +++ b/tree.ml @@ -21,34 +21,32 @@ external parse_xml_uri : string -> int -> bool -> bool -> tree = "caml_call_shre external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string" external save_tree : tree -> string -> unit = "caml_xml_tree_save" -external load_tree : string -> int -> tree = "caml_xml_tree_load" +external load_tree : string -> int -> tree = "caml_xml_tree_load" external nullt : unit -> 'a node = "caml_xml_tree_nullt" let nil : 'a node = Obj.magic (-1) -external text_get_text : tree -> [`Text] node -> string = "caml_text_collection_get_text" +external text_get_tc_text : tree -> [`Text] node -> string = "caml_text_collection_get_text" external text_is_empty : tree -> [`Text ] node -> bool = "caml_text_collection_empty_text" let text_is_empty t n = (equal_node nil n) || text_is_empty t n -external get_cached_text : tree -> [`Text ] node -> string = "caml_text_collection_get_cached_text" - - -let text_get_text t n = - if equal_node nil n then "" - else get_cached_text t n -external text_size : tree -> int = "caml_text_collection_size" + external text_is_contains : tree -> string -> bool = "caml_text_collection_is_contains" external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains" external text_count : tree -> string -> int = "caml_text_collection_count" external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains" +external text_unsorted_contains : tree -> string -> unit = "caml_text_collection_unsorted_contains" +external get_cached_text : tree -> [`Text] node -> string = "caml_text_collection_get_cached_text" +let get_cached_text t x = + if x == -1 then "" + else get_cached_text t x - -external tree_serialize : tree -> string -> unit = "caml_xml_tree_serialize" +external tree_serialize : tree -> string -> unit = "caml_xml_tree_serialize" external tree_unserialize : string -> tree = "caml_xml_tree_unserialize" @@ -63,9 +61,12 @@ external tree_first_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tre external tree_next_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" external tree_prev_sibling : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" external tree_is_leaf : tree -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" - +external tree_last_child : tree -> [`Tree] node -> [`Tree] node = "caml_xml_tree_last_child" +external tree_is_first_child : tree -> [`Tree] node -> bool = "caml_xml_tree_is_first_child" + (* external tag : tree -> [`Tree ] node -> T = "caml_xml_tree_tag"*) external tree_tag_id : tree -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" + let tree_is_last t n = equal_node nil (tree_next_sibling t n) @@ -74,6 +75,9 @@ external tree_prev_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree external tree_my_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" external tree_next_text : tree -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" external tree_doc_ids : tree -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids" + +let text_size tree = int_of_node (snd ( tree_doc_ids tree (Obj.magic 0) )) + external tree_text_xml_id : tree -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" external tree_node_xml_id : tree -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" external tree_is_ancestor : tree -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" @@ -93,10 +97,9 @@ type descr = type t = { doc : tree; node : descr; - ttable : (Tag.t,(Ptset.t*Ptset.t)) Hashtbl.t; + ttable : (Tag.t,(Ptset.t*Ptset.t)) Hashtbl.t; } - let update h t sb sa = let sbelow,safter = try @@ -108,7 +111,7 @@ let update h t sb sa = - +let text_size t = text_size t.doc let collect_tags tree = let h = Hashtbl.create 511 in @@ -128,14 +131,26 @@ let collect_tags tree = + + let contains_array = ref [| |] - +let contains_index = Hashtbl.create 4096 +let in_array _ i = + try + Hashtbl.find contains_index i + with + Not_found -> false + let init_contains t s = let a = text_contains t.doc s in Array.fast_sort (compare) a; - contains_array := a + contains_array := a; + Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array +let count_contains t s = text_count_contains t.doc s +let unsorted_contains t s = text_unsorted_contains t.doc s + let init_naive_contains t s = let i,j = tree_doc_ids t.doc (tree_root t.doc) in @@ -149,7 +164,7 @@ let init_naive_contains t s = let rec loop n acc l = if n >= j then acc,l else - let s = text_get_text t.doc n + let s = get_cached_text t.doc n in if matching s then loop (n+1) (n::acc) (l+1) @@ -172,7 +187,7 @@ let is_nil t = t.node == Nil let is_node t = t.node != Nil -let node_of_t t = +let node_of_t t = let _ = Tag.init (Obj.magic t) in let table = collect_tags t in @@ -184,27 +199,32 @@ let node_of_t t = Ptset.iter (fun i -> Printf.eprintf "'%s' " (Tag.to_string i)) sa; Printf.eprintf "} \n----------------------------------\n"; ) table in -*) + let i,j = tree_doc_ids t (tree_root t) in + Printf.eprintf "%i docs, range from %i to %i\n%!" (Array.length s) i j; + Array.iter (fun i -> print_endline (">>>" ^ i ^ "<<<")) s; *) { doc= t; node = Node(tree_root t); ttable = table; } +let finalize _ = Printf.eprintf "Release the string list !\n%!" +;; + +let parse f str = + node_of_t + (f str + !Options.sample_factor + !Options.index_empty_texts + !Options.disable_text_collection) + +let parse_xml_uri str = parse parse_xml_uri str +let parse_xml_string str = parse parse_xml_string str - -let parse_xml_uri str = node_of_t - (parse_xml_uri str - !Options.sample_factor - !Options.index_empty_texts - !Options.disable_text_collection) - -let parse_xml_string str = node_of_t - (parse_xml_string str - !Options.sample_factor - !Options.index_empty_texts - !Options.disable_text_collection) external pool : tree -> Tag.pool = "%identity" -let save t str = save_tree t.doc str + +let save t str = (save_tree t.doc str) +;; + let load ?(sample=64) str = node_of_t (load_tree str sample) @@ -232,6 +252,8 @@ let nts = function | Text (i,j) -> Printf.sprintf "Text (%i, %i)" i j | Node (i) -> Printf.sprintf "Node (%i)" i +let dump_node t = nts t.node + let mk_nil t = { t with node = Nil } let root n = { n with node = norm (tree_root n.doc) } @@ -239,6 +261,27 @@ let is_root n = match n.node with | Node(t) -> (int_of_node t) == 0 | _ -> false +let is_left n = match n.node with + | Node(t) -> (tree_is_first_child n.doc t) && (equal_node nil (tree_prev_text n.doc t)) + | Text(_,t) -> tree_is_nil t || tree_is_first_child n.doc t + | _ -> false + +let is_below_right t1 t2 = + match (t1.node,t2.node) with + | Nil,_ | _,Nil -> false + | Node(i1), Node(i2) -> + tree_is_ancestor t1.doc (tree_parent t1.doc i1) i2 + && not (tree_is_ancestor t1.doc i1 i2) + | Text(_,i1),Node(i2) -> i1 == i2 || + (tree_is_ancestor t1.doc (tree_parent t1.doc i1) i2 && i1 < i2) + | Text(_,i1),Text(i,_) -> + let x,y = tree_doc_ids t1.doc i1 in + i >= x && i <= y + | Node(i1), Text(i,_) -> + let i2 = tree_next_sibling t1.doc i1 in + let x,y = tree_doc_ids t1.doc i2 in + i >= x && i <= y + let parent n = let node' = match n.node with (* inlined parent *) @@ -488,32 +531,34 @@ let array_find a i j = let text_below t = let l = Array.length !contains_array in - if l = 0 then { t with node=Nil } - else match t.node with - | Node(n) -> + | Node(n) -> let i,j = tree_doc_ids t.doc n in - let id = array_find !contains_array i j + let id = if l == 0 then i else (array_find !contains_array i j) in +(* Printf.printf "Looking for text below node %i with tag %s in range %i %i, in array : [|\n%!" + n (Tag.to_string (tree_tag_id t.doc n)) i j; + Array.iter (fun i -> Printf.printf "%i " (int_of_node i )) !contains_array; + Printf.printf "|]\nResult is %i\n%!" id; *) if id == nil then { t with node=Nil } else { t with node = Text(id, tree_next_sibling t.doc (tree_prev_doc t.doc id)) } - | _ -> { t with node = Nil } + | _ -> (*Printf.printf "Here\n%!"; *) + { t with node = Nil } let text_next t root = let l = Array.length !contains_array in - if l = 0 then { t with node=Nil } - else let inf = match t.node with - | Node(n) -> snd(tree_doc_ids t.doc n)+1 + | Node(n) -> snd(tree_doc_ids t.doc n)+1 | Text(i,_) -> i+1 | _ -> assert false in match root.node with | Node (n) -> - let _,j = tree_doc_ids t.doc n in - let id = array_find !contains_array inf j + let _,j = tree_doc_ids t.doc n in + let id = if l == 0 then if inf > j then nil else inf + else array_find !contains_array inf j in if id == nil then { t with node= Nil } else @@ -632,7 +677,7 @@ let text_next t root = let rec loop ?(print_right=true) t = match t.node with | Nil -> () - | Text(i,n) -> output_string outc (text_get_text t.doc i); + | Text(i,n) -> output_string outc (get_cached_text t.doc i); if print_right then loop (right t) | Node (n) -> @@ -666,7 +711,7 @@ let text_next t root = | Node(_) -> let value = match (left a).node with - | Text(i,_) -> text_get_text a.doc i + | Text(i,_) -> (get_cached_text a.doc i) | _ -> assert false in output_char outc ' '; @@ -693,3 +738,91 @@ let tags_after t tag = snd(Hashtbl.find t.ttable tag) let tags t tag = Hashtbl.find t.ttable tag + +let tagged_lowest t tag = + let rec loop_lowest i = + let j = tree_tagged_desc t.doc i tag in + if tree_is_nil j then i else loop_lowest j + in + match t.node with + | Node i -> + let j = loop_lowest i in + { t with + node = norm( + if tree_is_nil j then + if (tree_tag_id t.doc i) == tag + then i + else j + else j) } + | Nil -> t + | _ -> assert false + + +let tagged_next t tag = + match t.node with + | Node(i) -> + let n = tree_tagged_foll_below t.doc i tag (Obj.magic 0) + in + if tree_is_nil n then mk_nil t + else + tagged_lowest { t with node = Node n } tag + | Nil -> t + | _ -> assert false + +let rec binary_parent t = + let res = + match t.node with + | Node(0) -> { t with node = Nil } + | Node(i) -> + let j = tree_prev_sibling t.doc i in + if tree_is_nil j then + let idoc = tree_prev_text t.doc i in + if equal_node nil idoc then + { t with node = Node (tree_parent t.doc i) } + else + { t with node = Text(idoc,i) } + else + let idoc = tree_prev_text t.doc i in + if equal_node nil idoc then + { t with node = Node (j) } + else { t with node = Text(idoc,i) } + | Text(d,i) -> + if tree_is_nil i then + let n = tree_parent_doc t.doc d in + let lc = tree_last_child t.doc n in + if tree_is_nil lc then {t with node = Node n } + else { t with node = Node lc } + else + let j = tree_prev_sibling t.doc i in + if tree_is_nil j then + { t with node = Node (tree_parent t.doc i) } + else { t with node = Node j } + | Nil -> t + in match res.node with + | Text(idoc,t) -> + if (Array.length !contains_array) != 0 + then if in_array !contains_array idoc then res + else binary_parent res + else res + | _ -> res + +let benchmark_text t = + let doc = t.doc in + match (root t).node with + | Node i -> let _,size = tree_doc_ids doc i in + Printf.eprintf "%i will take ~ %i seconds\n%!" + size (size/10000) ; + let a = Array.create size "" in + for i = 0 to size + do + a.(i) <- text_get_tc_text t.doc (i+1) + done; a + | _ -> assert false + +let doc_ids (t:t) : (int*int) = + (Obj.magic ( + match t.node with + | Node i -> tree_doc_ids t.doc i + | Text (i,_) -> (i,i) + | Nil -> (nil,nil) + )) diff --git a/tree.mli b/tree.mli index d0a4f5a..75fb8fd 100644 --- a/tree.mli +++ b/tree.mli @@ -3,6 +3,7 @@ val init_contains : t -> string -> unit val init_naive_contains : t -> string -> unit val is_nil : t -> bool val is_node : t -> bool +val dump_node : t -> string val parse_xml_uri : string -> t val parse_xml_string : string -> t val save : t -> string -> unit @@ -37,3 +38,13 @@ val node_sibling_ctx : t -> t -> t val tags_below : t -> Tag.t -> Ptset.t val tags_after : t -> Tag.t -> Ptset.t val tags : t -> Tag.t -> Ptset.t*Ptset.t +val is_below_right : t -> t -> bool +val is_left : t -> bool +val tagged_lowest : t -> Tag.t -> t +val tagged_next : t -> Tag.t -> t +val binary_parent : t -> t +val benchmark_text : t -> string array +val count_contains : t -> string -> int +val unsorted_contains : t -> string -> unit +val text_size : t -> int +val doc_ids : t -> int*int diff --git a/xPath.ml b/xPath.ml index 27479be..3fbfacf 100644 --- a/xPath.ml +++ b/xPath.ml @@ -328,8 +328,9 @@ let rec compile_step ?(existential=false) conf q_src dir ctx_path nrec step num in let new_st,new_dst, new_ctx = match axis with - | Child | Descendant -> + if (TagSet.is_finite test) + then conf.entry_points <- (TagSet.choose test,Ptset.singleton q_src)::conf.entry_points; let left,right = if nrec then `LLeft,`RRight else `Left,`Right @@ -549,12 +550,12 @@ let compile path = let s = Ptset.union anc_st (Ptset.from_list []) in if has_backward then Ptset.add config.st_from_root s else s in { Ata.id = Oo.id (object end); - Ata.states = if has_backward then Ptset.add config.st_from_root a_st else a_st; + Ata.states = Hashtbl.fold (fun q _ acc -> Ptset.add q acc) phi Ptset.empty; Ata.init = Ptset.singleton config.st_root; Ata.final = Ptset.union anc_st config.final_state; Ata.universal = Ptset.add a_dst (Ptset.from_list config.univ_states); Ata.phi = phi; - Ata.sigma = Ata.HTagSet.create 17; + Ata.sigma = Hashtbl.create 17; Ata.starstate = config.starstate; },config.entry_points,!contains diff --git a/xPath.mli b/xPath.mli index 23235d7..3ecf868 100644 --- a/xPath.mli +++ b/xPath.mli @@ -35,5 +35,5 @@ sig end module Compile : sig -val compile : Ast.path -> Ata.t * (Tag.t*Ptset.t) list * string option +val compile : Ast.path -> 'a Ata.t * (Tag.t*Ptset.t) list * string option end -- 2.17.1