X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=tree.ml;h=487a057b30ad73301d11d3c9bef22f918df09c5c;hb=92455238a637876bec18bfdaed4f5342f4cbbd1f;hp=780791a914e0463e31f8b70f80da940c519c03e0;hpb=4a66518948bf6356b5cb72ba30b4d48a8c9e833a;p=SXSI%2Fxpathcomp.git diff --git a/tree.ml b/tree.ml index 780791a..487a057 100644 --- a/tree.ml +++ b/tree.ml @@ -48,9 +48,23 @@ sig val has_tagged_foll : t -> Tag.t -> bool val tagged_desc : t -> Tag.t -> t val tagged_foll : t -> Tag.t -> t - val tagged_next : t -> Tag.t -> t + val tagged_below : t -> Ptset.t -> Ptset.t -> t + val tagged_next : t -> Ptset.t -> Ptset.t -> t -> t + val tagged_desc_only : t -> Ptset.t -> t + val tagged_foll_only : t -> Ptset.t -> t -> t + val text_below : t -> t + val text_next : t -> t -> t + val init_tagged_next : t -> Tag.t -> unit val subtree_tags : t -> Tag.t -> int val is_left : t -> bool + val print_id : Format.formatter -> t -> unit + val test_xml_tree : Format.formatter -> Ptset.t -> t -> unit + val init_contains : t -> string -> unit + val init_naive_contains : t -> string -> unit + val mk_nil : t -> t + val test_jump : t -> Tag.t -> unit + val time_xml_tree : t -> Tag.t -> int list + val time_xml_tree2 : t -> Tag.t -> int list end module XML = @@ -89,6 +103,7 @@ struct *) external is_empty : t -> [`Text ] node -> bool = "caml_text_collection_empty_text" + let is_empty t n = (equal nil n) || is_empty t n @@ -96,7 +111,7 @@ struct let get_text t n = - if (equal nil n) || is_empty t n then "" + if equal nil n then "" else get_cached_text t n external size : t -> int = "caml_text_collection_size" @@ -122,30 +137,29 @@ struct external parent : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent" external parent_doc : t -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" + external prev_doc : t -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_prev_doc" external first_child : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" external next_sibling : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" - + external prev_sibling : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" external is_leaf : t -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" (* external tag : t -> [`Tree ] node -> T = "caml_xml_tree_tag"*) external tag_id : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" (* - let tag_hash = Hashtbl.create 4097 + let tag_hash = Array.make 6_000_000 (Tag.nullt) - let tag_id t id = - try - Hashtbl.find tag_hash id - with - | Not_found -> + let tag_id t id = + let tag = tag_hash.(int_of_node id) + in + if tag != Tag.nullt then tag + else let tag = tag_id t id in - Hashtbl.add tag_hash id tag;tag + (tag_hash.(int_of_node id) <- tag; tag) *) - - let is_last t n = equal nil (next_sibling t n) external prev_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" @@ -153,14 +167,118 @@ struct external my_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" external next_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" - + external doc_ids : t -> [`Tree ] node -> [`Text ] node * [`Text ] node = "caml_xml_tree_doc_ids" external text_xml_id : t -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" external node_xml_id : t -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" external is_ancestor : t -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" external tagged_desc : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" external tagged_foll : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_foll" - external tagged_next : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_next" external subtree_tags : t -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" + external tagged_below : t -> [`Tree ] node -> Ptset.int_vector -> Ptset.int_vector -> [`Tree ] node = "caml_xml_tree_tagged_below" + external tagged_desc_only : t -> [`Tree ] node -> Ptset.int_vector -> [`Tree ] node = "caml_xml_tree_tagged_desc_only" + external tagged_next : t -> [`Tree ] node -> Ptset.int_vector -> Ptset.int_vector -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_next" + external tagged_foll_only : t -> [`Tree ] node -> Ptset.int_vector -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_only" + external tagged_desc_or_foll_only : t -> [`Tree ] node -> Ptset.int_vector -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_only" + external tagged_foll_below : t -> [`Tree ] node -> Tag.t -> [`Tree ] node -> [`Tree ] node = "caml_xml_tree_tagged_foll_below" + + let test_jump tree tag = + let rec loop id ctx = + if id != nil + then + let first = tagged_desc tree id tag + and next = tagged_desc tree id tag + in + loop first id; + loop next ctx + in + loop (root tree) (root tree) + + + let test_xml_tree ppf tags v = + let pr x = Format.fprintf ppf x in + let rec aux id = + if (is_nil id) + then () + else + begin + pr "Node %i, (Tag) %i='%s' (GetTagName), NodeXMLId (Preorder)=%i\n%!" + (int_of_node id) + (tag_id v id) + (Tag.to_string (tag_id v id)) + (node_xml_id v id); + pr "DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) ParentDoc(my_text)=%i PrevDoc(next_text)=%i\n%!" + (int_of_node (prev_text v id)) + (Text.get_text v (prev_text v id)) + (int_of_node (my_text v id)) + (Text.get_text v (my_text v id)) + (int_of_node (next_text v id)) + (Text.get_text v (next_text v id)) + (int_of_node(parent_doc v (my_text v id))) + (int_of_node(prev_doc v (next_text v id))); + let i1,i2 = doc_ids v id in + pr "Testing DocIds below (%i,%i)*\n%!" + (int_of_node i1) (int_of_node i2); + pr "Testing Tagged*\n%!"; + Ptset.iter (fun t -> + let str = Tag.to_string t in + if Tag.pcdata <> t + then begin + pr "Tag: %s : \n%!" str; + pr "TaggedDesc = %i%!, " (tagged_desc v id t); + pr "TaggedFoll = %i\n%!" (tagged_foll v id t); + pr "SubtreeTags = %i\n%!" (subtree_tags v id t); + end) tags; + pr "----------------------------\n"; + aux(first_child v id); + aux(next_sibling v id); + end + in + aux (root v) + + let rrrr = ref 0 + + let time_xml_tree v tag = + + let rec aux id acc = + incr rrrr; + if (is_nil id) + then acc + else begin + let acc = + if tag == (tag_id v id) + then + id::acc + else acc + in + aux (next_sibling v id) (aux (first_child v id) acc); + end + in + let r = aux (root v) [] in + Printf.eprintf "%i\n%!" !rrrr;r + + let rrrr2 = ref 0 + let time_xml_tree2 v tag = + let rec aux id acc ctx= + incr rrrr2; + if (is_nil id) + then acc + else begin + let acc = + if tag == (tag_id v id) + then + id::acc + else acc + in + aux (tagged_foll_below v id tag ctx) (aux (tagged_desc v id tag) acc id) ctx; + end + in + let r = aux (root v) [] (root v) in + Printf.eprintf "%i\n%!" !rrrr2; r + + + + + let print_skel t = let rec aux id = @@ -168,8 +286,9 @@ struct then Printf.eprintf "#\n" else begin - Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) parent_doc(my_text)=%i\n%!" + Printf.eprintf "Node %i has tag '%i=%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) parent_doc(my_text)=%i\n%!" (int_of_node id) + (tag_id t id) (Tag.to_string (tag_id t id)) (node_xml_id t id) (int_of_node (prev_text t id)) @@ -202,6 +321,9 @@ struct end in aux (root t) + + + end @@ -222,6 +344,45 @@ struct node : descr } let dump { doc=t } = Tree.print_skel t + let test_xml_tree ppf tags { doc=t } = Tree.test_xml_tree ppf tags t + let time_xml_tree { doc=t } tag = Tree.time_xml_tree t tag + let time_xml_tree2 { doc=t } tag = Tree.time_xml_tree2 t tag + let test_jump { doc=t } tag = Tree.test_jump t tag + let contains_array = ref [| |] + + let init_contains t s = + let a = Text.contains t.doc s + in + Array.fast_sort (compare) a; + contains_array := a + + let init_naive_contains t s = + let i,j = Tree.doc_ids t.doc (Tree.root t.doc) + in + let regexp = Str.regexp_string s in + let matching arg = + try + let _ = Str.search_forward regexp arg 0; + in true + with _ -> false + in + let rec loop n acc l = + if n >= j then acc,l + else + let s = (*Printf.eprintf "%i \n%!" n;*)Text.get_cached_text t.doc n + in + if matching s + then loop (n+1) (n::acc) (l+1) + else loop (n+1) acc l + in + let acc,l = loop i [] 0 in + let a = Array.create l Text.nil in + let _ = List.fold_left (fun cpt e -> a.(cpt) <- e; (cpt-1)) (l-1) acc + in + contains_array := a + + + module DocIdSet = struct include Set.Make (struct type t = string_content let compare = (-) end) @@ -284,22 +445,32 @@ struct | Node (NC t) -> Printf.sprintf "Node (NC %i)" (int_of_node t) | Node (SC (t,i)) -> Printf.sprintf "Node (SC (%i,%i))" (int_of_node t) (int_of_node i) - + let mk_nil t = { t with node = Nil } let root n = { n with node = norm (Tree.root n.doc) } let is_root n = match n.node with - | Node(NC t) when (Tree.root n.doc) == t -> true + | Node(NC t) -> (int_of_node t) == 0 | _ -> false - let parent n = + let parent n = + if is_root n then { n with node=Nil} + else let node' = match n.node with | Node(NC t) -> let txt = prev_text n.doc t in if Text.is_empty n.doc txt then - Node(NC (Tree.parent n.doc t)) + let ps = Tree.prev_sibling n.doc t in + if is_nil ps + then + Node(NC (Tree.parent n.doc t)) + else Node(NC ps) else Node(SC (txt,t)) - | Node(SC(t,_)) -> Node (NC(parent_doc n.doc t)) + | Node(SC(i,t)) -> + let ps = Tree.prev_sibling n.doc t in + if is_nil ps + then Node (NC(parent_doc n.doc i)) + else Node(NC ps) | _ -> failwith "parent" in { n with node = node' } @@ -322,7 +493,6 @@ struct | Nil | String _ -> failwith "first_child" in { n with node = node'} - let next_sibling n = let node' = @@ -345,18 +515,15 @@ struct let id = function { doc=d; node=Node(NC n)} -> node_xml_id d n | { doc=d; node=Node(SC (i,_) )} -> text_xml_id d i - | _ -> failwith "id" + | _ -> -1 (* + Format.fprintf Format.err_formatter "Failure id on %s\n%!" (nts x.node); + failwith "id" *) let tag = function { node=Node(SC _) } -> Tag.pcdata | { doc=d; node=Node(NC n)} -> tag_id d n | _ -> failwith "tag" -(* let tag_id = - function { node=Node(SC _) } -> () - | { doc=d; node=Node(NC n)} -> tag_id d n - | _ -> () -*) let string_below t id = let strid = parent_doc t.doc id in match t.node with @@ -388,19 +555,175 @@ struct | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_desc d n tag) } | _ -> { t with node=Nil } + + let tagged_next t tb tf s = + match s with + | { node = Node (NC(below)) } -> begin + match t with + | { doc = d; node=Node(NC n) } -> + { t with node= norm (tagged_next d n (Ptset.to_int_vector tb) (Ptset.to_int_vector tf) below) } + | { doc = d; node=Node(SC (i,n) ) } when is_nil n -> + let p = parent_doc d i in + { t with node= norm (tagged_next d p (Ptset.to_int_vector tb) (Ptset.to_int_vector tf) below) } + | { doc = d; node=Node(SC (_,n) ) } -> + if Ptset.mem (tag_id d n) (Ptset.union tb tf) + then { t with node=Node(NC(n)) } + else + let vb = Ptset.to_int_vector tb in + let vf = Ptset.to_int_vector tf in + let node = + let dsc = tagged_below d n vb vf in + if is_nil dsc + then tagged_next d n vb vf below + else dsc + in + { t with node = norm node } + | _ -> {t with node=Nil } + end + + | _ -> {t with node=Nil } + + let tagged_foll_only t tf s = + match s with + | { node = Node (NC(below)) } -> begin + match t with + | { doc = d; node=Node(NC n) } -> + { t with node= norm (tagged_foll_only d n (Ptset.to_int_vector tf) below) } + | { doc = d; node=Node(SC (i,n) ) } when is_nil n -> + let p = parent_doc d i in + { t with node= norm (tagged_foll_only d p (Ptset.to_int_vector tf) below) } + | { doc = d; node=Node(SC (_,n) ) } -> + if Ptset.mem (tag_id d n) tf + then { t with node=Node(NC(n)) } + else + let vf = Ptset.to_int_vector tf in + let node = + let dsc = tagged_desc_only d n vf in + if is_nil dsc + then tagged_foll_only d n vf below + else dsc + in + { t with node = norm node } + | _ -> {t with node=Nil } + end + + | _ -> {t with node=Nil } + - let tagged_next t tag = - if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_next" - else match t with - | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_next d n tag) } - | { doc=d; node=Node(SC (_,n)) } -> { t with node = norm (tagged_next d n tag) } + let tagged_below t tc td = + match t with + | { doc = d; node=Node(NC n) } -> + let vc = Ptset.to_int_vector tc + in + let vd = Ptset.to_int_vector td + in + { t with node= norm(tagged_below d n vc vd) } | _ -> { t with node=Nil } + let tagged_desc_only t td = + match t with + | { doc = d; node=Node(NC n) } -> + let vd = Ptset.to_int_vector td + in + { t with node= norm(tagged_desc_only d n vd) } + | _ -> { t with node=Nil } + + + let last_idx = ref 0 + let array_find a i j = + let l = Array.length a in + let rec loop idx x y = + if x > y || idx >= l then Text.nil + else + if a.(idx) >= x then if a.(idx) > y then Text.nil else (last_idx := idx;a.(idx)) + else loop (idx+1) x y + in + if a.(0) > j || a.(l-1) < i then Text.nil + else loop !last_idx i j + + + let text_below t = + let l = Array.length !contains_array in + if l = 0 then { t with node=Nil } + else + match t with + | { doc = d; node=Node(NC n) } -> + let i,j = doc_ids t.doc n in + let id = array_find !contains_array i j + in + if id == Text.nil then + { t with node=Nil } + else + {t with node = Node(SC(id, Tree.next_sibling d (Tree.prev_doc d id))) } + | _ -> { t with node=Nil } + + let text_next t root = + let l = Array.length !contains_array in + if l = 0 then { t with node=Nil } + else + let inf = match t with + | { doc =d; node = Node(NC n) } -> snd(doc_ids d n)+1 + | { node = Node(SC(i,_)) } -> i+1 + | _ -> assert false + in + match root with + | { doc = d; node= Node (NC n) } -> + let _,j = doc_ids t.doc n in + + let id = array_find !contains_array inf j + in + if id == Text.nil then { doc = d; node= Nil } + else + {doc = d; node = Node(SC(id,Tree.next_sibling d (Tree.prev_doc d id))) } + | _ -> { t with node=Nil} + + + let subtree_tags t tag = match t with - { doc = d; node = Node(NC n) } -> subtree_tags d n tag + { doc = d; node = Node(NC n) } -> + subtree_tags d n tag | _ -> 0 + let tagged_desc_array = ref [| |] + let idx = ref 0 + + let init_tagged_next t tagid = + let l = subtree_tags (root t) tagid + in + tagged_desc_array := Array.create l { t with node= Nil }; + let i = ref 0 in + let rec collect t = + if is_node t then begin + if tag t == tagid then + begin + !tagged_desc_array.(!i) <- t; + incr i; + end; + collect (first_child t); + collect (next_sibling t) + end; + in + collect t; + idx := 0 + + let print_id ppf v = + let pr x= Format.fprintf ppf x in + match v with + { node=Nil } -> pr "NULLT: -1" + | { node=String(i) } | { node=Node(SC(i,_)) } -> pr "DocID: %i" (int_of_node i) + | { node=Node(NC(i)) } -> pr "Node: %i" (int_of_node i) + + + +(* let tagged_next t tag = + if !idx >= Array.length !tagged_desc_array + then {t with node=Nil} + else + let r = !tagged_desc_array.(!idx) + in + incr idx; r +*) let has_tagged_foll t tag = is_node (tagged_foll t tag) @@ -453,13 +776,17 @@ struct let count t s = Text.count t.doc s let is_left t = - let u = left (parent t) in - (id t) == (id u) + if is_root t then false + else + if tag (parent t) == Tag.pcdata then false + else + let u = left (parent t) in + (id t) == (id u) let print_xml_fast outc t = let rec loop ?(print_right=true) t = match t.node with | Nil -> () - | String (s) -> output_string outc (string t) + | String (s) -> output_string outc (Text.get_text t.doc s) | Node _ when Tag.equal (tag t) Tag.pcdata -> loop (left t); if print_right then loop (right t) @@ -516,6 +843,10 @@ struct print_xml_fast outc (first_child t) else print_xml_fast outc t + + + + let traversal t = Tree.traversal t.doc let full_traversal t = let rec aux n =