X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=tree.ml;h=20a77925c9d346c7ae83f7529a75c5ef66ead2ff;hb=aafe9afd804263ac5e28cb2e7857cc003e3c1d2d;hp=2218c2843f913208ce5e758fd99ccf266b82fd82;hpb=d64e3a3a9ef6329caafdba848ef78427fce0d689;p=SXSI%2Fxpathcomp.git diff --git a/tree.ml b/tree.ml index 2218c28..20a7792 100644 --- a/tree.ml +++ b/tree.ml @@ -24,6 +24,8 @@ sig val first_child : t -> t val next_sibling : t -> t val parent : t -> t + val root : t -> t + val is_root : t -> bool val id : t -> int val tag : t -> Tag.t val print_xml_fast : out_channel -> t -> unit @@ -36,9 +38,22 @@ sig with type elt = string_content val string_below : t -> string_content -> bool val contains : t -> string -> DocIdSet.t - val contains_old : t -> string -> bool + val contains_old : t -> string -> DocIdSet.t + val contains_iter : t -> string -> DocIdSet.t + val count_contains : t -> string -> int + val count : t -> string -> int val dump : t -> unit val get_string : t -> string_content -> string + val has_tagged_desc : t -> Tag.t -> bool + val has_tagged_foll : t -> Tag.t -> bool + val tagged_desc : t -> Tag.t -> t + val tagged_foll : t -> Tag.t -> t + val init_tagged_next : t -> Tag.t -> unit + val tagged_next : t -> Tag.t -> t + val subtree_tags : t -> Tag.t -> int + val is_left : t -> bool + val print_id : Format.formatter -> t -> unit + val test_xml_tree : Format.formatter -> Ptset.t -> t -> unit end module XML = @@ -71,16 +86,27 @@ struct let nil = nullt () external get_text : t -> [`Text] node -> string = "caml_text_collection_get_text" - let get_text t n = +(* let get_text t n = if equal nil n then "" else get_text t n +*) external is_empty : t -> [`Text ] node -> bool = "caml_text_collection_empty_text" + let is_empty t n = (equal nil n) || is_empty t n + external get_cached_text : t -> [`Text ] node -> string = "caml_text_collection_get_cached_text" + + + let get_text t n = + if equal nil n then "" + else get_cached_text t n + + external size : t -> int = "caml_text_collection_size" external is_contains : t -> string -> bool = "caml_text_collection_is_contains" external count_contains : t -> string -> int = "caml_text_collection_count_contains" + external count : t -> string -> int = "caml_text_collection_count" external contains : t -> string -> [`Text ] node array = "caml_text_collection_contains" end @@ -105,12 +131,23 @@ struct external next_sibling : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" - + external prev_sibling : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling" external is_leaf : t -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" (* external tag : t -> [`Tree ] node -> T = "caml_xml_tree_tag"*) external tag_id : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" +(* + let tag_hash = Array.make 6_000_000 (Tag.nullt) + + let tag_id t id = + let tag = tag_hash.(int_of_node id) + in + if tag != Tag.nullt then tag + else + let tag = tag_id t id in + (tag_hash.(int_of_node id) <- tag; tag) +*) let is_last t n = equal nil (next_sibling t n) external prev_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" @@ -122,6 +159,54 @@ struct external text_xml_id : t -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" external node_xml_id : t -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" external is_ancestor : t -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" + external tagged_desc : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" + external tagged_foll : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_foll" + external tagged_next : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_next" + external subtree_tags : t -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" + + + + let test_xml_tree ppf tags v = + let pr x = Format.fprintf ppf x in + let rec aux id = + if (is_nil id) + then () + else + begin + pr "Node %i, (Tag) %i='%s' (GetTagName), NodeXMLId (Preorder)=%i\n%!" + (int_of_node id) + (tag_id v id) + (Tag.to_string (tag_id v id)) + (node_xml_id v id); + pr "DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) ParentDoc(my_text)=%i\n%!" + (int_of_node (prev_text v id)) + (Text.get_text v (prev_text v id)) + (int_of_node (my_text v id)) + (Text.get_text v (my_text v id)) + (int_of_node (next_text v id)) + (Text.get_text v (next_text v id)) + (int_of_node(parent_doc v (my_text v id))); + pr "Testing Tagged*\n%!"; + Ptset.iter (fun t -> + let str = Tag.to_string t in + if Tag.pcdata <> t + then begin + pr "Tag: %s : \n%!" str; + pr "TaggedDesc = %i%!, " (tagged_desc v id t); + pr "TaggedFoll = %i\n%!" (tagged_foll v id t); + pr "SubtreeTags = %i\n%!" (subtree_tags v id t); + end) tags; + pr "----------------------------\n"; + aux(first_child v id); + aux(next_sibling v id); + end + in + aux (root v) + + + + + let print_skel t = let rec aux id = @@ -129,8 +214,9 @@ struct then Printf.eprintf "#\n" else begin - Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) parent_doc(my_text)=%i\n%!" + Printf.eprintf "Node %i has tag '%i=%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) parent_doc(my_text)=%i\n%!" (int_of_node id) + (tag_id t id) (Tag.to_string (tag_id t id)) (node_xml_id t id) (int_of_node (prev_text t id)) @@ -163,6 +249,9 @@ struct end in aux (root t) + + + end @@ -183,12 +272,15 @@ struct node : descr } let dump { doc=t } = Tree.print_skel t + let test_xml_tree ppf tags { doc=t } = Tree.test_xml_tree ppf tags t + module DocIdSet = struct include Set.Make (struct type t = string_content let compare = (-) end) end let is_node = function { node=Node(_) } -> true | _ -> false + let get_string t (i:string_content) = Text.get_text t.doc i open Tree let node_of_t t = { doc= t; @@ -245,14 +337,32 @@ struct | Node (SC (t,i)) -> Printf.sprintf "Node (SC (%i,%i))" (int_of_node t) (int_of_node i) - let parent n = + let root n = { n with node = norm (Tree.root n.doc) } + let is_root n = match n.node with + | Node(NC t) -> (int_of_node t) == 0 + | _ -> false + + let parent n = + if is_root n then { n with node=Nil} + else let node' = match n.node with - | Node(NC t) | Node(SC (_,t)) -> - if (Tree.root n.doc) == t - then Nil - else Node(NC(Tree.parent n.doc t)) (* A parent node can never be a SC *) - | _ -> assert false + | Node(NC t) -> + let txt = prev_text n.doc t in + if Text.is_empty n.doc txt then + let ps = Tree.prev_sibling n.doc t in + if is_nil ps + then + Node(NC (Tree.parent n.doc t)) + else Node(NC ps) + else + Node(SC (txt,t)) + | Node(SC(i,t)) -> + let ps = Tree.prev_sibling n.doc t in + if is_nil ps + then Node (NC(parent_doc n.doc i)) + else Node(NC ps) + | _ -> failwith "parent" in { n with node = node' } @@ -274,7 +384,6 @@ struct | Nil | String _ -> failwith "first_child" in { n with node = node'} - let next_sibling n = let node' = @@ -297,18 +406,15 @@ struct let id = function { doc=d; node=Node(NC n)} -> node_xml_id d n | { doc=d; node=Node(SC (i,_) )} -> text_xml_id d i - | _ -> failwith "id" + | _ -> -1 (* + Format.fprintf Format.err_formatter "Failure id on %s\n%!" (nts x.node); + failwith "id" *) let tag = function { node=Node(SC _) } -> Tag.pcdata | { doc=d; node=Node(NC n)} -> tag_id d n | _ -> failwith "tag" -(* let tag_id = - function { node=Node(SC _) } -> () - | { doc=d; node=Node(NC n)} -> tag_id d n - | _ -> () -*) let string_below t id = let strid = parent_doc t.doc id in match t.node with @@ -317,10 +423,91 @@ struct | Node(SC(i,_)) -> Text.equal i id | _ -> false + + let tagged_foll t tag = + if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_foll" + else match t with + | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_foll d n tag) } + | { doc=d; node=Node(SC (_,n)) } when is_nil n -> { t with node= Nil } + | { doc=d; node=Node(SC (_,n)) } -> + let nnode = + if tag_id d n == tag then n + else + let n' = tagged_desc d n tag in + if is_nil n' then tagged_foll d n tag + else n' + in {t with node= norm nnode} + | _ -> { t with node=Nil } + + + let tagged_desc t tag = + if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_desc" + else match t with + | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_desc d n tag) } + | _ -> { t with node=Nil } + +(* + let tagged_next t tag = + if tag == Tag.attribute || tag == Tag.pcdata then failwith "tagged_next" + else + match tagged_desc t tag with + | { doc = d; node=Nil } -> tagged_foll t tag + | x -> x +*) + let subtree_tags t tag = + match t with + { doc = d; node = Node(NC n) } -> + subtree_tags d n tag + | _ -> 0 + + let tagged_desc_array = ref [| |] + let idx = ref 0 + + let init_tagged_next t tagid = + let l = subtree_tags (root t) tagid + in + tagged_desc_array := Array.create l { t with node= Nil }; + let i = ref 0 in + let rec collect t = + if is_node t then begin + if tag t == tagid then + begin + !tagged_desc_array.(!i) <- t; + incr i; + end; + collect (first_child t); + collect (next_sibling t) + end; + in + collect t; + idx := 0 + + let print_id ppf v = + let pr x= Format.fprintf ppf x in + match v with + { node=Nil } -> pr "NULLT: -1" + | { node=String(i) } | { node=Node(SC(i,_)) } -> pr "DocID: %i" (int_of_node i) + | { node=Node(NC(i)) } -> pr "Node: %i" (int_of_node i) + + + let tagged_next t tag = + if !idx >= Array.length !tagged_desc_array + then {t with node=Nil} + else + let r = !tagged_desc_array.(!idx) + in + incr idx; r + + + + let has_tagged_foll t tag = is_node (tagged_foll t tag) + let has_tagged_desc t tag = is_node (tagged_desc t tag) + let contains t s = Array.fold_left (fun a i -> DocIdSet.add i a) DocIdSet.empty (Text.contains t.doc s) + let contains_old t s = let regexp = Str.regexp_string s in let matching arg = @@ -329,17 +516,52 @@ struct in true with _ -> false in - let rec find t = match t.node with - | Nil -> false - | String _ -> matching (string t) - | Node(_) -> (find (left t )) || (find (right t)) + let rec find t acc = match t.node with + | Nil -> acc + | String i -> + if matching (string t) then DocIdSet.add i acc else acc + | Node(_) -> (find (left t )) ((find (right t)) acc) + in + find t DocIdSet.empty + + + let contains_iter t s = + let regexp = Str.regexp_string s in + let matching arg = + try + let _ = Str.search_forward regexp arg 0; + in true + with _ -> false + in + let size = Text.size t.doc in + let rec find acc n = + if n == size then acc + else + find + (if matching (Text.get_cached_text t.doc (Obj.magic n)) then + DocIdSet.add (Obj.magic n) acc + else acc) (n+1) in - find t + find DocIdSet.empty 0 + + + + + let count_contains t s = Text.count_contains t.doc s + let count t s = Text.count t.doc s + + let is_left t = + if is_root t then false + else + if tag (parent t) == Tag.pcdata then false + else + let u = left (parent t) in + (id t) == (id u) let print_xml_fast outc t = let rec loop ?(print_right=true) t = match t.node with | Nil -> () - | String (s) -> output_string outc (string t) + | String (s) -> output_string outc (Text.get_text t.doc s) | Node _ when Tag.equal (tag t) Tag.pcdata -> loop (left t); if print_right then loop (right t) @@ -396,6 +618,10 @@ struct print_xml_fast outc (first_child t) else print_xml_fast outc t + + + + let traversal t = Tree.traversal t.doc let full_traversal t = let rec aux n =