X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=tree.ml;h=16b32a30c6f9699205d4086a7502955b4b1fbded;hb=4680fa5b41156d70f0fde69981f0d241184b19d9;hp=0c1f10b2ec8e4cf2af2fd453f824de1577e8a8d1;hpb=24fdea81b5506233d139bd7d72364a190bef35b8;p=SXSI%2Fxpathcomp.git diff --git a/tree.ml b/tree.ml index 0c1f10b..16b32a3 100644 --- a/tree.ml +++ b/tree.ml @@ -4,6 +4,7 @@ (* Copyright NICTA 2008 *) (* Distributed under the terms of the LGPL (see LICENCE) *) (******************************************************************************) +INCLUDE "debug.ml" module type BINARY = sig type node_content @@ -12,21 +13,38 @@ sig type t val parse_xml_uri : string -> t val parse_xml_string : string -> t + val save : t -> string -> unit + val load : ?sample:int -> string -> t + val tag_pool : t -> Tag.pool val string : t -> string val descr : t -> descr + val is_node : t -> bool val left : t -> t val right : t -> t + val first_child : t -> t + val next_sibling : t -> t val parent : t -> t val id : t -> int val tag : t -> Tag.t val print_xml_fast : out_channel -> t -> unit val compare : t -> t -> int val equal : t -> t -> bool - module DocIdSet : Set.S with type elt = string_content + module DocIdSet : + sig + include Set.S + end + with type elt = string_content val string_below : t -> string_content -> bool val contains : t -> string -> DocIdSet.t val contains_old : t -> string -> bool val dump : t -> unit + val get_string : t -> string_content -> string + val has_tagged_desc : t -> Tag.t -> bool + val has_tagged_foll : t -> Tag.t -> bool + val tagged_desc : t -> Tag.t -> t + val tagged_foll : t -> Tag.t -> t + val tagged_next : t -> Tag.t -> t + val subtree_tags : t -> Tag.t -> int end module XML = @@ -43,29 +61,37 @@ struct external int_of_node : 'a node -> int = "%identity" - external parse_xml_uri : string -> t = "caml_call_shredder_uri" - let parse_xml_uri uri = parse_xml_uri uri - - external parse_xml_string : string -> t = "caml_call_shredder_string" - let parse_xml_string uri = parse_xml_string uri - + external parse_xml_uri : string -> int -> bool -> bool -> t = "caml_call_shredder_uri" + external parse_xml_string : string -> int -> bool -> bool -> t = "caml_call_shredder_string" + + external save_tree : t -> string -> unit = "caml_xml_tree_save" + external load_tree : string -> int -> t = "caml_xml_tree_load" + module Text = struct - type t (* pointer to the text collection *) + let equal : [`Text] node -> [`Text] node -> bool = equal + (* Todo *) external nullt : unit -> [`Text ] node = "caml_xml_tree_nullt" let nil = nullt () external get_text : t -> [`Text] node -> string = "caml_text_collection_get_text" - let get_text t n = +(* let get_text t n = if equal nil n then "" else get_text t n +*) external is_empty : t -> [`Text ] node -> bool = "caml_text_collection_empty_text" let is_empty t n = (equal nil n) || is_empty t n + external get_cached_text : t -> [`Text ] node -> string = "caml_text_collection_get_cached_text" + + let get_text t n = + if (equal nil n) || is_empty t n then "" + else get_cached_text t n + external is_contains : t -> string -> bool = "caml_text_collection_is_contains" external count_contains : t -> string -> int = "caml_text_collection_count_contains" external contains : t -> string -> [`Text ] node array = "caml_text_collection_contains" @@ -75,7 +101,7 @@ struct module Tree = struct - + let equal : [`Tree ] node -> [`Tree] node -> bool = equal external serialize : t -> string -> unit = "caml_xml_tree_serialize" external unserialize : string -> t = "caml_xml_tree_unserialize" @@ -95,10 +121,8 @@ struct external is_leaf : t -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" - external tag : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag" - external tag_id : t -> [`Tree ] node -> unit = "caml_xml_tree_tag_id" - - external text_collection : t -> Text.t = "caml_xml_tree_text_collection" +(* external tag : t -> [`Tree ] node -> T = "caml_xml_tree_tag"*) + external tag_id : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id" let is_last t n = equal nil (next_sibling t n) @@ -111,24 +135,29 @@ struct external text_xml_id : t -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" external node_xml_id : t -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" external is_ancestor : t -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" + external tagged_desc : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" + external tagged_foll : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_foll" + external tagged_next : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_next" + external subtree_tags : t -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" let print_skel t = - let textcol = text_collection t in let rec aux id = if (is_nil id) then Printf.eprintf "#\n" else begin - Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s)\n%!" + Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) parent_doc(my_text)=%i\n%!" (int_of_node id) - (Tag.to_string (tag t id)) + (Tag.to_string (tag_id t id)) (node_xml_id t id) (int_of_node (prev_text t id)) - (Text.get_text textcol (prev_text t id)) + (Text.get_text t (prev_text t id)) (int_of_node (my_text t id)) - (Text.get_text textcol (my_text t id)) + (Text.get_text t (my_text t id)) (int_of_node (next_text t id)) - (Text.get_text textcol (next_text t id)); + (Text.get_text t (next_text t id)) + (int_of_node(parent_doc t (my_text t id))); + aux(first_child t id); aux(next_sibling t id); end @@ -136,17 +165,16 @@ struct aux (root t) let traversal t = - let textcol = text_collection t in let rec aux id = if not (is_nil id) then begin (* ignore (tag t id); - ignore (Text.get_text textcol (prev_text t id)); + ignore (Text.get_text t (prev_text t id)); if (is_leaf t id) - then ignore (Text.get_text textcol (my_text t id)); + then ignore (Text.get_text t (my_text t id)); if (is_last t id) - then ignore (Text.get_text textcol (next_text t id)); *) + then ignore (Text.get_text t (next_text t id)); *) aux (first_child t id); aux (next_sibling t id); end @@ -168,23 +196,43 @@ struct type doc = t - type t = { doc : doc; - text : Text.t; + type t = { doc : doc; node : descr } let dump { doc=t } = Tree.print_skel t - module DocIdSet = Set.Make (struct type t = string_content - let compare = (-) end) + module DocIdSet = struct + include Set.Make (struct type t = string_content + let compare = (-) end) + + end + let is_node = function { node=Node(_) } -> true | _ -> false - + let get_string t (i:string_content) = Text.get_text t.doc i open Tree let node_of_t t = { doc= t; - text = text_collection t; node = Node(NC (root t)) } - let parse_xml_uri str = node_of_t (parse_xml_uri str) - let parse_xml_string str = node_of_t (parse_xml_string str) + let parse_xml_uri str = node_of_t + (MM((parse_xml_uri str + !Options.sample_factor + !Options.index_empty_texts + !Options.disable_text_collection),__LOCATION__)) + + let parse_xml_string str = node_of_t + (MM((parse_xml_string str + !Options.sample_factor + !Options.index_empty_texts + !Options.disable_text_collection),__LOCATION__)) + + + let save t str = save_tree t.doc str + + let load ?(sample=64) str = node_of_t (load_tree str sample) + + + external pool : doc -> Tag.pool = "%identity" + let tag_pool t = pool t.doc let compare a b = match a.node,b.node with | Node(NC i),Node(NC j) -> compare i j @@ -201,7 +249,7 @@ struct let equal a b = (compare a b) == 0 let string t = match t.node with - | String i -> Text.get_text t.text i + | String i -> Text.get_text t.doc i | _ -> assert false let norm (n : [`Tree ] node ) = if is_nil n then Nil else Node (NC n) @@ -231,13 +279,13 @@ struct match n.node with | Node (NC t) when is_leaf n.doc t -> let txt = my_text n.doc t in - if Text.is_empty n.text txt + if Text.is_empty n.doc txt then Nil else Node(SC (txt,Tree.nil)) | Node (NC t) -> let fs = first_child n.doc t in let txt = prev_text n.doc fs in - if Text.is_empty n.text txt + if Text.is_empty n.doc txt then norm fs else Node (SC (txt, fs)) | Node(SC (i,_)) -> String i @@ -253,7 +301,7 @@ struct | Node(NC t) -> let ns = next_sibling n.doc t in let txt = next_text n.doc t in - if Text.is_empty n.text txt + if Text.is_empty n.doc txt then norm ns else Node (SC (txt, ns)) | Nil | String _ -> failwith "next_sibling" @@ -271,23 +319,65 @@ struct let tag = function { node=Node(SC _) } -> Tag.pcdata - | { doc=d; node=Node(NC n)} -> tag d n - | _ -> failwith "Tag" + | { doc=d; node=Node(NC n)} -> tag_id d n + | _ -> failwith "tag" - let tag_id = +(* let tag_id = function { node=Node(SC _) } -> () | { doc=d; node=Node(NC n)} -> tag_id d n | _ -> () - +*) let string_below t id = - let pid = parent_doc t.doc id in + let strid = parent_doc t.doc id in match t.node with - | Node(NC(i)) -> (is_ancestor t.doc i pid) - | Node(SC(i,_)) -> (is_ancestor t.doc (parent_doc t.doc i) pid) + | Node(NC(i)) -> + (Tree.equal i strid) || (is_ancestor t.doc i strid) + | Node(SC(i,_)) -> Text.equal i id | _ -> false - + + + let tagged_foll t tag = + if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_foll" + else match t with + | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_foll d n tag) } + | { doc=d; node=Node(SC (_,n)) } when is_nil n -> { t with node= Nil } + | { doc=d; node=Node(SC (_,n)) } -> + let nnode = + if tag_id d n == tag then n + else + let n' = tagged_desc d n tag in + if is_nil n' then tagged_foll d n tag + else n' + in {t with node= norm nnode} + | _ -> { t with node=Nil } + + + let tagged_desc t tag = + if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_desc" + else match t with + | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_desc d n tag) } + | _ -> { t with node=Nil } + + + let tagged_next t tag = + if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_next" + else match t with + | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_next d n tag) } + | { doc=d; node=Node(SC (_,n)) } -> { t with node = norm (tagged_next d n tag) } + | _ -> { t with node=Nil } + + let subtree_tags t tag = + match t with + { doc = d; node = Node(NC n) } -> subtree_tags d n tag + | _ -> 0 + + + + let has_tagged_foll t tag = is_node (tagged_foll t tag) + let has_tagged_desc t tag = is_node (tagged_desc t tag) + let contains t s = - Array.fold_left (fun a i -> DocIdSet.add i a) DocIdSet.empty (Text.contains t.text s) + Array.fold_left (fun a i -> DocIdSet.add i a) DocIdSet.empty (Text.contains t.doc s) let contains_old t s = let regexp = Str.regexp_string s in @@ -308,7 +398,9 @@ struct let rec loop ?(print_right=true) t = match t.node with | Nil -> () | String (s) -> output_string outc (string t) - | Node _ when Tag.equal (tag t) Tag.pcdata -> loop (left t); loop (right t) + | Node _ when Tag.equal (tag t) Tag.pcdata -> + loop (left t); + if print_right then loop (right t) | Node (_) -> let tg = Tag.to_string (tag t) in @@ -367,20 +459,22 @@ struct let rec aux n = match n.node with | Nil -> () - | String i -> () (*ignore(Text.get_text t.text i) *) + | String i -> () (*ignore(Text.get_text t.doc i) *) | Node(_) -> (* tag_id n; *) aux (first_child n); aux (next_sibling n) in aux t + + let print_stats _ = () end end - - +IFDEF DEBUG +THEN module DEBUGTREE = struct @@ -438,13 +532,13 @@ module DEBUGTREE match n.node with | Node (NC t) when is_leaf_ n.doc t -> let txt = my_text_ n.doc t in - if is_empty_ n.text txt + if is_empty_ n.doc txt then Nil else Node(SC (txt,XML.Tree.nil)) | Node (NC t) -> let fs = first_child_ n.doc t in let txt = prev_text_ n.doc fs in - if is_empty_ n.text txt + if is_empty_ n.doc txt then norm fs else Node (SC (txt, fs)) | Node(SC (i,_)) -> String i @@ -459,8 +553,12 @@ module DEBUGTREE | Node (SC (_,ns)) -> norm ns | Node(NC t) -> let ns = next_sibling_ n.doc t in - let txt = next_text_ n.doc t in - if is_empty_ n.text txt + let txt = + if XML.Tree.is_nil ns then + next_text_ n.doc t + else prev_text_ n.doc ns + in + if is_empty_ n.doc txt then norm ns else Node (SC (txt, ns)) | Nil | String _ -> failwith "next_sibling" @@ -472,7 +570,6 @@ module DEBUGTREE | { doc=d; node=Node(SC (i,_) )} -> text_xml_id_ d i | _ -> failwith "id" - (* Wrapper around critical function *) let string t = time ("TextCollection.GetText()") (string) t let left = first_child @@ -564,4 +661,6 @@ module DEBUGTREE end module Binary = DEBUGTREE - +ELSE +module Binary = XML.Binary +END (* IFDEF DEBUG *)