X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=tree.ml;h=16b32a30c6f9699205d4086a7502955b4b1fbded;hb=4680fa5b41156d70f0fde69981f0d241184b19d9;hp=1b4ce2ecb26d79d1377b652d207173de8d63d508;hpb=c10ce35cd399aff15a49f3b24a31b38cb2191da0;p=SXSI%2Fxpathcomp.git diff --git a/tree.ml b/tree.ml index 1b4ce2e..16b32a3 100644 --- a/tree.ml +++ b/tree.ml @@ -4,6 +4,7 @@ (* Copyright NICTA 2008 *) (* Distributed under the terms of the LGPL (see LICENCE) *) (******************************************************************************) +INCLUDE "debug.ml" module type BINARY = sig type node_content @@ -12,22 +13,38 @@ sig type t val parse_xml_uri : string -> t val parse_xml_string : string -> t + val save : t -> string -> unit + val load : ?sample:int -> string -> t val tag_pool : t -> Tag.pool val string : t -> string val descr : t -> descr + val is_node : t -> bool val left : t -> t val right : t -> t + val first_child : t -> t + val next_sibling : t -> t val parent : t -> t val id : t -> int val tag : t -> Tag.t val print_xml_fast : out_channel -> t -> unit val compare : t -> t -> int val equal : t -> t -> bool - module DocIdSet : Set.S with type elt = string_content + module DocIdSet : + sig + include Set.S + end + with type elt = string_content val string_below : t -> string_content -> bool val contains : t -> string -> DocIdSet.t val contains_old : t -> string -> bool val dump : t -> unit + val get_string : t -> string_content -> string + val has_tagged_desc : t -> Tag.t -> bool + val has_tagged_foll : t -> Tag.t -> bool + val tagged_desc : t -> Tag.t -> t + val tagged_foll : t -> Tag.t -> t + val tagged_next : t -> Tag.t -> t + val subtree_tags : t -> Tag.t -> int end module XML = @@ -44,29 +61,37 @@ struct external int_of_node : 'a node -> int = "%identity" - external parse_xml_uri : string -> t = "caml_call_shredder_uri" - let parse_xml_uri uri = parse_xml_uri uri - - external parse_xml_string : string -> t = "caml_call_shredder_string" - let parse_xml_string uri = parse_xml_string uri - + external parse_xml_uri : string -> int -> bool -> bool -> t = "caml_call_shredder_uri" + external parse_xml_string : string -> int -> bool -> bool -> t = "caml_call_shredder_string" + + external save_tree : t -> string -> unit = "caml_xml_tree_save" + external load_tree : string -> int -> t = "caml_xml_tree_load" + module Text = struct - + let equal : [`Text] node -> [`Text] node -> bool = equal + (* Todo *) external nullt : unit -> [`Text ] node = "caml_xml_tree_nullt" let nil = nullt () external get_text : t -> [`Text] node -> string = "caml_text_collection_get_text" - let get_text t n = +(* let get_text t n = if equal nil n then "" else get_text t n +*) external is_empty : t -> [`Text ] node -> bool = "caml_text_collection_empty_text" let is_empty t n = (equal nil n) || is_empty t n + external get_cached_text : t -> [`Text ] node -> string = "caml_text_collection_get_cached_text" + + let get_text t n = + if (equal nil n) || is_empty t n then "" + else get_cached_text t n + external is_contains : t -> string -> bool = "caml_text_collection_is_contains" external count_contains : t -> string -> int = "caml_text_collection_count_contains" external contains : t -> string -> [`Text ] node array = "caml_text_collection_contains" @@ -76,7 +101,7 @@ struct module Tree = struct - + let equal : [`Tree ] node -> [`Tree] node -> bool = equal external serialize : t -> string -> unit = "caml_xml_tree_serialize" external unserialize : string -> t = "caml_xml_tree_unserialize" @@ -110,6 +135,10 @@ struct external text_xml_id : t -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" external node_xml_id : t -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" external is_ancestor : t -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor" + external tagged_desc : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc" + external tagged_foll : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_foll" + external tagged_next : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_next" + external subtree_tags : t -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags" let print_skel t = let rec aux id = @@ -117,7 +146,7 @@ struct then Printf.eprintf "#\n" else begin - Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s)\n%!" + Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) parent_doc(my_text)=%i\n%!" (int_of_node id) (Tag.to_string (tag_id t id)) (node_xml_id t id) @@ -126,7 +155,9 @@ struct (int_of_node (my_text t id)) (Text.get_text t (my_text t id)) (int_of_node (next_text t id)) - (Text.get_text t (next_text t id)); + (Text.get_text t (next_text t id)) + (int_of_node(parent_doc t (my_text t id))); + aux(first_child t id); aux(next_sibling t id); end @@ -169,17 +200,35 @@ struct node : descr } let dump { doc=t } = Tree.print_skel t - module DocIdSet = Set.Make (struct type t = string_content - let compare = (-) end) + module DocIdSet = struct + include Set.Make (struct type t = string_content + let compare = (-) end) + + end + let is_node = function { node=Node(_) } -> true | _ -> false - + let get_string t (i:string_content) = Text.get_text t.doc i open Tree let node_of_t t = { doc= t; node = Node(NC (root t)) } - let parse_xml_uri str = node_of_t (parse_xml_uri str) - let parse_xml_string str = node_of_t (parse_xml_string str) + let parse_xml_uri str = node_of_t + (MM((parse_xml_uri str + !Options.sample_factor + !Options.index_empty_texts + !Options.disable_text_collection),__LOCATION__)) + + let parse_xml_string str = node_of_t + (MM((parse_xml_string str + !Options.sample_factor + !Options.index_empty_texts + !Options.disable_text_collection),__LOCATION__)) + + + let save t str = save_tree t.doc str + + let load ?(sample=64) str = node_of_t (load_tree str sample) external pool : doc -> Tag.pool = "%identity" @@ -279,12 +328,54 @@ struct | _ -> () *) let string_below t id = - let pid = parent_doc t.doc id in + let strid = parent_doc t.doc id in match t.node with - | Node(NC(i)) -> (is_ancestor t.doc i pid) - | Node(SC(i,_)) -> (is_ancestor t.doc (parent_doc t.doc i) pid) + | Node(NC(i)) -> + (Tree.equal i strid) || (is_ancestor t.doc i strid) + | Node(SC(i,_)) -> Text.equal i id | _ -> false - + + + let tagged_foll t tag = + if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_foll" + else match t with + | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_foll d n tag) } + | { doc=d; node=Node(SC (_,n)) } when is_nil n -> { t with node= Nil } + | { doc=d; node=Node(SC (_,n)) } -> + let nnode = + if tag_id d n == tag then n + else + let n' = tagged_desc d n tag in + if is_nil n' then tagged_foll d n tag + else n' + in {t with node= norm nnode} + | _ -> { t with node=Nil } + + + let tagged_desc t tag = + if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_desc" + else match t with + | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_desc d n tag) } + | _ -> { t with node=Nil } + + + let tagged_next t tag = + if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_next" + else match t with + | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_next d n tag) } + | { doc=d; node=Node(SC (_,n)) } -> { t with node = norm (tagged_next d n tag) } + | _ -> { t with node=Nil } + + let subtree_tags t tag = + match t with + { doc = d; node = Node(NC n) } -> subtree_tags d n tag + | _ -> 0 + + + + let has_tagged_foll t tag = is_node (tagged_foll t tag) + let has_tagged_desc t tag = is_node (tagged_desc t tag) + let contains t s = Array.fold_left (fun a i -> DocIdSet.add i a) DocIdSet.empty (Text.contains t.doc s) @@ -307,7 +398,9 @@ struct let rec loop ?(print_right=true) t = match t.node with | Nil -> () | String (s) -> output_string outc (string t) - | Node _ when Tag.equal (tag t) Tag.pcdata -> loop (left t); loop (right t) + | Node _ when Tag.equal (tag t) Tag.pcdata -> + loop (left t); + if print_right then loop (right t) | Node (_) -> let tg = Tag.to_string (tag t) in @@ -372,14 +465,16 @@ struct aux (first_child n); aux (next_sibling n) in aux t + + let print_stats _ = () end end - - +IFDEF DEBUG +THEN module DEBUGTREE = struct @@ -566,3 +661,6 @@ module DEBUGTREE end module Binary = DEBUGTREE +ELSE +module Binary = XML.Binary +END (* IFDEF DEBUG *)