(* Copyright NICTA 2008 *)
(* Distributed under the terms of the LGPL (see LICENCE) *)
(******************************************************************************)
+INCLUDE "debug.ml"
module type BINARY =
sig
type node_content
type t
val parse_xml_uri : string -> t
val parse_xml_string : string -> t
+ val save : t -> string -> unit
+ val load : ?sample:int -> string -> t
+ val tag_pool : t -> Tag.pool
val string : t -> string
val descr : t -> descr
+ val is_node : t -> bool
val left : t -> t
val right : t -> t
+ val first_child : t -> t
+ val next_sibling : t -> t
val parent : t -> t
val id : t -> int
val tag : t -> Tag.t
val print_xml_fast : out_channel -> t -> unit
val compare : t -> t -> int
val equal : t -> t -> bool
- module DocIdSet : Set.S with type elt = string_content
+ module DocIdSet :
+ sig
+ include Set.S
+ end
+ with type elt = string_content
val string_below : t -> string_content -> bool
val contains : t -> string -> DocIdSet.t
val contains_old : t -> string -> bool
val dump : t -> unit
+ val get_string : t -> string_content -> string
end
module XML =
external int_of_node : 'a node -> int = "%identity"
- external parse_xml_uri : string -> t = "caml_call_shredder_uri"
- let parse_xml_uri uri = parse_xml_uri uri
-
- external parse_xml_string : string -> t = "caml_call_shredder_string"
- let parse_xml_string uri = parse_xml_string uri
-
+ external parse_xml_uri : string -> int -> bool -> bool -> t = "caml_call_shredder_uri"
+ external parse_xml_string : string -> int -> bool -> bool -> t = "caml_call_shredder_string"
+
+ external save_tree : t -> string -> unit = "caml_xml_tree_save"
+ external load_tree : string -> int -> t = "caml_xml_tree_load"
+
module Text =
struct
- type t (* pointer to the text collection *)
+ let equal : [`Text] node -> [`Text] node -> bool = equal
+
(* Todo *)
external nullt : unit -> [`Text ] node = "caml_xml_tree_nullt"
let nil = nullt ()
module Tree =
struct
-
+ let equal : [`Tree ] node -> [`Tree] node -> bool = equal
external serialize : t -> string -> unit = "caml_xml_tree_serialize"
external unserialize : string -> t = "caml_xml_tree_unserialize"
external is_leaf : t -> [`Tree] node -> bool = "caml_xml_tree_is_leaf"
- external tag : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag"
- external tag_id : t -> [`Tree ] node -> unit = "caml_xml_tree_tag_id"
-
- external text_collection : t -> Text.t = "caml_xml_tree_text_collection"
+(* external tag : t -> [`Tree ] node -> T = "caml_xml_tree_tag"*)
+ external tag_id : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id"
let is_last t n = equal nil (next_sibling t n)
external is_ancestor : t -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor"
let print_skel t =
- let textcol = text_collection t in
let rec aux id =
if (is_nil id)
then Printf.eprintf "#\n"
else
begin
- Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s)\n%!"
+ Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) parent_doc(my_text)=%i\n%!"
(int_of_node id)
- (Tag.to_string (tag t id))
+ (Tag.to_string (tag_id t id))
(node_xml_id t id)
(int_of_node (prev_text t id))
- (Text.get_text textcol (prev_text t id))
+ (Text.get_text t (prev_text t id))
(int_of_node (my_text t id))
- (Text.get_text textcol (my_text t id))
+ (Text.get_text t (my_text t id))
(int_of_node (next_text t id))
- (Text.get_text textcol (next_text t id));
+ (Text.get_text t (next_text t id))
+ (int_of_node(parent_doc t (my_text t id)));
+
aux(first_child t id);
aux(next_sibling t id);
end
aux (root t)
let traversal t =
- let textcol = text_collection t in
let rec aux id =
if not (is_nil id)
then
begin
(* ignore (tag t id);
- ignore (Text.get_text textcol (prev_text t id));
+ ignore (Text.get_text t (prev_text t id));
if (is_leaf t id)
- then ignore (Text.get_text textcol (my_text t id));
+ then ignore (Text.get_text t (my_text t id));
if (is_last t id)
- then ignore (Text.get_text textcol (next_text t id)); *)
+ then ignore (Text.get_text t (next_text t id)); *)
aux (first_child t id);
aux (next_sibling t id);
end
type doc = t
- type t = { doc : doc;
- text : Text.t;
+ type t = { doc : doc;
node : descr }
let dump { doc=t } = Tree.print_skel t
- module DocIdSet = Set.Make (struct type t = string_content
- let compare = (-) end)
-
-
+ module DocIdSet = struct
+ include Set.Make (struct type t = string_content
+ let compare = (-) end)
+
+ end
+ let is_node = function { node=Node(_) } -> true | _ -> false
+ let get_string t (i:string_content) = Text.get_text t.doc i
open Tree
let node_of_t t = { doc= t;
- text = text_collection t;
node = Node(NC (root t)) }
- let parse_xml_uri str = node_of_t (parse_xml_uri str)
- let parse_xml_string str = node_of_t (parse_xml_string str)
+ let parse_xml_uri str = node_of_t
+ (MM((parse_xml_uri str
+ !Options.sample_factor
+ !Options.index_empty_texts
+ !Options.disable_text_collection),__LOCATION__))
+
+ let parse_xml_string str = node_of_t
+ (MM((parse_xml_string str
+ !Options.sample_factor
+ !Options.index_empty_texts
+ !Options.disable_text_collection),__LOCATION__))
+
+
+ let save t str = save_tree t.doc str
+
+ let load ?(sample=64) str = node_of_t (load_tree str sample)
+
+
+ external pool : doc -> Tag.pool = "%identity"
+ let tag_pool t = pool t.doc
let compare a b = match a.node,b.node with
| Node(NC i),Node(NC j) -> compare i j
let equal a b = (compare a b) == 0
let string t = match t.node with
- | String i -> Text.get_text t.text i
+ | String i -> Text.get_text t.doc i
| _ -> assert false
let norm (n : [`Tree ] node ) = if is_nil n then Nil else Node (NC n)
match n.node with
| Node (NC t) when is_leaf n.doc t ->
let txt = my_text n.doc t in
- if Text.is_empty n.text txt
+ if Text.is_empty n.doc txt
then Nil
else Node(SC (txt,Tree.nil))
| Node (NC t) ->
let fs = first_child n.doc t in
let txt = prev_text n.doc fs in
- if Text.is_empty n.text txt
+ if Text.is_empty n.doc txt
then norm fs
else Node (SC (txt, fs))
| Node(SC (i,_)) -> String i
| Node(NC t) ->
let ns = next_sibling n.doc t in
let txt = next_text n.doc t in
- if Text.is_empty n.text txt
+ if Text.is_empty n.doc txt
then norm ns
else Node (SC (txt, ns))
| Nil | String _ -> failwith "next_sibling"
let tag =
function { node=Node(SC _) } -> Tag.pcdata
- | { doc=d; node=Node(NC n)} -> tag d n
- | _ -> failwith "Tag"
+ | { doc=d; node=Node(NC n)} -> tag_id d n
+ | _ -> failwith "tag"
- let tag_id =
+(* let tag_id =
function { node=Node(SC _) } -> ()
| { doc=d; node=Node(NC n)} -> tag_id d n
| _ -> ()
-
+*)
let string_below t id =
- let pid = parent_doc t.doc id in
+ let strid = parent_doc t.doc id in
match t.node with
- | Node(NC(i)) -> (is_ancestor t.doc i pid)
- | Node(SC(i,_)) -> (is_ancestor t.doc (parent_doc t.doc i) pid)
+ | Node(NC(i)) ->
+ (Tree.equal i strid) || (is_ancestor t.doc i strid)
+ | Node(SC(i,_)) -> Text.equal i id
| _ -> false
+
let contains t s =
- Array.fold_left (fun a i -> DocIdSet.add i a) DocIdSet.empty (Text.contains t.text s)
+ Array.fold_left (fun a i -> DocIdSet.add i a) DocIdSet.empty (Text.contains t.doc s)
let contains_old t s =
let regexp = Str.regexp_string s in
let rec loop ?(print_right=true) t = match t.node with
| Nil -> ()
| String (s) -> output_string outc (string t)
- | Node _ when Tag.equal (tag t) Tag.pcdata -> loop (left t); loop (right t)
+ | Node _ when Tag.equal (tag t) Tag.pcdata ->
+ loop (left t);
+ if print_right then loop (right t)
| Node (_) ->
let tg = Tag.to_string (tag t) in
let rec aux n =
match n.node with
| Nil -> ()
- | String i -> () (*ignore(Text.get_text t.text i) *)
+ | String i -> () (*ignore(Text.get_text t.doc i) *)
| Node(_) ->
(* tag_id n; *)
aux (first_child n);
aux (next_sibling n)
in aux t
+
+ let print_stats _ = ()
end
end
-
-
+IFDEF DEBUG
+THEN
module DEBUGTREE
= struct
match n.node with
| Node (NC t) when is_leaf_ n.doc t ->
let txt = my_text_ n.doc t in
- if is_empty_ n.text txt
+ if is_empty_ n.doc txt
then Nil
else Node(SC (txt,XML.Tree.nil))
| Node (NC t) ->
let fs = first_child_ n.doc t in
let txt = prev_text_ n.doc fs in
- if is_empty_ n.text txt
+ if is_empty_ n.doc txt
then norm fs
else Node (SC (txt, fs))
| Node(SC (i,_)) -> String i
| Node (SC (_,ns)) -> norm ns
| Node(NC t) ->
let ns = next_sibling_ n.doc t in
- let txt = next_text_ n.doc t in
- if is_empty_ n.text txt
+ let txt =
+ if XML.Tree.is_nil ns then
+ next_text_ n.doc t
+ else prev_text_ n.doc ns
+ in
+ if is_empty_ n.doc txt
then norm ns
else Node (SC (txt, ns))
| Nil | String _ -> failwith "next_sibling"
| { doc=d; node=Node(SC (i,_) )} -> text_xml_id_ d i
| _ -> failwith "id"
-
(* Wrapper around critical function *)
let string t = time ("TextCollection.GetText()") (string) t
let left = first_child
end
module Binary = DEBUGTREE
-
+ELSE
+module Binary = XML.Binary
+END (* IFDEF DEBUG *)