(* Copyright NICTA 2008 *)
(* Distributed under the terms of the LGPL (see LICENCE) *)
(******************************************************************************)
+INCLUDE "debug.ml"
module type BINARY =
sig
type node_content
type t
val parse_xml_uri : string -> t
val parse_xml_string : string -> t
+ val tag_pool : t -> Tag.pool
val string : t -> string
val descr : t -> descr
val left : t -> t
val right : t -> t
+ val parent : t -> t
val id : t -> int
val tag : t -> Tag.t
val print_xml_fast : out_channel -> t -> unit
val compare : t -> t -> int
val equal : t -> t -> bool
+ module DocIdSet : Set.S with type elt = string_content
+ val string_below : t -> string_content -> bool
+ val contains : t -> string -> DocIdSet.t
+ val contains_old : t -> string -> bool
+ val dump : t -> unit
end
-module OldBinary =
-struct
-
- type string_content = string
- type descr = Nil | Node of node_content | String of string_content
- and node_content = int*Tag.t * descr * descr * (descr ref)
- type t = descr
-
- let descr t = t
-
- let string = function String s -> s | _ -> failwith "string"
-
-
- external parse_xml_uri : string -> t = "caml_call_shredder_uri"
- external parse_xml_string : string -> t = "caml_call_shredder_string"
-
- let parse_xml_uri s = Node(0,Tag.tag "",parse_xml_uri s,Nil,ref Nil)
- let parse_xml_string s = Node(0,Tag.tag "",parse_xml_string s,Nil,ref Nil)
- let tstring = function Nil -> "Nil"
- | Node (_,_,_,_,_) -> "Node"
- | String _ -> "String"
-
-
-let print_xml fmt t =
- let pp_str = Format.pp_print_string fmt in
- let rec loop = function Nil -> ()
- | String (s) -> pp_str s
- | Node (_,t,l,r,_) when Tag.equal t Tag.pcdata -> loop l;loop r
- | Node (_,t,l,r,_) ->
- pp_str ("<" ^ (Tag.to_string t));
- ( match l with
- Nil -> pp_str "/>"
- | Node(_,t',atts,children,_) when Tag.equal t' Tag.attribute ->
- (loop_attributes atts;
- match children with
- | Nil -> pp_str "/>"
- | _ ->
- pp_str ">";
- loop children;
- pp_str ("</"^ (Tag.to_string t)^">" )
- )
- | _ -> pp_str ">"; loop l;
- pp_str ("</"^ (Tag.to_string t)^">" );
- );loop r
- and loop_attributes = function
- | Node(_,t,Node(_,_,String(s),_,_),r,_) ->
- pp_str (" "^(Tag.to_string t)^"=\""^ s ^"\"") ;
- loop_attributes r
- | _ -> ()
-
- in
- loop t
-
-let print_xml fmt =
- function Node(i,t,l,_,_) -> print_xml fmt (Node(i,t,l,Nil,ref Nil))
- | t -> print_xml fmt t
-
-
-(* a bit ugly but inlining like this makes serialization faster *)
-
-let print_xml_fast outc t =
- let rec loop = function Nil -> ()
- | String (s) -> output_string outc s
- | Node (_,t,l,r,_) when Tag.equal t Tag.pcdata -> loop l;loop r
- | Node (_,t,l,r,_) -> let t = Tag.to_string t in
- output_char outc '<';
- output_string outc t;
- ( match l with
- Nil -> output_string outc "/>"
- | Node(_,t',atts,children,_) when Tag.equal t' Tag.attribute ->
- (loop_attributes atts;
- match children with
- | Nil -> output_string outc "/>"
- | _ ->
- output_char outc '>';
- loop children;
- output_string outc "</";
- output_string outc t;
- output_char outc '>' )
- | _ ->
- output_char outc '>';
- loop l;
- output_string outc "</";
- output_string outc t;
- output_char outc '>'
- );loop r
- and loop_attributes = function
- | Node(_,t,Node(_,_,String(s),_,_),r,_) ->
- output_char outc ' ';
- output_string outc (Tag.to_string t);
- output_string outc "=\"";
- output_string outc s;
- output_char outc '"';
- loop_attributes r
- | _ -> ()
-
- in
- loop t
-
-let print_xml_fast outc =
- function Node(i,t,l,_,_) -> print_xml_fast outc (Node(i,t,l,Nil,ref Nil))
- | t -> print_xml_fast outc t
-
-
-
-let tabs = ref 0
-
-let prtabs fmt =
- for i = 0 to !tabs
- do
- Format.fprintf fmt " "
- done
-
-
-let rec dump fmt t =
- incr tabs;
- let _ = match t with
- | Nil -> prtabs fmt; Format.fprintf fmt "#"
- | String s -> prtabs fmt; Format.fprintf fmt "(String %s)" s
- | Node(id,t,l,r,_) ->
- prtabs fmt;
- Format.fprintf fmt " (tag='";
- Tag.print fmt t;
- Format.fprintf fmt "', id='%i')\n" id;
- prtabs fmt;
- dump fmt l;
- Format.fprintf fmt "\n";
- prtabs fmt;
- dump fmt r;
- Format.fprintf fmt "\n";
- prtabs fmt;prtabs fmt;
- Format.fprintf fmt "(id='%i'end )\n" id
- in decr tabs
-
-
-let dump fmt t =
- tabs:=0;
- dump fmt t;
- tabs:=0
-
-let id = function Node(i,_,_,_,_) -> i
- | _ -> failwith "id"
-
-let tag = function Node(_,t,_,_,_) -> t
- | _ -> failwith "tag"
-
-let left = function Node(_,_,l,_,_) -> l
- | _ -> failwith "left"
-
-let right = function Node(_,_,_,r,_) -> r
- | _ -> failwith "right"
-
-let first_child = left
-let next_sibling = right
-
-let is_root = function Node (_,_,_,_,{contents=Nil}) -> true | _ -> false
-let is_left n = match n with
- | Node (_,_,_,_,{contents=p}) when not(is_root n) && (left p) == n -> true
- | _ -> false
-
-let is_right n = match n with
- | Node (_,_,_,_,{contents=p}) when not(is_root n) && (right p) == n -> true
- | _ -> false
-
-
-let compare t1 t2 = match t1,t2 with
- | Nil,Nil -> 0
- | String s1, String s2 -> String.compare s1 s2
- | Nil, String _ -> -1
- | String _, Nil -> 1
- | Node(i1,_,_,_,_), Node(i2,_,_,_,_) -> i1 - i2
- | _, Node _ -> -1
- | Node _ , _ -> 1
-let equal t1 t2 = (compare t1 t2) == 0
-
-let int_size = Sys.word_size/8
-let ssize s = ((String.length s)/4 +1)*4
-let rec size =
- function Nil -> (int_size,1,0,0)
- | String s -> (int_size + (ssize s),0,1,0)
- | Node(_,_,l,r,_) ->
- let sizel,nl,sl,il = size l
- and sizer,nr,sr,ir = size r
- in
- (sizel+sizer+(7*int_size),nl+nr,sl+sr,il+ir+1)
-let size t =
- let s,n,st,i = size t in
- s/1024,n,st,i
-end
-
-
module XML =
struct
external int_of_node : 'a node -> int = "%identity"
- external parse_xml_uri : string -> t = "caml_call_shredder_uri"
- let parse_xml_uri uri = parse_xml_uri uri
-
- external parse_xml_string : string -> t = "caml_call_shredder_string"
- let parse_xml_string uri = parse_xml_string uri
+ external parse_xml_uri : string -> int -> bool -> bool -> t = "caml_call_shredder_uri"
+
+ external parse_xml_string : string -> int -> bool -> bool -> t = "caml_call_shredder_string"
+
module Text =
struct
- type t (* pointer to the text collection *)
+
(* Todo *)
external nullt : unit -> [`Text ] node = "caml_xml_tree_nullt"
let nil = nullt ()
else get_text t n
external is_empty : t -> [`Text ] node -> bool = "caml_text_collection_empty_text"
+ let is_empty t n =
+ (equal nil n) || is_empty t n
+
+ external is_contains : t -> string -> bool = "caml_text_collection_is_contains"
+ external count_contains : t -> string -> int = "caml_text_collection_count_contains"
+ external contains : t -> string -> [`Text ] node array = "caml_text_collection_contains"
end
struct
- external serialize : string -> unit = "caml_xml_tree_serialize"
+ external serialize : t -> string -> unit = "caml_xml_tree_serialize"
external unserialize : string -> t = "caml_xml_tree_unserialize"
external root : t -> [`Tree] node = "caml_xml_tree_root"
let nil = nullt ()
let is_nil x = equal x nil
- external parent : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child"
+ external parent : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_parent"
external parent_doc : t -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc"
external first_child : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child"
external is_leaf : t -> [`Tree] node -> bool = "caml_xml_tree_is_leaf"
- external tag : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag"
-
- external text_collection : t -> Text.t = "caml_xml_tree_text_collection"
+(* external tag : t -> [`Tree ] node -> T = "caml_xml_tree_tag"*)
+ external tag_id : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id"
let is_last t n = equal nil (next_sibling t n)
external text_xml_id : t -> [`Text ] node -> int = "caml_xml_tree_text_xml_id"
external node_xml_id : t -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id"
-
+ external is_ancestor : t -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor"
let print_skel t =
- let textcol = text_collection t in
let rec aux id =
if (is_nil id)
- then Printf.eprintf "#"
+ then Printf.eprintf "#\n"
else
begin
- Printf.eprintf "Node %i has tag '%s', DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s)\n%!"
+ Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s)\n%!"
(int_of_node id)
- (Tag.to_string (tag t id))
+ (Tag.to_string (tag_id t id))
+ (node_xml_id t id)
(int_of_node (prev_text t id))
- (Text.get_text textcol (prev_text t id))
+ (Text.get_text t (prev_text t id))
(int_of_node (my_text t id))
- (Text.get_text textcol (my_text t id))
+ (Text.get_text t (my_text t id))
(int_of_node (next_text t id))
- (Text.get_text textcol (next_text t id));
+ (Text.get_text t (next_text t id));
aux(first_child t id);
aux(next_sibling t id);
end
aux (root t)
let traversal t =
- let textcol = text_collection t in
let rec aux id =
if not (is_nil id)
then
begin
- ignore (tag t id);
- ignore (Text.get_text textcol (prev_text t id));
+ (* ignore (tag t id);
+ ignore (Text.get_text t (prev_text t id));
if (is_leaf t id)
- then ignore (Text.get_text textcol (my_text t id));
+ then ignore (Text.get_text t (my_text t id));
if (is_last t id)
- then ignore (Text.get_text textcol (next_text t id));
+ then ignore (Text.get_text t (next_text t id)); *)
aux (first_child t id);
aux (next_sibling t id);
end
type doc = t
- type t = { doc : doc;
- text : Text.t;
+ type t = { doc : doc;
node : descr }
- let dump { doc=t } = Tree.print_skel t
+ let dump { doc=t } = Tree.print_skel t
+ module DocIdSet = Set.Make (struct type t = string_content
+ let compare = (-) end)
+
+
open Tree
let node_of_t t = { doc= t;
- text = text_collection t;
node = Node(NC (root t)) }
- let parse_xml_uri str = node_of_t (parse_xml_uri str)
- let parse_xml_string str = node_of_t (parse_xml_string str)
+ let parse_xml_uri str = node_of_t
+ (MM((parse_xml_uri str
+ !Options.sample_factor
+ !Options.index_empty_texts
+ !Options.disable_text_collection),__LOCATION__))
+
+ let parse_xml_string str = node_of_t
+ (MM((parse_xml_string str
+ !Options.sample_factor
+ !Options.index_empty_texts
+ !Options.disable_text_collection),__LOCATION__))
+
+
+ external pool : doc -> Tag.pool = "%identity"
+ let tag_pool t = pool t.doc
let compare a b = match a.node,b.node with
| Node(NC i),Node(NC j) -> compare i j
let equal a b = (compare a b) == 0
let string t = match t.node with
- | String i -> Text.get_text t.text i
+ | String i -> Text.get_text t.doc i
| _ -> assert false
let norm (n : [`Tree ] node ) = if is_nil n then Nil else Node (NC n)
| Node (NC t) -> Printf.sprintf "Node (NC %i)" (int_of_node t)
| Node (SC (t,i)) -> Printf.sprintf "Node (SC (%i,%i))" (int_of_node t) (int_of_node i)
+
+ let parent n =
+ let node' =
+ match n.node with
+ | Node(NC t) | Node(SC (_,t)) ->
+ if (Tree.root n.doc) == t
+ then Nil
+ else Node(NC(Tree.parent n.doc t)) (* A parent node can never be a SC *)
+ | _ -> assert false
+ in
+ { n with node = node' }
+
let first_child n =
let node' =
match n.node with
| Node (NC t) when is_leaf n.doc t ->
let txt = my_text n.doc t in
- if Text.is_empty n.text txt
+ if Text.is_empty n.doc txt
then Nil
else Node(SC (txt,Tree.nil))
| Node (NC t) ->
let fs = first_child n.doc t in
let txt = prev_text n.doc fs in
- if Text.is_empty n.text txt
+ if Text.is_empty n.doc txt
then norm fs
else Node (SC (txt, fs))
| Node(SC (i,_)) -> String i
| Node(NC t) ->
let ns = next_sibling n.doc t in
let txt = next_text n.doc t in
- if Text.is_empty n.text txt
+ if Text.is_empty n.doc txt
then norm ns
else Node (SC (txt, ns))
| Nil | String _ -> failwith "next_sibling"
let right = next_sibling
let id =
- function { doc=d; node=Node(NC n)} -> text_xml_id d n
- | { doc=d; node=Node(SC (i,_) )} -> node_xml_id d i
+ function { doc=d; node=Node(NC n)} -> node_xml_id d n
+ | { doc=d; node=Node(SC (i,_) )} -> text_xml_id d i
| _ -> failwith "id"
let tag =
function { node=Node(SC _) } -> Tag.pcdata
- | { doc=d; node=Node(NC n)} -> tag d n
- | _ -> failwith "Tag"
+ | { doc=d; node=Node(NC n)} -> tag_id d n
+ | _ -> failwith "tag"
-
-
+(* let tag_id =
+ function { node=Node(SC _) } -> ()
+ | { doc=d; node=Node(NC n)} -> tag_id d n
+ | _ -> ()
+*)
+ let string_below t id =
+ let pid = parent_doc t.doc id in
+ match t.node with
+ | Node(NC(i)) -> (is_ancestor t.doc i pid)
+ | Node(SC(i,_)) -> (is_ancestor t.doc (parent_doc t.doc i) pid)
+ | _ -> false
+
+ let contains t s =
+ Array.fold_left (fun a i -> DocIdSet.add i a) DocIdSet.empty (Text.contains t.doc s)
+
+ let contains_old t s =
+ let regexp = Str.regexp_string s in
+ let matching arg =
+ try
+ let _ = Str.search_forward regexp arg 0;
+ in true
+ with _ -> false
+ in
+ let rec find t = match t.node with
+ | Nil -> false
+ | String _ -> matching (string t)
+ | Node(_) -> (find (left t )) || (find (right t))
+ in
+ find t
+
let print_xml_fast outc t =
let rec loop ?(print_right=true) t = match t.node with
| Nil -> ()
let rec aux n =
match n.node with
| Nil -> ()
- | String i -> ignore(Text.get_text t.text i)
+ | String i -> () (*ignore(Text.get_text t.doc i) *)
| Node(_) ->
- ignore (tag n);
+ (* tag_id n; *)
aux (first_child n);
aux (next_sibling n)
in aux t
+
+ let print_stats _ = ()
end
end
-let dump = XML.Binary.dump
-let traversal = XML.Binary.traversal
-let full_traversal = XML.Binary.full_traversal
-external cpp_traversal : XML.t -> unit = "caml_cpp_traversal"
-let cpp_traversal t = cpp_traversal t.XML.Binary.doc
-include XML
+IFDEF DEBUG
+THEN
+module DEBUGTREE
+ = struct
+
+ let _timings = Hashtbl.create 107
+
+
+ let time _ref f arg =
+ let t1 = Unix.gettimeofday () in
+ let r = f arg in
+ let t2 = Unix.gettimeofday () in
+ let t = (1000. *.(t2 -. t1)) in
+
+ let (time,count) = try
+ Hashtbl.find _timings _ref
+ with
+ | Not_found -> 0.,0
+ in
+ let time = time+. t
+ and count = count + 1
+ in
+ Hashtbl.replace _timings _ref (time,count);r
+
+ include XML.Binary
+
+
+ let first_child_ doc node =
+ time ("XMLTree.FirstChild()") (XML.Tree.first_child doc) node
+ let next_sibling_ doc node =
+ time ("XMLTree.NextSibling()") (XML.Tree.next_sibling doc) node
+
+ let is_empty_ text node =
+ time ("TextCollection.IsEmpty()") (XML.Text.is_empty text) node
+
+ let prev_text_ doc node =
+ time ("XMLTree.PrevText()") (XML.Tree.prev_text doc) node
+
+ let my_text_ doc node =
+ time ("XMLTree.MyText()") (XML.Tree.my_text doc) node
+
+ let next_text_ doc node =
+ time ("XMLTree.NextText()") (XML.Tree.next_text doc) node
+
+ let is_leaf_ doc node =
+ time ("XMLTree.IsLeaf()") (XML.Tree.is_leaf doc ) node
+
+ let node_xml_id_ doc node =
+ time ("XMLTree.NodeXMLId()") (XML.Tree.node_xml_id doc ) node
+
+ let text_xml_id_ doc node =
+ time ("XMLTree.TextXMLId()") (XML.Tree.text_xml_id doc ) node
+
+
+ let first_child n =
+ let node' =
+ match n.node with
+ | Node (NC t) when is_leaf_ n.doc t ->
+ let txt = my_text_ n.doc t in
+ if is_empty_ n.doc txt
+ then Nil
+ else Node(SC (txt,XML.Tree.nil))
+ | Node (NC t) ->
+ let fs = first_child_ n.doc t in
+ let txt = prev_text_ n.doc fs in
+ if is_empty_ n.doc txt
+ then norm fs
+ else Node (SC (txt, fs))
+ | Node(SC (i,_)) -> String i
+ | Nil | String _ -> failwith "first_child"
+ in
+ { n with node = node'}
+
+
+ let next_sibling n =
+ let node' =
+ match n.node with
+ | Node (SC (_,ns)) -> norm ns
+ | Node(NC t) ->
+ let ns = next_sibling_ n.doc t in
+ let txt =
+ if XML.Tree.is_nil ns then
+ next_text_ n.doc t
+ else prev_text_ n.doc ns
+ in
+ if is_empty_ n.doc txt
+ then norm ns
+ else Node (SC (txt, ns))
+ | Nil | String _ -> failwith "next_sibling"
+ in
+ { n with node = node'}
+
+ let id =
+ function { doc=d; node=Node(NC n)} -> node_xml_id_ d n
+ | { doc=d; node=Node(SC (i,_) )} -> text_xml_id_ d i
+ | _ -> failwith "id"
+
+ (* Wrapper around critical function *)
+ let string t = time ("TextCollection.GetText()") (string) t
+ let left = first_child
+ let right = next_sibling
+ let tag t = time ("XMLTree.GetTag()") (tag) t
+
+ let print_stats ppf =
+ let total_time,total_calls =
+ Hashtbl.fold (fun _ (t,c) (tacc,cacc) ->
+ tacc+. t, cacc + c) _timings (0.,0)
+
+ in
+ Format.fprintf ppf
+ "Timing : Function Name, number of calls,%% of total calls, mean time, total time, %% of total time\n%!";
+ Hashtbl.iter (fun name (time,count) ->
+ Format.fprintf ppf "%-27s% 8d\t% 4.2f%%\t% 4.6f ms\t% 4.6f ms\t%04.2f%%\n%!"
+ name
+ count
+ (100. *. (float_of_int count)/.(float_of_int total_calls))
+ (time /. (float_of_int count))
+ time
+ (100. *. time /. total_time)) _timings;
+ Format.fprintf ppf "-------------------------------------------------------------------\n";
+ Format.fprintf ppf "%-27s% 8d\t% 4.0f%%\t########## ms\t% 4.6f ms\t% 4.0f%%\n%!"
+ "Total" total_calls 100. total_time 100.
+
+
+ let print_xml_fast outc t =
+ let rec loop ?(print_right=true) t = match t.node with
+ | Nil -> ()
+ | String (s) -> output_string outc (string t)
+ | Node _ when Tag.equal (tag t) Tag.pcdata -> loop (left t); loop (right t)
+
+ | Node (_) ->
+ let tg = Tag.to_string (tag t) in
+ let l = left t
+ and r = right t
+ in
+ output_char outc '<';
+ output_string outc tg;
+ ( match l.node with
+ Nil -> output_string outc "/>"
+ | String _ -> assert false
+ | Node(_) when Tag.equal (tag l) Tag.attribute ->
+ (loop_attributes (left l);
+ match (right l).node with
+ | Nil -> output_string outc "/>"
+ | _ ->
+ output_char outc '>';
+ loop (right l);
+ output_string outc "</";
+ output_string outc tg;
+ output_char outc '>' )
+ | _ ->
+ output_char outc '>';
+ loop l;
+ output_string outc "</";
+ output_string outc tg;
+ output_char outc '>'
+ );if print_right then loop r
+ and loop_attributes a =
+
+ match a.node with
+ | Node(_) ->
+ let value =
+ match (left a).node with
+ | Nil -> ""
+ | _ -> string (left(left a))
+ in
+ output_char outc ' ';
+ output_string outc (Tag.to_string (tag a));
+ output_string outc "=\"";
+ output_string outc value;
+ output_char outc '"';
+ loop_attributes (right a)
+ | _ -> ()
+ in
+ loop ~print_right:false t
+
+
+ let print_xml_fast outc t =
+ if Tag.to_string (tag t) = "" then
+ print_xml_fast outc (first_child t)
+ else print_xml_fast outc t
+
+
+
+
+end
+
+module Binary = DEBUGTREE
+ELSE
+module Binary = XML.Binary
+END (* IFDEF DEBUG *)