val tag_pool : t -> Tag.pool
val string : t -> string
val descr : t -> descr
+ val is_node : t -> bool
val left : t -> t
val right : t -> t
+ val first_child : t -> t
+ val next_sibling : t -> t
val parent : t -> t
+ val root : t -> t
+ val is_root : t -> bool
val id : t -> int
val tag : t -> Tag.t
val print_xml_fast : out_channel -> t -> unit
val compare : t -> t -> int
val equal : t -> t -> bool
- module DocIdSet : Set.S with type elt = string_content
+ module DocIdSet :
+ sig
+ include Set.S
+ end
+ with type elt = string_content
val string_below : t -> string_content -> bool
val contains : t -> string -> DocIdSet.t
- val contains_old : t -> string -> bool
+ val contains_old : t -> string -> DocIdSet.t
+ val contains_iter : t -> string -> DocIdSet.t
+ val count_contains : t -> string -> int
+ val count : t -> string -> int
val dump : t -> unit
+ val get_string : t -> string_content -> string
+ val has_tagged_desc : t -> Tag.t -> bool
+ val has_tagged_foll : t -> Tag.t -> bool
+ val tagged_desc : t -> Tag.t -> t
+ val tagged_foll : t -> Tag.t -> t
+ val init_tagged_next : t -> Tag.t -> unit
+ val tagged_next : t -> Tag.t -> t
+ val subtree_tags : t -> Tag.t -> int
+ val is_left : t -> bool
+ val print_id : Format.formatter -> t -> unit
+ val test_xml_tree : Format.formatter -> Ptset.t -> t -> unit
end
module XML =
module Text =
struct
-
+ let equal : [`Text] node -> [`Text] node -> bool = equal
+
(* Todo *)
external nullt : unit -> [`Text ] node = "caml_xml_tree_nullt"
let nil = nullt ()
external get_text : t -> [`Text] node -> string = "caml_text_collection_get_text"
- let get_text t n =
+(* let get_text t n =
if equal nil n then ""
else get_text t n
+*)
external is_empty : t -> [`Text ] node -> bool = "caml_text_collection_empty_text"
+
let is_empty t n =
(equal nil n) || is_empty t n
+ external get_cached_text : t -> [`Text ] node -> string = "caml_text_collection_get_cached_text"
+
+
+ let get_text t n =
+ if equal nil n then ""
+ else get_cached_text t n
+
+ external size : t -> int = "caml_text_collection_size"
external is_contains : t -> string -> bool = "caml_text_collection_is_contains"
external count_contains : t -> string -> int = "caml_text_collection_count_contains"
+ external count : t -> string -> int = "caml_text_collection_count"
external contains : t -> string -> [`Text ] node array = "caml_text_collection_contains"
end
module Tree =
struct
-
+ let equal : [`Tree ] node -> [`Tree] node -> bool = equal
external serialize : t -> string -> unit = "caml_xml_tree_serialize"
external unserialize : string -> t = "caml_xml_tree_unserialize"
external next_sibling : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling"
-
+ external prev_sibling : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_prev_sibling"
external is_leaf : t -> [`Tree] node -> bool = "caml_xml_tree_is_leaf"
(* external tag : t -> [`Tree ] node -> T = "caml_xml_tree_tag"*)
external tag_id : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag_id"
+(*
+ let tag_hash = Array.make 6_000_000 (Tag.nullt)
+
+ let tag_id t id =
+ let tag = tag_hash.(int_of_node id)
+ in
+ if tag != Tag.nullt then tag
+ else
+ let tag = tag_id t id in
+ (tag_hash.(int_of_node id) <- tag; tag)
+*)
let is_last t n = equal nil (next_sibling t n)
external prev_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text"
external text_xml_id : t -> [`Text ] node -> int = "caml_xml_tree_text_xml_id"
external node_xml_id : t -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id"
external is_ancestor : t -> [`Tree ] node -> [`Tree ] node -> bool = "caml_xml_tree_is_ancestor"
+ external tagged_desc : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_desc"
+ external tagged_foll : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_foll"
+ external tagged_next : t -> [`Tree ] node -> Tag.t -> [`Tree ] node = "caml_xml_tree_tagged_next"
+ external subtree_tags : t -> [`Tree ] node -> Tag.t -> int = "caml_xml_tree_subtree_tags"
+
+
+
+ let test_xml_tree ppf tags v =
+ let pr x = Format.fprintf ppf x in
+ let rec aux id =
+ if (is_nil id)
+ then ()
+ else
+ begin
+ pr "Node %i, (Tag) %i='%s' (GetTagName), NodeXMLId (Preorder)=%i\n%!"
+ (int_of_node id)
+ (tag_id v id)
+ (Tag.to_string (tag_id v id))
+ (node_xml_id v id);
+ pr "DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) ParentDoc(my_text)=%i\n%!"
+ (int_of_node (prev_text v id))
+ (Text.get_text v (prev_text v id))
+ (int_of_node (my_text v id))
+ (Text.get_text v (my_text v id))
+ (int_of_node (next_text v id))
+ (Text.get_text v (next_text v id))
+ (int_of_node(parent_doc v (my_text v id)));
+ pr "Testing Tagged*\n%!";
+ Ptset.iter (fun t ->
+ let str = Tag.to_string t in
+ if Tag.pcdata <> t
+ then begin
+ pr "Tag: %s : \n%!" str;
+ pr "TaggedDesc = %i%!, " (tagged_desc v id t);
+ pr "TaggedFoll = %i\n%!" (tagged_foll v id t);
+ pr "SubtreeTags = %i\n%!" (subtree_tags v id t);
+ end) tags;
+ pr "----------------------------\n";
+ aux(first_child v id);
+ aux(next_sibling v id);
+ end
+ in
+ aux (root v)
+
+
+
+
+
let print_skel t =
let rec aux id =
then Printf.eprintf "#\n"
else
begin
- Printf.eprintf "Node %i has tag '%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s)\n%!"
+ Printf.eprintf "Node %i has tag '%i=%s' DocOrder=%i, DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s) parent_doc(my_text)=%i\n%!"
(int_of_node id)
+ (tag_id t id)
(Tag.to_string (tag_id t id))
(node_xml_id t id)
(int_of_node (prev_text t id))
(int_of_node (my_text t id))
(Text.get_text t (my_text t id))
(int_of_node (next_text t id))
- (Text.get_text t (next_text t id));
+ (Text.get_text t (next_text t id))
+ (int_of_node(parent_doc t (my_text t id)));
+
aux(first_child t id);
aux(next_sibling t id);
end
end
in
aux (root t)
+
+
+
end
node : descr }
let dump { doc=t } = Tree.print_skel t
- module DocIdSet = Set.Make (struct type t = string_content
- let compare = (-) end)
+ let test_xml_tree ppf tags { doc=t } = Tree.test_xml_tree ppf tags t
+
+ module DocIdSet = struct
+ include Set.Make (struct type t = string_content
+ let compare = (-) end)
+
+ end
+ let is_node = function { node=Node(_) } -> true | _ -> false
-
+ let get_string t (i:string_content) = Text.get_text t.doc i
open Tree
let node_of_t t = { doc= t;
node = Node(NC (root t)) }
| Node (SC (t,i)) -> Printf.sprintf "Node (SC (%i,%i))" (int_of_node t) (int_of_node i)
- let parent n =
+ let root n = { n with node = norm (Tree.root n.doc) }
+ let is_root n = match n.node with
+ | Node(NC t) -> (int_of_node t) == 0
+ | _ -> false
+
+ let parent n =
+ if is_root n then { n with node=Nil}
+ else
let node' =
match n.node with
- | Node(NC t) | Node(SC (_,t)) ->
- if (Tree.root n.doc) == t
- then Nil
- else Node(NC(Tree.parent n.doc t)) (* A parent node can never be a SC *)
- | _ -> assert false
+ | Node(NC t) ->
+ let txt = prev_text n.doc t in
+ if Text.is_empty n.doc txt then
+ let ps = Tree.prev_sibling n.doc t in
+ if is_nil ps
+ then
+ Node(NC (Tree.parent n.doc t))
+ else Node(NC ps)
+ else
+ Node(SC (txt,t))
+ | Node(SC(i,t)) ->
+ let ps = Tree.prev_sibling n.doc t in
+ if is_nil ps
+ then Node (NC(parent_doc n.doc i))
+ else Node(NC ps)
+ | _ -> failwith "parent"
in
{ n with node = node' }
| Nil | String _ -> failwith "first_child"
in
{ n with node = node'}
-
let next_sibling n =
let node' =
let id =
function { doc=d; node=Node(NC n)} -> node_xml_id d n
| { doc=d; node=Node(SC (i,_) )} -> text_xml_id d i
- | _ -> failwith "id"
+ | _ -> -1 (*
+ Format.fprintf Format.err_formatter "Failure id on %s\n%!" (nts x.node);
+ failwith "id" *)
let tag =
function { node=Node(SC _) } -> Tag.pcdata
| { doc=d; node=Node(NC n)} -> tag_id d n
| _ -> failwith "tag"
-(* let tag_id =
- function { node=Node(SC _) } -> ()
- | { doc=d; node=Node(NC n)} -> tag_id d n
- | _ -> ()
-*)
let string_below t id =
- let pid = parent_doc t.doc id in
+ let strid = parent_doc t.doc id in
match t.node with
- | Node(NC(i)) -> (is_ancestor t.doc i pid)
- | Node(SC(i,_)) -> (is_ancestor t.doc (parent_doc t.doc i) pid)
+ | Node(NC(i)) ->
+ (Tree.equal i strid) || (is_ancestor t.doc i strid)
+ | Node(SC(i,_)) -> Text.equal i id
| _ -> false
+
+
+ let tagged_foll t tag =
+ if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_foll"
+ else match t with
+ | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_foll d n tag) }
+ | { doc=d; node=Node(SC (_,n)) } when is_nil n -> { t with node= Nil }
+ | { doc=d; node=Node(SC (_,n)) } ->
+ let nnode =
+ if tag_id d n == tag then n
+ else
+ let n' = tagged_desc d n tag in
+ if is_nil n' then tagged_foll d n tag
+ else n'
+ in {t with node= norm nnode}
+ | _ -> { t with node=Nil }
+
+
+ let tagged_desc t tag =
+ if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_desc"
+ else match t with
+ | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_desc d n tag) }
+ | _ -> { t with node=Nil }
+
+(*
+ let tagged_next t tag =
+ if tag == Tag.attribute || tag == Tag.pcdata then failwith "tagged_next"
+ else
+ match tagged_desc t tag with
+ | { doc = d; node=Nil } -> tagged_foll t tag
+ | x -> x
+*)
+ let subtree_tags t tag =
+ match t with
+ { doc = d; node = Node(NC n) } ->
+ subtree_tags d n tag
+ | _ -> 0
+
+ let tagged_desc_array = ref [| |]
+ let idx = ref 0
+
+ let init_tagged_next t tagid =
+ let l = subtree_tags (root t) tagid
+ in
+ tagged_desc_array := Array.create l { t with node= Nil };
+ let i = ref 0 in
+ let rec collect t =
+ if is_node t then begin
+ if tag t == tagid then
+ begin
+ !tagged_desc_array.(!i) <- t;
+ incr i;
+ end;
+ collect (first_child t);
+ collect (next_sibling t)
+ end;
+ in
+ collect t;
+ idx := 0
+
+ let print_id ppf v =
+ let pr x= Format.fprintf ppf x in
+ match v with
+ { node=Nil } -> pr "NULLT: -1"
+ | { node=String(i) } | { node=Node(SC(i,_)) } -> pr "DocID: %i" (int_of_node i)
+ | { node=Node(NC(i)) } -> pr "Node: %i" (int_of_node i)
+
+
+ let tagged_next t tag =
+ if !idx >= Array.length !tagged_desc_array
+ then {t with node=Nil}
+ else
+ let r = !tagged_desc_array.(!idx)
+ in
+ incr idx; r
+
+
+
+ let has_tagged_foll t tag = is_node (tagged_foll t tag)
+ let has_tagged_desc t tag = is_node (tagged_desc t tag)
+
let contains t s =
Array.fold_left (fun a i -> DocIdSet.add i a) DocIdSet.empty (Text.contains t.doc s)
+
let contains_old t s =
let regexp = Str.regexp_string s in
let matching arg =
in true
with _ -> false
in
- let rec find t = match t.node with
- | Nil -> false
- | String _ -> matching (string t)
- | Node(_) -> (find (left t )) || (find (right t))
+ let rec find t acc = match t.node with
+ | Nil -> acc
+ | String i ->
+ if matching (string t) then DocIdSet.add i acc else acc
+ | Node(_) -> (find (left t )) ((find (right t)) acc)
in
- find t
+ find t DocIdSet.empty
+
+
+ let contains_iter t s =
+ let regexp = Str.regexp_string s in
+ let matching arg =
+ try
+ let _ = Str.search_forward regexp arg 0;
+ in true
+ with _ -> false
+ in
+ let size = Text.size t.doc in
+ let rec find acc n =
+ if n == size then acc
+ else
+ find
+ (if matching (Text.get_cached_text t.doc (Obj.magic n)) then
+ DocIdSet.add (Obj.magic n) acc
+ else acc) (n+1)
+ in
+ find DocIdSet.empty 0
+
+
+
+
+ let count_contains t s = Text.count_contains t.doc s
+ let count t s = Text.count t.doc s
+
+ let is_left t =
+ if is_root t then false
+ else
+ if tag (parent t) == Tag.pcdata then false
+ else
+ let u = left (parent t) in
+ (id t) == (id u)
let print_xml_fast outc t =
let rec loop ?(print_right=true) t = match t.node with
| Nil -> ()
- | String (s) -> output_string outc (string t)
- | Node _ when Tag.equal (tag t) Tag.pcdata -> loop (left t); loop (right t)
+ | String (s) -> output_string outc (Text.get_text t.doc s)
+ | Node _ when Tag.equal (tag t) Tag.pcdata ->
+ loop (left t);
+ if print_right then loop (right t)
| Node (_) ->
let tg = Tag.to_string (tag t) in
print_xml_fast outc (first_child t)
else print_xml_fast outc t
+
+
+
+
let traversal t = Tree.traversal t.doc
let full_traversal t =
let rec aux n =