- let traversal t =
- let rec aux id =
- if not (is_nil id)
- then
- begin
- (* ignore (tag t id);
- ignore (Text.get_text t (prev_text t id));
- if (is_leaf t id)
- then ignore (Text.get_text t (my_text t id));
- if (is_last t id)
- then ignore (Text.get_text t (next_text t id)); *)
- aux (first_child t id);
- aux (next_sibling t id);
- end
- in
- aux (root t)
-
-
-
- end
-
-
- module Binary = struct
-
- type node_content =
- NC of [`Tree ] node
- | SC of [`Text ] node * [`Tree ] node
- type string_content = [ `Text ] node
- type descr =
- | Nil
- | Node of node_content
- | String of string_content
-
- type doc = t
-
- type t = { doc : doc;
- node : descr }
-
- let dump { doc=t } = Tree.print_skel t
- let test_xml_tree ppf tags { doc=t } = Tree.test_xml_tree ppf tags t
- let test_jump { doc=t } tag = Tree.test_jump t tag
- let contains_array = ref [| |]
-
- let init_contains t s =
- let a = Text.contains t.doc s
- in
- Array.fast_sort (compare) a;
- contains_array := a
-
-
-
- module DocIdSet = struct
- include Set.Make (struct type t = string_content
- let compare = (-) end)
-
- end
- let is_node = function { node=Node(_) } -> true | _ -> false
-
- let get_string t (i:string_content) = Text.get_text t.doc i
- open Tree
- let node_of_t t = { doc= t;
- node = Node(NC (root t)) }
-
-
- let parse_xml_uri str = node_of_t
- (MM((parse_xml_uri str
- !Options.sample_factor
- !Options.index_empty_texts
- !Options.disable_text_collection),__LOCATION__))
-
- let parse_xml_string str = node_of_t
- (MM((parse_xml_string str
- !Options.sample_factor
- !Options.index_empty_texts
- !Options.disable_text_collection),__LOCATION__))
-
-
- let save t str = save_tree t.doc str
-
- let load ?(sample=64) str = node_of_t (load_tree str sample)
-
-
- external pool : doc -> Tag.pool = "%identity"
- let tag_pool t = pool t.doc
-
- let compare a b = match a.node,b.node with
- | Node(NC i),Node(NC j) -> compare i j
- | _, Node(NC( _ )) -> 1
- | Node(SC (i,_)),Node(SC (j,_)) -> compare i j
- | Node(NC( _ )),Node(SC (_,_)) -> -1
- | _, Node(SC (_,_)) -> 1
- | String i, String j -> compare i j
- | Node _ , String _ -> -1
- | _ , String _ -> 1
- | Nil, Nil -> 0
- | _,Nil -> -1
-
- let equal a b = (compare a b) == 0
-
- let string t = match t.node with
- | String i -> Text.get_text t.doc i
- | _ -> assert false
-
- let norm (n : [`Tree ] node ) = if is_nil n then Nil else Node (NC n)
-
- let descr t = t.node
-
- let nts = function
- Nil -> "Nil"
- | String i -> Printf.sprintf "String %i" i
- | Node (NC t) -> Printf.sprintf "Node (NC %i)" (int_of_node t)
- | Node (SC (t,i)) -> Printf.sprintf "Node (SC (%i,%i))" (int_of_node t) (int_of_node i)
-
- let mk_nil t = { t with node = Nil }
- let root n = { n with node = norm (Tree.root n.doc) }
- let is_root n = match n.node with
- | Node(NC t) -> (int_of_node t) == 0
- | _ -> false
-
- let parent n =
- if is_root n then { n with node=Nil}
- else
- let node' =
- match n.node with
- | Node(NC t) ->
- let txt = prev_text n.doc t in
- if Text.is_empty n.doc txt then
- let ps = Tree.prev_sibling n.doc t in
- if is_nil ps
- then
- Node(NC (Tree.parent n.doc t))
- else Node(NC ps)
- else
- Node(SC (txt,t))
- | Node(SC(i,t)) ->
- let ps = Tree.prev_sibling n.doc t in
- if is_nil ps
- then Node (NC(parent_doc n.doc i))
- else Node(NC ps)
- | _ -> failwith "parent"
- in
- { n with node = node' }
-
- let first_child n =
- let node' =
- match n.node with
- | Node (NC t) when is_leaf n.doc t ->
- let txt = my_text n.doc t in
- if Text.is_empty n.doc txt
- then Nil
- else Node(SC (txt,Tree.nil))
- | Node (NC t) ->
- let fs = first_child n.doc t in
- let txt = prev_text n.doc fs in
- if Text.is_empty n.doc txt
- then norm fs
- else Node (SC (txt, fs))
- | Node(SC (i,_)) -> String i
- | Nil | String _ -> failwith "first_child"
- in
- { n with node = node'}
-
- let next_sibling n =
- let node' =
- match n.node with
- | Node (SC (_,ns)) -> norm ns
- | Node(NC t) ->
- let ns = next_sibling n.doc t in
- let txt = next_text n.doc t in
- if Text.is_empty n.doc txt
- then norm ns
- else Node (SC (txt, ns))
- | Nil | String _ -> failwith "next_sibling"
- in
- { n with node = node'}
-
-
- let left = first_child
- let right = next_sibling
-
- let id =
- function { doc=d; node=Node(NC n)} -> node_xml_id d n
- | { doc=d; node=Node(SC (i,_) )} -> text_xml_id d i
- | _ -> -1 (*
- Format.fprintf Format.err_formatter "Failure id on %s\n%!" (nts x.node);
- failwith "id" *)
-
- let tag =
- function { node=Node(SC _) } -> Tag.pcdata
- | { doc=d; node=Node(NC n)} -> tag_id d n
- | _ -> failwith "tag"
-
- let string_below t id =
- let strid = parent_doc t.doc id in
- match t.node with
- | Node(NC(i)) ->
- (Tree.equal i strid) || (is_ancestor t.doc i strid)
- | Node(SC(i,_)) -> Text.equal i id
- | _ -> false
-
-
- let tagged_foll t tag =
- if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_foll"
- else match t with
- | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_foll d n tag) }
- | { doc=d; node=Node(SC (_,n)) } when is_nil n -> { t with node= Nil }
- | { doc=d; node=Node(SC (_,n)) } ->
- let nnode =
- if tag_id d n == tag then n
- else
- let n' = tagged_desc d n tag in
- if is_nil n' then tagged_foll d n tag
- else n'
- in {t with node= norm nnode}
- | _ -> { t with node=Nil }
-
-
- let tagged_desc t tag =
- if tag = Tag.attribute || tag = Tag.pcdata then failwith "tagged_desc"
- else match t with
- | { doc=d; node=Node(NC n) } -> { t with node = norm (tagged_desc d n tag) }
- | _ -> { t with node=Nil }
-
-
- let tagged_next t tb tf s =
- match s with
- | { node = Node (NC(below)) } -> begin
- match t with
- | { doc = d; node=Node(NC n) } ->
- { t with node= norm (tagged_next d n (Ptset.to_int_vector tb) (Ptset.to_int_vector tf) below) }
- | { doc = d; node=Node(SC (i,n) ) } when is_nil n ->
- let p = parent_doc d i in
- { t with node= norm (tagged_next d p (Ptset.to_int_vector tb) (Ptset.to_int_vector tf) below) }
- | { doc = d; node=Node(SC (_,n) ) } ->
- if Ptset.mem (tag_id d n) (Ptset.union tb tf)
- then { t with node=Node(NC(n)) }
- else
- let vb = Ptset.to_int_vector tb in
- let vf = Ptset.to_int_vector tf in
- let node =
- let dsc = tagged_below d n vb vf in
- if is_nil dsc
- then tagged_next d n vb vf below
- else dsc
- in
- { t with node = norm node }
- | _ -> {t with node=Nil }
- end
-
- | _ -> {t with node=Nil }
-
- let tagged_foll_only t tf s =
- match s with
- | { node = Node (NC(below)) } -> begin
- match t with
- | { doc = d; node=Node(NC n) } ->
- { t with node= norm (tagged_foll_only d n (Ptset.to_int_vector tf) below) }
- | { doc = d; node=Node(SC (i,n) ) } when is_nil n ->
- let p = parent_doc d i in
- { t with node= norm (tagged_foll_only d p (Ptset.to_int_vector tf) below) }
- | { doc = d; node=Node(SC (_,n) ) } ->
- if Ptset.mem (tag_id d n) tf
- then { t with node=Node(NC(n)) }
- else
- let vf = Ptset.to_int_vector tf in
- let node =
- let dsc = tagged_desc_only d n vf in
- if is_nil dsc
- then tagged_foll_only d n vf below
- else dsc
- in
- { t with node = norm node }
- | _ -> {t with node=Nil }
- end
-
- | _ -> {t with node=Nil }
-
-
- let tagged_below t tc td =
- match t with
- | { doc = d; node=Node(NC n) } ->
- let vc = Ptset.to_int_vector tc
- in
- let vd = Ptset.to_int_vector td
- in
- { t with node= norm(tagged_below d n vc vd) }
- | _ -> { t with node=Nil }
+end
+let is_nil t = t == nil
+
+let is_node t = t != nil
+let is_root t = t == root
+
+let node_of_t t =
+ let _ = Tag.init (Obj.magic t) in
+ let table = collect_tags t
+ in (*
+ let _ = Hashtbl.iter (fun t (c,d,ns,f) ->
+ Printf.eprintf "Tag %s has:\n" (Tag.to_string t);
+ Printf.eprintf "Child tags: ";
+ Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) c;
+ Printf.eprintf "\nDescendant tags: ";
+ Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) d;
+ Printf.eprintf "\nNextSibling tags: ";
+ Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) ns;
+ Printf.eprintf "\nFollowing tags: ";
+ Ptset.Int.iter (fun t -> Printf.eprintf "%s "(Tag.to_string t)) f;
+ Printf.eprintf "\n\n%!";) table
+ in
+
+ *)
+ { doc= t;
+ ttable = table;
+ }
+
+let finalize _ = Printf.eprintf "Release the string list !\n%!"
+;;
+
+let parse f str =
+ node_of_t
+ (f str
+ !Options.sample_factor
+ !Options.index_empty_texts
+ !Options.disable_text_collection)
+
+let parse_xml_uri str = parse parse_xml_uri str
+let parse_xml_string str = parse parse_xml_string str
+
+let size t = tree_size t.doc;;
+
+external pool : tree -> Tag.pool = "%identity"
+
+let magic_string = "SXSI_INDEX"
+let version_string = "2"
+
+let pos fd =
+ Unix.lseek fd 0 Unix.SEEK_CUR
+
+let pr_pos fd = Printf.eprintf "At position %i\n%!" (pos fd)
+
+let write fd s =
+ let sl = String.length s in
+ let ssl = Printf.sprintf "%020i" sl in
+ ignore (Unix.write fd ssl 0 20);
+ ignore (Unix.write fd s 0 (String.length s))
+
+let rec really_read fd buffer start length =
+ if length <= 0 then () else
+ match Unix.read fd buffer start length with
+ 0 -> raise End_of_file
+ | r -> really_read fd buffer (start + r) (length - r);;
+
+let read fd =
+ let buffer = String.create 20 in
+ let _ = really_read fd buffer 0 20 in
+ let size = int_of_string buffer in
+ let buffer = String.create size in
+ let _ = really_read fd buffer 0 size in
+ buffer
+
+
+let save t str =
+ let fd = Unix.openfile str [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
+ let out_c = Unix.out_channel_of_descr fd in
+ let _ = set_binary_mode_out out_c true in
+ output_string out_c magic_string;
+ output_char out_c '\n';
+ output_string out_c version_string;
+ output_char out_c '\n';
+ Marshal.to_channel out_c t.ttable [ ];
+ (* we need to move the fd to the correct position *)
+ flush out_c;
+ ignore (Unix.lseek fd (pos_out out_c) Unix.SEEK_SET);
+ tree_save t.doc fd str;
+ close_out out_c
+;;
+
+let load ?(sample=64) ?(load_text=true) str =
+ let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
+ let in_c = Unix.in_channel_of_descr fd in
+ let _ = set_binary_mode_in in_c true in
+ let load_table () =
+ (let ms = input_line in_c in if ms <> magic_string then failwith "Invalid index file");
+ (let vs = input_line in_c in if vs <> version_string then failwith "Invalid version file");
+ let table : (Tag.t,(Ptset.Int.t*Ptset.Int.t*Ptset.Int.t*Ptset.Int.t)) Hashtbl.t =
+ Marshal.from_channel in_c
+ in
+ let ntable = Hashtbl.create (Hashtbl.length table) in
+ Hashtbl.iter (fun k (s1,s2,s3,s4) ->
+ let ss1 = Ptset.Int.fold (Ptset.Int.add) s1 Ptset.Int.empty
+ and ss2 = Ptset.Int.fold (Ptset.Int.add) s2 Ptset.Int.empty
+ and ss3 = Ptset.Int.fold (Ptset.Int.add) s3 Ptset.Int.empty
+ and ss4 = Ptset.Int.fold (Ptset.Int.add) s4 Ptset.Int.empty
+ in Hashtbl.add ntable k (ss1,ss2,ss3,ss4)
+ ) table;
+ Hashtbl.clear table;
+ (* The in_channel read a chunk of fd, so we might be after
+ the start of the XMLTree save file. Reset to the correct
+ position *)
+ ntable
+ in
+ let _ = Printf.eprintf "\nLoading tag table : " in
+ let ntable = time (load_table) () in
+ ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
+ let tree = { doc = tree_load fd str load_text sample;
+ ttable = ntable;}
+ in close_in in_c;
+ tree
+
+
+
+
+let tag_pool t = pool t.doc
+
+let compare = compare_node
+
+let equal a b = a == b
+
+let nts = function
+ -1 -> "Nil"
+ | i -> Printf.sprintf "Node (%i)" i
+
+let dump_node t = nts (inode t)
+
+let is_left t n = tree_is_first_child t.doc n
+
+
+
+let is_below_right t n1 n2 =
+ tree_is_ancestor t.doc (tree_parent t.doc n1) n2
+ && not (tree_is_ancestor t.doc n1 n2)
+
+let is_binary_ancestor t n1 n2 =
+ let p = tree_parent t.doc n1 in
+ let fin = tree_closing t.doc p in
+ n2 > n1 && n2 < fin
+(* (is_below_right t n1 n2) ||
+ (tree_is_ancestor t.doc n1 n2) *)
+
+let parent t n = tree_parent t.doc n
+
+let first_child t = let doc = t.doc in ();fun n -> tree_first_child doc n
+let first_element t = let doc = t.doc in (); fun n -> tree_first_element doc n
+let first_element t n = tree_first_element t.doc n
+(* these function will be called in two times: first partial application
+ on the tag, then application of the tag and the tree, then application of
+ the other arguments. We use the trick to let the compiler optimize application
+*)