#include <caml/fail.h>
} //extern C
+#include "TextCollection/TextCollection.h"
#include "XMLDocShredder.h"
#include "XMLTree.h"
-#include "TextCollection/TextCollection.h"
#include "Utils.h"
#define CAMLRAISECPP(e) (caml_failwith( ((e).what())))
};
}
+void traversal_rec(XMLTree* tree, treeNode id){
+ if (id == NULLT)
+ return;
+ char * tag = (char*)tree->GetTagName(tree->Tag(id));
+ if (id) {
+ DocID tid = tree->PrevText(id);
+ char * data = (char *) (tree->getTextCollection())->GetText(tid);
+ if (tree->IsLeaf(id)){
+ tid = tree->MyText(id);
+ data = (char*) (tree->getTextCollection())->GetText(tid);
+ };
+
+ if (tree->NextSibling(id) == NULLT){
+ tid = tree->NextText(id);
+ data = (char*) (tree->getTextCollection())->GetText(tid);
+ };
+ }
+ traversal_rec(tree,tree->FirstChild(id));
+ traversal_rec(tree,tree->NextSibling(id));
+ return;
+}
+
+void traversal (XMLTree* tree){
+ traversal_rec(tree, tree->Root());
+ return;
+}
+
+
+extern "C" CAMLprim value caml_cpp_traversal(value tree){
+ CAMLparam1(tree);
+ traversal(XMLTREE(tree));
+ CAMLreturn(Val_unit);
+}
+
+
extern "C" CAMLprim value caml_text_collection_get_text(value tc, value id){
CAMLparam2(tc,id);
const char* txt = (const char*) ((TextCollection*) tc)->GetText((DocID) Int_val(id));
CAMLreturn (caml_copy_string(txt));
}
+extern "C" CAMLprim value caml_text_collection_empty_text(value tc,value id){
+ CAMLparam2(tc,id);
+ CAMLreturn ( Val_int(((TextCollection*) tc)->EmptyText((DocID) Int_val(id))));
+}
extern "C" CAMLprim value caml_xml_tree_root(value tree){
CAMLparam1(tree);
}
extern "C" CAMLprim value caml_xml_tree_text_collection(value tree){
CAMLparam1(tree);
- CAMLreturn((value) XMLTREE(tree)->GetTextCollection());
+ CAMLreturn((value) XMLTREE(tree)->getTextCollection());
}
extern "C" CAMLprim value caml_xml_tree_first_child(value tree, value id){
reader_ = new TextReader(data,size,"");
setProperties();
storageIfc_ = new SXSIStorageInterface();
+ buffer = "";
}
XMLDocShredder::XMLDocShredder(const string inFileName)
reader_ = new TextReader(inFileName);
setProperties();
storageIfc_ = new SXSIStorageInterface();
-
+ buffer = "";
}
XMLDocShredder::~XMLDocShredder()
ustring name = reader_->get_name();
bool empty = false;
- if (!last_text)
- storageIfc_->newText(""); //prevText
- last_text = false;
-
+ storageIfc_->newText(buffer); //prevText
+ buffer.erase();
+
storageIfc_->newChild(name);
/* We must be really carefull here. calling process attributes moves
};
-
-
-
}
void XMLDocShredder::processEndElement()
{
// tell the storage interface that the current node has been completely processed
- if (!last_text)
- storageIfc_->newText(""); //nextText of previous node
- last_text = false;
+ storageIfc_->newText(buffer); //prevText
+ buffer.erase();
storageIfc_->nodeFinished(reader_->get_name());
}
// send the content of this PCDATA node to the storage interface as a text node
if (reader_->has_value())
- {
- storageIfc_->newText(reader_->get_value());
- last_text = true;
- }
- else
- storageIfc_->newText("");
+ {
+ buffer += reader_->get_value();
+
+ };
+
}
void XMLDocShredder::processAttributes()
{
reader_->move_to_first_attribute();
- string nspaceStr = "xmlns";
+ string nspaceStr = "xmlns";
+ storageIfc_->newText(""); //prevText
storageIfc_->newChild("<@>");
do
{
void XMLDocShredder::processSignificantWhitespace()
{
- ustring value = reader_->get_value();
- // each significant whitespace sequence constructs a text node
- storageIfc_->newText(value);
+ // each significant whitespace sequence constructs a text node
+ buffer += reader_->get_value();
}
type 'a node = int
type node_kind = [`Text | `Tree ]
- let compare : 'a node -> 'a node -> int = fun x y -> x - y
- let equal : 'a node -> 'a node -> bool = fun x y -> x == y
+ let compare : 'a node -> 'a node -> int = (-)
+ let equal : 'a node -> 'a node -> bool = (==)
(* abstract type, values are pointers to a XMLTree C++ object *)
-
+ external int_of_node : 'a node -> int = "%identity"
+
external parse_xml_uri : string -> t = "caml_call_shredder_uri"
let parse_xml_uri uri = parse_xml_uri uri
(* Todo *)
external nullt : unit -> [`Text ] node = "caml_xml_tree_nullt"
let nil = nullt ()
- external get_text1 : t -> [`Text] node -> string = "caml_text_collection_get_text"
+ external get_text : t -> [`Text] node -> string = "caml_text_collection_get_text"
- let get_text t n = Printf.printf "@@@@@@%i\n%!" (Obj.magic n);
+ let get_text t n =
if equal nil n then ""
- else get_text1 t n
-
- let is_empty t (n : [`Text] node) = (get_text t n) = ""
-
+ else get_text t n
+
+ external is_empty : t -> [`Text ] node -> bool = "caml_text_collection_empty_text"
end
external parent : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child"
external parent_doc : t -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc"
external first_child : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child"
+
+
+
external next_sibling : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling"
external is_leaf : t -> [`Tree] node -> bool = "caml_xml_tree_is_leaf"
-
+
external tag : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag"
external text_collection : t -> Text.t = "caml_xml_tree_text_collection"
let is_last t n = equal nil (next_sibling t n)
- external prev_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text"
- let prev_text t id = Printf.eprintf "Calling PrevText for node %i with result" (Obj.magic id);
- let did = if is_nil id then Text.nil else prev_text t id
- in Printf.eprintf " %i!!!\n%!" (Obj.magic did); did
-
+ external prev_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text"
external my_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text"
external next_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text"
- let next_text t id = Printf.eprintf "Calling NextText for node %i with result" (Obj.magic id);
- let did = if is_nil id then Text.nil else next_text t id
- in Printf.eprintf " %i!!!\n%!" (Obj.magic did); did
-
external text_xml_id : t -> [`Text ] node -> int = "caml_xml_tree_text_xml_id"
external node_xml_id : t -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id"
let print_skel t =
+ let textcol = text_collection t in
let rec aux id =
if (is_nil id)
then Printf.eprintf "#"
else
begin
- Printf.eprintf "%s(" (Tag.to_string (tag t id));
+ Printf.eprintf "Node %i has tag '%s', DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s)\n%!"
+ (int_of_node id)
+ (Tag.to_string (tag t id))
+ (int_of_node (prev_text t id))
+ (Text.get_text textcol (prev_text t id))
+ (int_of_node (my_text t id))
+ (Text.get_text textcol (my_text t id))
+ (int_of_node (next_text t id))
+ (Text.get_text textcol (next_text t id));
aux(first_child t id);
- Printf.eprintf ",\n";
aux(next_sibling t id);
- Printf.eprintf ")\n";
end
in
aux (root t)
+
+ let traversal t =
+ let textcol = text_collection t in
+ let rec aux id =
+ if not (is_nil id)
+ then
+ begin
+ ignore (tag t id);
+ ignore (Text.get_text textcol (prev_text t id));
+ if (is_leaf t id)
+ then ignore (Text.get_text textcol (my_text t id));
+ if (is_last t id)
+ then ignore (Text.get_text textcol (next_text t id));
+ aux (first_child t id);
+ aux (next_sibling t id);
+ end
+ in
+ aux (root t)
end
module Binary = struct
type node_content =
- [ `Node of [`Tree ] node
- | `String of [`Text ] node * [`Tree ] node ]
+ NC of [`Tree ] node
+ | SC of [`Text ] node * [`Tree ] node
type string_content = [ `Text ] node
type descr =
| Nil
type doc = t
type t = { doc : doc;
+ text : Text.t;
node : descr }
let dump { doc=t } = Tree.print_skel t
open Tree
- let node_of_t t = { doc= t; node= Node(`Node (root t)) }
+ let node_of_t t = { doc= t;
+ text = text_collection t;
+ node = Node(NC (root t)) }
let parse_xml_uri str = node_of_t (parse_xml_uri str)
let parse_xml_string str = node_of_t (parse_xml_string str)
let compare a b = match a.node,b.node with
- | Node(`Node i),Node(`Node j) -> compare i j
- | _, Node(`Node( _ )) -> 1
- | Node(`String (i,_)),Node(`String (j,_)) -> compare i j
- | Node(`Node( _ )),Node(`String (_,_)) -> -1
- | _, Node(`String (_,_)) -> 1
+ | Node(NC i),Node(NC j) -> compare i j
+ | _, Node(NC( _ )) -> 1
+ | Node(SC (i,_)),Node(SC (j,_)) -> compare i j
+ | Node(NC( _ )),Node(SC (_,_)) -> -1
+ | _, Node(SC (_,_)) -> 1
| String i, String j -> compare i j
| Node _ , String _ -> -1
| _ , String _ -> 1
let equal a b = (compare a b) == 0
let string t = match t.node with
- | String i -> Text.get_text (text_collection t.doc) i
+ | String i -> Text.get_text t.text i
| _ -> assert false
- let norm (n : [`Tree ] node ) = if is_nil n then Nil else Node (`Node n)
+ let norm (n : [`Tree ] node ) = if is_nil n then Nil else Node (NC n)
let descr t = t.node
+ let nts = function
+ Nil -> "Nil"
+ | String i -> Printf.sprintf "String %i" i
+ | Node (NC t) -> Printf.sprintf "Node (NC %i)" (int_of_node t)
+ | Node (SC (t,i)) -> Printf.sprintf "Node (SC (%i,%i))" (int_of_node t) (int_of_node i)
+
let first_child n =
- Printf.eprintf "first_child!\n%!";
let node' =
match n.node with
- | Nil | String _ -> failwith "first_child"
- | Node (`Node t) ->
+ | Node (NC t) when is_leaf n.doc t ->
+ let txt = my_text n.doc t in
+ if Text.is_empty n.text txt
+ then Nil
+ else Node(SC (txt,Tree.nil))
+ | Node (NC t) ->
let fs = first_child n.doc t in
- let txt = prev_text n.doc t in
- if Text.is_empty (text_collection n.doc) txt
+ let txt = prev_text n.doc fs in
+ if Text.is_empty n.text txt
then norm fs
- else Node (`String (txt, fs))
-
- | Node(`String (i,_)) -> String i
+ else Node (SC (txt, fs))
+ | Node(SC (i,_)) -> String i
+ | Nil | String _ -> failwith "first_child"
in
{ n with node = node'}
+
let next_sibling n =
- Printf.eprintf "next_sibling!\n%!";
let node' =
match n.node with
- | Nil | String _ -> failwith "next_sibling"
- | Node (`String (_,ns)) -> norm ns
- | Node(`Node t) ->
+ | Node (SC (_,ns)) -> norm ns
+ | Node(NC t) ->
let ns = next_sibling n.doc t in
let txt = next_text n.doc t in
- if Text.is_empty (text_collection n.doc) txt
+ if Text.is_empty n.text txt
then norm ns
- else Node (`String (txt, ns))
+ else Node (SC (txt, ns))
+ | Nil | String _ -> failwith "next_sibling"
in
{ n with node = node'}
- let left = first_child
+ let left = first_child
let right = next_sibling
+
let id =
- function { doc=d; node=Node(`Node n)} -> text_xml_id d n
- | { doc=d; node=Node(`String (i,_) )} -> node_xml_id d i
+ function { doc=d; node=Node(NC n)} -> text_xml_id d n
+ | { doc=d; node=Node(SC (i,_) )} -> node_xml_id d i
| _ -> failwith "id"
let tag =
- function { node=Node(`String _) } -> Tag.pcdata
- | { doc=d; node=Node(`Node n)} -> tag d n
+ function { node=Node(SC _) } -> Tag.pcdata
+ | { doc=d; node=Node(NC n)} -> tag d n
| _ -> failwith "Tag"
-
+
let print_xml_fast outc t =
output_string outc tg;
output_char outc '>'
);if print_right then loop r
- and loop_attributes a = match a.node with
- | Node(_) -> let value = string (left(left a)) in
- output_char outc ' ';
- output_string outc (Tag.to_string (tag a));
- output_string outc "=\"";
- output_string outc value;
- output_char outc '"';
- loop_attributes (right a)
+ and loop_attributes a =
+
+ match a.node with
+ | Node(_) ->
+ let value =
+ match (left a).node with
+ | Nil -> ""
+ | _ -> string (left(left a))
+ in
+ output_char outc ' ';
+ output_string outc (Tag.to_string (tag a));
+ output_string outc "=\"";
+ output_string outc value;
+ output_char outc '"';
+ loop_attributes (right a)
| _ -> ()
in
loop ~print_right:false t
-
+ let print_xml_fast outc t =
+ if Tag.to_string (tag t) = "" then
+ print_xml_fast outc (first_child t)
+ else print_xml_fast outc t
+
+ let traversal t = Tree.traversal t.doc
+ let full_traversal t =
+ let rec aux n =
+ match n.node with
+ | Nil -> ()
+ | String i -> ignore(Text.get_text t.text i)
+ | Node(_) ->
+ ignore (tag n);
+ aux (first_child n);
+ aux (next_sibling n)
+ in aux t
end
end
let dump = XML.Binary.dump
+let traversal = XML.Binary.traversal
+let full_traversal = XML.Binary.full_traversal
+external cpp_traversal : XML.t -> unit = "caml_cpp_traversal"
+let cpp_traversal t = cpp_traversal t.XML.Binary.doc
+
include XML