From 496df5f7d3e6f8271763314f2067719cc2904c71 Mon Sep 17 00:00:00 2001 From: kim Date: Fri, 12 Dec 2008 11:16:51 +0000 Subject: [PATCH] . git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/xpathcomp@25 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- OCamlDriver.cpp | 43 ++++++++- SXSIStorageInterface.cpp | 17 ++-- XMLDocShredder.cpp | 37 ++++---- XMLDocShredder.h | 3 + main.ml | 16 ++-- tests/test.xml | 8 +- tree.ml | 182 ++++++++++++++++++++++++++------------- tree.mli | 3 + 8 files changed, 207 insertions(+), 102 deletions(-) diff --git a/OCamlDriver.cpp b/OCamlDriver.cpp index 1898d5a..490e007 100644 --- a/OCamlDriver.cpp +++ b/OCamlDriver.cpp @@ -17,9 +17,9 @@ extern "C" { #include } //extern C +#include "TextCollection/TextCollection.h" #include "XMLDocShredder.h" #include "XMLTree.h" -#include "TextCollection/TextCollection.h" #include "Utils.h" #define CAMLRAISECPP(e) (caml_failwith( ((e).what()))) @@ -67,12 +67,51 @@ extern "C" CAMLprim value caml_call_shredder_string(value data){ }; } +void traversal_rec(XMLTree* tree, treeNode id){ + if (id == NULLT) + return; + char * tag = (char*)tree->GetTagName(tree->Tag(id)); + if (id) { + DocID tid = tree->PrevText(id); + char * data = (char *) (tree->getTextCollection())->GetText(tid); + if (tree->IsLeaf(id)){ + tid = tree->MyText(id); + data = (char*) (tree->getTextCollection())->GetText(tid); + }; + + if (tree->NextSibling(id) == NULLT){ + tid = tree->NextText(id); + data = (char*) (tree->getTextCollection())->GetText(tid); + }; + } + traversal_rec(tree,tree->FirstChild(id)); + traversal_rec(tree,tree->NextSibling(id)); + return; +} + +void traversal (XMLTree* tree){ + traversal_rec(tree, tree->Root()); + return; +} + + +extern "C" CAMLprim value caml_cpp_traversal(value tree){ + CAMLparam1(tree); + traversal(XMLTREE(tree)); + CAMLreturn(Val_unit); +} + + extern "C" CAMLprim value caml_text_collection_get_text(value tc, value id){ CAMLparam2(tc,id); const char* txt = (const char*) ((TextCollection*) tc)->GetText((DocID) Int_val(id)); CAMLreturn (caml_copy_string(txt)); } +extern "C" CAMLprim value caml_text_collection_empty_text(value tc,value id){ + CAMLparam2(tc,id); + CAMLreturn ( Val_int(((TextCollection*) tc)->EmptyText((DocID) Int_val(id)))); +} extern "C" CAMLprim value caml_xml_tree_root(value tree){ CAMLparam1(tree); @@ -80,7 +119,7 @@ extern "C" CAMLprim value caml_xml_tree_root(value tree){ } extern "C" CAMLprim value caml_xml_tree_text_collection(value tree){ CAMLparam1(tree); - CAMLreturn((value) XMLTREE(tree)->GetTextCollection()); + CAMLreturn((value) XMLTREE(tree)->getTextCollection()); } extern "C" CAMLprim value caml_xml_tree_first_child(value tree, value id){ diff --git a/SXSIStorageInterface.cpp b/SXSIStorageInterface.cpp index 14c1d70..5df826b 100644 --- a/SXSIStorageInterface.cpp +++ b/SXSIStorageInterface.cpp @@ -15,7 +15,7 @@ SXSIStorageInterface::SXSIStorageInterface() { tree = new XMLTree(); - tree->OpenDocument(true,1); + tree->OpenDocument(true,64); } SXSIStorageInterface::~SXSIStorageInterface() @@ -24,30 +24,23 @@ SXSIStorageInterface::~SXSIStorageInterface() void SXSIStorageInterface::newChild(string name) { - cerr << "New child " << name << "\n"; tree->NewOpenTag((unsigned char*) name.c_str()); } void SXSIStorageInterface::newText(string text) { - if (text.empty()){ - cerr << "Calling newEmptyText()\n"; + if (text.empty()) tree->NewEmptyText(); - } - else { - cerr << "Calling newText(" << text <<")\n"; - tree->NewText((unsigned char*) text.c_str()); - - }; + else + tree->NewText((unsigned char*) text.c_str()); } void SXSIStorageInterface::nodeFinished(string name) { - cerr << "Node Finished child " << name << "\n"; - tree->NewClosingTag((unsigned char*) name.c_str()); + tree->NewClosingTag((unsigned char*) name.c_str()); } void SXSIStorageInterface::parsingFinished() diff --git a/XMLDocShredder.cpp b/XMLDocShredder.cpp index 3daaf45..1c25d61 100644 --- a/XMLDocShredder.cpp +++ b/XMLDocShredder.cpp @@ -59,6 +59,7 @@ XMLDocShredder::XMLDocShredder(const unsigned char * data, reader_ = new TextReader(data,size,""); setProperties(); storageIfc_ = new SXSIStorageInterface(); + buffer = ""; } XMLDocShredder::XMLDocShredder(const string inFileName) @@ -67,7 +68,7 @@ XMLDocShredder::XMLDocShredder(const string inFileName) reader_ = new TextReader(inFileName); setProperties(); storageIfc_ = new SXSIStorageInterface(); - + buffer = ""; } XMLDocShredder::~XMLDocShredder() @@ -84,10 +85,9 @@ void XMLDocShredder::processStartElement() ustring name = reader_->get_name(); bool empty = false; - if (!last_text) - storageIfc_->newText(""); //prevText - last_text = false; - + storageIfc_->newText(buffer); //prevText + buffer.erase(); + storageIfc_->newChild(name); /* We must be really carefull here. calling process attributes moves @@ -110,17 +110,13 @@ void XMLDocShredder::processStartElement() }; - - - } void XMLDocShredder::processEndElement() { // tell the storage interface that the current node has been completely processed - if (!last_text) - storageIfc_->newText(""); //nextText of previous node - last_text = false; + storageIfc_->newText(buffer); //prevText + buffer.erase(); storageIfc_->nodeFinished(reader_->get_name()); } @@ -129,19 +125,19 @@ void XMLDocShredder::processPCDATA() // send the content of this PCDATA node to the storage interface as a text node if (reader_->has_value()) - { - storageIfc_->newText(reader_->get_value()); - last_text = true; - } - else - storageIfc_->newText(""); + { + buffer += reader_->get_value(); + + }; + } void XMLDocShredder::processAttributes() { reader_->move_to_first_attribute(); - string nspaceStr = "xmlns"; + string nspaceStr = "xmlns"; + storageIfc_->newText(""); //prevText storageIfc_->newChild("<@>"); do { @@ -178,9 +174,8 @@ void XMLDocShredder::processAttributes() void XMLDocShredder::processSignificantWhitespace() { - ustring value = reader_->get_value(); - // each significant whitespace sequence constructs a text node - storageIfc_->newText(value); + // each significant whitespace sequence constructs a text node + buffer += reader_->get_value(); } diff --git a/XMLDocShredder.h b/XMLDocShredder.h index 66a96b5..00f50d6 100644 --- a/XMLDocShredder.h +++ b/XMLDocShredder.h @@ -53,6 +53,9 @@ private: TextReader *reader_; void setProperties(); bool last_text; + string buffer; + // used to coalece successive text events + // which can occur if we discard pi and comment nodes. }; #endif /*XMLDOCSHREDDER_H_*/ diff --git a/main.ml b/main.ml index 88363e0..e7bdd35 100644 --- a/main.ml +++ b/main.ml @@ -24,8 +24,15 @@ let total_time () = List.fold_left (+.) 0. !l;; let test filename query output = Printf.eprintf "Parsing document : %!"; let v = time Tree.Binary.parse_xml_uri filename in - Tree.dump v; - Tree.Binary.print_xml_fast stderr v +(* Tree.dump v; + Printf.eprintf "\n\n\n\n\n%!"; + Tree.Binary.print_xml_fast stderr v *) + Printf.eprintf "Full Traversal\n%!"; + time (Tree.full_traversal) v; + Printf.eprintf "Traversal\n%!"; + time (Tree.traversal) v; + Printf.eprintf "CPP Traversal\n%!"; + time (Tree.cpp_traversal) v let main filename query output = @@ -34,7 +41,6 @@ let main filename query output = MM(v,__LOCATION__); a := Some (v); a := None; - Tree.Binary.print_xml_fast stderr v; Printf.eprintf "Parsing query : "; let query = try time @@ -49,9 +55,9 @@ let main filename query output = Printf.eprintf "TopDown (No BackTrack) : \n"; time (fun v -> ignore (TopDown.accept auto v)) v; - P(Printf.eprintf "Max states in TopDownNOBT : %i\n%!" auto.max_states); + Printf.eprintf "Number of nodes in the result set : %i\n" (BST.cardinal auto.result); begin - match output with + match output with | None -> () | Some f -> diff --git a/tests/test.xml b/tests/test.xml index 87340f4..1358477 100644 --- a/tests/test.xml +++ b/tests/test.xml @@ -1 +1,7 @@ -YYYYYYYYYYYYYAAAAAAAAAXXXXXXXXXXXX + + + + YTYYYYYYYYYY + + + diff --git a/tree.ml b/tree.ml index 219ee4b..5e2a44b 100644 --- a/tree.ml +++ b/tree.ml @@ -221,12 +221,13 @@ struct type 'a node = int type node_kind = [`Text | `Tree ] - let compare : 'a node -> 'a node -> int = fun x y -> x - y - let equal : 'a node -> 'a node -> bool = fun x y -> x == y + let compare : 'a node -> 'a node -> int = (-) + let equal : 'a node -> 'a node -> bool = (==) (* abstract type, values are pointers to a XMLTree C++ object *) - + external int_of_node : 'a node -> int = "%identity" + external parse_xml_uri : string -> t = "caml_call_shredder_uri" let parse_xml_uri uri = parse_xml_uri uri @@ -240,14 +241,13 @@ struct (* Todo *) external nullt : unit -> [`Text ] node = "caml_xml_tree_nullt" let nil = nullt () - external get_text1 : t -> [`Text] node -> string = "caml_text_collection_get_text" + external get_text : t -> [`Text] node -> string = "caml_text_collection_get_text" - let get_text t n = Printf.printf "@@@@@@%i\n%!" (Obj.magic n); + let get_text t n = if equal nil n then "" - else get_text1 t n - - let is_empty t (n : [`Text] node) = (get_text t n) = "" - + else get_text t n + + external is_empty : t -> [`Text ] node -> bool = "caml_text_collection_empty_text" end @@ -267,56 +267,76 @@ struct external parent : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" external parent_doc : t -> [`Text ] node -> [`Tree ] node = "caml_xml_tree_parent_doc" external first_child : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_first_child" + + + external next_sibling : t -> [`Tree] node -> [`Tree] node = "caml_xml_tree_next_sibling" external is_leaf : t -> [`Tree] node -> bool = "caml_xml_tree_is_leaf" - + external tag : t -> [`Tree ] node -> Tag.t = "caml_xml_tree_tag" external text_collection : t -> Text.t = "caml_xml_tree_text_collection" let is_last t n = equal nil (next_sibling t n) - external prev_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" - let prev_text t id = Printf.eprintf "Calling PrevText for node %i with result" (Obj.magic id); - let did = if is_nil id then Text.nil else prev_text t id - in Printf.eprintf " %i!!!\n%!" (Obj.magic did); did - + external prev_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_prev_text" external my_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_my_text" external next_text : t -> [`Tree] node -> [`Text ] node = "caml_xml_tree_next_text" - let next_text t id = Printf.eprintf "Calling NextText for node %i with result" (Obj.magic id); - let did = if is_nil id then Text.nil else next_text t id - in Printf.eprintf " %i!!!\n%!" (Obj.magic did); did - external text_xml_id : t -> [`Text ] node -> int = "caml_xml_tree_text_xml_id" external node_xml_id : t -> [`Tree ] node -> int = "caml_xml_tree_node_xml_id" let print_skel t = + let textcol = text_collection t in let rec aux id = if (is_nil id) then Printf.eprintf "#" else begin - Printf.eprintf "%s(" (Tag.to_string (tag t id)); + Printf.eprintf "Node %i has tag '%s', DocID of PrevText,MyText,NextText : (%i = %s,%i = %s,%i = %s)\n%!" + (int_of_node id) + (Tag.to_string (tag t id)) + (int_of_node (prev_text t id)) + (Text.get_text textcol (prev_text t id)) + (int_of_node (my_text t id)) + (Text.get_text textcol (my_text t id)) + (int_of_node (next_text t id)) + (Text.get_text textcol (next_text t id)); aux(first_child t id); - Printf.eprintf ",\n"; aux(next_sibling t id); - Printf.eprintf ")\n"; end in aux (root t) + + let traversal t = + let textcol = text_collection t in + let rec aux id = + if not (is_nil id) + then + begin + ignore (tag t id); + ignore (Text.get_text textcol (prev_text t id)); + if (is_leaf t id) + then ignore (Text.get_text textcol (my_text t id)); + if (is_last t id) + then ignore (Text.get_text textcol (next_text t id)); + aux (first_child t id); + aux (next_sibling t id); + end + in + aux (root t) end module Binary = struct type node_content = - [ `Node of [`Tree ] node - | `String of [`Text ] node * [`Tree ] node ] + NC of [`Tree ] node + | SC of [`Text ] node * [`Tree ] node type string_content = [ `Text ] node type descr = | Nil @@ -326,22 +346,25 @@ struct type doc = t type t = { doc : doc; + text : Text.t; node : descr } let dump { doc=t } = Tree.print_skel t open Tree - let node_of_t t = { doc= t; node= Node(`Node (root t)) } + let node_of_t t = { doc= t; + text = text_collection t; + node = Node(NC (root t)) } let parse_xml_uri str = node_of_t (parse_xml_uri str) let parse_xml_string str = node_of_t (parse_xml_string str) let compare a b = match a.node,b.node with - | Node(`Node i),Node(`Node j) -> compare i j - | _, Node(`Node( _ )) -> 1 - | Node(`String (i,_)),Node(`String (j,_)) -> compare i j - | Node(`Node( _ )),Node(`String (_,_)) -> -1 - | _, Node(`String (_,_)) -> 1 + | Node(NC i),Node(NC j) -> compare i j + | _, Node(NC( _ )) -> 1 + | Node(SC (i,_)),Node(SC (j,_)) -> compare i j + | Node(NC( _ )),Node(SC (_,_)) -> -1 + | _, Node(SC (_,_)) -> 1 | String i, String j -> compare i j | Node _ , String _ -> -1 | _ , String _ -> 1 @@ -351,57 +374,67 @@ struct let equal a b = (compare a b) == 0 let string t = match t.node with - | String i -> Text.get_text (text_collection t.doc) i + | String i -> Text.get_text t.text i | _ -> assert false - let norm (n : [`Tree ] node ) = if is_nil n then Nil else Node (`Node n) + let norm (n : [`Tree ] node ) = if is_nil n then Nil else Node (NC n) let descr t = t.node + let nts = function + Nil -> "Nil" + | String i -> Printf.sprintf "String %i" i + | Node (NC t) -> Printf.sprintf "Node (NC %i)" (int_of_node t) + | Node (SC (t,i)) -> Printf.sprintf "Node (SC (%i,%i))" (int_of_node t) (int_of_node i) + let first_child n = - Printf.eprintf "first_child!\n%!"; let node' = match n.node with - | Nil | String _ -> failwith "first_child" - | Node (`Node t) -> + | Node (NC t) when is_leaf n.doc t -> + let txt = my_text n.doc t in + if Text.is_empty n.text txt + then Nil + else Node(SC (txt,Tree.nil)) + | Node (NC t) -> let fs = first_child n.doc t in - let txt = prev_text n.doc t in - if Text.is_empty (text_collection n.doc) txt + let txt = prev_text n.doc fs in + if Text.is_empty n.text txt then norm fs - else Node (`String (txt, fs)) - - | Node(`String (i,_)) -> String i + else Node (SC (txt, fs)) + | Node(SC (i,_)) -> String i + | Nil | String _ -> failwith "first_child" in { n with node = node'} + let next_sibling n = - Printf.eprintf "next_sibling!\n%!"; let node' = match n.node with - | Nil | String _ -> failwith "next_sibling" - | Node (`String (_,ns)) -> norm ns - | Node(`Node t) -> + | Node (SC (_,ns)) -> norm ns + | Node(NC t) -> let ns = next_sibling n.doc t in let txt = next_text n.doc t in - if Text.is_empty (text_collection n.doc) txt + if Text.is_empty n.text txt then norm ns - else Node (`String (txt, ns)) + else Node (SC (txt, ns)) + | Nil | String _ -> failwith "next_sibling" in { n with node = node'} - let left = first_child + let left = first_child let right = next_sibling + let id = - function { doc=d; node=Node(`Node n)} -> text_xml_id d n - | { doc=d; node=Node(`String (i,_) )} -> node_xml_id d i + function { doc=d; node=Node(NC n)} -> text_xml_id d n + | { doc=d; node=Node(SC (i,_) )} -> node_xml_id d i | _ -> failwith "id" let tag = - function { node=Node(`String _) } -> Tag.pcdata - | { doc=d; node=Node(`Node n)} -> tag d n + function { node=Node(SC _) } -> Tag.pcdata + | { doc=d; node=Node(NC n)} -> tag d n | _ -> failwith "Tag" - + let print_xml_fast outc t = @@ -437,24 +470,51 @@ struct output_string outc tg; output_char outc '>' );if print_right then loop r - and loop_attributes a = match a.node with - | Node(_) -> let value = string (left(left a)) in - output_char outc ' '; - output_string outc (Tag.to_string (tag a)); - output_string outc "=\""; - output_string outc value; - output_char outc '"'; - loop_attributes (right a) + and loop_attributes a = + + match a.node with + | Node(_) -> + let value = + match (left a).node with + | Nil -> "" + | _ -> string (left(left a)) + in + output_char outc ' '; + output_string outc (Tag.to_string (tag a)); + output_string outc "=\""; + output_string outc value; + output_char outc '"'; + loop_attributes (right a) | _ -> () in loop ~print_right:false t - + let print_xml_fast outc t = + if Tag.to_string (tag t) = "" then + print_xml_fast outc (first_child t) + else print_xml_fast outc t + + let traversal t = Tree.traversal t.doc + let full_traversal t = + let rec aux n = + match n.node with + | Nil -> () + | String i -> ignore(Text.get_text t.text i) + | Node(_) -> + ignore (tag n); + aux (first_child n); + aux (next_sibling n) + in aux t end end let dump = XML.Binary.dump +let traversal = XML.Binary.traversal +let full_traversal = XML.Binary.full_traversal +external cpp_traversal : XML.t -> unit = "caml_cpp_traversal" +let cpp_traversal t = cpp_traversal t.XML.Binary.doc + include XML diff --git a/tree.mli b/tree.mli index 060e086..329d497 100644 --- a/tree.mli +++ b/tree.mli @@ -26,3 +26,6 @@ end module Binary : BINARY val dump : Binary.t -> unit +val traversal : Binary.t -> unit +val full_traversal : Binary.t -> unit +val cpp_traversal : Binary.t -> unit -- 2.17.1