CAMLreturn (Val_unit);
}
-extern "C" CAMLprim value caml_xml_tree_load(value fd){
- CAMLparam1(fd);
+extern "C" CAMLprim value caml_xml_tree_load(value fd, value load_tc,value sf){
+ CAMLparam3(fd,load_tc,sf);
CAMLlocal1(doc);
XMLTree * tree;
try {
- tree = XMLTree::Load(Int_val(fd));
- doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2);
- memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*));
- CAMLreturn(doc);
+ tree = XMLTree::Load(Int_val(fd),Bool_val(load_tc),Int_val(sf));
+ doc = caml_alloc_custom(&ops,sizeof(XMLTree*),1,2);
+ memcpy(Data_custom_val(doc),&tree,sizeof(XMLTree*));
+ CAMLreturn(doc);
}
catch (const xmlpp::internal_error& e){ CAMLRAISEMSG(e.what()); }
catch (const std::exception& e){ CAMLRAISEMSG(e.what()); }
}
bool docId_comp(DocID x, DocID y) { return x < y; };
+
extern "C" CAMLprim value caml_text_collection_contains(value tree,value str){
CAMLparam2(tree,str);
CAMLlocal1(resarray);
uchar * cstr = (uchar *) String_val(str);
std::vector<DocID> results;
results = XMLTREE(tree)->Contains(cstr);
- //free(cstr);
std::sort(results.begin(), results.end(), docId_comp);
size_t s = results.size();
resarray = caml_alloc_tuple(s);
CAMLreturn (resarray);
}
+extern "C" CAMLprim value caml_text_collection_equals(value tree,value str){
+ CAMLparam2(tree,str);
+ CAMLlocal1(resarray);
+ uchar * cstr = (uchar *) String_val(str);
+ std::vector<DocID> results;
+ results = XMLTREE(tree)->Equal(cstr);
+ std::sort(results.begin(), results.end(), docId_comp);
+ size_t s = results.size();
+ resarray = caml_alloc_tuple(s);
+
+ for (size_t i = 0; i < s ;i++){
+ caml_initialize(&Field(resarray,i),Val_int(results[i]));
+ };
+ CAMLreturn (resarray);
+}
+extern "C" CAMLprim value caml_text_collection_startswith(value tree,value str){
+ CAMLparam2(tree,str);
+ CAMLlocal1(resarray);
+ uchar * cstr = (uchar *) String_val(str);
+ std::vector<DocID> results;
+ results = XMLTREE(tree)->Prefix(cstr);
+ std::sort(results.begin(), results.end(), docId_comp);
+ size_t s = results.size();
+ resarray = caml_alloc_tuple(s);
+
+ for (size_t i = 0; i < s ;i++){
+ caml_initialize(&Field(resarray,i),Val_int(results[i]));
+ };
+ CAMLreturn (resarray);
+}
+extern "C" CAMLprim value caml_text_collection_endswith(value tree,value str){
+ CAMLparam2(tree,str);
+ CAMLlocal1(resarray);
+ uchar * cstr = (uchar *) String_val(str);
+ std::vector<DocID> results;
+ results = XMLTREE(tree)->Suffix(cstr);
+ std::sort(results.begin(), results.end(), docId_comp);
+ size_t s = results.size();
+ resarray = caml_alloc_tuple(s);
+
+ for (size_t i = 0; i < s ;i++){
+ caml_initialize(&Field(resarray,i),Val_int(results[i]));
+ };
+ CAMLreturn (resarray);
+}
+
+
+
extern "C" CAMLprim value caml_text_collection_unsorted_contains(value tree,value str){
CAMLparam2(tree,str);
+ CAMLlocal1(resarray);
uchar * cstr = (uchar *) String_val(str);
std::vector<DocID> results;
results = XMLTREE(tree)->Contains(cstr);
- CAMLreturn (Val_unit);
+ resarray = caml_alloc_tuple(results.size());
+ for (size_t i = 0; i < results.size() ;i++){
+ caml_initialize(&Field(resarray,i),Val_int(results[i]));
+ };
+ CAMLreturn (resarray);
}
CAMLreturn (Val_int(nextResult(r, Int_val(p))));
}
+extern "C" CAMLprim value caml_result_set_count(value result){
+ CAMLparam0();
+ results r;
+ r = *( (results *) result);
+ CAMLreturn (Val_int(countResult(r)));
+}
+
extern "C" CAMLprim value caml_xml_tree_print(value tree,value node,value fd){
CAMLparam3(tree,node,fd);
XMLTREE(tree)->Print(Int_val(fd),TREENODEVAL(node));
external create_empty : int -> bits = "caml_result_set_create"
external set : bits -> int -> unit = "caml_result_set_set"
external next : bits -> int -> int = "caml_result_set_next"
+ external count : bits -> int = "caml_result_set_count"
external clear : bits -> elt -> elt -> unit = "caml_result_set_clear"
+
external set_tag_bits : bits -> Tag.t -> Tree.t -> elt -> elt = "caml_set_tag_bits"
type t =
{ segments : elt list;
in loop (next t.bits 0) acc
let map _ _ = failwith "noop"
- let length t = let cpt = ref 0 in
- iter (fun _ -> incr cpt) t; !cpt
+ (*let length t = let cpt = ref 0 in
+ iter (fun _ -> incr cpt) t; !cpt *)
+ let length t = count t.bits
let clear_bits t =
let rec loop l = match l with
let cont =
match f_kind,n_kind with
| `NIL,`NIL ->
- Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__);
(fun t _ -> eval_fold2_slist fl_list t (Tree.tag tree t) empty_res empty_res)
| _,`NIL -> (
match f_kind with
if (Algebra.is_rec a s fst) && (Algebra.is_rec a s snd)
&& (Algebra.is_final_marking a s)
then
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
RS.mk_quick_tag_loop default llist 1 tree tag'
- else
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
- default
- else
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
- default
+ else default
+ else default
| _ ->
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
(fun t _ -> eval_fold2_slist fl_list t (Tree.tag tree t) empty_res
(loop (first t) llist t ))
)
if t == Tree.nil then empty_res else
let res2 = loop (next t ctx) ctx in
eval_fold2_slist fl_list t tag res2 empty_res
- in Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__);loop
+ in loop
else
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
(fun t ctx -> eval_fold2_slist fl_list t (Tree.tag tree t)
(loop_tag tag' (next t ctx) rlist ctx ) empty_res)
| _ ->
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
(fun t ctx -> eval_fold2_slist fl_list t (Tree.tag tree t)
(loop (next t ctx) rlist ctx ) empty_res)
)
| `TAG(tag1),`TAG(tag2) ->
- let _ = Printf.eprintf "Using %i %s %s\n" (Loc.start_line __LOCATION__)
- (Tag.to_string tag1)
- (Tag.to_string tag2)
- in
(fun t ctx ->
eval_fold2_slist fl_list t (Tree.tag tree t)
(loop_tag tag2 (next t ctx) rlist ctx )
(loop_tag tag1 (first t) llist t ))
| `TAG(tag'),`ANY ->
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
(fun t ctx ->
eval_fold2_slist fl_list t (Tree.tag tree t)
(loop (next t ctx) rlist ctx )
(loop_tag tag' (first t) llist t ))
| `ANY,`TAG(tag') ->
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
(fun t ctx ->
eval_fold2_slist fl_list t (Tree.tag tree t)
(loop_tag tag' (next t ctx) rlist ctx )
(loop (first t) llist t ))
| `ANY,`ANY ->
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
if SList.equal slist rlist && SList.equal slist llist
then
let rec loop t ctx =
and r2 = loop (next t ctx) ctx
in
eval_fold2_slist fl_list t (Tree.tag tree t) r2 r1
- in
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
- loop
+ in loop
else
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
(fun t ctx ->
eval_fold2_slist fl_list t (Tree.tag tree t)
(loop (next t ctx) rlist ctx )
(loop (first t) llist t ))
| _,_ ->
- let _ = Printf.eprintf "Using %i\n" (Loc.start_line __LOCATION__) in
(fun t ctx ->
eval_fold2_slist fl_list t (Tree.tag tree t)
(loop (next t ctx) rlist ctx )
let top_down_count a t = let module RI = Run(Integer) in Integer.length (RI.run_top_down a t)
let top_down a t = let module RI = Run(IdSet) in (RI.run_top_down a t)
let bottom_up_count a t k = let module RI = Run(Integer) in Integer.length (RI.run_bottom_up a t k)
-
+ let bottom_up a t k = let module RI = Run(IdSet) in (RI.run_bottom_up a t k)
module Test (Doc : sig val doc : Tree.t end) =
struct
val top_down : 'a t -> Tree.t -> IdSet.t
val bottom_up_count :
'a t -> Tree.t -> [> `CONTAINS of 'b | `TAG of Tag.t ] -> int
+val bottom_up :
+ 'a t -> Tree.t -> [> `CONTAINS of 'b | `TAG of Tag.t ] -> IdSet.t
module Test (Doc : sig val doc : Tree.t end ) :
sig
let jump_to =
match contains with
None -> (max_int,`NOTHING)
- | Some s ->
+ | Some ((op,s)) ->
let r = Tree.count v s
in
Printf.eprintf "%i documents in the TextCollection\n" (Tree.text_size v);
Printf.eprintf "Global count is %i, using " r;
if r < !Options.tc_threshold then begin
Printf.eprintf "TextCollection contains\nCalling global contains : ";
- time (Tree.init_contains v) s;
+ time (Tree.init_textfun op v) s;
end
else begin
Printf.eprintf "Naive contains\nCalling global contains : ";
let _ = Printf.eprintf "%!" in
(* let _ = Gc.set (disabled_gc) in *)
if !Options.backward && ((snd test_list) != `NOTHING )then
-
- let r = time (bottom_up_count auto v )(snd test_list) in
+ if !Options.count_only then
+ let r = time_mem (bottom_up_count auto v )(snd test_list) in
let _ = Printf.eprintf "Number of nodes in the result set : %i\n%!" r
in ()
+ else begin
+ let r = time_mem (bottom_up auto v )(snd test_list) in
+ let _ = Printf.eprintf "Number of nodes in the result set : %i\n%!" (IdSet.length r)
+ in
+ match output with
+
+ | None -> ()
+ | Some f ->
+ Printf.eprintf "Serializing results : ";
+ time( fun () ->
+ (*let oc = open_out f in *)
+ let oc = Unix.openfile f [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
+ (*output_string oc "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";*)
+ IdSet.iter (fun t ->
+ Tree.print_xml_fast3 v t oc;
+ (*output_char oc '\n'; *)
+ ) r) ();
+ end
+
else
let _ =
if !Options.backward then Printf.eprintf "WARNING: couldn't find a jumping point, running top-down\n"
in
if !Options.count_only then
- let r = time ( top_down_count auto ) v in
+ let r = time_mem ( top_down_count auto ) v in
let _ = Printf.eprintf "Number of nodes in the result set : %i\n%!" r
in ()
else
let module GR = Ata.Test(struct let doc = v end) in
- let result = time (GR.top_down auto) v in
+ let result = time_mem (GR.top_down auto) v in
let _ = Printf.eprintf "Counting results " in
let rcount = time (GR.Results.length) result in
Printf.eprintf "Number of nodes in the result set : %i\n" rcount;
(*let oc = open_out f in *)
let oc = Unix.openfile f [ Unix.O_WRONLY;Unix.O_TRUNC;Unix.O_CREAT] 0o644 in
(*output_string oc "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";*)
+ let t1 = ref (Unix.gettimeofday()) in
+ let count = ref 1 in
+ let old_count = ref 1 in
GR.Results.iter (fun t ->
- Tree.print_xml_fast3 v t oc;
+ incr count;
+ begin
+ if (!count mod 15) == 0
+ then
+ let t2 = Unix.gettimeofday() in
+ let _ = Printf.eprintf "Printing %i elements in %f ms\n"
+ (!count - !old_count) (1000. *.(t2 -. !t1))
+ in
+ ( old_count := !count; t1 := Unix.gettimeofday())
+ end;
+ Tree.print_xml_fast3 v t oc;
(*output_char oc '\n'; *)
- ) result) ();
+ ) result) ();
end;
end;
let _ = Gc.set enabled_gc in
then
begin
Printf.eprintf "Loading from file : ";
- time (Tree.load ~sample:!Options.sample_factor )
+ time (Tree.load ~sample:!Options.sample_factor ~load_text:(not !Options.count_only))
!Options.input_file;
end
else
{ fprintf(stderr,"Error, redefine logW as %i and recompile\n",lg(W)-1);\r
exit(1);\r
}\r
+\r
R.n = 2*n-1;\r
R.lgn = lg(n);\r
+ fprintf(stderr,"Size of the result set : %i elements, %li kB\n", n,\r
+ (((R.n+W-1)/W)*sizeof(int)/1024));\r
R.tree = (int*) malloc (((R.n+W-1)/W)*sizeof(int));\r
clearBit(R.tree,0); // clear all\r
return R;\r
return unconv(answ,R.n,R.lgn);\r
}\r
\r
+// Naively implemented by kim\r
+\r
+unsigned int countResult(results R) {\r
+ unsigned int result = 0;\r
+ int i = 0;\r
+ while ( i != -1 && i < R.n) {\r
+ result ++; \r
+ i = unconv(nextLarger(R.tree,R.n,conv(i+1,R.n,R.lgn),0,R.lgn),R.n,R.lgn);\r
+ };\r
+ return result;\r
+ \r
+}\r
+\r
+\r
static void prnspace (int k)\r
\r
{ while (k--) putchar(' ');\r
// returns 0/1 telling whether result p is not/is present in R
int readResult (results R, int p);
+unsigned int countResult (results R);
+
// inserts result p into R
void setResult (results R, int p);
QUERY[0]="/descendant::MedlineCitation/descendant::*/contains('brain')"
QUERY[1]="/descendant::MedlineCitation/descendant::Country/contains('AUSTRALIA')"
QUERY[2]="/descendant::Country/contains('AUSTRALIA')"
-QUERY[3]="/descendant::*/contains('AUSTRALIA')"
-QUERY[4]="/descendant::*/contains('?')"
-QUERY[5]="/descendant::MedlineCitation/descendant::*/contains('?')"
+QUERY[3]="/descendant::*/contains('1930')"
+QUERY[4]="/descendant::MedlineCitation/descendant::*/contains('1930')"
+QUERY[5]="/descendant::MedlineCitation/Article/AuthorList/Author/LastName/startswith('Bar')"
+QUERY[6]="/descendant::MedlineCitation[MedlineJournalInfo/Country/endswith('LAND')]"
-for ((i=0;i<=5;i++))
+for ((i=0;i<=6;i++))
do
echo Running query "$i" : "${QUERY[$i]}"
- ../main -b medline.srx "${QUERY[$i]}" > q_"$i".time 2>&1 &
+ ../main -f 0 -b medline_05.srx "${QUERY[$i]}"
- while pidof main >/dev/null 2>&1
- do
- cat /proc/`pidof main`/status | grep "VmRSS" >> q_"$i".mem
- sleep 4
- done
-done
\ No newline at end of file
+done
external parse_xml_string : string -> int -> bool -> bool -> tree = "caml_call_shredder_string"
external tree_print_xml_fast3 : tree -> [`Tree ] node -> Unix.file_descr ->unit = "caml_xml_tree_print"
external tree_save : tree -> Unix.file_descr -> unit = "caml_xml_tree_save"
-external tree_load : Unix.file_descr -> tree = "caml_xml_tree_load"
+external tree_load : Unix.file_descr -> bool -> int -> tree = "caml_xml_tree_load"
external nullt : unit -> 'a node = "caml_xml_tree_nullt"
external text_count_contains : tree -> string -> int = "caml_text_collection_count_contains"
external text_count : tree -> string -> int = "caml_text_collection_count"
external text_contains : tree -> string -> [`Text ] node array = "caml_text_collection_contains"
-external text_unsorted_contains : tree -> string -> unit = "caml_text_collection_unsorted_contains"
+external text_startswith : tree -> string -> [`Text ] node array = "caml_text_collection_startswith"
+external text_endswith : tree -> string -> [`Text ] node array = "caml_text_collection_endswith"
+external text_equals : tree -> string -> [`Text ] node array = "caml_text_collection_equals"
+external text_unsorted_contains : tree -> string -> [`Text ] node array = "caml_text_collection_unsorted_contains"
external text_get_cached_text : tree -> [`Text] node -> string = "caml_text_collection_get_cached_text"
external tree_root : tree -> [`Tree] node = "caml_xml_tree_root"
with
Not_found -> false
-let init_contains t s =
- let a = text_contains t.doc s
+let init_textfun f t s =
+ let a = match f with
+ | `CONTAINS -> text_contains t.doc s
+ | `STARTSWITH -> text_startswith t.doc s
+ | `ENDSWITH -> text_endswith t.doc s
+ | `EQUALS -> text_equals t.doc s
in
- Array.fast_sort (compare) a;
+ (*Array.fast_sort (compare) a; *)
contains_array := a;
Array.iter (fun x -> Hashtbl.add contains_index x true) !contains_array
close_out out_c
;;
-let load ?(sample=64) str =
+let load ?(sample=64) ?(load_text=true) str =
let fd = Unix.openfile str [ Unix.O_RDONLY ] 0o644 in
let in_c = Unix.in_channel_of_descr fd in
let _ = set_binary_mode_in in_c true in
let _ = Printf.eprintf "\nLoading tag table : " in
let ntable = time (load_table) () in
ignore(Unix.lseek fd (pos_in in_c) Unix.SEEK_SET);
- let tree = { doc = tree_load fd;
+ let tree = { doc = tree_load fd load_text sample;
ttable = ntable;}
in close_in in_c;
tree
type t
-val init_contains : t -> string -> unit
+val init_textfun : [ `CONTAINS | `STARTSWITH | `ENDSWITH | `EQUALS ] -> t -> string -> unit
val init_naive_contains : t -> string -> unit
val parse_xml_uri : string -> t
val parse_xml_string : string -> t
val save : t -> string -> unit
-val load : ?sample:int -> string -> t
+val load : ?sample:int -> ?load_text:bool -> string -> t
val tag_pool : t -> Tag.pool
val binary_parent : t -> [`Tree] node -> [`Tree] node
val count_contains : t -> string -> int
-val unsorted_contains : t -> string -> unit
+(* val unsorted_contains : t -> string -> unit *)
val text_size : t -> int
val doc_ids : t -> [`Tree] node -> [`Text] node * [`Text] node
val subtree_tags : t -> Tag.t -> [`Tree] node -> int
let rec token = lexer
| [' ' '\t'] -> token lexbuf
| "text()" | "node()" | "and" | "not" | "or"
- | "contains" | "contains_full"
+ | "contains" | "contains_full" | "endswith" | "startswith" | "equals"
| "self" | "descendant" | "child" | "descendant-or-self"
| "attribute" | "following-sibling" | "preceding-sibling"
| "parent" | "ancestor" | "ancestor-or-self" | "preceding" | "following"
l:= t::!l;
Printf.eprintf " %fms\n%!" t ;
Printf.eprintf "Mem use before: %s\n%!" s1;
- Printf.eprintf "Mem use after: %s\n\n\n%!" s2;
+ Printf.eprintf "Final Mem: %s\n\n\n%!" s2;
r
;;
let time f x =
| [ "." ; p = top_pred -> [(Self,TagSet.node,p)] ]
| [ ".." ; p = top_pred -> [(Parent,TagSet.star,p)] ]
| [ "contains"; "(" ; s = STRING ; ")";p=top_pred -> [
- let _ = contains := Some(s) in (Child,TagSet.singleton Tag.pcdata, p)]
+ let _ = contains := Some((`CONTAINS,s)) in (Child,TagSet.singleton Tag.pcdata, p)]
]
-| [ "contains_full"; "(" ; s = STRING ; ")";p=top_pred -> [
- let _ = contains := Some(s) in (Descendant,TagSet.singleton Tag.pcdata, p)]
+| [ "equals"; "(" ; s = STRING ; ")";p=top_pred -> [
+ let _ = contains := Some((`EQUALS,s)) in (Child,TagSet.singleton Tag.pcdata, p)]
+ ]
+| [ "startswith"; "(" ; s = STRING ; ")";p=top_pred -> [
+ let _ = contains := Some((`STARTSWITH,s)) in (Child,TagSet.singleton Tag.pcdata, p)]
+ ]
+| [ "endswith"; "(" ; s = STRING ; ")";p=top_pred -> [
+ let _ = contains := Some((`ENDSWITH,s)) in (Child,TagSet.singleton Tag.pcdata, p)]
]
| [ test = test; p = top_pred -> [(Child,test, p)] ]
| [ att = ATT ; p = top_pred ->
end
module Compile :
sig
-val compile : ?querystring:string -> Ast.path -> 'a Ata.t * (Tag.t*Ata.StateSet.t) list * string option
+val compile : ?querystring:string -> Ast.path -> 'a Ata.t * (Tag.t*Ata.StateSet.t) list * ([ `CONTAINS | `STARTSWITH | `ENDSWITH | `EQUALS ]*string) option
end