From: Kim Nguyễn Date: Wed, 2 May 2012 12:32:34 +0000 (+0200) Subject: Various improvements: X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2Fxpathcomp.git;a=commitdiff_plain;h=e4e34e687798285e65df74561915f7721b066631 Various improvements: - Store the set of attributes in a TLIST instead of a Ptset.Int.t - Add a C trim function for string (to remove whitespaces left and right) - Rewrite the text_query wrappers. --- diff --git a/src/tree.ml b/src/tree.ml index 2b8c1aa..ae256dd 100644 --- a/src/tree.ml +++ b/src/tree.ml @@ -5,7 +5,8 @@ (* Distributed under the terms of the LGPL (see LICENCE) *) (******************************************************************************) INCLUDE "debug.ml" - INCLUDE "utils.ml" +INCLUDE "log.ml" +INCLUDE "utils.ml" external init_lib : unit -> unit = "sxsi_cpp_init" @@ -56,6 +57,7 @@ struct external open_tag : t -> string -> unit = "caml_xml_tree_builder_open_tag" external close_tag : t -> string -> unit = "caml_xml_tree_builder_close_tag" external text : t -> string -> unit = "caml_xml_tree_builder_text" + external trim : string -> string = "caml_trim" let is_whitespace s = let rec loop len i = @@ -184,11 +186,16 @@ let bit_vector_create n = let len = if n <= 0 then 0 else (n - 1) / 8 + 1 in String.make len '\000' +type tag_list + +external tag_list_alloc : int -> tag_list = "caml_tag_list_alloc" +external tag_list_set : tag_list -> int -> Tag.t -> unit = "caml_tag_list_set" "noalloc" + type t = { doc : tree; elements: Ptset.Int.t; attributes: Ptset.Int.t; - attribute_array : Tag.t array; + attribute_array : tag_list; children : Ptset.Int.t array; siblings : Ptset.Int.t array; descendants: Ptset.Int.t array; @@ -214,10 +221,6 @@ external nullt : unit -> 'a Node.t = "caml_xml_tree_nullt" let nil : [`Tree ] Node.t = Node.nil let root : [`Tree ] Node.t = Node.null -type tag_list - -external tag_list_alloc : int -> tag_list = "caml_tag_list_alloc" -external tag_list_set : tag_list -> int -> Tag.t -> unit = "caml_tag_list_set" "noalloc" module HPtset = Hashtbl.Make(Ptset.Int) @@ -339,16 +342,25 @@ let subtree_tags t n tag = tree_subtree_tags t.doc n tag external tree_subtree_size : tree -> [`Tree] Node.t -> int = "caml_xml_tree_subtree_size" "noalloc" let subtree_size t n = tree_subtree_size t.doc n +let rec iter_array_tag i a len tree node acc = + if i == len then acc + else + iter_array_tag (i+1) a len tree node + (acc - (tree_subtree_tags tree node a.(i))) + +external tree_subtree_elements : tree -> [`Tree] Node.t -> tag_list -> int = "caml_xml_tree_subtree_elements" "noalloc" + +let subtree_elements t node = + tree_subtree_elements t.doc node t.attribute_array +(* let subtree_elements t node = let size = tree_subtree_size t.doc node - 1 in - if size == 0 then 0 + if size <= 0 then 0 else let size = size - (tree_subtree_tags t.doc node Tag.pcdata) in - if size < 2 then size else - let acc = ref size in - for i = 0 to Array.length t.attribute_array - 1 do - acc := !acc - tree_subtree_tags t.doc node t.attribute_array.(i) - done; - !acc + if size < 3 then size else + let a = t.attribute_array in + iter_array_tag 0 a (Array.length a) t.doc node size +*) external tree_closing : tree -> [`Tree] Node.t -> [`Tree] Node.t = "caml_xml_tree_closing" "noalloc" let closing t n = tree_closing t.doc n @@ -557,7 +569,7 @@ let node_of_t t = in { doc= t; attributes = attributes; - attribute_array = Array.of_list (Ptset.Int.elements attributes); + attribute_array = tag_list_of_set attributes; elements = elements; children = c; siblings = n; @@ -652,7 +664,7 @@ let load ?(sample=64) ?(load_text=true) str = in let tree = { doc = xml_tree; attributes = attributes; - attribute_array = Array.of_list (Ptset.Int.elements attributes); + attribute_array = tag_list_of_set attributes; elements = elements; children = c; siblings = s; @@ -721,6 +733,7 @@ let query_fun = function let _pred_cache = Hashtbl.create 17 ;; let mk_pred query s = + LOG ( __ "bottom-up" 3 "Calling mk_pred for '%s'\n" s); let f = query_fun query in let memo = ref (fun _ _ -> failwith "Undefined") in memo := begin fun tree node -> @@ -733,9 +746,13 @@ let mk_pred query s = in let bv = results.bv in memo := begin fun _ n -> - bit_vector_unsafe_get bv (Node.to_int n) + let r = bit_vector_unsafe_get bv (Node.to_int n) in + LOG( __ "bottom-up" 3 "Running predicate on node %i = %b@\n" (Node.to_int n) r); + r end; - bit_vector_unsafe_get bv (Node.to_int node) + let r = bit_vector_unsafe_get bv (Node.to_int node) in + LOG( __ "bottom-up" 3 "Running predicate on node %i = %b@\n" (Node.to_int node) r); + r end; Predicate.make memo diff --git a/src/utils_stub.cpp b/src/utils_stub.cpp index 8ee3e0b..2ea3feb 100644 --- a/src/utils_stub.cpp +++ b/src/utils_stub.cpp @@ -1,5 +1,18 @@ #include "utils_stub.hpp" +extern "C" { +#define CAML_NAME_SPACE + +#include +#include +#include +#include +#include +#include +#include +} +#include + extern "C" value caml_clz(value i) { return Val_long( ((sizeof(unsigned long)*8) - __builtin_clzl(Long_val(i))) - 1); @@ -10,11 +23,50 @@ extern "C" value caml_leading_bit(value i) return Val_long( ( 1 << (sizeof(unsigned long)*8 - __builtin_clzl(Long_val(i)) - 1))); } + +static char * rtrim(char *str) +{ + char *ptr; + int len; + + len = strlen(str); + for (ptr = str + len - 1; ptr >= str && isspace((int)*ptr ); --ptr); + ptr[1] = '\0'; + + return str; +} + +static char * ltrim(char *str) +{ + char *ptr; + int len; + + for (ptr = str; *ptr && isspace((int)*ptr); ++ptr); + + len = strlen(ptr); + memmove(str, ptr, len + 1); + + return str; +} +extern "C" value caml_trim(value s) +{ + CAMLparam1(s); + CAMLlocal1(res); + char * ptr; + char * str = String_val(s); + ptr = rtrim(str); + str = ltrim(ptr); + res = caml_copy_string(str); + CAMLreturn(res); +} + xml_tree::tag_t*& TAGLIST(value x) { return Obj_val(x); } + + static void finalize_tag_list(value x) { xml_tree::tag_t * t = TAGLIST(x); diff --git a/src/xml-tree_stub.cpp b/src/xml-tree_stub.cpp index 47b6c63..9744694 100644 --- a/src/xml-tree_stub.cpp +++ b/src/xml-tree_stub.cpp @@ -77,9 +77,11 @@ caml_xml_tree_subtree_tags(value tree, value node, value tag) TAG(tag)))); } -NoAlloc extern "C" value caml_xml_tree_subtree_elements(value tree, value node) +NoAlloc extern "C" value caml_xml_tree_subtree_elements(value tree, value node, + value atts) { - return (Val_int(XMLTREE(tree)->subtree_elements(TREENODE(node)))); + return (Val_int(XMLTREE(tree)->subtree_elements(TREENODE(node), + TAGLIST(atts)))); } NoAlloc extern "C" value caml_xml_tree_is_leaf(value tree, value node){ @@ -305,7 +307,13 @@ extern "C" value caml_xml_tree_register_tag(value tree, value str) /** Full reporting into a bit vector */ -static std::vector sort_results(std::vector v) +static void pr_vector(std::vector v) +{ + for(auto i = v.begin(); i!= v.end(); ++i) + fprintf(stderr, "%i ", *i); + fprintf(stderr, "\n"); +} +static std::vector sort_results(std::vector v, xml_tree *t) { std::vector res; std::sort(v.begin(), v.end()); @@ -316,19 +324,19 @@ static std::vector sort_results(std::vector v) if (i == v.end()) return res; }; prev = *i; - res.push_back(prev); + res.push_back(t->parent_node(prev)); }; return res; } #define BV_QUERY(pref, Pref) \ extern "C" value caml_text_collection_## pref ##_bv(value tree, value str, value dobvv){ \ - CAMLparam3(tree, str, dobvv); \ + CAMLparam3(tree, str, dobvv); \ CAMLlocal3(res, res_bv, res_array); \ int j; \ uchar * cstr = (uchar *) strdup(String_val(str)); \ std::vector uresults = XMLTREE(tree)->Pref(cstr); \ - std::vector results = sort_results(uresults); \ + std::vector results = sort_results(uresults, XMLTREE(tree)); \ bool dobv = Bool_val(dobvv); \ res_bv = caml_alloc_string(dobv ? ((XMLTREE(tree)->size() / 4) + 2) : 0); \ unsigned long slen = caml_string_length(res_bv); \ @@ -336,12 +344,13 @@ static std::vector sort_results(std::vector v) memset(&(Byte(res_bv,0)), 0, slen); \ res_array = caml_alloc_shr(results.size(), 0); \ for (unsigned int i = 0; i < results.size(); ++i) { \ - j = XMLTREE(tree)->parent_node(results[i]); \ + j = results[i]; \ if (dobv) { \ Byte(res_bv, j >> 3) |= (1 << (j & 7)); \ }; \ caml_initialize(&Field(res_array, i), Val_int(j)); \ }; \ + fprintf(stderr, "Raw results: %i, Sorted reulsts %i\n", uresults.size(), results.size()); \ free(cstr); \ res = caml_alloc(2, 0); \ Store_field(res, 0, res_bv); \