From fc0cefb22e6a5e449e95a13713e873a9a6545b88 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Kim=20Nguy=E1=BB=85n?= Date: Sat, 6 Oct 2012 20:19:01 +0200 Subject: [PATCH 1/1] Fixes text_id_range to include the node itself, if it corresponds to the first text-element. --- xml-tree-inc.hpp | 6 +-- xml-tree.cpp | 116 +++++++++++++++++++++++++++++++++++++++++++---- xml-tree.hpp | 3 ++ 3 files changed, 113 insertions(+), 12 deletions(-) diff --git a/xml-tree-inc.hpp b/xml-tree-inc.hpp index 30a321e..90717de 100644 --- a/xml-tree-inc.hpp +++ b/xml-tree-inc.hpp @@ -28,12 +28,12 @@ xml_tree::subtree_tags(xml_tree::node_t x, xml_tree::tag_t label) const xml_tree::node_t y = bp_find_close(this->par, x); if (y - x < 10) { uint32_t count = 0; - for(xml_tree::node_t i = x; i < y; i++) + for (xml_tree::node i = x; i <= y; ++i) count += (tag(i) == label); return count; } else { - return tags[label]->rank(y) - tags[label]->rank(x); - }; + return tags[label]->rank(y) - tags[label]->rank(x); + } } inline uint32_t xml_tree::subtree_elements(xml_tree::node_t x, diff --git a/xml-tree.cpp b/xml-tree.cpp index 575ba8f..158ec1b 100644 --- a/xml-tree.cpp +++ b/xml-tree.cpp @@ -101,9 +101,8 @@ xml_tree::xml_tree(std::vector *tags_, (*this->tag_names)[val.second] = val.first; uint32_t max_tag = tag_names->size() - 1; - bit_vector *tmp_bitmap = new bit_vector(npar, 1, 0); - uint32_t * buff = tmp_bitmap->get_vector_ptr(); + uint32_t * buff; for(int32_t i = 0; i <= max_tag; i++){ uint32_t ones = 0; for (size_t k = 0; k < npar; k++){ @@ -111,12 +110,13 @@ xml_tree::xml_tree(std::vector *tags_, tmp_bitmap->set(k, t); ones += t; }; - + buff = tmp_bitmap->get_vector_ptr(); this->tags.push_back(new static_bitsequence_sdarray(buff, npar, ones)); } delete tmp_bitmap; delete [] buff; + //static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray(); //alphabet_mapper *am = new alphabet_mapper_none(); @@ -366,12 +366,18 @@ xml_tree::node_t xml_tree::child(xml_tree::node_t x, uint32_t i) const std::pair xml_tree::text_id_range(xml_tree::node_t x) const { int32_t i, j; - i = text_positions->rank1(x) - 1; - j = text_positions->rank1(x + 2 * bp_subtree_size(par, x) - 2); + xml_tree::node_t y = bp_find_close(par, x); + if (x == root()) + i = 0; + else + i = text_positions->rank1(x-1); + + j = text_positions->rank1(y); +// fprintf(stderr, "Rank of node %i is %i, rank of closing %i is %i\n", x, i, y, j); if (i == j) return std::make_pair(xml_tree::NIL, xml_tree::NIL); else - return std::make_pair(i + 1, j); + return std::make_pair(i, j); } int32_t xml_tree::text_id(xml_tree::node_t x) const @@ -496,9 +502,16 @@ void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) auto r = text_id_range(x); if (r.first == xml_tree::NIL) current_text = 0; - else + else { current_text = text_collection->GetText(r.first, r.second); - current_text += (current_text[0] == 1); + current_text += (current_text[0] == 1); + }; + + //fprintf(stderr, "Printing node %i, tag: %s\n", x, get_tag_name_by_ref(tag(x))); + // fprintf(stderr, "Beggining of text is (%i):%s\n", r.first, current_text); + //fflush(stderr); + + orig_text = current_text; @@ -620,7 +633,7 @@ void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) // if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) { // n++; // if (no_text) uputs("><@@>", fd); - + // while (bp_inspect(par, n)) // if (no_text) { // uputc('<', fd); @@ -668,3 +681,88 @@ void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) // uputc('\n', fd); // } + + + +static inline uchar * next_char(uchar *s, size_t &numtexts) +{ + uchar c; + s++; + c = *s; + while (c <= 1){ + if (c == 0) { + numtexts--; + if (numtexts == 0) return 0; + }; + s++; + c = *s; + }; + return s; +} + +static inline bool naive_char_contains(uchar const *s, uchar c, size_t numtexts) +{ + uchar sc; + while(numtexts != 0){ + sc = *s; + if (c == sc) return true; + if (sc == 0) numtexts--; + s++; + }; + return false; +} + +bool xml_tree::naive_contains(xml_tree::node_t n, uchar const *w) const +{ + if (w[0] == 0) + return true; + // fprintf(stderr, "Calling naive_contains on node: %i, with tag: %s\n", + // n, + // get_tag_name_by_ref(tag(n))); + // fflush(stderr); + std::pair range = text_id_range(n); + + //pattern is at least one char + if (range.first == xml_tree::NIL) + return false; + + uchar * text = this->text_collection->GetText(range.first, range.second); + size_t num_texts = + subtree_tags(n, ATTRIBUTE_DATA_OPEN_TAG_ID) + + subtree_tags(n, PCDATA_OPEN_TAG_ID); + + if (w[1] == 0) + return naive_char_contains(text, w[0], num_texts); + + //KMP is overkill + uchar * s = text; + uchar * ss; + uchar const *ww; + size_t nnum_texts; + while (true) { + // fprintf(stderr, "Current char in s: %c, num_texts is: %lu\n", *s, num_texts); + // fflush(stderr); + ss = s; + ww = w; + nnum_texts = num_texts; + while (true){ + // fprintf(stderr, " Current char in w: %c, num_texts is: %lu\n", *ww, num_texts); + // fprintf(stderr, " Current char in s: %c\n", *ss); + // fflush(stderr); + + if (*ww == 0) return true; + if (*ww != *ss) break; + ww++; + ss = next_char(ss, nnum_texts); + if (ss == 0) return (*ww == 0); + }; + // fprintf(stderr, "Not found, returning\n"); + // fflush(stderr); + // fprintf(stderr, "Current string s is: %s\n", s); + s = next_char(s, num_texts); + if (s == 0) return false; +// fprintf(stderr, "After next_char, string s is: %s\n", s); +// fflush(stderr); + }; + +} diff --git a/xml-tree.hpp b/xml-tree.hpp index 4f40dd1..3ac5813 100644 --- a/xml-tree.hpp +++ b/xml-tree.hpp @@ -108,6 +108,9 @@ public: SXSI::TextCollection::document_result contains(uchar const *s) const; SXSI::TextCollection::document_result less_than(uchar const *s) const; + + bool naive_contains(node_t, uchar const *s) const; + //I/O functions void save(int, char*); static xml_tree* load(int, char*, bool, int); -- 2.17.1