X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=xml-tree.cpp;h=158ec1b272f313bddb78cfae01739ccdcd8f6478;hb=fc0cefb22e6a5e449e95a13713e873a9a6545b88;hp=575ba8f76697adca17a6ab6f275a0c1d8d9b398e;hpb=23fb7cad5ba8bca3cf899f3c58f879b416ba05b1;p=SXSI%2FXMLTree.git diff --git a/xml-tree.cpp b/xml-tree.cpp index 575ba8f..158ec1b 100644 --- a/xml-tree.cpp +++ b/xml-tree.cpp @@ -101,9 +101,8 @@ xml_tree::xml_tree(std::vector *tags_, (*this->tag_names)[val.second] = val.first; uint32_t max_tag = tag_names->size() - 1; - bit_vector *tmp_bitmap = new bit_vector(npar, 1, 0); - uint32_t * buff = tmp_bitmap->get_vector_ptr(); + uint32_t * buff; for(int32_t i = 0; i <= max_tag; i++){ uint32_t ones = 0; for (size_t k = 0; k < npar; k++){ @@ -111,12 +110,13 @@ xml_tree::xml_tree(std::vector *tags_, tmp_bitmap->set(k, t); ones += t; }; - + buff = tmp_bitmap->get_vector_ptr(); this->tags.push_back(new static_bitsequence_sdarray(buff, npar, ones)); } delete tmp_bitmap; delete [] buff; + //static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray(); //alphabet_mapper *am = new alphabet_mapper_none(); @@ -366,12 +366,18 @@ xml_tree::node_t xml_tree::child(xml_tree::node_t x, uint32_t i) const std::pair xml_tree::text_id_range(xml_tree::node_t x) const { int32_t i, j; - i = text_positions->rank1(x) - 1; - j = text_positions->rank1(x + 2 * bp_subtree_size(par, x) - 2); + xml_tree::node_t y = bp_find_close(par, x); + if (x == root()) + i = 0; + else + i = text_positions->rank1(x-1); + + j = text_positions->rank1(y); +// fprintf(stderr, "Rank of node %i is %i, rank of closing %i is %i\n", x, i, y, j); if (i == j) return std::make_pair(xml_tree::NIL, xml_tree::NIL); else - return std::make_pair(i + 1, j); + return std::make_pair(i, j); } int32_t xml_tree::text_id(xml_tree::node_t x) const @@ -496,9 +502,16 @@ void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) auto r = text_id_range(x); if (r.first == xml_tree::NIL) current_text = 0; - else + else { current_text = text_collection->GetText(r.first, r.second); - current_text += (current_text[0] == 1); + current_text += (current_text[0] == 1); + }; + + //fprintf(stderr, "Printing node %i, tag: %s\n", x, get_tag_name_by_ref(tag(x))); + // fprintf(stderr, "Beggining of text is (%i):%s\n", r.first, current_text); + //fflush(stderr); + + orig_text = current_text; @@ -620,7 +633,7 @@ void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) // if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) { // n++; // if (no_text) uputs("><@@>", fd); - + // while (bp_inspect(par, n)) // if (no_text) { // uputc('<', fd); @@ -668,3 +681,88 @@ void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) // uputc('\n', fd); // } + + + +static inline uchar * next_char(uchar *s, size_t &numtexts) +{ + uchar c; + s++; + c = *s; + while (c <= 1){ + if (c == 0) { + numtexts--; + if (numtexts == 0) return 0; + }; + s++; + c = *s; + }; + return s; +} + +static inline bool naive_char_contains(uchar const *s, uchar c, size_t numtexts) +{ + uchar sc; + while(numtexts != 0){ + sc = *s; + if (c == sc) return true; + if (sc == 0) numtexts--; + s++; + }; + return false; +} + +bool xml_tree::naive_contains(xml_tree::node_t n, uchar const *w) const +{ + if (w[0] == 0) + return true; + // fprintf(stderr, "Calling naive_contains on node: %i, with tag: %s\n", + // n, + // get_tag_name_by_ref(tag(n))); + // fflush(stderr); + std::pair range = text_id_range(n); + + //pattern is at least one char + if (range.first == xml_tree::NIL) + return false; + + uchar * text = this->text_collection->GetText(range.first, range.second); + size_t num_texts = + subtree_tags(n, ATTRIBUTE_DATA_OPEN_TAG_ID) + + subtree_tags(n, PCDATA_OPEN_TAG_ID); + + if (w[1] == 0) + return naive_char_contains(text, w[0], num_texts); + + //KMP is overkill + uchar * s = text; + uchar * ss; + uchar const *ww; + size_t nnum_texts; + while (true) { + // fprintf(stderr, "Current char in s: %c, num_texts is: %lu\n", *s, num_texts); + // fflush(stderr); + ss = s; + ww = w; + nnum_texts = num_texts; + while (true){ + // fprintf(stderr, " Current char in w: %c, num_texts is: %lu\n", *ww, num_texts); + // fprintf(stderr, " Current char in s: %c\n", *ss); + // fflush(stderr); + + if (*ww == 0) return true; + if (*ww != *ss) break; + ww++; + ss = next_char(ss, nnum_texts); + if (ss == 0) return (*ww == 0); + }; + // fprintf(stderr, "Not found, returning\n"); + // fflush(stderr); + // fprintf(stderr, "Current string s is: %s\n", s); + s = next_char(s, num_texts); + if (s == 0) return false; +// fprintf(stderr, "After next_char, string s is: %s\n", s); +// fflush(stderr); + }; + +}