From: Kim Nguyễn Date: Tue, 29 May 2012 06:12:22 +0000 (+0200) Subject: Merge branch remove-virtual X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FXMLTree.git;a=commitdiff_plain;h=2b67e586d2ea45d87648ac3977306139110c4e78 Merge branch remove-virtual Updating a6ceb9a..f423189 Fast-forward Squash commit -- not updating HEAD xml-tree-builder.cpp | 2 + xml-tree-inc.hpp | 54 ++++++- xml-tree.cpp | 406 +++++++++++++++++++++++++------------------------- xml-tree.hpp | 7 +- 4 files changed, 258 insertions(+), 211 deletions(-) commit f423189f0f02a6649090256d1faa0748c29cb2e7 Author: Kim Nguyễn Date: Tue May 29 08:09:47 2012 +0200 More inlining commit d87421ea4d344a29eabbd6ecd25c38b828782a45 Author: Kim Nguyễn Date: Tue May 29 08:08:33 2012 +0200 Replace abstract static_sequence by an array of static_sequence_sdarray (avoids calling virtual methods). commit 6ccf015bb473e810c3f7f91f1f85f21044ed2c0b Author: Kim Nguyễn Date: Tue May 29 08:07:51 2012 +0200 Revert to old printing function --- diff --git a/xml-tree-builder.cpp b/xml-tree-builder.cpp index 19ce5dc..a55767c 100644 --- a/xml-tree-builder.cpp +++ b/xml-tree-builder.cpp @@ -1,5 +1,6 @@ #include "xml-tree-builder.hpp" #include +#include #include using namespace SXSI; @@ -83,6 +84,7 @@ xml_tree_builder::open_document(bool disable_text_index, this->disable_text_index = disable_text_index; if (!disable_text_index){ + fprintf(stderr, "Sample rate is %u\n", sample_rate); tc_builder = TextCollectionBuilder::create(sample_rate, idx_type); text_positions = new bit_vector(); text_index_type = idx_type; diff --git a/xml-tree-inc.hpp b/xml-tree-inc.hpp index bb70513..30a321e 100644 --- a/xml-tree-inc.hpp +++ b/xml-tree-inc.hpp @@ -19,7 +19,7 @@ inline uint32_t xml_tree::num_tags() const inline uint32_t xml_tree::subtree_size(xml_tree::node_t x) const { - return bp_subtree_size(this->par, x); + return x == root() ? this->par->n/2 : bp_subtree_size(this->par, x); } inline uint32_t @@ -32,7 +32,7 @@ xml_tree::subtree_tags(xml_tree::node_t x, xml_tree::tag_t label) const count += (tag(i) == label); return count; } else { - return tags->rank(label, y) - tags->rank(label, x); + return tags[label]->rank(y) - tags[label]->rank(x); }; } @@ -158,9 +158,22 @@ inline xml_tree::node_t xml_tree::next_element(xml_tree::node_t x) const return x; } -inline xml_tree::node_t xml_tree::tagged_next(node_t x, tag_t tag) const + +inline xml_tree::node_t xml_tree::tagged_next_close(xml_tree::node_t x, + xml_tree::tag_t label) const +{ + xml_tree::node_t i=x; + for(xml_tree::node_t i = x+1; i < std::min(x+100, this->par->n); i++) + if (tag(i) == label) + return i; + + return tags[label]->select_next(i); +} + +inline xml_tree::node_t xml_tree::tagged_next(xml_tree::node_t x, + xml_tree::tag_t label) const { - return this->tags->select_next(tag, x); + return tags[label]->select_next(x); } inline xml_tree::node_t @@ -174,7 +187,7 @@ xml_tree::tagged_descendant(xml_tree::node_t x, inline xml_tree::node_t xml_tree::tagged_following_before(xml_tree::node_t x, xml_tree::tag_t tag, - xml_tree::node_t limit) const + xml_tree::node_t limit) const { xml_tree::node_t close = bp_find_close(this->par, x); xml_tree::node_t s = tagged_next(close, tag); @@ -207,6 +220,37 @@ inline xml_tree::node_t xml_tree::tagged_sibling(xml_tree::node_t x, return sibling; } +inline xml_tree::node_t +xml_tree::select_descendant(xml_tree::node_t x, xml_tree::tag_t *tags) const +{ + if (is_leaf(x)) return xml_tree::NIL; + auto min = xml_tree::NIL; + auto aux = xml_tree::NIL; + for(; *tags != xml_tree::NIL_TAG_ID; ++tags){ + aux = tagged_next(x, *tags); + if ((unsigned int) aux < (unsigned int) min) min = aux; + }; + return (min == xml_tree::NIL || is_ancestor(x, min)) ? min : xml_tree::NIL; +} + + +inline xml_tree::node_t +xml_tree::select_following_before(xml_tree::node_t x, xml_tree::tag_t *tags, + xml_tree::node_t limit) const +{ + auto min = xml_tree::NIL; + auto aux = xml_tree::NIL; + auto close = bp_find_close(this->par, x); + + for(; *tags != xml_tree::NIL_TAG_ID; ++tags){ + aux = tagged_next(close, *tags); + if ((unsigned int) aux < (unsigned int) min) min = aux; + } + + return (min == xml_tree::NIL || min < limit) ? min : xml_tree::NIL; +} + + xml_tree::node_t xml_tree::closing(xml_tree::node_t x) const { return bp_find_close(this->par, x); diff --git a/xml-tree.cpp b/xml-tree.cpp index 3c71f6c..575ba8f 100644 --- a/xml-tree.cpp +++ b/xml-tree.cpp @@ -76,7 +76,7 @@ xml_tree::xml_tree() print_buffer = 0; } -xml_tree::xml_tree(std::vector *tags, +xml_tree::xml_tree(std::vector *tags_, std::unordered_map *tag_ids, bit_vector *parbitmap, bool disable_tc, @@ -89,7 +89,6 @@ xml_tree::xml_tree(std::vector *tags, size_t npar = parbitmap->size(); parbitmap->pack(); - par = bp_construct(npar, parbitmap->get_vector_ptr(), OPT_DEGREE); @@ -102,17 +101,34 @@ xml_tree::xml_tree(std::vector *tags, (*this->tag_names)[val.second] = val.first; uint32_t max_tag = tag_names->size() - 1; - static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray(); - alphabet_mapper *am = new alphabet_mapper_none(); - this->tags = new static_sequence_bs((uint32_t*)&((*tags)[0]), npar, am, bmb); + bit_vector *tmp_bitmap = new bit_vector(npar, 1, 0); + uint32_t * buff = tmp_bitmap->get_vector_ptr(); + for(int32_t i = 0; i <= max_tag; i++){ + uint32_t ones = 0; + for (size_t k = 0; k < npar; k++){ + bool t = (i == (*tags_)[k]); + tmp_bitmap->set(k, t); + ones += t; + }; + + this->tags.push_back(new static_bitsequence_sdarray(buff, npar, ones)); + } + delete tmp_bitmap; + delete [] buff; + + //static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray(); + //alphabet_mapper *am = new alphabet_mapper_none(); + +// this->tags = new static_sequence_bs((uint32_t*)&((*tags)[0]), npar, am, bmb); + bits_per_tag = bits8(max_tag); tag_seq_len = npar; tag_seq = new uint32_t[uint_len(bits_per_tag, tag_seq_len)]; for(size_t i = 0; i < (size_t) tag_seq_len; i++) - set_field(tag_seq, bits_per_tag, i, (uint32_t) (*tags)[i]); + set_field(tag_seq, bits_per_tag, i, (uint32_t) (*tags_)[i]); - delete tags; + delete tags_; if (disable_tc) { text_positions = 0; @@ -131,17 +147,21 @@ xml_tree::xml_tree(std::vector *tags, delete tc_builder; }; - } xml_tree::~xml_tree() { bp_delete(par); - delete tags; + + for(int i = 0; i < tag_names->size(); i++){ + delete tags[i]; + tags[i] = 0; + }; + + delete [] tag_seq; delete tag_names; delete tag_ids; - if (text_collection) delete text_collection; if (text_positions) delete text_positions; } @@ -176,18 +196,6 @@ xml_tree::select_child(xml_tree::node_t x, xml_tree::tag_t *tags) const return select_sibling(child, tags); } -xml_tree::node_t -xml_tree::select_descendant(xml_tree::node_t x, xml_tree::tag_t *tags) const -{ - if (is_leaf(x)) return xml_tree::NIL; - auto min = xml_tree::NIL; - auto aux = xml_tree::NIL; - for(; *tags != xml_tree::NIL_TAG_ID; ++tags){ - aux = tagged_descendant(x, *tags); - if ((unsigned int) aux < (unsigned int) min) min = aux; - }; - return min; -} xml_tree::node_t xml_tree::select_sibling(xml_tree::node_t x, xml_tree::tag_t *tags) const @@ -202,18 +210,6 @@ xml_tree::select_sibling(xml_tree::node_t x, xml_tree::tag_t *tags) const return sibling; } -xml_tree::node_t -xml_tree::select_following_before(xml_tree::node_t x, xml_tree::tag_t *tags, - xml_tree::node_t limit) const -{ - auto min = xml_tree::NIL; - auto aux = xml_tree::NIL; - for(; *tags != xml_tree::NIL_TAG_ID; ++tags){ - aux = tagged_following_before(x, *tags, limit); - if ((unsigned int) aux < (unsigned int) min) min = aux; - } - return min; -} void xml_tree::save(int fd, char* s) { @@ -231,7 +227,9 @@ void xml_tree::save(int fd, char* s) for (int i = 0; i < ntags;i++) fprintf(fp, "%s\n", tag_names->at(i).c_str()); - tags->save(fp); + //tags->save(fp); + for(int i = 0; i < ntags; i++) + tags[i]->save(fp); ufwrite(&bits_per_tag, sizeof(uint),1,fp); ufwrite(&tag_seq_len, sizeof(uint),1,fp); @@ -298,7 +296,11 @@ xml_tree* xml_tree::load(int fd, char* name, bool load_tc, int sf) }; - tree->tags = static_sequence::load(fp); + for(int i = 0; i < ntags; i++){ + tree->tags.push_back(static_bitsequence_sdarray::load(fp)); + }; + + //tree->tags = static_sequence_bs::load(fp); ufread(&tree->bits_per_tag, sizeof(uint), 1, fp); fprintf(stderr, "\nBits per tag: %u\n", tree->bits_per_tag); ufread(&tree->tag_seq_len, sizeof(uint), 1, fp); @@ -475,112 +477,9 @@ size_t xml_tree::uprintf(const char*s, int fd) return i; } -// void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) -// { - -// if (print_buffer == 0) { -// print_buffer = new std::string(BUFFER_SIZE, 0); -// print_buffer->clear(); -// print_stack = new std::vector(); -// print_stack->reserve(256); -// }; - -// xml_tree::node_t fin = bp_find_close(par, x); -// xml_tree::node_t n = x; -// xml_tree::tag_t label = tag(n); -// unsigned char * orig_text; -// unsigned char * current_text; - -// auto r = text_id_range(x); -// if (r.first == xml_tree::NIL) -// current_text = 0; -// else -// current_text = text_collection->GetText(r.first, r.second); -// current_text += (current_text[0] == 1); - -// orig_text = current_text; - -// size_t read = 0; - -// while (n <= fin) { - -// if (bp_inspect(par, n)) { -// if (label == xml_tree::PCDATA_OPEN_TAG_ID){ -// if (no_text) { -// uputs("<$/>", fd); -// } else { -// read = uprintf( (const char*) current_text, fd); -// current_text += read + 1; -// }; -// n += 2; // skip closin $ -// label = tag(n); -// } else { - -// uputc('<', fd); -// uput_str((*tag_names)[label], fd); -// n++; -// if (bp_inspect(par, n)) { -// print_stack->push_back((*tag_names)[label]); -// label = tag(n); -// if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) { -// n++; -// if (no_text) uputs("><@@>", fd); - -// while (bp_inspect(par, n)) -// if (no_text) { -// uputc('<', fd); -// uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); -// uputc('>', fd); -// uputs("<$@/>', fd); -// n += 4; -// } else { -// uputc(' ', fd); -// uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); -// n++; -// uputs("=\"", fd); -// read = uprintf((const char*) current_text, fd); -// current_text += read + 1; -// uputc('"', fd); -// n += 3; -// }; - -// if (no_text) -// uputs("", fd); -// else uputc('>', fd); -// n++; -// label = tag(n); -// } else -// uputc('>', fd); -// } else { -// uputs("/>", fd); -// n++; -// label = tag(n); -// }; -// }; -// } else do { -// uputs("back(), fd); -// uputc('>', fd); -// print_stack->pop_back(); -// n++; -// } while (!bp_inspect(par, n) && !print_stack->empty()); -// label = tag(n); -// }; -// uputc('\n', fd); -// if (orig_text && text_index_type != TextCollectionBuilder::index_type_default) -// if (*orig_text == '\0') -// text_collection->DeleteText(orig_text - 1); -// else -// text_collection->DeleteText(orig_text); - -// } void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) { - for (int i = 0; i < 50; i++) - fprintf(stderr, "Printing text number %i: %s\n", i, get_text(i)); - + if (print_buffer == 0) { print_buffer = new std::string(BUFFER_SIZE, 0); print_buffer->clear(); @@ -591,8 +490,19 @@ void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) xml_tree::node_t fin = bp_find_close(par, x); xml_tree::node_t n = x; xml_tree::tag_t label = tag(n); + unsigned char * orig_text; unsigned char * current_text; + auto r = text_id_range(x); + if (r.first == xml_tree::NIL) + current_text = 0; + else + current_text = text_collection->GetText(r.first, r.second); + current_text += (current_text[0] == 1); + + orig_text = current_text; + + size_t read = 0; while (n <= fin) { @@ -601,70 +511,160 @@ void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) if (no_text) { uputs("<$/>", fd); } else { - current_text = get_text(text_id(n)); - uprintf( (const char*) (current_text + (current_text[0] == 1)), fd); - - if (current_text && text_index_type != TextCollectionBuilder::index_type_default) - text_collection->DeleteText(current_text); - - n += 2; // skip closin $ - label = tag(n); - }; + read = uprintf( (const char*) current_text, fd); + current_text += read + 1; + }; + n += 2; // skip closin $ + label = tag(n); } else { - uputc('<', fd); - uput_str((*tag_names)[label], fd); - n++; - if (bp_inspect(par, n)) { - print_stack->push_back((*tag_names)[label]); - label = tag(n); - if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) { - n++; - if (no_text) uputs("><@@>", fd); - - while (bp_inspect(par, n)) - if (no_text) { - uputc('<', fd); - uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); - uputc('>', fd); - uputs("<$@/>', fd); - n += 4; - } else { - uputc(' ', fd); - uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); - n+= 2; - uputs("=\"", fd); - current_text = get_text(text_id(n)); - uprintf((const char*) (current_text + (current_text[0] == 1)), fd); - if (current_text && text_index_type != TextCollectionBuilder::index_type_default) - text_collection->DeleteText(current_text); - uputc('"', fd); - n += 2; - }; - - if (no_text) - uputs("", fd); - else uputc('>', fd); - n++; - label = tag(n); - } else - uputc('>', fd); - } else { - uputs("/>", fd); - n++; - label = tag(n); - }; - }; - } else do { - uputs("back(), fd); - uputc('>', fd); - print_stack->pop_back(); - n++; - } while (!bp_inspect(par, n) && !print_stack->empty()); - label = tag(n); - }; + + uputc('<', fd); + uput_str((*tag_names)[label], fd); + n++; + if (bp_inspect(par, n)) { + print_stack->push_back((*tag_names)[label]); + label = tag(n); + if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) { + n++; + if (no_text) uputs("><@@>", fd); + + while (bp_inspect(par, n)) + if (no_text) { + uputc('<', fd); + uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); + uputc('>', fd); + uputs("<$@/>', fd); + n += 4; + } else { + uputc(' ', fd); + uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); + n++; + uputs("=\"", fd); + read = uprintf((const char*) current_text, fd); + current_text += read + 1; + uputc('"', fd); + n += 3; + }; + + if (no_text) + uputs("", fd); + else uputc('>', fd); + n++; + label = tag(n); + } else + uputc('>', fd); + } else { + uputs("/>", fd); + n++; + label = tag(n); + }; + }; + } else do { + uputs("back(), fd); + uputc('>', fd); + print_stack->pop_back(); + n++; + } while (!bp_inspect(par, n) && !print_stack->empty()); + label = tag(n); + }; uputc('\n', fd); + if (orig_text && text_index_type != TextCollectionBuilder::index_type_default) + if (*orig_text == '\0') + text_collection->DeleteText(orig_text - 1); + else + text_collection->DeleteText(orig_text); } + +// void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) +// { +// if (print_buffer == 0) { +// print_buffer = new std::string(BUFFER_SIZE, 0); +// print_buffer->clear(); +// print_stack = new std::vector(); +// print_stack->reserve(256); +// }; + +// xml_tree::node_t fin = bp_find_close(par, x); +// xml_tree::node_t n = x; +// xml_tree::tag_t label = tag(n); +// unsigned char * current_text; + + +// while (n <= fin) { + +// if (bp_inspect(par, n)) { +// if (label == xml_tree::PCDATA_OPEN_TAG_ID){ +// if (no_text) { +// uputs("<$/>", fd); +// } else { +// current_text = get_text(text_id(n)); +// uprintf( (const char*) (current_text + (current_text[0] == 1)), fd); + +// if (current_text && text_index_type != TextCollectionBuilder::index_type_default) +// text_collection->DeleteText(current_text); + +// n += 2; // skip closin $ +// label = tag(n); +// }; +// } else { +// uputc('<', fd); +// uput_str((*tag_names)[label], fd); +// n++; +// if (bp_inspect(par, n)) { +// print_stack->push_back((*tag_names)[label]); +// label = tag(n); +// if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) { +// n++; +// if (no_text) uputs("><@@>", fd); + +// while (bp_inspect(par, n)) +// if (no_text) { +// uputc('<', fd); +// uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); +// uputc('>', fd); +// uputs("<$@/>', fd); +// n += 4; +// } else { +// uputc(' ', fd); +// uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); +// n+= 2; +// uputs("=\"", fd); +// current_text = get_text(text_id(n)); +// uprintf((const char*) (current_text + (current_text[0] == 1)), fd); +// if (current_text && text_index_type != TextCollectionBuilder::index_type_default) +// text_collection->DeleteText(current_text); +// uputc('"', fd); +// n += 2; +// }; + +// if (no_text) +// uputs("", fd); +// else uputc('>', fd); +// n++; +// label = tag(n); +// } else +// uputc('>', fd); +// } else { +// uputs("/>", fd); +// n++; +// label = tag(n); +// }; +// }; +// } else do { +// uputs("back(), fd); +// uputc('>', fd); +// print_stack->pop_back(); +// n++; +// } while (!bp_inspect(par, n) && !print_stack->empty()); +// label = tag(n); +// }; +// uputc('\n', fd); + +// } diff --git a/xml-tree.hpp b/xml-tree.hpp index 209ee56..4f40dd1 100644 --- a/xml-tree.hpp +++ b/xml-tree.hpp @@ -83,15 +83,16 @@ public: inline node_t prev_sibling(node_t) const; inline node_t first_element(node_t) const; inline node_t next_element(node_t) const; + inline node_t tagged_next_close(node_t, tag_t) const; inline node_t tagged_next(node_t, tag_t) const; inline node_t tagged_descendant(node_t, tag_t) const; inline node_t tagged_following_before(node_t, tag_t, node_t) const; inline node_t tagged_child(node_t, tag_t) const; inline node_t tagged_sibling(node_t, tag_t) const; node_t select_child(node_t, tag_t*) const; - node_t select_descendant(node_t, tag_t*) const; + inline node_t select_descendant(node_t, tag_t*) const; node_t select_sibling(node_t, tag_t*) const; - node_t select_following_before (node_t, tag_t*, node_t) const; + inline node_t select_following_before (node_t, tag_t*, node_t) const; inline node_t closing(node_t) const; //Text functions @@ -127,7 +128,7 @@ private: //Parenthesis sequence bp *par; //tag sequence - static_sequence *tags; + std::vector tags; uint32_t *tag_seq; uint32_t tag_seq_len; uint32_t bits_per_tag;