From: Kim Nguyễn Date: Wed, 9 May 2012 12:25:45 +0000 (+0200) Subject: Fix a libcds bug: X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FXMLTree.git;a=commitdiff_plain;h=a6ceb9addec9be126c967821d278153b43e948e4 Fix a libcds bug: Sadakane's code uses big endian ordered words (for set/get bit) Francisco's code uses little endian ordered words. Add a set_le method to bit-vector --- diff --git a/Makefile b/Makefile index 1e054aa..feb8ae0 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,10 @@ INC_FLAGS=-I.. -I../libcds/includes/ -I. +ifeq ($(POPCOUNT), 1) + POPCOUNT_FLAG=-DHAS_NATIVE_POPCOUNT -mpopcnt +else + POPCOUNT_FLAG= +endif ifeq ($(VERBOSE), true) HIDE= @@ -10,7 +15,7 @@ endif ifeq ($(DEBUG), true) OPT_FLAGS=-O0 -g $(POPCOUNT_FLAG) -static else - OPT_FLAGS=-O3 $(POPCOUNT_FLAG) -static -flto + OPT_FLAGS=-O3 $(POPCOUNT_FLAG) -static endif ifeq ($(PROFILE), true) diff --git a/bit-vector.cpp b/bit-vector.cpp index 5450989..986e098 100644 --- a/bit-vector.cpp +++ b/bit-vector.cpp @@ -146,17 +146,29 @@ void bit_vector::pack() //sets the idxth bit to b void bit_vector::set(size_t idx, bool b) { - - size_t i = idx / W; - size_t j = idx % W; - grow(allocforsize(idx+1)); - if (idx >= _size) - _size = idx+1; - uint32_t mask = 1 << (W - 1 - j); - if (b) - _vector[i] |= mask; - else - _vector[i] &= ~mask; + size_t i = idx / W; + size_t j = idx % W; + grow(allocforsize(idx+1)); + if (idx >= _size) + _size = idx+1; + uint32_t mask = 1 << (W - 1 - j); + if (b) + _vector[i] |= mask; + else + _vector[i] &= ~mask; +} +void bit_vector::set_le(size_t idx, bool b) +{ + size_t i = idx / W; + size_t j = idx % W; + grow(allocforsize(idx+1)); + if (idx >= _size) + _size = idx+1; + uint32_t mask = 1 << j; + if (b) + _vector[i] |= mask; + else + _vector[i] &= ~mask; } void bit_vector::push_back(bool b) @@ -222,3 +234,12 @@ uint32_t * bit_vector::get_vector() const array_copy(_vector,vector,_alloc); return vector; } + +void bit_vector::debug(void) const +{ + for(size_t i = 0; i < _size; i++) + fprintf(stderr, "%i ", get(i)); + fprintf(stderr, "\n"); + + +} diff --git a/bit-vector.hpp b/bit-vector.hpp index 697581d..76ed870 100644 --- a/bit-vector.hpp +++ b/bit-vector.hpp @@ -56,7 +56,7 @@ class bit_vector { /** Sets the idxth bit to b. The bit_vector is resized if necessary */ void set(size_t idx, bool b); - + void set_le(size_t idx, bool b); /** Adds bit b at the end of the bit_vector. The bit_vector is resized if necessary */ void push_back(bool b); @@ -94,7 +94,7 @@ class bit_vector { /** Efficient unsafe get/get_field */ bool unsafe_get(size_t idx) const; uint32_t unsafe_get_field(size_t idx, size_t len) const; - + void debug(void) const; private: // true if get_vector_ptr() was called diff --git a/xml-tree-builder.cpp b/xml-tree-builder.cpp index 0b7effc..19ce5dc 100644 --- a/xml-tree-builder.cpp +++ b/xml-tree-builder.cpp @@ -94,7 +94,7 @@ void xml_tree_builder::open_tag(std::string tag) int32_t id = register_tag(tag); tags->push_back(id); par->push_back(true); - if (!disable_text_index) text_positions->push_back(false); + if (!disable_text_index) text_positions->set_le(text_positions->size(), false); } void xml_tree_builder::close_tag(std::string) @@ -102,7 +102,7 @@ void xml_tree_builder::close_tag(std::string) xml_tree::tag_t t = xml_tree::CLOSE_TAG_ID; tags->push_back(t); par->push_back(false); - if (!disable_text_index) text_positions->push_back(false); + if (!disable_text_index) text_positions->set_le(text_positions->size(), false); } void xml_tree_builder::text(std::string s) @@ -110,7 +110,7 @@ void xml_tree_builder::text(std::string s) if (!disable_text_index){ if (s.empty()) s = "\001"; tc_builder->InsertText((const unsigned char *) s.c_str()); - text_positions->set(text_positions->size() - 1, true); + text_positions->set_le(text_positions->size() - 1, true); } } diff --git a/xml-tree-inc.hpp b/xml-tree-inc.hpp index d5a921f..bb70513 100644 --- a/xml-tree-inc.hpp +++ b/xml-tree-inc.hpp @@ -36,6 +36,20 @@ xml_tree::subtree_tags(xml_tree::node_t x, xml_tree::tag_t label) const }; } +inline uint32_t xml_tree::subtree_elements(xml_tree::node_t x, + xml_tree::tag_t *atts) const +{ + + int32_t size = bp_subtree_size(par, x) - 1; + if (size <= 0) return 0; + size -= subtree_tags(x, xml_tree::PCDATA_OPEN_TAG_ID); + if (size < 3) return (uint32_t) size; + for(; *atts != xml_tree::NIL_TAG_ID; atts++) + size -= subtree_tags(x, *atts); + return (uint32_t) size; + +} + inline bool xml_tree::is_leaf(xml_tree::node_t x) const { return !bp_inspect(this->par, x+1); @@ -206,7 +220,8 @@ inline SXSI::TextCollection *xml_tree::get_text_collection() const inline xml_tree::node_t xml_tree::parent_node(int32_t d) const { - return (xml_tree::node_t) text_positions->select1(d + 1); + xml_tree::node_t res = text_positions->select1(d+1); + return (xml_tree::node_t) res; } inline SXSI::TextCollection::document_result diff --git a/xml-tree.cpp b/xml-tree.cpp index 8c0386e..3c71f6c 100644 --- a/xml-tree.cpp +++ b/xml-tree.cpp @@ -124,7 +124,7 @@ xml_tree::xml_tree(std::vector *tags, npar, 32); //delete [] textbm; - delete textbitmap; + //delete textbitmap; this->text_index_type = idx_type; text_collection = tc_builder->InitTextCollection(); @@ -300,6 +300,7 @@ xml_tree* xml_tree::load(int fd, char* name, bool load_tc, int sf) tree->tags = static_sequence::load(fp); ufread(&tree->bits_per_tag, sizeof(uint), 1, fp); + fprintf(stderr, "\nBits per tag: %u\n", tree->bits_per_tag); ufread(&tree->tag_seq_len, sizeof(uint), 1, fp); size_t size = uint_len(tree->bits_per_tag, tree->tag_seq_len); tree->tag_seq = new uint[size]; @@ -342,29 +343,7 @@ xml_tree* xml_tree::load(int fd, char* name, bool load_tc, int sf) return tree; } -uint32_t xml_tree::subtree_elements(xml_tree::node_t x) const -{ - - uint32_t size = bp_subtree_size(par, x); - if (x == root()){ - x = bp_first_child(par,x); - size = size - 1; - }; - - int s = x + 2*size - 1; - int ntext = - tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, s) - - tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, x-1); - size = size - ntext; - xml_tree::node_t fin = bp_find_close(par, x); - xml_tree::node_t y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, x); - while (y != xml_tree::NIL && y < fin){ - size -= subtree_size(y); - y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, y); - }; - return size; - } uint32_t xml_tree::num_children(xml_tree::node_t x) const { @@ -385,7 +364,7 @@ xml_tree::node_t xml_tree::child(xml_tree::node_t x, uint32_t i) const std::pair xml_tree::text_id_range(xml_tree::node_t x) const { int32_t i, j; - i = text_positions->rank1(x - 1); + i = text_positions->rank1(x) - 1; j = text_positions->rank1(x + 2 * bp_subtree_size(par, x) - 2); if (i == j) return std::make_pair(xml_tree::NIL, xml_tree::NIL); @@ -419,7 +398,7 @@ void xml_tree::flush(int fd) void xml_tree::uflush_r(int fd, size_t s) { - if (s == 0) return; + if (s == 0 || print_buffer == 0 || fd <= 0) return; size_t written; while (1) { written = write(fd, print_buffer->data(), s); @@ -496,9 +475,112 @@ size_t xml_tree::uprintf(const char*s, int fd) return i; } +// void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) +// { + +// if (print_buffer == 0) { +// print_buffer = new std::string(BUFFER_SIZE, 0); +// print_buffer->clear(); +// print_stack = new std::vector(); +// print_stack->reserve(256); +// }; + +// xml_tree::node_t fin = bp_find_close(par, x); +// xml_tree::node_t n = x; +// xml_tree::tag_t label = tag(n); +// unsigned char * orig_text; +// unsigned char * current_text; + +// auto r = text_id_range(x); +// if (r.first == xml_tree::NIL) +// current_text = 0; +// else +// current_text = text_collection->GetText(r.first, r.second); +// current_text += (current_text[0] == 1); + +// orig_text = current_text; + +// size_t read = 0; + +// while (n <= fin) { + +// if (bp_inspect(par, n)) { +// if (label == xml_tree::PCDATA_OPEN_TAG_ID){ +// if (no_text) { +// uputs("<$/>", fd); +// } else { +// read = uprintf( (const char*) current_text, fd); +// current_text += read + 1; +// }; +// n += 2; // skip closin $ +// label = tag(n); +// } else { + +// uputc('<', fd); +// uput_str((*tag_names)[label], fd); +// n++; +// if (bp_inspect(par, n)) { +// print_stack->push_back((*tag_names)[label]); +// label = tag(n); +// if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) { +// n++; +// if (no_text) uputs("><@@>", fd); + +// while (bp_inspect(par, n)) +// if (no_text) { +// uputc('<', fd); +// uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); +// uputc('>', fd); +// uputs("<$@/>', fd); +// n += 4; +// } else { +// uputc(' ', fd); +// uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); +// n++; +// uputs("=\"", fd); +// read = uprintf((const char*) current_text, fd); +// current_text += read + 1; +// uputc('"', fd); +// n += 3; +// }; + +// if (no_text) +// uputs("", fd); +// else uputc('>', fd); +// n++; +// label = tag(n); +// } else +// uputc('>', fd); +// } else { +// uputs("/>", fd); +// n++; +// label = tag(n); +// }; +// }; +// } else do { +// uputs("back(), fd); +// uputc('>', fd); +// print_stack->pop_back(); +// n++; +// } while (!bp_inspect(par, n) && !print_stack->empty()); +// label = tag(n); +// }; +// uputc('\n', fd); +// if (orig_text && text_index_type != TextCollectionBuilder::index_type_default) +// if (*orig_text == '\0') +// text_collection->DeleteText(orig_text - 1); +// else +// text_collection->DeleteText(orig_text); + +// } void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) { - + for (int i = 0; i < 50; i++) + fprintf(stderr, "Printing text number %i: %s\n", i, get_text(i)); + if (print_buffer == 0) { print_buffer = new std::string(BUFFER_SIZE, 0); print_buffer->clear(); @@ -509,17 +591,8 @@ void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) xml_tree::node_t fin = bp_find_close(par, x); xml_tree::node_t n = x; xml_tree::tag_t label = tag(n); - unsigned char * orig_text; unsigned char * current_text; - auto r = text_id_range(x); - if (r.first == xml_tree::NIL) - current_text = 0; - else - current_text = get_text(r.first); - - orig_text = current_text; - size_t read = 0; while (n <= fin) { @@ -528,70 +601,70 @@ void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) if (no_text) { uputs("<$/>", fd); } else { - read = uprintf( (const char*) current_text, fd); - current_text += read + 1; - }; - n += 2; // skip closin $ - label = tag(n); - } else { + current_text = get_text(text_id(n)); + uprintf( (const char*) (current_text + (current_text[0] == 1)), fd); - uputc('<', fd); - uput_str((*tag_names)[label], fd); - n++; - if (bp_inspect(par, n)) { - print_stack->push_back((*tag_names)[label]); - label = tag(n); - if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) { - n++; - if (no_text) uputs("><@@>", fd); - - while (bp_inspect(par, n)) - if (no_text) { - uputc('<', fd); - uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); - uputc('>', fd); - uputs("<$@/>', fd); - n += 4; - } else { - uputc(' ', fd); - uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); - n++; - uputs("=\"", fd); - read = uprintf((const char*) current_text, fd); - current_text += read + 1; - uputc('"', fd); - n += 3; - }; - - if (no_text) - uputs("", fd); - else uputc('>', fd); - n++; - label = tag(n); - } else - uputc('>', fd); - } else { - uputs("/>", fd); - n++; - label = tag(n); - }; - }; - } else do { - uputs("back(), fd); - uputc('>', fd); - print_stack->pop_back(); - n++; - } while (!bp_inspect(par, n) && !print_stack->empty()); - label = tag(n); - }; + if (current_text && text_index_type != TextCollectionBuilder::index_type_default) + text_collection->DeleteText(current_text); + + n += 2; // skip closin $ + label = tag(n); + }; + } else { + uputc('<', fd); + uput_str((*tag_names)[label], fd); + n++; + if (bp_inspect(par, n)) { + print_stack->push_back((*tag_names)[label]); + label = tag(n); + if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) { + n++; + if (no_text) uputs("><@@>", fd); + + while (bp_inspect(par, n)) + if (no_text) { + uputc('<', fd); + uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); + uputc('>', fd); + uputs("<$@/>', fd); + n += 4; + } else { + uputc(' ', fd); + uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); + n+= 2; + uputs("=\"", fd); + current_text = get_text(text_id(n)); + uprintf((const char*) (current_text + (current_text[0] == 1)), fd); + if (current_text && text_index_type != TextCollectionBuilder::index_type_default) + text_collection->DeleteText(current_text); + uputc('"', fd); + n += 2; + }; + + if (no_text) + uputs("", fd); + else uputc('>', fd); + n++; + label = tag(n); + } else + uputc('>', fd); + } else { + uputs("/>", fd); + n++; + label = tag(n); + }; + }; + } else do { + uputs("back(), fd); + uputc('>', fd); + print_stack->pop_back(); + n++; + } while (!bp_inspect(par, n) && !print_stack->empty()); + label = tag(n); + }; uputc('\n', fd); - if (orig_text && text_index_type != TextCollectionBuilder::index_type_default) - if (*orig_text == '\0') - text_collection->DeleteText(orig_text - 1); - else - text_collection->DeleteText(orig_text); } diff --git a/xml-tree.hpp b/xml-tree.hpp index ff98f21..209ee56 100644 --- a/xml-tree.hpp +++ b/xml-tree.hpp @@ -50,7 +50,7 @@ public: inline uint32_t num_tags() const; inline uint32_t subtree_size(node_t) const; inline uint32_t subtree_tags(node_t, tag_t) const; - uint32_t subtree_elements(node_t) const; + inline uint32_t subtree_elements(node_t, tag_t*) const; uint32_t num_children(node_t) const; uint32_t child_pos(node_t) const;