X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=xml-tree.cpp;h=82f00b0af86742bfd66720272e51c0fd9e650c26;hb=HEAD;hp=758c1b4d9b5ffce3bbdb2022ca57a9a642b58ca5;hpb=429f734c9f9241dfb9c587e8b333777f3540625f;p=SXSI%2FXMLTree.git diff --git a/xml-tree.cpp b/xml-tree.cpp index 758c1b4..82f00b0 100644 --- a/xml-tree.cpp +++ b/xml-tree.cpp @@ -42,7 +42,13 @@ static int bits8 (int t ) { } - +static bool array_mem(xml_tree::tag_t *a, xml_tree::tag_t t) +{ + for(; *a != xml_tree::NIL_TAG_ID; a++){ + if (*a >= t) return (*a == t); + }; + return false; +} static void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream) { @@ -70,7 +76,7 @@ xml_tree::xml_tree() print_buffer = 0; } -xml_tree::xml_tree(std::vector *tags, +xml_tree::xml_tree(std::vector *tags_, std::unordered_map *tag_ids, bit_vector *parbitmap, bool disable_tc, @@ -83,30 +89,56 @@ xml_tree::xml_tree(std::vector *tags, size_t npar = parbitmap->size(); parbitmap->pack(); - par = bp_construct(npar, parbitmap->get_vector_ptr(), OPT_DEGREE); delete parbitmap; this->tag_ids = tag_ids; + tag_names = new std::vector(); tag_names->resize(tag_ids->size()); - for(auto val : *(this->tag_ids)) - (*this->tag_names)[val.second] = val.first; + this->attribute_ids = new std::unordered_set(); + std::unordered_map::iterator val; + for(val = this->tag_ids->begin(); val != this->tag_ids->end(); ++val){ + (*tag_names)[val->second] = val->first; + if (val->first.size() >= 3 && + val->first[0] == '<' && + val->first[1] == '@' && + val->first[2] == '>'){ + this->attribute_ids->insert(val->second); + }; + } uint32_t max_tag = tag_names->size() - 1; - static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray(); - alphabet_mapper *am = new alphabet_mapper_none(); + bit_vector *tmp_bitmap = new bit_vector(npar, 1, 0); + uint32_t * buff; + for(int32_t i = 0; i <= max_tag; i++){ + uint32_t ones = 0; + for (size_t k = 0; k < npar; k++){ + bool t = (i == (*tags_)[k]); + tmp_bitmap->set(k, t); + ones += t; + }; + buff = tmp_bitmap->get_vector_ptr(); + this->tags.push_back(new static_bitsequence_sdarray(buff, npar, ones)); + } + delete tmp_bitmap; + delete [] buff; + + + //static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray(); + //alphabet_mapper *am = new alphabet_mapper_none(); + +// this->tags = new static_sequence_bs((uint32_t*)&((*tags)[0]), npar, am, bmb); - this->tags = new static_sequence_bs((uint32_t*)&((*tags)[0]), npar, am, bmb); bits_per_tag = bits8(max_tag); tag_seq_len = npar; tag_seq = new uint32_t[uint_len(bits_per_tag, tag_seq_len)]; for(size_t i = 0; i < (size_t) tag_seq_len; i++) - set_field(tag_seq, bits_per_tag, i, (uint32_t) (*tags)[i]); + set_field(tag_seq, bits_per_tag, i, (uint32_t) (*tags_)[i]); - delete tags; + delete tags_; if (disable_tc) { text_positions = 0; @@ -118,28 +150,29 @@ xml_tree::xml_tree(std::vector *tags, npar, 32); //delete [] textbm; - delete textbitmap; + //delete textbitmap; this->text_index_type = idx_type; - fprintf(stderr, "Before!\n"); - fflush(stderr); text_collection = tc_builder->InitTextCollection(); - fprintf(stderr, "After!\n"); - fflush(stderr); delete tc_builder; }; - } xml_tree::~xml_tree() { bp_delete(par); - delete tags; + + for(int i = 0; i < tag_names->size(); i++){ + delete tags[i]; + tags[i] = 0; + }; + + delete [] tag_seq; delete tag_names; delete tag_ids; - + delete attribute_ids; if (text_collection) delete text_collection; if (text_positions) delete text_positions; } @@ -166,56 +199,28 @@ uint32_t xml_tree::postorder(xml_tree::node_t x) const } xml_tree::node_t -xml_tree::select_child(xml_tree::node_t x, - std::unordered_set *tags) const +xml_tree::select_child(xml_tree::node_t x, xml_tree::tag_t *tags) const { if (is_nil(x) || is_leaf(x)) return xml_tree::NIL; xml_tree::node_t child = first_child(x); - if (tags->find(tag(child)) != tags->end()) return child; + if (array_mem(tags, tag(child))) return child; return select_sibling(child, tags); } -xml_tree::node_t -xml_tree::select_descendant(xml_tree::node_t x, - std::unordered_set *tags) const -{ - if (is_leaf(x)) return xml_tree::NIL; - auto min = xml_tree::NIL; - auto aux = xml_tree::NIL; - for(auto tag = tags->begin(); tag != tags->end(); ++ tag){ - aux = tagged_descendant(x, *tag); - if ((unsigned int) aux < (unsigned int) min) min = aux; - }; - return min; -} xml_tree::node_t -xml_tree::select_sibling(xml_tree::node_t x, - std::unordered_set *tags) const +xml_tree::select_sibling(xml_tree::node_t x, xml_tree::tag_t *tags) const { xml_tree::node_t sibling = next_sibling(x); xml_tree::tag_t t; while(!is_nil(sibling)) { t = tag(sibling); - if (tags->find(t) != tags->end()) return sibling; + if (array_mem(tags, t)) return sibling; sibling = next_sibling(sibling); }; return sibling; } -xml_tree::node_t -xml_tree::select_following_before(xml_tree::node_t x, - std::unordered_set *tags, - xml_tree::node_t limit) const -{ - auto min = xml_tree::NIL; - auto aux = xml_tree::NIL; - for(auto tag = tags->begin(); tag != tags->end(); ++tag){ - aux = tagged_following_before(x, *tag, limit); - if ((unsigned int) aux < (unsigned int) min) min = aux; - } - return min; -} void xml_tree::save(int fd, char* s) { @@ -233,7 +238,9 @@ void xml_tree::save(int fd, char* s) for (int i = 0; i < ntags;i++) fprintf(fp, "%s\n", tag_names->at(i).c_str()); - tags->save(fp); + //tags->save(fp); + for(int i = 0; i < ntags; i++) + tags[i]->save(fp); ufwrite(&bits_per_tag, sizeof(uint),1,fp); ufwrite(&tag_seq_len, sizeof(uint),1,fp); @@ -242,8 +249,7 @@ void xml_tree::save(int fd, char* s) bool disable_tc = text_collection == 0 || text_positions == 0; ufwrite(&disable_tc, sizeof(bool),1,fp); - fprintf(stderr, "whoot\n"); - fflush(stderr); + if (!disable_tc) { text_positions->save(fp); ufwrite(&text_index_type, @@ -284,6 +290,7 @@ xml_tree* xml_tree::load(int fd, char* name, bool load_tc, int sf) tree->par = loadTree(fp); //TODO use new api tree->tag_names = new std::vector(); tree->tag_ids = new std::unordered_map(); + tree->attribute_ids = new std::unordered_set(); std::string s; int ntags; @@ -298,11 +305,19 @@ xml_tree* xml_tree::load(int fd, char* name, bool load_tc, int sf) tree->tag_names->push_back(s); tree->tag_ids->insert(std::make_pair(s, static_cast(i))); + if (s.size() >= 3 && s[0] == '<' && s[1] == '@' && s[2] == '>'){ + tree->attribute_ids->insert(static_cast(i)); + }; + + }; + for(int i = 0; i < ntags; i++){ + tree->tags.push_back(static_bitsequence_sdarray::load(fp)); }; - tree->tags = static_sequence::load(fp); + //tree->tags = static_sequence_bs::load(fp); ufread(&tree->bits_per_tag, sizeof(uint), 1, fp); + //fprintf(stderr, "\nBits per tag: %u\n", tree->bits_per_tag); ufread(&tree->tag_seq_len, sizeof(uint), 1, fp); size_t size = uint_len(tree->bits_per_tag, tree->tag_seq_len); tree->tag_seq = new uint[size]; @@ -345,29 +360,7 @@ xml_tree* xml_tree::load(int fd, char* name, bool load_tc, int sf) return tree; } -uint32_t xml_tree::subtree_elements(xml_tree::node_t x) const -{ - - uint32_t size = bp_subtree_size(par, x); - if (x == root()){ - x = bp_first_child(par,x); - size = size - 1; - }; - - int s = x + 2*size - 1; - int ntext = - tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, s) - - tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, x-1); - size = size - ntext; - xml_tree::node_t fin = bp_find_close(par, x); - xml_tree::node_t y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, x); - while (y != xml_tree::NIL && y < fin){ - size -= subtree_size(y); - y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, y); - }; - return size; - } uint32_t xml_tree::num_children(xml_tree::node_t x) const { @@ -388,12 +381,17 @@ xml_tree::node_t xml_tree::child(xml_tree::node_t x, uint32_t i) const std::pair xml_tree::text_id_range(xml_tree::node_t x) const { int32_t i, j; - i = text_positions->rank1(x - 1); - j = text_positions->rank1(x + 2 * bp_subtree_size(par, x) - 2); + xml_tree::node_t y = bp_find_close(par, x); + if (x == root()) + i = 0; + else + i = text_positions->rank1(x-1); + j = text_positions->rank1(y); + if (i == j) return std::make_pair(xml_tree::NIL, xml_tree::NIL); else - return std::make_pair(i + 1, j); + return std::make_pair(i, j); } int32_t xml_tree::text_id(xml_tree::node_t x) const @@ -401,9 +399,9 @@ int32_t xml_tree::text_id(xml_tree::node_t x) const return (int32_t) text_positions->rank1(x) - 1; } -unsigned char* xml_tree::get_text(int32_t id) const +const char* xml_tree::get_text(int32_t id) const { - unsigned char * s = text_collection->GetText(id); + const char * s = reinterpret_cast(text_collection->GetText(id)); return s + (s[0] == 1); } @@ -413,6 +411,7 @@ void xml_tree::uflush(int fd) if (size < BUFFER_SIZE) return; uflush_r(fd, size); } + void xml_tree::flush(int fd) { uflush_r(fd, print_buffer->size()); @@ -421,16 +420,18 @@ void xml_tree::flush(int fd) void xml_tree::uflush_r(int fd, size_t s) { - if (s == 0) return; + if (s == 0 || print_buffer == 0 || fd <= 0) return; size_t written; while (1) { written = write(fd, print_buffer->data(), s); - if ((written < 0) && (errno == EAGAIN || errno == EINTR)) + if ((written < 0) && (errno == EAGAIN || errno == EINTR)){ continue; + }; break; }; print_buffer->clear(); } + void xml_tree::uput_str(std::string s, int fd) { print_buffer->append(s); @@ -449,17 +450,16 @@ void xml_tree::uputc(const char c, int fd) const char * xml_tree::get_tag_name_by_ref(xml_tree::tag_t tagid) const { - - unsigned char *s; if (tagid < 0 || tagid >= tag_names->size()) return ""; - return (const char *) (*tag_names)[tagid].c_str(); + return (*tag_names)[tagid].c_str(); } xml_tree::tag_t xml_tree::register_tag(char *s) { - auto found = tag_ids->find(std::string(s)); + std::unordered_map::iterator found; + found = tag_ids->find(std::string(s)); if (found == tag_ids->end()) return xml_tree::NIL_TAG_ID; else @@ -496,27 +496,38 @@ size_t xml_tree::uprintf(const char*s, int fd) return i; } -void xml_tree::print(int fd, xml_tree::node_t x, bool no_text) +void xml_tree::print(xml_tree::node_t x, int fd, bool no_text) { + if (print_buffer == 0) { print_buffer = new std::string(BUFFER_SIZE, 0); print_buffer->clear(); print_stack = new std::vector(); print_stack->reserve(256); }; + xml_tree::node_t fin = bp_find_close(par, x); xml_tree::node_t n = x; xml_tree::tag_t label = tag(n); unsigned char * orig_text; unsigned char * current_text; - auto r = text_id_range(x); + std::pair r = text_id_range(x); if (r.first == xml_tree::NIL) current_text = 0; - else - current_text = get_text(r.first); + else { + current_text = text_collection->GetText(r.first, r.second); + current_text += (current_text[0] == 1); + }; + + //fprintf(stderr, "Printing node %i, tag: %s\n", x, get_tag_name_by_ref(tag(x))); + // fprintf(stderr, "Beggining of text is (%i):%s\n", r.first, current_text); + //fflush(stderr); + + orig_text = current_text; + size_t read = 0; while (n <= fin) { @@ -557,6 +568,7 @@ void xml_tree::print(int fd, xml_tree::node_t x, bool no_text) uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd); n++; uputs("=\"", fd); + current_text += (current_text[0] == 1); read = uprintf((const char*) current_text, fd); current_text += read + 1; uputc('"', fd); @@ -570,11 +582,11 @@ void xml_tree::print(int fd, xml_tree::node_t x, bool no_text) label = tag(n); } else uputc('>', fd); - } else { + } else { uputs("/>", fd); n++; label = tag(n); - }; + }; }; } else do { uputs("', fd); print_stack->pop_back(); n++; - } while (!bp_inspect(par, n) || print_stack->empty()); + } while (!bp_inspect(par, n) && !print_stack->empty()); label = tag(n); }; uputc('\n', fd); @@ -593,3 +605,87 @@ void xml_tree::print(int fd, xml_tree::node_t x, bool no_text) text_collection->DeleteText(orig_text); } + + +static inline uchar * next_char(uchar *s, size_t &numtexts) +{ + uchar c; + s++; + c = *s; + while (c <= 1){ + if (c == 0) { + numtexts--; + if (numtexts == 0) return 0; + }; + s++; + c = *s; + }; + return s; +} + +static inline bool naive_char_contains(uchar const *s, uchar c, size_t numtexts) +{ + uchar sc; + while(numtexts != 0){ + sc = *s; + if (c == sc) return true; + if (sc == 0) numtexts--; + s++; + }; + return false; +} + +bool xml_tree::naive_contains(xml_tree::node_t n, uchar const *w) const +{ + if (w[0] == 0) + return true; + // fprintf(stderr, "Calling naive_contains on node: %i, with tag: %s\n", + // n, + // get_tag_name_by_ref(tag(n))); + // fflush(stderr); + std::pair range = text_id_range(n); + + //pattern is at least one char + if (range.first == xml_tree::NIL) + return false; + + uchar * text = this->text_collection->GetText(range.first, range.second); + size_t num_texts = + subtree_tags(n, ATTRIBUTE_DATA_OPEN_TAG_ID) + + subtree_tags(n, PCDATA_OPEN_TAG_ID); + + if (w[1] == 0) + return naive_char_contains(text, w[0], num_texts); + + //KMP is overkill + uchar * s = text; + uchar * ss; + uchar const *ww; + size_t nnum_texts; + while (true) { + // fprintf(stderr, "Current char in s: %c, num_texts is: %lu\n", *s, num_texts); + // fflush(stderr); + ss = s; + ww = w; + nnum_texts = num_texts; + while (true){ + // fprintf(stderr, " Current char in w: %c, num_texts is: %lu\n", *ww, num_texts); + // fprintf(stderr, " Current char in s: %c\n", *ss); + // fflush(stderr); + + if (*ww == 0) return true; + if (*ww != *ss) break; + ww++; + ss = next_char(ss, nnum_texts); + if (ss == 0) return (*ww == 0); + }; + // fprintf(stderr, "Not found, returning\n"); + // fflush(stderr); + // fprintf(stderr, "Current string s is: %s\n", s); + s = next_char(s, num_texts); + if (s == 0) return false; +// fprintf(stderr, "After next_char, string s is: %s\n", s); +// fflush(stderr); + }; + +}