12 #include "xml-tree.hpp"
16 const xml_tree::node_t xml_tree::NIL;
17 const xml_tree::node_t xml_tree::ROOT;
20 const xml_tree::tag_t xml_tree::NIL_TAG_ID;
21 const char* xml_tree::NIL_TAG = "<!NIL!>";
22 const xml_tree::tag_t xml_tree::DOCUMENT_OPEN_TAG_ID;
23 const char* xml_tree::DOCUMENT_OPEN_TAG = "";
24 const xml_tree::tag_t xml_tree::ATTRIBUTE_OPEN_TAG_ID;
25 const char* xml_tree::ATTRIBUTE_OPEN_TAG = "<@>";
26 const xml_tree::tag_t xml_tree::PCDATA_OPEN_TAG_ID;
27 const char* xml_tree::PCDATA_OPEN_TAG = "<$>";
28 const xml_tree::tag_t xml_tree::ATTRIBUTE_DATA_OPEN_TAG_ID;
29 const char* xml_tree::ATTRIBUTE_DATA_OPEN_TAG = "<@$>";
30 const xml_tree::tag_t xml_tree::CLOSE_TAG_ID;
31 const char* xml_tree::CLOSE_TAG = "</>";
34 static int bits8 (int t ) {
45 static bool array_mem(xml_tree::tag_t *a, xml_tree::tag_t t)
47 for(; *a != xml_tree::NIL_TAG_ID; a++){
48 if (*a >= t) return (*a == t);
53 static void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream)
56 res = fread(ptr,size,nmemb,stream);
58 throw std::runtime_error("ufread I/O error" );
62 static void ufwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
65 res = fwrite(ptr,size,nmemb,stream);
67 throw std::runtime_error("ufwrite I/O error");
79 xml_tree::xml_tree(std::vector<int32_t> *tags_,
80 std::unordered_map<std::string, int32_t> *tag_ids,
81 bit_vector *parbitmap,
83 SXSI::TextCollectionBuilder *tc_builder,
84 SXSI::TextCollectionBuilder::index_type_t idx_type,
85 bit_vector *textbitmap)
90 size_t npar = parbitmap->size();
92 par = bp_construct(npar,
93 parbitmap->get_vector_ptr(),
97 this->tag_ids = tag_ids;
98 tag_names = new std::vector<std::string>();
99 tag_names->resize(tag_ids->size());
100 std::unordered_map<std::string, tag_t>::iterator val;
101 //for(auto val : *(this->tag_ids))
102 //(*this->tag_names)[val.second] = val.first;
103 for(val = this->tag_ids->begin(); val != this->tag_ids->end(); ++val)
104 (*this->tag_names)[val->second] = val->first;
107 uint32_t max_tag = tag_names->size() - 1;
108 bit_vector *tmp_bitmap = new bit_vector(npar, 1, 0);
110 for(int32_t i = 0; i <= max_tag; i++){
112 for (size_t k = 0; k < npar; k++){
113 bool t = (i == (*tags_)[k]);
114 tmp_bitmap->set(k, t);
117 buff = tmp_bitmap->get_vector_ptr();
118 this->tags.push_back(new static_bitsequence_sdarray(buff, npar, ones));
124 //static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray();
125 //alphabet_mapper *am = new alphabet_mapper_none();
127 // this->tags = new static_sequence_bs((uint32_t*)&((*tags)[0]), npar, am, bmb);
129 bits_per_tag = bits8(max_tag);
131 tag_seq = new uint32_t[uint_len(bits_per_tag, tag_seq_len)];
132 for(size_t i = 0; i < (size_t) tag_seq_len; i++)
133 set_field(tag_seq, bits_per_tag, i, (uint32_t) (*tags_)[i]);
142 uint32_t * textbm = textbitmap->get_vector_ptr();
143 text_positions = new static_bitsequence_rrr02(textbm,
149 this->text_index_type = idx_type;
150 text_collection = tc_builder->InitTextCollection();
156 xml_tree::~xml_tree()
160 for(int i = 0; i < tag_names->size(); i++){
169 if (text_collection) delete text_collection;
170 if (text_positions) delete text_positions;
173 bool xml_tree::is_child(xml_tree::node_t x, xml_tree::node_t y) const
175 return !is_leaf(x) && is_ancestor(y, x) && depth(x)+1 == depth(y);
178 uint32_t xml_tree::depth(xml_tree::node_t x) const
180 return bp_depth(this->par, x);
184 uint32_t xml_tree::preorder(xml_tree::node_t x) const
186 return bp_preorder_rank(this->par, x);
189 uint32_t xml_tree::postorder(xml_tree::node_t x) const
191 return bp_postorder_rank(this->par, x);
195 xml_tree::select_child(xml_tree::node_t x, xml_tree::tag_t *tags) const
197 if (is_nil(x) || is_leaf(x)) return xml_tree::NIL;
198 xml_tree::node_t child = first_child(x);
199 if (array_mem(tags, tag(child))) return child;
200 return select_sibling(child, tags);
205 xml_tree::select_sibling(xml_tree::node_t x, xml_tree::tag_t *tags) const
207 xml_tree::node_t sibling = next_sibling(x);
209 while(!is_nil(sibling)) {
211 if (array_mem(tags, t)) return sibling;
212 sibling = next_sibling(sibling);
218 void xml_tree::save(int fd, char* s)
223 fp = fdopen(fd2, "w");
225 if (fd == 0) throw(std::runtime_error(strerror(errno)));
226 saveTree(par, fp); //TODO use new api
228 int ntags = tag_names->size();
230 ufwrite(&ntags, sizeof(int), 1, fp);
231 for (int i = 0; i < ntags;i++)
232 fprintf(fp, "%s\n", tag_names->at(i).c_str());
235 for(int i = 0; i < ntags; i++)
238 ufwrite(&bits_per_tag, sizeof(uint),1,fp);
239 ufwrite(&tag_seq_len, sizeof(uint),1,fp);
240 ufwrite(tag_seq, sizeof(uint), uint_len(bits_per_tag, tag_seq_len), fp);
242 bool disable_tc = text_collection == 0 || text_positions == 0;
244 ufwrite(&disable_tc, sizeof(bool),1,fp);
247 text_positions->save(fp);
248 ufwrite(&text_index_type,
249 sizeof(TextCollectionBuilder::index_type_t),
253 switch (text_index_type){
254 case TextCollectionBuilder::index_type_default:
255 file.append("_default");
257 case TextCollectionBuilder::index_type_swcsa:
258 file.append("_swcsa");
260 case TextCollectionBuilder::index_type_rlcsa:
261 file.append("_rlcsa");
265 text_collection->Save(fp, file.c_str());
273 xml_tree* xml_tree::load(int fd, char* name, bool load_tc, int sf)
280 fp = fdopen(fd, "r");
281 xml_tree *tree = new xml_tree();
283 tree->par = loadTree(fp); //TODO use new api
284 tree->tag_names = new std::vector<std::string>();
285 tree->tag_ids = new std::unordered_map<std::string, xml_tree::tag_t>();
289 ufread(&ntags, sizeof(int), 1, fp);
291 for (i = 0; i < ntags; i++) {
292 if (fgets(buffer, 1022, fp) != buffer)
293 throw std::runtime_error("xml_tree::load: cannot read tag list");
295 // remove the trailing \n
297 tree->tag_names->push_back(s);
298 tree->tag_ids->insert(std::make_pair(s,
299 static_cast<xml_tree::tag_t>(i)));
303 for(int i = 0; i < ntags; i++){
304 tree->tags.push_back(static_bitsequence_sdarray::load(fp));
307 //tree->tags = static_sequence_bs::load(fp);
308 ufread(&tree->bits_per_tag, sizeof(uint), 1, fp);
309 fprintf(stderr, "\nBits per tag: %u\n", tree->bits_per_tag);
310 ufread(&tree->tag_seq_len, sizeof(uint), 1, fp);
311 size_t size = uint_len(tree->bits_per_tag, tree->tag_seq_len);
312 tree->tag_seq = new uint[size];
313 ufread(tree->tag_seq, sizeof(uint), size, fp);
316 ufread(&disable_tc, sizeof(bool), 1, fp);
318 if (disable_tc || !load_tc) {
319 tree->text_positions = 0;
320 tree->text_collection = 0;
323 tree->text_positions = static_bitsequence_rrr02::load(fp);
325 ufread(&tree->text_index_type,
326 sizeof(TextCollectionBuilder::index_type_t), 1, fp);
328 std::string file(name);
329 switch (tree->text_index_type){
330 case TextCollectionBuilder::index_type_default:
331 file.append("_default");
333 case TextCollectionBuilder::index_type_swcsa:
334 file.append("_swcsa");
336 case TextCollectionBuilder::index_type_rlcsa:
337 file.append("_rlcsa");
342 tree->text_collection =
343 TextCollection::Load(fp,
345 TextCollection::index_mode_default,
354 uint32_t xml_tree::num_children(xml_tree::node_t x) const
356 return bp_degree(par, x);
359 uint32_t xml_tree::child_pos(xml_tree::node_t x) const
361 return bp_child_rank(par, x);
364 xml_tree::node_t xml_tree::child(xml_tree::node_t x, uint32_t i) const
366 if (i < 10) return bp_naive_child(par, x, i);
367 else bp_child(par, x, i);
370 std::pair<int32_t, int32_t> xml_tree::text_id_range(xml_tree::node_t x) const
373 xml_tree::node_t y = bp_find_close(par, x);
377 i = text_positions->rank1(x-1);
378 j = text_positions->rank1(y);
379 // fprintf(stderr, "Rank of node %i is %i, rank of closing %i is %i\n", x, i, y, j);
381 return std::make_pair(xml_tree::NIL, xml_tree::NIL);
383 return std::make_pair(i, j);
386 int32_t xml_tree::text_id(xml_tree::node_t x) const
388 return (int32_t) text_positions->rank1(x) - 1;
391 unsigned char* xml_tree::get_text(int32_t id) const
393 unsigned char * s = text_collection->GetText(id);
394 return s + (s[0] == 1);
397 void xml_tree::uflush(int fd)
399 size_t size = print_buffer->size();
400 if (size < BUFFER_SIZE) return;
404 void xml_tree::flush(int fd)
406 uflush_r(fd, print_buffer->size());
410 void xml_tree::uflush_r(int fd, size_t s)
412 if (s == 0 || print_buffer == 0 || fd <= 0) return;
415 written = write(fd, print_buffer->data(), s);
416 if ((written < 0) && (errno == EAGAIN || errno == EINTR)){
421 print_buffer->clear();
424 void xml_tree::uput_str(std::string s, int fd)
426 print_buffer->append(s);
429 void xml_tree::uputs(const char* s, int fd)
431 print_buffer->append(s);
434 void xml_tree::uputc(const char c, int fd)
436 print_buffer->push_back(c);
440 const char * xml_tree::get_tag_name_by_ref(xml_tree::tag_t tagid) const
444 if (tagid < 0 || tagid >= tag_names->size())
445 return "<INVALID TAG>";
447 return (const char *) (*tag_names)[tagid].c_str();
450 xml_tree::tag_t xml_tree::register_tag(char *s)
452 std::unordered_map<std::string, tag_t>::iterator found;
453 found = tag_ids->find(std::string(s));
454 if (found == tag_ids->end())
455 return xml_tree::NIL_TAG_ID;
457 return (*found).second;
460 size_t xml_tree::uprintf(const char*s, int fd)
462 if (s == 0) return 0;
465 for (i = 0; (c = s[i]); i++) {
490 void xml_tree::print(xml_tree::node_t x, int fd, bool no_text)
493 if (print_buffer == 0) {
494 print_buffer = new std::string(BUFFER_SIZE, 0);
495 print_buffer->clear();
496 print_stack = new std::vector<std::string>();
497 print_stack->reserve(256);
500 xml_tree::node_t fin = bp_find_close(par, x);
501 xml_tree::node_t n = x;
502 xml_tree::tag_t label = tag(n);
503 unsigned char * orig_text;
504 unsigned char * current_text;
506 std::pair<int32_t, int32_t> r = text_id_range(x);
507 if (r.first == xml_tree::NIL)
510 current_text = text_collection->GetText(r.first, r.second);
511 current_text += (current_text[0] == 1);
514 //fprintf(stderr, "Printing node %i, tag: %s\n", x, get_tag_name_by_ref(tag(x)));
515 // fprintf(stderr, "Beggining of text is (%i):%s\n", r.first, current_text);
520 orig_text = current_text;
526 if (bp_inspect(par, n)) {
527 if (label == xml_tree::PCDATA_OPEN_TAG_ID){
531 read = uprintf( (const char*) current_text, fd);
532 current_text += read + 1;
534 n += 2; // skip closin $
539 uput_str((*tag_names)[label], fd);
541 if (bp_inspect(par, n)) {
542 print_stack->push_back((*tag_names)[label]);
544 if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) {
546 if (no_text) uputs("><@@>", fd);
548 while (bp_inspect(par, n))
551 uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
553 uputs("<$@/></", fd);
554 uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
559 uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
562 read = uprintf((const char*) current_text, fd);
563 current_text += read + 1;
583 uput_str(print_stack->back(), fd);
585 print_stack->pop_back();
587 } while (!bp_inspect(par, n) && !print_stack->empty());
591 if (orig_text && text_index_type != TextCollectionBuilder::index_type_default)
592 if (*orig_text == '\0')
593 text_collection->DeleteText(orig_text - 1);
595 text_collection->DeleteText(orig_text);
599 // void xml_tree::print(xml_tree::node_t x, int fd, bool no_text)
601 // if (print_buffer == 0) {
602 // print_buffer = new std::string(BUFFER_SIZE, 0);
603 // print_buffer->clear();
604 // print_stack = new std::vector<std::string>();
605 // print_stack->reserve(256);
608 // xml_tree::node_t fin = bp_find_close(par, x);
609 // xml_tree::node_t n = x;
610 // xml_tree::tag_t label = tag(n);
611 // unsigned char * current_text;
614 // while (n <= fin) {
616 // if (bp_inspect(par, n)) {
617 // if (label == xml_tree::PCDATA_OPEN_TAG_ID){
619 // uputs("<$/>", fd);
621 // current_text = get_text(text_id(n));
622 // uprintf( (const char*) (current_text + (current_text[0] == 1)), fd);
624 // if (current_text && text_index_type != TextCollectionBuilder::index_type_default)
625 // text_collection->DeleteText(current_text);
627 // n += 2; // skip closin $
632 // uput_str((*tag_names)[label], fd);
634 // if (bp_inspect(par, n)) {
635 // print_stack->push_back((*tag_names)[label]);
637 // if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) {
639 // if (no_text) uputs("><@@>", fd);
641 // while (bp_inspect(par, n))
644 // uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
646 // uputs("<$@/></", fd);
647 // uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
652 // uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
655 // current_text = get_text(text_id(n));
656 // uprintf((const char*) (current_text + (current_text[0] == 1)), fd);
657 // if (current_text && text_index_type != TextCollectionBuilder::index_type_default)
658 // text_collection->DeleteText(current_text);
664 // uputs("</@@>", fd);
665 // else uputc('>', fd);
678 // uput_str(print_stack->back(), fd);
680 // print_stack->pop_back();
682 // } while (!bp_inspect(par, n) && !print_stack->empty());
691 static inline uchar * next_char(uchar *s, size_t &numtexts)
699 if (numtexts == 0) return 0;
707 static inline bool naive_char_contains(uchar const *s, uchar c, size_t numtexts)
710 while(numtexts != 0){
712 if (c == sc) return true;
713 if (sc == 0) numtexts--;
719 bool xml_tree::naive_contains(xml_tree::node_t n, uchar const *w) const
723 // fprintf(stderr, "Calling naive_contains on node: %i, with tag: %s\n",
725 // get_tag_name_by_ref(tag(n)));
727 std::pair<int32_t, int32_t> range = text_id_range(n);
729 //pattern is at least one char
730 if (range.first == xml_tree::NIL)
733 uchar * text = this->text_collection->GetText(range.first, range.second);
735 subtree_tags(n, ATTRIBUTE_DATA_OPEN_TAG_ID) +
736 subtree_tags(n, PCDATA_OPEN_TAG_ID);
739 return naive_char_contains(text, w[0], num_texts);
747 // fprintf(stderr, "Current char in s: %c, num_texts is: %lu\n", *s, num_texts);
751 nnum_texts = num_texts;
753 // fprintf(stderr, " Current char in w: %c, num_texts is: %lu\n", *ww, num_texts);
754 // fprintf(stderr, " Current char in s: %c\n", *ss);
757 if (*ww == 0) return true;
758 if (*ww != *ss) break;
760 ss = next_char(ss, nnum_texts);
761 if (ss == 0) return (*ww == 0);
763 // fprintf(stderr, "Not found, returning\n");
765 // fprintf(stderr, "Current string s is: %s\n", s);
766 s = next_char(s, num_texts);
767 if (s == 0) return false;
768 // fprintf(stderr, "After next_char, string s is: %s\n", s);