12 #include "xml-tree.hpp"
16 const xml_tree::node_t xml_tree::NIL;
17 const xml_tree::node_t xml_tree::ROOT;
20 const xml_tree::tag_t xml_tree::NIL_TAG_ID;
21 const char* xml_tree::NIL_TAG = "<!NIL!>";
22 const xml_tree::tag_t xml_tree::DOCUMENT_OPEN_TAG_ID;
23 const char* xml_tree::DOCUMENT_OPEN_TAG = "";
24 const xml_tree::tag_t xml_tree::ATTRIBUTE_OPEN_TAG_ID;
25 const char* xml_tree::ATTRIBUTE_OPEN_TAG = "<@>";
26 const xml_tree::tag_t xml_tree::PCDATA_OPEN_TAG_ID;
27 const char* xml_tree::PCDATA_OPEN_TAG = "<$>";
28 const xml_tree::tag_t xml_tree::ATTRIBUTE_DATA_OPEN_TAG_ID;
29 const char* xml_tree::ATTRIBUTE_DATA_OPEN_TAG = "<@$>";
30 const xml_tree::tag_t xml_tree::CLOSE_TAG_ID;
31 const char* xml_tree::CLOSE_TAG = "</>";
34 static int bits8 (int t ) {
45 static bool array_mem(xml_tree::tag_t *a, xml_tree::tag_t t)
47 for(; *a != xml_tree::NIL_TAG_ID; a++){
48 if (*a >= t) return (*a == t);
53 static void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream)
56 res = fread(ptr,size,nmemb,stream);
58 throw std::runtime_error("ufread I/O error" );
62 static void ufwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
65 res = fwrite(ptr,size,nmemb,stream);
67 throw std::runtime_error("ufwrite I/O error");
79 xml_tree::xml_tree(std::vector<int32_t> *tags,
80 std::unordered_map<std::string, int32_t> *tag_ids,
81 bit_vector *parbitmap,
83 SXSI::TextCollectionBuilder *tc_builder,
84 SXSI::TextCollectionBuilder::index_type_t idx_type,
85 bit_vector *textbitmap)
90 size_t npar = parbitmap->size();
93 par = bp_construct(npar,
94 parbitmap->get_vector_ptr(),
98 this->tag_ids = tag_ids;
99 tag_names = new std::vector<std::string>();
100 tag_names->resize(tag_ids->size());
101 for(auto val : *(this->tag_ids))
102 (*this->tag_names)[val.second] = val.first;
104 uint32_t max_tag = tag_names->size() - 1;
105 static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray();
106 alphabet_mapper *am = new alphabet_mapper_none();
108 this->tags = new static_sequence_bs((uint32_t*)&((*tags)[0]), npar, am, bmb);
109 bits_per_tag = bits8(max_tag);
111 tag_seq = new uint32_t[uint_len(bits_per_tag, tag_seq_len)];
112 for(size_t i = 0; i < (size_t) tag_seq_len; i++)
113 set_field(tag_seq, bits_per_tag, i, (uint32_t) (*tags)[i]);
122 uint32_t * textbm = textbitmap->get_vector_ptr();
123 text_positions = new static_bitsequence_rrr02(textbm,
129 this->text_index_type = idx_type;
130 text_collection = tc_builder->InitTextCollection();
137 xml_tree::~xml_tree()
145 if (text_collection) delete text_collection;
146 if (text_positions) delete text_positions;
149 bool xml_tree::is_child(xml_tree::node_t x, xml_tree::node_t y) const
151 return !is_leaf(x) && is_ancestor(y, x) && depth(x)+1 == depth(y);
154 uint32_t xml_tree::depth(xml_tree::node_t x) const
156 return bp_depth(this->par, x);
160 uint32_t xml_tree::preorder(xml_tree::node_t x) const
162 return bp_preorder_rank(this->par, x);
165 uint32_t xml_tree::postorder(xml_tree::node_t x) const
167 return bp_postorder_rank(this->par, x);
171 xml_tree::select_child(xml_tree::node_t x, xml_tree::tag_t *tags) const
173 if (is_nil(x) || is_leaf(x)) return xml_tree::NIL;
174 xml_tree::node_t child = first_child(x);
175 if (array_mem(tags, tag(child))) return child;
176 return select_sibling(child, tags);
180 xml_tree::select_descendant(xml_tree::node_t x, xml_tree::tag_t *tags) const
182 if (is_leaf(x)) return xml_tree::NIL;
183 auto min = xml_tree::NIL;
184 auto aux = xml_tree::NIL;
185 for(; *tags != xml_tree::NIL_TAG_ID; ++tags){
186 aux = tagged_descendant(x, *tags);
187 if ((unsigned int) aux < (unsigned int) min) min = aux;
193 xml_tree::select_sibling(xml_tree::node_t x, xml_tree::tag_t *tags) const
195 xml_tree::node_t sibling = next_sibling(x);
197 while(!is_nil(sibling)) {
199 if (array_mem(tags, t)) return sibling;
200 sibling = next_sibling(sibling);
206 xml_tree::select_following_before(xml_tree::node_t x, xml_tree::tag_t *tags,
207 xml_tree::node_t limit) const
209 auto min = xml_tree::NIL;
210 auto aux = xml_tree::NIL;
211 for(; *tags != xml_tree::NIL_TAG_ID; ++tags){
212 aux = tagged_following_before(x, *tags, limit);
213 if ((unsigned int) aux < (unsigned int) min) min = aux;
218 void xml_tree::save(int fd, char* s)
223 fp = fdopen(fd2, "w");
225 if (fd == 0) throw(std::runtime_error(strerror(errno)));
226 saveTree(par, fp); //TODO use new api
228 int ntags = tag_names->size();
230 ufwrite(&ntags, sizeof(int), 1, fp);
231 for (int i = 0; i < ntags;i++)
232 fprintf(fp, "%s\n", tag_names->at(i).c_str());
236 ufwrite(&bits_per_tag, sizeof(uint),1,fp);
237 ufwrite(&tag_seq_len, sizeof(uint),1,fp);
238 ufwrite(tag_seq, sizeof(uint), uint_len(bits_per_tag, tag_seq_len), fp);
240 bool disable_tc = text_collection == 0 || text_positions == 0;
242 ufwrite(&disable_tc, sizeof(bool),1,fp);
245 text_positions->save(fp);
246 ufwrite(&text_index_type,
247 sizeof(TextCollectionBuilder::index_type_t),
251 switch (text_index_type){
252 case TextCollectionBuilder::index_type_default:
253 file.append("_default");
255 case TextCollectionBuilder::index_type_swcsa:
256 file.append("_swcsa");
258 case TextCollectionBuilder::index_type_rlcsa:
259 file.append("_rlcsa");
263 text_collection->Save(fp, file.c_str());
271 xml_tree* xml_tree::load(int fd, char* name, bool load_tc, int sf)
278 fp = fdopen(fd, "r");
279 xml_tree *tree = new xml_tree();
281 tree->par = loadTree(fp); //TODO use new api
282 tree->tag_names = new std::vector<std::string>();
283 tree->tag_ids = new std::unordered_map<std::string, xml_tree::tag_t>();
287 ufread(&ntags, sizeof(int), 1, fp);
289 for (i = 0; i < ntags; i++) {
290 if (fgets(buffer, 1022, fp) != buffer)
291 throw std::runtime_error("xml_tree::load: cannot read tag list");
293 // remove the trailing \n
295 tree->tag_names->push_back(s);
296 tree->tag_ids->insert(std::make_pair(s,
297 static_cast<xml_tree::tag_t>(i)));
301 tree->tags = static_sequence::load(fp);
302 ufread(&tree->bits_per_tag, sizeof(uint), 1, fp);
303 fprintf(stderr, "\nBits per tag: %u\n", tree->bits_per_tag);
304 ufread(&tree->tag_seq_len, sizeof(uint), 1, fp);
305 size_t size = uint_len(tree->bits_per_tag, tree->tag_seq_len);
306 tree->tag_seq = new uint[size];
307 ufread(tree->tag_seq, sizeof(uint), size, fp);
310 ufread(&disable_tc, sizeof(bool), 1, fp);
312 if (disable_tc || !load_tc) {
313 tree->text_positions = 0;
314 tree->text_collection = 0;
317 tree->text_positions = static_bitsequence_rrr02::load(fp);
319 ufread(&tree->text_index_type,
320 sizeof(TextCollectionBuilder::index_type_t), 1, fp);
322 std::string file(name);
323 switch (tree->text_index_type){
324 case TextCollectionBuilder::index_type_default:
325 file.append("_default");
327 case TextCollectionBuilder::index_type_swcsa:
328 file.append("_swcsa");
330 case TextCollectionBuilder::index_type_rlcsa:
331 file.append("_rlcsa");
336 tree->text_collection =
337 TextCollection::Load(fp,
339 TextCollection::index_mode_default,
348 uint32_t xml_tree::num_children(xml_tree::node_t x) const
350 return bp_degree(par, x);
353 uint32_t xml_tree::child_pos(xml_tree::node_t x) const
355 return bp_child_rank(par, x);
358 xml_tree::node_t xml_tree::child(xml_tree::node_t x, uint32_t i) const
360 if (i < 10) return bp_naive_child(par, x, i);
361 else bp_child(par, x, i);
364 std::pair<int32_t, int32_t> xml_tree::text_id_range(xml_tree::node_t x) const
367 i = text_positions->rank1(x) - 1;
368 j = text_positions->rank1(x + 2 * bp_subtree_size(par, x) - 2);
370 return std::make_pair(xml_tree::NIL, xml_tree::NIL);
372 return std::make_pair(i + 1, j);
375 int32_t xml_tree::text_id(xml_tree::node_t x) const
377 return (int32_t) text_positions->rank1(x) - 1;
380 unsigned char* xml_tree::get_text(int32_t id) const
382 unsigned char * s = text_collection->GetText(id);
383 return s + (s[0] == 1);
386 void xml_tree::uflush(int fd)
388 size_t size = print_buffer->size();
389 if (size < BUFFER_SIZE) return;
393 void xml_tree::flush(int fd)
395 uflush_r(fd, print_buffer->size());
399 void xml_tree::uflush_r(int fd, size_t s)
401 if (s == 0 || print_buffer == 0 || fd <= 0) return;
404 written = write(fd, print_buffer->data(), s);
405 if ((written < 0) && (errno == EAGAIN || errno == EINTR)){
410 print_buffer->clear();
413 void xml_tree::uput_str(std::string s, int fd)
415 print_buffer->append(s);
418 void xml_tree::uputs(const char* s, int fd)
420 print_buffer->append(s);
423 void xml_tree::uputc(const char c, int fd)
425 print_buffer->push_back(c);
429 const char * xml_tree::get_tag_name_by_ref(xml_tree::tag_t tagid) const
433 if (tagid < 0 || tagid >= tag_names->size())
434 return "<INVALID TAG>";
436 return (const char *) (*tag_names)[tagid].c_str();
439 xml_tree::tag_t xml_tree::register_tag(char *s)
441 auto found = tag_ids->find(std::string(s));
442 if (found == tag_ids->end())
443 return xml_tree::NIL_TAG_ID;
445 return (*found).second;
448 size_t xml_tree::uprintf(const char*s, int fd)
450 if (s == 0) return 0;
453 for (i = 0; (c = s[i]); i++) {
478 // void xml_tree::print(xml_tree::node_t x, int fd, bool no_text)
481 // if (print_buffer == 0) {
482 // print_buffer = new std::string(BUFFER_SIZE, 0);
483 // print_buffer->clear();
484 // print_stack = new std::vector<std::string>();
485 // print_stack->reserve(256);
488 // xml_tree::node_t fin = bp_find_close(par, x);
489 // xml_tree::node_t n = x;
490 // xml_tree::tag_t label = tag(n);
491 // unsigned char * orig_text;
492 // unsigned char * current_text;
494 // auto r = text_id_range(x);
495 // if (r.first == xml_tree::NIL)
498 // current_text = text_collection->GetText(r.first, r.second);
499 // current_text += (current_text[0] == 1);
501 // orig_text = current_text;
505 // while (n <= fin) {
507 // if (bp_inspect(par, n)) {
508 // if (label == xml_tree::PCDATA_OPEN_TAG_ID){
510 // uputs("<$/>", fd);
512 // read = uprintf( (const char*) current_text, fd);
513 // current_text += read + 1;
515 // n += 2; // skip closin $
520 // uput_str((*tag_names)[label], fd);
522 // if (bp_inspect(par, n)) {
523 // print_stack->push_back((*tag_names)[label]);
525 // if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) {
527 // if (no_text) uputs("><@@>", fd);
529 // while (bp_inspect(par, n))
532 // uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
534 // uputs("<$@/></", fd);
535 // uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
540 // uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
543 // read = uprintf((const char*) current_text, fd);
544 // current_text += read + 1;
550 // uputs("</@@>", fd);
551 // else uputc('>', fd);
564 // uput_str(print_stack->back(), fd);
566 // print_stack->pop_back();
568 // } while (!bp_inspect(par, n) && !print_stack->empty());
572 // if (orig_text && text_index_type != TextCollectionBuilder::index_type_default)
573 // if (*orig_text == '\0')
574 // text_collection->DeleteText(orig_text - 1);
576 // text_collection->DeleteText(orig_text);
579 void xml_tree::print(xml_tree::node_t x, int fd, bool no_text)
581 for (int i = 0; i < 50; i++)
582 fprintf(stderr, "Printing text number %i: %s\n", i, get_text(i));
584 if (print_buffer == 0) {
585 print_buffer = new std::string(BUFFER_SIZE, 0);
586 print_buffer->clear();
587 print_stack = new std::vector<std::string>();
588 print_stack->reserve(256);
591 xml_tree::node_t fin = bp_find_close(par, x);
592 xml_tree::node_t n = x;
593 xml_tree::tag_t label = tag(n);
594 unsigned char * current_text;
599 if (bp_inspect(par, n)) {
600 if (label == xml_tree::PCDATA_OPEN_TAG_ID){
604 current_text = get_text(text_id(n));
605 uprintf( (const char*) (current_text + (current_text[0] == 1)), fd);
607 if (current_text && text_index_type != TextCollectionBuilder::index_type_default)
608 text_collection->DeleteText(current_text);
610 n += 2; // skip closin $
615 uput_str((*tag_names)[label], fd);
617 if (bp_inspect(par, n)) {
618 print_stack->push_back((*tag_names)[label]);
620 if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) {
622 if (no_text) uputs("><@@>", fd);
624 while (bp_inspect(par, n))
627 uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
629 uputs("<$@/></", fd);
630 uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
635 uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
638 current_text = get_text(text_id(n));
639 uprintf((const char*) (current_text + (current_text[0] == 1)), fd);
640 if (current_text && text_index_type != TextCollectionBuilder::index_type_default)
641 text_collection->DeleteText(current_text);
661 uput_str(print_stack->back(), fd);
663 print_stack->pop_back();
665 } while (!bp_inspect(par, n) && !print_stack->empty());