12 #include "xml-tree.hpp"
16 const xml_tree::tag_t xml_tree::NIL_TAG_ID;
17 const char* xml_tree::NIL_TAG = "<!NIL!>";
18 const xml_tree::tag_t xml_tree::DOCUMENT_OPEN_TAG_ID;
19 const char* xml_tree::DOCUMENT_OPEN_TAG = "";
20 const xml_tree::tag_t xml_tree::ATTRIBUTE_OPEN_TAG_ID;
21 const char* xml_tree::ATTRIBUTE_OPEN_TAG = "<@>";
22 const xml_tree::tag_t xml_tree::PCDATA_OPEN_TAG_ID;
23 const char* xml_tree::PCDATA_OPEN_TAG = "<$>";
24 const xml_tree::tag_t xml_tree::ATTRIBUTE_DATA_OPEN_TAG_ID;
25 const char* xml_tree::ATTRIBUTE_DATA_OPEN_TAG = "<@$>";
26 const xml_tree::tag_t xml_tree::CLOSE_TAG_ID;
27 const char* xml_tree::CLOSE_TAG = "</>";
29 static int bits8 (int t ) {
42 static void ufread(void *ptr, size_t size, size_t nmemb, FILE *stream)
45 res = fread(ptr,size,nmemb,stream);
47 throw std::runtime_error("ufread I/O error" );
51 static void ufwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
54 res = fwrite(ptr,size,nmemb,stream);
56 throw std::runtime_error("ufwrite I/O error");
68 xml_tree::xml_tree(std::vector<int32_t> *tags,
69 std::unordered_map<std::string, int32_t> *tag_ids,
70 bit_vector *parbitmap,
72 SXSI::TextCollectionBuilder *tc_builder,
73 SXSI::TextCollectionBuilder::index_type_t idx_type,
74 bit_vector *textbitmap)
79 size_t npar = parbitmap->size();
81 par = bp_construct(npar,
82 parbitmap->get_vector_ptr(),
86 this->tag_ids = tag_ids;
87 tag_names = new std::vector<std::string>();
88 tag_names->resize(tag_ids->size());
89 for(auto val : *(this->tag_ids))
90 (*this->tag_names)[val.second] = val.first;
92 uint32_t max_tag = tag_names->size() - 1;
93 static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray();
94 alphabet_mapper *am = new alphabet_mapper_none();
95 this->tags = new static_sequence_bs((uint32_t *) &tags[0], npar, am, bmb);
96 bits_per_tag = bits8(max_tag);
98 tag_seq = new uint32_t[uint_len(bits_per_tag, tag_seq_len)];
99 for(size_t i = 0; i < (size_t) tag_seq_len; i++)
100 set_field(tag_seq, bits_per_tag, i, (uint32_t) (*tags)[i]);
109 uint32_t * textbm = textbitmap->get_vector_ptr();
110 text_positions = new static_bitsequence_rrr02(textbm,
115 this->text_index_type = idx_type;
116 text_collection = tc_builder->InitTextCollection();
123 xml_tree::~xml_tree()
131 if (text_collection) delete text_collection;
132 if (text_positions) delete text_positions;
135 bool xml_tree::is_child(xml_tree::node_t x, xml_tree::node_t y) const
137 return !is_leaf(x) && is_ancestor(y, x) && depth(x)+1 == depth(y);
140 uint32_t xml_tree::depth(xml_tree::node_t x) const
142 return bp_depth(this->par, x);
146 uint32_t xml_tree::preorder(xml_tree::node_t x) const
148 return bp_preorder_rank(this->par, x);
151 uint32_t xml_tree::postorder(xml_tree::node_t x) const
153 return bp_postorder_rank(this->par, x);
157 xml_tree::select_child(xml_tree::node_t x,
158 std::unordered_set<tag_t> *tags) const
160 if (is_nil(x) || is_leaf(x)) return xml_tree::NIL;
161 xml_tree::node_t child = first_child(x);
162 if (tags->find(tag(child)) != tags->end()) return child;
163 return select_sibling(child, tags);
167 xml_tree::select_descendant(xml_tree::node_t x,
168 std::unordered_set<tag_t> *tags) const
170 if (is_leaf(x)) return xml_tree::NIL;
171 auto min = xml_tree::NIL;
172 auto aux = xml_tree::NIL;
173 for(auto tag = tags->begin(); tag != tags->end(); ++ tag){
174 aux = tagged_descendant(x, *tag);
175 if ((unsigned int) aux < (unsigned int) min) min = aux;
182 xml_tree::select_sibling(xml_tree::node_t x,
183 std::unordered_set<tag_t> *tags) const
185 xml_tree::node_t sibling = next_sibling(x);
186 while(!is_nil(sibling) && tags->find(tag(sibling)) == tags->end())
187 sibling = next_sibling(sibling);
192 xml_tree::select_following_before(xml_tree::node_t x,
193 std::unordered_set<tag_t> *tags,
194 xml_tree::node_t limit) const
196 auto min = xml_tree::NIL;
197 auto aux = xml_tree::NIL;
198 for(auto tag = tags->begin(); tag != tags->end(); ++tag){
199 aux = tagged_following_before(x, *tag, limit);
200 if ((unsigned int) aux < (unsigned int) min) min = aux;
205 void xml_tree::save(int fd, char* s)
210 fp = fdopen(fd2, "w");
212 if (fd == 0) throw(std::runtime_error(strerror(errno)));
213 saveTree(par, fp); //TODO use new api
215 int ntags = tag_names->size();
217 ufwrite(&ntags, sizeof(int), 1, fp);
218 for (int i = 0; i < ntags;i++)
219 fprintf(fp, "%s\n", tag_names->at(i).c_str());
223 ufwrite(&bits_per_tag, sizeof(uint),1,fp);
224 ufwrite(&tag_seq_len, sizeof(uint),1,fp);
225 ufwrite(tag_seq, sizeof(uint), uint_len(bits_per_tag, tag_seq_len), fp);
227 bool disable_tc = text_collection == 0 || text_positions == 0;
228 ufwrite(&disable_tc, sizeof(bool),1,fp);
230 text_positions->save(fp);
232 ufwrite(&text_index_type,
233 sizeof(TextCollectionBuilder::index_type_t),
237 switch (text_index_type){
238 case TextCollectionBuilder::index_type_default:
239 file.append("_default");
241 case TextCollectionBuilder::index_type_swcsa:
242 file.append("_swcsa");
244 case TextCollectionBuilder::index_type_rlcsa:
245 file.append("_rlcsa");
249 text_collection->Save(fp, file.c_str());
256 //static xml_tree* load(char*, bool, int);
257 // void print(int, node_t, bool no_text=false);
258 xml_tree* xml_tree::load(int fd, char* name, bool load_tc, int sf)
265 fp = fdopen(fd, "r");
267 xml_tree *tree = new xml_tree();
269 tree->par = loadTree(fp); //TODO use new api
270 tree->tag_names = new std::vector<std::string>();
271 tree->tag_ids = new std::unordered_map<std::string, xml_tree::tag_t>();
275 ufread(&ntags, sizeof(int), 1, fp);
277 for (i = 0; i < ntags; i++) {
278 if (fgets(buffer, 1022, fp) != buffer)
279 throw std::runtime_error("xml_tree::load: cannot read tag list");
281 // remove the trailing \n
283 tree->tag_names->push_back(s);
284 tree->tag_ids->insert(std::make_pair(s,
285 static_cast<xml_tree::tag_t>(i)));
289 tree->tags = static_sequence::load(fp);
290 ufread(&tree->bits_per_tag, sizeof(uint), 1, fp);
291 ufread(&tree->tag_seq_len, sizeof(uint), 1, fp);
292 size_t size = uint_len(tree->bits_per_tag, tree->tag_seq_len);
293 tree->tag_seq = new uint[size];
294 ufread(tree->tag_seq, sizeof(uint), size, fp);
297 ufread(&disable_tc, sizeof(bool), 1, fp);
299 if (disable_tc || !load_tc) {
300 tree->text_positions = 0;
301 tree->text_collection = 0;
304 tree->text_positions = static_bitsequence_rrr02::load(fp);
306 ufread(&tree->text_index_type,
307 sizeof(TextCollectionBuilder::index_type_t), 1, fp);
309 std::string file(name);
310 switch (tree->text_index_type){
311 case TextCollectionBuilder::index_type_default:
312 file.append("_default");
314 case TextCollectionBuilder::index_type_swcsa:
315 file.append("_swcsa");
317 case TextCollectionBuilder::index_type_rlcsa:
318 file.append("_rlcsa");
323 tree->text_collection =
324 TextCollection::Load(fp,
326 TextCollection::index_mode_default,
333 uint32_t xml_tree::subtree_elements(xml_tree::node_t x) const
336 uint32_t size = bp_subtree_size(par, x);
338 x = bp_first_child(par,x);
342 int s = x + 2*size - 1;
344 tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, s) -
345 tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, x-1);
348 xml_tree::node_t fin = bp_find_close(par, x);
349 xml_tree::node_t y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, x);
350 while (y != xml_tree::NIL && y < fin){
351 size -= subtree_size(y);
352 y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, y);
357 uint32_t xml_tree::num_children(xml_tree::node_t x) const
359 return bp_degree(par, x);
362 uint32_t xml_tree::child_pos(xml_tree::node_t x) const
364 return bp_child_rank(par, x);
367 xml_tree::node_t xml_tree::child(xml_tree::node_t x, uint32_t i) const
369 if (i < 10) return bp_naive_child(par, x, i);
370 else bp_child(par, x, i);
373 std::pair<int32_t, int32_t> xml_tree::text_id_range(xml_tree::node_t x) const
376 i = text_positions->rank1(x - 1);
377 j = text_positions->rank1(x + 2 * bp_subtree_size(par, x) - 2);
379 return std::make_pair(xml_tree::NIL, xml_tree::NIL);
381 return std::make_pair(i + 1, j);
384 int32_t xml_tree::text_id(xml_tree::node_t x) const
386 return (int32_t) text_positions->rank1(x) - 1;
389 unsigned char* xml_tree::get_text(int32_t id) const
391 unsigned char * s = text_collection->GetText(id);
392 return s + (s[0] == 1);
395 void xml_tree::uflush(int fd)
397 size_t size = print_buffer->size();
398 if (size < BUFFER_SIZE) return;
401 void xml_tree::flush(int fd)
403 uflush_r(fd, print_buffer->size());
407 void xml_tree::uflush_r(int fd, size_t s)
412 written = write(fd, print_buffer->data(), s);
413 if ((written < 0) && (errno == EAGAIN || errno == EINTR))
417 print_buffer->clear();
419 void xml_tree::uput_str(std::string s, int fd)
421 print_buffer->append(s);
424 void xml_tree::uputs(const char* s, int fd)
426 print_buffer->append(s);
429 void xml_tree::uputc(const char c, int fd)
431 print_buffer->push_back(c);
435 const char * xml_tree::get_tag_name_by_ref(xml_tree::tag_t tagid) const
439 if (tagid < 0 || tagid >= tag_names->size())
440 return "<INVALID TAG>";
442 return (const char *) (*tag_names)[tagid].c_str();
445 xml_tree::tag_t xml_tree::register_tag(char *s)
447 auto found = tag_ids->find(std::string(s));
448 if (found == tag_ids->end())
449 return xml_tree::NIL_TAG_ID;
451 return (*found).second;
454 size_t xml_tree::uprintf(const char*s, int fd)
456 if (s == 0) return 0;
459 for (i = 0; (c = s[i]); i++) {
484 void xml_tree::print(int fd, xml_tree::node_t x, bool no_text)
486 if (print_buffer == 0) {
487 print_buffer = new std::string(BUFFER_SIZE, 0);
488 print_buffer->clear();
489 print_stack = new std::vector<std::string>();
490 print_stack->reserve(256);
492 xml_tree::node_t fin = bp_find_close(par, x);
493 xml_tree::node_t n = x;
494 xml_tree::tag_t label = tag(n);
495 unsigned char * orig_text;
496 unsigned char * current_text;
498 auto r = text_id_range(x);
499 if (r.first == xml_tree::NIL)
502 current_text = get_text(r.first);
504 orig_text = current_text;
509 if (bp_inspect(par, n)) {
510 if (label == xml_tree::PCDATA_OPEN_TAG_ID){
514 read = uprintf( (const char*) current_text, fd);
515 current_text += read + 1;
517 n += 2; // skip closin $
522 uput_str((*tag_names)[label], fd);
524 if (bp_inspect(par, n)) {
525 print_stack->push_back((*tag_names)[label]);
527 if (label == xml_tree::ATTRIBUTE_OPEN_TAG_ID) {
529 if (no_text) uputs("><@@>", fd);
531 while (bp_inspect(par, n))
534 uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
536 uputs("<$@/></", fd);
537 uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
542 uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
545 read = uprintf((const char*) current_text, fd);
546 current_text += read + 1;
566 uput_str(print_stack->back(), fd);
568 print_stack->pop_back();
570 } while (!bp_inspect(par, n) || print_stack->empty());
574 if (orig_text && text_index_type != TextCollectionBuilder::index_type_default)
575 if (*orig_text == '\0')
576 text_collection->DeleteText(orig_text - 1);
578 text_collection->DeleteText(orig_text);