print_buffer = 0;
}
-xml_tree::xml_tree(std::vector<int32_t> *tags,
+xml_tree::xml_tree(std::vector<int32_t> *tags_,
std::unordered_map<std::string, int32_t> *tag_ids,
bit_vector *parbitmap,
bool disable_tc,
size_t npar = parbitmap->size();
parbitmap->pack();
-
par = bp_construct(npar,
parbitmap->get_vector_ptr(),
OPT_DEGREE);
delete parbitmap;
this->tag_ids = tag_ids;
+
tag_names = new std::vector<std::string>();
tag_names->resize(tag_ids->size());
- for(auto val : *(this->tag_ids))
- (*this->tag_names)[val.second] = val.first;
+ this->attribute_ids = new std::unordered_set<xml_tree::tag_t>();
+ std::unordered_map<std::string, tag_t>::iterator val;
+ for(val = this->tag_ids->begin(); val != this->tag_ids->end(); ++val){
+ (*tag_names)[val->second] = val->first;
+ if (val->first.size() >= 3 &&
+ val->first[0] == '<' &&
+ val->first[1] == '@' &&
+ val->first[2] == '>'){
+ this->attribute_ids->insert(val->second);
+ };
+ }
uint32_t max_tag = tag_names->size() - 1;
- static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray();
- alphabet_mapper *am = new alphabet_mapper_none();
+ bit_vector *tmp_bitmap = new bit_vector(npar, 1, 0);
+ uint32_t * buff;
+ for(int32_t i = 0; i <= max_tag; i++){
+ uint32_t ones = 0;
+ for (size_t k = 0; k < npar; k++){
+ bool t = (i == (*tags_)[k]);
+ tmp_bitmap->set(k, t);
+ ones += t;
+ };
+ buff = tmp_bitmap->get_vector_ptr();
+ this->tags.push_back(new static_bitsequence_sdarray(buff, npar, ones));
+ }
+ delete tmp_bitmap;
+ delete [] buff;
+
+
+ //static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray();
+ //alphabet_mapper *am = new alphabet_mapper_none();
+
+// this->tags = new static_sequence_bs((uint32_t*)&((*tags)[0]), npar, am, bmb);
- this->tags = new static_sequence_bs((uint32_t*)&((*tags)[0]), npar, am, bmb);
bits_per_tag = bits8(max_tag);
tag_seq_len = npar;
tag_seq = new uint32_t[uint_len(bits_per_tag, tag_seq_len)];
for(size_t i = 0; i < (size_t) tag_seq_len; i++)
- set_field(tag_seq, bits_per_tag, i, (uint32_t) (*tags)[i]);
+ set_field(tag_seq, bits_per_tag, i, (uint32_t) (*tags_)[i]);
- delete tags;
+ delete tags_;
if (disable_tc) {
text_positions = 0;
npar,
32);
//delete [] textbm;
- delete textbitmap;
+ //delete textbitmap;
this->text_index_type = idx_type;
text_collection = tc_builder->InitTextCollection();
delete tc_builder;
};
-
}
xml_tree::~xml_tree()
{
bp_delete(par);
- delete tags;
+
+ for(int i = 0; i < tag_names->size(); i++){
+ delete tags[i];
+ tags[i] = 0;
+ };
+
+
delete [] tag_seq;
delete tag_names;
delete tag_ids;
-
+ delete attribute_ids;
if (text_collection) delete text_collection;
if (text_positions) delete text_positions;
}
return select_sibling(child, tags);
}
-xml_tree::node_t
-xml_tree::select_descendant(xml_tree::node_t x, xml_tree::tag_t *tags) const
-{
- if (is_leaf(x)) return xml_tree::NIL;
- auto min = xml_tree::NIL;
- auto aux = xml_tree::NIL;
- for(; *tags != xml_tree::NIL_TAG_ID; ++tags){
- aux = tagged_descendant(x, *tags);
- if ((unsigned int) aux < (unsigned int) min) min = aux;
- };
- return min;
-}
xml_tree::node_t
xml_tree::select_sibling(xml_tree::node_t x, xml_tree::tag_t *tags) const
return sibling;
}
-xml_tree::node_t
-xml_tree::select_following_before(xml_tree::node_t x, xml_tree::tag_t *tags,
- xml_tree::node_t limit) const
-{
- auto min = xml_tree::NIL;
- auto aux = xml_tree::NIL;
- for(; *tags != xml_tree::NIL_TAG_ID; ++tags){
- aux = tagged_following_before(x, *tags, limit);
- if ((unsigned int) aux < (unsigned int) min) min = aux;
- }
- return min;
-}
void xml_tree::save(int fd, char* s)
{
for (int i = 0; i < ntags;i++)
fprintf(fp, "%s\n", tag_names->at(i).c_str());
- tags->save(fp);
+ //tags->save(fp);
+ for(int i = 0; i < ntags; i++)
+ tags[i]->save(fp);
ufwrite(&bits_per_tag, sizeof(uint),1,fp);
ufwrite(&tag_seq_len, sizeof(uint),1,fp);
tree->par = loadTree(fp); //TODO use new api
tree->tag_names = new std::vector<std::string>();
tree->tag_ids = new std::unordered_map<std::string, xml_tree::tag_t>();
+ tree->attribute_ids = new std::unordered_set<xml_tree::tag_t>();
std::string s;
int ntags;
tree->tag_names->push_back(s);
tree->tag_ids->insert(std::make_pair(s,
static_cast<xml_tree::tag_t>(i)));
+ if (s.size() >= 3 && s[0] == '<' && s[1] == '@' && s[2] == '>'){
+ tree->attribute_ids->insert(static_cast<xml_tree::tag_t>(i));
+ };
+
+ };
+ for(int i = 0; i < ntags; i++){
+ tree->tags.push_back(static_bitsequence_sdarray::load(fp));
};
- tree->tags = static_sequence::load(fp);
+ //tree->tags = static_sequence_bs::load(fp);
ufread(&tree->bits_per_tag, sizeof(uint), 1, fp);
+ fprintf(stderr, "\nBits per tag: %u\n", tree->bits_per_tag);
ufread(&tree->tag_seq_len, sizeof(uint), 1, fp);
size_t size = uint_len(tree->bits_per_tag, tree->tag_seq_len);
tree->tag_seq = new uint[size];
return tree;
}
-uint32_t xml_tree::subtree_elements(xml_tree::node_t x) const
-{
-
- uint32_t size = bp_subtree_size(par, x);
- if (x == root()){
- x = bp_first_child(par,x);
- size = size - 1;
- };
-
- int s = x + 2*size - 1;
- int ntext =
- tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, s) -
- tags->rank(xml_tree::PCDATA_OPEN_TAG_ID, x-1);
- size = size - ntext;
- xml_tree::node_t fin = bp_find_close(par, x);
- xml_tree::node_t y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, x);
- while (y != xml_tree::NIL && y < fin){
- size -= subtree_size(y);
- y = tags->select_next(xml_tree::ATTRIBUTE_OPEN_TAG_ID, y);
- };
- return size;
- }
uint32_t xml_tree::num_children(xml_tree::node_t x) const
{
std::pair<int32_t, int32_t> xml_tree::text_id_range(xml_tree::node_t x) const
{
int32_t i, j;
- i = text_positions->rank1(x - 1);
- j = text_positions->rank1(x + 2 * bp_subtree_size(par, x) - 2);
+ xml_tree::node_t y = bp_find_close(par, x);
+ if (x == root())
+ i = 0;
+ else
+ i = text_positions->rank1(x-1);
+ j = text_positions->rank1(y);
+// fprintf(stderr, "Rank of node %i is %i, rank of closing %i is %i\n", x, i, y, j);
if (i == j)
return std::make_pair(xml_tree::NIL, xml_tree::NIL);
else
- return std::make_pair(i + 1, j);
+ return std::make_pair(i, j);
}
int32_t xml_tree::text_id(xml_tree::node_t x) const
return (int32_t) text_positions->rank1(x) - 1;
}
-unsigned char* xml_tree::get_text(int32_t id) const
+const char* xml_tree::get_text(int32_t id) const
{
- unsigned char * s = text_collection->GetText(id);
+ const char * s = reinterpret_cast<const char*>(text_collection->GetText(id));
return s + (s[0] == 1);
}
void xml_tree::uflush_r(int fd, size_t s)
{
- if (s == 0) return;
+ if (s == 0 || print_buffer == 0 || fd <= 0) return;
size_t written;
while (1) {
written = write(fd, print_buffer->data(), s);
const char * xml_tree::get_tag_name_by_ref(xml_tree::tag_t tagid) const
{
-
- unsigned char *s;
if (tagid < 0 || tagid >= tag_names->size())
return "<INVALID TAG>";
- return (const char *) (*tag_names)[tagid].c_str();
+ return (*tag_names)[tagid].c_str();
}
xml_tree::tag_t xml_tree::register_tag(char *s)
{
- auto found = tag_ids->find(std::string(s));
+ std::unordered_map<std::string, tag_t>::iterator found;
+ found = tag_ids->find(std::string(s));
if (found == tag_ids->end())
return xml_tree::NIL_TAG_ID;
else
unsigned char * orig_text;
unsigned char * current_text;
- auto r = text_id_range(x);
+ std::pair<int32_t, int32_t> r = text_id_range(x);
if (r.first == xml_tree::NIL)
current_text = 0;
- else
- current_text = get_text(r.first);
+ else {
+ current_text = text_collection->GetText(r.first, r.second);
+ current_text += (current_text[0] == 1);
+ };
+
+ //fprintf(stderr, "Printing node %i, tag: %s\n", x, get_tag_name_by_ref(tag(x)));
+ // fprintf(stderr, "Beggining of text is (%i):%s\n", r.first, current_text);
+ //fflush(stderr);
+
+
orig_text = current_text;
+
size_t read = 0;
while (n <= fin) {
uputs((const char*) &(get_tag_name_by_ref(tag(n))[3]), fd);
n++;
uputs("=\"", fd);
+ current_text += (current_text[0] == 1);
read = uprintf((const char*) current_text, fd);
current_text += read + 1;
uputc('"', fd);
text_collection->DeleteText(orig_text);
}
+
+
+static inline uchar * next_char(uchar *s, size_t &numtexts)
+{
+ uchar c;
+ s++;
+ c = *s;
+ while (c <= 1){
+ if (c == 0) {
+ numtexts--;
+ if (numtexts == 0) return 0;
+ };
+ s++;
+ c = *s;
+ };
+ return s;
+}
+
+static inline bool naive_char_contains(uchar const *s, uchar c, size_t numtexts)
+{
+ uchar sc;
+ while(numtexts != 0){
+ sc = *s;
+ if (c == sc) return true;
+ if (sc == 0) numtexts--;
+ s++;
+ };
+ return false;
+}
+
+bool xml_tree::naive_contains(xml_tree::node_t n, uchar const *w) const
+{
+ if (w[0] == 0)
+ return true;
+ // fprintf(stderr, "Calling naive_contains on node: %i, with tag: %s\n",
+ // n,
+ // get_tag_name_by_ref(tag(n)));
+ // fflush(stderr);
+ std::pair<int32_t, int32_t> range = text_id_range(n);
+
+ //pattern is at least one char
+ if (range.first == xml_tree::NIL)
+ return false;
+
+ uchar * text = this->text_collection->GetText(range.first, range.second);
+ size_t num_texts =
+ subtree_tags(n, ATTRIBUTE_DATA_OPEN_TAG_ID) +
+ subtree_tags(n, PCDATA_OPEN_TAG_ID);
+
+ if (w[1] == 0)
+ return naive_char_contains(text, w[0], num_texts);
+
+ //KMP is overkill
+ uchar * s = text;
+ uchar * ss;
+ uchar const *ww;
+ size_t nnum_texts;
+ while (true) {
+ // fprintf(stderr, "Current char in s: %c, num_texts is: %lu\n", *s, num_texts);
+ // fflush(stderr);
+ ss = s;
+ ww = w;
+ nnum_texts = num_texts;
+ while (true){
+ // fprintf(stderr, " Current char in w: %c, num_texts is: %lu\n", *ww, num_texts);
+ // fprintf(stderr, " Current char in s: %c\n", *ss);
+ // fflush(stderr);
+
+ if (*ww == 0) return true;
+ if (*ww != *ss) break;
+ ww++;
+ ss = next_char(ss, nnum_texts);
+ if (ss == 0) return (*ww == 0);
+ };
+ // fprintf(stderr, "Not found, returning\n");
+ // fflush(stderr);
+ // fprintf(stderr, "Current string s is: %s\n", s);
+ s = next_char(s, num_texts);
+ if (s == 0) return false;
+// fprintf(stderr, "After next_char, string s is: %s\n", s);
+// fflush(stderr);
+ };
+
+}