\r
// Constructs an XMLTree from arrays prepared by the caller.
//
//   par, npar        parenthesis bit array and its length, handed to bp_construct().
//   TN               tag-name vector; in the lines visible here only TN->size() is read.
//   tim              tag-id map; not referenced in the lines visible here.
//   empty_texts_bmp  bitmap used to build EBVector; released with free() below, so the
//                    caller must pass a malloc-family allocation.
//   tags             tag sequence (npar entries) used to build Tags.
//   TCB              text-collection builder; when dis_tc is false it is asked for the
//                    TextCollection and then deleted here.
//   dis_tc           stored in disable_tc; when true no text collection is built.
//   _index_type      stored in text_index_type.
XMLTree::XMLTree( pb * const par, uint npar, vector<string> * const TN, TagIdMap * const tim,
uint *empty_texts_bmp, TagType *tags,
- TextCollection * const TC, bool dis_tc,
+ TextCollectionBuilder * const TCB, bool dis_tc,
TextCollectionBuilder::index_type_t _index_type )
{
buffer = 0;
print_stack = 0;
// creates the data structure for the tree topology
STARTTIMER();
- Par = bp_construct(npar, (pb*) par, OPT_FAST_PREORDER_SELECT | OPT_DEGREE|0);
+ Par = bp_construct(npar, (pb*) par, OPT_DEGREE|0);
STOPTIMER(Building);
PRINTTIME("Building parenthesis struct", Building);
STARTTIMER();

// NOTE(review): max_tag is not used in the lines visible here; if TN is empty the
// unsigned subtraction wraps around -- confirm against the full function.
uint max_tag = TN->size() - 1;

-
// bmb and am are passed to the static_sequence_bs constructor together with the tag array;
// who releases them is not visible here -- TODO confirm.
static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray();
alphabet_mapper *am = new alphabet_mapper_none();
Tags = new static_sequence_bs((uint*)tags,npar,am,bmb);
STOPTIMER(Building);
PRINTTIME("Building Tag Structure", Building);

- Text = (TextCollection*) TC;
-
-
// Build the bit vector from the caller's bitmap, then release the raw bitmap.
EBVector = new static_bitsequence_rrr02(empty_texts_bmp,npar,32);
- //EBVector = new static_bitsequence_sdarray(empty_texts_bmp,npar);
free(empty_texts_bmp);
// Only clears this function's copy of the pointer; the caller's pointer is unchanged.
empty_texts_bmp = NULL;


disable_tc = dis_tc;
text_index_type = _index_type;
// When text handling is enabled, the TextCollection is produced from the builder here
// and the builder is destroyed afterwards.
// NOTE(review): when dis_tc is true, TCB is neither used nor deleted in this function --
// confirm the caller does not rely on this constructor to free it in that case.
+ if (!disable_tc) {
+ assert(TCB != 0);
+ STARTTIMER();
+ Text = TCB->InitTextCollection();
+ delete TCB;
+ STOPTIMER(Building);
+ PRINTTIME("Building TextCollection", Building);
+
+ } else {
+ Text = NULL;
+ }
+
// TagName is not assigned in the lines visible here; presumably it is set from TN in
// elided code -- TODO confirm.
std::cerr << "Number of distinct tags " << TagName->size() << "\n";
//std::cerr.flush();
}
// Body of a routine that writes the tree to the descriptor fd (the signature is not
// visible here; presumably the Save counterpart of Load -- TODO confirm).
// It writes, in order: the tree topology (saveTree), the EBVector, and -- unless
// disable_tc is set -- the text index type.
{
FILE *fp;
// NOTE(review): i is not used in the lines visible here.
int i;
-
- fp = fdopen(fd, "wa");
// Wrap a duplicate of fd in a stdio stream, positioned at fd's current offset.
// Because the stream owns fd2, the fclose() at the end closes the duplicate and
// leaves the caller's fd open.
// NOTE(review): the results of lseek(), dup() and fdopen() are not checked; a failure
// would hand a NULL/invalid stream to the calls below.
+ off_t pos = lseek(fd, 0, SEEK_CUR);
+ int fd2 = dup(fd);
+ fp = fdopen(fd2, "w");
+ fseek(fp, pos, SEEK_SET);
// first stores the tree topology
saveTree(Par, fp);


//text positions
EBVector->save(fp);
-
// Diagnostic output: stream offset at which the text-collection section starts.
+ std::cerr << "TC Index position: " << ftell(fp) << std::endl;
// stores the texts
if (!disable_tc) {
-
// NOTE(review): "\n" followed by std::endl emits an extra blank line.
+ std::cerr << "Writing " << sizeof(TextCollectionBuilder::index_type_t) << " bytes\n" << std::endl;
ufwrite(&text_index_type, sizeof(TextCollectionBuilder::index_type_t), 1, fp);


// Derive a per-index-type name by appending a suffix to name.
// The use of file is not visible in these lines (presumably elided).
string file(name);
switch (text_index_type){
case TextCollectionBuilder::index_type_default:
- file.append(".default");
+ file.append("_default");
break;
case TextCollectionBuilder::index_type_swcsa:
- file.append(".swcsa");
+ file.append("_swcsa");
break;
case TextCollectionBuilder::index_type_rlcsa:
- file.append(".rlcsa");
+ file.append("_rlcsa");
break;
};



}
// NOTE(review): fclose() already flushes the stream, so the explicit fflush() is
// redundant; neither return value is checked, so write errors go unnoticed.
+ fflush(fp);
+ fclose(fp);
}
\r
// Load: loads XML tree data structure from file. Returns\r
// Fragment of the loading routine (its beginning is not visible here).
STOPTIMER(Loading);
PRINTTIME("Loading text bitvector struct", Loading);
STARTTIMER();
-
// Diagnostic output: stream offset at which the text-collection section is expected.
+ std::cerr << "TC Load Index position: " << ftell(fp) << std::endl;
// Not used
// loads the texts
if (!XML_Tree->disable_tc){
// Build the per-index-type name by appending a suffix to name; presumably this must
// mirror the suffix chosen when the index was written -- TODO confirm.
string file(name);
switch (XML_Tree->text_index_type){
case TextCollectionBuilder::index_type_default:
- file.append(".default");
+ file.append("_default");
break;
case TextCollectionBuilder::index_type_swcsa:
- file.append(".swcsa");
+ file.append("_swcsa");
break;
case TextCollectionBuilder::index_type_rlcsa:
- file.append(".rlcsa");
+ file.append("_rlcsa");
break;
};
+
+
// Load the text collection from fp and the derived name, using the default index mode
// and the caller-supplied sample_factor.
XML_Tree->Text = TextCollection::Load(fp, file.c_str(), TextCollection::index_mode_default, sample_factor);

}
// Fragment of the builder's finalisation step (enclosing function not visible here):
// parsing is finished and the accumulated arrays are handed to a new XMLTree.
STOPTIMER(Parsing);
PRINTTIME("Parsing XML Document", Parsing);

- if (!disable_tc) {
- assert(Text == 0);
- assert(TextBuilder != 0);
- STARTTIMER();
- Text = TextBuilder->InitTextCollection();
- delete TextBuilder;
- TextBuilder = 0;
- STOPTIMER(Building);
- PRINTTIME("Building TextCollection", Building);
-
- }
-
// The builder itself (not a finished TextCollection) is now passed on; per the inline
// notes the constructor is expected to release the arrays and the builder.
// NOTE(review): TextBuilder is not reset to 0 in the lines visible here, unlike
// tags_aux -- confirm it is not deleted a second time elsewhere.
XMLTree *T = new XMLTree(par_aux,
npar,
TagName,
tIdMap,
empty_texts_aux, // freed by the constructor
tags_aux, // freed by the constructor
- Text,
+ TextBuilder, // freed by the constructor
disable_tc,
text_index_type);
// Drop the local reference after the hand-off.
tags_aux = 0;
// Records an opening tag: appends an opening parenthesis to par_aux and the tag id to
// tags_aux. Parts of this function are not visible here (the tag-id lookup, the npar
// update and the return are elided).
int XMLTreeBuilder::NewOpenTag(string tagname)
{
int i;
-
// inserts a new opening parentheses in the bit sequence
// The array is full when its capacity in bits (sizeof(pb)*8*parArraySize) equals npar.
if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis
- par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize);
- parArraySize *= 2;
- }
+
// Growth policy: double the element count until the array reaches 1 GiB in bytes,
// then grow by a fixed increment instead.
// NOTE(review): the increment 128*1024*1024 is added to the element count, so the
// byte growth is sizeof(pb) times that -- confirm this is the intended step.
+ // If array is already 1GB, be gentler when resizing:
+ if (sizeof(pb)*parArraySize >= 1024*1024*1024)
+ parArraySize += (128*1024*1024);
+ else
+ parArraySize *= 2;
+ par_aux = (pb *) urealloc(par_aux, sizeof(pb)*parArraySize);
// NOTE(review): the ';' after the closing brace is an empty statement (harmless).
+ };

bp_setbit(par_aux,npar,OP); // marks a new opening parenthesis

tagname.compare(ATTRIBUTE_DATA_OPEN_TAG) == 0){
};

+
// In the lines visible here tags_aux is resized to npar + 1 entries on every call.
tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1));

tags_aux[npar] = i; // inserts the new tag id within the preorder sequence of tags
// Records a closing tag: appends a closing parenthesis to par_aux and CLOSING_TAG_ID to
// tags_aux. Parts of this function are not visible here (the npar update and the return
// are elided).
int XMLTreeBuilder::NewClosingTag(string tagname)
{
// NOTE(review): i is not used in the lines visible here.
int i;
-
// inserts a new closing parentheses in the bit sequence
// The array is full when its capacity in bits (sizeof(pb)*8*parArraySize) equals npar.
if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis
- par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize);
- parArraySize *= 2;
- }
// Growth policy: double the element count until the array reaches 1 GiB in bytes,
// then grow by a fixed increment instead.
// NOTE(review): the increment 128*1024*1024 is added to the element count, so the
// byte growth is sizeof(pb) times that -- confirm this is the intended step.
+ // If array is already 1GB, be gentler when resizing:
+ if (sizeof(pb)*parArraySize >= 1024*1024*1024)
+ parArraySize += (128*1024*1024);
+ else
+ parArraySize *= 2;
+ par_aux = (pb *)urealloc(par_aux, sizeof(pb)*parArraySize);
// NOTE(review): the ';' after the closing brace is an empty statement (harmless).
+ };

bp_setbit(par_aux,npar,CP); // marks a new closing parenthesis

// else
// i = tag_id->second;

// In the lines visible here tags_aux is resized to npar + 1 entries on every call.
- tags_aux = (TagType *)urealloc(tags_aux, sizeof(TagType)*(npar + 1));
+ tags_aux = (TagType *)urealloc(tags_aux, sizeof(TagType)*(npar + 1));

tags_aux[npar] = CLOSING_TAG_ID; // inserts the new tag id within the preorder sequence of tags
\r