From: Kim Nguyễn Date: Thu, 1 Mar 2012 13:19:46 +0000 (+0100) Subject: Don't create the text collection during parsing but afterwards. X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FXMLTree.git;a=commitdiff_plain;h=d79d6498e2d585560d915592ef59f3ad6a57b3c7 Don't create the text collection during parsing but afterwards. --- diff --git a/XMLTree.cpp b/XMLTree.cpp index 65a577b..20e6801 100644 --- a/XMLTree.cpp +++ b/XMLTree.cpp @@ -66,14 +66,14 @@ static uint fast_get_field(uint* A,int len, int idx) XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, - TextCollection * const TC, bool dis_tc, + TextCollectionBuilder * const TCB, bool dis_tc, TextCollectionBuilder::index_type_t _index_type ) { buffer = 0; print_stack = 0; // creates the data structure for the tree topology STARTTIMER(); - Par = bp_construct(npar, (pb*) par, OPT_FAST_PREORDER_SELECT | OPT_DEGREE|0); + Par = bp_construct(npar, (pb*) par, OPT_DEGREE|0); STOPTIMER(Building); PRINTTIME("Building parenthesis struct", Building); STARTTIMER(); @@ -86,7 +86,6 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM uint max_tag = TN->size() - 1; - static_bitsequence_builder *bmb = new static_bitsequence_builder_sdarray(); alphabet_mapper *am = new alphabet_mapper_none(); Tags = new static_sequence_bs((uint*)tags,npar,am,bmb); @@ -108,17 +107,25 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM STOPTIMER(Building); PRINTTIME("Building Tag Structure", Building); - Text = (TextCollection*) TC; - - EBVector = new static_bitsequence_rrr02(empty_texts_bmp,npar,32); - //EBVector = new static_bitsequence_sdarray(empty_texts_bmp,npar); free(empty_texts_bmp); empty_texts_bmp = NULL; disable_tc = dis_tc; text_index_type = _index_type; + if (!disable_tc) { + assert(TCB != 0); + STARTTIMER(); + Text = TCB->InitTextCollection(); + delete TCB; + STOPTIMER(Building); + PRINTTIME("Building TextCollection", Building); + + } else { + Text = NULL; + } + std::cerr << "Number of distinct tags " << TagName->size() << "\n"; //std::cerr.flush(); } @@ -166,8 +173,10 @@ void XMLTree::Save(int fd, char * name) { FILE *fp; int i; - - fp = fdopen(fd, "wa"); + off_t pos = lseek(fd, 0, SEEK_CUR); + int fd2 = dup(fd); + fp = fdopen(fd2, "w"); + fseek(fp, pos, SEEK_SET); // first stores the tree topology saveTree(Par, fp); @@ -190,23 +199,23 @@ void XMLTree::Save(int fd, char * name) //text positions EBVector->save(fp); - + std::cerr << "TC Index position: " << ftell(fp) << std::endl; // stores the texts if (!disable_tc) { - + std::cerr << "Writing " << sizeof(TextCollectionBuilder::index_type_t) << " bytes\n" << std::endl; ufwrite(&text_index_type, sizeof(TextCollectionBuilder::index_type_t), 1, fp); string file(name); switch (text_index_type){ case TextCollectionBuilder::index_type_default: - file.append(".default"); + file.append("_default"); break; case TextCollectionBuilder::index_type_swcsa: - file.append(".swcsa"); + file.append("_swcsa"); break; case TextCollectionBuilder::index_type_rlcsa: - file.append(".rlcsa"); + file.append("_rlcsa"); break; }; @@ -214,6 +223,8 @@ void XMLTree::Save(int fd, char * name) } + fflush(fp); + fclose(fp); } // Load: loads XML tree data structure from file. Returns @@ -292,7 +303,7 @@ XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor, char * name) STOPTIMER(Loading); PRINTTIME("Loading text bitvector struct", Loading); STARTTIMER(); - + std::cerr << "TC Load Index position: " << ftell(fp) << std::endl; // Not used // loads the texts if (!XML_Tree->disable_tc){ @@ -301,15 +312,17 @@ XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor, char * name) string file(name); switch (XML_Tree->text_index_type){ case TextCollectionBuilder::index_type_default: - file.append(".default"); + file.append("_default"); break; case TextCollectionBuilder::index_type_swcsa: - file.append(".swcsa"); + file.append("_swcsa"); break; case TextCollectionBuilder::index_type_rlcsa: - file.append(".rlcsa"); + file.append("_rlcsa"); break; }; + + XML_Tree->Text = TextCollection::Load(fp, file.c_str(), TextCollection::index_mode_default, sample_factor); } diff --git a/XMLTree.h b/XMLTree.h index d809bfc..351db20 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -32,6 +32,7 @@ #undef Wminusone #include +#include #include #include #include @@ -219,7 +220,7 @@ class XMLTree { std::vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, - TextCollection * const TC, bool dis_tc, + TextCollectionBuilder * const TCB, bool dis_tc, TextCollectionBuilder::index_type_t _index_type ); public: diff --git a/XMLTreeBuilder.cpp b/XMLTreeBuilder.cpp index 7ff1eff..d9b0832 100644 --- a/XMLTreeBuilder.cpp +++ b/XMLTreeBuilder.cpp @@ -68,25 +68,13 @@ XMLTree *XMLTreeBuilder::CloseDocument() STOPTIMER(Parsing); PRINTTIME("Parsing XML Document", Parsing); - if (!disable_tc) { - assert(Text == 0); - assert(TextBuilder != 0); - STARTTIMER(); - Text = TextBuilder->InitTextCollection(); - delete TextBuilder; - TextBuilder = 0; - STOPTIMER(Building); - PRINTTIME("Building TextCollection", Building); - - } - XMLTree *T = new XMLTree(par_aux, npar, TagName, tIdMap, empty_texts_aux, // freed by the constructor tags_aux, // freed by the constructor - Text, + TextBuilder, // freed by the constructor disable_tc, text_index_type); tags_aux = 0; @@ -101,12 +89,16 @@ XMLTree *XMLTreeBuilder::CloseDocument() int XMLTreeBuilder::NewOpenTag(string tagname) { int i; - // inserts a new opening parentheses in the bit sequence if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis - par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize); - parArraySize *= 2; - } + + // If array is already 1GB, be gentler when resizing: + if (sizeof(pb)*parArraySize >= 1024*1024*1024) + parArraySize += (128*1024*1024); + else + parArraySize *= 2; + par_aux = (pb *) urealloc(par_aux, sizeof(pb)*parArraySize); + }; bp_setbit(par_aux,npar,OP); // marks a new opening parenthesis @@ -123,6 +115,7 @@ int XMLTreeBuilder::NewOpenTag(string tagname) tagname.compare(ATTRIBUTE_DATA_OPEN_TAG) == 0){ }; + tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1)); tags_aux[npar] = i; // inserts the new tag id within the preorder sequence of tags @@ -139,12 +132,15 @@ int XMLTreeBuilder::NewOpenTag(string tagname) int XMLTreeBuilder::NewClosingTag(string tagname) { int i; - // inserts a new closing parentheses in the bit sequence if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis - par_aux = (pb *)urealloc(par_aux, sizeof(pb)*2*parArraySize); - parArraySize *= 2; - } + // If array is already 1GB, be gentler when resizing: + if (sizeof(pb)*parArraySize >= 1024*1024*1024) + parArraySize += (128*1024*1024); + else + parArraySize *= 2; + par_aux = (pb *)urealloc(par_aux, sizeof(pb)*parArraySize); + }; bp_setbit(par_aux,npar,CP); // marks a new closing parenthesis @@ -159,7 +155,7 @@ int XMLTreeBuilder::NewClosingTag(string tagname) // else // i = tag_id->second; - tags_aux = (TagType *)urealloc(tags_aux, sizeof(TagType)*(npar + 1)); + tags_aux = (TagType *)urealloc(tags_aux, sizeof(TagType)*(npar + 1)); tags_aux[npar] = CLOSING_TAG_ID; // inserts the new tag id within the preorder sequence of tags