From 8b92ac7e539c796ee3160078b5ca30537f26ea51 Mon Sep 17 00:00:00 2001 From: kim Date: Tue, 19 Apr 2011 08:16:22 +0000 Subject: [PATCH] Adds external flush function, sanitize builder. git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/XMLTree@1053 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- Makefile | 2 +- XMLTree.cpp | 141 ++++++++++++++++++++++++++------------------- XMLTree.h | 98 ++++++++++++++++++++----------- XMLTreeBuilder.cpp | 13 +++-- XMLTreeBuilder.h | 5 +- 5 files changed, 159 insertions(+), 100 deletions(-) diff --git a/Makefile b/Makefile index 05922b4..e972a3c 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -OPT_FLAGS=-O6 -DPOPCOUNT_TABLE -DHAS_NATIVE_POPCOUNT +OPT_FLAGS=-O6 -DPOPCOUNT_TABLE -DHAS_NATIVE_POPCOUNT -fno-PIC INC_FLAGS=-I./libcds/includes/ -I. CFLAGS= $(INC_FLAGS) $(OPT_FLAGS) CXXFLAGS= -std=c++0x $(INC_FLAGS) $(OPT_FLAGS) diff --git a/XMLTree.cpp b/XMLTree.cpp index 5033877..40d8055 100644 --- a/XMLTree.cpp +++ b/XMLTree.cpp @@ -66,9 +66,11 @@ static uint fast_get_field(uint* A,int len, int idx) XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, - TextCollection * const TC, bool dis_tc) + TextCollection * const TC, bool dis_tc, + TextCollectionBuilder::index_type_t _index_type ) { buffer = 0; + print_stack = 0; // creates the data structure for the tree topology Par = (bp *)umalloc(sizeof(bp)); STARTTIMER(); @@ -117,8 +119,7 @@ XMLTree::XMLTree( pb * const par, uint npar, vector * const TN, TagIdM disable_tc = dis_tc; - stream = NULL; - stream_fd = 0; + text_index_type = _index_type; std::cerr << "Number of distinct tags " << TagName->size() << "\n"; //std::cerr.flush(); } @@ -146,11 +147,6 @@ XMLTree::~XMLTree() delete EBVector; EBVector = NULL; - if (stream != NULL){ - fclose(stream); - stream = NULL; - stream_fd = 0; - }; } @@ -167,7 +163,7 @@ void XMLTree::print_stats() } // Save: saves XML tree data structure to file. -void XMLTree::Save(int fd, char *filename) +void XMLTree::Save(int fd) { FILE *fp; char filenameaux[1024]; @@ -199,14 +195,31 @@ void XMLTree::Save(int fd, char *filename) // stores the texts if (!disable_tc) { - Text->Save(fp, filename); - }; - } + ufwrite(&text_index_type, sizeof(TextCollectionBuilder::index_type_t), 1, fp); + + const char * pref; + switch (text_index_type){ + case TextCollectionBuilder::index_type_default: + pref = "default_"; + break; + case TextCollectionBuilder::index_type_swcsa: + pref = "swcsa_"; + break; + case TextCollectionBuilder::index_type_rlcsa: + pref = "rlcsa_"; + break; + }; + + Text->Save(fp, pref); + + + } + } // Load: loads XML tree data structure from file. Returns // a pointer to the loaded data structure -XMLTree *XMLTree::Load(int fd, char *filename, bool load_tc,int sample_factor) +XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor) { FILE *fp; @@ -277,32 +290,44 @@ XMLTree *XMLTree::Load(int fd, char *filename, bool load_tc,int sample_factor) ufread(&(XML_Tree->disable_tc), sizeof(bool), 1, fp); if (load_tc) { - XML_Tree->EBVector = static_bitsequence_rrr02::load(fp); - //XML_Tree->EBVector = static_bitsequence_sdarray::load(fp); + XML_Tree->EBVector = static_bitsequence_rrr02::load(fp); - STOPTIMER(Loading); - PRINTTIME("Loading text bitvector struct", Loading); - STARTTIMER(); - - // Not used - // loads the texts - if (!XML_Tree->disable_tc){ - XML_Tree->Text = TextCollection::Load(fp, filename, TextCollection::index_mode_default, sample_factor); - } - else XML_Tree->Text = NULL; - STOPTIMER(Loading); - PRINTTIME("Loading TextCollection", Loading); - STARTTIMER(); + STOPTIMER(Loading); + PRINTTIME("Loading text bitvector struct", Loading); + STARTTIMER(); + + // Not used + // loads the texts + if (!XML_Tree->disable_tc){ + ufread(&(XML_Tree->text_index_type), + sizeof(TextCollectionBuilder::index_type_t), 1, fp); + const char * pref; + switch (!XML_Tree->text_index_type){ + case TextCollectionBuilder::index_type_default: + pref = "default_"; + break; + case TextCollectionBuilder::index_type_swcsa: + pref = "swcsa_"; + break; + case TextCollectionBuilder::index_type_rlcsa: + pref = "rlcsa_"; + break; + }; + XML_Tree->Text = TextCollection::Load(fp, pref, TextCollection::index_mode_default, sample_factor); + + } + else XML_Tree->Text = NULL; + STOPTIMER(Loading); + PRINTTIME("Loading TextCollection", Loading); + STARTTIMER(); } else { XML_Tree->EBVector = NULL; XML_Tree->Text = NULL; XML_Tree->disable_tc = true; }; - - XML_Tree->stream = NULL; - XML_Tree->stream_fd = 0; + return XML_Tree; } @@ -500,12 +525,12 @@ treeNode XMLTree::NextElement(treeNode x) }*/ // LastChild(x): returns the last child of node x. -treeNode XMLTree::LastChild(treeNode x) + /*treeNode XMLTree::LastChild(treeNode x) { NULLT_IF(x == NULLT || fast_isleaf(Par,x)); return find_open(Par, fast_find_close(Par, x)-1); } - + */ // NextSibling(x): returns the next sibling of node x, assuming it exists. /*treeNode XMLTree::NextSibling(treeNode x) { @@ -516,12 +541,12 @@ treeNode XMLTree::LastChild(treeNode x) */ // PrevSibling(x): returns the previous sibling of node x, assuming it exists. -treeNode XMLTree::PrevSibling(treeNode x) +/*treeNode XMLTree::PrevSibling(treeNode x) { NULLT_IF(x==NULLT); return prev_sibling(Par, x); } - +*/ // TaggedChild(x,tag): returns the first child of node x tagged tag, or NULLT if there is none. // Because of the balanced-parentheses representation of the tree, this operation is not supported // efficiently, just iterating among the children of node x until finding the desired child. @@ -801,11 +826,10 @@ DocID XMLTree::MyText(treeNode x) // seems faster than testing EBVector->access(x); if (tag == PCDATA_TAG_ID || tag == ATTRIBUTE_DATA_TAG_ID) - //if (EBVector->access(x)) - return (DocID) (EBVector->rank1(x)-1); //-1 because document ids start from 0 - else + return (DocID) (EBVector->rank1(x)-1); + else return (DocID) NULLT; - + } // MyText(x): returns the document identifier of the text below node x, // or NULLT if x is not a leaf node or the text is empty. Assumes Doc @@ -813,6 +837,7 @@ DocID XMLTree::MyText(treeNode x) DocID XMLTree::MyTextUnsafe(treeNode x) { return (DocID) (EBVector->rank1(x)-1); //-1 because document ids start from 0 + } // TextXMLId(d): returns the preorder of document with identifier d in the tree consisting of // all tree nodes and all text nodes. Assumes that the tree root has preorder 1. @@ -901,19 +926,21 @@ bool XMLTree::IsOpen(treeNode x) { return fast_inspect(Par,x); } void XMLTree::Print(int fd,treeNode x, bool no_text){ - if (buffer == 0) - buffer = new string(); - + if (buffer == 0) { + buffer = new string(BUFFER_ALLOC, 0); + print_stack = new std::vector(); + print_stack->reserve(256); + }; treeNode fin = fast_find_close(Par,x); treeNode n = x; TagType tag = Tag(n); - uchar * tagstr; + range r = DocIds(x); treeNode first_idx; treeNode first_text = (tag == PCDATA_TAG_ID ? x : ParentNode(r.min-1)); treeNode first_att = NULLT; - if (first_att == NULLT) + if (first_att == NULLT) first_idx = first_text; else if (first_text == NULLT) first_idx = first_att; @@ -923,10 +950,9 @@ void XMLTree::Print(int fd,treeNode x, bool no_text){ uchar * current_text=NULL; if (first_idx != NULLT) - current_text = GetText(MyText(first_idx)); + current_text = GetText(MyTextUnsafe(first_idx)); size_t read = 0; - std::vector st; while (n <= fin){ if (fast_inspect(Par,n)){ if (tag == PCDATA_TAG_ID) { @@ -940,14 +966,13 @@ void XMLTree::Print(int fd,treeNode x, bool no_text){ n+=2; // skip closing $ tag = Tag(n); - } - else { - _dputc('<',fd); - tagstr = (uchar*) GetTagNameByRef(tag); - _dputs((const char*) tagstr, fd); + } else { + + _dputc('<',fd); + _dput_str((*TagName)[tag], fd); n++; if (fast_inspect(Par,n)) { - st.push_back(tagstr); + print_stack->push_back(&((*TagName)[tag])); tag = Tag(n); if (tag == ATTRIBUTE_TAG_ID){ n++; @@ -987,17 +1012,15 @@ void XMLTree::Print(int fd,treeNode x, bool no_text){ tag=Tag(n); }; }; - } - else - do { + } else do { _dputs("back()), fd); _dputc('>', fd); - st.pop_back(); + print_stack->pop_back(); n++; - } while (!(fast_inspect(Par,n) || st.empty())); + } while (!(fast_inspect(Par,n) || print_stack->empty())); tag = Tag(n); }; _dputc('\n', fd); - _flush(fd); + //_flush(fd); } diff --git a/XMLTree.h b/XMLTree.h index 5c010a5..4673ccc 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -96,6 +96,8 @@ typedef TagIdMap::const_iterator TagIdMapIT; // Direct calls to sarray library +#define BUFFER_ALLOC (8192 * 2) +#define BUFFER_SIZE (BUFFER_ALLOC / 2) static inline int fast_find_close(bp *b,int s) { return fwd_excess(b,s,-1); @@ -158,7 +160,7 @@ class XMLTree { /** Bit vector indicating with a 1 the positions of the non-empty texts. */ static_bitsequence *EBVector; - + /** Tag sequence represented with a data structure for rank and select */ static_sequence *Tags; uint * tags_fix; @@ -169,59 +171,65 @@ class XMLTree { // Allows to disable the TextCollection for benchmarkin purposes bool disable_tc; + SXSI::TextCollectionBuilder::index_type_t text_index_type; - FILE* stream; - int stream_fd; - std::string * buffer; + std::string *buffer; + std::vector *print_stack; void _flush(int fd){ size_t size = buffer->size(); - size_t written = write(fd, buffer->data(), size); - if (written != size) - throw "Cannot flush buffer"; + if (size < BUFFER_SIZE) return; + size_t written; + while (1) { + written = write(fd, buffer->data(), size); + if ((written < 0) && (errno == EAGAIN || errno == EINTR)) + continue; + break; + }; buffer->clear(); } + void _dput_str(std::string s, int fd){ + buffer->append(s); + _flush(fd); + } + void _dputs(const char* s, int fd){ buffer->append(s); - if (buffer->size() >= 131072) _flush(fd); - + _flush(fd); } void _dputc(const char c, int fd){ - buffer->append(1,c); - if (buffer->size() >= 131072) _flush(fd); + buffer->push_back(c); } size_t _dprintf(const char* s, int fd){ if (s == NULL) return 0; - size_t i = 0; - while (1) { - switch (s[i]) { + size_t i; + char c; + for (i = 0; (c = s[i]); i++) { + switch (c) { + case '"': + _dputs(""", fd); + break; case '&': - buffer->append("&"); + _dputs("&", fd); break; case '\'': - buffer->append("'"); - break; - case '"': - buffer->append("""); + _dputs("'", fd); break; case '<': - buffer->append("<"); + _dputs("<", fd); break; case '>': - buffer->append(">"); + _dputs(">", fd); break; - case 0: - return i; default: - buffer->append(1, s[i]); + _dputc(c, fd); }; - if (buffer->size() >= 131072) _flush(fd); - ++i; }; + return i; } void PrintNode(treeNode n, int fd); @@ -229,8 +237,13 @@ class XMLTree { XMLTree(){ buffer = 0;}; // non const pointer are freed by this method. - XMLTree( pb * const par, uint npar, std::vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, - TextCollection * const TC, bool dis_tc); + XMLTree( pb * const par, + uint npar, + std::vector * const TN, + TagIdMap * const tim, uint *empty_texts_bmp, + TagType *tags, + TextCollection * const TC, bool dis_tc, + TextCollectionBuilder::index_type_t _index_type ); public: /** Data structure destructor */ @@ -329,6 +342,16 @@ public: else return parent(Par, x); }; + + treeNode BinaryParent(treeNode x){ + if (x <= Root()) + return NULLT; + else { + treeNode prev = x - 1; + return (fast_inspect(Par, prev) == OP) ? prev : find_open(Par, prev); + }; + }; + /* Assumes x is neither 0 nor -1 */ /** Child(x,i): returns the i-th child of node x, assuming it exists. */ @@ -337,14 +360,20 @@ public: /** LastChild(x): returns the last child of node x. */ - treeNode LastChild(treeNode x); + treeNode LastChild(treeNode x) { + NULLT_IF(x == NULLT || fast_isleaf(Par,x)); + return find_open(Par, fast_find_close(Par, x)-1); + } - - /** PrevSibling(x): returns the previous sibling of node x, assuming it * exists. */ - treeNode PrevSibling(treeNode x); + treeNode PrevSibling(treeNode x) + { + NULLT_IF(x==NULLT); + return prev_sibling(Par, x); + } + /** TaggedChild(x,tag): returns the first child of node x tagged tag, or * NULLT if there is none. Because of the balanced-parentheses representation @@ -577,11 +606,11 @@ public: } /** Save: saves XML tree data structure to file. */ - void Save(int fd, char *filename); + void Save(int fd ); /** Load: loads XML tree data structure from file. sample_rate_text * indicates the sample rate for the text search data structure. */ - static XMLTree *Load(int fd, char *filename, bool load_tc, int sample_factor); + static XMLTree *Load(int fd, bool load_tc, int sample_factor); void insertTag(TagType tag, uint position); @@ -597,6 +626,7 @@ public: /** Print procedure */ void Print(int fd,treeNode x, bool no_text); void Print(int fd,treeNode x) { Print(fd,x,false); } + void Flush(int fd){ _flush(fd); } // The following are inlined here for speed /** Tag(x): returns the tag identifier of node x. */ diff --git a/XMLTreeBuilder.cpp b/XMLTreeBuilder.cpp index 3115e4d..9c7e3f6 100644 --- a/XMLTreeBuilder.cpp +++ b/XMLTreeBuilder.cpp @@ -10,12 +10,15 @@ XMLTreeBuilder::~XMLTreeBuilder(){ // OpenDocument(empty_texts): it starts the construction of the data structure for // the XML document. Parameter empty_texts indicates whether we index empty texts // in document or not. Returns a non-zero value upon success, NULLT in case of error. -int XMLTreeBuilder::OpenDocument(bool empty_texts, int sample_rate_text, bool dtc) +int XMLTreeBuilder::OpenDocument(bool empty_texts, + int sample_rate_text, + bool dtc, + TextCollectionBuilder::index_type_t index_type) { npar = 0; parArraySize = 1; disable_tc = dtc; - + text_index_type = index_type; STARTTIMER(); par_aux = (pb *)umalloc(sizeof(pb)*parArraySize); @@ -39,7 +42,8 @@ int XMLTreeBuilder::OpenDocument(bool empty_texts, int sample_rate_text, bool dt if (disable_tc) TextBuilder = 0; else - TextBuilder = TextCollectionBuilder::create((unsigned)sample_rate_text, TextCollectionBuilder::index_type_default); + TextBuilder = TextCollectionBuilder::create((unsigned)sample_rate_text, index_type); + Text = 0; empty_texts_aux = (unsigned int *)ucalloc(sizeof(unsigned int),1); eta_size = sizeof(unsigned int); @@ -80,7 +84,8 @@ XMLTree *XMLTreeBuilder::CloseDocument() empty_texts_aux, // freed by the constructor tags_aux, //freed by the constructor Text, - disable_tc); + disable_tc, + text_index_type); return T; } diff --git a/XMLTreeBuilder.h b/XMLTreeBuilder.h index 055eeda..c7a9036 100644 --- a/XMLTreeBuilder.h +++ b/XMLTreeBuilder.h @@ -67,7 +67,7 @@ class XMLTreeBuilder { int eta_size; // Allows to disable the TextCollection for benchmarkin purposes bool disable_tc; - + TextCollectionBuilder::index_type_t text_index_type; public: XMLTreeBuilder() {;}; @@ -82,7 +82,8 @@ public: * (i.e. everything is considered an empty text *) * Returns a non-zero value upon success, NULLT in case of * error. */ - int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc); + int OpenDocument(bool empty_texts, int sample_rate_text, bool dtc, + TextCollectionBuilder::index_type_t index_type); /** CloseDocument(): finishes the construction of the data structure for * the XML document. Tree and tags are represented in the final form, -- 2.17.1