From: nvalimak Date: Fri, 16 Oct 2009 15:19:58 +0000 (+0000) Subject: Uncompressed WT with ArrayDoc X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;h=6048d243439020acd3b5acf8654641cb9cef8e27;hp=83c839dd1e7417fbd51cd4a3483f145e47863efb;p=SXSI%2FTextCollection.git Uncompressed WT with ArrayDoc git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@575 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/ArrayDoc.h b/ArrayDoc.h new file mode 100644 index 0000000..87cff98 --- /dev/null +++ b/ArrayDoc.h @@ -0,0 +1,77 @@ + +#ifndef _ARRAYDOC_H_ +#define _ARRAYDOC_H_ + +// TODO add includes + +namespace SXSI +{ + +class ArrayDoc { +public: + ArrayDoc(BlockArray *input) + : data(input) + { + + } + ArrayDoc(FILE *fp) + : data(0) + { + data = new BlockArray(fp); + } + + ~ArrayDoc() + { + delete data; + } + + void save(FILE *fp) + { + data->Save(fp); + } + + inline uint access(uint i) + { + return (*data)[i]; + } + inline vector accessAll(uint i, uint j) + { + vector res; + res.reserve(j-i+1); + + for (; i <= j; ++i) + res.push_back((*data)[i]); + + return res; + } + + vector access(uint i, uint j, uint min, uint max) + { + vector res; + res.reserve(j-i+1); + + for (; i <= j; ++i) + if ((*data)[i] >= min && (*data)[i] <= max) + res.push_back((*data)[i]); + + return res; + } + + + uint count(uint i, uint j, uint min, uint max) + { + uint c = 0; + for (; i <= j; ++i) + if ((*data)[i] >= min && (*data)[i] <= max) + ++c; + return c; + } + + +private: + BlockArray *data; +}; + +}; + +#endif diff --git a/TCImplementation.cpp b/TCImplementation.cpp index 32f1898..09f6d24 100644 --- a/TCImplementation.cpp +++ b/TCImplementation.cpp @@ -41,7 +41,7 @@ namespace SXSI { // Save file version info -const uchar TCImplementation::versionFlag = 7; +const uchar TCImplementation::versionFlag = 8; /** * Constructor inits an empty dynamic FM-index. @@ -554,6 +554,11 @@ TextCollection::document_result TCImplementation::Contains(uchar const * pattern return result; } + +/** + *** +* * FIXME Lessthan or equal + */ TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); @@ -763,6 +768,9 @@ TCImplementation::TCImplementation(FILE *file, unsigned samplerate_) : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0) { +// Tools::StartTimer(); +// std::cout << std::endl << "Loading..."<< std::endl; + uchar verFlag = 0; if (std::fread(&verFlag, 1, 1, file) != 1) throw std::runtime_error("TCImplementation::Load(): file read error (version flag)."); @@ -784,7 +792,9 @@ TCImplementation::TCImplementation(FILE *file, unsigned samplerate_) if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position)."); +// std::cout << "Loading alphabet rank (" << Tools::GetTime() << " s)." << std::endl; alphabetrank = static_sequence::load(file); +// std::cout << "Loading samples (" << Tools::GetTime() << " s)." << std::endl; sampled = static_bitsequence::load(file); suffixes = new BlockArray(file); suffixDocId = new BlockArray(file); @@ -794,9 +804,13 @@ TCImplementation::TCImplementation(FILE *file, unsigned samplerate_) if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength)."); - Doc = static_sequence::load(file); +// std::cout << "Loading Doc (" << Tools::GetTime() << " s)." << std::endl; + Doc = new ArrayDoc(file); //static_sequence::load(file); +// std::cout << "Loading text storage (" << Tools::GetTime() << " s)." << std::endl; textStorage = TextStorage::Load(file); +// std::cout << "Loading done(" << Tools::GetTime() << " s)." << std::endl; + // FIXME Construct data structures with new samplerate //maketables(); } @@ -1010,8 +1024,8 @@ void TCImplementation::makewavelet(uchar *bwt) #endif alphabet_mapper * am = new alphabet_mapper_none(); - static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate? - wt_coder * wtc = new wt_coder_binary(bwt,n,am); + static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(8); //rrr02(8); // FIXME samplerate? + wt_coder * wtc = new wt_coder_huff(bwt,n,am);//binary(bwt,n,am); // FIXME Huffman shape alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am); delete bmb; bwt = 0; // already deleted @@ -1049,7 +1063,8 @@ void TCImplementation::maketables(ulong sampleLength, char tsType) // Mapping from end-markers to doc ID's: unsigned logNumberOfTexts = Tools::CeilLog2(numberOfTexts); - uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1]; +// uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1]; + BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, logNumberOfTexts); BlockArray* positions = new BlockArray(sampleLength, Tools::CeilLog2(this->n)); uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1]; @@ -1088,7 +1103,8 @@ void TCImplementation::maketables(ulong sampleLength, char tsType) // Record the order of end-markers in BWT: ulong endmarkerRank = alphabetrank_i_tmp - 1; - set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts); + //set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts); + (*endmarkerDocId)[endmarkerRank] = (textId + 1) % numberOfTexts; // Store text length and text start position: if (textId < (DocId)numberOfTexts - 1) @@ -1201,12 +1217,14 @@ void TCImplementation::maketables(ulong sampleLength, char tsType) HeapProfiler::ResetMaxHeapConsumption(); #endif - alphabet_mapper * am = new alphabet_mapper_none(); + /*alphabet_mapper * am = new alphabet_mapper_none(); static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); // FIXME samplerate? Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, logNumberOfTexts, bmb, am, true); - delete bmb; + delete bmb;*/ // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs! + Doc = new ArrayDoc(endmarkerDocId); + #ifdef DEBUG_MEMUSAGE std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; #endif diff --git a/TCImplementation.h b/TCImplementation.h index 75f1d40..f0a21cc 100644 --- a/TCImplementation.h +++ b/TCImplementation.h @@ -42,6 +42,7 @@ #undef bitget #include "TextStorage.h" +#include "ArrayDoc.h" #include namespace SXSI @@ -157,7 +158,8 @@ private: ulong maxTextLength; // Array of document id's in the order of end-markers in BWT - static_sequence *Doc; +// static_sequence *Doc; + ArrayDoc *Doc; // Text storage for fast extraction TextStorage * textStorage; diff --git a/makefile b/makefile index 6d071a6..046043b 100644 --- a/makefile +++ b/makefile @@ -1,6 +1,6 @@ CC = g++ LIBCDSPATH = ../libcds -CPPFLAGS = -Wall -ansi -g -I$(LIBCDSPATH)/includes/ -O2 -DNDEBUG +CPPFLAGS = -Wall -ansi -g -I$(LIBCDSPATH)/includes/ -O3 -DNDEBUG LIBCDSA = $(LIBCDSPATH)/lib/libcds.a LIBRLCSA = incbwt/rlcsa.a LIBLZTRIE = lzindex/lztrie.a