X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TCImplementation.cpp;h=1a29f2524eccf74a394c02aed5b62bf4c54bb252;hb=102e33b134075765e6d4e0c38bc1307568ce5602;hp=c3b430a16e7ac762420c0c8c514d9501693608b8;hpb=3c8e8e6146312f6f55046c451897234fed56608d;p=SXSI%2FTextCollection.git diff --git a/TCImplementation.cpp b/TCImplementation.cpp index c3b430a..1a29f25 100644 --- a/TCImplementation.cpp +++ b/TCImplementation.cpp @@ -36,26 +36,28 @@ using std::vector; using std::pair; using std::make_pair; using std::map; - +using std::string; namespace SXSI { // Save file version info -const uchar TCImplementation::versionFlag = 6; +const uchar TCImplementation::versionFlag = 8; /** * Constructor inits an empty dynamic FM-index. * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. */ -TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_) +TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, + unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_, + CSA::DeltaVector & notIndexed, const string & niText, char tsType) : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0) { makewavelet(bwt); // Deletes bwt! bwt = 0; - + // Make sampling tables - maketables(); + maketables(numberOfSamples_, tsType, notIndexed, niText); } bool TCImplementation::EmptyText(DocId k) const @@ -92,8 +94,9 @@ uchar * TCImplementation::GetText(DocId k) const res[i-j-1] = result[j]; return res;*/ } + /* - * Not supported + * Substring queries are supported via the pointer returned by TextStorage::GetText uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const { assert(k < (DocId)numberOfTexts); @@ -108,6 +111,8 @@ uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const return Substring(i + start, j-i+1); }*/ + + /****************************************************************** * Existential queries */ @@ -553,6 +558,11 @@ TextCollection::document_result TCImplementation::Contains(uchar const * pattern return result; } + +/** + *** +* * FIXME Lessthan or equal + */ TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); @@ -580,7 +590,7 @@ TextCollection::document_result TCImplementation::LessThan(uchar const * pattern } -TextCollection::document_result TCImplementation::Kmismaches(uchar const * pattern, unsigned k) const +TextCollection::document_result TCImplementation::KMismaches(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -599,7 +609,7 @@ TextCollection::document_result TCImplementation::Kmismaches(uchar const * patte return result; } -TextCollection::document_result TCImplementation::Kerrors(uchar const * pattern, unsigned k) const +TextCollection::document_result TCImplementation::KErrors(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -660,7 +670,7 @@ TextCollection::full_result TCImplementation::FullContains(uchar const * pattern return result; } -TextCollection::full_result TCImplementation::FullKmismatches(uchar const * pattern, unsigned k) const +TextCollection::full_result TCImplementation::FullKMismatches(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -677,7 +687,7 @@ TextCollection::full_result TCImplementation::FullKmismatches(uchar const * patt return result; } -TextCollection::full_result TCImplementation::FullKerrors(uchar const * pattern, unsigned k) const +TextCollection::full_result TCImplementation::FullKErrors(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -757,8 +767,13 @@ void TCImplementation::Save(FILE *file) const * * Throws a std::runtime_error exception on i/o error. * For more info, see TCImplementation::Save(). + * + * index_mode_t is defined in TextCollection.h and + * defaults to both the index and "naive" text. + * + * Note: Samplerate can not be changed during load. */ -TCImplementation::TCImplementation(FILE *file, unsigned samplerate_) +TCImplementation::TCImplementation(FILE *file, index_mode_t im, unsigned samplerate_) : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0) { @@ -784,17 +799,24 @@ TCImplementation::TCImplementation(FILE *file, unsigned samplerate_) throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position)."); alphabetrank = static_sequence::load(file); + if (im == index_mode_text_only) { delete alphabetrank; alphabetrank = 0; } + sampled = static_bitsequence::load(file); + if (im == index_mode_text_only) { delete sampled; sampled = 0; } suffixes = new BlockArray(file); + if (im == index_mode_text_only) { delete suffixes; suffixes = 0; } suffixDocId = new BlockArray(file); + if (im == index_mode_text_only) { delete suffixDocId; suffixDocId = 0; } if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts)."); if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength)."); - Doc = static_sequence::load(file); - textStorage = new TextStorage(file); + Doc = new ArrayDoc(file); //static_sequence::load(file); + if (im == index_mode_text_only) { delete Doc; Doc = 0; } + + textStorage = TextStorage::Load(file); // FIXME Construct data structures with new samplerate //maketables(); @@ -858,25 +880,20 @@ ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pa //first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const { - cout << "j = " << j << ", k = " << k << ", d:"; - for (unsigned i = 0; i < m+1; ++i) - cout << " " << d[i]; - cout << endl; - + ulong sum=0; if (d[m]<=k) // range of suffixes with at most k-errors found { if (sp<=ep) result.push_back(std::make_pair(sp, ep)); - return (sp<=ep)?ep-sp+1:0ul; + sum += (sp<=ep)?ep-sp+1:0ul; } if (sp>ep || j==0) - return 0; + return sum; ulong *dnew = new ulong[m+1]; int c; ulong spnew; ulong p,lowerbound; ulong epnew; - ulong sum=0; vector chars = alphabetrank->accessAll(sp, ep); for (vector::iterator it = chars.begin(); it != chars.end(); ++it) { @@ -1010,12 +1027,12 @@ void TCImplementation::makewavelet(uchar *bwt) #ifdef DEBUG_MEMUSAGE std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove + HeapProfiler::ResetMaxHeapConsumption(); #endif alphabet_mapper * am = new alphabet_mapper_none(); - static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate? - wt_coder * wtc = new wt_coder_binary(bwt,n,am); + static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(8); //rrr02(8); // FIXME samplerate? + wt_coder * wtc = new wt_coder_huff(bwt,n,am);//binary(bwt,n,am); // FIXME Huffman shape alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am); delete bmb; bwt = 0; // already deleted @@ -1026,7 +1043,7 @@ void TCImplementation::makewavelet(uchar *bwt) #endif } -void TCImplementation::maketables() +void TCImplementation::maketables(ulong sampleLength, char tsType, CSA::DeltaVector & notIndexed, const string & niText) { // Calculate BWT end-marker position (of last inserted text) { @@ -1042,13 +1059,21 @@ void TCImplementation::maketables() this->bwtEndPos = i; } +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage before BWT traverse: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + // Build up array for text starting positions - BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); - (*textStartPos)[0] = 0; +// BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); +// (*textStartPos)[0] = 0; // Mapping from end-markers to doc ID's: - uint *endmarkerDocId = new uint[numberOfTexts]; // FIXME Use BlockArray with static_sequence_wvtree_noptrs. - + unsigned logNumberOfTexts = Tools::CeilLog2(numberOfTexts); +// uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1]; + BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, logNumberOfTexts); + + BlockArray* positions = new BlockArray(sampleLength, Tools::CeilLog2(this->n)); uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1]; for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++) sampledpositions[i] = 0; @@ -1056,105 +1081,182 @@ void TCImplementation::maketables() ulong x,p=bwtEndPos; ulong sampleCount = 0; // Keeping track of text position of prev. end-marker seen - ulong posOfSuccEndmarker = n; + ulong posOfSuccEndmarker = n-1; DocId textId = numberOfTexts; ulong ulongmax = 0; ulongmax--; uint alphabetrank_i_tmp =0; - /** - * First pass: populate tables textStartPos and sampledpositions. - */ + // Text length = n + number of bytes not indexed. + TextStorageBuilder tsbuilder(n + niText.length()); + ulong tsb_i = n + niText.length(); // Iterator from text length to 0. + string::const_reverse_iterator nit_i = niText.rbegin(); // Iterator through non-indexed texts + for (ulong i=n-1;iGetPos(i) x=(i==n-1)?0:i+1; - if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) { + uchar c = alphabetrank->access(p, alphabetrank_i_tmp); + + tsbuilder[--tsb_i] = c; // Build TextStorage + + if ((posOfSuccEndmarker - i) % samplerate == 0 && c != '\0') + { set_field(sampledpositions,1,p,1); + (*positions)[sampleCount] = p; sampleCount ++; } - uchar c = alphabetrank->access(p, alphabetrank_i_tmp); if (c == '\0') { - --textId; + unsigned prevTextId = textId; // Cache textId value. + --textId; + /** + * At first c == '\0' it holds that (prevTextId == numberOfTexts), thus, + * we have to search for the first text that is actually *indexed* + * to get correct prevTextId. + */ + if (prevTextId == numberOfTexts) + { + prevTextId = 0; + while (notIndexed.isSet(prevTextId)) + ++ prevTextId; + // Now prevTextId points to the first indexed Doc ID. + } + + /** + * Insert non-indexed texts + */ + while (notIndexed.isSet(textId)) + { + do { + tsbuilder[tsb_i] = *nit_i; + -- tsb_i; + ++ nit_i; + } while (nit_i != niText.rend() && *nit_i != '\0'); + + tsbuilder[tsb_i] = '\0'; + + if (textId == 0) + break; + --textId; + } // Record the order of end-markers in BWT: ulong endmarkerRank = alphabetrank_i_tmp - 1; - endmarkerDocId[endmarkerRank] = (textId + 1) % numberOfTexts; + //set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts); + (*endmarkerDocId)[endmarkerRank] = prevTextId % numberOfTexts; // Store text length and text start position: if (textId < (DocId)numberOfTexts - 1) { - (*textStartPos)[textId + 1] = x; // x is the position of end-marker. - posOfSuccEndmarker = x; +// (*textStartPos)[textId + 1] = x; // x-1 is text position of end-marker. + + posOfSuccEndmarker = i; } - // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis). - p = textId; // Correct LF-mapping to the last char of the previous text. + // LF-mapping from '\0' does not work with this (pseudo) BWT. + // Correct LF-mapping to the last char of the previous text: + p = textId - notIndexed.rank(textId); } else // Now c != '\0', do LF-mapping: p = C[c]+alphabetrank_i_tmp-1; } + while (textId > 0 && notIndexed.isSet(textId-1)) + { + do { + -- tsb_i; + tsbuilder[tsb_i] = *nit_i; + ++ nit_i; + } while (nit_i != niText.rend() && *nit_i != '\0'); + --textId; + } assert(textId == 0); - + assert(tsb_i == 0); + assert(nit_i == niText.rend()); + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage before tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + + textStorage = tsbuilder.InitTextStorage(tsType); + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + sampled = new static_bitsequence_rrr02(sampledpositions, n, 16); delete [] sampledpositions; - ulong sampleLength = sampled->rank1(n-1); assert(sampleCount == sampleLength); + assert(sampled->rank1(n-1) == sampleLength); + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after sampled bit vector: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif // Suffixes store an offset from the text start position suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); - p=bwtEndPos; - textId = numberOfTexts; - - TextStorageBuilder tsbuilder(n); - - /** - * Second pass: populate tables suffixes and suffixDocId. - */ - for (ulong i=n-1;iaccess(p)) { - ulong j = sampled->rank1(p)-1; - - (*suffixDocId)[j] = DocIdAtTextPos(textStartPos, x); - - // calculate offset from text start: - (*suffixes)[j] = x - (*textStartPos)[(*suffixDocId)[j]]; + x = n + niText.length() - 2; + textId = numberOfTexts - 1; + posOfSuccEndmarker = x + 1; + for(ulong i = 0; i < sampleLength; i ++) { + // Find next sampled text position + while ((posOfSuccEndmarker - x) % samplerate != 0 + || notIndexed.isSet(textId)) // Loop over non-indexed + { + --x; + assert(x != ~0lu); + if (textStorage->IsEndmarker(x)) + { + posOfSuccEndmarker = x--; + -- textId; + } } + assert((*positions)[i] < n); + ulong j = sampled->rank1((*positions)[i]); - uchar c = alphabetrank->access(p, alphabetrank_i_tmp); - tsbuilder[i] = c; - - if (c == '\0') + assert(j != 0); // if (j==0) j=sampleLength; + + TextPosition textPos = (x==n-1)?0:x+1; + (*suffixDocId)[j-1] = textId; // textStorage->DocIdAtTextPos(textPos); + assert(textStorage->DocIdAtTextPos(textPos) == textId); + + assert((*suffixDocId)[j-1] < numberOfTexts); + // calculate offset from text start: + (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]); + --x; + if (x != ~0lu && textStorage->IsEndmarker(x)) { - --textId; - // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis). - p = textId; // Correct LF-mapping to the last char of the previous text. + posOfSuccEndmarker = x--; + -- textId; } - else // Now c != '\0', do LF-mapping: - p = C[c]+alphabetrank_i_tmp-1; } - assert(textId == 0); - delete textStartPos; - textStorage = tsbuilder.InitTextStorage(); + delete positions; + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif #ifdef DEBUG_MEMUSAGE std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; HeapProfiler::ResetMaxHeapConsumption(); #endif - alphabet_mapper * am = new alphabet_mapper_none(); - static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? - Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, bmb, am, true); - delete bmb; + /*alphabet_mapper * am = new alphabet_mapper_none(); + static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); // FIXME samplerate? + Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, logNumberOfTexts, bmb, am, true); + delete bmb;*/ // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs! + Doc = new ArrayDoc(endmarkerDocId); + #ifdef DEBUG_MEMUSAGE std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; #endif