X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TCImplementation.cpp;h=32f1898f44b25c372a3c2f9bbd3b1400f2903750;hb=d2a43b1c9d921272cad2b7d2e0cffa2de0975129;hp=2aec680c310b78138bac05cec0ece9b72c31ddab;hpb=e59bd57d53c2d9e5db4c22f2501a9789f62f46c1;p=SXSI%2FTextCollection.git diff --git a/TCImplementation.cpp b/TCImplementation.cpp index 2aec680..32f1898 100644 --- a/TCImplementation.cpp +++ b/TCImplementation.cpp @@ -41,22 +41,22 @@ namespace SXSI { // Save file version info -const uchar TCImplementation::versionFlag = 6; +const uchar TCImplementation::versionFlag = 7; /** * Constructor inits an empty dynamic FM-index. * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. */ TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, - unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_) + unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_, char tsType) : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0) { makewavelet(bwt); // Deletes bwt! bwt = 0; - + // Make sampling tables - maketables(numberOfSamples_); + maketables(numberOfSamples_, tsType); } bool TCImplementation::EmptyText(DocId k) const @@ -581,7 +581,7 @@ TextCollection::document_result TCImplementation::LessThan(uchar const * pattern } -TextCollection::document_result TCImplementation::Kmismaches(uchar const * pattern, unsigned k) const +TextCollection::document_result TCImplementation::KMismaches(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -600,7 +600,7 @@ TextCollection::document_result TCImplementation::Kmismaches(uchar const * patte return result; } -TextCollection::document_result TCImplementation::Kerrors(uchar const * pattern, unsigned k) const +TextCollection::document_result TCImplementation::KErrors(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -661,7 +661,7 @@ TextCollection::full_result TCImplementation::FullContains(uchar const * pattern return result; } -TextCollection::full_result TCImplementation::FullKmismatches(uchar const * pattern, unsigned k) const +TextCollection::full_result TCImplementation::FullKMismatches(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -678,7 +678,7 @@ TextCollection::full_result TCImplementation::FullKmismatches(uchar const * patt return result; } -TextCollection::full_result TCImplementation::FullKerrors(uchar const * pattern, unsigned k) const +TextCollection::full_result TCImplementation::FullKErrors(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -795,7 +795,7 @@ TCImplementation::TCImplementation(FILE *file, unsigned samplerate_) throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength)."); Doc = static_sequence::load(file); - textStorage = new TextStorage(file); + textStorage = TextStorage::Load(file); // FIXME Construct data structures with new samplerate //maketables(); @@ -859,20 +859,20 @@ ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pa //first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const { + ulong sum=0; if (d[m]<=k) // range of suffixes with at most k-errors found { if (sp<=ep) result.push_back(std::make_pair(sp, ep)); - return (sp<=ep)?ep-sp+1:0ul; + sum += (sp<=ep)?ep-sp+1:0ul; } if (sp>ep || j==0) - return 0; + return sum; ulong *dnew = new ulong[m+1]; int c; ulong spnew; ulong p,lowerbound; ulong epnew; - ulong sum=0; vector chars = alphabetrank->accessAll(sp, ep); for (vector::iterator it = chars.begin(); it != chars.end(); ++it) { @@ -1022,7 +1022,7 @@ void TCImplementation::makewavelet(uchar *bwt) #endif } -void TCImplementation::maketables(ulong sampleLength) +void TCImplementation::maketables(ulong sampleLength, char tsType) { // Calculate BWT end-marker position (of last inserted text) { @@ -1110,7 +1110,7 @@ void TCImplementation::maketables(ulong sampleLength) HeapProfiler::ResetMaxHeapConsumption(); #endif - textStorage = tsbuilder.InitTextStorage(); + textStorage = tsbuilder.InitTextStorage(tsType); #ifdef DEBUG_MEMUSAGE std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; @@ -1131,11 +1131,6 @@ void TCImplementation::maketables(ulong sampleLength) suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); -#ifdef DEBUG_MEMUSAGE - std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; - HeapProfiler::ResetMaxHeapConsumption(); -#endif - x = n - 2; posOfSuccEndmarker = n-1; for(ulong i=0; iIsEndmarker(x)) posOfSuccEndmarker = x--; } assert((*positions)[i] < n); @@ -1158,12 +1154,17 @@ void TCImplementation::maketables(ulong sampleLength) // calculate offset from text start: (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]); --x; - if (tsbuilder[x] == '\0') + if (x != ~0lu && textStorage->IsEndmarker(x)) posOfSuccEndmarker = x--; } delete positions; +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + /** * Second pass: check tables */