From 7cdaf25b1e5f1890e359b3ad37ab7ec2c9e30d5a Mon Sep 17 00:00:00 2001 From: nvalimak Date: Fri, 29 May 2009 14:37:37 +0000 Subject: [PATCH] LZ index support git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@418 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- TCImplementation.cpp | 41 +++++++++++++++++++++-------------------- TCImplementation.h | 32 ++++++++++++++++---------------- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/TCImplementation.cpp b/TCImplementation.cpp index 2aec680..c512306 100644 --- a/TCImplementation.cpp +++ b/TCImplementation.cpp @@ -48,15 +48,15 @@ const uchar TCImplementation::versionFlag = 6; * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. */ TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, - unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_) + unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_, char tsType) : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0) { makewavelet(bwt); // Deletes bwt! bwt = 0; - + // Make sampling tables - maketables(numberOfSamples_); + maketables(numberOfSamples_, tsType); } bool TCImplementation::EmptyText(DocId k) const @@ -581,7 +581,7 @@ TextCollection::document_result TCImplementation::LessThan(uchar const * pattern } -TextCollection::document_result TCImplementation::Kmismaches(uchar const * pattern, unsigned k) const +TextCollection::document_result TCImplementation::KMismaches(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -600,7 +600,7 @@ TextCollection::document_result TCImplementation::Kmismaches(uchar const * patte return result; } -TextCollection::document_result TCImplementation::Kerrors(uchar const * pattern, unsigned k) const +TextCollection::document_result TCImplementation::KErrors(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -661,7 +661,7 @@ TextCollection::full_result TCImplementation::FullContains(uchar const * pattern return result; } -TextCollection::full_result TCImplementation::FullKmismatches(uchar const * pattern, unsigned k) const +TextCollection::full_result TCImplementation::FullKMismatches(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -678,7 +678,7 @@ TextCollection::full_result TCImplementation::FullKmismatches(uchar const * patt return result; } -TextCollection::full_result TCImplementation::FullKerrors(uchar const * pattern, unsigned k) const +TextCollection::full_result TCImplementation::FullKErrors(uchar const * pattern, unsigned k) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -795,7 +795,7 @@ TCImplementation::TCImplementation(FILE *file, unsigned samplerate_) throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength)."); Doc = static_sequence::load(file); - textStorage = new TextStorage(file); + textStorage = TextStorage::Load(file); // FIXME Construct data structures with new samplerate //maketables(); @@ -859,20 +859,20 @@ ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pa //first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const { + ulong sum=0; if (d[m]<=k) // range of suffixes with at most k-errors found { if (sp<=ep) result.push_back(std::make_pair(sp, ep)); - return (sp<=ep)?ep-sp+1:0ul; + sum += (sp<=ep)?ep-sp+1:0ul; } if (sp>ep || j==0) - return 0; + return sum; ulong *dnew = new ulong[m+1]; int c; ulong spnew; ulong p,lowerbound; ulong epnew; - ulong sum=0; vector chars = alphabetrank->accessAll(sp, ep); for (vector::iterator it = chars.begin(); it != chars.end(); ++it) { @@ -1022,7 +1022,7 @@ void TCImplementation::makewavelet(uchar *bwt) #endif } -void TCImplementation::maketables(ulong sampleLength) +void TCImplementation::maketables(ulong sampleLength, char tsType) { // Calculate BWT end-marker position (of last inserted text) { @@ -1110,7 +1110,7 @@ void TCImplementation::maketables(ulong sampleLength) HeapProfiler::ResetMaxHeapConsumption(); #endif - textStorage = tsbuilder.InitTextStorage(); + textStorage = tsbuilder.InitTextStorage(tsType); #ifdef DEBUG_MEMUSAGE std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; @@ -1131,11 +1131,6 @@ void TCImplementation::maketables(ulong sampleLength) suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); -#ifdef DEBUG_MEMUSAGE - std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; - HeapProfiler::ResetMaxHeapConsumption(); -#endif - x = n - 2; posOfSuccEndmarker = n-1; for(ulong i=0; iIsEndmarker(x)) posOfSuccEndmarker = x--; } assert((*positions)[i] < n); @@ -1158,12 +1154,17 @@ void TCImplementation::maketables(ulong sampleLength) // calculate offset from text start: (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]); --x; - if (tsbuilder[x] == '\0') + if (x != ~0lu && textStorage->IsEndmarker(x)) posOfSuccEndmarker = x--; } delete positions; +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + /** * Second pass: check tables */ diff --git a/TCImplementation.h b/TCImplementation.h index 65bf22c..75f1d40 100644 --- a/TCImplementation.h +++ b/TCImplementation.h @@ -39,6 +39,7 @@ # define W 32 #endif #undef bitset +#undef bitget #include "TextStorage.h" #include @@ -53,31 +54,30 @@ namespace SXSI */ class TCImplementation : public SXSI::TextCollection { public: - TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong); + TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong, char); ~TCImplementation(); bool EmptyText(DocId) const; /** - * Returns a pointer to the original text. + * Extracting one text. * - * Do *not* try to free the array. - * (However, this implementation is suspect to change.) - * - * See TextStorage.h for details. + * Call DeleteText() for each pointer returned by GetText() + * to avoid possible memory leaks. */ uchar * GetText(DocId) const; + void DeleteText(uchar *text) const + { textStorage->DeleteText(text); } /** * Returns a pointer to the beginning of texts i, i+1, ..., j. * Texts are separated by a '\0' byte. * - * Do *not* try to free the array. - * (However, this implementation is suspect to change.) - * - * See TextStorage.h for details. + * Call DeleteText() for each pointer returned by GetText() + * to avoid possible memory leaks. */ - uchar * GetText(DocId i, DocId j) const; + uchar * GetText(DocId i, DocId j) const + { return textStorage->GetText(i, j); } /** * Returns a substring of given text ID. @@ -117,8 +117,8 @@ public: document_result Equal(uchar const *) const; document_result Contains(uchar const *) const; document_result LessThan(uchar const *) const; - document_result Kmismaches(uchar const *, unsigned) const; - document_result Kerrors(uchar const *, unsigned) const; + document_result KMismaches(uchar const *, unsigned) const; + document_result KErrors(uchar const *, unsigned) const; document_result Prefix(uchar const *, DocId, DocId) const; document_result Suffix(uchar const *, DocId, DocId) const; @@ -129,8 +129,8 @@ public: // Definition of full_result is inherited from SXSI::TextCollection. full_result FullContains(uchar const *) const; full_result FullContains(uchar const *, DocId, DocId) const; - full_result FullKmismatches(uchar const *, unsigned) const; - full_result FullKerrors(uchar const *, unsigned) const; + full_result FullKMismatches(uchar const *, unsigned) const; + full_result FullKErrors(uchar const *, unsigned) const; // Index from/to disk TCImplementation(FILE *, unsigned); @@ -165,7 +165,7 @@ private: // Following methods are not part of the public API uchar * BWT(uchar *); void makewavelet(uchar *); - void maketables(ulong); + void maketables(ulong, char); DocId DocIdAtTextPos(BlockArray*, TextPosition) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const; -- 2.17.1