From: nvalimak Date: Thu, 14 May 2009 11:56:18 +0000 (+0000) Subject: Kmismatches Kerrors X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;h=3c8e8e6146312f6f55046c451897234fed56608d;p=SXSI%2FTextCollection.git Kmismatches Kerrors git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@390 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/TCImplementation.cpp b/TCImplementation.cpp index 5959e6a..c3b430a 100644 --- a/TCImplementation.cpp +++ b/TCImplementation.cpp @@ -527,30 +527,7 @@ TextCollection::document_result TCImplementation::Contains(uchar const * pattern // We want unique document indentifiers, using std::set to collect them std::set resultSet; - - // Check each occurrence - for (; sp <= ep; ++sp) - { - TextPosition i = sp; - uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->access(i); - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; - resultSet.insert(Doc->access(endmarkerRank)); - } - else - { - DocId di = (*suffixDocId)[sampled->rank1(i)-1]; - assert((unsigned)di < numberOfTexts); - resultSet.insert(di); - } - } + EnumerateDocuments(resultSet, sp, ep); // Convert std::set to std::vector TextCollection::document_result result(resultSet.begin(), resultSet.end()); @@ -569,33 +546,7 @@ TextCollection::document_result TCImplementation::Contains(uchar const * pattern // We want unique document indentifiers, using std::set to collect them std::set resultSet; - - // Check each occurrence - for (; sp <= ep; ++sp) - { - TextPosition i = sp; - uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->access(i); - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; - DocId docId = Doc->access(endmarkerRank); - if (docId >= begin && docId <= end) - resultSet.insert(docId); - } - else - { - DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; - assert((unsigned)docId < numberOfTexts); - if (docId >= begin && docId <= end) - resultSet.insert(docId); - } - } + EnumerateDocuments(resultSet, sp, ep, begin, end); // Convert std::set to std::vector TextCollection::document_result result(resultSet.begin(), resultSet.end()); @@ -628,6 +579,50 @@ TextCollection::document_result TCImplementation::LessThan(uchar const * pattern return EnumerateEndmarkers(sp, ep, begin, end); } + +TextCollection::document_result TCImplementation::Kmismaches(uchar const * pattern, unsigned k) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // empty result set + + suffix_range_vector ranges; + kmismatches(ranges, pattern, 0, n-1, m, k); + std::set resultSet; + + for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it) + // Iterate through docs in [sp,ep]: + EnumerateDocuments(resultSet, (*it).first, (*it).second); + + // Convert std::set to std::vector + TextCollection::document_result result(resultSet.begin(), resultSet.end()); + return result; +} + +TextCollection::document_result TCImplementation::Kerrors(uchar const * pattern, unsigned k) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // empty result set + + suffix_range_vector ranges; + ulong *dd = new ulong[m+1]; + for (ulong i=0;i resultSet; + for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it) + // Iterate through docs in [sp,ep]: + EnumerateDocuments(resultSet, (*it).first, (*it).second); + + // Convert std::set to std::vector + TextCollection::document_result result(resultSet.begin(), resultSet.end()); + return result; +} + + /** * Full result set queries */ @@ -643,34 +638,7 @@ TextCollection::full_result TCImplementation::FullContains(uchar const * pattern full_result result; result.reserve(ep-sp+1); // Try to avoid reallocation. - - // Report each occurrence - for (; sp <= ep; ++sp) - { - TextPosition i = sp; - TextPosition dist = 0; - uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i); - ++ dist; - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; - DocId docId = Doc->access(endmarkerRank); - result.push_back(make_pair(docId, dist)); - } - else - { - TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; - DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; - - result.push_back(make_pair(docId, textPos)); - } - } + EnumeratePositions(result, sp, ep); return result; } @@ -687,39 +655,46 @@ TextCollection::full_result TCImplementation::FullContains(uchar const * pattern full_result result; result.reserve(ep-sp+1); // Try to avoid reallocation. + EnumeratePositions(result, sp, ep, begin, end); + + return result; +} - // Report each occurrence - for (; sp <= ep; ++sp) - { - TextPosition i = sp; - TextPosition dist = 0; - uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i); - ++ dist; - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; - - // End-marker that we found belongs to the "preceeding" doc in collection: - DocId docId = Doc->access(endmarkerRank); - if (docId >= begin && docId <= end) - result.push_back(make_pair(docId, dist)); - } - else - { - TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; - DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; +TextCollection::full_result TCImplementation::FullKmismatches(uchar const * pattern, unsigned k) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::full_result(); // empty result set - if (docId >= begin && docId <= end) - result.push_back(make_pair(docId, textPos)); - } - } - + suffix_range_vector ranges; + ulong count = kmismatches(ranges, pattern, 0, n-1, m, k); + + TextCollection::full_result result; + result.reserve(count); // avoid reallocation. + for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it) + // Iterate through docs in [sp,ep]: + EnumeratePositions(result, (*it).first, (*it).second); + return result; +} + +TextCollection::full_result TCImplementation::FullKerrors(uchar const * pattern, unsigned k) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::full_result(); // empty result set + + suffix_range_vector ranges; + ulong *dd = new ulong[m+1]; + for (unsigned i=0;i=1) + { + c = (int)pattern[--i]; + *sp = C[c]+alphabetrank->rank(c,*sp-1); + *ep = C[c]+alphabetrank->rank(c,*ep)-1; + } + if (*sp<=*ep) + return *ep - *sp + 1; + else + return 0; +} +ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k) const +{ + if (sp>ep) return 0; + if (j == 0) + { + result.push_back(std::make_pair(sp,ep)); + return ep-sp+1; + } + int c; + ulong spnew; + ulong epnew; + int knew; + ulong sum=0; + if (k==0) + { + sum = searchPrefix(pattern, j, &sp, &ep); + if (sp<=ep) + result.push_back(std::make_pair(sp, ep)); + return sum; + } + vector chars = alphabetrank->accessAll(sp, ep); + for (vector::iterator it = chars.begin(); it != chars.end(); ++it) + { + if (*it == 0) + continue; // skip '\0' + c = *it; + spnew = C[c]+alphabetrank->rank(c,sp-1); + epnew = C[c]+alphabetrank->rank(c,ep)-1; + if (c!=pattern[j-1]) knew = (int)k-1; else knew = k; + if (knew>=0) sum += kmismatches(result, pattern, spnew, epnew, j-1, knew); + } + return sum; +} + +//first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i +ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const +{ + cout << "j = " << j << ", k = " << k << ", d:"; + for (unsigned i = 0; i < m+1; ++i) + cout << " " << d[i]; + cout << endl; + + if (d[m]<=k) // range of suffixes with at most k-errors found + { + if (sp<=ep) + result.push_back(std::make_pair(sp, ep)); + return (sp<=ep)?ep-sp+1:0ul; + } + if (sp>ep || j==0) + return 0; + ulong *dnew = new ulong[m+1]; + int c; + ulong spnew; + ulong p,lowerbound; + ulong epnew; + ulong sum=0; + vector chars = alphabetrank->accessAll(sp, ep); + for (vector::iterator it = chars.begin(); it != chars.end(); ++it) + { + if (*it == 0) + continue; // skip '\0' + c = *it; + spnew = C[c]+alphabetrank->rank(c,sp-1); + epnew = C[c]+alphabetrank->rank(c,ep)-1; + if (spnew>epnew) continue; + dnew[0]=m+k-j+1; + lowerbound=k+1; + for (p=1; p<=m; p++) { + dnew[p]=myminofthree(d[p]+1,dnew[p-1]+1,(c==pattern[m-p])?d[p-1]:(d[p-1]+1)); + if (dnew[p] namespace SXSI { @@ -117,6 +117,8 @@ public: document_result Equal(uchar const *) const; document_result Contains(uchar const *) const; document_result LessThan(uchar const *) const; + document_result Kmismaches(uchar const *, unsigned) const; + document_result Kerrors(uchar const *, unsigned) const; document_result Prefix(uchar const *, DocId, DocId) const; document_result Suffix(uchar const *, DocId, DocId) const; @@ -127,12 +129,16 @@ public: // Definition of full_result is inherited from SXSI::TextCollection. full_result FullContains(uchar const *) const; full_result FullContains(uchar const *, DocId, DocId) const; + full_result FullKmismatches(uchar const *, unsigned) const; + full_result FullKerrors(uchar const *, unsigned) const; // Index from/to disk TCImplementation(FILE *, unsigned); void Save(FILE *) const; private: + typedef std::vector > suffix_range_vector; + static const uchar versionFlag; TextPosition n; unsigned samplerate; @@ -164,7 +170,9 @@ private: ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const; ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - + ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const; + ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const; + ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const; /** * Count end-markers in given interval */ @@ -223,6 +231,146 @@ private: return Doc->access(sp, ep-1, min, max); } + + /** + * Enumerate documents in given interval [sp, ep] + */ + inline void EnumerateDocuments(std::set &resultSet, TextPosition sp, TextPosition ep) const + { + // We want unique document indentifiers, using std::set to collect them + // FIXME use unordered_set? + uint tmp_rank_c = 0; // Cache rank value of c. + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + resultSet.insert(Doc->access(endmarkerRank)); + } + else + { + DocId di = (*suffixDocId)[sampled->rank1(i)-1]; + assert((unsigned)di < numberOfTexts); + resultSet.insert(di); + } + } + } + + /** + * Enumerate documents in given interval [sp, ep] + * and within [begin, end] + */ + inline void EnumerateDocuments(std::set &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const + { + // We want unique document indentifiers, using std::set to collect them + uint tmp_rank_c = 0; // Cache rank value of c. + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + DocId docId = Doc->access(endmarkerRank); + if (docId >= begin && docId <= end) + resultSet.insert(docId); + } + else + { + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; + assert((unsigned)docId < numberOfTexts); + if (docId >= begin && docId <= end) + resultSet.insert(docId); + } + } + } + + /** + * Enumerate document+position pairs (full_result) of + * each suffix in given interval. + */ + inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const + { + uint tmp_rank_c = 0; // Cache rank value of c. + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + TextPosition dist = 0; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + ++ dist; + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + DocId docId = Doc->access(endmarkerRank); + result.push_back(make_pair(docId, dist)); + } + else + { + TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; + + result.push_back(make_pair(docId, textPos)); + } + } + } + + /** + * Enumerate document+position pairs (full_result) of + * each suffix in given interval and within [begin, end]. + */ + inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const + { + uint tmp_rank_c = 0; // Cache rank value of c. + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + TextPosition dist = 0; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + ++ dist; + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + DocId docId = Doc->access(endmarkerRank); + if (docId >= begin && docId <= end) + result.push_back(make_pair(docId, dist)); + } + else + { + TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; + + if (docId >= begin && docId <= end) + result.push_back(make_pair(docId, textPos)); + } + } + } + }; // class TCImplementation } // namespace SXSI diff --git a/TextCollection.h b/TextCollection.h index 1e34ab0..a1d065d 100644 --- a/TextCollection.h +++ b/TextCollection.h @@ -153,7 +153,9 @@ namespace SXSI virtual document_result Equal(uchar const *) const = 0; virtual document_result Contains(uchar const *) const = 0; virtual document_result LessThan(uchar const *) const = 0; - + virtual document_result Kmismaches(uchar const *, unsigned) const = 0; + virtual document_result Kerrors(uchar const *, unsigned) const = 0; + /** * Document reporting queries for given DocId interval. */ @@ -174,6 +176,9 @@ namespace SXSI // Full reporting query for given DocId interval virtual full_result FullContains(uchar const *, DocId, DocId) const = 0; + virtual full_result FullKmismatches(uchar const *, unsigned) const = 0; + virtual full_result FullKerrors(uchar const *, unsigned) const = 0; + protected: // Protected constructor; use TextCollectionBuilder TextCollection() { };