// We want unique document indentifiers, using std::set to collect them
std::set<DocId> resultSet;
-
- // Check each occurrence
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- uchar c = alphabetrank->access(i);
- while (c != '\0' && !sampled->access(i))
- {
- i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
- c = alphabetrank->access(i);
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
- resultSet.insert(Doc->access(endmarkerRank));
- }
- else
- {
- DocId di = (*suffixDocId)[sampled->rank1(i)-1];
- assert((unsigned)di < numberOfTexts);
- resultSet.insert(di);
- }
- }
+ EnumerateDocuments(resultSet, sp, ep);
// Convert std::set to std::vector
TextCollection::document_result result(resultSet.begin(), resultSet.end());
// We want unique document indentifiers, using std::set to collect them
std::set<DocId> resultSet;
-
- // Check each occurrence
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- uchar c = alphabetrank->access(i);
- while (c != '\0' && !sampled->access(i))
- {
- i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
- c = alphabetrank->access(i);
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
- DocId docId = Doc->access(endmarkerRank);
- if (docId >= begin && docId <= end)
- resultSet.insert(docId);
- }
- else
- {
- DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
- assert((unsigned)docId < numberOfTexts);
- if (docId >= begin && docId <= end)
- resultSet.insert(docId);
- }
- }
+ EnumerateDocuments(resultSet, sp, ep, begin, end);
// Convert std::set to std::vector
TextCollection::document_result result(resultSet.begin(), resultSet.end());
return EnumerateEndmarkers(sp, ep, begin, end);
}
+
+TextCollection::document_result TCImplementation::Kmismaches(uchar const * pattern, unsigned k) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // empty result set
+
+ suffix_range_vector ranges;
+ kmismatches(ranges, pattern, 0, n-1, m, k);
+ std::set<DocId> resultSet;
+
+ for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
+ // Iterate through docs in [sp,ep]:
+ EnumerateDocuments(resultSet, (*it).first, (*it).second);
+
+ // Convert std::set to std::vector
+ TextCollection::document_result result(resultSet.begin(), resultSet.end());
+ return result;
+}
+
+TextCollection::document_result TCImplementation::Kerrors(uchar const * pattern, unsigned k) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // empty result set
+
+ suffix_range_vector ranges;
+ ulong *dd = new ulong[m+1];
+ for (ulong i=0;i<m+1;i++)
+ dd[i]=i;
+ kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
+ delete [] dd;
+
+ std::set<DocId> resultSet;
+ for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
+ // Iterate through docs in [sp,ep]:
+ EnumerateDocuments(resultSet, (*it).first, (*it).second);
+
+ // Convert std::set to std::vector
+ TextCollection::document_result result(resultSet.begin(), resultSet.end());
+ return result;
+}
+
+
/**
* Full result set queries
*/
full_result result;
result.reserve(ep-sp+1); // Try to avoid reallocation.
-
- // Report each occurrence
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- TextPosition dist = 0;
- uchar c = alphabetrank->access(i);
- while (c != '\0' && !sampled->access(i))
- {
- i = C[c]+alphabetrank->rank(c,i)-1;
- c = alphabetrank->access(i);
- ++ dist;
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
- DocId docId = Doc->access(endmarkerRank);
- result.push_back(make_pair(docId, dist));
- }
- else
- {
- TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
- DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
-
- result.push_back(make_pair(docId, textPos));
- }
- }
+ EnumeratePositions(result, sp, ep);
return result;
}
full_result result;
result.reserve(ep-sp+1); // Try to avoid reallocation.
+ EnumeratePositions(result, sp, ep, begin, end);
+
+ return result;
+}
- // Report each occurrence
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- TextPosition dist = 0;
- uchar c = alphabetrank->access(i);
- while (c != '\0' && !sampled->access(i))
- {
- i = C[c]+alphabetrank->rank(c,i)-1;
- c = alphabetrank->access(i);
- ++ dist;
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = Doc->access(endmarkerRank);
- if (docId >= begin && docId <= end)
- result.push_back(make_pair(docId, dist));
- }
- else
- {
- TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
- DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+TextCollection::full_result TCImplementation::FullKmismatches(uchar const * pattern, unsigned k) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::full_result(); // empty result set
- if (docId >= begin && docId <= end)
- result.push_back(make_pair(docId, textPos));
- }
- }
-
+ suffix_range_vector ranges;
+ ulong count = kmismatches(ranges, pattern, 0, n-1, m, k);
+
+ TextCollection::full_result result;
+ result.reserve(count); // avoid reallocation.
+ for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
+ // Iterate through docs in [sp,ep]:
+ EnumeratePositions(result, (*it).first, (*it).second);
+ return result;
+}
+
+TextCollection::full_result TCImplementation::FullKerrors(uchar const * pattern, unsigned k) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::full_result(); // empty result set
+
+ suffix_range_vector ranges;
+ ulong *dd = new ulong[m+1];
+ for (unsigned i=0;i<m+1;i++)
+ dd[i]=i;
+ ulong count = kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
+ delete [] dd;
+
+ TextCollection::full_result result;
+ result.reserve(count); // avoid reallocation.
+ for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
+ // Iterate through docs in [sp,ep]:
+ EnumeratePositions(result, (*it).first, (*it).second);
return result;
}
/**
* Rest of the functions follow...
*/
+ulong TCImplementation::searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const
+{
+ int c;
+ while (*sp<=*ep && i>=1)
+ {
+ c = (int)pattern[--i];
+ *sp = C[c]+alphabetrank->rank(c,*sp-1);
+ *ep = C[c]+alphabetrank->rank(c,*ep)-1;
+ }
+ if (*sp<=*ep)
+ return *ep - *sp + 1;
+ else
+ return 0;
+}
+ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k) const
+{
+ if (sp>ep) return 0;
+ if (j == 0)
+ {
+ result.push_back(std::make_pair(sp,ep));
+ return ep-sp+1;
+ }
+ int c;
+ ulong spnew;
+ ulong epnew;
+ int knew;
+ ulong sum=0;
+ if (k==0)
+ {
+ sum = searchPrefix(pattern, j, &sp, &ep);
+ if (sp<=ep)
+ result.push_back(std::make_pair(sp, ep));
+ return sum;
+ }
+ vector<int> chars = alphabetrank->accessAll(sp, ep);
+ for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
+ {
+ if (*it == 0)
+ continue; // skip '\0'
+ c = *it;
+ spnew = C[c]+alphabetrank->rank(c,sp-1);
+ epnew = C[c]+alphabetrank->rank(c,ep)-1;
+ if (c!=pattern[j-1]) knew = (int)k-1; else knew = k;
+ if (knew>=0) sum += kmismatches(result, pattern, spnew, epnew, j-1, knew);
+ }
+ return sum;
+}
+
+//first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i
+ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const
+{
+ cout << "j = " << j << ", k = " << k << ", d:";
+ for (unsigned i = 0; i < m+1; ++i)
+ cout << " " << d[i];
+ cout << endl;
+
+ if (d[m]<=k) // range of suffixes with at most k-errors found
+ {
+ if (sp<=ep)
+ result.push_back(std::make_pair(sp, ep));
+ return (sp<=ep)?ep-sp+1:0ul;
+ }
+ if (sp>ep || j==0)
+ return 0;
+ ulong *dnew = new ulong[m+1];
+ int c;
+ ulong spnew;
+ ulong p,lowerbound;
+ ulong epnew;
+ ulong sum=0;
+ vector<int> chars = alphabetrank->accessAll(sp, ep);
+ for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
+ {
+ if (*it == 0)
+ continue; // skip '\0'
+ c = *it;
+ spnew = C[c]+alphabetrank->rank(c,sp-1);
+ epnew = C[c]+alphabetrank->rank(c,ep)-1;
+ if (spnew>epnew) continue;
+ dnew[0]=m+k-j+1;
+ lowerbound=k+1;
+ for (p=1; p<=m; p++) {
+ dnew[p]=myminofthree(d[p]+1,dnew[p-1]+1,(c==pattern[m-p])?d[p-1]:(d[p-1]+1));
+ if (dnew[p]<lowerbound)
+ lowerbound = dnew[p];
+ }
+ if (lowerbound<=k)
+ sum += kerrors(result, pattern, spnew, epnew, j-1, k,dnew,m);
+ }
+ delete [] dnew;
+ return sum;
+}
+
ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
{
#undef bitset
#include "TextStorage.h"
-
+#include <set>
namespace SXSI
{
document_result Equal(uchar const *) const;
document_result Contains(uchar const *) const;
document_result LessThan(uchar const *) const;
+ document_result Kmismaches(uchar const *, unsigned) const;
+ document_result Kerrors(uchar const *, unsigned) const;
document_result Prefix(uchar const *, DocId, DocId) const;
document_result Suffix(uchar const *, DocId, DocId) const;
// Definition of full_result is inherited from SXSI::TextCollection.
full_result FullContains(uchar const *) const;
full_result FullContains(uchar const *, DocId, DocId) const;
+ full_result FullKmismatches(uchar const *, unsigned) const;
+ full_result FullKerrors(uchar const *, unsigned) const;
// Index from/to disk
TCImplementation(FILE *, unsigned);
void Save(FILE *) const;
private:
+ typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
+
static const uchar versionFlag;
TextPosition n;
unsigned samplerate;
ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
-
+ ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
+ ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
+ ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const;
/**
* Count end-markers in given interval
*/
return Doc->access(sp, ep-1, min, max);
}
+
+ /**
+ * Enumerate documents in given interval [sp, ep]
+ */
+ inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
+ {
+ // We want unique document indentifiers, using std::set to collect them
+ // FIXME use unordered_set?
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ resultSet.insert(Doc->access(endmarkerRank));
+ }
+ else
+ {
+ DocId di = (*suffixDocId)[sampled->rank1(i)-1];
+ assert((unsigned)di < numberOfTexts);
+ resultSet.insert(di);
+ }
+ }
+ }
+
+ /**
+ * Enumerate documents in given interval [sp, ep]
+ * and within [begin, end]
+ */
+ inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+ {
+ // We want unique document indentifiers, using std::set to collect them
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ DocId docId = Doc->access(endmarkerRank);
+ if (docId >= begin && docId <= end)
+ resultSet.insert(docId);
+ }
+ else
+ {
+ DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+ assert((unsigned)docId < numberOfTexts);
+ if (docId >= begin && docId <= end)
+ resultSet.insert(docId);
+ }
+ }
+ }
+
+ /**
+ * Enumerate document+position pairs (full_result) of
+ * each suffix in given interval.
+ */
+ inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
+ {
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ TextPosition dist = 0;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ ++ dist;
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ DocId docId = Doc->access(endmarkerRank);
+ result.push_back(make_pair(docId, dist));
+ }
+ else
+ {
+ TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
+ DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+
+ result.push_back(make_pair(docId, textPos));
+ }
+ }
+ }
+
+ /**
+ * Enumerate document+position pairs (full_result) of
+ * each suffix in given interval and within [begin, end].
+ */
+ inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+ {
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ TextPosition dist = 0;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ ++ dist;
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ DocId docId = Doc->access(endmarkerRank);
+ if (docId >= begin && docId <= end)
+ result.push_back(make_pair(docId, dist));
+ }
+ else
+ {
+ TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
+ DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+
+ if (docId >= begin && docId <= end)
+ result.push_back(make_pair(docId, textPos));
+ }
+ }
+ }
+
}; // class TCImplementation
} // namespace SXSI