--- /dev/null
+/**
+ * k-errors queries
+ *
+ * Aligment using suffix filters.
+ */
+
+#ifndef _IndelQuery_H_
+#define _IndelQuery_H_
+
+#include "ResultSet.h"
+#include "EditDistance.h"
+#include "Query.h"
+
+#include <string>
+#include <cstdlib>
+#include <cstring>
+
+namespace SXSI
+{
+
+class IndelQuery : public Query
+{
+public:
+ IndelQuery(TextCollection const *tc_)
+ : Query(tc_), checked(0), splitPos(0), ed(0)
+ {
+ checked = new ResultSet(tc->getLength());
+ }
+
+ virtual ~IndelQuery()
+ {
+ delete checked; checked = 0;
+ delete [] splitPos; splitPos = 0;
+ delete ed; ed = 0;
+ }
+
+protected:
+ void firstStep(void)
+ {
+ checked->clear();
+ constructPartition();
+
+ uchar *reversepat = new uchar[patlen+1];
+ for (unsigned i = klimit+1; i > 1; --i)
+ {
+ if (debug) std::cout << "new try: " << splitPos[i] << ", split = " << i << std::endl;
+
+ for (unsigned j = 0; j < splitPos[i-1]; j++)
+ reversepat[j] = pat[splitPos[i-1]-j-1];
+ reversepat[splitPos[i-1]] = 0;
+ ed = new MyersEditDistanceIncremental(reversepat, splitPos[i-1], i-1);
+
+ exactMatch(i);
+
+ delete ed;
+ ed = 0;
+ }
+ delete [] reversepat;
+ reversepat = 0;
+
+ if (debug) std::cout << "new try: " << splitPos[1] << ", split = 1" << std::endl;
+ ed = 0; // Not used, exact search only for the first block
+ exactMatch(1);
+ }
+
+private:
+
+ void exactMatch(const unsigned split)
+ {
+ unsigned i = splitPos[split];
+ const unsigned border = splitPos[split-1];
+ if (debug) std::cerr << "Exact match: i = " << i << ", border = " << border << std::endl;
+
+ while (i != border && pushChar(pat[i-1]))
+ --i;
+
+ if (debug) std::cerr << "matched upto: i = " << i << std::endl;
+ if (i == 0)
+ checkMatch();
+ else
+ if (i == border)
+ nextStep(split);
+
+ // Rewind stack
+ match.clear();
+ while (!smin.empty()) smin.pop();
+ while (!smax.empty()) smax.pop();
+ smin.push(0);
+ smax.push(textlen-1);
+ }
+
+ void nextStep(const unsigned split)
+ {
+ // Maximum edit distance for the current filter is split-1
+ if (ed->dist() <= split-1) checkMatch();
+
+ if (ed->end()) return;
+
+ // ALPHABET has been defined in Query
+ for (const char *cp = ALPHABET; cp < ALPHABET + ALPHABET_SIZE; ++cp) {
+ char c = *cp;
+ if (!pushChar(c)) continue;
+ const unsigned colmin = ed->pushChar(c);
+
+ // Edit distance is for reversed pattern, compute prefix lenght:
+ const unsigned prefixl = ed->length - ed->getSuffixLength();
+ // How many errors are we allowing on a prefix of given length?
+ unsigned i = 1;
+ while (splitPos[i] < prefixl)
+ ++i;
+
+ assert(i < klimit+1);
+ assert(split-1 > i-1);
+ const unsigned maxk = (split-1) - (i-1);
+
+ if (debug)
+ std::cerr << "c = " << c << ", split = " << split << ", maxk = " << maxk << ", colmin = " << colmin
+ << ", prefixl = " << prefixl << ", dist = " << ed->dist() << std::endl;
+
+ if (colmin <= maxk) nextStep(split);
+
+ //if (debug) std::cerr << "pop c = " << c << std::endl;
+ ed->popChar();
+ popChar();
+ }
+ }
+
+ inline void checkMatch(void)
+ {
+// if (debug) std::cerr << "min,max: " << smin.top() << ", " << smax.top() << std::endl;
+ ulong min = smin.top();
+ ulong max = smax.top();
+ MyersEditDistance med(patlen, klimit);
+ for (ulong i = min; i <= max; i++) {
+ if (!checked->get(i))
+ {
+ checked->set(i);
+ med.init((uchar const *)pat);
+ validateCandidate(med, i);
+ }
+ }
+ }
+
+ void validateCandidate(MyersEditDistance &med, ulong sapos)
+ {
+ uchar *candidate = tc->getSuffix(sapos, patlen+klimit);
+ unsigned l = strlen((char*)candidate);
+ if (l < patlen)
+ {
+ delete [] candidate;
+ return;
+ }
+
+ std::pair<unsigned, unsigned> dist(0, patlen);
+ if (klimit == 0)
+ { // MyersEditDistance class does not support klimit == 0
+ unsigned i = 0;
+ while (i < patlen && pat[i] == candidate[i])
+ ++i;
+ if (i < patlen)
+ {
+ delete [] candidate;
+ return; // There is at least one mismatch (and klimit == 0)
+ }
+ }
+ else // Using prefixDist() when klimit > 0
+ dist = med.prefixDist(candidate, l);
+
+ if (dist.first <= klimit)
+ newMatch(sapos);
+ delete [] candidate;
+ }
+
+
+ inline void newMatch(ulong sapos)
+ {
+ TextCollection::DocId di = tc->getDoc(sapos);
+ result.insert(di);
+ }
+
+
+ void constructPartition(void)
+ {
+ delete [] splitPos;
+ splitPos = new unsigned[klimit+2];
+
+ if (klimit == 0)
+ {
+ splitPos[0] = 0;
+ splitPos[1]=patlen;
+ return;
+ }
+
+ // First piece size
+ unsigned psize = patlen/(klimit+2);
+ if (psize == 0)
+ {
+ std::cerr << "error: First psize == 0, for patlen = " << patlen << ", klimit = " << klimit << std::endl;
+ std::exit(1);
+ }
+ splitPos[0] = 0;
+ splitPos[1] = 2*psize;
+
+ // Last klimit are of size:
+ psize = (patlen-splitPos[1])/klimit;
+ if (psize == 0)
+ {
+ std::cerr << "error: psize == 0, for patlen = " << patlen << ", klimit = " << klimit << std::endl;
+ std::exit(1);
+ }
+
+ for (unsigned i = 2; i <= klimit; i++)
+ splitPos[i]=splitPos[i-1]+psize;
+ splitPos[klimit+1]=patlen;
+
+ if (splitPos[klimit] >= patlen)
+ {
+ std::cerr << "error: splitPos[" << klimit << "] == " << splitPos[klimit] << " >= patlen, for patlen = " << patlen << ", klimit = " << klimit << std::endl;
+ std::exit(1);
+ }
+
+ if (debug)
+ {
+ std::cerr << "splitpos: ";
+ for (unsigned i= 0;i<=klimit+1; i++)
+ std::cerr << splitPos[i] << ", ";
+ std::cerr << std::endl;
+ }
+ }
+
+
+ ResultSet *checked;
+ unsigned *splitPos;
+ MyersEditDistanceIncremental *ed;
+
+ IndelQuery();
+ // No copy constructor or assignment
+ IndelQuery(IndelQuery const&);
+ IndelQuery& operator = (IndelQuery const&);
+};
+
+} // namespace
+
+#endif // _IndelQuery_H_