Added approximate pattern matching with Suffix Filters
authornvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Mon, 1 Nov 2010 13:58:38 +0000 (13:58 +0000)
committernvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Mon, 1 Nov 2010 13:58:38 +0000 (13:58 +0000)
git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@929 3cdefd35-fc62-479d-8e8d-bae585ffb9ca

12 files changed:
EditDistance.cpp [new file with mode: 0644]
EditDistance.h [new file with mode: 0644]
IndelQuery.h [new file with mode: 0644]
Query.cpp [new file with mode: 0644]
Query.h [new file with mode: 0644]
RLCSAWrapper.h
ResultSet.cpp [new file with mode: 0644]
ResultSet.h [new file with mode: 0644]
TextCollection.h
dependencies.mk
incbwt/dependencies.mk
makefile

diff --git a/EditDistance.cpp b/EditDistance.cpp
new file mode 100644 (file)
index 0000000..23b7de6
--- /dev/null
@@ -0,0 +1,10 @@
+#include "EditDistance.h"
+
+namespace SXSI
+{
+
+signed char MyersEditDistanceIncremental::myers_total[256 * 256];
+signed char MyersEditDistanceIncremental::myers_optimum[256 * 256];
+uchar       MyersEditDistanceIncremental::myers_best_pos[256 * 256];
+
+}
diff --git a/EditDistance.h b/EditDistance.h
new file mode 100644 (file)
index 0000000..ad0e868
--- /dev/null
@@ -0,0 +1,545 @@
+#ifndef _SXSI_EditDistance_h_
+#define _SXSI_EditDistance_h_
+
+/**
+ * TODO
+ * [ ] clean up code
+ */
+
+#include <utility>
+#include <cstring>
+#include "Tools.h"
+
+namespace SXSI
+{
+
+class EditDistance 
+{
+public:
+    uchar const *pat;
+    unsigned  patlen;
+    unsigned maxlen;
+    unsigned **dp;
+    unsigned p;
+
+    bool end()
+    {
+        if (p < maxlen)
+            return false;
+        return true;
+    }
+
+    EditDistance(uchar const *pattern, unsigned len, unsigned maxk) {
+        pat = pattern;
+        patlen = len;
+        maxlen = len + maxk;
+        p = 0;
+        dp = new unsigned*[patlen + 1];
+        for (unsigned i = 0; i <= patlen; i++) {
+            dp[i] = new unsigned[maxlen+1];
+            dp[i][0] = i;
+        }
+    }
+
+    ~EditDistance() {
+        for (unsigned i = 0; i <= patlen; i++)
+            delete [] dp[i];
+        delete [] dp;
+    }
+
+    std::pair<unsigned, unsigned> prefixDist(uchar const *cand, unsigned candlen) {
+        unsigned mindist = patlen+candlen;
+        unsigned minlen = patlen+candlen;
+        unsigned *col = new unsigned[patlen+1];
+        unsigned *ncol = new unsigned[patlen+1];
+        for (unsigned i = 0; i <= patlen; i++)
+            col[i] = i;
+        for (unsigned j = 1; j <= candlen; j++) {
+            ncol[0] = j;
+            for (unsigned i = 1; i <= patlen; i++) {
+                unsigned match = (cand[j-1] == pat[i-1])? col[i - 1] : (col[i - 1] + 1);
+                ncol[i] = std::min(col[i] + 1, ncol[i - 1] + 1);
+                ncol[i] = std::min(ncol[i],match);
+            }
+            if (ncol[patlen]<mindist)
+            {
+                mindist = ncol[patlen];
+                minlen = j;
+            }
+            for (unsigned i = 0; i <= patlen; i++)
+                col[i] = ncol[i];
+        }
+        delete [] col;
+        delete [] ncol;
+        return std::make_pair(mindist,minlen);
+    }
+
+
+    unsigned pushChar(uchar c) {
+        unsigned mindist = patlen;
+        p++;
+        if (p>maxlen)
+            std::cout << "out of bounds in DP computation, p = " << p << ", max = " << maxlen << std::endl;
+        dp[0][p] = p;
+        for (unsigned i = 1; i <= patlen; i++) {
+            unsigned match = (c == pat[i-1])? dp[i - 1][p-1] : (dp[i - 1][p-1] + 1);
+            dp[i][p] = std::min(dp[i][p-1] + 1, dp[i - 1][p] + 1);
+            dp[i][p] = std::min(dp[i][p],match);
+            if (dp[i][p]<mindist)
+                mindist = dp[i][p];
+        }
+        return mindist;
+    }
+
+    void popChar() {
+        --p;
+        return;
+    }
+
+    unsigned dist() {
+        return dp[patlen][p];
+    }
+
+    unsigned getSuffixLength()
+    {
+        unsigned mindist = dp[patlen][p];
+        unsigned minlen = patlen;
+        for (unsigned i = 0; i < patlen; ++i)
+            if (dp[i][p] < mindist)
+            {
+                mindist = dp[i][p];
+                minlen = i;
+            }
+        return minlen;
+    }
+
+};
+
+
+class MyersEditDistanceIncremental
+{
+  public:
+
+    MyersEditDistanceIncremental(const uchar *pattern, unsigned _length, unsigned _max_errors) :
+      length(_length), max_errors(_max_errors), min_row(~0u)
+    {
+      this->max_length = this->length + this->max_errors;
+      this->blocks = (this->length + W - 1) / W;
+      this->last_block_length = this->length % W;
+      if(this->last_block_length == 0) { this->last_block_length = W; }
+      this->last_block_mask = ~0lu >> (this->blocks * W - this->length);
+
+      this->initializeCharVectors(pattern);
+      this->initializePlusMinusVectors();
+    }
+
+    ~MyersEditDistanceIncremental()
+    {
+      for(unsigned c = 0; c < 256; c++)
+      {
+        delete[] this->char_vectors[c]; this->char_vectors[c] = 0;
+      }
+      delete[] this->char_vectors; this->char_vectors = 0;
+
+      for(unsigned i = 0; i <= this->max_length; i++)
+      {
+        delete[] this->plus_vectors[i]; this->plus_vectors[i] = 0;
+        delete[] this->minus_vectors[i]; this->minus_vectors[i] = 0;
+      }
+      delete[] this->plus_vectors; this->plus_vectors = 0;
+      delete[] this->minus_vectors; this->minus_vectors = 0;
+
+      delete[] this->score; this->score = 0;
+    }
+
+    unsigned pushChar(unsigned c)
+    {
+      if(c >= 256)
+      {
+        std::cerr << "MyersEditDistance: Invalid character value " << c << "!" << std::endl;
+        return ~0u;
+      }
+      if(this->pos >= this->max_length)
+      {
+        std::cerr << "MyersEditDistance: Cannot pushChar(c) past limit!" << std::endl;
+        std::cerr << "MyersEditDistance: Current text position " << this-> pos << ", maximum " << this->max_length << std::endl;
+        return ~0u;
+      }
+
+      this->pos++;
+      ulong c_vec, diagonal, h_plus = 0, h_minus = 0, plus, minus, temp;
+      ulong d_overflow = 0, hp_overflow = 1, hm_overflow = 0;
+
+      /*
+        Update rules from http://www.cs.uta.fi/~helmu/pubs/phd.pdf (Figure 25).
+      */
+      for(unsigned i = 0; i < this->blocks; i++)
+      {
+        c_vec = this->char_vectors[c][i];
+        plus = this->plus_vectors[this->pos - 1][i];
+        minus = this->minus_vectors[this->pos - 1][i];
+
+        diagonal = c_vec & plus;
+        temp = diagonal + plus + d_overflow;
+        d_overflow = ((temp < diagonal) || (temp == diagonal && d_overflow == 1) ? 1 : 0);
+        diagonal = (temp ^ plus) | c_vec | minus;
+
+        h_plus = minus | ~(diagonal | plus);
+        h_minus = diagonal & plus;
+
+        this->plus_vectors[this->pos][i] =
+          (h_minus << 1) | hm_overflow | ~(diagonal | (h_plus << 1) | hp_overflow);
+        this->minus_vectors[this->pos][i] = diagonal & ((h_plus << 1) | hp_overflow);
+
+        hp_overflow = (h_plus & HIGH_BIT ? 1 : 0);
+        hm_overflow = (h_minus & HIGH_BIT ? 1 : 0);
+      }
+      this->plus_vectors[this->pos][this->blocks - 1] &= this->last_block_mask;
+      this->minus_vectors[this->pos][this->blocks - 1] &= this->last_block_mask;
+
+      this->score[this->pos] = this->score[this->pos - 1];
+      if(h_plus & (1lu << (this->length % W - 1)))
+      {
+        this->score[this->pos]++;
+      }
+      else if(h_minus & (1lu << (this->length % W - 1)))
+      {
+        this->score[this->pos]--;
+      }
+
+      // Find the minimal value in the current column.
+      int minimum = this->score[this->pos], cur = minimum;
+      min_row = this->length;
+      for(unsigned i = this->blocks; i > 0;)
+      {
+        i--;
+        plus = this->plus_vectors[this->pos][i];
+        minus = this->minus_vectors[this->pos][i];
+
+        for(unsigned j = W; j > 0;)
+        {
+          j -= CHAR_BIT;
+          unsigned a = (plus >> j) & 0xFF, b = (minus >> j) & 0xFF;
+          if (minimum > cur + myers_optimum[256 * a + b])
+          {
+              minimum = cur + myers_optimum[256 * a + b];
+              min_row = i*W + j + myers_best_pos[256 * a + b];
+          }
+          cur += myers_total[256 * a + b];
+        }
+      }
+
+      return minimum;
+    }
+
+    unsigned dist() {
+        return this->score[this->pos];
+    }
+    
+    void popChar()
+    {
+      if(this->pos == 0)
+      {
+        std::cerr << "MyersEditDistance: Cannot popChar() from text position 0!" << std::endl;
+        return;
+      }
+
+      this->pos--;
+    }
+
+    inline unsigned getSuffixLength()
+    {
+        return min_row;
+    }
+
+/*
+    // FIXME implement
+    std::pair<unsigned, unsigned> prefixDist(uchar const *text, unsigned n)
+*/
+
+    bool end()
+    {
+        if (pos < max_length)
+            return false;
+        return true;
+    }
+
+static void initMyersFourRussians()
+{
+//  myers_total = new signed char[256 * 256];
+//  myers_optimum = new signed char[256 * 256];
+//   myers_best_pos = new uchar[256 * 256];
+
+  unsigned bp = 8;
+  for(unsigned i = 0; i < 256; i++)
+  {
+    for(unsigned j = 0; j < 256; j++)
+    {
+      int cur = 0, best = 0;
+      for(unsigned k = CHAR_BIT; k > 0;)
+      {
+        k--;
+        if(i & (1 << k)) { cur--; }
+        if(j & (1 << k)) { cur++; }
+        if(cur < best)
+        {
+          best = cur;
+          bp = k;
+        }
+      }
+
+      myers_total[256 * i + j] = cur;
+      myers_optimum[256 * i + j] = best;
+      myers_best_pos[256 * i + j] = bp;
+    }
+  }
+}
+
+    unsigned length;
+private:
+    static signed char myers_total[];
+    static signed char myers_optimum[];
+    static uchar       myers_best_pos[];
+
+
+    const static ulong HIGH_BIT = 1lu << (W - 1);
+
+    ulong **char_vectors;
+    unsigned max_errors;
+    unsigned max_length;
+    unsigned min_row;
+
+    unsigned blocks;            // Number of blocks required for the pattern
+    unsigned last_block_length; // Number of bits in the final block
+    ulong last_block_mask;
+
+    ulong **plus_vectors;
+    ulong **minus_vectors;
+    unsigned* score;
+
+    unsigned pos;               // We have matched pos characters of text
+
+
+    void initializeCharVectors(uchar const *pattern)
+    {
+      this->char_vectors = new ulong*[256];
+      for(unsigned c = 0; c < 256; c++)
+      {
+        this->char_vectors[c] = new ulong[this->blocks];
+        std::memset(this->char_vectors[c], 0, this->blocks * sizeof(ulong));
+      }
+
+      for(unsigned i = 0; i < this->length; i++)
+      {
+        this->char_vectors[pattern[i]][i / W] |= (1lu << (i % W));
+      }
+    }
+
+    void initializePlusMinusVectors()
+    {
+      this->plus_vectors = new ulong*[this->max_length + 1];
+      this->minus_vectors = new ulong*[this->max_length + 1];
+      this->score = new unsigned[this->max_length + 1];
+
+      for(unsigned i = 0; i <= this->max_length; i++)
+      {
+        this->plus_vectors[i] = new ulong[this->blocks];
+        this->minus_vectors[i] = new ulong[this->blocks];
+      }
+
+      this->initialStep();
+    }
+
+    void initialStep()
+    {
+      for(unsigned i = 0; i < this->blocks; i++)
+      {
+        this->plus_vectors[0][i] = ~0lu;
+        this->minus_vectors[0][i] = 0;
+      }
+      this->plus_vectors[0][this->blocks - 1] = this->last_block_mask;
+
+      this->score[0] = this->length;
+      this->pos = 0;
+    }
+
+
+
+};
+
+
+
+
+    class MyersEditDistance
+    {
+    public:
+        ulong **Peq;
+        bool S[256];
+        unsigned k;
+        unsigned m;
+        unsigned blockCount;
+        unsigned block;
+        unsigned lastBlockLength;
+        ulong *Pv;
+        ulong *Mv;
+        ulong *Score;
+        ulong twopowmpos;
+        ulong twopowwpos;
+        void init(uchar const *P)
+        {
+            ulong i;
+            ulong j;
+            ulong l;
+            for(i = 0;i< 256;i++) {
+                S[i] = false;
+                for (l=0;l <= ((m-1) / W); l++)
+                    Peq[l][i] = 0;
+            }
+            for (l=0;l <= ((m-1) / W); l++)
+                for (j = 1; j <= W; j++) {
+                    if (W*l+j > m)
+                        break;
+                    Peq[l][P[W*l+j-1]] = Peq[l][P[W*l+j-1]] | (1lu << (j-1));
+                    if (j+W*l <= k+1)
+                        S[(unsigned)P[W*l+j-1]] = true;
+                }
+            for (l=0; l < blockCount; l++) {
+                Mv[l] = 0lu;
+                Score[l] = (l+1lu)*W;
+                Pv[l] = ~0lu;
+            }
+            Mv[blockCount] = 0;
+            Score[blockCount] = m;
+            Pv[blockCount] = (~0lu) >> ((blockCount+1lu)*W - m);
+        }
+
+        MyersEditDistance(unsigned _m, unsigned maxk) 
+            : Peq(0), k(maxk), m(_m)
+        {
+            ulong l;
+            Peq = new ulong *[(m-1) / W + 1]; // (unsigned **)malloc((((m-1) / w)+1)*sizeof(unsigned*));
+            for (l=0;l <= ((m-1) / W); l++)
+                Peq[l] = new ulong[256]; //(unsigned *)malloc(256*sizeof(unsigned));
+
+            blockCount= ((m-1) / W);
+            block = (k-1) / W;
+            lastBlockLength = m % W;
+            if (lastBlockLength == 0)
+                lastBlockLength = W;
+            Pv = new ulong[blockCount+1];  // (unsigned *)malloc((blockCount+1)*sizeof(unsigned));
+            Mv = new ulong[blockCount+1];  // (unsigned *)malloc((blockCount+1)*sizeof(unsigned));
+            Score = new ulong[blockCount+1];//(unsigned *)malloc((blockCount+1)*sizeof(unsigned));
+            twopowmpos = 1lu << (lastBlockLength-1);
+            twopowwpos = 1lu << (W-1);
+        }
+
+        ~MyersEditDistance()
+        {
+            delete [] Pv; Pv = 0;
+            delete [] Mv; Mv = 0;
+            delete [] Score; Score = 0;
+            unsigned l;
+            for (l=0;l <= ((m-1) / W); l++)
+            {
+                delete [] Peq[l];
+                Peq[l] = 0;
+            }
+            delete [] Peq; Peq = 0;
+        }
+
+        std::pair<unsigned, unsigned> prefixDist(uchar const *text, unsigned n)
+        {
+            unsigned mindist = m+n;
+            unsigned minlen = m+n;
+            //            int count=0;
+            //int blockAverage=0;
+            ulong Eq, Xv, Xh, Ph, Mh;
+            ulong Phendbit;
+            ulong Mhendbit;
+            ulong temp;
+            ulong j;
+            ulong l;
+            ulong overk=1;
+            block = (k-1) / W;
+            for(j = 1; j <= n; j++) {
+                Phendbit = 1lu; //0;
+                Mhendbit = 0lu;
+                for (l=0; l <= blockCount; l++) {
+                    Eq = Peq[l][(unsigned char)text[j-1]];
+                    Xv = Eq | Mv[l];
+                    Xh = ((((Eq | Mhendbit)& Pv[l]) + Pv[l]) ^ Pv[l]) | (Eq | Mhendbit);
+                    Ph = Mv[l] | ~ (Xh | Pv[l]);
+                    Mh = Pv[l] & Xh;
+                    temp = l < blockCount ? twopowwpos : twopowmpos;
+                    if (Ph & temp)
+                        Score[l] += 1;
+                    else if (Mh & temp)
+                        Score[l] -= 1;
+                    temp = (Ph >> (W-1));
+                    Ph <<= 1;
+                    Ph |= Phendbit;
+                    Phendbit = temp;
+                    temp = (Mh >> (W-1));
+                    Mh <<= 1;
+                    Mh |= Mhendbit;
+                    Mhendbit = temp;
+                    Pv[l] = (Mh) | ~ (Xv | Ph);
+                    Mv[l] = Ph & Xv;
+                    if (block == l+1 &&
+                        ((block == blockCount &&
+                          (Score[l] > k+lastBlockLength ||
+                           (Score[l] > k && overk != 0 && j - overk >= lastBlockLength))) ||
+                         (block != blockCount &&
+                          (Score[l] > k+W ||
+                           (Score[l] > k && overk != 0 && j - overk >= W)))))  {
+                        // lohkon Score kasvanut niin isoksi, ettei seuraavaa kannata laskea
+                        overk = 0;
+                        block = l;
+                        break;
+                    }
+                    else if (block == l+1 && overk == 0 && Score[l] > k)
+                        // Talletetaan ensimmäinen diagonaali jolla Score > k. Näin tiedetään
+                        // jatkossa koska seuraavan lohkon laskeminen on turhaa.
+                        overk = j;
+                    else if (block == l+1 && Score[l] <= k)
+                        // Diagonaalilla Score <= k ==> Nollataan muuttuja overk.
+                        overk = 0;
+                    else if (block == l && block != blockCount && Score[l] <= k) {
+                        // Score <= k ==> jatketaan seuraavaan lohkoon. Koska seuraavaa lohkoa
+                        // ei ole laskettu edellisellä sarakkeella, pitää sen arvot päivittää.
+                        Score[l+1] = k+1lu;
+                        Pv[l+1] = 0lu;
+                        Mv[l+1] = 0lu;
+                        overk = 0lu;
+                        block = l+1;
+                    }
+                    else if (block == l)
+                        // Seuraavaan lohkoon ei kannata siirtyä, kun Score > k.
+                        break;
+                }
+//                blockAverage += block+1;
+                // if (block == blockCount && Score[blockCount] <= k) {
+                if (block == blockCount && Score[blockCount] <= mindist)
+                {
+                    mindist = Score[blockCount];
+                    minlen = j;
+                
+                    //printf("Min %u at len %u\n", mindist, minlen);
+                //    count++;
+                }
+            }
+//            return count;
+            return std::make_pair(mindist,minlen);
+            /*ShowMessage("Esiintymiä löytyi " + IntToStr(laskuri) + "kpl. " +
+              "Aikaa meni " + IntToStr(msec) + "msec");
+              ShowMessage("Keskimääräinen lohkojen määrä " +
+              FormatFloat("0.00",(float)blockAverage / n));*/
+        }
+
+};
+
+
+} // Namespace
+
+#endif
diff --git a/IndelQuery.h b/IndelQuery.h
new file mode 100644 (file)
index 0000000..169ab81
--- /dev/null
@@ -0,0 +1,244 @@
+/**
+ * k-errors queries
+ * 
+ * Aligment using suffix filters.
+ */
+
+#ifndef _IndelQuery_H_
+#define _IndelQuery_H_
+
+#include "ResultSet.h"
+#include "EditDistance.h"
+#include "Query.h"
+
+#include <string>
+#include <cstdlib> 
+#include <cstring>  
+
+namespace SXSI
+{
+
+class IndelQuery : public Query
+{
+public:
+    IndelQuery(TextCollection const *tc_)
+        : Query(tc_), checked(0), splitPos(0), ed(0)
+    { 
+        checked = new ResultSet(tc->getLength());
+    }
+
+    virtual ~IndelQuery() 
+    {
+        delete checked; checked = 0;
+       delete [] splitPos; splitPos = 0;
+        delete ed; ed = 0;
+    }
+
+protected:
+    void firstStep(void) 
+    {
+        checked->clear();
+        constructPartition();
+        
+       uchar *reversepat = new uchar[patlen+1];
+        for (unsigned i = klimit+1; i > 1; --i) 
+        {
+            if (debug) std::cout << "new try: " << splitPos[i] << ", split = " << i << std::endl;
+
+            for (unsigned j = 0; j < splitPos[i-1]; j++)
+                reversepat[j] = pat[splitPos[i-1]-j-1];
+            reversepat[splitPos[i-1]] = 0;
+            ed = new MyersEditDistanceIncremental(reversepat, splitPos[i-1], i-1);
+
+            exactMatch(i);
+
+            delete ed;
+            ed = 0;
+        }
+       delete [] reversepat;
+       reversepat = 0;
+
+        if (debug) std::cout << "new try: " << splitPos[1] << ", split = 1" << std::endl;
+        ed = 0; // Not used, exact search only for the first block
+        exactMatch(1);
+    }
+
+private:
+
+    void exactMatch(const unsigned split)
+    {
+        unsigned i = splitPos[split];
+        const unsigned border = splitPos[split-1];
+        if (debug) std::cerr << "Exact match: i = " << i << ", border = " << border << std::endl;
+
+        while (i != border  && pushChar(pat[i-1])) 
+            --i;
+        
+        if (debug) std::cerr << "matched upto: i = " << i << std::endl;
+        if (i == 0)
+            checkMatch();
+        else
+            if (i == border)
+                nextStep(split);
+        
+        // Rewind stack
+        match.clear();
+        while (!smin.empty()) smin.pop();
+        while (!smax.empty()) smax.pop();
+        smin.push(0);
+        smax.push(textlen-1);
+    }
+
+    void nextStep(const unsigned split) 
+    {
+        // Maximum edit distance for the current filter is split-1
+        if (ed->dist() <= split-1) checkMatch();
+
+        if (ed->end()) return;
+
+        // ALPHABET has been defined in Query
+        for (const char *cp = ALPHABET; cp < ALPHABET + ALPHABET_SIZE; ++cp) {
+            char c = *cp;
+            if (!pushChar(c)) continue;
+            const unsigned colmin = ed->pushChar(c);
+
+            // Edit distance is for reversed pattern, compute prefix lenght:
+            const unsigned prefixl = ed->length - ed->getSuffixLength();
+            // How many errors are we allowing on a prefix of given length?
+            unsigned i = 1;
+            while (splitPos[i] < prefixl)
+                ++i;
+            
+            assert(i < klimit+1);
+            assert(split-1 > i-1);
+            const unsigned maxk = (split-1) - (i-1);
+
+            if (debug)
+                std::cerr << "c = " << c << ", split = " << split << ", maxk = " << maxk << ", colmin = " << colmin 
+                          << ", prefixl = " << prefixl << ", dist = " << ed->dist() << std::endl;
+           
+            if (colmin <= maxk) nextStep(split);
+
+            //if (debug) std::cerr << "pop c = " << c << std::endl;
+            ed->popChar();
+            popChar();
+        }
+    }
+
+    inline void checkMatch(void) 
+    {
+//        if (debug) std::cerr << "min,max: " << smin.top() << ", " << smax.top() << std::endl;
+        ulong min = smin.top();
+        ulong max = smax.top();
+        MyersEditDistance med(patlen, klimit);
+        for (ulong i = min; i <= max; i++) {
+            if (!checked->get(i))
+            {
+                checked->set(i);
+                med.init((uchar const *)pat);
+                validateCandidate(med, i);
+            }
+        }
+    }
+
+    void validateCandidate(MyersEditDistance &med, ulong sapos) 
+    {
+        uchar *candidate = tc->getSuffix(sapos, patlen+klimit);
+        unsigned l = strlen((char*)candidate);
+        if (l < patlen)
+        {
+            delete [] candidate;
+            return;
+        }
+
+        std::pair<unsigned, unsigned> dist(0, patlen);
+        if (klimit == 0)
+        {  // MyersEditDistance class does not support klimit == 0
+            unsigned i = 0;
+            while (i < patlen && pat[i] == candidate[i])
+                ++i;
+            if (i < patlen)
+            {
+                delete [] candidate;
+                return; // There is at least one mismatch (and klimit == 0)
+            }
+        }
+        else // Using prefixDist() when klimit > 0
+            dist = med.prefixDist(candidate, l);
+
+        if (dist.first <= klimit)
+            newMatch(sapos);
+        delete [] candidate;
+    }
+
+
+    inline void newMatch(ulong sapos) 
+    {
+        TextCollection::DocId di = tc->getDoc(sapos);
+        result.insert(di);
+    }
+
+
+    void constructPartition(void) 
+    {
+       delete [] splitPos;
+        splitPos = new unsigned[klimit+2];
+
+       if (klimit == 0)
+        {
+            splitPos[0] = 0; 
+            splitPos[1]=patlen; 
+            return;
+        }
+        
+       // First piece size
+        unsigned psize = patlen/(klimit+2);
+       if (psize == 0)
+        {
+           std::cerr << "error: First psize == 0, for patlen = " << patlen << ", klimit = " << klimit << std::endl;
+           std::exit(1);
+        }
+        splitPos[0] = 0;
+       splitPos[1] = 2*psize;
+
+       // Last klimit are of size:
+       psize = (patlen-splitPos[1])/klimit;
+       if (psize == 0)
+        {
+           std::cerr << "error: psize == 0, for patlen = " << patlen << ", klimit = " << klimit << std::endl;
+           std::exit(1);
+        }
+
+        for (unsigned i = 2; i <= klimit; i++)
+            splitPos[i]=splitPos[i-1]+psize;
+       splitPos[klimit+1]=patlen; 
+
+       if (splitPos[klimit] >= patlen)
+        {
+           std::cerr << "error: splitPos[" << klimit << "] == " << splitPos[klimit] << " >= patlen, for patlen = " << patlen << ", klimit = " << klimit << std::endl;
+           std::exit(1);
+        }
+
+        if (debug)
+        { 
+            std::cerr << "splitpos: ";
+            for (unsigned i= 0;i<=klimit+1; i++)
+                std::cerr << splitPos[i] << ", ";
+            std::cerr << std::endl;
+        }
+    }
+
+    
+    ResultSet *checked;
+    unsigned *splitPos;
+    MyersEditDistanceIncremental *ed;
+
+    IndelQuery();
+    // No copy constructor or assignment
+    IndelQuery(IndelQuery const&);
+    IndelQuery& operator = (IndelQuery const&);
+};
+
+} // namespace
+
+#endif // _IndelQuery_H_
diff --git a/Query.cpp b/Query.cpp
new file mode 100644 (file)
index 0000000..29b0bb1
--- /dev/null
+++ b/Query.cpp
@@ -0,0 +1,83 @@
+#include "Query.h"
+#include <cstring>
+
+namespace SXSI
+{
+
+const char Query::ALPHABET_DNA[] = {'A', 'C', 'G', 'T', 'N'};
+
+
+Query::Query(TextCollection const *tc_)
+    : pat(0), patlen(0), tc(tc_), debug(false), klimit(0), 
+      textlen(tc->getLength()), ALPHABET(ALPHABET_DNA)
+{ }
+
+
+/*void reverse()
+{
+    uchar c;
+    for (int i = 0; i < patlen / 2; ++i) {
+        c = pat[i];
+        pat[i] = pat[patlen - i - 1];
+        pat[patlen - i - 1] = c;
+    }
+}
+
+void revcmp()
+{
+    reverse();
+    for (int i = 0; i < patlen; ++i)
+        switch (pat[i]) 
+        {
+        case('A'): pat[i] = 'T'; break;
+        case('C'): pat[i] = 'G'; break;
+        case('G'): pat[i] = 'C'; break;
+        case('T'): pat[i] = 'A'; break;                
+        case('a'): pat[i] = 't'; break;                
+        case('c'): pat[i] = 'g'; break;
+        case('g'): pat[i] = 'c'; break;                
+        case('a'): pat[i] = 't'; break;                
+        case('N'): pat[i] = 'N'; break;                
+        case('n'): pat[i] = 'n'; break;                
+        default:
+            std::cerr << "Query::align(): invalid alphabet ("<< pat[i] << ") given!" << std::endl;
+            std::exit(1);
+            break;
+        }
+        }*/
+
+TextCollection::document_result Query::align(uchar const *p, unsigned k)
+{
+    result.clear();
+    this->pat = p;
+    this->patlen = std::strlen((char const *)p);
+    this->klimit = k;
+
+    while (!smin.empty()) smin.pop();
+    while (!smax.empty()) smax.pop();
+    smin.push(0);
+    smax.push(textlen-1);
+    match.clear();
+        
+    firstStep();
+
+    // Search reverse complemented?
+/*    {
+        revcmp();
+        this->pat = p->c_str();
+        while (!smin.empty()) smin.pop();
+        while (!smax.empty()) smax.pop();
+        smin.push(0);
+        smax.push(textlen-1);
+        match.clear();
+        
+        firstStep();
+
+        revcmp(); // Restore orig string
+        }*/
+
+    TextCollection::document_result dr(result.begin(), result.end());
+    return dr;
+}
+
+} // namespace
diff --git a/Query.h b/Query.h
new file mode 100644 (file)
index 0000000..bf77802
--- /dev/null
+++ b/Query.h
@@ -0,0 +1,86 @@
+/**
+ * Interface for alignment queries
+ */
+
+#ifndef _Query_H_
+#define _Query_H_
+
+#include "TextCollection.h"
+
+#include <stack>
+#include <vector>
+#include <set>
+
+namespace SXSI
+{
+
+/**
+ * Base class for queries
+ */
+class Query
+{
+public:
+    TextCollection::document_result align(uchar const *p, unsigned k);
+
+    void setDebug(bool b)
+    { this->debug = b; }
+
+    virtual ~Query() 
+    {  }
+
+protected:
+    virtual void firstStep() = 0;
+
+    inline bool pushChar(char c) {
+        ulong nmin = tc->LF(c, smin.top()-1);
+        ulong nmax = tc->LF(c, smax.top())-1;
+        if (nmin > nmax) return false;
+        smin.push(nmin);
+        smax.push(nmax);
+        match.push_back(c);
+        return true;
+    }
+
+    inline void popChar() {
+        smin.pop();
+        smax.pop();
+        match.pop_back();
+    }
+
+    inline int min(int a, int b) {
+        if (a < b) return a; else return b;
+    }
+
+    inline int min(int a, int b, int c) {
+        if (a < b) if (a < c) return a; else return c;
+        else if (b < c) return b; else return c;
+    }
+
+    Query(TextCollection const *tc_);
+
+    uchar const *pat;
+    unsigned patlen;
+    TextCollection const *tc;
+    bool debug;
+    unsigned klimit;
+    ulong textlen;
+    
+    const char *ALPHABET;
+    static const unsigned ALPHABET_SIZE = 5;
+    static const char ALPHABET_DNA[];
+
+    std::stack<ulong> smin;
+    std::stack<ulong> smax;
+    std::vector<uchar> match;
+    std::set<TextCollection::DocId> result;
+private:
+    Query();
+    // No copy constructor or assignment
+    Query(Query const&);
+    Query& operator = (Query const&);
+};
+
+} // Namespace
+
+#endif // _Query_H_
index 4e22fe8..8fd1d68 100644 (file)
@@ -23,6 +23,7 @@
 #define _RLCSAWrapper_H_
 
 #include "TextCollection.h"
+#include "IndelQuery.h"
 
 #include "incbwt/rlcsa.h"
 
@@ -34,6 +35,7 @@
 #   define W 32
 #endif
 
+#include <stdexcept>
 #include <set>
 #include <string>
 
@@ -51,7 +53,10 @@ class RLCSAWrapper : public SXSI::TextCollection
 public:
     RLCSAWrapper(const CSA::RLCSA* index)
         : rlcsa(index)
-    { /* NOP */ }
+    {
+        // Init the edit distance look-up tables
+        MyersEditDistanceIncremental::initMyersFourRussians();
+    }
 
     ~RLCSAWrapper()
     {
@@ -131,9 +136,43 @@ public:
     document_result Prefix(uchar const *) const { unsupported(); return document_result(); };
     document_result Suffix(uchar const *) const { unsupported(); return document_result(); };
     document_result Equal(uchar const *) const { unsupported(); return document_result(); };
+
     document_result Contains(uchar const *pattern) const
-    {
-        //std::pair<TextPosition,TextPosition> p = backwards(pattern);
+    {        
+        /*************************************************************************
+         * Approximate pattern matching
+         * 
+         * Using suffix filters. Has to enumerate *all* approx. occurrences (sloooow...)
+         * instead of just returning the best occurrence (which is usually much faster).
+         *
+         * Query format: contains("APM 3 GATTACA")
+         * where
+         *          "APM" is the keyword for approximate queries.
+         *            "3" is the maximum edit distance allowed.
+         *      "GATTACA" is the query word to be aligned.
+         */
+        if (strncmp((char const *)pattern, "APM ", 4) == 0)
+        {
+            // Edit distance allowed.
+            int k = std::atoi((char const *)pattern + 4);
+            if (k < 0 || k == INT_MAX || k == INT_MIN)
+                goto exact_pattern_matching; // Invalid format
+
+            // Find the start of the pattern (i.e. the second ' ')
+            uchar const * tmp = pattern + 4;
+            while (*tmp != ' ' && *tmp != 0) ++tmp;
+            if (*tmp != ' ' || tmp == pattern + 4)
+                goto exact_pattern_matching; // Invalid format
+
+            IndelQuery iq(this);
+            //std::cerr << "Pattern: " << tmp+1 << ", k = " << k << std::endl;
+            return iq.align(tmp+1, k);
+        }
+
+        /*************************************************************************
+         * Exact pattern matching
+         */
+    exact_pattern_matching:
         CSA::pair_type p = rlcsa->count(std::string((char const *)pattern));
         if (p.first > p.second)
             return document_result();
@@ -175,6 +214,45 @@ public:
         this->rlcsa->writeTo(std::string(filename));
     }
 
+    TextCollection::TextPosition getLength() const
+    {
+        return this->rlcsa->getSize() + this->rlcsa->getNumberOfSequences();
+    }
+    
+    inline TextCollection::TextPosition LF(uchar c, TextPosition i) const
+    {
+        ++i;
+        if(i < this->rlcsa->getNumberOfSequences()) 
+            return rlcsa->C(c);
+        return rlcsa->LF(i - this->rlcsa->getNumberOfSequences(), c);
+    }
+
+    uchar* getSuffix(TextPosition pos, unsigned l) const
+    {
+        ++pos;
+        uchar* text = new uchar[l + 1];
+        
+        if(l == 0 || pos < this->rlcsa->getNumberOfSequences())
+        {
+            text[0] = 0;
+            return text;
+        }
+        pos -= this->rlcsa->getNumberOfSequences();
+        
+        unsigned n = rlcsa->displayFromPosition(pos, l, text);
+        text[n] = 0;
+        return text;
+    }
+    
+    DocId getDoc(TextPosition i) const
+    {
+        if(i < this->rlcsa->getNumberOfSequences())
+            return i;
+        
+        CSA::pair_type pt = rlcsa->getRelativePosition(this->rlcsa->locate(i - this->rlcsa->getNumberOfSequences()));
+        return pt.first;
+    }
+
 private:
     const CSA::RLCSA* rlcsa;
 
@@ -185,32 +263,20 @@ private:
 
         std::cerr << "\nFirst symb = " << (char)c << std::endl;
 
-        TextPosition sp = rlcsa->C(c);
-        TextPosition ep = rlcsa->C(c+1)-1;
+        TextPosition sp = LF(c, -1);
+        TextPosition ep = LF(c, getLength()-1) - 1;
         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
         while (sp<=ep && i>=1) 
         {
             c = (int)pattern[--i];
-            sp = LF(c, sp);
-            ep = LF(c, ep+1)-1;
+            sp = LF(c, sp-1);
+            ep = LF(c, ep)-1;
             printf("new: c = %c, sp = %lu, ep = %lu\n", pattern[i], sp, ep);
         }
 
-        uchar* found = rlcsa->display(sp, std::strlen((char const *)pattern), 5, i);
-        std::cerr << "found: " << found << " (" << i << std::endl;
-
         return std::make_pair(sp, ep);
         }*/
-
-    inline TextCollection::TextPosition
-        LF(uchar c, TextPosition i) const
-    {
-        if(i == (TextPosition)-1 || i < this->rlcsa->getNumberOfSequences()) 
-        { return this->rlcsa->C(c); }
-        return this->rlcsa->LF(i - this->rlcsa->getNumberOfSequences(), c);
-    }
-
-    
+   
     void unsupported() const
     { 
         std::cerr << std::endl << "-------------------------------------------------------------\n"
diff --git a/ResultSet.cpp b/ResultSet.cpp
new file mode 100644 (file)
index 0000000..208a9d5
--- /dev/null
@@ -0,0 +1,227 @@
+\r
+#include <iostream> \r
+#include "ResultSet.h"\r
+\r
+namespace SXSI\r
+{\r
+\r
+#define W (8*sizeof(ulong))\r
+\r
+#if __WORDSIZE == 64\r
+#define logW 6lu // 64bit system, code checks = lg(W)-1\r
+#else\r
+#define logW 5lu // 32bit system, code checks = lg(W)-1\r
+#endif\r
+\r
+#define divW(p) ((p)>>logW)\r
+#define modW(p) ((p)&(W-1))\r
+\r
+#define setBit(arr,pos) ((arr)[divW(pos)] |= 1lu<<modW(pos))\r
+#define clearBit(arr,pos) ((arr)[divW(pos)] &= ~(1lu<<modW(pos)))\r
+#define getBit(arr,pos) ((arr)[divW(pos)] & (1lu<<modW(pos))) // 0 or !=0\r
+\r
+\r
+ulong ResultSet::lg (ulong n)\r
+{ \r
+    ulong answ=0;\r
+    while (n) { n>>=1lu; answ++; }\r
+    return answ;\r
+}\r
+\r
+ResultSet::ResultSet(ulong n)\r
+{ \r
+    if (logW != lg(W)-1)\r
+    { \r
+        std::cerr << "Error, redefine logW as " << lg(W)-1 << " and recompile\n";\r
+//        exit(1);\r
+    }\r
+    this->n = 2*n-1;\r
+    this->lgn = lg(n);\r
+    this->tree = new ulong[(this->n+W-1)/W];\r
+    if (!this->tree)\r
+    {\r
+        std::cerr << "Malloc failed at ResultSet class." << std::endl;\r
+        //      exit(1);\r
+    }\r
+    clearBit(this->tree,0); // clear all\r
+}\r
+\r
+ResultSet::~ResultSet()\r
+{ \r
+    delete [] tree;\r
+}\r
+\r
+// to map 1..n to the bottom of the tree when n is not a power of 2\r
+ulong ResultSet::conv (ulong p, ulong n, ulong lgn)\r
+\r
+  { ulong t = n+1-(1lu<<lgn);\r
+    if (p < t) return p;\r
+    return (p<<1lu)-t;\r
+  }\r
+\r
+ulong ResultSet::unconv (ulong p, ulong n, ulong lgn)\r
+\r
+  { ulong t = n+1-(1lu<<lgn);\r
+    if (p < t) return p;\r
+    return (p+t)>>1lu;\r
+  }\r
+\r
+bool ResultSet::get (ulong p) // returns 0 or 1\r
+\r
+  { \r
+    ulong pos = 0;\r
+    ulong pot = this->lgn;\r
+    p = conv(p,n,pot);\r
+    do { if (!getBit(this->tree,pos)) return false;\r
+         pot--;\r
+         pos = (pos<<1lu)+1+(p>>pot);\r
+         p &= ~(1lu<<pot);\r
+       }\r
+    while (pos < n);\r
+    return true;\r
+  }\r
+\r
+void ResultSet::set (ulong p)\r
+\r
+  { \r
+    ulong pos = 0;\r
+    ulong npos;\r
+    ulong pot = this->lgn;\r
+    p = conv(p,n,pot);\r
+    do { npos = (pos<<1lu)+1;\r
+        if (!getBit(this->tree,pos))\r
+           { setBit(this->tree,pos);\r
+             if (npos < n) \r
+                { clearBit(this->tree,npos);\r
+                  clearBit(this->tree,npos+1);\r
+                }\r
+           }\r
+         pot--;\r
+         pos = npos+(p>>pot);\r
+         p &= ~(1lu<<pot);\r
+       }\r
+    while (pos < n);\r
+  }\r
+\r
+       // returns final value of bit at pos\r
+       \r
+ulong ResultSet::clearRangeLeft (ulong p1, ulong n, ulong pos, ulong pot)\r
+ { ulong npos;\r
+    ulong bit;\r
+    if (!getBit(tree,pos)) return 0; // range already zeroed\r
+    p1 &= ~(1lu<<pot);\r
+    if (p1 == 0) // full range to clear\r
+       { clearBit(tree,pos); return 0; }\r
+       // p1 != 0, there must be children\r
+    pot--;\r
+    npos = (pos<<1lu)+1;\r
+    if ((p1>>pot) == 0) // go left, clear right\r
+       { clearBit(tree,npos+1);\r
+        bit = clearRangeLeft(p1,n,npos,pot);\r
+       } \r
+    else // go right\r
+       { bit = clearRangeLeft(p1,n,npos+1,pot);\r
+        if (!bit) bit = getBit(tree,npos);\r
+       }\r
+    if (!bit) clearBit(tree,pos);\r
+    return bit;\r
+  }\r
+\r
+ulong ResultSet::clearRangeRight (ulong p2, ulong n, ulong pos, ulong pot)\r
+\r
+  { ulong npos;\r
+    ulong bit;\r
+    if (!getBit(tree,pos)) return 0; // range already zeroed\r
+    p2 &= ~(1lu<<pot);\r
+    if (p2 == 0) return 1; // empty range to clear, and bit is 1 for sure\r
+       // p2 != 0, there must be children\r
+    pot--;\r
+    npos = (pos<<1lu)+1;\r
+    if ((p2>>pot) == 1) // go right, clear left\r
+       { clearBit(tree,npos);\r
+        bit = clearRangeRight(p2,n,npos+1,pot);\r
+       } \r
+    else // go left\r
+       { bit = clearRangeRight(p2,n,npos,pot);\r
+        if (!bit) bit = getBit(tree,npos+1);\r
+       }\r
+    if (!bit) clearBit(tree,pos);\r
+    return bit;\r
+  }\r
+\r
+ulong ResultSet::clearBoth (ulong n, ulong p1, ulong p2, ulong pos, ulong pot)\r
+\r
+  { ulong npos,npos1,npos2;\r
+    ulong bit;\r
+    if (!getBit(tree,pos)) return 0; // range is already zeroed\r
+    npos = (pos<<1lu)+1;\r
+       // children must exist while the path is unique, as p1<p2\r
+    pot--;\r
+    npos1 = npos+(p1>>pot);\r
+    npos2 = npos+(p2>>pot);\r
+    if (npos1 == npos2) // we're inside npos1=npos2\r
+       { bit = clearBoth (n,p1&~(1lu<<pot),p2&~(1lu<<pot),npos1,pot);\r
+        bit |= getBit(tree,npos+1-(p1>>pot)); // the other\r
+       }\r
+     else  // p1 and p2 take different paths here\r
+       { bit  = clearRangeLeft(p1,n,npos1,pot); \r
+          bit |= clearRangeRight(p2,n,npos2,pot);\r
+       }\r
+     if (!bit) clearBit(tree,pos);\r
+     return bit;\r
+  }\r
+\r
+void ResultSet::clearRange (ulong p1, ulong p2)\r
+\r
+  { if ((p2+1)<<1lu > this->n) \r
+         clearRangeLeft(conv(p1,this->n,this->lgn),this->n,0,this->lgn);\r
+    else clearBoth(n,conv(p1,n,this->lgn),conv(p2+1,n,lgn),0,lgn);\r
+  }\r
+\r
+ulong ResultSet::nextSmallest (ulong n, ulong pos, ulong pot)\r
+\r
+  { ulong p = 0;\r
+    while (1)\r
+       { pot--;\r
+        pos = (pos<<1lu)+1;\r
+        if (pos >= n) return p;\r
+        if (!getBit(tree,pos)) { pos++; p |= (1lu<<pot); }\r
+       }\r
+  }\r
+\r
+ulong ResultSet::nextLarger (ulong n, ulong p, ulong pos, ulong pot)\r
+\r
+  { ulong answ;\r
+    if (!getBit(tree,pos)) return ~0lu; // no answer\r
+    pot--;\r
+    pos = (pos<<1lu)+1;\r
+    if (pos >= n) return 0; // when n is not a power of 2, missing leaves\r
+    if ((p>>pot) == 0) // p goes left\r
+       { answ = nextLarger(n,p&~(1lu<<pot),pos,pot);\r
+        if (answ != ~0lu) return answ;\r
+         if (!getBit(tree,pos+1)) return ~0lu; // no answer\r
+        return (1lu<<pot) | nextSmallest(n,pos+1,pot);\r
+       }\r
+    else \r
+       { answ = nextLarger(n,p&~(1lu<<pot),pos+1,pot);\r
+         if (answ != ~0lu) return (1lu<<pot) | answ;\r
+         return ~0lu;\r
+       }\r
+  }\r
+\r
+\r
+void  ResultSet::clear()\r
+{\r
+    clearBit(this->tree,0);\r
+}\r
+\r
+ulong ResultSet::nextResult (ulong p) // returns pos of next(p) or -1 if none\r
+\r
+  { ulong answ;\r
+    if (((p+1)<<1lu) > this->n) return ~0lu; // next(last), p+1 out of bounds\r
+    answ = nextLarger(this->n,conv(p+1,this->n,this->lgn),0,this->lgn);\r
+    if (answ == ~0lu) return ~0lu;\r
+    return unconv(answ,this->n,this->lgn);\r
+  }\r
+\r
+} // Namespace\r
diff --git a/ResultSet.h b/ResultSet.h
new file mode 100644 (file)
index 0000000..ba357af
--- /dev/null
@@ -0,0 +1,53 @@
+// Dynamic result set class
+// Originally by Gonzalo Navarro
+
+#ifndef _RESULTSET_H_
+#define _RESULTSET_H_
+
+namespace SXSI
+{
+
+#ifndef ulong
+typedef unsigned long ulong;
+#endif
+
+class ResultSet {
+private:
+    ulong n, lgn;
+    ulong *tree;
+    ulong lg(ulong);
+
+    ulong conv (ulong p, ulong n, ulong lgn);
+    ulong unconv (ulong p, ulong n, ulong lgn);
+    ulong clearRangeLeft (ulong p1, ulong n, ulong pos, ulong pot);
+    ulong clearRangeRight (ulong p2, ulong n, ulong pos, ulong pot);
+    ulong clearBoth (ulong n, ulong p1, ulong p2, ulong pos, ulong pot);
+
+    ulong nextSmallest (ulong n, ulong pos, ulong pot);
+    ulong nextLarger (ulong n, ulong p, ulong pos, ulong pot);
+
+    void prnspace (ulong k);
+public:
+    // creates empty results data structure for n nodes numbered 0..n-1
+    ResultSet(ulong n);
+    ~ResultSet();
+
+    // returns 0/1 telling whether result p is not/is present in R
+    bool get(ulong p);
+
+    // inserts result p into R
+    void set(ulong p);
+
+    // clears all results p1..p2 in R
+    void clearRange (ulong p1, ulong p2);
+    // clears all results
+    void clear();
+
+    // returns pos of next(p) in R, or -1 if none
+    ulong nextResult (ulong p);
+
+};
+
+}
+
+#endif // _RESULTSET_H_
index 9c6591f..98272c1 100644 (file)
@@ -201,6 +201,36 @@ namespace SXSI
         virtual full_result FullKMismatches(uchar const *, unsigned) const = 0;
         virtual full_result FullKErrors(uchar const *, unsigned) const = 0;
 
+
+        virtual TextPosition getLength() const
+        {
+            std::cerr << "TextCollection::getLength() is unsupported! Use RLCSA instead." << std::endl;
+            std::exit(2);
+            return 0;
+        }
+    
+        virtual TextPosition LF(uchar c, TextPosition i) const
+        {
+            std::cerr << "TextCollection::LF() is unsupported! Use RLCSA instead." << std::endl;
+            std::exit(2);
+            return 0;
+        }
+
+        virtual uchar* getSuffix(TextPosition pos, unsigned l) const
+        {
+            std::cerr << "TextCollection::getSuffix() is unsupported! Use RLCSA instead." << std::endl;
+            std::exit(2);
+            return 0;
+        }
+    
+        virtual DocId getDoc(TextPosition i) const
+        {
+            std::cerr << "TextCollection::getDoc() is unsupported! Use RLCSA instead." << std::endl;
+            std::exit(2);
+            return 0;
+        }
+        
+
     protected:
         // Protected constructor; use TextCollectionBuilder
         TextCollection() { };
index e8f8dbc..494b444 100644 (file)
@@ -1,37 +1,40 @@
 BitRank.o: BitRank.cpp BitRank.h BlockArray.h Tools.h
+EditDistance.o: EditDistance.cpp EditDistance.h Tools.h
 FMIndex.o: FMIndex.cpp FMIndex.h incbwt/bits/deltavector.h \
  incbwt/bits/bitvector.h incbwt/bits/../misc/definitions.h \
  incbwt/bits/bitbuffer.h BitRank.h BlockArray.h Tools.h TextCollection.h \
  TextStorage.h ArrayDoc.h
 FMIndexBuilder.o: FMIndexBuilder.cpp incbwt/rlcsa_builder.h \
- incbwt/rlcsa.h incbwt/bits/vectors.h incbwt/bits/deltavector.h \
- incbwt/bits/bitvector.h incbwt/bits/../misc/definitions.h \
- incbwt/bits/bitbuffer.h incbwt/bits/rlevector.h incbwt/sasamples.h \
- incbwt/misc/definitions.h incbwt/bits/bitbuffer.h \
- incbwt/bits/deltavector.h incbwt/lcpsamples.h incbwt/bits/array.h \
- incbwt/misc/utils.h incbwt/misc/definitions.h incbwt/misc/parameters.h \
- incbwt/bits/deltavector.h FMIndexBuilder.h TextCollectionBuilder.h \
- TextCollection.h Tools.h TextStorage.h FMIndex.h BitRank.h BlockArray.h \
- ArrayDoc.h
+ incbwt/rlcsa.h incbwt/bits/deltavector.h incbwt/bits/bitvector.h \
+ incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h \
+ incbwt/bits/rlevector.h incbwt/bits/nibblevector.h incbwt/sasamples.h \
+ incbwt/misc/definitions.h incbwt/bits/bitbuffer.h incbwt/lcpsamples.h \
+ incbwt/bits/array.h incbwt/misc/utils.h incbwt/misc/definitions.h \
+ incbwt/misc/parameters.h incbwt/bits/deltavector.h FMIndexBuilder.h \
+ TextCollectionBuilder.h TextCollection.h Tools.h TextStorage.h FMIndex.h \
+ BitRank.h BlockArray.h ArrayDoc.h
 HeapProfiler.o: HeapProfiler.cpp HeapProfiler.h
+Query.o: Query.cpp Query.h TextCollection.h Tools.h
 RLCSABuilder.o: RLCSABuilder.cpp incbwt/rlcsa_builder.h incbwt/rlcsa.h \
- incbwt/bits/vectors.h incbwt/bits/deltavector.h incbwt/bits/bitvector.h \
+ incbwt/bits/deltavector.h incbwt/bits/bitvector.h \
  incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h \
- incbwt/bits/rlevector.h incbwt/sasamples.h incbwt/misc/definitions.h \
- incbwt/bits/bitbuffer.h incbwt/bits/deltavector.h incbwt/lcpsamples.h \
+ incbwt/bits/rlevector.h incbwt/bits/nibblevector.h incbwt/sasamples.h \
+ incbwt/misc/definitions.h incbwt/bits/bitbuffer.h incbwt/lcpsamples.h \
  incbwt/bits/array.h incbwt/misc/utils.h incbwt/misc/definitions.h \
  incbwt/misc/parameters.h RLCSABuilder.h TextCollectionBuilder.h \
  TextCollection.h Tools.h TextStorage.h incbwt/bits/deltavector.h \
- RLCSAWrapper.h incbwt/rlcsa.h
+ RLCSAWrapper.h IndelQuery.h ResultSet.h EditDistance.h Query.h \
+ incbwt/rlcsa.h
+ResultSet.o: ResultSet.cpp ResultSet.h
 TextCollection.o: TextCollection.cpp TextCollection.h Tools.h FMIndex.h \
  incbwt/bits/deltavector.h incbwt/bits/bitvector.h \
  incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h BitRank.h \
  BlockArray.h TextStorage.h ArrayDoc.h SWCSAWrapper.h \
  swcsa/utils/defValues.h swcsa/utils/valstring.h swcsa/interface.h \
- RLCSAWrapper.h incbwt/rlcsa.h incbwt/bits/vectors.h \
- incbwt/bits/deltavector.h incbwt/bits/rlevector.h incbwt/sasamples.h \
- incbwt/misc/definitions.h incbwt/bits/bitbuffer.h \
- incbwt/bits/deltavector.h incbwt/lcpsamples.h incbwt/bits/array.h \
+ RLCSAWrapper.h IndelQuery.h ResultSet.h EditDistance.h Query.h \
+ incbwt/rlcsa.h incbwt/bits/deltavector.h incbwt/bits/rlevector.h \
+ incbwt/bits/nibblevector.h incbwt/sasamples.h incbwt/misc/definitions.h \
+ incbwt/bits/bitbuffer.h incbwt/lcpsamples.h incbwt/bits/array.h \
  incbwt/misc/utils.h incbwt/misc/definitions.h incbwt/misc/parameters.h
 TextCollectionBuilder.o: TextCollectionBuilder.cpp \
  TextCollectionBuilder.h TextCollection.h Tools.h TextStorage.h \
index 1c203df..a23a883 100644 (file)
@@ -1,74 +1,81 @@
 build_plcp.o: build_plcp.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \
  bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \
  bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \
- lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \
+ lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \
  misc/parameters.h
 build_rlcsa.o: build_rlcsa.cpp rlcsa_builder.h rlcsa.h bits/deltavector.h \
  bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \
  bits/rlevector.h bits/nibblevector.h sasamples.h misc/definitions.h \
- bits/bitbuffer.h lcpsamples.h bits/array.h bits/vectors.h misc/utils.h \
+ bits/bitbuffer.h lcpsamples.h bits/array.h misc/utils.h \
  misc/definitions.h misc/parameters.h
 display_test.o: display_test.cpp rlcsa.h bits/deltavector.h \
  bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \
  bits/rlevector.h bits/nibblevector.h sasamples.h misc/definitions.h \
- bits/bitbuffer.h lcpsamples.h bits/array.h bits/vectors.h misc/utils.h \
+ bits/bitbuffer.h lcpsamples.h bits/array.h misc/utils.h \
  misc/definitions.h misc/parameters.h
 extract_sequence.o: extract_sequence.cpp rlcsa.h bits/deltavector.h \
  bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \
  bits/rlevector.h bits/nibblevector.h sasamples.h misc/definitions.h \
- bits/bitbuffer.h lcpsamples.h bits/array.h bits/vectors.h misc/utils.h \
+ bits/bitbuffer.h lcpsamples.h bits/array.h misc/utils.h \
  misc/definitions.h misc/parameters.h
 lcp_test.o: lcp_test.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \
  bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \
  bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \
- lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \
+ lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \
  misc/parameters.h
-lcpsamples.o: lcpsamples.cpp lcpsamples.h bits/array.h \
- bits/../misc/definitions.h bits/bitbuffer.h bits/vectors.h misc/utils.h \
+lcpsamples.o: lcpsamples.cpp lcpsamples.h bits/deltavector.h \
+ bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \
+ bits/rlevector.h bits/nibblevector.h bits/array.h misc/utils.h \
  misc/definitions.h
 locate_test.o: locate_test.cpp rlcsa.h bits/deltavector.h \
  bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \
  bits/rlevector.h bits/nibblevector.h sasamples.h misc/definitions.h \
- bits/bitbuffer.h lcpsamples.h bits/array.h bits/vectors.h misc/utils.h \
+ bits/bitbuffer.h lcpsamples.h bits/array.h misc/utils.h \
  misc/definitions.h misc/parameters.h
 parallel_build.o: parallel_build.cpp rlcsa_builder.h rlcsa.h \
  bits/deltavector.h bits/bitvector.h bits/../misc/definitions.h \
  bits/bitbuffer.h bits/rlevector.h bits/nibblevector.h sasamples.h \
  misc/definitions.h bits/bitbuffer.h lcpsamples.h bits/array.h \
bits/vectors.h misc/utils.h misc/definitions.h misc/parameters.h
+ misc/utils.h misc/definitions.h misc/parameters.h
 read_bwt.o: read_bwt.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \
  bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \
  bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \
- lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \
+ lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \
  misc/parameters.h
 rlcsa.o: rlcsa.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \
  bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \
  bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \
- lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \
- misc/parameters.h qsufsort/qsufsort.h qsufsort/../misc/definitions.h
+ lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \
+ misc/parameters.h qsufsort/qsufsort.h qsufsort/../misc/definitions.h \
+ bits/vectors.h
 rlcsa_builder.o: rlcsa_builder.cpp rlcsa_builder.h rlcsa.h \
  bits/deltavector.h bits/bitvector.h bits/../misc/definitions.h \
  bits/bitbuffer.h bits/rlevector.h bits/nibblevector.h sasamples.h \
  misc/definitions.h bits/bitbuffer.h lcpsamples.h bits/array.h \
bits/vectors.h misc/utils.h misc/definitions.h misc/parameters.h
+ misc/utils.h misc/definitions.h misc/parameters.h
 rlcsa_grep.o: rlcsa_grep.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \
  bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \
  bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \
- lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \
+ lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \
  misc/parameters.h
 rlcsa_test.o: rlcsa_test.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \
  bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \
  bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \
- lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \
+ lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \
  misc/parameters.h
 sample_lcp.o: sample_lcp.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \
  bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \
  bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \
- lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \
+ lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \
  misc/parameters.h
 sasamples.o: sasamples.cpp sasamples.h misc/definitions.h \
  bits/bitbuffer.h bits/../misc/definitions.h bits/deltavector.h \
  bits/bitvector.h bits/bitbuffer.h misc/utils.h misc/definitions.h
+text_generator.o: text_generator.cpp rlcsa.h bits/deltavector.h \
+ bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \
+ bits/rlevector.h bits/nibblevector.h sasamples.h misc/definitions.h \
+ bits/bitbuffer.h lcpsamples.h bits/array.h misc/utils.h \
+ misc/definitions.h misc/parameters.h
 array.o: bits/array.cpp bits/array.h bits/../misc/definitions.h \
  bits/bitbuffer.h
 bitvector.o: bits/bitvector.cpp bits/bitvector.h \
index 60aa8ba..8c88467 100644 (file)
--- a/makefile
+++ b/makefile
@@ -10,7 +10,7 @@ LIBSWCSA = swcsa/swcsa.a
 dcover_obs = dcover/difference_cover.o
 
 TextCollection_obs = TextCollection.o TextCollectionBuilder.o FMIndexBuilder.o RLCSABuilder.o FMIndex.o Tools.o \
-                     TextStorage.o ${LIBRLCSA} ${LIBCDSA} ${LIBLZTRIE} ${LIBSWCSA}
+                     TextStorage.o Query.o EditDistance.o ResultSet.o ${LIBRLCSA} ${LIBCDSA} ${LIBLZTRIE} ${LIBSWCSA}
 TCDebug_obs =  bittree.o rbtree.o dynFMI.o 
 
 all: testTextCollection