Added approximate pattern matching with Suffix Filters
[SXSI/TextCollection.git] / EditDistance.h
diff --git a/EditDistance.h b/EditDistance.h
new file mode 100644 (file)
index 0000000..ad0e868
--- /dev/null
@@ -0,0 +1,545 @@
+#ifndef _SXSI_EditDistance_h_
+#define _SXSI_EditDistance_h_
+
+/**
+ * TODO
+ * [ ] clean up code
+ */
+
+#include <utility>
+#include <cstring>
+#include "Tools.h"
+
+namespace SXSI
+{
+
+class EditDistance 
+{
+public:
+    uchar const *pat;
+    unsigned  patlen;
+    unsigned maxlen;
+    unsigned **dp;
+    unsigned p;
+
+    bool end()
+    {
+        if (p < maxlen)
+            return false;
+        return true;
+    }
+
+    EditDistance(uchar const *pattern, unsigned len, unsigned maxk) {
+        pat = pattern;
+        patlen = len;
+        maxlen = len + maxk;
+        p = 0;
+        dp = new unsigned*[patlen + 1];
+        for (unsigned i = 0; i <= patlen; i++) {
+            dp[i] = new unsigned[maxlen+1];
+            dp[i][0] = i;
+        }
+    }
+
+    ~EditDistance() {
+        for (unsigned i = 0; i <= patlen; i++)
+            delete [] dp[i];
+        delete [] dp;
+    }
+
+    std::pair<unsigned, unsigned> prefixDist(uchar const *cand, unsigned candlen) {
+        unsigned mindist = patlen+candlen;
+        unsigned minlen = patlen+candlen;
+        unsigned *col = new unsigned[patlen+1];
+        unsigned *ncol = new unsigned[patlen+1];
+        for (unsigned i = 0; i <= patlen; i++)
+            col[i] = i;
+        for (unsigned j = 1; j <= candlen; j++) {
+            ncol[0] = j;
+            for (unsigned i = 1; i <= patlen; i++) {
+                unsigned match = (cand[j-1] == pat[i-1])? col[i - 1] : (col[i - 1] + 1);
+                ncol[i] = std::min(col[i] + 1, ncol[i - 1] + 1);
+                ncol[i] = std::min(ncol[i],match);
+            }
+            if (ncol[patlen]<mindist)
+            {
+                mindist = ncol[patlen];
+                minlen = j;
+            }
+            for (unsigned i = 0; i <= patlen; i++)
+                col[i] = ncol[i];
+        }
+        delete [] col;
+        delete [] ncol;
+        return std::make_pair(mindist,minlen);
+    }
+
+
+    unsigned pushChar(uchar c) {
+        unsigned mindist = patlen;
+        p++;
+        if (p>maxlen)
+            std::cout << "out of bounds in DP computation, p = " << p << ", max = " << maxlen << std::endl;
+        dp[0][p] = p;
+        for (unsigned i = 1; i <= patlen; i++) {
+            unsigned match = (c == pat[i-1])? dp[i - 1][p-1] : (dp[i - 1][p-1] + 1);
+            dp[i][p] = std::min(dp[i][p-1] + 1, dp[i - 1][p] + 1);
+            dp[i][p] = std::min(dp[i][p],match);
+            if (dp[i][p]<mindist)
+                mindist = dp[i][p];
+        }
+        return mindist;
+    }
+
+    void popChar() {
+        --p;
+        return;
+    }
+
+    unsigned dist() {
+        return dp[patlen][p];
+    }
+
+    unsigned getSuffixLength()
+    {
+        unsigned mindist = dp[patlen][p];
+        unsigned minlen = patlen;
+        for (unsigned i = 0; i < patlen; ++i)
+            if (dp[i][p] < mindist)
+            {
+                mindist = dp[i][p];
+                minlen = i;
+            }
+        return minlen;
+    }
+
+};
+
+
+class MyersEditDistanceIncremental
+{
+  public:
+
+    MyersEditDistanceIncremental(const uchar *pattern, unsigned _length, unsigned _max_errors) :
+      length(_length), max_errors(_max_errors), min_row(~0u)
+    {
+      this->max_length = this->length + this->max_errors;
+      this->blocks = (this->length + W - 1) / W;
+      this->last_block_length = this->length % W;
+      if(this->last_block_length == 0) { this->last_block_length = W; }
+      this->last_block_mask = ~0lu >> (this->blocks * W - this->length);
+
+      this->initializeCharVectors(pattern);
+      this->initializePlusMinusVectors();
+    }
+
+    ~MyersEditDistanceIncremental()
+    {
+      for(unsigned c = 0; c < 256; c++)
+      {
+        delete[] this->char_vectors[c]; this->char_vectors[c] = 0;
+      }
+      delete[] this->char_vectors; this->char_vectors = 0;
+
+      for(unsigned i = 0; i <= this->max_length; i++)
+      {
+        delete[] this->plus_vectors[i]; this->plus_vectors[i] = 0;
+        delete[] this->minus_vectors[i]; this->minus_vectors[i] = 0;
+      }
+      delete[] this->plus_vectors; this->plus_vectors = 0;
+      delete[] this->minus_vectors; this->minus_vectors = 0;
+
+      delete[] this->score; this->score = 0;
+    }
+
+    unsigned pushChar(unsigned c)
+    {
+      if(c >= 256)
+      {
+        std::cerr << "MyersEditDistance: Invalid character value " << c << "!" << std::endl;
+        return ~0u;
+      }
+      if(this->pos >= this->max_length)
+      {
+        std::cerr << "MyersEditDistance: Cannot pushChar(c) past limit!" << std::endl;
+        std::cerr << "MyersEditDistance: Current text position " << this-> pos << ", maximum " << this->max_length << std::endl;
+        return ~0u;
+      }
+
+      this->pos++;
+      ulong c_vec, diagonal, h_plus = 0, h_minus = 0, plus, minus, temp;
+      ulong d_overflow = 0, hp_overflow = 1, hm_overflow = 0;
+
+      /*
+        Update rules from http://www.cs.uta.fi/~helmu/pubs/phd.pdf (Figure 25).
+      */
+      for(unsigned i = 0; i < this->blocks; i++)
+      {
+        c_vec = this->char_vectors[c][i];
+        plus = this->plus_vectors[this->pos - 1][i];
+        minus = this->minus_vectors[this->pos - 1][i];
+
+        diagonal = c_vec & plus;
+        temp = diagonal + plus + d_overflow;
+        d_overflow = ((temp < diagonal) || (temp == diagonal && d_overflow == 1) ? 1 : 0);
+        diagonal = (temp ^ plus) | c_vec | minus;
+
+        h_plus = minus | ~(diagonal | plus);
+        h_minus = diagonal & plus;
+
+        this->plus_vectors[this->pos][i] =
+          (h_minus << 1) | hm_overflow | ~(diagonal | (h_plus << 1) | hp_overflow);
+        this->minus_vectors[this->pos][i] = diagonal & ((h_plus << 1) | hp_overflow);
+
+        hp_overflow = (h_plus & HIGH_BIT ? 1 : 0);
+        hm_overflow = (h_minus & HIGH_BIT ? 1 : 0);
+      }
+      this->plus_vectors[this->pos][this->blocks - 1] &= this->last_block_mask;
+      this->minus_vectors[this->pos][this->blocks - 1] &= this->last_block_mask;
+
+      this->score[this->pos] = this->score[this->pos - 1];
+      if(h_plus & (1lu << (this->length % W - 1)))
+      {
+        this->score[this->pos]++;
+      }
+      else if(h_minus & (1lu << (this->length % W - 1)))
+      {
+        this->score[this->pos]--;
+      }
+
+      // Find the minimal value in the current column.
+      int minimum = this->score[this->pos], cur = minimum;
+      min_row = this->length;
+      for(unsigned i = this->blocks; i > 0;)
+      {
+        i--;
+        plus = this->plus_vectors[this->pos][i];
+        minus = this->minus_vectors[this->pos][i];
+
+        for(unsigned j = W; j > 0;)
+        {
+          j -= CHAR_BIT;
+          unsigned a = (plus >> j) & 0xFF, b = (minus >> j) & 0xFF;
+          if (minimum > cur + myers_optimum[256 * a + b])
+          {
+              minimum = cur + myers_optimum[256 * a + b];
+              min_row = i*W + j + myers_best_pos[256 * a + b];
+          }
+          cur += myers_total[256 * a + b];
+        }
+      }
+
+      return minimum;
+    }
+
+    unsigned dist() {
+        return this->score[this->pos];
+    }
+    
+    void popChar()
+    {
+      if(this->pos == 0)
+      {
+        std::cerr << "MyersEditDistance: Cannot popChar() from text position 0!" << std::endl;
+        return;
+      }
+
+      this->pos--;
+    }
+
+    inline unsigned getSuffixLength()
+    {
+        return min_row;
+    }
+
+/*
+    // FIXME implement
+    std::pair<unsigned, unsigned> prefixDist(uchar const *text, unsigned n)
+*/
+
+    bool end()
+    {
+        if (pos < max_length)
+            return false;
+        return true;
+    }
+
+static void initMyersFourRussians()
+{
+//  myers_total = new signed char[256 * 256];
+//  myers_optimum = new signed char[256 * 256];
+//   myers_best_pos = new uchar[256 * 256];
+
+  unsigned bp = 8;
+  for(unsigned i = 0; i < 256; i++)
+  {
+    for(unsigned j = 0; j < 256; j++)
+    {
+      int cur = 0, best = 0;
+      for(unsigned k = CHAR_BIT; k > 0;)
+      {
+        k--;
+        if(i & (1 << k)) { cur--; }
+        if(j & (1 << k)) { cur++; }
+        if(cur < best)
+        {
+          best = cur;
+          bp = k;
+        }
+      }
+
+      myers_total[256 * i + j] = cur;
+      myers_optimum[256 * i + j] = best;
+      myers_best_pos[256 * i + j] = bp;
+    }
+  }
+}
+
+    unsigned length;
+private:
+    static signed char myers_total[];
+    static signed char myers_optimum[];
+    static uchar       myers_best_pos[];
+
+
+    const static ulong HIGH_BIT = 1lu << (W - 1);
+
+    ulong **char_vectors;
+    unsigned max_errors;
+    unsigned max_length;
+    unsigned min_row;
+
+    unsigned blocks;            // Number of blocks required for the pattern
+    unsigned last_block_length; // Number of bits in the final block
+    ulong last_block_mask;
+
+    ulong **plus_vectors;
+    ulong **minus_vectors;
+    unsigned* score;
+
+    unsigned pos;               // We have matched pos characters of text
+
+
+    void initializeCharVectors(uchar const *pattern)
+    {
+      this->char_vectors = new ulong*[256];
+      for(unsigned c = 0; c < 256; c++)
+      {
+        this->char_vectors[c] = new ulong[this->blocks];
+        std::memset(this->char_vectors[c], 0, this->blocks * sizeof(ulong));
+      }
+
+      for(unsigned i = 0; i < this->length; i++)
+      {
+        this->char_vectors[pattern[i]][i / W] |= (1lu << (i % W));
+      }
+    }
+
+    void initializePlusMinusVectors()
+    {
+      this->plus_vectors = new ulong*[this->max_length + 1];
+      this->minus_vectors = new ulong*[this->max_length + 1];
+      this->score = new unsigned[this->max_length + 1];
+
+      for(unsigned i = 0; i <= this->max_length; i++)
+      {
+        this->plus_vectors[i] = new ulong[this->blocks];
+        this->minus_vectors[i] = new ulong[this->blocks];
+      }
+
+      this->initialStep();
+    }
+
+    void initialStep()
+    {
+      for(unsigned i = 0; i < this->blocks; i++)
+      {
+        this->plus_vectors[0][i] = ~0lu;
+        this->minus_vectors[0][i] = 0;
+      }
+      this->plus_vectors[0][this->blocks - 1] = this->last_block_mask;
+
+      this->score[0] = this->length;
+      this->pos = 0;
+    }
+
+
+
+};
+
+
+
+
+    class MyersEditDistance
+    {
+    public:
+        ulong **Peq;
+        bool S[256];
+        unsigned k;
+        unsigned m;
+        unsigned blockCount;
+        unsigned block;
+        unsigned lastBlockLength;
+        ulong *Pv;
+        ulong *Mv;
+        ulong *Score;
+        ulong twopowmpos;
+        ulong twopowwpos;
+        void init(uchar const *P)
+        {
+            ulong i;
+            ulong j;
+            ulong l;
+            for(i = 0;i< 256;i++) {
+                S[i] = false;
+                for (l=0;l <= ((m-1) / W); l++)
+                    Peq[l][i] = 0;
+            }
+            for (l=0;l <= ((m-1) / W); l++)
+                for (j = 1; j <= W; j++) {
+                    if (W*l+j > m)
+                        break;
+                    Peq[l][P[W*l+j-1]] = Peq[l][P[W*l+j-1]] | (1lu << (j-1));
+                    if (j+W*l <= k+1)
+                        S[(unsigned)P[W*l+j-1]] = true;
+                }
+            for (l=0; l < blockCount; l++) {
+                Mv[l] = 0lu;
+                Score[l] = (l+1lu)*W;
+                Pv[l] = ~0lu;
+            }
+            Mv[blockCount] = 0;
+            Score[blockCount] = m;
+            Pv[blockCount] = (~0lu) >> ((blockCount+1lu)*W - m);
+        }
+
+        MyersEditDistance(unsigned _m, unsigned maxk) 
+            : Peq(0), k(maxk), m(_m)
+        {
+            ulong l;
+            Peq = new ulong *[(m-1) / W + 1]; // (unsigned **)malloc((((m-1) / w)+1)*sizeof(unsigned*));
+            for (l=0;l <= ((m-1) / W); l++)
+                Peq[l] = new ulong[256]; //(unsigned *)malloc(256*sizeof(unsigned));
+
+            blockCount= ((m-1) / W);
+            block = (k-1) / W;
+            lastBlockLength = m % W;
+            if (lastBlockLength == 0)
+                lastBlockLength = W;
+            Pv = new ulong[blockCount+1];  // (unsigned *)malloc((blockCount+1)*sizeof(unsigned));
+            Mv = new ulong[blockCount+1];  // (unsigned *)malloc((blockCount+1)*sizeof(unsigned));
+            Score = new ulong[blockCount+1];//(unsigned *)malloc((blockCount+1)*sizeof(unsigned));
+            twopowmpos = 1lu << (lastBlockLength-1);
+            twopowwpos = 1lu << (W-1);
+        }
+
+        ~MyersEditDistance()
+        {
+            delete [] Pv; Pv = 0;
+            delete [] Mv; Mv = 0;
+            delete [] Score; Score = 0;
+            unsigned l;
+            for (l=0;l <= ((m-1) / W); l++)
+            {
+                delete [] Peq[l];
+                Peq[l] = 0;
+            }
+            delete [] Peq; Peq = 0;
+        }
+
+        std::pair<unsigned, unsigned> prefixDist(uchar const *text, unsigned n)
+        {
+            unsigned mindist = m+n;
+            unsigned minlen = m+n;
+            //            int count=0;
+            //int blockAverage=0;
+            ulong Eq, Xv, Xh, Ph, Mh;
+            ulong Phendbit;
+            ulong Mhendbit;
+            ulong temp;
+            ulong j;
+            ulong l;
+            ulong overk=1;
+            block = (k-1) / W;
+            for(j = 1; j <= n; j++) {
+                Phendbit = 1lu; //0;
+                Mhendbit = 0lu;
+                for (l=0; l <= blockCount; l++) {
+                    Eq = Peq[l][(unsigned char)text[j-1]];
+                    Xv = Eq | Mv[l];
+                    Xh = ((((Eq | Mhendbit)& Pv[l]) + Pv[l]) ^ Pv[l]) | (Eq | Mhendbit);
+                    Ph = Mv[l] | ~ (Xh | Pv[l]);
+                    Mh = Pv[l] & Xh;
+                    temp = l < blockCount ? twopowwpos : twopowmpos;
+                    if (Ph & temp)
+                        Score[l] += 1;
+                    else if (Mh & temp)
+                        Score[l] -= 1;
+                    temp = (Ph >> (W-1));
+                    Ph <<= 1;
+                    Ph |= Phendbit;
+                    Phendbit = temp;
+                    temp = (Mh >> (W-1));
+                    Mh <<= 1;
+                    Mh |= Mhendbit;
+                    Mhendbit = temp;
+                    Pv[l] = (Mh) | ~ (Xv | Ph);
+                    Mv[l] = Ph & Xv;
+                    if (block == l+1 &&
+                        ((block == blockCount &&
+                          (Score[l] > k+lastBlockLength ||
+                           (Score[l] > k && overk != 0 && j - overk >= lastBlockLength))) ||
+                         (block != blockCount &&
+                          (Score[l] > k+W ||
+                           (Score[l] > k && overk != 0 && j - overk >= W)))))  {
+                        // lohkon Score kasvanut niin isoksi, ettei seuraavaa kannata laskea
+                        overk = 0;
+                        block = l;
+                        break;
+                    }
+                    else if (block == l+1 && overk == 0 && Score[l] > k)
+                        // Talletetaan ensimmäinen diagonaali jolla Score > k. Näin tiedetään
+                        // jatkossa koska seuraavan lohkon laskeminen on turhaa.
+                        overk = j;
+                    else if (block == l+1 && Score[l] <= k)
+                        // Diagonaalilla Score <= k ==> Nollataan muuttuja overk.
+                        overk = 0;
+                    else if (block == l && block != blockCount && Score[l] <= k) {
+                        // Score <= k ==> jatketaan seuraavaan lohkoon. Koska seuraavaa lohkoa
+                        // ei ole laskettu edellisellä sarakkeella, pitää sen arvot päivittää.
+                        Score[l+1] = k+1lu;
+                        Pv[l+1] = 0lu;
+                        Mv[l+1] = 0lu;
+                        overk = 0lu;
+                        block = l+1;
+                    }
+                    else if (block == l)
+                        // Seuraavaan lohkoon ei kannata siirtyä, kun Score > k.
+                        break;
+                }
+//                blockAverage += block+1;
+                // if (block == blockCount && Score[blockCount] <= k) {
+                if (block == blockCount && Score[blockCount] <= mindist)
+                {
+                    mindist = Score[blockCount];
+                    minlen = j;
+                
+                    //printf("Min %u at len %u\n", mindist, minlen);
+                //    count++;
+                }
+            }
+//            return count;
+            return std::make_pair(mindist,minlen);
+            /*ShowMessage("Esiintymiä löytyi " + IntToStr(laskuri) + "kpl. " +
+              "Aikaa meni " + IntToStr(msec) + "msec");
+              ShowMessage("Keskimääräinen lohkojen määrä " +
+              FormatFloat("0.00",(float)blockAverage / n));*/
+        }
+
+};
+
+
+} // Namespace
+
+#endif