Debug swcsa
[SXSI/TextCollection.git] / EditDistance.h
1 #ifndef _SXSI_EditDistance_h_
2 #define _SXSI_EditDistance_h_
3
4 /**
5  * TODO
6  * [ ] clean up code
7  */
8
9 #include <utility>
10 #include <cstring>
11 #include "Tools.h"
12
13 namespace SXSI
14 {
15
16 class EditDistance 
17 {
18 public:
19     uchar const *pat;
20     unsigned  patlen;
21     unsigned maxlen;
22     unsigned **dp;
23     unsigned p;
24
25     bool end()
26     {
27         if (p < maxlen)
28             return false;
29         return true;
30     }
31
32     EditDistance(uchar const *pattern, unsigned len, unsigned maxk) {
33         pat = pattern;
34         patlen = len;
35         maxlen = len + maxk;
36         p = 0;
37         dp = new unsigned*[patlen + 1];
38         for (unsigned i = 0; i <= patlen; i++) {
39             dp[i] = new unsigned[maxlen+1];
40             dp[i][0] = i;
41         }
42     }
43
44     ~EditDistance() {
45         for (unsigned i = 0; i <= patlen; i++)
46             delete [] dp[i];
47         delete [] dp;
48     }
49
50     std::pair<unsigned, unsigned> prefixDist(uchar const *cand, unsigned candlen) {
51         unsigned mindist = patlen+candlen;
52         unsigned minlen = patlen+candlen;
53         unsigned *col = new unsigned[patlen+1];
54         unsigned *ncol = new unsigned[patlen+1];
55         for (unsigned i = 0; i <= patlen; i++)
56             col[i] = i;
57         for (unsigned j = 1; j <= candlen; j++) {
58             ncol[0] = j;
59             for (unsigned i = 1; i <= patlen; i++) {
60                 unsigned match = (cand[j-1] == pat[i-1])? col[i - 1] : (col[i - 1] + 1);
61                 ncol[i] = std::min(col[i] + 1, ncol[i - 1] + 1);
62                 ncol[i] = std::min(ncol[i],match);
63             }
64             if (ncol[patlen]<mindist)
65             {
66                 mindist = ncol[patlen];
67                 minlen = j;
68             }
69             for (unsigned i = 0; i <= patlen; i++)
70                 col[i] = ncol[i];
71         }
72         delete [] col;
73         delete [] ncol;
74         return std::make_pair(mindist,minlen);
75     }
76
77
78     unsigned pushChar(uchar c) {
79         unsigned mindist = patlen;
80         p++;
81         if (p>maxlen)
82             std::cout << "out of bounds in DP computation, p = " << p << ", max = " << maxlen << std::endl;
83         dp[0][p] = p;
84         for (unsigned i = 1; i <= patlen; i++) {
85             unsigned match = (c == pat[i-1])? dp[i - 1][p-1] : (dp[i - 1][p-1] + 1);
86             dp[i][p] = std::min(dp[i][p-1] + 1, dp[i - 1][p] + 1);
87             dp[i][p] = std::min(dp[i][p],match);
88             if (dp[i][p]<mindist)
89                 mindist = dp[i][p];
90         }
91         return mindist;
92     }
93
94     void popChar() {
95         --p;
96         return;
97     }
98
99     unsigned dist() {
100         return dp[patlen][p];
101     }
102
103     unsigned getSuffixLength()
104     {
105         unsigned mindist = dp[patlen][p];
106         unsigned minlen = patlen;
107         for (unsigned i = 0; i < patlen; ++i)
108             if (dp[i][p] < mindist)
109             {
110                 mindist = dp[i][p];
111                 minlen = i;
112             }
113         return minlen;
114     }
115
116 };
117
118
119 class MyersEditDistanceIncremental
120 {
121   public:
122
123     MyersEditDistanceIncremental(const uchar *pattern, unsigned _length, unsigned _max_errors) :
124       length(_length), max_errors(_max_errors), min_row(~0u)
125     {
126       this->max_length = this->length + this->max_errors;
127       this->blocks = (this->length + W - 1) / W;
128       this->last_block_length = this->length % W;
129       if(this->last_block_length == 0) { this->last_block_length = W; }
130       this->last_block_mask = ~0lu >> (this->blocks * W - this->length);
131
132       this->initializeCharVectors(pattern);
133       this->initializePlusMinusVectors();
134     }
135
136     ~MyersEditDistanceIncremental()
137     {
138       for(unsigned c = 0; c < 256; c++)
139       {
140         delete[] this->char_vectors[c]; this->char_vectors[c] = 0;
141       }
142       delete[] this->char_vectors; this->char_vectors = 0;
143
144       for(unsigned i = 0; i <= this->max_length; i++)
145       {
146         delete[] this->plus_vectors[i]; this->plus_vectors[i] = 0;
147         delete[] this->minus_vectors[i]; this->minus_vectors[i] = 0;
148       }
149       delete[] this->plus_vectors; this->plus_vectors = 0;
150       delete[] this->minus_vectors; this->minus_vectors = 0;
151
152       delete[] this->score; this->score = 0;
153     }
154
155     unsigned pushChar(unsigned c)
156     {
157       if(c >= 256)
158       {
159         std::cerr << "MyersEditDistance: Invalid character value " << c << "!" << std::endl;
160         return ~0u;
161       }
162       if(this->pos >= this->max_length)
163       {
164         std::cerr << "MyersEditDistance: Cannot pushChar(c) past limit!" << std::endl;
165         std::cerr << "MyersEditDistance: Current text position " << this-> pos << ", maximum " << this->max_length << std::endl;
166         return ~0u;
167       }
168
169       this->pos++;
170       ulong c_vec, diagonal, h_plus = 0, h_minus = 0, plus, minus, temp;
171       ulong d_overflow = 0, hp_overflow = 1, hm_overflow = 0;
172
173       /*
174         Update rules from http://www.cs.uta.fi/~helmu/pubs/phd.pdf (Figure 25).
175       */
176       for(unsigned i = 0; i < this->blocks; i++)
177       {
178         c_vec = this->char_vectors[c][i];
179         plus = this->plus_vectors[this->pos - 1][i];
180         minus = this->minus_vectors[this->pos - 1][i];
181
182         diagonal = c_vec & plus;
183         temp = diagonal + plus + d_overflow;
184         d_overflow = ((temp < diagonal) || (temp == diagonal && d_overflow == 1) ? 1 : 0);
185         diagonal = (temp ^ plus) | c_vec | minus;
186
187         h_plus = minus | ~(diagonal | plus);
188         h_minus = diagonal & plus;
189
190         this->plus_vectors[this->pos][i] =
191           (h_minus << 1) | hm_overflow | ~(diagonal | (h_plus << 1) | hp_overflow);
192         this->minus_vectors[this->pos][i] = diagonal & ((h_plus << 1) | hp_overflow);
193
194         hp_overflow = (h_plus & HIGH_BIT ? 1 : 0);
195         hm_overflow = (h_minus & HIGH_BIT ? 1 : 0);
196       }
197       this->plus_vectors[this->pos][this->blocks - 1] &= this->last_block_mask;
198       this->minus_vectors[this->pos][this->blocks - 1] &= this->last_block_mask;
199
200       this->score[this->pos] = this->score[this->pos - 1];
201       if(h_plus & (1lu << (this->length % W - 1)))
202       {
203         this->score[this->pos]++;
204       }
205       else if(h_minus & (1lu << (this->length % W - 1)))
206       {
207         this->score[this->pos]--;
208       }
209
210       // Find the minimal value in the current column.
211       int minimum = this->score[this->pos], cur = minimum;
212       min_row = this->length;
213       for(unsigned i = this->blocks; i > 0;)
214       {
215         i--;
216         plus = this->plus_vectors[this->pos][i];
217         minus = this->minus_vectors[this->pos][i];
218
219         for(unsigned j = W; j > 0;)
220         {
221           j -= CHAR_BIT;
222           unsigned a = (plus >> j) & 0xFF, b = (minus >> j) & 0xFF;
223           if (minimum > cur + myers_optimum[256 * a + b])
224           {
225               minimum = cur + myers_optimum[256 * a + b];
226               min_row = i*W + j + myers_best_pos[256 * a + b];
227           }
228           cur += myers_total[256 * a + b];
229         }
230       }
231
232       return minimum;
233     }
234
235     unsigned dist() {
236         return this->score[this->pos];
237     }
238     
239     void popChar()
240     {
241       if(this->pos == 0)
242       {
243         std::cerr << "MyersEditDistance: Cannot popChar() from text position 0!" << std::endl;
244         return;
245       }
246
247       this->pos--;
248     }
249
250     inline unsigned getSuffixLength()
251     {
252         return min_row;
253     }
254
255 /*
256     // FIXME implement
257     std::pair<unsigned, unsigned> prefixDist(uchar const *text, unsigned n)
258 */
259
260     bool end()
261     {
262         if (pos < max_length)
263             return false;
264         return true;
265     }
266
267 static void initMyersFourRussians()
268 {
269 //  myers_total = new signed char[256 * 256];
270 //  myers_optimum = new signed char[256 * 256];
271 //   myers_best_pos = new uchar[256 * 256];
272
273   unsigned bp = 8;
274   for(unsigned i = 0; i < 256; i++)
275   {
276     for(unsigned j = 0; j < 256; j++)
277     {
278       int cur = 0, best = 0;
279       for(unsigned k = CHAR_BIT; k > 0;)
280       {
281         k--;
282         if(i & (1 << k)) { cur--; }
283         if(j & (1 << k)) { cur++; }
284         if(cur < best)
285         {
286           best = cur;
287           bp = k;
288         }
289       }
290
291       myers_total[256 * i + j] = cur;
292       myers_optimum[256 * i + j] = best;
293       myers_best_pos[256 * i + j] = bp;
294     }
295   }
296 }
297
298     unsigned length;
299 private:
300     static signed char myers_total[];
301     static signed char myers_optimum[];
302     static uchar       myers_best_pos[];
303
304
305     const static ulong HIGH_BIT = 1lu << (W - 1);
306
307     ulong **char_vectors;
308     unsigned max_errors;
309     unsigned max_length;
310     unsigned min_row;
311
312     unsigned blocks;            // Number of blocks required for the pattern
313     unsigned last_block_length; // Number of bits in the final block
314     ulong last_block_mask;
315
316     ulong **plus_vectors;
317     ulong **minus_vectors;
318     unsigned* score;
319
320     unsigned pos;               // We have matched pos characters of text
321
322
323     void initializeCharVectors(uchar const *pattern)
324     {
325       this->char_vectors = new ulong*[256];
326       for(unsigned c = 0; c < 256; c++)
327       {
328         this->char_vectors[c] = new ulong[this->blocks];
329         std::memset(this->char_vectors[c], 0, this->blocks * sizeof(ulong));
330       }
331
332       for(unsigned i = 0; i < this->length; i++)
333       {
334         this->char_vectors[pattern[i]][i / W] |= (1lu << (i % W));
335       }
336     }
337
338     void initializePlusMinusVectors()
339     {
340       this->plus_vectors = new ulong*[this->max_length + 1];
341       this->minus_vectors = new ulong*[this->max_length + 1];
342       this->score = new unsigned[this->max_length + 1];
343
344       for(unsigned i = 0; i <= this->max_length; i++)
345       {
346         this->plus_vectors[i] = new ulong[this->blocks];
347         this->minus_vectors[i] = new ulong[this->blocks];
348       }
349
350       this->initialStep();
351     }
352
353     void initialStep()
354     {
355       for(unsigned i = 0; i < this->blocks; i++)
356       {
357         this->plus_vectors[0][i] = ~0lu;
358         this->minus_vectors[0][i] = 0;
359       }
360       this->plus_vectors[0][this->blocks - 1] = this->last_block_mask;
361
362       this->score[0] = this->length;
363       this->pos = 0;
364     }
365
366
367
368 };
369
370
371
372
373     class MyersEditDistance
374     {
375     public:
376         ulong **Peq;
377         bool S[256];
378         unsigned k;
379         unsigned m;
380         unsigned blockCount;
381         unsigned block;
382         unsigned lastBlockLength;
383         ulong *Pv;
384         ulong *Mv;
385         ulong *Score;
386         ulong twopowmpos;
387         ulong twopowwpos;
388  
389         void init(uchar const *P)
390         {
391             ulong i;
392             ulong j;
393             ulong l;
394             for(i = 0;i< 256;i++) {
395                 S[i] = false;
396                 for (l=0;l <= ((m-1) / W); l++)
397                     Peq[l][i] = 0;
398             }
399             for (l=0;l <= ((m-1) / W); l++)
400                 for (j = 1; j <= W; j++) {
401                     if (W*l+j > m)
402                         break;
403                     Peq[l][P[W*l+j-1]] = Peq[l][P[W*l+j-1]] | (1lu << (j-1));
404                     if (j+W*l <= k+1)
405                         S[(unsigned)P[W*l+j-1]] = true;
406                 }
407             for (l=0; l < blockCount; l++) {
408                 Mv[l] = 0lu;
409                 Score[l] = (l+1lu)*W;
410                 Pv[l] = ~0lu;
411             }
412             Mv[blockCount] = 0;
413             Score[blockCount] = m;
414             Pv[blockCount] = (~0lu) >> ((blockCount+1lu)*W - m);
415         }
416
417         MyersEditDistance(unsigned _m, unsigned maxk) 
418             : Peq(0), k(maxk), m(_m)
419         {
420             ulong l;
421             Peq = new ulong *[(m-1) / W + 1]; // (unsigned **)malloc((((m-1) / w)+1)*sizeof(unsigned*));
422             for (l=0;l <= ((m-1) / W); l++)
423                 Peq[l] = new ulong[256]; //(unsigned *)malloc(256*sizeof(unsigned));
424
425             blockCount= ((m-1) / W);
426             block = (k-1) / W;
427             lastBlockLength = m % W;
428             if (lastBlockLength == 0)
429                 lastBlockLength = W;
430             Pv = new ulong[blockCount+1];  // (unsigned *)malloc((blockCount+1)*sizeof(unsigned));
431             Mv = new ulong[blockCount+1];  // (unsigned *)malloc((blockCount+1)*sizeof(unsigned));
432             Score = new ulong[blockCount+1];//(unsigned *)malloc((blockCount+1)*sizeof(unsigned));
433             twopowmpos = 1lu << (lastBlockLength-1);
434             twopowwpos = 1lu << (W-1);
435         }
436
437         ~MyersEditDistance()
438         {
439             delete [] Pv; Pv = 0;
440             delete [] Mv; Mv = 0;
441             delete [] Score; Score = 0;
442             unsigned l;
443             for (l=0;l <= ((m-1) / W); l++)
444             {
445                 delete [] Peq[l];
446                 Peq[l] = 0;
447             }
448             delete [] Peq; Peq = 0;
449         }
450
451         std::pair<unsigned, unsigned> prefixDist(uchar const *text, unsigned n)
452         {
453             unsigned mindist = m+n;
454             unsigned minlen = m+n;
455             //            int count=0;
456             //int blockAverage=0;
457             ulong Eq, Xv, Xh, Ph, Mh;
458             ulong Phendbit;
459             ulong Mhendbit;
460             ulong temp;
461             ulong j;
462             ulong l;
463             ulong overk=1;
464             block = (k-1) / W;
465             for(j = 1; j <= n; j++) {
466                 Phendbit = 1lu; //0;
467                 Mhendbit = 0lu;
468                 for (l=0; l <= blockCount; l++) {
469                     Eq = Peq[l][(unsigned char)text[j-1]];
470                     Xv = Eq | Mv[l];
471                     Xh = ((((Eq | Mhendbit)& Pv[l]) + Pv[l]) ^ Pv[l]) | (Eq | Mhendbit);
472                     Ph = Mv[l] | ~ (Xh | Pv[l]);
473                     Mh = Pv[l] & Xh;
474                     temp = l < blockCount ? twopowwpos : twopowmpos;
475                     if (Ph & temp)
476                         Score[l] += 1;
477                     else if (Mh & temp)
478                         Score[l] -= 1;
479                     temp = (Ph >> (W-1));
480                     Ph <<= 1;
481                     Ph |= Phendbit;
482                     Phendbit = temp;
483                     temp = (Mh >> (W-1));
484                     Mh <<= 1;
485                     Mh |= Mhendbit;
486                     Mhendbit = temp;
487                     Pv[l] = (Mh) | ~ (Xv | Ph);
488                     Mv[l] = Ph & Xv;
489                     if (block == l+1 &&
490                         ((block == blockCount &&
491                           (Score[l] > k+lastBlockLength ||
492                            (Score[l] > k && overk != 0 && j - overk >= lastBlockLength))) ||
493                          (block != blockCount &&
494                           (Score[l] > k+W ||
495                            (Score[l] > k && overk != 0 && j - overk >= W)))))  {
496                         // lohkon Score kasvanut niin isoksi, ettei seuraavaa kannata laskea
497                         overk = 0;
498                         block = l;
499                         break;
500                     }
501                     else if (block == l+1 && overk == 0 && Score[l] > k)
502                         // Talletetaan ensimmäinen diagonaali jolla Score > k. Näin tiedetään
503                         // jatkossa koska seuraavan lohkon laskeminen on turhaa.
504                         overk = j;
505                     else if (block == l+1 && Score[l] <= k)
506                         // Diagonaalilla Score <= k ==> Nollataan muuttuja overk.
507                         overk = 0;
508                     else if (block == l && block != blockCount && Score[l] <= k) {
509                         // Score <= k ==> jatketaan seuraavaan lohkoon. Koska seuraavaa lohkoa
510                         // ei ole laskettu edellisellä sarakkeella, pitää sen arvot päivittää.
511                         Score[l+1] = k+1lu;
512                         Pv[l+1] = 0lu;
513                         Mv[l+1] = 0lu;
514                         overk = 0lu;
515                         block = l+1;
516                     }
517                     else if (block == l)
518                         // Seuraavaan lohkoon ei kannata siirtyä, kun Score > k.
519                         break;
520                 }
521 //                blockAverage += block+1;
522                 // if (block == blockCount && Score[blockCount] <= k) {
523                 if (block == blockCount && Score[blockCount] <= mindist)
524                 {
525                     mindist = Score[blockCount];
526                     minlen = j;
527                 
528                     //printf("Min %u at len %u\n", mindist, minlen);
529                 //    count++;
530                 }
531             }
532 //            return count;
533             return std::make_pair(mindist,minlen);
534             /*ShowMessage("Esiintymiä löytyi " + IntToStr(laskuri) + "kpl. " +
535               "Aikaa meni " + IntToStr(msec) + "msec");
536               ShowMessage("Keskimääräinen lohkojen määrä " +
537               FormatFloat("0.00",(float)blockAverage / n));*/
538         }
539
540 };
541
542
543 } // Namespace
544
545 #endif