1 #ifndef _SXSI_EditDistance_h_
2 #define _SXSI_EditDistance_h_
32 EditDistance(uchar const *pattern, unsigned len, unsigned maxk) {
37 dp = new unsigned*[patlen + 1];
38 for (unsigned i = 0; i <= patlen; i++) {
39 dp[i] = new unsigned[maxlen+1];
45 for (unsigned i = 0; i <= patlen; i++)
50 std::pair<unsigned, unsigned> prefixDist(uchar const *cand, unsigned candlen) {
51 unsigned mindist = patlen+candlen;
52 unsigned minlen = patlen+candlen;
53 unsigned *col = new unsigned[patlen+1];
54 unsigned *ncol = new unsigned[patlen+1];
55 for (unsigned i = 0; i <= patlen; i++)
57 for (unsigned j = 1; j <= candlen; j++) {
59 for (unsigned i = 1; i <= patlen; i++) {
60 unsigned match = (cand[j-1] == pat[i-1])? col[i - 1] : (col[i - 1] + 1);
61 ncol[i] = std::min(col[i] + 1, ncol[i - 1] + 1);
62 ncol[i] = std::min(ncol[i],match);
64 if (ncol[patlen]<mindist)
66 mindist = ncol[patlen];
69 for (unsigned i = 0; i <= patlen; i++)
74 return std::make_pair(mindist,minlen);
78 unsigned pushChar(uchar c) {
79 unsigned mindist = patlen;
82 std::cout << "out of bounds in DP computation, p = " << p << ", max = " << maxlen << std::endl;
84 for (unsigned i = 1; i <= patlen; i++) {
85 unsigned match = (c == pat[i-1])? dp[i - 1][p-1] : (dp[i - 1][p-1] + 1);
86 dp[i][p] = std::min(dp[i][p-1] + 1, dp[i - 1][p] + 1);
87 dp[i][p] = std::min(dp[i][p],match);
100 return dp[patlen][p];
103 unsigned getSuffixLength()
105 unsigned mindist = dp[patlen][p];
106 unsigned minlen = patlen;
107 for (unsigned i = 0; i < patlen; ++i)
108 if (dp[i][p] < mindist)
119 class MyersEditDistanceIncremental
123 MyersEditDistanceIncremental(const uchar *pattern, unsigned _length, unsigned _max_errors) :
124 length(_length), max_errors(_max_errors), min_row(~0u)
126 this->max_length = this->length + this->max_errors;
127 this->blocks = (this->length + W - 1) / W;
128 this->last_block_length = this->length % W;
129 if(this->last_block_length == 0) { this->last_block_length = W; }
130 this->last_block_mask = ~0lu >> (this->blocks * W - this->length);
132 this->initializeCharVectors(pattern);
133 this->initializePlusMinusVectors();
136 ~MyersEditDistanceIncremental()
138 for(unsigned c = 0; c < 256; c++)
140 delete[] this->char_vectors[c]; this->char_vectors[c] = 0;
142 delete[] this->char_vectors; this->char_vectors = 0;
144 for(unsigned i = 0; i <= this->max_length; i++)
146 delete[] this->plus_vectors[i]; this->plus_vectors[i] = 0;
147 delete[] this->minus_vectors[i]; this->minus_vectors[i] = 0;
149 delete[] this->plus_vectors; this->plus_vectors = 0;
150 delete[] this->minus_vectors; this->minus_vectors = 0;
152 delete[] this->score; this->score = 0;
155 unsigned pushChar(unsigned c)
159 std::cerr << "MyersEditDistance: Invalid character value " << c << "!" << std::endl;
162 if(this->pos >= this->max_length)
164 std::cerr << "MyersEditDistance: Cannot pushChar(c) past limit!" << std::endl;
165 std::cerr << "MyersEditDistance: Current text position " << this-> pos << ", maximum " << this->max_length << std::endl;
170 ulong c_vec, diagonal, h_plus = 0, h_minus = 0, plus, minus, temp;
171 ulong d_overflow = 0, hp_overflow = 1, hm_overflow = 0;
174 Update rules from http://www.cs.uta.fi/~helmu/pubs/phd.pdf (Figure 25).
176 for(unsigned i = 0; i < this->blocks; i++)
178 c_vec = this->char_vectors[c][i];
179 plus = this->plus_vectors[this->pos - 1][i];
180 minus = this->minus_vectors[this->pos - 1][i];
182 diagonal = c_vec & plus;
183 temp = diagonal + plus + d_overflow;
184 d_overflow = ((temp < diagonal) || (temp == diagonal && d_overflow == 1) ? 1 : 0);
185 diagonal = (temp ^ plus) | c_vec | minus;
187 h_plus = minus | ~(diagonal | plus);
188 h_minus = diagonal & plus;
190 this->plus_vectors[this->pos][i] =
191 (h_minus << 1) | hm_overflow | ~(diagonal | (h_plus << 1) | hp_overflow);
192 this->minus_vectors[this->pos][i] = diagonal & ((h_plus << 1) | hp_overflow);
194 hp_overflow = (h_plus & HIGH_BIT ? 1 : 0);
195 hm_overflow = (h_minus & HIGH_BIT ? 1 : 0);
197 this->plus_vectors[this->pos][this->blocks - 1] &= this->last_block_mask;
198 this->minus_vectors[this->pos][this->blocks - 1] &= this->last_block_mask;
200 this->score[this->pos] = this->score[this->pos - 1];
201 if(h_plus & (1lu << (this->length % W - 1)))
203 this->score[this->pos]++;
205 else if(h_minus & (1lu << (this->length % W - 1)))
207 this->score[this->pos]--;
210 // Find the minimal value in the current column.
211 int minimum = this->score[this->pos], cur = minimum;
212 min_row = this->length;
213 for(unsigned i = this->blocks; i > 0;)
216 plus = this->plus_vectors[this->pos][i];
217 minus = this->minus_vectors[this->pos][i];
219 for(unsigned j = W; j > 0;)
222 unsigned a = (plus >> j) & 0xFF, b = (minus >> j) & 0xFF;
223 if (minimum > cur + myers_optimum[256 * a + b])
225 minimum = cur + myers_optimum[256 * a + b];
226 min_row = i*W + j + myers_best_pos[256 * a + b];
228 cur += myers_total[256 * a + b];
236 return this->score[this->pos];
243 std::cerr << "MyersEditDistance: Cannot popChar() from text position 0!" << std::endl;
250 inline unsigned getSuffixLength()
257 std::pair<unsigned, unsigned> prefixDist(uchar const *text, unsigned n)
262 if (pos < max_length)
267 static void initMyersFourRussians()
269 // myers_total = new signed char[256 * 256];
270 // myers_optimum = new signed char[256 * 256];
271 // myers_best_pos = new uchar[256 * 256];
274 for(unsigned i = 0; i < 256; i++)
276 for(unsigned j = 0; j < 256; j++)
278 int cur = 0, best = 0;
279 for(unsigned k = CHAR_BIT; k > 0;)
282 if(i & (1 << k)) { cur--; }
283 if(j & (1 << k)) { cur++; }
291 myers_total[256 * i + j] = cur;
292 myers_optimum[256 * i + j] = best;
293 myers_best_pos[256 * i + j] = bp;
300 static signed char myers_total[];
301 static signed char myers_optimum[];
302 static uchar myers_best_pos[];
305 const static ulong HIGH_BIT = 1lu << (W - 1);
307 ulong **char_vectors;
312 unsigned blocks; // Number of blocks required for the pattern
313 unsigned last_block_length; // Number of bits in the final block
314 ulong last_block_mask;
316 ulong **plus_vectors;
317 ulong **minus_vectors;
320 unsigned pos; // We have matched pos characters of text
323 void initializeCharVectors(uchar const *pattern)
325 this->char_vectors = new ulong*[256];
326 for(unsigned c = 0; c < 256; c++)
328 this->char_vectors[c] = new ulong[this->blocks];
329 std::memset(this->char_vectors[c], 0, this->blocks * sizeof(ulong));
332 for(unsigned i = 0; i < this->length; i++)
334 this->char_vectors[pattern[i]][i / W] |= (1lu << (i % W));
338 void initializePlusMinusVectors()
340 this->plus_vectors = new ulong*[this->max_length + 1];
341 this->minus_vectors = new ulong*[this->max_length + 1];
342 this->score = new unsigned[this->max_length + 1];
344 for(unsigned i = 0; i <= this->max_length; i++)
346 this->plus_vectors[i] = new ulong[this->blocks];
347 this->minus_vectors[i] = new ulong[this->blocks];
355 for(unsigned i = 0; i < this->blocks; i++)
357 this->plus_vectors[0][i] = ~0lu;
358 this->minus_vectors[0][i] = 0;
360 this->plus_vectors[0][this->blocks - 1] = this->last_block_mask;
362 this->score[0] = this->length;
373 class MyersEditDistance
382 unsigned lastBlockLength;
389 void init(uchar const *P)
394 for(i = 0;i< 256;i++) {
396 for (l=0;l <= ((m-1) / W); l++)
399 for (l=0;l <= ((m-1) / W); l++)
400 for (j = 1; j <= W; j++) {
403 Peq[l][P[W*l+j-1]] = Peq[l][P[W*l+j-1]] | (1lu << (j-1));
405 S[(unsigned)P[W*l+j-1]] = true;
407 for (l=0; l < blockCount; l++) {
409 Score[l] = (l+1lu)*W;
413 Score[blockCount] = m;
414 Pv[blockCount] = (~0lu) >> ((blockCount+1lu)*W - m);
417 MyersEditDistance(unsigned _m, unsigned maxk)
418 : Peq(0), k(maxk), m(_m)
421 Peq = new ulong *[(m-1) / W + 1]; // (unsigned **)malloc((((m-1) / w)+1)*sizeof(unsigned*));
422 for (l=0;l <= ((m-1) / W); l++)
423 Peq[l] = new ulong[256]; //(unsigned *)malloc(256*sizeof(unsigned));
425 blockCount= ((m-1) / W);
427 lastBlockLength = m % W;
428 if (lastBlockLength == 0)
430 Pv = new ulong[blockCount+1]; // (unsigned *)malloc((blockCount+1)*sizeof(unsigned));
431 Mv = new ulong[blockCount+1]; // (unsigned *)malloc((blockCount+1)*sizeof(unsigned));
432 Score = new ulong[blockCount+1];//(unsigned *)malloc((blockCount+1)*sizeof(unsigned));
433 twopowmpos = 1lu << (lastBlockLength-1);
434 twopowwpos = 1lu << (W-1);
439 delete [] Pv; Pv = 0;
440 delete [] Mv; Mv = 0;
441 delete [] Score; Score = 0;
443 for (l=0;l <= ((m-1) / W); l++)
448 delete [] Peq; Peq = 0;
451 std::pair<unsigned, unsigned> prefixDist(uchar const *text, unsigned n)
453 unsigned mindist = m+n;
454 unsigned minlen = m+n;
456 //int blockAverage=0;
457 ulong Eq, Xv, Xh, Ph, Mh;
465 for(j = 1; j <= n; j++) {
468 for (l=0; l <= blockCount; l++) {
469 Eq = Peq[l][(unsigned char)text[j-1]];
471 Xh = ((((Eq | Mhendbit)& Pv[l]) + Pv[l]) ^ Pv[l]) | (Eq | Mhendbit);
472 Ph = Mv[l] | ~ (Xh | Pv[l]);
474 temp = l < blockCount ? twopowwpos : twopowmpos;
479 temp = (Ph >> (W-1));
483 temp = (Mh >> (W-1));
487 Pv[l] = (Mh) | ~ (Xv | Ph);
490 ((block == blockCount &&
491 (Score[l] > k+lastBlockLength ||
492 (Score[l] > k && overk != 0 && j - overk >= lastBlockLength))) ||
493 (block != blockCount &&
495 (Score[l] > k && overk != 0 && j - overk >= W))))) {
496 // lohkon Score kasvanut niin isoksi, ettei seuraavaa kannata laskea
501 else if (block == l+1 && overk == 0 && Score[l] > k)
502 // Talletetaan ensimmäinen diagonaali jolla Score > k. Näin tiedetään
503 // jatkossa koska seuraavan lohkon laskeminen on turhaa.
505 else if (block == l+1 && Score[l] <= k)
506 // Diagonaalilla Score <= k ==> Nollataan muuttuja overk.
508 else if (block == l && block != blockCount && Score[l] <= k) {
509 // Score <= k ==> jatketaan seuraavaan lohkoon. Koska seuraavaa lohkoa
510 // ei ole laskettu edellisellä sarakkeella, pitää sen arvot päivittää.
518 // Seuraavaan lohkoon ei kannata siirtyä, kun Score > k.
521 // blockAverage += block+1;
522 // if (block == blockCount && Score[blockCount] <= k) {
523 if (block == blockCount && Score[blockCount] <= mindist)
525 mindist = Score[blockCount];
528 //printf("Min %u at len %u\n", mindist, minlen);
533 return std::make_pair(mindist,minlen);
534 /*ShowMessage("Esiintymiä löytyi " + IntToStr(laskuri) + "kpl. " +
535 "Aikaa meni " + IntToStr(msec) + "msec");
536 ShowMessage("Keskimääräinen lohkojen määrä " +
537 FormatFloat("0.00",(float)blockAverage / n));*/