From 4ea6ddc07bb8223bb8f56008121b519fa6c75438 Mon Sep 17 00:00:00 2001 From: nvalimak Date: Mon, 1 Nov 2010 13:58:38 +0000 Subject: [PATCH] Added approximate pattern matching with Suffix Filters git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@929 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- EditDistance.cpp | 10 + EditDistance.h | 545 +++++++++++++++++++++++++++++++++++++++++ IndelQuery.h | 244 ++++++++++++++++++ Query.cpp | 83 +++++++ Query.h | 86 +++++++ RLCSAWrapper.h | 106 ++++++-- ResultSet.cpp | 227 +++++++++++++++++ ResultSet.h | 53 ++++ TextCollection.h | 30 +++ dependencies.mk | 37 +-- incbwt/dependencies.mk | 39 +-- makefile | 2 +- 12 files changed, 1408 insertions(+), 54 deletions(-) create mode 100644 EditDistance.cpp create mode 100644 EditDistance.h create mode 100644 IndelQuery.h create mode 100644 Query.cpp create mode 100644 Query.h create mode 100644 ResultSet.cpp create mode 100644 ResultSet.h diff --git a/EditDistance.cpp b/EditDistance.cpp new file mode 100644 index 0000000..23b7de6 --- /dev/null +++ b/EditDistance.cpp @@ -0,0 +1,10 @@ +#include "EditDistance.h" + +namespace SXSI +{ + +signed char MyersEditDistanceIncremental::myers_total[256 * 256]; +signed char MyersEditDistanceIncremental::myers_optimum[256 * 256]; +uchar MyersEditDistanceIncremental::myers_best_pos[256 * 256]; + +} diff --git a/EditDistance.h b/EditDistance.h new file mode 100644 index 0000000..ad0e868 --- /dev/null +++ b/EditDistance.h @@ -0,0 +1,545 @@ +#ifndef _SXSI_EditDistance_h_ +#define _SXSI_EditDistance_h_ + +/** + * TODO + * [ ] clean up code + */ + +#include +#include +#include "Tools.h" + +namespace SXSI +{ + +class EditDistance +{ +public: + uchar const *pat; + unsigned patlen; + unsigned maxlen; + unsigned **dp; + unsigned p; + + bool end() + { + if (p < maxlen) + return false; + return true; + } + + EditDistance(uchar const *pattern, unsigned len, unsigned maxk) { + pat = pattern; + patlen = len; + maxlen = len + maxk; + p = 0; + dp = new unsigned*[patlen + 1]; + for (unsigned i = 0; i <= patlen; i++) { + dp[i] = new unsigned[maxlen+1]; + dp[i][0] = i; + } + } + + ~EditDistance() { + for (unsigned i = 0; i <= patlen; i++) + delete [] dp[i]; + delete [] dp; + } + + std::pair prefixDist(uchar const *cand, unsigned candlen) { + unsigned mindist = patlen+candlen; + unsigned minlen = patlen+candlen; + unsigned *col = new unsigned[patlen+1]; + unsigned *ncol = new unsigned[patlen+1]; + for (unsigned i = 0; i <= patlen; i++) + col[i] = i; + for (unsigned j = 1; j <= candlen; j++) { + ncol[0] = j; + for (unsigned i = 1; i <= patlen; i++) { + unsigned match = (cand[j-1] == pat[i-1])? col[i - 1] : (col[i - 1] + 1); + ncol[i] = std::min(col[i] + 1, ncol[i - 1] + 1); + ncol[i] = std::min(ncol[i],match); + } + if (ncol[patlen]maxlen) + std::cout << "out of bounds in DP computation, p = " << p << ", max = " << maxlen << std::endl; + dp[0][p] = p; + for (unsigned i = 1; i <= patlen; i++) { + unsigned match = (c == pat[i-1])? dp[i - 1][p-1] : (dp[i - 1][p-1] + 1); + dp[i][p] = std::min(dp[i][p-1] + 1, dp[i - 1][p] + 1); + dp[i][p] = std::min(dp[i][p],match); + if (dp[i][p]max_length = this->length + this->max_errors; + this->blocks = (this->length + W - 1) / W; + this->last_block_length = this->length % W; + if(this->last_block_length == 0) { this->last_block_length = W; } + this->last_block_mask = ~0lu >> (this->blocks * W - this->length); + + this->initializeCharVectors(pattern); + this->initializePlusMinusVectors(); + } + + ~MyersEditDistanceIncremental() + { + for(unsigned c = 0; c < 256; c++) + { + delete[] this->char_vectors[c]; this->char_vectors[c] = 0; + } + delete[] this->char_vectors; this->char_vectors = 0; + + for(unsigned i = 0; i <= this->max_length; i++) + { + delete[] this->plus_vectors[i]; this->plus_vectors[i] = 0; + delete[] this->minus_vectors[i]; this->minus_vectors[i] = 0; + } + delete[] this->plus_vectors; this->plus_vectors = 0; + delete[] this->minus_vectors; this->minus_vectors = 0; + + delete[] this->score; this->score = 0; + } + + unsigned pushChar(unsigned c) + { + if(c >= 256) + { + std::cerr << "MyersEditDistance: Invalid character value " << c << "!" << std::endl; + return ~0u; + } + if(this->pos >= this->max_length) + { + std::cerr << "MyersEditDistance: Cannot pushChar(c) past limit!" << std::endl; + std::cerr << "MyersEditDistance: Current text position " << this-> pos << ", maximum " << this->max_length << std::endl; + return ~0u; + } + + this->pos++; + ulong c_vec, diagonal, h_plus = 0, h_minus = 0, plus, minus, temp; + ulong d_overflow = 0, hp_overflow = 1, hm_overflow = 0; + + /* + Update rules from http://www.cs.uta.fi/~helmu/pubs/phd.pdf (Figure 25). + */ + for(unsigned i = 0; i < this->blocks; i++) + { + c_vec = this->char_vectors[c][i]; + plus = this->plus_vectors[this->pos - 1][i]; + minus = this->minus_vectors[this->pos - 1][i]; + + diagonal = c_vec & plus; + temp = diagonal + plus + d_overflow; + d_overflow = ((temp < diagonal) || (temp == diagonal && d_overflow == 1) ? 1 : 0); + diagonal = (temp ^ plus) | c_vec | minus; + + h_plus = minus | ~(diagonal | plus); + h_minus = diagonal & plus; + + this->plus_vectors[this->pos][i] = + (h_minus << 1) | hm_overflow | ~(diagonal | (h_plus << 1) | hp_overflow); + this->minus_vectors[this->pos][i] = diagonal & ((h_plus << 1) | hp_overflow); + + hp_overflow = (h_plus & HIGH_BIT ? 1 : 0); + hm_overflow = (h_minus & HIGH_BIT ? 1 : 0); + } + this->plus_vectors[this->pos][this->blocks - 1] &= this->last_block_mask; + this->minus_vectors[this->pos][this->blocks - 1] &= this->last_block_mask; + + this->score[this->pos] = this->score[this->pos - 1]; + if(h_plus & (1lu << (this->length % W - 1))) + { + this->score[this->pos]++; + } + else if(h_minus & (1lu << (this->length % W - 1))) + { + this->score[this->pos]--; + } + + // Find the minimal value in the current column. + int minimum = this->score[this->pos], cur = minimum; + min_row = this->length; + for(unsigned i = this->blocks; i > 0;) + { + i--; + plus = this->plus_vectors[this->pos][i]; + minus = this->minus_vectors[this->pos][i]; + + for(unsigned j = W; j > 0;) + { + j -= CHAR_BIT; + unsigned a = (plus >> j) & 0xFF, b = (minus >> j) & 0xFF; + if (minimum > cur + myers_optimum[256 * a + b]) + { + minimum = cur + myers_optimum[256 * a + b]; + min_row = i*W + j + myers_best_pos[256 * a + b]; + } + cur += myers_total[256 * a + b]; + } + } + + return minimum; + } + + unsigned dist() { + return this->score[this->pos]; + } + + void popChar() + { + if(this->pos == 0) + { + std::cerr << "MyersEditDistance: Cannot popChar() from text position 0!" << std::endl; + return; + } + + this->pos--; + } + + inline unsigned getSuffixLength() + { + return min_row; + } + +/* + // FIXME implement + std::pair prefixDist(uchar const *text, unsigned n) +*/ + + bool end() + { + if (pos < max_length) + return false; + return true; + } + +static void initMyersFourRussians() +{ +// myers_total = new signed char[256 * 256]; +// myers_optimum = new signed char[256 * 256]; +// myers_best_pos = new uchar[256 * 256]; + + unsigned bp = 8; + for(unsigned i = 0; i < 256; i++) + { + for(unsigned j = 0; j < 256; j++) + { + int cur = 0, best = 0; + for(unsigned k = CHAR_BIT; k > 0;) + { + k--; + if(i & (1 << k)) { cur--; } + if(j & (1 << k)) { cur++; } + if(cur < best) + { + best = cur; + bp = k; + } + } + + myers_total[256 * i + j] = cur; + myers_optimum[256 * i + j] = best; + myers_best_pos[256 * i + j] = bp; + } + } +} + + unsigned length; +private: + static signed char myers_total[]; + static signed char myers_optimum[]; + static uchar myers_best_pos[]; + + + const static ulong HIGH_BIT = 1lu << (W - 1); + + ulong **char_vectors; + unsigned max_errors; + unsigned max_length; + unsigned min_row; + + unsigned blocks; // Number of blocks required for the pattern + unsigned last_block_length; // Number of bits in the final block + ulong last_block_mask; + + ulong **plus_vectors; + ulong **minus_vectors; + unsigned* score; + + unsigned pos; // We have matched pos characters of text + + + void initializeCharVectors(uchar const *pattern) + { + this->char_vectors = new ulong*[256]; + for(unsigned c = 0; c < 256; c++) + { + this->char_vectors[c] = new ulong[this->blocks]; + std::memset(this->char_vectors[c], 0, this->blocks * sizeof(ulong)); + } + + for(unsigned i = 0; i < this->length; i++) + { + this->char_vectors[pattern[i]][i / W] |= (1lu << (i % W)); + } + } + + void initializePlusMinusVectors() + { + this->plus_vectors = new ulong*[this->max_length + 1]; + this->minus_vectors = new ulong*[this->max_length + 1]; + this->score = new unsigned[this->max_length + 1]; + + for(unsigned i = 0; i <= this->max_length; i++) + { + this->plus_vectors[i] = new ulong[this->blocks]; + this->minus_vectors[i] = new ulong[this->blocks]; + } + + this->initialStep(); + } + + void initialStep() + { + for(unsigned i = 0; i < this->blocks; i++) + { + this->plus_vectors[0][i] = ~0lu; + this->minus_vectors[0][i] = 0; + } + this->plus_vectors[0][this->blocks - 1] = this->last_block_mask; + + this->score[0] = this->length; + this->pos = 0; + } + + + +}; + + + + + class MyersEditDistance + { + public: + ulong **Peq; + bool S[256]; + unsigned k; + unsigned m; + unsigned blockCount; + unsigned block; + unsigned lastBlockLength; + ulong *Pv; + ulong *Mv; + ulong *Score; + ulong twopowmpos; + ulong twopowwpos; + + void init(uchar const *P) + { + ulong i; + ulong j; + ulong l; + for(i = 0;i< 256;i++) { + S[i] = false; + for (l=0;l <= ((m-1) / W); l++) + Peq[l][i] = 0; + } + for (l=0;l <= ((m-1) / W); l++) + for (j = 1; j <= W; j++) { + if (W*l+j > m) + break; + Peq[l][P[W*l+j-1]] = Peq[l][P[W*l+j-1]] | (1lu << (j-1)); + if (j+W*l <= k+1) + S[(unsigned)P[W*l+j-1]] = true; + } + for (l=0; l < blockCount; l++) { + Mv[l] = 0lu; + Score[l] = (l+1lu)*W; + Pv[l] = ~0lu; + } + Mv[blockCount] = 0; + Score[blockCount] = m; + Pv[blockCount] = (~0lu) >> ((blockCount+1lu)*W - m); + } + + MyersEditDistance(unsigned _m, unsigned maxk) + : Peq(0), k(maxk), m(_m) + { + ulong l; + Peq = new ulong *[(m-1) / W + 1]; // (unsigned **)malloc((((m-1) / w)+1)*sizeof(unsigned*)); + for (l=0;l <= ((m-1) / W); l++) + Peq[l] = new ulong[256]; //(unsigned *)malloc(256*sizeof(unsigned)); + + blockCount= ((m-1) / W); + block = (k-1) / W; + lastBlockLength = m % W; + if (lastBlockLength == 0) + lastBlockLength = W; + Pv = new ulong[blockCount+1]; // (unsigned *)malloc((blockCount+1)*sizeof(unsigned)); + Mv = new ulong[blockCount+1]; // (unsigned *)malloc((blockCount+1)*sizeof(unsigned)); + Score = new ulong[blockCount+1];//(unsigned *)malloc((blockCount+1)*sizeof(unsigned)); + twopowmpos = 1lu << (lastBlockLength-1); + twopowwpos = 1lu << (W-1); + } + + ~MyersEditDistance() + { + delete [] Pv; Pv = 0; + delete [] Mv; Mv = 0; + delete [] Score; Score = 0; + unsigned l; + for (l=0;l <= ((m-1) / W); l++) + { + delete [] Peq[l]; + Peq[l] = 0; + } + delete [] Peq; Peq = 0; + } + + std::pair prefixDist(uchar const *text, unsigned n) + { + unsigned mindist = m+n; + unsigned minlen = m+n; + // int count=0; + //int blockAverage=0; + ulong Eq, Xv, Xh, Ph, Mh; + ulong Phendbit; + ulong Mhendbit; + ulong temp; + ulong j; + ulong l; + ulong overk=1; + block = (k-1) / W; + for(j = 1; j <= n; j++) { + Phendbit = 1lu; //0; + Mhendbit = 0lu; + for (l=0; l <= blockCount; l++) { + Eq = Peq[l][(unsigned char)text[j-1]]; + Xv = Eq | Mv[l]; + Xh = ((((Eq | Mhendbit)& Pv[l]) + Pv[l]) ^ Pv[l]) | (Eq | Mhendbit); + Ph = Mv[l] | ~ (Xh | Pv[l]); + Mh = Pv[l] & Xh; + temp = l < blockCount ? twopowwpos : twopowmpos; + if (Ph & temp) + Score[l] += 1; + else if (Mh & temp) + Score[l] -= 1; + temp = (Ph >> (W-1)); + Ph <<= 1; + Ph |= Phendbit; + Phendbit = temp; + temp = (Mh >> (W-1)); + Mh <<= 1; + Mh |= Mhendbit; + Mhendbit = temp; + Pv[l] = (Mh) | ~ (Xv | Ph); + Mv[l] = Ph & Xv; + if (block == l+1 && + ((block == blockCount && + (Score[l] > k+lastBlockLength || + (Score[l] > k && overk != 0 && j - overk >= lastBlockLength))) || + (block != blockCount && + (Score[l] > k+W || + (Score[l] > k && overk != 0 && j - overk >= W))))) { + // lohkon Score kasvanut niin isoksi, ettei seuraavaa kannata laskea + overk = 0; + block = l; + break; + } + else if (block == l+1 && overk == 0 && Score[l] > k) + // Talletetaan ensimmäinen diagonaali jolla Score > k. Näin tiedetään + // jatkossa koska seuraavan lohkon laskeminen on turhaa. + overk = j; + else if (block == l+1 && Score[l] <= k) + // Diagonaalilla Score <= k ==> Nollataan muuttuja overk. + overk = 0; + else if (block == l && block != blockCount && Score[l] <= k) { + // Score <= k ==> jatketaan seuraavaan lohkoon. Koska seuraavaa lohkoa + // ei ole laskettu edellisellä sarakkeella, pitää sen arvot päivittää. + Score[l+1] = k+1lu; + Pv[l+1] = 0lu; + Mv[l+1] = 0lu; + overk = 0lu; + block = l+1; + } + else if (block == l) + // Seuraavaan lohkoon ei kannata siirtyä, kun Score > k. + break; + } +// blockAverage += block+1; + // if (block == blockCount && Score[blockCount] <= k) { + if (block == blockCount && Score[blockCount] <= mindist) + { + mindist = Score[blockCount]; + minlen = j; + + //printf("Min %u at len %u\n", mindist, minlen); + // count++; + } + } +// return count; + return std::make_pair(mindist,minlen); + /*ShowMessage("Esiintymiä löytyi " + IntToStr(laskuri) + "kpl. " + + "Aikaa meni " + IntToStr(msec) + "msec"); + ShowMessage("Keskimääräinen lohkojen määrä " + + FormatFloat("0.00",(float)blockAverage / n));*/ + } + +}; + + +} // Namespace + +#endif diff --git a/IndelQuery.h b/IndelQuery.h new file mode 100644 index 0000000..169ab81 --- /dev/null +++ b/IndelQuery.h @@ -0,0 +1,244 @@ +/** + * k-errors queries + * + * Aligment using suffix filters. + */ + +#ifndef _IndelQuery_H_ +#define _IndelQuery_H_ + +#include "ResultSet.h" +#include "EditDistance.h" +#include "Query.h" + +#include +#include +#include + +namespace SXSI +{ + +class IndelQuery : public Query +{ +public: + IndelQuery(TextCollection const *tc_) + : Query(tc_), checked(0), splitPos(0), ed(0) + { + checked = new ResultSet(tc->getLength()); + } + + virtual ~IndelQuery() + { + delete checked; checked = 0; + delete [] splitPos; splitPos = 0; + delete ed; ed = 0; + } + +protected: + void firstStep(void) + { + checked->clear(); + constructPartition(); + + uchar *reversepat = new uchar[patlen+1]; + for (unsigned i = klimit+1; i > 1; --i) + { + if (debug) std::cout << "new try: " << splitPos[i] << ", split = " << i << std::endl; + + for (unsigned j = 0; j < splitPos[i-1]; j++) + reversepat[j] = pat[splitPos[i-1]-j-1]; + reversepat[splitPos[i-1]] = 0; + ed = new MyersEditDistanceIncremental(reversepat, splitPos[i-1], i-1); + + exactMatch(i); + + delete ed; + ed = 0; + } + delete [] reversepat; + reversepat = 0; + + if (debug) std::cout << "new try: " << splitPos[1] << ", split = 1" << std::endl; + ed = 0; // Not used, exact search only for the first block + exactMatch(1); + } + +private: + + void exactMatch(const unsigned split) + { + unsigned i = splitPos[split]; + const unsigned border = splitPos[split-1]; + if (debug) std::cerr << "Exact match: i = " << i << ", border = " << border << std::endl; + + while (i != border && pushChar(pat[i-1])) + --i; + + if (debug) std::cerr << "matched upto: i = " << i << std::endl; + if (i == 0) + checkMatch(); + else + if (i == border) + nextStep(split); + + // Rewind stack + match.clear(); + while (!smin.empty()) smin.pop(); + while (!smax.empty()) smax.pop(); + smin.push(0); + smax.push(textlen-1); + } + + void nextStep(const unsigned split) + { + // Maximum edit distance for the current filter is split-1 + if (ed->dist() <= split-1) checkMatch(); + + if (ed->end()) return; + + // ALPHABET has been defined in Query + for (const char *cp = ALPHABET; cp < ALPHABET + ALPHABET_SIZE; ++cp) { + char c = *cp; + if (!pushChar(c)) continue; + const unsigned colmin = ed->pushChar(c); + + // Edit distance is for reversed pattern, compute prefix lenght: + const unsigned prefixl = ed->length - ed->getSuffixLength(); + // How many errors are we allowing on a prefix of given length? + unsigned i = 1; + while (splitPos[i] < prefixl) + ++i; + + assert(i < klimit+1); + assert(split-1 > i-1); + const unsigned maxk = (split-1) - (i-1); + + if (debug) + std::cerr << "c = " << c << ", split = " << split << ", maxk = " << maxk << ", colmin = " << colmin + << ", prefixl = " << prefixl << ", dist = " << ed->dist() << std::endl; + + if (colmin <= maxk) nextStep(split); + + //if (debug) std::cerr << "pop c = " << c << std::endl; + ed->popChar(); + popChar(); + } + } + + inline void checkMatch(void) + { +// if (debug) std::cerr << "min,max: " << smin.top() << ", " << smax.top() << std::endl; + ulong min = smin.top(); + ulong max = smax.top(); + MyersEditDistance med(patlen, klimit); + for (ulong i = min; i <= max; i++) { + if (!checked->get(i)) + { + checked->set(i); + med.init((uchar const *)pat); + validateCandidate(med, i); + } + } + } + + void validateCandidate(MyersEditDistance &med, ulong sapos) + { + uchar *candidate = tc->getSuffix(sapos, patlen+klimit); + unsigned l = strlen((char*)candidate); + if (l < patlen) + { + delete [] candidate; + return; + } + + std::pair dist(0, patlen); + if (klimit == 0) + { // MyersEditDistance class does not support klimit == 0 + unsigned i = 0; + while (i < patlen && pat[i] == candidate[i]) + ++i; + if (i < patlen) + { + delete [] candidate; + return; // There is at least one mismatch (and klimit == 0) + } + } + else // Using prefixDist() when klimit > 0 + dist = med.prefixDist(candidate, l); + + if (dist.first <= klimit) + newMatch(sapos); + delete [] candidate; + } + + + inline void newMatch(ulong sapos) + { + TextCollection::DocId di = tc->getDoc(sapos); + result.insert(di); + } + + + void constructPartition(void) + { + delete [] splitPos; + splitPos = new unsigned[klimit+2]; + + if (klimit == 0) + { + splitPos[0] = 0; + splitPos[1]=patlen; + return; + } + + // First piece size + unsigned psize = patlen/(klimit+2); + if (psize == 0) + { + std::cerr << "error: First psize == 0, for patlen = " << patlen << ", klimit = " << klimit << std::endl; + std::exit(1); + } + splitPos[0] = 0; + splitPos[1] = 2*psize; + + // Last klimit are of size: + psize = (patlen-splitPos[1])/klimit; + if (psize == 0) + { + std::cerr << "error: psize == 0, for patlen = " << patlen << ", klimit = " << klimit << std::endl; + std::exit(1); + } + + for (unsigned i = 2; i <= klimit; i++) + splitPos[i]=splitPos[i-1]+psize; + splitPos[klimit+1]=patlen; + + if (splitPos[klimit] >= patlen) + { + std::cerr << "error: splitPos[" << klimit << "] == " << splitPos[klimit] << " >= patlen, for patlen = " << patlen << ", klimit = " << klimit << std::endl; + std::exit(1); + } + + if (debug) + { + std::cerr << "splitpos: "; + for (unsigned i= 0;i<=klimit+1; i++) + std::cerr << splitPos[i] << ", "; + std::cerr << std::endl; + } + } + + + ResultSet *checked; + unsigned *splitPos; + MyersEditDistanceIncremental *ed; + + IndelQuery(); + // No copy constructor or assignment + IndelQuery(IndelQuery const&); + IndelQuery& operator = (IndelQuery const&); +}; + +} // namespace + +#endif // _IndelQuery_H_ diff --git a/Query.cpp b/Query.cpp new file mode 100644 index 0000000..29b0bb1 --- /dev/null +++ b/Query.cpp @@ -0,0 +1,83 @@ +#include "Query.h" +#include + +namespace SXSI +{ + +const char Query::ALPHABET_DNA[] = {'A', 'C', 'G', 'T', 'N'}; + + +Query::Query(TextCollection const *tc_) + : pat(0), patlen(0), tc(tc_), debug(false), klimit(0), + textlen(tc->getLength()), ALPHABET(ALPHABET_DNA) +{ } + + +/*void reverse() +{ + uchar c; + for (int i = 0; i < patlen / 2; ++i) { + c = pat[i]; + pat[i] = pat[patlen - i - 1]; + pat[patlen - i - 1] = c; + } +} + +void revcmp() +{ + reverse(); + for (int i = 0; i < patlen; ++i) + switch (pat[i]) + { + case('A'): pat[i] = 'T'; break; + case('C'): pat[i] = 'G'; break; + case('G'): pat[i] = 'C'; break; + case('T'): pat[i] = 'A'; break; + case('a'): pat[i] = 't'; break; + case('c'): pat[i] = 'g'; break; + case('g'): pat[i] = 'c'; break; + case('a'): pat[i] = 't'; break; + case('N'): pat[i] = 'N'; break; + case('n'): pat[i] = 'n'; break; + default: + std::cerr << "Query::align(): invalid alphabet ("<< pat[i] << ") given!" << std::endl; + std::exit(1); + break; + } + }*/ + +TextCollection::document_result Query::align(uchar const *p, unsigned k) +{ + result.clear(); + this->pat = p; + this->patlen = std::strlen((char const *)p); + this->klimit = k; + + while (!smin.empty()) smin.pop(); + while (!smax.empty()) smax.pop(); + smin.push(0); + smax.push(textlen-1); + match.clear(); + + firstStep(); + + // Search reverse complemented? +/* { + revcmp(); + this->pat = p->c_str(); + while (!smin.empty()) smin.pop(); + while (!smax.empty()) smax.pop(); + smin.push(0); + smax.push(textlen-1); + match.clear(); + + firstStep(); + + revcmp(); // Restore orig string + }*/ + + TextCollection::document_result dr(result.begin(), result.end()); + return dr; +} + +} // namespace diff --git a/Query.h b/Query.h new file mode 100644 index 0000000..bf77802 --- /dev/null +++ b/Query.h @@ -0,0 +1,86 @@ +/** + * Interface for alignment queries + */ + +#ifndef _Query_H_ +#define _Query_H_ + +#include "TextCollection.h" + +#include +#include +#include + +namespace SXSI +{ + +/** + * Base class for queries + */ +class Query +{ +public: + TextCollection::document_result align(uchar const *p, unsigned k); + + void setDebug(bool b) + { this->debug = b; } + + virtual ~Query() + { } + +protected: + virtual void firstStep() = 0; + + inline bool pushChar(char c) { + ulong nmin = tc->LF(c, smin.top()-1); + ulong nmax = tc->LF(c, smax.top())-1; + if (nmin > nmax) return false; + smin.push(nmin); + smax.push(nmax); + match.push_back(c); + return true; + } + + inline void popChar() { + smin.pop(); + smax.pop(); + match.pop_back(); + } + + inline int min(int a, int b) { + if (a < b) return a; else return b; + } + + inline int min(int a, int b, int c) { + if (a < b) if (a < c) return a; else return c; + else if (b < c) return b; else return c; + } + + Query(TextCollection const *tc_); + + uchar const *pat; + unsigned patlen; + TextCollection const *tc; + bool debug; + unsigned klimit; + ulong textlen; + + const char *ALPHABET; + static const unsigned ALPHABET_SIZE = 5; + static const char ALPHABET_DNA[]; + + std::stack smin; + std::stack smax; + std::vector match; + std::set result; + +private: + Query(); + // No copy constructor or assignment + Query(Query const&); + Query& operator = (Query const&); +}; + +} // Namespace + +#endif // _Query_H_ diff --git a/RLCSAWrapper.h b/RLCSAWrapper.h index 4e22fe8..8fd1d68 100644 --- a/RLCSAWrapper.h +++ b/RLCSAWrapper.h @@ -23,6 +23,7 @@ #define _RLCSAWrapper_H_ #include "TextCollection.h" +#include "IndelQuery.h" #include "incbwt/rlcsa.h" @@ -34,6 +35,7 @@ # define W 32 #endif +#include #include #include @@ -51,7 +53,10 @@ class RLCSAWrapper : public SXSI::TextCollection public: RLCSAWrapper(const CSA::RLCSA* index) : rlcsa(index) - { /* NOP */ } + { + // Init the edit distance look-up tables + MyersEditDistanceIncremental::initMyersFourRussians(); + } ~RLCSAWrapper() { @@ -131,9 +136,43 @@ public: document_result Prefix(uchar const *) const { unsupported(); return document_result(); }; document_result Suffix(uchar const *) const { unsupported(); return document_result(); }; document_result Equal(uchar const *) const { unsupported(); return document_result(); }; + document_result Contains(uchar const *pattern) const - { - //std::pair p = backwards(pattern); + { + /************************************************************************* + * Approximate pattern matching + * + * Using suffix filters. Has to enumerate *all* approx. occurrences (sloooow...) + * instead of just returning the best occurrence (which is usually much faster). + * + * Query format: contains("APM 3 GATTACA") + * where + * "APM" is the keyword for approximate queries. + * "3" is the maximum edit distance allowed. + * "GATTACA" is the query word to be aligned. + */ + if (strncmp((char const *)pattern, "APM ", 4) == 0) + { + // Edit distance allowed. + int k = std::atoi((char const *)pattern + 4); + if (k < 0 || k == INT_MAX || k == INT_MIN) + goto exact_pattern_matching; // Invalid format + + // Find the start of the pattern (i.e. the second ' ') + uchar const * tmp = pattern + 4; + while (*tmp != ' ' && *tmp != 0) ++tmp; + if (*tmp != ' ' || tmp == pattern + 4) + goto exact_pattern_matching; // Invalid format + + IndelQuery iq(this); + //std::cerr << "Pattern: " << tmp+1 << ", k = " << k << std::endl; + return iq.align(tmp+1, k); + } + + /************************************************************************* + * Exact pattern matching + */ + exact_pattern_matching: CSA::pair_type p = rlcsa->count(std::string((char const *)pattern)); if (p.first > p.second) return document_result(); @@ -175,6 +214,45 @@ public: this->rlcsa->writeTo(std::string(filename)); } + TextCollection::TextPosition getLength() const + { + return this->rlcsa->getSize() + this->rlcsa->getNumberOfSequences(); + } + + inline TextCollection::TextPosition LF(uchar c, TextPosition i) const + { + ++i; + if(i < this->rlcsa->getNumberOfSequences()) + return rlcsa->C(c); + return rlcsa->LF(i - this->rlcsa->getNumberOfSequences(), c); + } + + uchar* getSuffix(TextPosition pos, unsigned l) const + { + ++pos; + uchar* text = new uchar[l + 1]; + + if(l == 0 || pos < this->rlcsa->getNumberOfSequences()) + { + text[0] = 0; + return text; + } + pos -= this->rlcsa->getNumberOfSequences(); + + unsigned n = rlcsa->displayFromPosition(pos, l, text); + text[n] = 0; + return text; + } + + DocId getDoc(TextPosition i) const + { + if(i < this->rlcsa->getNumberOfSequences()) + return i; + + CSA::pair_type pt = rlcsa->getRelativePosition(this->rlcsa->locate(i - this->rlcsa->getNumberOfSequences())); + return pt.first; + } + private: const CSA::RLCSA* rlcsa; @@ -185,32 +263,20 @@ private: std::cerr << "\nFirst symb = " << (char)c << std::endl; - TextPosition sp = rlcsa->C(c); - TextPosition ep = rlcsa->C(c+1)-1; + TextPosition sp = LF(c, -1); + TextPosition ep = LF(c, getLength()-1) - 1; printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); while (sp<=ep && i>=1) { c = (int)pattern[--i]; - sp = LF(c, sp); - ep = LF(c, ep+1)-1; + sp = LF(c, sp-1); + ep = LF(c, ep)-1; printf("new: c = %c, sp = %lu, ep = %lu\n", pattern[i], sp, ep); } - uchar* found = rlcsa->display(sp, std::strlen((char const *)pattern), 5, i); - std::cerr << "found: " << found << " (" << i << std::endl; - return std::make_pair(sp, ep); }*/ - - inline TextCollection::TextPosition - LF(uchar c, TextPosition i) const - { - if(i == (TextPosition)-1 || i < this->rlcsa->getNumberOfSequences()) - { return this->rlcsa->C(c); } - return this->rlcsa->LF(i - this->rlcsa->getNumberOfSequences(), c); - } - - + void unsupported() const { std::cerr << std::endl << "-------------------------------------------------------------\n" diff --git a/ResultSet.cpp b/ResultSet.cpp new file mode 100644 index 0000000..208a9d5 --- /dev/null +++ b/ResultSet.cpp @@ -0,0 +1,227 @@ + +#include +#include "ResultSet.h" + +namespace SXSI +{ + +#define W (8*sizeof(ulong)) + +#if __WORDSIZE == 64 +#define logW 6lu // 64bit system, code checks = lg(W)-1 +#else +#define logW 5lu // 32bit system, code checks = lg(W)-1 +#endif + +#define divW(p) ((p)>>logW) +#define modW(p) ((p)&(W-1)) + +#define setBit(arr,pos) ((arr)[divW(pos)] |= 1lu<>=1lu; answ++; } + return answ; +} + +ResultSet::ResultSet(ulong n) +{ + if (logW != lg(W)-1) + { + std::cerr << "Error, redefine logW as " << lg(W)-1 << " and recompile\n"; +// exit(1); + } + this->n = 2*n-1; + this->lgn = lg(n); + this->tree = new ulong[(this->n+W-1)/W]; + if (!this->tree) + { + std::cerr << "Malloc failed at ResultSet class." << std::endl; + // exit(1); + } + clearBit(this->tree,0); // clear all +} + +ResultSet::~ResultSet() +{ + delete [] tree; +} + +// to map 1..n to the bottom of the tree when n is not a power of 2 +ulong ResultSet::conv (ulong p, ulong n, ulong lgn) + + { ulong t = n+1-(1lu<>1lu; + } + +bool ResultSet::get (ulong p) // returns 0 or 1 + + { + ulong pos = 0; + ulong pot = this->lgn; + p = conv(p,n,pot); + do { if (!getBit(this->tree,pos)) return false; + pot--; + pos = (pos<<1lu)+1+(p>>pot); + p &= ~(1lu<lgn; + p = conv(p,n,pot); + do { npos = (pos<<1lu)+1; + if (!getBit(this->tree,pos)) + { setBit(this->tree,pos); + if (npos < n) + { clearBit(this->tree,npos); + clearBit(this->tree,npos+1); + } + } + pot--; + pos = npos+(p>>pot); + p &= ~(1lu<>pot) == 0) // go left, clear right + { clearBit(tree,npos+1); + bit = clearRangeLeft(p1,n,npos,pot); + } + else // go right + { bit = clearRangeLeft(p1,n,npos+1,pot); + if (!bit) bit = getBit(tree,npos); + } + if (!bit) clearBit(tree,pos); + return bit; + } + +ulong ResultSet::clearRangeRight (ulong p2, ulong n, ulong pos, ulong pot) + + { ulong npos; + ulong bit; + if (!getBit(tree,pos)) return 0; // range already zeroed + p2 &= ~(1lu<>pot) == 1) // go right, clear left + { clearBit(tree,npos); + bit = clearRangeRight(p2,n,npos+1,pot); + } + else // go left + { bit = clearRangeRight(p2,n,npos,pot); + if (!bit) bit = getBit(tree,npos+1); + } + if (!bit) clearBit(tree,pos); + return bit; + } + +ulong ResultSet::clearBoth (ulong n, ulong p1, ulong p2, ulong pos, ulong pot) + + { ulong npos,npos1,npos2; + ulong bit; + if (!getBit(tree,pos)) return 0; // range is already zeroed + npos = (pos<<1lu)+1; + // children must exist while the path is unique, as p1>pot); + npos2 = npos+(p2>>pot); + if (npos1 == npos2) // we're inside npos1=npos2 + { bit = clearBoth (n,p1&~(1lu<>pot)); // the other + } + else // p1 and p2 take different paths here + { bit = clearRangeLeft(p1,n,npos1,pot); + bit |= clearRangeRight(p2,n,npos2,pot); + } + if (!bit) clearBit(tree,pos); + return bit; + } + +void ResultSet::clearRange (ulong p1, ulong p2) + + { if ((p2+1)<<1lu > this->n) + clearRangeLeft(conv(p1,this->n,this->lgn),this->n,0,this->lgn); + else clearBoth(n,conv(p1,n,this->lgn),conv(p2+1,n,lgn),0,lgn); + } + +ulong ResultSet::nextSmallest (ulong n, ulong pos, ulong pot) + + { ulong p = 0; + while (1) + { pot--; + pos = (pos<<1lu)+1; + if (pos >= n) return p; + if (!getBit(tree,pos)) { pos++; p |= (1lu<= n) return 0; // when n is not a power of 2, missing leaves + if ((p>>pot) == 0) // p goes left + { answ = nextLarger(n,p&~(1lu<tree,0); +} + +ulong ResultSet::nextResult (ulong p) // returns pos of next(p) or -1 if none + + { ulong answ; + if (((p+1)<<1lu) > this->n) return ~0lu; // next(last), p+1 out of bounds + answ = nextLarger(this->n,conv(p+1,this->n,this->lgn),0,this->lgn); + if (answ == ~0lu) return ~0lu; + return unconv(answ,this->n,this->lgn); + } + +} // Namespace diff --git a/ResultSet.h b/ResultSet.h new file mode 100644 index 0000000..ba357af --- /dev/null +++ b/ResultSet.h @@ -0,0 +1,53 @@ +// Dynamic result set class +// Originally by Gonzalo Navarro + +#ifndef _RESULTSET_H_ +#define _RESULTSET_H_ + +namespace SXSI +{ + +#ifndef ulong +typedef unsigned long ulong; +#endif + +class ResultSet { +private: + ulong n, lgn; + ulong *tree; + ulong lg(ulong); + + ulong conv (ulong p, ulong n, ulong lgn); + ulong unconv (ulong p, ulong n, ulong lgn); + ulong clearRangeLeft (ulong p1, ulong n, ulong pos, ulong pot); + ulong clearRangeRight (ulong p2, ulong n, ulong pos, ulong pot); + ulong clearBoth (ulong n, ulong p1, ulong p2, ulong pos, ulong pot); + + ulong nextSmallest (ulong n, ulong pos, ulong pot); + ulong nextLarger (ulong n, ulong p, ulong pos, ulong pot); + + void prnspace (ulong k); +public: + // creates empty results data structure for n nodes numbered 0..n-1 + ResultSet(ulong n); + ~ResultSet(); + + // returns 0/1 telling whether result p is not/is present in R + bool get(ulong p); + + // inserts result p into R + void set(ulong p); + + // clears all results p1..p2 in R + void clearRange (ulong p1, ulong p2); + // clears all results + void clear(); + + // returns pos of next(p) in R, or -1 if none + ulong nextResult (ulong p); + +}; + +} + +#endif // _RESULTSET_H_ diff --git a/TextCollection.h b/TextCollection.h index 9c6591f..98272c1 100644 --- a/TextCollection.h +++ b/TextCollection.h @@ -201,6 +201,36 @@ namespace SXSI virtual full_result FullKMismatches(uchar const *, unsigned) const = 0; virtual full_result FullKErrors(uchar const *, unsigned) const = 0; + + virtual TextPosition getLength() const + { + std::cerr << "TextCollection::getLength() is unsupported! Use RLCSA instead." << std::endl; + std::exit(2); + return 0; + } + + virtual TextPosition LF(uchar c, TextPosition i) const + { + std::cerr << "TextCollection::LF() is unsupported! Use RLCSA instead." << std::endl; + std::exit(2); + return 0; + } + + virtual uchar* getSuffix(TextPosition pos, unsigned l) const + { + std::cerr << "TextCollection::getSuffix() is unsupported! Use RLCSA instead." << std::endl; + std::exit(2); + return 0; + } + + virtual DocId getDoc(TextPosition i) const + { + std::cerr << "TextCollection::getDoc() is unsupported! Use RLCSA instead." << std::endl; + std::exit(2); + return 0; + } + + protected: // Protected constructor; use TextCollectionBuilder TextCollection() { }; diff --git a/dependencies.mk b/dependencies.mk index e8f8dbc..494b444 100644 --- a/dependencies.mk +++ b/dependencies.mk @@ -1,37 +1,40 @@ BitRank.o: BitRank.cpp BitRank.h BlockArray.h Tools.h +EditDistance.o: EditDistance.cpp EditDistance.h Tools.h FMIndex.o: FMIndex.cpp FMIndex.h incbwt/bits/deltavector.h \ incbwt/bits/bitvector.h incbwt/bits/../misc/definitions.h \ incbwt/bits/bitbuffer.h BitRank.h BlockArray.h Tools.h TextCollection.h \ TextStorage.h ArrayDoc.h FMIndexBuilder.o: FMIndexBuilder.cpp incbwt/rlcsa_builder.h \ - incbwt/rlcsa.h incbwt/bits/vectors.h incbwt/bits/deltavector.h \ - incbwt/bits/bitvector.h incbwt/bits/../misc/definitions.h \ - incbwt/bits/bitbuffer.h incbwt/bits/rlevector.h incbwt/sasamples.h \ - incbwt/misc/definitions.h incbwt/bits/bitbuffer.h \ - incbwt/bits/deltavector.h incbwt/lcpsamples.h incbwt/bits/array.h \ - incbwt/misc/utils.h incbwt/misc/definitions.h incbwt/misc/parameters.h \ - incbwt/bits/deltavector.h FMIndexBuilder.h TextCollectionBuilder.h \ - TextCollection.h Tools.h TextStorage.h FMIndex.h BitRank.h BlockArray.h \ - ArrayDoc.h + incbwt/rlcsa.h incbwt/bits/deltavector.h incbwt/bits/bitvector.h \ + incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h \ + incbwt/bits/rlevector.h incbwt/bits/nibblevector.h incbwt/sasamples.h \ + incbwt/misc/definitions.h incbwt/bits/bitbuffer.h incbwt/lcpsamples.h \ + incbwt/bits/array.h incbwt/misc/utils.h incbwt/misc/definitions.h \ + incbwt/misc/parameters.h incbwt/bits/deltavector.h FMIndexBuilder.h \ + TextCollectionBuilder.h TextCollection.h Tools.h TextStorage.h FMIndex.h \ + BitRank.h BlockArray.h ArrayDoc.h HeapProfiler.o: HeapProfiler.cpp HeapProfiler.h +Query.o: Query.cpp Query.h TextCollection.h Tools.h RLCSABuilder.o: RLCSABuilder.cpp incbwt/rlcsa_builder.h incbwt/rlcsa.h \ - incbwt/bits/vectors.h incbwt/bits/deltavector.h incbwt/bits/bitvector.h \ + incbwt/bits/deltavector.h incbwt/bits/bitvector.h \ incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h \ - incbwt/bits/rlevector.h incbwt/sasamples.h incbwt/misc/definitions.h \ - incbwt/bits/bitbuffer.h incbwt/bits/deltavector.h incbwt/lcpsamples.h \ + incbwt/bits/rlevector.h incbwt/bits/nibblevector.h incbwt/sasamples.h \ + incbwt/misc/definitions.h incbwt/bits/bitbuffer.h incbwt/lcpsamples.h \ incbwt/bits/array.h incbwt/misc/utils.h incbwt/misc/definitions.h \ incbwt/misc/parameters.h RLCSABuilder.h TextCollectionBuilder.h \ TextCollection.h Tools.h TextStorage.h incbwt/bits/deltavector.h \ - RLCSAWrapper.h incbwt/rlcsa.h + RLCSAWrapper.h IndelQuery.h ResultSet.h EditDistance.h Query.h \ + incbwt/rlcsa.h +ResultSet.o: ResultSet.cpp ResultSet.h TextCollection.o: TextCollection.cpp TextCollection.h Tools.h FMIndex.h \ incbwt/bits/deltavector.h incbwt/bits/bitvector.h \ incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h BitRank.h \ BlockArray.h TextStorage.h ArrayDoc.h SWCSAWrapper.h \ swcsa/utils/defValues.h swcsa/utils/valstring.h swcsa/interface.h \ - RLCSAWrapper.h incbwt/rlcsa.h incbwt/bits/vectors.h \ - incbwt/bits/deltavector.h incbwt/bits/rlevector.h incbwt/sasamples.h \ - incbwt/misc/definitions.h incbwt/bits/bitbuffer.h \ - incbwt/bits/deltavector.h incbwt/lcpsamples.h incbwt/bits/array.h \ + RLCSAWrapper.h IndelQuery.h ResultSet.h EditDistance.h Query.h \ + incbwt/rlcsa.h incbwt/bits/deltavector.h incbwt/bits/rlevector.h \ + incbwt/bits/nibblevector.h incbwt/sasamples.h incbwt/misc/definitions.h \ + incbwt/bits/bitbuffer.h incbwt/lcpsamples.h incbwt/bits/array.h \ incbwt/misc/utils.h incbwt/misc/definitions.h incbwt/misc/parameters.h TextCollectionBuilder.o: TextCollectionBuilder.cpp \ TextCollectionBuilder.h TextCollection.h Tools.h TextStorage.h \ diff --git a/incbwt/dependencies.mk b/incbwt/dependencies.mk index 1c203df..a23a883 100644 --- a/incbwt/dependencies.mk +++ b/incbwt/dependencies.mk @@ -1,74 +1,81 @@ build_plcp.o: build_plcp.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \ bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \ bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \ - lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \ + lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \ misc/parameters.h build_rlcsa.o: build_rlcsa.cpp rlcsa_builder.h rlcsa.h bits/deltavector.h \ bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ bits/rlevector.h bits/nibblevector.h sasamples.h misc/definitions.h \ - bits/bitbuffer.h lcpsamples.h bits/array.h bits/vectors.h misc/utils.h \ + bits/bitbuffer.h lcpsamples.h bits/array.h misc/utils.h \ misc/definitions.h misc/parameters.h display_test.o: display_test.cpp rlcsa.h bits/deltavector.h \ bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ bits/rlevector.h bits/nibblevector.h sasamples.h misc/definitions.h \ - bits/bitbuffer.h lcpsamples.h bits/array.h bits/vectors.h misc/utils.h \ + bits/bitbuffer.h lcpsamples.h bits/array.h misc/utils.h \ misc/definitions.h misc/parameters.h extract_sequence.o: extract_sequence.cpp rlcsa.h bits/deltavector.h \ bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ bits/rlevector.h bits/nibblevector.h sasamples.h misc/definitions.h \ - bits/bitbuffer.h lcpsamples.h bits/array.h bits/vectors.h misc/utils.h \ + bits/bitbuffer.h lcpsamples.h bits/array.h misc/utils.h \ misc/definitions.h misc/parameters.h lcp_test.o: lcp_test.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \ bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \ bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \ - lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \ + lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \ misc/parameters.h -lcpsamples.o: lcpsamples.cpp lcpsamples.h bits/array.h \ - bits/../misc/definitions.h bits/bitbuffer.h bits/vectors.h misc/utils.h \ +lcpsamples.o: lcpsamples.cpp lcpsamples.h bits/deltavector.h \ + bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ + bits/rlevector.h bits/nibblevector.h bits/array.h misc/utils.h \ misc/definitions.h locate_test.o: locate_test.cpp rlcsa.h bits/deltavector.h \ bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ bits/rlevector.h bits/nibblevector.h sasamples.h misc/definitions.h \ - bits/bitbuffer.h lcpsamples.h bits/array.h bits/vectors.h misc/utils.h \ + bits/bitbuffer.h lcpsamples.h bits/array.h misc/utils.h \ misc/definitions.h misc/parameters.h parallel_build.o: parallel_build.cpp rlcsa_builder.h rlcsa.h \ bits/deltavector.h bits/bitvector.h bits/../misc/definitions.h \ bits/bitbuffer.h bits/rlevector.h bits/nibblevector.h sasamples.h \ misc/definitions.h bits/bitbuffer.h lcpsamples.h bits/array.h \ - bits/vectors.h misc/utils.h misc/definitions.h misc/parameters.h + misc/utils.h misc/definitions.h misc/parameters.h read_bwt.o: read_bwt.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \ bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \ bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \ - lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \ + lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \ misc/parameters.h rlcsa.o: rlcsa.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \ bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \ bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \ - lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \ - misc/parameters.h qsufsort/qsufsort.h qsufsort/../misc/definitions.h + lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \ + misc/parameters.h qsufsort/qsufsort.h qsufsort/../misc/definitions.h \ + bits/vectors.h rlcsa_builder.o: rlcsa_builder.cpp rlcsa_builder.h rlcsa.h \ bits/deltavector.h bits/bitvector.h bits/../misc/definitions.h \ bits/bitbuffer.h bits/rlevector.h bits/nibblevector.h sasamples.h \ misc/definitions.h bits/bitbuffer.h lcpsamples.h bits/array.h \ - bits/vectors.h misc/utils.h misc/definitions.h misc/parameters.h + misc/utils.h misc/definitions.h misc/parameters.h rlcsa_grep.o: rlcsa_grep.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \ bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \ bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \ - lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \ + lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \ misc/parameters.h rlcsa_test.o: rlcsa_test.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \ bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \ bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \ - lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \ + lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \ misc/parameters.h sample_lcp.o: sample_lcp.cpp rlcsa.h bits/deltavector.h bits/bitvector.h \ bits/../misc/definitions.h bits/bitbuffer.h bits/rlevector.h \ bits/nibblevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \ - lcpsamples.h bits/array.h bits/vectors.h misc/utils.h misc/definitions.h \ + lcpsamples.h bits/array.h misc/utils.h misc/definitions.h \ misc/parameters.h sasamples.o: sasamples.cpp sasamples.h misc/definitions.h \ bits/bitbuffer.h bits/../misc/definitions.h bits/deltavector.h \ bits/bitvector.h bits/bitbuffer.h misc/utils.h misc/definitions.h +text_generator.o: text_generator.cpp rlcsa.h bits/deltavector.h \ + bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ + bits/rlevector.h bits/nibblevector.h sasamples.h misc/definitions.h \ + bits/bitbuffer.h lcpsamples.h bits/array.h misc/utils.h \ + misc/definitions.h misc/parameters.h array.o: bits/array.cpp bits/array.h bits/../misc/definitions.h \ bits/bitbuffer.h bitvector.o: bits/bitvector.cpp bits/bitvector.h \ diff --git a/makefile b/makefile index 60aa8ba..8c88467 100644 --- a/makefile +++ b/makefile @@ -10,7 +10,7 @@ LIBSWCSA = swcsa/swcsa.a dcover_obs = dcover/difference_cover.o TextCollection_obs = TextCollection.o TextCollectionBuilder.o FMIndexBuilder.o RLCSABuilder.o FMIndex.o Tools.o \ - TextStorage.o ${LIBRLCSA} ${LIBCDSA} ${LIBLZTRIE} ${LIBSWCSA} + TextStorage.o Query.o EditDistance.o ResultSet.o ${LIBRLCSA} ${LIBCDSA} ${LIBLZTRIE} ${LIBSWCSA} TCDebug_obs = bittree.o rbtree.o dynFMI.o all: testTextCollection -- 2.17.1