From: nvalimak Date: Sun, 7 Nov 2010 12:23:25 +0000 (+0000) Subject: Added PSSM support for RLCSA X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FTextCollection.git;a=commitdiff_plain;h=ef1fa07736c89663b284d530f1aeab24c848ac90 Added PSSM support for RLCSA git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@932 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/PssmQuery.h b/PssmQuery.h new file mode 100644 index 0000000..a59e86e --- /dev/null +++ b/PssmQuery.h @@ -0,0 +1,196 @@ +/** + * PSSM queries + * + * Input is a Position Frequency Matrix (PFM, see e.g. http://jaspar.genereg.net/) + * and a threshold: + * + * contains('PSSM 0.1 3 4 19 0 16 0 20 0 1 0 0 0 0') + * + * Where + * 'PSSM' is the query keyword. + * '0.1' is the minimum threshold for log-odds score. + * '3' is the number of columsn in the given PFM. + * + * Rest of the numbers contain the PFM in row-wise order, + * e.g. in the above example we have: + * A [ 4 19 0 ] + * C [16 0 20 ] + * G [ 0 1 0 ] + * T [ 0 0 0 ] + */ + +#ifndef _PssmQuery_H_ +#define _PssmQuery_H_ + +#include "Query.h" + +#include +#include +#include +#include + +namespace SXSI +{ + +class PssmQuery : public Query +{ +public: + PssmQuery(TextCollection const *tc_, double thr) + : Query(tc_), threshold(thr) + { } + + virtual ~PssmQuery() + { } + +protected: + void firstStep(void) + { + // Parse input into PFM + uchar const *tmp = pat; + int l = parseNextInt(tmp); + if (l <= 0) + { + std::cerr << "PssmQuery::firstStep(): invalid length given: length must be greater than 0." << std::endl; + std::exit(1); + } + + P = std::vector(); + P.reserve(4 * l); + while (*tmp != 0) + P.push_back(parseNextInt(tmp)); + + if ((int)P.size() != 4 * l) + { + std::cerr << "PssmQuery::firstStep(): invalid length or matrix given: length was " << l << " * 4 == " << l * 4 << " but the matrix had " << P.size() << " values." << std::endl; + std::exit(1); + } + + // And into log-odds: + P = convertPfmToLogOdds(P, l); + calculateMaxSums(P, l); + pat = 0; + patlen = l; + PssmMatch(patlen-1, 0); + } + +private: + + int parseNextInt(uchar const *&tmp) + { + int r = std::atoi((char const *)tmp); + while (*tmp != ' ' && *tmp != 0) ++tmp; + if (*tmp == ' ') + ++tmp; + return r; + } + + void PssmMatch(const unsigned pos, const double score) + { + // ALPHABET has been defined in Query.cpp + for (int i = 0; i < 4; ++i) + { + const char c = ALPHABET[i]; + if (!pushChar(c)) continue; + + const double nval = P[pos + i * patlen]; + + if (pos == 0) + { + if (score + nval >= threshold) + newMatch(); + } + else + { + const double achievableScore = score + nval + maxSum[pos-1]; + if (achievableScore >= threshold) + PssmMatch(pos - 1, score + nval); + } + popChar(); + } + } + + inline void newMatch() + { + ulong min = smin.top(); + ulong max = smax.top(); + for (ulong i = min; i <= max; i++) + result.insert(tc->getDoc(i)); + } + + + /** + * Convert Position Frequency Matrix to Log-Odds + * + * Calculates log-odds and pseudocount into a PSSM. + * + * @param P Position Frequency Matrix. + * @param l width of the input matrix. + * @param threshold a threshold score. + * @return a PSSM. + */ + std::vector convertPfmToLogOdds(std::vector &P, unsigned l) + { + // Frequencies of different characters in human genome (calculated from NCBI Build 36.3) + const double freq[] = { 790648081, 549365549, 549611686, 791675782 }; + const double freqSum = freq[0] + freq[1] + freq[2] + freq[3]; + const double logSum = std::log(freqSum); + + double logArr[4]; + logArr[0] = std::log(freq[0]) - logSum; // equals log (freq['A']/sum) + logArr[1] = std::log(freq[1]) - logSum; + logArr[2] = std::log(freq[2]) - logSum; + logArr[3] = std::log(freq[3]) - logSum; + + std::vector R = std::vector(l*4); + for(unsigned i=0; i < l; i++) + { + // Sum + pseudocount + double sum = P[i] + P[i+l] + P[i + 2*l] + P[i + 3*l] + 1; + for(int j =0; j<4; j++) + { + // Propability p' (background probability, used as a pseudocount) + double backProb = freq[j] / freqSum; + // Log of probability p with pseudocount added + double logProb = std::log(P[i+j*l] + backProb) - log(sum); // equals log((P + pseudo)/sum) + + R[i+j*l] = logProb - logArr[j]; // equals log(p/p') + +// cout << "i = " << i+j*l << ", R = " << R[i+j*l] << ", back = " << backProb << ", logp= " << logProb << ", logA = " << logArr[j] << endl; + } + } + return R; + } + + void calculateMaxSums(std::vector P, unsigned l) + { + maxSum = std::vector(l); + // Max sum so far + double sum = 0; + // Iterate through columns in P + for (unsigned i = 0; i < l; ++i) + { + double value = P[i]; + // Scan the rest of column i + for (unsigned j = 1; j < 4; j ++) + if (P[i + j*l] > value) + value = P[i + j*l]; // new maximum value in column i + sum += value; + maxSum[i] = sum; +// cout << "maxsum[" << i << "] = " << sum << endl; + } + } + + + std::vector P; + double threshold; + std::vector maxSum; + + PssmQuery(); + // No copy constructor or assignment + PssmQuery(PssmQuery const&); + PssmQuery& operator = (PssmQuery const&); +}; + +} // namespace + +#endif // _PssmQuery_H_