4 * Input is a Position Frequency Matrix (PFM, see e.g. http://jaspar.genereg.net/)
7 * contains('PSSM 0.1 3 4 19 0 16 0 20 0 1 0 0 0 0')
10 * 'PSSM' is the query keyword.
11 * '0.1' is the minimum threshold for log-odds score.
12 * '3' is the number of columsn in the given PFM.
14 * Rest of the numbers contain the PFM in row-wise order,
15 * e.g. in the above example we have:
35 class PssmQuery : public Query
38 PssmQuery(TextCollection const *tc_, double thr)
39 : Query(tc_), threshold(thr)
48 // Parse input into PFM
49 uchar const *tmp = pat;
50 int l = parseNextInt(tmp);
53 std::cerr << "PssmQuery::firstStep(): invalid length given: length must be greater than 0." << std::endl;
57 P = std::vector<double>();
60 P.push_back(parseNextInt(tmp));
62 if ((int)P.size() != 4 * l)
64 std::cerr << "PssmQuery::firstStep(): invalid length or matrix given: length was " << l << " * 4 == " << l * 4 << " but the matrix had " << P.size() << " values." << std::endl;
69 P = convertPfmToLogOdds(P, l);
70 calculateMaxSums(P, l);
73 PssmMatch(patlen-1, 0);
78 int parseNextInt(uchar const *&tmp)
80 int r = std::atoi((char const *)tmp);
81 while (*tmp != ' ' && *tmp != 0) ++tmp;
87 void PssmMatch(const unsigned pos, const double score)
89 // ALPHABET has been defined in Query.cpp
90 for (int i = 0; i < 4; ++i)
92 const char c = ALPHABET[i];
93 if (!pushChar(c)) continue;
95 const double nval = P[pos + i * patlen];
99 if (score + nval >= threshold)
104 const double achievableScore = score + nval + maxSum[pos-1];
105 if (achievableScore >= threshold)
106 PssmMatch(pos - 1, score + nval);
112 inline void newMatch()
114 ulong min = smin.top();
115 ulong max = smax.top();
116 for (ulong i = min; i <= max; i++)
117 result.insert(tc->getDoc(i));
122 * Convert Position Frequency Matrix to Log-Odds
124 * Calculates log-odds and pseudocount into a PSSM.
126 * @param P Position Frequency Matrix.
127 * @param l width of the input matrix.
128 * @param threshold a threshold score.
131 std::vector<double> convertPfmToLogOdds(std::vector<double> &P, unsigned l)
133 // Frequencies of different characters in human genome (calculated from NCBI Build 36.3)
134 const double freq[] = { 790648081, 549365549, 549611686, 791675782 };
135 const double freqSum = freq[0] + freq[1] + freq[2] + freq[3];
136 const double logSum = std::log(freqSum);
139 logArr[0] = std::log(freq[0]) - logSum; // equals log (freq['A']/sum)
140 logArr[1] = std::log(freq[1]) - logSum;
141 logArr[2] = std::log(freq[2]) - logSum;
142 logArr[3] = std::log(freq[3]) - logSum;
144 std::vector<double> R = std::vector<double>(l*4);
145 for(unsigned i=0; i < l; i++)
148 double sum = P[i] + P[i+l] + P[i + 2*l] + P[i + 3*l] + 1;
149 for(int j =0; j<4; j++)
151 // Propability p' (background probability, used as a pseudocount)
152 double backProb = freq[j] / freqSum;
153 // Log of probability p with pseudocount added
154 double logProb = std::log(P[i+j*l] + backProb) - log(sum); // equals log((P + pseudo)/sum)
156 R[i+j*l] = logProb - logArr[j]; // equals log(p/p')
158 // cout << "i = " << i+j*l << ", R = " << R[i+j*l] << ", back = " << backProb << ", logp= " << logProb << ", logA = " << logArr[j] << endl;
164 void calculateMaxSums(std::vector<double> P, unsigned l)
166 maxSum = std::vector<double>(l);
169 // Iterate through columns in P
170 for (unsigned i = 0; i < l; ++i)
173 // Scan the rest of column i
174 for (unsigned j = 1; j < 4; j ++)
175 if (P[i + j*l] > value)
176 value = P[i + j*l]; // new maximum value in column i
179 // cout << "maxsum[" << i << "] = " << sum << endl;
184 std::vector<double> P;
186 std::vector<double> maxSum;
189 // No copy constructor or assignment
190 PssmQuery(PssmQuery const&);
191 PssmQuery& operator = (PssmQuery const&);
196 #endif // _PssmQuery_H_