--- /dev/null
+/**
+ * PSSM queries
+ *
+ * Input is a Position Frequency Matrix (PFM, see e.g. http://jaspar.genereg.net/)
+ * and a threshold:
+ *
+ * contains('PSSM 0.1 3 4 19 0 16 0 20 0 1 0 0 0 0')
+ *
+ * Where
+ * 'PSSM' is the query keyword.
+ * '0.1' is the minimum threshold for log-odds score.
+ * '3' is the number of columsn in the given PFM.
+ *
+ * Rest of the numbers contain the PFM in row-wise order,
+ * e.g. in the above example we have:
+ * A [ 4 19 0 ]
+ * C [16 0 20 ]
+ * G [ 0 1 0 ]
+ * T [ 0 0 0 ]
+ */
+
+#ifndef _PssmQuery_H_
+#define _PssmQuery_H_
+
+#include "Query.h"
+
+#include <string>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+
+namespace SXSI
+{
+
+class PssmQuery : public Query
+{
+public:
+ PssmQuery(TextCollection const *tc_, double thr)
+ : Query(tc_), threshold(thr)
+ { }
+
+ virtual ~PssmQuery()
+ { }
+
+protected:
+ void firstStep(void)
+ {
+ // Parse input into PFM
+ uchar const *tmp = pat;
+ int l = parseNextInt(tmp);
+ if (l <= 0)
+ {
+ std::cerr << "PssmQuery::firstStep(): invalid length given: length must be greater than 0." << std::endl;
+ std::exit(1);
+ }
+
+ P = std::vector<double>();
+ P.reserve(4 * l);
+ while (*tmp != 0)
+ P.push_back(parseNextInt(tmp));
+
+ if ((int)P.size() != 4 * l)
+ {
+ std::cerr << "PssmQuery::firstStep(): invalid length or matrix given: length was " << l << " * 4 == " << l * 4 << " but the matrix had " << P.size() << " values." << std::endl;
+ std::exit(1);
+ }
+
+ // And into log-odds:
+ P = convertPfmToLogOdds(P, l);
+ calculateMaxSums(P, l);
+ pat = 0;
+ patlen = l;
+ PssmMatch(patlen-1, 0);
+ }
+
+private:
+
+ int parseNextInt(uchar const *&tmp)
+ {
+ int r = std::atoi((char const *)tmp);
+ while (*tmp != ' ' && *tmp != 0) ++tmp;
+ if (*tmp == ' ')
+ ++tmp;
+ return r;
+ }
+
+ void PssmMatch(const unsigned pos, const double score)
+ {
+ // ALPHABET has been defined in Query.cpp
+ for (int i = 0; i < 4; ++i)
+ {
+ const char c = ALPHABET[i];
+ if (!pushChar(c)) continue;
+
+ const double nval = P[pos + i * patlen];
+
+ if (pos == 0)
+ {
+ if (score + nval >= threshold)
+ newMatch();
+ }
+ else
+ {
+ const double achievableScore = score + nval + maxSum[pos-1];
+ if (achievableScore >= threshold)
+ PssmMatch(pos - 1, score + nval);
+ }
+ popChar();
+ }
+ }
+
+ inline void newMatch()
+ {
+ ulong min = smin.top();
+ ulong max = smax.top();
+ for (ulong i = min; i <= max; i++)
+ result.insert(tc->getDoc(i));
+ }
+
+
+ /**
+ * Convert Position Frequency Matrix to Log-Odds
+ *
+ * Calculates log-odds and pseudocount into a PSSM.
+ *
+ * @param P Position Frequency Matrix.
+ * @param l width of the input matrix.
+ * @param threshold a threshold score.
+ * @return a PSSM.
+ */
+ std::vector<double> convertPfmToLogOdds(std::vector<double> &P, unsigned l)
+ {
+ // Frequencies of different characters in human genome (calculated from NCBI Build 36.3)
+ const double freq[] = { 790648081, 549365549, 549611686, 791675782 };
+ const double freqSum = freq[0] + freq[1] + freq[2] + freq[3];
+ const double logSum = std::log(freqSum);
+
+ double logArr[4];
+ logArr[0] = std::log(freq[0]) - logSum; // equals log (freq['A']/sum)
+ logArr[1] = std::log(freq[1]) - logSum;
+ logArr[2] = std::log(freq[2]) - logSum;
+ logArr[3] = std::log(freq[3]) - logSum;
+
+ std::vector<double> R = std::vector<double>(l*4);
+ for(unsigned i=0; i < l; i++)
+ {
+ // Sum + pseudocount
+ double sum = P[i] + P[i+l] + P[i + 2*l] + P[i + 3*l] + 1;
+ for(int j =0; j<4; j++)
+ {
+ // Propability p' (background probability, used as a pseudocount)
+ double backProb = freq[j] / freqSum;
+ // Log of probability p with pseudocount added
+ double logProb = std::log(P[i+j*l] + backProb) - log(sum); // equals log((P + pseudo)/sum)
+
+ R[i+j*l] = logProb - logArr[j]; // equals log(p/p')
+
+// cout << "i = " << i+j*l << ", R = " << R[i+j*l] << ", back = " << backProb << ", logp= " << logProb << ", logA = " << logArr[j] << endl;
+ }
+ }
+ return R;
+ }
+
+ void calculateMaxSums(std::vector<double> P, unsigned l)
+ {
+ maxSum = std::vector<double>(l);
+ // Max sum so far
+ double sum = 0;
+ // Iterate through columns in P
+ for (unsigned i = 0; i < l; ++i)
+ {
+ double value = P[i];
+ // Scan the rest of column i
+ for (unsigned j = 1; j < 4; j ++)
+ if (P[i + j*l] > value)
+ value = P[i + j*l]; // new maximum value in column i
+ sum += value;
+ maxSum[i] = sum;
+// cout << "maxsum[" << i << "] = " << sum << endl;
+ }
+ }
+
+
+ std::vector<double> P;
+ double threshold;
+ std::vector<double> maxSum;
+
+ PssmQuery();
+ // No copy constructor or assignment
+ PssmQuery(PssmQuery const&);
+ PssmQuery& operator = (PssmQuery const&);
+};
+
+} // namespace
+
+#endif // _PssmQuery_H_