-
Notifications
You must be signed in to change notification settings - Fork 119
/
NoiseQProfile.h
181 lines (146 loc) · 4.2 KB
/
NoiseQProfile.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#ifndef NOISEQPROFILE_H_
#define NOISEQPROFILE_H_
#include<cmath>
#include<cstdio>
#include<cstring>
#include<string>
#include<cassert>
#include "utils.h"
#include "RefSeq.h"
#include "simul.h"
// Per-quality-score emission profile for "noise" reads.
//
// Two SIZE x NCODES tables are kept, both indexed as [quality][base code]:
//   c -- counts gathered by updateC() (the original comment calls this "counts in N0")
//   p -- p[q][code] = p(code | q); also used as a fractional-count accumulator
//        by update()/collect() until finish() normalizes it.
// logp holds the log probability accumulated by finish()/calcInitParams().
//
// pc is a cumulative-probability table that exists only between
// startSimulation() and finishSimulation(). It is owned through a raw pointer
// and is NOT copied by operator=.
class NoiseQProfile {
public:
  // Creates an empty profile: zero log-probability, zeroed tables, and no
  // simulation table allocated.
  NoiseQProfile() {
    logp = 0.0;
    memset(c, 0, sizeof(c));
    memset(p, 0, sizeof(p));
    // Start with a null pointer so that finishSimulation() without a preceding
    // startSimulation() is a harmless delete[] of null instead of freeing an
    // indeterminate pointer.
    pc = NULL;
  }

  // Copies logp, c and p from the right-hand side (pc is left untouched).
  NoiseQProfile& operator=(const NoiseQProfile&);

  // Clears the probability/accumulator table p.
  void init();

  // Adds one count per base of (readseq, qual) to c.
  void updateC(const std::string&, const std::string&);
  // Adds `frac` per base of (readseq, qual) to p.
  void update(const std::string&, const std::string&, double frac);
  // Normalizes p + c into probabilities and recomputes logp.
  void finish();

  // Fills p from c using a pseudo count of 1.0 so that no entry is zero.
  void calcInitParams();

  // Product of p[q][code] over all positions of the read.
  double getProb(const std::string&, const std::string&);
  // Log probability computed by the last finish()/calcInitParams() call.
  double getLogP() const { return logp; }

  // Adds another profile's p table into this one, element by element.
  void collect(const NoiseQProfile&);

  // Text (de)serialization of the p table.
  void read(FILE*);
  void write(FILE*);

  // Simulation life cycle: allocate pc, draw reads, release pc.
  void startSimulation();
  std::string simulate(simul*, int, const std::string&);
  void finishSimulation();

private:
  static const int NCODES = 5; // number of possible codes
  static const int SIZE = 100; // number of rows in c and p (quality index range)

  double logp; // log prob
  double c[SIZE][NCODES]; // counts in N0
  double p[SIZE][NCODES]; // p[q][c] = p(c|q)

  // Maps a quality character in the range 33..126 to a row index 0..93
  // (the range is enforced only by the assert).
  int c2q(char c) { assert(c >= 33 && c <= 126); return c - 33; }

  double (*pc)[NCODES]; // for simulation: cumulative sums of each row of p
};
// Copy assignment: duplicates the log probability and both tables (c and p)
// of `rv`. The simulation table pointer pc is deliberately left as it is.
// Self-assignment is a no-op.
NoiseQProfile& NoiseQProfile::operator=(const NoiseQProfile& rv) {
  if (this != &rv) {
    logp = rv.logp;
    memcpy(c, rv.c, sizeof(c));
    memcpy(p, rv.p, sizeof(p));
  }
  return *this;
}
void NoiseQProfile::init() {
memset(p, 0, sizeof(p));
}
// Adds one observation per read position to the count table c.
//   readseq -- read bases; its length determines how many positions are visited
//   qual    -- quality string, indexed in parallel with readseq
// The row is c2q(qual[pos]) and the column is get_base_id(readseq[pos]).
void NoiseQProfile::updateC(const std::string& readseq, const std::string& qual) {
  const int nBases = readseq.size();
  int pos = 0;
  while (pos < nBases) {
    double& cell = c[c2q(qual[pos])][get_base_id(readseq[pos])];
    cell += 1.0;
    ++pos;
  }
}
// Adds a fractional observation per read position to the accumulator table p.
//   readseq -- read bases; its length determines how many positions are visited
//   qual    -- quality string, indexed in parallel with readseq
//   frac    -- weight added to p[c2q(qual[pos])][get_base_id(readseq[pos])]
void NoiseQProfile::update(const std::string& readseq, const std::string& qual, double frac) {
  const int nBases = readseq.size();
  int pos = 0;
  while (pos < nBases) {
    double& cell = p[c2q(qual[pos])][get_base_id(readseq[pos])];
    cell += frac;
    ++pos;
  }
}
// Turns the accumulated values into probabilities and recomputes logp.
//
// For every quality row, p + c is summed across the codes; each p entry is
// then replaced by (p + c) / row total. Rows whose total is not positive are
// skipped, so their p entries keep whatever they held (all zero when nothing
// was accumulated, i.e. p(c|q) = 0 for all c, q if N0 is 0).
// logp is rebuilt as the sum of c * log(p) over entries with a positive count.
void NoiseQProfile::finish() {
  logp = 0.0;
  for (int q = 0; q < SIZE; ++q) {
    double total = 0.0;
    for (int code = 0; code < NCODES; ++code) {
      total += (p[q][code] + c[q][code]);
    }

    const bool nothingObserved = (total <= 0.0);
    if (!nothingObserved) {
      for (int code = 0; code < NCODES; ++code) {
        double& prob = p[q][code];
        const double& count = c[q][code];
        prob = (prob + count) / total;
        if (count > 0.0) {
          logp += count * log(prob);
        }
      }
    }
  }
}
// Derives initial parameters from the count table c, adding a pseudo count of
// 1.0 to every entry so that no initial probability is zero. Unlike finish(),
// every row is normalized, including rows with no counts.
// logp is rebuilt as the sum of c * log(p) over entries with a positive count.
void NoiseQProfile::calcInitParams() {
  logp = 0.0;
  for (int q = 0; q < SIZE; ++q) {
    double total = 0.0;
    for (int code = 0; code < NCODES; ++code) {
      total += (1.0 + c[q][code]); // 1.0 pseudo count
    }

    for (int code = 0; code < NCODES; ++code) {
      const double& count = c[q][code];
      double& prob = p[q][code];
      prob = (count + 1.0) / total;
      if (count > 0.0) {
        logp += count * log(prob);
      }
    }
  }
}
// Returns the probability of a read under this profile: the product, over all
// positions of readseq, of p[c2q(qual[pos])][get_base_id(readseq[pos])].
// An empty read yields 1.0.
double NoiseQProfile::getProb(const std::string& readseq, const std::string& qual) {
  const int nBases = readseq.size();
  double likelihood = 1.0;
  int pos = 0;
  while (pos < nBases) {
    likelihood *= p[c2q(qual[pos])][get_base_id(readseq[pos])];
    ++pos;
  }
  return likelihood;
}
// Merges another profile into this one by adding its p table to ours,
// entry by entry. Only p is combined; c and logp are left alone.
void NoiseQProfile::collect(const NoiseQProfile& o) {
  for (int q = 0; q < SIZE; ++q) {
    double* mine = p[q];
    const double* theirs = o.p[q];
    for (int code = 0; code < NCODES; ++code) {
      mine[code] += theirs[code];
    }
  }
}
// Loads the probability table p from a text stream written by write().
//
// Expected layout: a header "SIZE NCODES" followed by SIZE * NCODES
// floating-point values. The header must match this class's compiled-in
// dimensions. The count table c is cleared first: when parameters are read
// from a file, it is assumed they do not need to be estimated from data.
//
// The fscanf calls are kept OUTSIDE the assert expressions. assert() does not
// evaluate its argument when NDEBUG is defined, so reading inside the assert
// would leave p unpopulated in release builds.
void NoiseQProfile::read(FILE *fi) {
  int tmp_size = 0, tmp_ncodes = 0;

  memset(c, 0, sizeof(c));

  int nread = fscanf(fi, "%d %d", &tmp_size, &tmp_ncodes);
  assert(nread == 2);
  assert(tmp_size == SIZE && tmp_ncodes == NCODES);

  for (int i = 0; i < SIZE; i++) {
    for (int j = 0; j < NCODES; j++) {
      nread = fscanf(fi, "%lf", &p[i][j]);
      assert(nread == 1);
    }
  }

  (void)nread; // silences the set-but-unused warning when asserts are compiled out
}
// Writes the probability table p as text: a header line "SIZE NCODES", then
// one line per quality row holding NCODES values printed with %.10g and
// separated by single spaces.
void NoiseQProfile::write(FILE *fo) {
  fprintf(fo, "%d %d\n", SIZE, NCODES);
  for (int q = 0; q < SIZE; ++q) {
    for (int code = 0; code < NCODES; ++code) {
      // The last value of a row ends the line; all others are followed by a space.
      if (code == NCODES - 1) {
        fprintf(fo, "%.10g\n", p[q][code]);
      } else {
        fprintf(fo, "%.10g ", p[q][code]);
      }
    }
  }
}
// Allocates the simulation table pc and fills each row with the running
// (cumulative) sums of the corresponding row of p.
//
// A row whose final cumulative value is zero according to isZero() is replaced
// by fixed cumulative values 0.25, 0.5, 0.75, 1.0, 1.0, i.e. equal weight on
// the first four codes and none on the fifth.
// The table must later be released with finishSimulation().
void NoiseQProfile::startSimulation() {
  pc = new double[SIZE][NCODES];

  for (int q = 0; q < SIZE; ++q) {
    double* cdf = pc[q];

    cdf[0] = p[q][0];
    for (int code = 1; code < NCODES; ++code) {
      cdf[code] = p[q][code] + cdf[code - 1];
    }

    if (isZero(cdf[NCODES - 1])) {
      // The hard-coded fallback below only makes sense for five codes.
      assert(NCODES == 5);
      cdf[0] = 0.25;
      cdf[1] = 0.5;
      cdf[2] = 0.75;
      cdf[3] = 1.0;
      cdf[4] = 1.0;
    }
  }
}
// Generates a read of `len` bases. For each position, the cumulative row of pc
// selected by the quality character qual[pos] is handed to sampler->sample(),
// and the sampled code is converted to a base with getCharacter().
// startSimulation() must have been called so that pc is populated.
std::string NoiseQProfile::simulate(simul* sampler, int len, const std::string& qual) {
  std::string readseq;
  int pos = 0;
  while (pos < len) {
    const int q = c2q(qual[pos]);
    readseq.push_back(getCharacter(sampler->sample(pc[q], NCODES)));
    ++pos;
  }
  return readseq;
}
// Releases the simulation table allocated by startSimulation().
// The pointer is reset afterwards so that a repeated call is a harmless
// delete[] of a null pointer rather than a double free, and so that no
// dangling pointer is left behind.
void NoiseQProfile::finishSimulation() {
  delete[] pc;
  pc = NULL;
}
#endif /* NOISEQPROFILE_H_ */