significant.py
#!/usr/bin/env python
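"""Binomial significance testing for methylation calls.

Loads pickled DataFrames of methylated and total call counts (sites x samples),
filters low and outlier coverage, tests each covered position against the
expected bisulfite non-conversion rate, applies multiple-testing correction,
and writes the filtered matrices back to pickle files.
"""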
import pandas as pd
import numpy as np
from os import path
from scipy.stats import binom
from tqdm import tqdm
from statsmodels.stats.multitest import multipletests
from argparse import ArgumentParser
test_methods = ['bonferroni', 'fdr_bh', 'fdr_by', 'fdr_tsbh', 'fdr_tsbky', 'holm', 'holm-sidak']
parser = ArgumentParser(description="Significance testing for methylation.")
parser.add_argument('-m', dest='methylated', help="DataFrame containing methylated call counts", required=True)
parser.add_argument('-c', dest='covered', help="DataFrame containing total call counts", required=True)
parser.add_argument('--conversion_rate', help="(Bisulfite) Conversion rate for significance testing", type=float, default=0.995)
parser.add_argument('--alpha', help="False discovery rate", type=float, default=0.001)
parser.add_argument('--min_covered', help="Minimum coverage", type=int, default=5)
parser.add_argument('--correction_method', help="Multiple testing correction method", default='fdr_bh', choices=test_methods)
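# A sketch of how the script might be invoked; the pickle file names below are
# placeholders, only the flags come from the parser above.
# Example:
#   python significant.py -m sample_methylated.pkl -c sample_covered.pkl \
#       --conversion_rate 0.995 --alpha 0.001 --min_covered 5 --correction_method fdr_bh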
args = parser.parse_args()
print(args)
print('Loading files...')
methylated_fn = path.basename(args.methylated).split('.')[0]
covered_fn = path.basename(args.covered).split('.')[0]
methylated = pd.read_pickle(args.methylated)
covered = pd.read_pickle(args.covered)
assert methylated.index.equals(covered.index)
assert methylated.columns.equals(covered.columns)
print(f'Loaded {len(covered)} C sites.')
print('Filtering coverage...')
# We zero out coverage values below the threshold
covered = covered.where(covered >= args.min_covered, other=0)
# We zero out the 0.1% highest covered positions for each experiment
# (outliers, likely experimental artifacts)
#max_covered = covered.where(covered>0).quantile(0.999)
max_covered = pd.Series({label: data[data > 0].quantile(0.999) for label, data in covered.items()},
                        dtype='uint16')
covered = covered.where(covered <= max_covered, other=0)
# Saving us some space
if covered.max().max() < 256:
    print('Casting to uint8 for space savings.')
    covered = covered.astype('uint8')
    methylated = methylated.astype('uint8')
# This is the boolean mask for significance, same shape as methylated/covered matrices
# True means we reject null hypothesis
significant = pd.DataFrame(np.zeros(methylated.shape, dtype=bool), index=methylated.index, columns=methylated.columns)
failed_conversion_prob = 1 - args.conversion_rate
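# Null model: at an unmethylated site every methylated call is a failed
# conversion, so the methylated count follows Binomial(coverage, failed_conversion_prob);
# a small upper-tail probability therefore indicates genuine methylation.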
# Buffer for p values to avoid re-calculating binomial cdf
p_buffer = dict()
print('Significance testing...')
# Significance testing per column
for (sample, met), (smp, cov) in tqdm(zip(methylated.items(), covered.items()), total=methylated.shape[1]):
    assert sample == smp
    # We only test positions with coverage and at least one methylated call
    mask = (cov > 0) & (met > 0)
    p = np.empty(mask.sum())
    # We compare methylated vs. total call count per position
    # through a binomial model to get p values
    for i, m_c in enumerate(zip(met[mask], cov[mask])):
        try:
            p[i] = p_buffer[m_c]
        except KeyError:
            p[i] = p_buffer[m_c] = 1 - binom.cdf(*m_c, failed_conversion_prob)
    # We correct p values for multiple testing
    reject, pvals, _, _ = multipletests(p, alpha=args.alpha, method=args.correction_method)
    # Write results to the relevant column
    significant.loc[mask, sample] = reject
# We only keep rows that are significantly methylated in at least one sample
significant = significant[significant.any(axis=1)]
print(f'{len(significant)} significantly methylated sites.')
# We zero out insignificant methylation calls (noise)
methylated = methylated.reindex_like(significant).where(significant, other=0)
methylated.to_pickle(f'{methylated_fn}_significant.pkl')
covered.to_pickle(f'{covered_fn}_filtered.pkl')
#covered.reindex_like(significant).to_pickle(f'{covered_fn}_filtered_significant.pkl')