-
Notifications
You must be signed in to change notification settings - Fork 14
/
multiple_testing_correction.py
300 lines (247 loc) · 9.9 KB
/
multiple_testing_correction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
"""
Script to enable multiple testing correction
for hotspots
authors : Collin Tokheim, Rohit Bhattacharya
emails : [email protected], [email protected]
"""
# imports
import src.statistics as mystats
import argparse
import operator
import scripts.get_hotspot_residues as get_hotspot
import numpy as np
import csv
import os
MISSING_COUNT = 0
EMPTY_PVALS = 0
def parse_arguments():
"""
Function to parse command line arguements
from the user
Returns
-------
opts : dict
command line arguements from the user
"""
info = 'Multiple testing correction for predicted hotspots'
parser = argparse.ArgumentParser(description=info)
# program arguments
parser.add_argument('-i', '--hotspot-file',
type=str,
required=True,
help='File containing the verbose output from hotspot.py')
parser.add_argument('-f', '--function',
type=str,
default='min',
help='Function applied to a group of p-values (Default: min)')
parser.add_argument('-m', '--mupit-dir',
type=str,
required=True,
help='Directory containing the mupit annotations')
parser.add_argument('-q', '--q-value',
type=float,
required=True,
help='Q-value (FDR) threshold to apply for significance')
parser.add_argument('-o', '--output-file',
type=str,
help='Name of output file')
parser.add_argument('-s', '--significance-level',
type=str, required=True,
help='Output file to write the significance level cutoff'
' for each tumor type')
args = parser.parse_args()
opts = vars(args)
return opts
def read_mupit_file(in_file):
"""
Reads in a mupit annotated file and sorts it
by gene, transcript and residue
Parameters:
-----------
in_file : str
name of the mupit annotated file
Returns:
--------
mupit_annotations : list
sorted by gene, transcript and residue
"""
# mupit annotations list
mupit_annotations = []
# open the file
in_file = open(in_file)
# read in header
header = in_file.readline().strip()
header = header.split('\t')
# get indices of relevant info
gene_ind = header.index("HUGO symbol")
#transcript_ind = header.index("Sequence ontology transcript")
#residue_ind = header.index("Sequence ontology protein sequence change")
transcript_ind = header.index("Reference Transcript")
residue_ind = header.index("Reference Codon Position")
aa_ind = header.index("Reference AA")
genomic_pos_ind = header.index("Reference Genomic Position")
chrom_ind = header.index("Chromosome")
for line in in_file:
# remove trailing characters
line = line.strip()
# split by tab
line = line.split('\t')
# obtain the cravat residue id, which is between two letters indicating
# the protein sequence change
# also leave out special cases
if len(line) < residue_ind or line[residue_ind] == '':
continue
# get residue position
#line[residue_ind] = int(line[residue_ind][1:-1])
line[residue_ind] = int(line[residue_ind])
# add as tuple so we can sort
mupit_annotations.append(tuple(line))
# sort by gene, transcript and position
mupit_annotations = sorted(mupit_annotations,
key=operator.itemgetter(gene_ind, transcript_ind, residue_ind))
in_file.close()
return (mupit_annotations, gene_ind, transcript_ind,
residue_ind, chrom_ind, genomic_pos_ind, aa_ind)
def get_group_pvals(mupit_groups, gene_ind, transcript_ind,
residue_ind, chrom_ind, genomic_pos_ind, aa_ind,
hotspot_output, ttype, func=min):
"""
"""
global MISSING_COUNT
global EMPTY_PVALS
# dictionary containing pdb id, chain, residue as keys
# and a list of all pvals associated with the unique combination
# as values
hspot_pvals = {}
# iterate through each line of the hotspot output for the given ttype
for hspot_data in hotspot_output:
# make a key from pdb id, chain, and residue
curr_key = (hspot_data[0], hspot_data[3], hspot_data[4])
# check if this key is already in our pvals dict
# if not, add it
if not curr_key in hspot_pvals:
hspot_pvals[curr_key] = [float(hspot_data[5])]
else:
hspot_pvals[curr_key].append(float(hspot_data[5]))
# list of pvals grouped by unique gene, transcript, residue
grouped_pvals = []
# list of min pvals from each grouping of pvals
min_pvals = []
# variable to check last grouping of gene, transcript, residue
prev_group = (mupit_groups[0][gene_ind],
mupit_groups[0][transcript_ind],
mupit_groups[0][residue_ind])
prev_line = mupit_groups[0]
# go through all mupit data
for j, line in enumerate(mupit_groups):
# get current group
curr_group = (line[gene_ind], line[transcript_ind], line[residue_ind])
# check if this is the same as previous
if not prev_group[0] == curr_group[0] \
or not prev_group[1] == curr_group[1] \
or not prev_group[2] == curr_group[2]:
# check if we have groups with zero
# pvals
if not grouped_pvals:
EMPTY_PVALS += 1
#print "no pval"
else:
# if not, get the min pval and move onto the next
min_pvals.append([prev_group[0], ttype, prev_group[1],
prev_group[2], prev_line[aa_ind], prev_line[chrom_ind],
prev_line[genomic_pos_ind], func(grouped_pvals)])
grouped_pvals = []
# update prev_group
prev_group = curr_group
# update prev_line
prev_line = line
# get the pvals corresponding to the pdb, chain, residue
# combination, stored in hspot_pvals dict
pdb_chain_residue = (line[0], line[1], line[2])
# check if this combination exists in our hotspot data
if not pdb_chain_residue in hspot_pvals:
MISSING_COUNT += 1
continue
# add all associated pvals
for pval in hspot_pvals[pdb_chain_residue]:
grouped_pvals.append(pval)
return min_pvals
def main(opts):
"""
Main function
"""
# obtain user defined parameters
hspots_file = opts["hotspot_file"]
out_file = opts["output_file"]
mupit_dir = opts["mupit_dir"]
signif_lvl_file = opts['significance_level']
# use external module to separate out the residues in the hotspot.py output
# onto separate lines
args = {"input": hspots_file,
"significance_level": 1.1,
"output": None}
hotspot_output = get_hotspot.main(args)
# stratify hotspot output by tumour type
stratified_hotspot_output = {}
for hspot_data in hotspot_output:
# skip homology model
#if hspot_data[0].startswith('NP_') or hspot_data[0].startswith('ENSP'):
#continue
ttype = hspot_data[1]
if not ttype in stratified_hotspot_output:
stratified_hotspot_output[ttype] = [hspot_data]
else:
stratified_hotspot_output[ttype].append(hspot_data)
# open a file to write results to
header = ["HUGO Symbol", "Tumor Type",
"Sequence Ontology Transcript", "CRAVAT Res",
"Ref AA",
'chromosome', 'genomic position',
"Min p-value", 'q-value']
output, signif_lvl_output = [], []
# go through all mupit annotation files
if opts['function'] == 'min':
myfunc = min
elif opts['function'] == 'median':
myfunc = np.median
elif opts['function'] == 'max':
myfunc = max
for m_file in os.listdir(mupit_dir):
# print the tumor type we're on
ttype = m_file.split('_')[-1]
print ttype
# read in the file
tmp = read_mupit_file(os.path.join(mupit_dir, m_file))
(mupit_annotations, gene_ind, transcript_ind,
residue_ind, chrom_ind, genomic_pos_ind, aa_ind) = tmp
# get p-values for grouped up mupit-hotspot groups
grouped_p_vals = get_group_pvals(mupit_annotations, gene_ind,
transcript_ind, residue_ind,
chrom_ind, genomic_pos_ind, aa_ind,
stratified_hotspot_output[ttype],
ttype, myfunc)
# add the q-value
tmp_pvals = [g[-1] for g in grouped_p_vals]
tmp_qvals = mystats.bh_fdr(tmp_pvals)
for i in range(len(grouped_p_vals)):
grouped_p_vals[i].append(tmp_qvals[i])
output.extend(grouped_p_vals)
# figure out the equivalent p-value threshold
signif_pvals = [tmp_pvals[i]
for i in range(len(tmp_qvals))
if tmp_qvals[i] <= opts['q_value']]
if signif_pvals:
pval_cutoff = max(signif_pvals)
signif_lvl_output.append([ttype, pval_cutoff])
# write to output file
with open(out_file, 'wb') as out_file:
mywriter = csv.writer(out_file, delimiter='\t', lineterminator='\n')
mywriter.writerows([header]+output)
# write significance level cutoffs for each tumor type
with open(signif_lvl_file, 'wb') as out_file:
mywriter = csv.writer(out_file, delimiter='\t', lineterminator='\n')
mywriter.writerows(signif_lvl_output)
print("MISSING COUNTS = " + str(MISSING_COUNT))
if __name__ == "__main__":
opts = parse_arguments()
main(opts)