-
Notifications
You must be signed in to change notification settings - Fork 14
/
hotspot_1d.py
253 lines (221 loc) · 10.1 KB
/
hotspot_1d.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import argparse
from src.density import *
import src.utils as utils
import numpy as np
import scipy.stats as stats
import csv
import re
from Bio.PDB import *
from src.pdb_structure import *
import src.statistics as mystats
import src.simulation as sim
import src.mutations
# import modules needed for logging
import logging
import os
logger = logging.getLogger(__name__) # module logger
def parse_arguments():
info = 'Detects hotspot protein regions'
parser = argparse.ArgumentParser(description=info)
# program arguments
parser.add_argument('-m', '--mutations',
type=str, required=True,
help='Mutation counts for specific structures')
parser.add_argument('-a', '--annotation',
type=str, required=True,
help='Annotations about PDB')
parser.add_argument('-n', '--num-simulations',
default=10000,
type=int,
help='Number of simulations (Default: 10000)')
parser.add_argument('-r', '--radius',
default=10.0,
type=float,
help='Sphere radius in angstroms (Default: 10.0)')
parser.add_argument('-s', '--seed',
default=None,
type=int,
help='Random number generator seed (Default: automatic)')
parser.add_argument('-sc', '--stop-criterion',
default=200,
type=int,
help='Number of simulations exceeding the maximum observed '
'residue before stopping. This speeds computation by spending '
'less time on non-significant structures. (Default: 200)')
parser.add_argument('-t', '--tumor-type',
type=str, default='EVERY',
help='Perform analysis for only specific tumor type (Default: "EVERY" = each tumor type)')
parser.add_argument('-e', '--error-pdb',
type=str, default=None,
help='File containing structures that have badly formated pdb files')
parser.add_argument('-o', '--output',
default='output.txt',
type=str,
help='Output result file of hotspots')
# logging arguments
parser.add_argument('-ll', '--log-level',
type=str,
action='store',
default='',
help='Write a log file (--log-level=DEBUG for debug mode, '
'--log-level=INFO for info mode)')
parser.add_argument('-l', '--log',
type=str,
action='store',
default='',
help='Path to log file. (accepts "stdout")')
args = parser.parse_args()
# handle logging
if args.log_level or args.log:
if args.log:
log_file = args.log
else:
log_file = '' # auto-name the log file
else:
log_file = os.devnull
log_level = args.log_level
utils.start_logging(log_file=log_file,
log_level=log_level) # start logging
opts = vars(args)
return opts
def main(opts):
"""Currently, performs analysis for the given genes. It attempts to use
any available PDB sturctures. It then loops through each protein chain
and tumor type.
"""
# read in data
logger.info('Reading in annotations . . .')
pdb_info = utils.read_pdb_info(opts['annotation'])
logger.info('Finished reading in annotations.')
logger.info('Reading in mutations . . .')
mutations = utils.read_mutations(opts['mutations'])
logger.info('Finished reading in mutations.')
# iterate over each structure
logger.info('Running of PDB structures . . .')
output = []
num_pdbs = 0
num_missing_pdbs = 0
missing_pdb_list = []
error_pdb_structs = []
quiet = True if opts['log_level'] != "DEBUG" else False # flag indicating pdb warnings
pdb_parser = PDBParser(QUIET=quiet) # parser for pdb files
for structure_id in pdb_info:
print (structure_id)
# get pdb info
struct_info = pdb_info[structure_id]
pdb_path = struct_info.pop('path')
# read in structure
structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
if structure is None:
continue
# make a list of all chain letters in structure
struct_chains = []
for k in struct_info.keys():
struct_chains.extend(struct_info[k])
# get mutation info
structure_mutations = mutations.get(structure_id, [])
# skip structure if no mutations
if not structure_mutations:
continue
# separate out mutation info
ttypes, mres, mcount, mchains = zip(*structure_mutations) # if model_mutations else ([], [], [])
# stratify mutations by their tumor type
# ttype_ixs is a dictionary that contains
# ttype as the keys and a list of relevant
# indices as the values
unique_ttypes = set(ttypes)
ttype_ixs = {t: [i for i in range(len(mcount)) if ttypes[i]==t]
for t in unique_ttypes}
unique_ttypes = list(unique_ttypes)
# obtain relevant info from structure
tmp_info = get_structure_info(structure, mchains, mres, mcount,
struct_chains, ttype_ixs)
(mut_res_centers_of_geometry,
mut_res_mutation_counts,
all_res_centers_of_geometry,
models) = tmp_info
if not all_res_centers_of_geometry:
logger.error('No available center of geometries for {0}'.format(structure_id))
continue
# get neigbours for all residues
neighbors = find_neighbors_1D(all_res_centers_of_geometry, opts['radius'])
pan_counts = []
# iterate through each tumour type
for tumour in unique_ttypes:
# skip tumor types if not one specified
if (not opts['tumor_type'] == tumour and not opts['tumor_type'] == 'EVERY'):
continue
# draw information for the specific tumour type
t_mut_res_centers_of_geometry = mut_res_centers_of_geometry[tumour]
t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]
mut_density = src.mutations.mutation_density(t_mut_res_mutation_counts,
neighbors)
mut_vals = mut_density.values()
if mut_vals:
max_obs_dens = max(mut_density.values())
else:
max_obs_dens =0
# generate null distribution
# count total mutations in structure while
# avoiding double counting due to same id and chain
# being on multiple models
obs_models = []
obs_chains = []
total_mutations = 0
#if not tumour == 'PANCAN':
if True:
for k in t_mut_res_mutation_counts:
mutations_to_add = t_mut_res_mutation_counts[k]
for i in range(len(obs_models)):
if not k[1] == obs_models[i] and k[2] == obs_chains[i]:
mutations_to_add = 0
break
total_mutations += mutations_to_add
obs_models.append(k[1])
obs_chains.append(k[2])
pan_counts.append(total_mutations)
else:
total_mutations = sum(pan_counts)
sim_null_dist = sim.generate_null_dist(structure_id, models, struct_info,
all_res_centers_of_geometry,
total_mutations,
opts['num_simulations'],
opts['seed'],
neighbors,
opts['stop_criterion'],
max_obs_dens)
# get a list of lists format for compute p values function
mut_list = [[res_id, mut_density[res_id]] for res_id in mut_density]
if not t_mut_res_mutation_counts:
print("here")
# aditional information about p-values
# for specific residues in a structure
# compute p-values for observed
obs_pvals, sim_cdf = sim.compute_pvals(mut_list, sim_null_dist)
output.append([structure_id, tumour,
','.join([str(o[0][1]) for o in mut_list]),
','.join([str(o[0][2]) for o in mut_list]),
','.join([str(o[0][3][1]) for o in mut_list]),
','.join([str(t_mut_res_mutation_counts[o[0]])
for o in mut_list]),
','.join([str(o[1]) for o in mut_list]),
','.join(map(str, obs_pvals))])
# write output to file
output = [['Structure', 'Tumor Type', 'Model', 'Chain', 'Mutation Residues',
'Residue Mutation Count', 'Mutation Density', 'Hotspot P-value',
]] + output
with open(opts['output'], 'w') as handle:
csv.writer(handle, delimiter='\t', lineterminator='\n').writerows(output)
# if user specified to log failed reading of pdbs
if opts['error_pdb'] and error_pdb_structs:
with open(opts['error_pdb'], 'w') as handle:
for bad_pdb in error_pdb_structs:
handle.write(bad_pdb+'\n')
print("NUM_MODEL_DIFF: " + str(sim.NUM_MODEL_DIFF))
print("NUM_CHAIN_DIFF: " + str(sim.NUM_CHAIN_DIFF))
print("STRUCT_MODEL_DIFF: " + str(sim.STRUCT_MODEL_DIFF))
print("STRUCT_CHAIN_DIFF: " + str(sim.STRUCT_CHAIN_DIFF))
logger.info('Finished successfully!')
if __name__ == "__main__":
opts = parse_arguments()
main(opts)