-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathestimate_syn_sites.py
370 lines (262 loc) · 11.5 KB
/
estimate_syn_sites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
"""
python 2.7
Mathias Scharmann
2020-06-02
counts the numbers of potentially synonymous and potentially nonsynonymous sites (Nei and Gojobori 1986)
Nei, M., Gojobori, T., 1986. Simple methods for estimating the numbers of synonymous and nonsynonymous nucleotide substitutions. Mol. Biol. Evol. 3, 418-426. https://doi.org/10.1093/oxfordjournals.molbev.a040410
it is a 'mutational opportunity method' (Bierne and Eyre-Walker, 2003, Genetics 165: 1587-1597)
the relevant paragraph in Nei & Gojobori (1986) is:
We now compute the number of synonymous sites (s) and the number of non-synonymous sites (n) for each codon, considering the above property of codon changes. We denote by f(i) the fraction of synonymous changes at the ith position of a given codon(i=1,2,3). The s and n for this codon are then given by s = sum( f(i) ) over all i and n = 3-s , respectively (also see Kafatos et al. 1977). For example, in the case of codon TTA (Leu), f(1) = 1/3 (T -> C), f(2) = 0, and f(3) = 1/3 (A -> G). Thus, s = 2/3 and n = 7/3. For a DNA sequence of r codons, the total number of synonymous and nonsynonymous sites is therefore given by S = sum over j (S(i)) and n = (3r - S), respectively, where s(i) the value of s for the ith codon. When two sequences are compared, the averages of S and N for the two sequences are used.
This code implements this concept, as inspired by
https://www.biostars.org/p/60902/
and
https://github.com/a1ultima/hpcleap_dnds/blob/master/py/scripts/changes.py
inputs:
- a gff with CDS annotations
- a gff filtered for the CDS that are covered at a min read depth (get this as downfiltered from the full GFF with bedtools)
- a fasta reference genome
How does this script handle codons with "N" or "-" characters? They are ignored!
=> missing sites count as nonsyn.
"""
import argparse, gzip
# parses command line arguments
def get_commandline_arguments ():
parser = argparse.ArgumentParser()
parser.add_argument("--fasta", required=True, help="name/path of fasta genome reference", metavar="FILE")
parser.add_argument("--full_gff", required=True, help="name/path of GFF annotation file, MUST be complete in terms of CDS, MUST NOT lack any CDS for any included genes", metavar="FILE")
parser.add_argument("--filtered_gff", required=True, help="name/path of filtered GFF annotation file, MUST contain only those CDS to be included", metavar="FILE")
args = parser.parse_args()
return args
def reverse_complement (inseq):
compdict = {"T":"A","C":"G","A":"T","G":"C","N":"N","-":"-"}
outseq = ""
for i in inseq[::-1]:
outseq += compdict[i]
return outseq
def read_fasta (infile):
if infile.endswith(".gz"):
F = gzip.open(infile)
else:
F = open(infile)
fasta_dict = {}
seq = ""
name = "dummy"
for line in F:
if line.startswith(">"):
fasta_dict[name] = seq
name = line.lstrip(">").rstrip("\n").strip()
print name
seq = ""
else:
seq += line.strip("\n")
F.close()
# last record:
fasta_dict[name] = seq
del fasta_dict["dummy"]
return fasta_dict
def parse_gff (infile):
## NOTE: GFF coordinates are 1-based, but python indexes are 0-based
## NOTE: GFF entries within each chromosome MUST be sorted by start coordinate from lowest to highest !!
## GFF orientation: + == as-is of the fasta sequence
## - == reverse complement of the fasta sequence
## the CDS per gene must be stitched together to interpret the codons,
## because sometimes codons are actually split up into different exons, so that the ORF is broken when translating the split codons (frameshift)
if infile.endswith(".gz"):
F = gzip.open(infile)
else:
F = open(infile)
gff_dict = {}
for line in F:
if not line.startswith("#"):
fields = line.strip("\n").split("\t")
if fields[2] == "CDS":
parent = fields[8].split("Parent=")[1]
try:
gff_dict[parent].append(fields)
except KeyError:
gff_dict[parent] = [fields]
return gff_dict
codon_table = {"TTT" : "F", "CTT" : "L", "ATT" : "I", "GTT" : "V",
"TTC" : "F", "CTC" : "L", "ATC" : "I", "GTC" : "V",
"TTA" : "L", "CTA" : "L", "ATA" : "I", "GTA" : "V",
"TTG" : "L", "CTG" : "L", "ATG" : "M", "GTG" : "V",
"TCT" : "S", "CCT" : "P", "ACT" : "T", "GCT" : "A",
"TCC" : "S", "CCC" : "P", "ACC" : "T", "GCC" : "A",
"TCA" : "S", "CCA" : "P", "ACA" : "T", "GCA" : "A",
"TCG" : "S", "CCG" : "P", "ACG" : "T", "GCG" : "A",
"TAT" : "Y", "CAT" : "H", "AAT" : "N", "GAT" : "D",
"TAC" : "Y", "CAC" : "H", "AAC" : "N", "GAC" : "D",
"TAA" : "*", "CAA" : "Q", "AAA" : "K", "GAA" : "E",
"TAG" : "*", "CAG" : "Q", "AAG" : "K", "GAG" : "E",
"TGT" : "C", "CGT" : "R", "AGT" : "S", "GGT" : "G",
"TGC" : "C", "CGC" : "R", "AGC" : "S", "GGC" : "G",
"TGA" : "*", "CGA" : "R", "AGA" : "R", "GGA" : "G",
"TGG" : "W", "CGG" : "R", "AGG" : "R", "GGG" : "G"
}
def interpret_gff_with_region_filter (gff_dict, fastadict, regions_to_retain, gene_whitelist):
# purpose of gene_whitelist: there are some genes whose CDSs are overlapping in the + and - strands,
# to avoid that both get included even though only one is explicity in the regions_to_retain, enforce the gene names with this whitelist!
print "counting syn sites in CDS to be retained"
out_dict = {}
for k,v in gff_dict.items():
if k in gene_whitelist:
anyhit = None
fourfolds = 0
complete_CDSs = ""
abs_coords = []
in_gene_coordinates = []
i = 0
retained_CDSs = []
for fields in v:
chrom = fields[0]
abs_start = int(fields[3])
abs_stop = int(fields[4])
# print abs_start, abs_stop
abs_coords += range(abs_start,abs_stop+1)
cds_length = len( range(abs_start,abs_stop+1) )
in_gene_coordinates += range(i,cds_length+i)
i += cds_length
# ingene_stop = ingene_start + (abs_stop - abs_start)
orientation = fields[6]
try:
complete_CDSs += fastadict[chrom][abs_start-1:abs_stop]
except KeyError:
complete_CDSs += ""
# fo calcualtion of gene length, check if CDS is in regions to retain
hit = site_region_lookup(abs_start, regions_to_retain[chrom])
if hit:
retained_CDSs.append(fields)
anyhit = True
if anyhit: # proceed only if any of the CDS in this gene is within regions_to_retain
if len(complete_CDSs) > 0: # proceed only if the gene was actually in the fasta
# if orientation "-", reverse the absolute coordinates list and reverse-complement the sequence
if orientation == "-":
abs_coords = abs_coords[::-1]
complete_CDSs = reverse_complement (complete_CDSs)
# print orientation, len(v), len(in_gene_coordinates), len(abs_coords)
# print zip(abs_coords,in_gene_coordinates), complete_CDSs
# now estimate syn_sites
cnt = 0
syn_fractions_list = []
for idx in range(0, len(complete_CDSs)-(len(complete_CDSs)%3), 3):
codon = complete_CDSs[idx:idx+3]
if not "N" in codon and not "-" in codon:
syn_fractions_list += count_fraction_syn_changes (codon)
else:
syn_fractions_list += [0.0,0.0,0.0]
print "ignored a codon with undetermined bases", codon
# return coordinates to ascending order if necessary
if orientation == "-":
abs_coords = abs_coords[::-1]
syn_fractions_list = syn_fractions_list[::-1]
for fields in retained_CDSs:
chrom = fields[0]
abs_start = int(fields[3])
abs_stop = int(fields[4])
parent = fields[8].split("Parent=")[1]
cdslen = abs_stop - abs_start
syn_sites = sum( syn_fractions_list[ abs_coords.index(abs_start) : abs_coords.index(abs_stop) +1] )
# print chrom, abs_start, abs_stop, cdslen, syn_sites, syn_sites/cdslen, parent
# print syn_fractions_list[abs_coords.index(abs_start):abs_coords.index(abs_stop)+1]
try:
out_dict[chrom].append( [ abs_start, abs_stop, cdslen, syn_sites, syn_sites/cdslen, parent ] )
except KeyError:
out_dict[chrom] = [ [ abs_start, abs_stop, cdslen, syn_sites, syn_sites/cdslen, parent ] ]
return out_dict
def count_fraction_syn_changes (codon):
# find the proportion of syn mutations among all possible mutations of the codon
# translate all three possible mutations of each site of the codon, score if syn or nonsyn, count the syn ones and divide by three.
# export syn_fraction per site
is_AA = codon_table[codon]
#print "is_codon", codon, is_AA
syn_fractions = []
for i in range(3):
syn_mutations = 0
codonlist = list(codon)
isbase = codonlist[i]
otherbases = [x for x in ["A","C","G","T"] if not x == isbase]
for b in otherbases:
x = codonlist
x[i] = b
# print i, "".join(x)
alter_AA = codon_table["".join(x)]
if alter_AA == is_AA:
syn_mutations += 1
syn_fractions.append( syn_mutations/3.0 )
return syn_fractions
def gff_to_regions (gff_dict):
# return a SORTED list of the regions (sorted ascending by start)
interimdict = {}
for k,v in gff_dict.items():
for fields in v:
chrom = fields[0]
start = int(fields[3])
stop = int(fields[4])
try:
interimdict[chrom].append([start,fields])
except KeyError:
interimdict[chrom] = [[start,fields]]
outdict = {}
out_genelists = {}
for k,v in interimdict.items():
a = sorted(v, key=lambda x:x[0])
for entry in a:
chrom = entry[1][0]
start = int(entry[1][3])
stop = int(entry[1][4])
parent = entry[1][8].split("Parent=")[1]
try:
outdict[chrom].append((start,stop))
out_genelists[chrom].append(parent)
except KeyError:
outdict[chrom] = [(start,stop)]
out_genelists[chrom] = [parent]
for k,v in outdict.items():
x = tuple(v)
outdict[k] = x
# outl = []
# for k,y in outdict.items():
# for v in y:
# outl.append(k + "\t" + str(v[0]) + "\t" + str(v[1]))
# with open("regions_to_retain.txt", "w") as O:
# O.write("\n".join(outl) + "\n")
return outdict, out_genelists
def site_region_lookup(value, ranges):
## https://stackoverflow.com/questions/6053974/python-efficiently-check-if-integer-is-within-many-ranges
## "binary search"
left, right = 0, len(ranges)
while left != right - 1:
mid = left + (right - left) // 2
if value <= ranges[mid - 1][1]: # Check left split max
right = mid
elif value >= ranges[mid][0]: # Check right split min
left = mid
else: # We are in a gap
return None
if ranges[left][0] <= value <= ranges[left][1]:
# Return the range that it fit:
# return [ranges[left][0], ranges[left][1]]
return True
def export_results(thedict):
# within chrom, sort positions ascending, on the start coordinate
outdict = {}
for k,v in thedict.items():
a = sorted(v, key=lambda x:x[0])
outdict[k] = a
outlines = [ "\t".join( ["chrom","start","stop","cdslen", "syn_sites", "syn_sites_frac", "parent_gene"] ) ]
for k, v in outdict.items():
for entry in v:
outlines.append( k + "\t" + "\t".join([str(x) for x in entry]) )
# outlines = header + [k + "\t" + "\t".join([str(x) for x in v]) for k,v in outdict.items()]
with open("CDS_syn_sites.txt", "w") as O:
O.write("\n".join(outlines) + "\n")
######################## MAIN
args = get_commandline_arguments ()
fastadict = read_fasta (args.fasta)
full_gff_dict = parse_gff (args.full_gff)
retainable_CDS = parse_gff (args.filtered_gff)
print "genes to retain with at least one CDS: ", len(retainable_CDS.keys())
regions_to_retain, genelists_per_chrom = gff_to_regions (retainable_CDS)
# filter directly:
chrom_dict_with_synsites_per_CDS = interpret_gff_with_region_filter(full_gff_dict, fastadict, regions_to_retain, set( retainable_CDS.keys()) )
export_results(chrom_dict_with_synsites_per_CDS)