-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsecondary_structures.py
75 lines (62 loc) · 2.91 KB
/
secondary_structures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import numpy as np
import matplotlib.pyplot as plt
from mutation_rates import get_mutation_rates, get_genetic_code
def characterise_mutation_site(codon, offset, genetic_code, alphabet):
synonym=0
original_nuc=codon[offset]
original_a=genetic_code[codon]
for letter in alphabet:
if letter!=original_nuc:
if offset==0:
mutated_codon=letter+codon[1:]
if offset==1:
mutated_codon=codon[0]+letter+codon[2]
if offset==2:
mutated_codon=codon[:2]+letter
mutated_a=genetic_code[mutated_codon]
if original_a==mutated_a:
synonym+=1
return synonym
#for each base of the cds compute:
#synonym mutation sites
#non synonym mutation sites
#put them into two lists representing the position of the cds
#for each window on the cds compute the ratio of nonsynonymus mutations over the nonsynonymus sites and
#the ratio of synonymus mutations over synonyms sites, then compute the ratio of the two ratios
def plot_secondary_structures(k,reference,cds_start,cds_end,alphabet,tree_node_data):
res = get_mutation_rates(tree_node_data,reference,cds_start,cds_end,alphabet)
synonym_distribution = res["syn_distribution"]
genetic_code=get_genetic_code(alphabet)
cds=reference[cds_start: cds_end]
synonym_sites_distribution=[0 for i in range(len(synonym_distribution))]
for pos in range(len(synonym_distribution)):
codon_offset=pos%3
codon=cds[pos-codon_offset:pos+(3-codon_offset)]
synonym_site=characterise_mutation_site(codon, codon_offset, genetic_code, alphabet)
synonym_sites_distribution[pos]=synonym_site
scores=np.zeros([len(synonym_distribution)-k])
synonyms=np.zeros([len(synonym_distribution)-k])
for pos in range(len(synonym_distribution)-k):
score=sum(synonym_distribution[pos:pos+k])
synonym_sum=sum(synonym_sites_distribution[pos:pos+k])
#divide by the number of possible mutations in the window
scores[pos]=score
synonyms[pos]=synonym_sum
mean=np.mean(scores)
print('mean of synonym mutation occurring in', k, 'bp :', mean)
sd=np.std(scores)
print('sd:',sd)
sites_without_mutations=[]
for pos in range(len(synonym_distribution)-k):
score=sum(synonym_distribution[pos:pos+k])
z=(score-mean)/sd #normalisation of the observation
if z<-3:#check if the observation is very small
sites_without_mutations.append((int(pos+(k/2)),score))
p=[i for i in range(len(synonym_distribution)-k)]
secondary_structures=plt.figure(figsize=(25,4))
plt.plot(p, scores)
plt.plot(p, synonyms)
plt.legend(['number of synonymous mutations in the tree','number of available synonymous mutations'])
for low_score in sites_without_mutations:
plt.plot(low_score[0],low_score[1],'yo')
return {'sec_struct':sites_without_mutations}