-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf_to_derived_allele_freq.py
228 lines (158 loc) · 6.49 KB
/
vcf_to_derived_allele_freq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#!/usr/bin/python
# python 2.7
# 21 Dec 2016
# Mathias Scharmann
# Command line outline
# usage example
# python
# Inputs
# Outputs
#module load python/2.7
import os
import sys
#######
# checks if file exists and breaks script if not
def extant_file(x):
"""
'Type' for argparse - checks that file exists but does not open.
"""
if not os.path.exists(x):
print "Error: {0} does not exist".format(x)
exit()
x = str(x)
return x
# checks for non-UNIX linebreaks
def linebreak_check(x):
if "\r" in open(x, "rb").readline():
print "Error: classic mac (CR) or DOS (CRLF) linebreaks in {0}".format(x)
exit()
# parses command line arguments
def get_commandline_arguments ():
import os
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--popmap", required=True, dest="popmapfile", type=extant_file, help="name and path of the popmap file: 1st column barcode/sample name separated by tab from second column indicating the population", metavar="FILE")
parser.add_argument("--outgr", required=True, help="name of outgroup population", metavar="STRING")
parser.add_argument("--vcf", required=True, dest="vcf", type=extant_file, help="vcf genotypes", metavar="FILE")
parser.add_argument("--max_missing_ingr", required=True, help="site-filter: maximum proportion of individuals with missing genotype per population to include a site, float value")
parser.add_argument("--max_missing_outgr", required=True, help="site-filter: maximum proportion of individuals with missing genotype per population to include a site, float value")
parser.add_argument("--out", required=True, help="name and path of output file", metavar="FILENAME")
args = parser.parse_args()
print args
linebreak_check(args.popmapfile)
return args
########
def check_congruence (popmapfile, vcf):
popmapsamples = []
with open(popmapfile, "r") as INFILE:
for line in INFILE:
fields = line.strip("\n").split("\t")
popmapsamples.append(fields[0])
popmapsamples = set(popmapsamples)
with open(vcf, "r") as INFILE:
for line in INFILE:
if line.startswith("##"):
continue
if line.startswith("#"):
header_line = line.lstrip("#").strip("\n").split("\t")
break
vcf_samples = header_line[9:]
for samp in popmapsamples:
if samp not in vcf_samples:
print "vcf does not contain all popmap samples"
exit()
print "all samples in popmap are in .vcf, good to go!"
#######
def read_popmap (popmapfile):
popdict = {}
with open(popmapfile,"r") as INFILE:
for line in INFILE:
fields = line.strip("\n").split("\t")
try:
popdict[fields[1]].append( fields[0] )
except KeyError:
popdict[fields[1]] = [ fields[0] ]
return popdict
def vcf_to_derived_allele_freqs (vcf, popdict, max_missing_ingr, max_missing_outgr, outgr, output_file):
presence_treshold_ingr = 1.0 - max_missing_ingr
presence_treshold_outgr = 1.0 - max_missing_outgr
with open(vcf, "r") as INFILE:
for line in INFILE:
if line.startswith("##"):
continue
if line.startswith("#"):
header_line = line.lstrip("#").strip("\n").split("\t")
break
samples = header_line[9:]
# E3379_L96 43 . ATCG ATTG,ATCA,GTCA,GTCG 4179.3 . AB=BLABLA 0/0:1:
popdict_vcf_idxes = {}
for k, v in popdict.items():
popdict_vcf_idxes[ k ] = [ header_line.index(x) for x in v ]
ingroup_pops = [ x for x in popdict.keys() if x != outgr ]
pop_order = popdict.keys()
output_lines = [pop_order]
with open(output_file, "w") as OUTF:
OUTF.write( "\n".join(["\t".join(x) for x in output_lines ]) + "\n" )
# now start the main business of walking through the vcf:
print "counting derived allele frequencies: sites have to be fixed in outgroup"
derived_allele_freq_dict = {k : [] for k in pop_order }
with open(vcf, "r") as INFILE:
cnt = 0
for line in INFILE:
if line.startswith("#"):
continue
cnt += 1
c = cnt / 100000.0
if (c).is_integer():
print str(cnt)
write_ouput_chunk ( derived_allele_freq_dict, pop_order, output_file )
# and clean memory by replacing the dict:
derived_allele_freq_dict = {k : [] for k in pop_order }
fields = line.strip("\n").split("\t")
## first check that site occurs in the outgroup and fixed in the outgroup:
a = [ fields[x].split(":")[0].split("/") for x in popdict_vcf_idxes[outgr] ]
b = [x for genotype in a for x in genotype if x != "."]
pop_n = float( len( popdict_vcf_idxes[outgr] ) )*2
if len(b) >= presence_treshold_outgr * pop_n:
if len(set(b)) == 1:
# we have a site that survives max_missing AND is fixed in the outgroup
# now check the other populations:
ancestral_allele = b[0]
derived_allele = [x for x in ["1","0"] if x != ancestral_allele][0]
derived_allele_freq_dict[outgr].append( "0" )
for pop in ingroup_pops:
a = [ fields[x].split(":")[0].split("/") for x in popdict_vcf_idxes[pop] ]
b = [x for genotype in a for x in genotype if x != "."]
pop_n = float( len( popdict_vcf_idxes[pop] ) )*2
if len(b) >= presence_treshold_ingr * pop_n:
p = str( b.count( derived_allele )/float(len(b)) )
else:
p = "-"
derived_allele_freq_dict[pop].append( p )
# final write:
write_ouput_chunk ( derived_allele_freq_dict, pop_order, output_file )
return cnt
def write_ouput_chunk ( derived_allele_freq_dict, pop_order, output_file ):
"""
better to write the output from time to time rather than to clog up the memory with GB!
"""
# clean allele_freq_dict from sites that are not present in at least four populations:
print "filtering sites in allele frequency table for presence in >= 4 populations"
output_lines = []
for i in range( len( derived_allele_freq_dict[derived_allele_freq_dict.keys()[0]] ) ):
column = [ derived_allele_freq_dict[pop][i] for pop in pop_order ]
if len( [ x for x in column if x != "-" ] ) >= 4:
output_lines.append( column ) # now its a row!
with open(output_file, "a") as OUTF:
OUTF.write( "\n".join(["\t".join(x) for x in output_lines ]) + "\n" )
######################## MAIN
args = get_commandline_arguments ()
check_congruence (args.popmapfile, args.vcf)
popdict = read_popmap (args.popmapfile)
max_missing_ingr = float( args.max_missing_ingr )
max_missing_outgr = float( args.max_missing_outgr )
input_sites_cnt = vcf_to_derived_allele_freqs (args.vcf, popdict, max_missing_ingr, max_missing_outgr, args.outgr, args.out)
## check length of output_file:
output_lines = sum(1 for line in open( args.out )) -1
print "retained sites: {0} out of {1}".format( output_lines, input_sites_cnt )
print "Done!"