-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf2able.py
167 lines (153 loc) · 5.89 KB
/
vcf2able.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 1 15:27:50 2017
vcf2able.py
@author: scott
"""
import argparse
from collections import defaultdict
import numpy as np
parser = argparse.ArgumentParser()
parser.add_argument('-i', "--vcfin", type=str, required=True,
help="vcffile")
parser.add_argument('-b', "--block_size", type=int, required=True,
help="size of linkage blocks")
parser.add_argument('-d', "--pedfile", type=str, required=True,
help="path to pedfile with pop and individuals")
parser.add_argument('-p', "--pop", type=str, nargs='+', required=False,
default='', help="which pops, if blank uses all")
parser.add_argument('-s', "--size", type=str, nargs='+', required=False,
default='', help="how many from each pop, blank uses all")
parser.add_argument("--bin", action="store_true",
help="create pseudo-ms with binary instead of gt requires"
"AA tag in the VCF")
args = parser.parse_args()
# TODO: make binary parser for bin option
def firstchrom(vcfin):
"""
"""
with open(vcfin, 'r') as vcf:
for line in vcf:
if line.startswith("#CHROM"):
pop_iix = line.strip().split()
line = vcf.next()
chrom = line.strip().split()[0]
pos1 = line.strip().split()[1]
break
return(chrom, int(pos1), pop_iix)
def makeable(i, pop, p, x, abledict):
"""
"""
ra = x[3]
aa = x[4]
gt = x[p].split(":")[0]
if gt.count("0") == 2:
abledict[pop][2*i] += ra
abledict[pop][2*i+1] += ra
elif gt.count("1") == 2:
abledict[pop][2*i] += aa
abledict[pop][2*i+1] += aa
else:
if gt == "0/1" or gt == "0|1":
abledict[pop][2*i] += ra
abledict[pop][2*i+1] += aa
elif gt == "1/0" or gt == "1|0":
abledict[pop][2*i] += aa
abledict[pop][2*i+1] += ra
else:
abledict[pop][2*i] += "N"
abledict[pop][2*i+1] += "N"
return(abledict)
def able2vcf(vcfin, block, popinfo, pops, sizes, rand=True):
"""
"""
# parse ped
peddict = defaultdict(list)
with open(popinfo, 'r') as ped:
for line in ped:
if line.strip():
x = line.strip().split()
if (x[0] in pops) or (pops == '0'):
peddict[x[0]].append(x[1])
else:
continue
poplist = pops
if sizes:
sizes = map(int, sizes)
if rand:
for pop in poplist:
i = pops.index(pop)
peddict[pop] = np.random.choice(peddict[pop], sizes[i],
replace=False)
else:
for pop in poplist:
i = pops.index(pop)
peddict[pop] = peddict[pop][:sizes[i]]
# get initial
chrom, pos1, pop_iix = firstchrom(vcfin)
# make able file
abledict = {}
for pop in poplist:
abledict[pop] = [''] * 2 * len(peddict[pop])
b = 0
with open("{}.{}.{}.able.in".format("-".join(pops), "-".join(map(str, sizes)), block), 'w') as able:
with open(vcfin, 'r') as vcf:
for line in vcf:
if not line.startswith("#"):
x = line.strip().split()
pos = int(x[1])
if (pos <= (block + b)) and (x[0] == chrom):
for pop in poplist:
for i, sample in enumerate(peddict[pop]):
p = pop_iix.index(sample)
abledict = makeable(i, pop, p, x, abledict)
else:
# header
able.write("\n//\n#{}_{}-{}\n".format(chrom, b,
block+b))
if abledict[pop][0] == '':
# monomorphic
pass
else:
# polymorphic write out seqs
for pop in poplist:
for samp in abledict[pop]:
able.write("{}\n".format(samp))
# restart with blank set of seq
for pop in poplist:
abledict[pop] = [''] * 2 * len(peddict[pop])
# catch first instance that trigger else statement
if x[0] != chrom:
b = 0
chrom = x[0]
else:
b += block
if (pos <= (block + b)) and (x[0] == chrom):
for pop in poplist:
abledict[pop] = [''] * 2 * len(peddict[pop])
for i, sample in enumerate(peddict[pop]):
p = pop_iix.index(sample)
abledict = makeable(i, pop, p, x, abledict)
else:
# nothing in that block, monomorphic so print empty
able.write("\n//\n#{}_{}-{}\n".format(chrom, b,
block+b))
b += block
# update chrom
chrom = x[0]
def able2vcf_bin(vcf, block, popinfo, pops):
"""Make binary file for bSFS, requires AA in VCF format field, else default
to REF allele as ancestral
"""
return(None)
if __name__ == "__main__":
vcf = args.vcfin
block = args.block_size
popinfo = args.pedfile
pops = args.pop
sizes = args.size
if args.bin:
able2vcf_bin(vcf, block, popinfo, pops, sizes)
else:
able2vcf(vcf, block, popinfo, pops, sizes)