simdata.py

# -*- coding: utf-8 -*-
"""
This script simulates genomic samples, and the corresponding AFS and LD summary
statistics, as described in Boitard et al. (2016). Results are written in the
'res' directory, in two files with suffixes FOO.params and FOO.stat. Each of
these files includes one line per simulated dataset.

Each column in the FOO.params file corresponds to a different parameter:
    mutation_rate recombination_rate pop_size_1 pop_size_2 ... pop_size_I
where I is the number of time windows.

Each column in the FOO.stat file corresponds to a different statistic:
    AFS0 AFS1 ... AFSn LD1 LD2 ... LDp
where n is the number of diploid individuals and p is the number of physical
distance bins.

If option save_msp is set to True, a compressed tar file FOO.msp.tar.bz2,
including the raw simulation output for all simulated datasets and all
segments, is also created.

Simulation settings are similar to those used in our study, but this can
easily be changed by modifying the values in the config file.
In practice, it is strongly recommended to execute this script on many
processors in parallel (with different output names!) and to merge all output
files at the end.
"""
import numpy as np
import popgen_abc
import tarfile
import os
import argparse
try:
    import configparser
except ImportError:
    import ConfigParser as configparser

parser = argparse.ArgumentParser()
parser.add_argument('-o', "--outfile", type=str, required=True,
                    help='outfile name')
parser.add_argument('-c', "--configfile", type=str, required=True,
                    help='config file')
args = parser.parse_args()
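
# Example invocation (file names are illustrative only):
#     python simdata.py -c simdata.ini -o sim_batch1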


def timewindows(nb_times, Tmax, a):
    """Compute the time windows at which population size changes.

    The length of the time windows increases going back in time, at a speed
    determined by the coefficient a.

    Parameters:
        nb_times: int, number of time windows
        Tmax: int, the oldest time window will start at Tmax
        a: float, the length of time windows increases when time increases,
            at a speed that is determined by this coefficient
    Returns:
        times: numpy array
    """
    times = -np.ones(shape=nb_times, dtype='float')
    for i in range(nb_times):
        times[i] = (np.exp(np.log(1 + a * Tmax) *
                           i/(nb_times - 1)) - 1)/a
    print("Population size changes at the following times (in generations): {}"
          .format(times))
    return times
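
# Illustrative example (derived from the formula above, not executed here):
# timewindows(nb_times=3, Tmax=100, a=0.06) gives times of approximately
# [0.0, 27.4, 100.0] generations, i.e. windows that widen going back in time.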


def ldstats(nb_times, times, r, L, per_err, Tmax):
    """Create the bins of physical distance for which the average LD will
    be computed, based on the time windows defined above.

    Parameters:
        nb_times: int, number of time windows
        times: array, time windows (as returned by timewindows)
        r: float, recomb rate per generation per bp
        L: int, size of each segment, in bp
        per_err: int, the length of each interval, as a percentage of the
            target distance
        Tmax: int, the oldest time window will start at Tmax
    Returns:
        interval_list: list
    """
    interval_list = []
    for i in range(nb_times - 1):
        t = (times[i + 1] + times[i])/2
        d = 1/(2*r*t)
        if d <= L:
            interval_list.append([d - per_err * d/100,
                                  d + per_err * d/100])
    t = Tmax + times[nb_times - 1] - times[nb_times - 2]
    # d = 10.0**8/(2 * t)
    d = 1/(2*r*t)
    interval_list.append([d - per_err * d/100, d + per_err * d/100])
    print("Average LD will be computed for the following distance bins (in bp)"
          ": {}".format(interval_list))
    return interval_list
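
# Illustrative example (derived from the formula above, not executed here):
# with r = 1e-8 and a window midpoint of t = 1000 generations, the target
# distance is d = 1/(2*r*t) = 50000 bp; with per_err = 5 the corresponding
# bin is [47500, 52500] bp.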


def simdata(interval_list, nb_rep, nb_times, haps, mmu, r_min, r_max, Nmin,
            Nmax, outfile, nb_seg, L, times, mac, mac_ld, writedir, save_msp):
    """Simulate data under the popsizeabc model.

    Parameters:
        interval_list: list, physical distance bins (as returned by ldstats)
        nb_rep: int, number of simulated datasets. This must be the same as
            that used in simul_data.py.
        nb_times: int, number of time windows
        haps: int, haploid sample size. This must be the same as that used in
            simul_data.py.
        mmu: float, per generation per bp mutation rate
        r_min: float, lower bound for the per generation per bp recomb rate
        r_max: float, upper bound for the per generation per bp recomb rate
        Nmin: float, lower bound for the pop size in each time window
            (log10 scale)
        Nmax: float, upper bound for the pop size in each time window
            (log10 scale)
        outfile: str, root of the output files. This must be the same as
            that used in simul_data.py.
        nb_seg: int, number of independent segments in each dataset. This must
            be the same as that used in simul_data.py.
        L: int, length of one segment, in bp. This must be the same as that
            used in simul_data.py.
        times: array, time windows (as returned by timewindows)
        mac: int, minor allele count threshold for AFS statistics computation
        mac_ld: int, minor allele count threshold for LD statistics
            computation
        writedir: str, directory where the 'res' output directory is created
        save_msp: bool, if True, snp positions and haplotypes corresponding to
            each dataset will be stored in a compressed tar file. This allows
            keeping and potentially re-using the exact genomic samples, rather
            than just the summary statistics, but it requires substantial
            storage (approximately 1 MB per simulated dataset, on average,
            with current parameter values).
    """
    # create the matrices where results (parameters and statistics) are stored
    nb_dist = len(interval_list)
    params = -np.ones(shape=[nb_rep, nb_times + 2], dtype='float')
    results = -np.ones(shape=[nb_rep, nb_dist + haps//2 + 1], dtype='float')
    # missing statistic values are kept at -1. This mostly concerns LD
    # statistics, because it is sometimes impossible to find SNP pairs in a
    # given distance bin.
    print("Total number of statistics: {}\n".format(nb_dist + haps//2 + 1))
    # simulate parameters from the prior: the mutation rate is fixed, the
    # recombination rate is drawn uniformly in [r_min, r_max], and population
    # sizes follow a random walk in log10 scale, bounded by Nmin and Nmax
    mu = mmu * np.ones([nb_rep])
    params[:, 0] = mu
    r = np.random.uniform(low=r_min, high=r_max, size=nb_rep)
    params[:, 1] = r
    for i in range(nb_times):
        if i == 0:
            pop_size = 10**(np.random.uniform(low=Nmin, high=Nmax,
                                              size=nb_rep))
        else:
            multiplication_factor = 10**(np.random.uniform(-1, 1,
                                                           size=nb_rep))
            pop_size = params[:, 1 + i] * multiplication_factor
            pop_size = np.maximum(10**Nmin, pop_size)
            pop_size = np.minimum(pop_size, 10**Nmax)
        params[:, 2 + i] = pop_size
    # simulate summary statistics
    directory = "{}/res".format(writedir)
    if not os.path.exists(directory):
        os.makedirs(directory)
    out_name_2 = "{}/res/{}_n{}_s{}".format(writedir, outfile, haps, nb_seg)
    print('Started the simulations')
    for i in range(nb_rep):
        print("Simulating replicate {}".format(i + 1))
        results[i, :] = popgen_abc.simul_stats_one_rep_macld(
            out_name_2, i, nb_seg, L, haps, times, params[i, :],
            interval_list, mac=mac, mac_ld=mac_ld, save_msp=save_msp)
    # print the results
    fname = "{}_mac{}_macld{}.stat".format(out_name_2, mac, mac_ld)
    np.savetxt(fname, results[0:nb_rep, :], fmt='%.3e')
    np.savetxt("{}.params".format(out_name_2), params[0:nb_rep, :], fmt='%.3e')
    print("Printed the results")
    # tar msp files (if they exist)
    if save_msp:
        mytar = tarfile.open("{}.msp.tar.bz2".format(out_name_2), 'w:bz2')
        for i in range(nb_rep):
            for j in range(nb_seg):
                filename = "{}_rep{}_seg{}.msp".format(out_name_2, i + 1,
                                                       j + 1)
                if os.path.isfile(filename):
                    mytar.add(filename)
                    os.remove(filename)
        mytar.close()
        print("Created tar for msp files")


if __name__ == "__main__":
    config = configparser.ConfigParser()
    config.read(args.configfile)
    sh = "simulation"
    nb_times = config.getint(sh, "nbtimes")
    Tmax = config.getint(sh, "tmax")
    a = config.getfloat(sh, "a")
    times = timewindows(nb_times, Tmax, a)
    per_err = config.getint(sh, "pererr")
    r = config.getfloat(sh, "recomb")
    L = config.getint(sh, "seg_size")
    interval_list = ldstats(nb_times, times, r, L, per_err, Tmax)
    outfile_name = args.outfile
    nb_rep = config.getint(sh, "nb_rep")
    nb_seg = config.getint(sh, "nb_seg")
    haps = config.getint(sh, "haps")
    mac = config.getint(sh, "minallelcount_sfs")
    mac_ld = config.getint(sh, "minallelcount_LD")
    save_msp = config.getboolean(sh, "savemsp")
    # prior distributions
    r_min = config.getfloat(sh, "recombmin")
    r_max = config.getfloat(sh, "recombmax")
    mmu = config.getfloat(sh, "mmu")
    Nmin = config.getfloat(sh, "Nemin")
    Nmax = config.getfloat(sh, "Nemax")
    wrdir = config.get(sh, "writedirectory")
    simdata(interval_list, nb_rep, nb_times, haps, mmu, r_min, r_max, Nmin,
            Nmax, outfile_name, nb_seg, L, times, mac, mac_ld, wrdir, save_msp)
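
# Illustrative config file (the section and key names match those read above;
# the values are examples only and should be adapted to your study):
#
# [simulation]
# nbtimes = 21
# tmax = 130000
# a = 0.06
# pererr = 5
# recomb = 1e-8
# seg_size = 2000000
# nb_rep = 100
# nb_seg = 100
# haps = 50
# minallelcount_sfs = 1
# minallelcount_LD = 1
# savemsp = False
# recombmin = 1e-9
# recombmax = 1e-8
# mmu = 1e-8
# Nemin = 2.0
# Nemax = 5.0
# writedirectory = .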