-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_stats.py
executable file
·334 lines (289 loc) · 12.1 KB
/
run_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 24 13:05:00 2020
@author: Scott T. Small
Main script for generating statistics for ABC and ML training.
depends: parse_sims.py, sims_stats.py, obs_stats.py
Example
-------
run_stats.py sim tests/msout/test.ms -cfg docs/examples/example.stats.ms.cfg
--nprocs 1 --ms msmove
run_stats.py obs --infile vcf/h5 --fasta foo.mask.fa --gff foo.gff --pops 4
--pairs 0-1 0-2 0-3 --stats filet --anc_fasta anc.fasta
Notes
-----
Creates summary stats from coalescent simulations: ms, msmove, discoal, msprime
There are two main modes: sims and obs
Mode 'sims' is for ms-style formats in any file, those produced by run_sims.py
Mode 'obs' is for generating the same stats but with a starting vcf.
"""
import allel
import argparse
import glob
import subprocess
from math import ceil
import multiprocessing
import numcodecs
import numpy as np
import pandas as pd
from pathlib import Path
import sys
from tqdm import tqdm
import zarr
from project.sim_modules.readconfig import read_config_stats
from project.stat_modules.write_stats import headers, stats_out
from project.stat_modules.sequtils import read_ms, add_seqerror
from project.stat_modules.sumstats import PopSumStats
def calc_simstats(ms):
# calc stats
stat_mat = np.zeros([len(ms), header_len])
length_bp = stats_dt["length_bp"]
pfe = stats_dt["perfixder"]
i = 0
for pos, haps in zip(*ms):
pos, haps, counts = add_seqerror(pos, haps, length_bp, pfe, seq_error=True)
stats_ls = []
popsumstats = PopSumStats(pos, haps, counts, stats_dt)
for stat in stats_dt["calc_stats"]:
stat_fx = getattr(popsumstats, stat)
try:
ss = stat_fx()
# print(f"{stat} = {len(ss)}")
except IndexError:
ss = [np.nan] * len(stats_dt["pw_quants"])
stats_ls.extend(ss)
stat_mat[i, :] = stats_ls
i += 1
pop_stats_arr = np.nanmean(stat_mat, axis=0)
return pop_stats_arr
def run_simstats(ms_files, msexe, outpath, nprocs):
""" """
global header_len
global header
# read in all the files
length_bp = stats_dt["length_bp"]
nhaps = stats_dt["num_haps"]
ms_dict = read_ms(ms_files, msexe, nhaps, length_bp)
sim_number = len(ms_dict)
# write headers
outfile = outpath.parent / f"{outpath.stem}.pop_stats.txt"
pops_outfile = open(outfile, 'w')
pops_outfile, header_, header_ls = headers(pops_outfile, stats_dt)
header_len = header_
header = header_ls
if nprocs == 1:
for ms in tqdm(ms_dict.values()):
pop_stats_arr = calc_simstats(ms)
pops_outfile = stats_out(pop_stats_arr, pops_outfile, nprocs)
pops_outfile.close()
else:
# chunk and MP
nk = nprocs * 10
ms_vals = list(ms_dict.values())
chunk_list = [ms_vals[i:i + nk] for i in range(0, len(ms_vals), nk)]
chunksize = ceil(nk/nprocs)
pool = multiprocessing.Pool(nprocs)
for i, args in enumerate(chunk_list):
pop_stats_arr = pool.map(calc_simstats, args, chunksize=chunksize)
pops_outfile = stats_out(pop_stats_arr, pops_outfile, nprocs)
print(i)
pool.close()
pops_outfile.close()
def calc_obsStats(vcfpath, chrom, pops, coord_bed, zarrpath, outpath):
"""Calculate stats from a VCF file."""
# if reuse_zarr is true
if zarrpath.exists():
zarrfile = zarrpath
else:
zarrfile = zarrpath
allel.vcf_to_zarr(str(vcfpath), str(zarrpath), group=chrom, fields='*', alt_number=2,
log=sys.stdout, compressor=numcodecs.Blosc(cname='zstd', clevel=1, shuffle=False))
# load pop info
panel = pd.read_csv(pops, sep='\t', usecols=['sampleID', 'population'])
# load zarr
callset = zarr.open_group(str(zarrfile), mode='r')
samples = callset[f'{chrom}/samples'][:]
samples_list = list(samples)
samples_callset_index = [samples_list.index(s) for s in panel['sampleID']]
panel['callset_index'] = samples_callset_index
panel = panel.sort_values(by='callset_index')
# load gt
pos = allel.SortedIndex(callset[f'{chrom}/variants/POS'])
gt = allel.GenotypeArray(callset[f'{chrom}/calldata/GT'])
# separate gt for each population
ix_s = 0
pop_dt = {}
pop_ix = []
for i, p in enumerate(panel["population"].unique()):
p_ix = panel[panel["population"] == p]["callset_index"].values
ix_e = len(p_ix)*2 + ix_s
pop_ix.append(list(range(ix_s, ix_e)))
pop_dt[p] = gt.take(p_ix, axis=1).to_haplotypes()
ix_s = ix_e
# combine and transpose
haps = np.concatenate(list(pop_dt.values()), axis=1).T
# prep progress bar
ln_count = 0
with open(coord_bed, 'r') as cb:
for line in cb:
if not line.startswith("chrom"):
ln_count += 1
progressbar = tqdm(total=ln_count, desc="window numb", unit='window')
# update stats_dt
stats_dt["num_haps"] = haps.shape[0]
stats_dt["pop_config"] = pop_ix
stats_dt["length_bp"] = int(line.split()[-1]) # may be shorter than expected due to last window
stats_dt["reps"] = ln_count
# write headers
outfile = outpath.parent / f"{outpath.stem}.Obs.pop_stats.txt"
pops_outfile = open(outfile, 'w')
pops_outfile, header_, header_ls = headers(pops_outfile, stats_dt,
pop_names=list(pop_dt.keys()),
obs=True)
# calc stats
# TODO: parallel
chrom_ls = []
i = 0
stat_mat = np.zeros([ln_count, len(header_ls)-1])
with open(coord_bed, 'r') as cb:
for line in cb:
if not line.startswith("chrom"):
cb_lin = line.split()
chrom = cb_lin[0]
chrom_ls.append(chrom)
start = int(cb_lin[1])
stop = int(cb_lin[2])
len_bp = stop - start
stats_dt["length_bp"] = len_bp
sites = int(cb_lin[3])
try:
pos_ix = pos.locate_range(start, stop)
except KeyError:
continue
pos_t = pos[pos_ix] - start
haps_t = haps[:, pos_ix]
counts_t = haps_t.sum(axis=0).astype(int)
# run stats
stats_ls = [start, stop, sites]
popsumstats = PopSumStats(pos_t, haps_t, counts_t, stats_dt)
for stat in stats_dt["calc_stats"]:
stat_fx = getattr(popsumstats, stat)
try:
ss = stat_fx()
# print(f"{stat} = {len(ss)}")
except IndexError:
ss = [np.nan] * len(stats_dt["pw_quants"])
stats_ls.extend(ss)
try:
stat_mat[i, :] = stats_ls
i += 1
progressbar.update()
except ValueError:
continue
# write stats out
stat_mean = np.round(np.nanmean(stat_mat, axis=0), 5)
stats_str = "\t".join(map(str, stat_mean[3:]))
pops_outfile.write(f"mean_{chrom}\t{int(stat_mat[0, 0])}\t{stop}\t{np.sum(stat_mat[:, 2])}\t{stats_str}\n")
for stat in range(stat_mat.shape[0]):
chrom = chrom_ls[stat]
start = int(stat_mat[stat, 0])
stop = int(stat_mat[stat, 1])
sites = int(stat_mat[stat, 2])
rd = [round(num, 5) for num in stat_mat[stat, 3:]]
stats_str = "\t".join(map(str, rd))
pops_outfile.write(f"{chrom}\t{start}\t{stop}\t{sites}\t{stats_str}\n")
progressbar.close()
pops_outfile.close()
return outfile
def parse_args(args_in):
"""Parse args.
Parameters
----------
args_in : TYPE
DESCRIPTION.
Returns
-------
argsDict : TYPE
DESCRIPTION.
"""
parser = argparse.ArgumentParser(description="calculate summary statistics"
" from simulated data or observed data")
# parser._positionals.title = f"enter 'python {sys.argv[0]} modeName -h' for modeName's help message"
subparsers = parser.add_subparsers(help='sub-command help')
parser_a = subparsers.add_parser('sim', help="Generate stats from sim data",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_b = subparsers.add_parser('obs', help="Generate stats from data in a VCF",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_a.set_defaults(mode='sim')
# parser_a._positionals.title = "required arguments"
parser_a.add_argument('ms_file', help="path to simulation output file"
"must be same format used by Hudson\'s ms. Can be "
"a directory of files but the suffix needs to be .msout")
parser_a.add_argument('-cfg', "--configFile",
help="path to config file, see examples")
parser_a.add_argument("--ms", type=str, required=True,
help=" full path to discoal/msbgs/scrm exe")
parser_a.add_argument('-o', "--outfile", default="./",
help="path to file where stats will be written")
parser_a.add_argument("--nprocs", type=int, default=1,
help="processors for MP")
# calculate stats from a VCF file
parser_b.set_defaults(mode='obs')
# parser_b._positionals.title = "required arguments"
parser_b.add_argument('vcfFileIn', help="VCF format file containing data"
"for one chromosome arm (other arms will be ignored)")
parser_b.add_argument('chr_arm', help="Exact name of the chromosome arm for"
"which feature vectors will be calculated")
parser_b.add_argument('--pops_file', help="individual names to pops as found in"
"VCF file")
parser_b.add_argument('-cfg', "--configFile",
help="path to config stats file, see examples")
parser_b.add_argument('--coords_bed', required=True,
help="Path to a bed file of coordinates for stats")
parser_b.add_argument('--zarr_path', type=str,
help="Path to a zarr file. If exists will reuse if not"
" it will build one at that location")
parser_b.add_argument('-o', "--outfile", default="./",
help="path to file where stats will be written")
args = parser.parse_args(args_in)
argsDict = vars(args)
return argsDict
def main():
"""Run main function."""
argsDict = parse_args(sys.argv[1:])
# =========================================================================
# Gather args
# =========================================================================
if argsDict["mode"] == "sim":
mspath = Path(argsDict["ms_file"])
configFile = argsDict["configFile"]
msexe = argsDict["ms"]
outpath = Path(argsDict["outfile"])
nprocs = argsDict["nprocs"]
else:
vcfpath = Path(argsDict["vcfFileIn"])
chrom = argsDict["chr_arm"]
configFile = argsDict["configFile"]
pops = argsDict["pops_file"]
coord_bed = argsDict["coords_bed"]
zarrpath = Path(argsDict["zarr_path"])
outpath = Path(argsDict["outfile"])
# =========================================================================
# Config parser
# =========================================================================
global stats_dt
stats_dt = read_config_stats(configFile)
# =========================================================================
# Main executions
# =========================================================================
if argsDict["mode"] == "sim":
if mspath.is_dir():
# will open many files, suffix with msout
ms_files = list(mspath.glob("*.msout"))
else:
ms_files = [mspath]
run_simstats(ms_files, msexe, outpath, nprocs)
elif argsDict["mode"] == "obs":
outfile = calc_obsStats(vcfpath, chrom, pops, coord_bed, zarrpath, outpath)
if __name__ == "__main__":
main()