-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparse_twisst_dist_len.py
393 lines (349 loc) · 13.5 KB
/
parse_twisst_dist_len.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 14:09:48 2020
@author: Scott T. Small
Finding the tree heights or coalescent times between species pairs is
best done using ETE3. This is simple for trees with 1 allele per species but
takes much more time for trees with >1 allele per species. The program twisst
can estimate distances and branch lengths when calculating weights.
twisst.py -t TREES_in -w WEIGHTS_out -D DISTANCE_out -L BRANCH_LEN_out
--outputTopos TOPO_out -g FUN -g LIK -g VAN -g LON -g PAR -g RIV
--groupsFile twisst.43.groups --method complete
This script will parse the Dist and Len files produced by twisst. Namely
specifying topologies or frequencies.
Example
-------
python parse_twisst_dist_len.py -i 3L.blen.txt --blen -f 0.05 --bplot -c 3L
python parse_twisst_dist_len.py -i X.blen.txt --blen -t topo3 topo10 topo11 --bplot
python parse_twisst_dist_len.py -i 3R.dist.txt -p FUN-VAN -t topo3 topo10 --dist --chr 3R
"""
import sys
import argparse
import numpy as np
import pandas as pd
from os import path
from collections import defaultdict
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
mpl.rcParams['pdf.fonttype'] = 42
def boxplot_dist(chrm, div_df, pair, topo, topoplot, pairsplot):
"""plot the divergences
"""
if pairsplot:
label = pair
if topo:
df_list = div_df[topo].loc[pair].sort_index()
x = [l for l in df_list]
poplist = list(df_list.index.values)
else:
df_list = div_df.loc[pair].sort_index()
x = [l for l in df_list]
poplist = list(df_list.index.values)
elif topoplot:
label = topo
x = [l for l in div_df[topo]]
poplist = list(div_df[topo].index.values)
fig, ax = plt.subplots(figsize=(10, 6))
sns.despine(ax=ax, offset=5)
lw = 1.5
box = ax.boxplot(
x,
labels=poplist, patch_artist=True,
medianprops={"color": "k", "linewidth": lw},
whiskerprops={"color": "k"},
capprops={"color": "k"},
showfliers=False,
flierprops={"c": "k", "markersize": 2})
ax.set_ylabel(r'PwDiv', rotation=90, fontsize=12)
ax.set_xticklabels(poplist, rotation=45, fontsize=8)
# set list of colors
colornames = list(mpl.colors.cnames.keys())[:len(poplist)]
for patch, color in zip(box['boxes'], colornames):
patch.set_facecolor(color)
patch.set_linewidth(lw)
fig.savefig("{}.{}.pdf".format(chrm, label), bbox_inches="tight")
return(None)
def boxplot_branchlen(chrm, data, topos, toposfreq, minLen=1):
"""plots the nodeages
"""
if topos:
poplist = topos
else:
poplist = toposfreq
topo_ix = [(int(x.strip("topo"))-1) for x in poplist]
data_arr = np.array(data)[topo_ix]
data_list = [a1[~np.isnan(a1)] for a1 in data_arr]
fig, ax = plt.subplots(figsize=(10, 6))
sns.despine(ax=ax, offset=5)
lw = 1.5
box = ax.boxplot(
data_list,
labels=poplist,
patch_artist=True,
medianprops={"color": "k", "linewidth": lw},
whiskerprops={"color": "k"},
capprops={"color": "k"},
showfliers=False,
flierprops={"c": "k", "markersize": 2})
ax.set_ylabel(r'NodeAge', rotation=90, fontsize=12)
ax.set_xticklabels(poplist, rotation=45, fontsize=8)
# set list of colors
colornames = list(mpl.colors.cnames.keys())[:len(poplist)]
for patch, color in zip(box['boxes'], colornames):
patch.set_facecolor(color)
patch.set_linewidth(lw)
fig.savefig("{}.NodeDepth.pdf".format(chrm), bbox_inches="tight")
return(None)
def parse_divergence(infile):
"""parses out distance file from twisst. returns a distribution for each
species pair this should be applicable to other species
Parameters
----------
infile: str, file
infile with distance data
Returns
-------
div_df: obj
Pandas DataFrame
"""
div_dict = defaultdict(lambda: defaultdict(list))
with open(infile, 'r') as f:
header = next(f).split()
for line in f:
for i, div in enumerate(line.split()):
topo, ind1, ind2 = header[i].split("_")
pairname = "{}-{}".format(ind1, ind2)
if np.isnan(div):
pass
else:
div_dict[topo][pairname].append(float(div))
div_df = pd.DataFrame(div_dict)
return(div_df)
def calc_mean_distance(div_df, topo, pair):
"""Calculates mean and median from
Parameters
----------
div_df: obj
pandas dataframe
topo: str
which topology
pair:str
which pair
Returns
-------
pmean: flt
numpy mean
pmedian: flt
numpy median
quant_U: flt
numpy upper quantile 97.5
qunat_L: flt
numpy lower quantile 2.5
"""
pmean = np.nanmean(div_df[topo].loc[pair])
pmedian = np.nanmedian(div_df[topo].loc[pair])
quant_U = np.nanpercentile(div_df[topo].loc[pair], 97.5)
quant_L = np.nanpercentile(div_df[topo].loc[pair], 2.5)
return pmean, pmedian, quant_U, quant_L
def mean_distances(chrm, div_df, topos, pairs):
"""
Parameters
----------
chrm: str
chromosome name
div_df: dataframe
dataframe of divergence data
topos: List(str)
list of topos for distance calculations
pairs: List(str)
list of pairs for distance calculations
Returns
-------
NONE
"""
# calculate mean distances
if path.exists("{}.meandist.out".format(chrm)):
f = open("{}.meandist.out".format(chrm), 'a')
else:
f = open("{}.meandist.out".format(chrm), 'w')
if topos: # specific topos
for topo in topos:
if pairs: # specific pairs
for pair in pairs:
pmean, pmedian, quant_U, quant_L = calc_mean_distance(div_df, topo, pair)
f.write(f"{topo}:{pair}:{pmean} {pmedian} [{quant_U}-{quant_L}]\n")
else:
for pair in list(div_df.index.values):
pmean, pmedian, quant_U, quant_L = calc_mean_distance(div_df, topo, pair)
f.write(f"{topo}:{pair}:{pmean} {pmedian} [{quant_U}-{quant_L}]\n")
else:
for topo in list(div_df.columns.values):
if pairs: # specific pairs
for pair in pairs:
pmean, pmedian, quant_U, quant_L = calc_mean_distance(div_df, topo, pair)
f.write(f"{topo}:{pair}:{pmean} {pmedian} [{quant_U}-{quant_L}]\n")
else:
for piar in list(div_df.index.values):
pmean, pmedian, quant_U, quant_L = calc_mean_distance(div_df, topo, pair)
f.write(f"{topo}:{pair}:{pmean} {pmedian} [{quant_U}-{quant_L}]\n")
f.close()
def calc_mrca(chrm, topos, blen_data, min_freq, tree_count):
"""Finds the MRCA for each topos
Parameters
----------
chrm: str
chromosome or how to name the outfile
blen_data: List(List)
zipped list of MRCA from topologies
min_freq: flt
min frequency cut off to keep a topo for plotting
Returns
-------
topos_freq: list
topos passing the min_freq cutoff
"""
topos_freq = []
t = open(f"{chrm}.fulldata.out", "w")
with open(f"{chrm}.nodedepth.out", 'w') as f:
for i, mrca in enumerate(blen_data):
topo = f"topo{i + 1}"
nancount = sum(np.isnan(mrca))
freq = (1 - (nancount/tree_count))
if freq >= min_freq:
topos_freq.append(f"{topo}") # pass min_freq
if (topos is None) or (topo in topos):
# print full data
for blen in mrca:
if np.isnan(blen):
pass
else:
t.write(f"{chrm}\t{topo}\t{blen}\n")
mean = np.nanmean(mrca)
median = np.nanmedian(mrca)
quant_low = np.nanpercentile(mrca, 2.5)
quant_up = np.nanpercentile(mrca, 97.5)
f.write(f"{topo} {freq:.3f}:{mean:.3f} {median:.3f} [{quant_low:.3f}-{quant_up:.3f}]\n")
return topos_freq
def sum_branch_lengths(chrm, infile, min_freq, topos, step=10, outgroup_pos=2):
"""calculates the nodeage of the MRCA using a branchlength file produced by
twisst
to alter this for other species you must:
1) change step size
2) change outgroup blens
The blen file is
topo1_col1 topo1_col2 topo1_col3 topo2_col1 topo2_col2 topo2_col3
val val val val val val val val val val val
Parameters
----------
outgroup_pos: int
skip the first 2 as they are outgroup root lengths and we want ingroup
step: int
there are 8 node times and 2 outgroup time ... this is 10
topos: List(str)
list of topos for distance calculations
Returns
-------
blen_data: List(List)
zipped list of MRCA from topologies
topos_freq: list
topos passing the min_freq cutoff
"""
blen_boxplot = []
tree_count = 0 # total window count
topodict = {}
with open(infile, 'r') as blen_file:
for line in blen_file:
if line.startswith("#"):
x = line.split()
topo = x[0][1:]
leaf_names = [leaf.split("--")[1] for leaf in x[3:]]
topodict[topo] = leaf_names
else:
tree_count += 1
start = 0
stop = step
blen = list(map(float, line.split()))
blen_list = []
while stop <= len(blen):
blen_vals = blen[start + outgroup_pos:stop]
if np.isnan(blen_vals[0]):
blen_list.append(np.nan)
else:
topo_key = start//step # topos are column header
blen_topo = topodict[f"topo{topo_key+1}"]
leaf_ix = [blen_topo.index(bt) for bt in blen_topo if bt.count('_') == 0]
# should return 3 or more
blen_max = max([blen_vals[bl] for bl in leaf_ix])
# get all values for leaf index and max
blen_maxix = blen_topo[blen_vals.index(blen_max)]
# here is the leaf name 'Fun'
leaf_dist = [blen_vals[blen_topo.index(x1)] for x1 in blen_topo if blen_maxix in x1]
# now return all vals with instances of 'Fun' in topo
leaf_sum = sum(leaf_dist)
blen_list.append(leaf_sum)
start += step
stop += step
blen_boxplot.append(blen_list)
# boxplot of node depth
blen_data = list(zip(*blen_boxplot))
topos_freq = calc_mrca(chrm, topos, blen_data, min_freq, tree_count)
return blen_data, topos_freq
def parse_args(args_in):
parser = argparse.ArgumentParser(prog="sys.argv[0].py",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-i', "--infile", type=str, required=True,
help="infile from distmatrix twisst")
parser.add_argument('-t', "--topos", nargs='+', required=False,
help="list of topos e.g., topo82")
parser.add_argument('-p', "--pairs", nargs='+', required=False,
help="list of interested pairs, e.g. Fun-Lik")
parser.add_argument('-f', "--minFreq", type=float, default=0.0)
parser.add_argument("--dist", action="store_true",
help="a divergence file")
parser.add_argument("--blen", action="store_true",
help="a file of branch"
"lengths")
parser.add_argument('-c', "--chr", type=str, default='chr',
help="prefix of outfile")
parser.add_argument("--tplot", action="store_true",
help=" # for each topo returns all pairwise distances"
" on 1 plot")
parser.add_argument("--pplot", action="store_true",
help="for each pair returns a boxplot of distances on"
" each topo")
parser.add_argument("--bplot", action="store_true",
help="boxplot of nodeage for specified topos")
return(parser.parse_args(args_in))
if __name__ == "__main__":
args = parse_args(sys.argv[1:])
# =========================================================================
# Gather args
# =========================================================================
infile = args.infile
topos = args.topos
pairs = args.pairs
dist = args.dist
blen = args.blen
chrom = args.chr
min_freq = args.minFreq
tplot = args.tplot
pplot = args.pplot
bplot = args.bplot
# =========================================================================
# Main executions
# =========================================================================
if dist:
div_df = parse_divergence(infile)
# make boxplot
if tplot:
for topo in topos:
boxplot_dist(chrom, div_df, pairs, topos)
if pplot:
for pair in pairs:
boxplot_dist(chrom, div_df, pairs, topos)
elif blen:
blen_data, topos_freq = sum_branch_lengths(chrom, infile, min_freq, topos)
if bplot:
boxplot_branchlen(chrom, blen_data, topos, topos_freq)