-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2023-02-13--plot-top-virus-correlations-to-average.py
executable file
·152 lines (112 loc) · 4.23 KB
/
2023-02-13--plot-top-virus-correlations-to-average.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python3
import sys
import json
import datetime
import itertools
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
# accession -> date, site, nreads
metadata = {}
with open("longest-timeseries-nreads.tsv") as inf:
for line in inf:
accession, date, site, nreads = line.strip().split()
metadata[accession] = (
datetime.datetime.fromisoformat(date), site, int(nreads))
accession_counts = {} # accession -> vid -> count
for accession in metadata:
with open("2023-02-07--top-counts/%s.json" % accession) as inf:
accession_counts[accession] = json.load(inf)
all_vids = set()
for vids in accession_counts.values():
all_vids.update(vids)
all_vids = list(sorted(all_vids))
def to_slug(vid):
return vid.split(".")[0]
first_date, *_, last_date = sorted(
date for (date, _, _) in metadata.values())
all_sites = sorted(set(
site for (_, site, _) in metadata.values()))
first_site, *_, last_site = all_sites
def to_date_color(accession):
date, _, _ = metadata[accession]
return (last_date - date) / (last_date - first_date)
site_colors=cm.rainbow([i/(len(all_sites)-1) for i in range(len(all_sites))])
def to_site_color(accession):
_, site, _ = metadata[accession]
return site_colors[all_sites.index(site)]
fig, ax = plt.subplots(nrows=3,
ncols=3,
figsize=(9,9),
constrained_layout=True)
def relative_abundance(vid, accession):
absolute_abundance = accession_counts[accession][vid]
_, _, total_reads = metadata[accession]
return absolute_abundance/total_reads
def relative_relative_abundance(vid, accession):
return relative_abundance(vid, accession) / np.mean([
relative_abundance(vid, other_accession)
for other_accession in metadata
if other_accession != accession])
def mean_relative_relative_abundance(accession):
return np.mean([relative_relative_abundance(vid, accession)
for vid in all_vids])
plt.suptitle("per-sample abundance vs average abundance")
fig.supxlabel("mean relative relative abundance")
fig.supylabel("relative abundance of this virus")
for i, vid1 in enumerate(all_vids):
slug = to_slug(vid1)
xs = []
ys = []
cs = []
for accession, counts in accession_counts.items():
ys.append(relative_relative_abundance(vid1, accession))
xs.append(mean_relative_relative_abundance(accession))
cs.append(to_date_color(accession))
a = ax[i//3][i%3]
a.scatter(xs, ys, c=cs)
a.set_title("%s" % (slug))
fig.savefig("corr-date-avg.png", dpi=180)
plt.clf()
fig, ax = plt.subplots(nrows=3,
ncols=3,
figsize=(9,9),
constrained_layout=True)
plt.suptitle("per-sample abundance vs average abundance")
fig.supxlabel("mean relative relative abundance")
fig.supylabel("relative abundance of this virus")
for i, vid1 in enumerate(all_vids):
slug = to_slug(vid1)
xs = []
ys = []
cs = []
for accession, counts in accession_counts.items():
ys.append(relative_relative_abundance(vid1, accession))
xs.append(mean_relative_relative_abundance(accession))
cs.append(to_site_color(accession))
a = ax[i//3][i%3]
a.scatter(xs, ys, c=cs)
a.set_title("%s" % (slug))
fig.savefig("corr-site-avg.png", dpi=180)
plt.clf()
fig, ax = plt.subplots(nrows=3,
ncols=3,
figsize=(9,9),
constrained_layout=True)
plt.suptitle("per-sample abundance vs average abundance, with SRR21452135")
fig.supxlabel("mean relative relative abundance")
fig.supylabel("relative abundance of this virus")
for i, vid1 in enumerate(all_vids):
slug = to_slug(vid1)
xs = []
ys = []
cs = []
for accession, counts in accession_counts.items():
ys.append(relative_relative_abundance(vid1, accession))
xs.append(mean_relative_relative_abundance(accession))
cs.append(accession == "SRR21452135")
a = ax[i//3][i%3]
a.scatter(xs, ys, c=cs)
a.set_title("%s" % (slug))
fig.savefig("corr-site-SRR21452135.png", dpi=180)