-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2023-02-13--investigate-kmer-prefixes.py
130 lines (97 loc) · 3.22 KB
/
2023-02-13--investigate-kmer-prefixes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
import sys
import json
import datetime
import itertools
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
with open("internal-counts.json") as inf:
internal_counts = json.load(inf)
with open("prefix-counts.json") as inf:
prefix_counts = json.load(inf)
fig, axs = plt.subplots(nrows=4,
ncols=2,
sharex=True, sharey=True,
figsize=(6,12),
constrained_layout=True)
plt.suptitle("patterns in read prefixes")
fig.supxlabel("kmers, most to least frequent")
fig.supylabel("proportion of data covered")
for i in range(8):
k = i+1
internal = internal_counts[i]
prefixes = prefix_counts[i]
ax = axs[i//2][i%2]
for data, label in [
[internal_counts[i], "internal"],
[prefix_counts[i], "prefix"]]:
xs = []
ys = []
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
raw = [(count, kmer)
for (kmer, count) in data.items()
if 'N' not in kmer]
raw.sort(reverse=True)
total = sum(data.values())
cumulative = 0
for n, (count, kmer) in enumerate(raw):
cumulative += count
xs.append(100 * n / (4**k-1))
ys.append(100 * cumulative / total)
ax.plot(xs, ys, label=label)
ax.plot([0,100], [0,100], color='black', label="x=y")
ax.legend()
ax.set_title("k=%s" % k)
fig.savefig("kmer-prefix-frequency.png", dpi=180)
plt.clf()
fig, axs = plt.subplots(nrows=4,
ncols=2,
sharex=False, sharey=False,
figsize=(8,16),
constrained_layout=True)
plt.suptitle("top read prefix k-mers")
fig.supylabel("top kmers")
fig.supxlabel("proportion of data covered")
for i in range(8):
k = i+1
internal = internal_counts[i]
prefixes = prefix_counts[i]
ax = axs[i//2][i%2]
raw = [[count, kmer]
for (kmer, count) in prefixes.items()
if 'N' not in kmer][:32]
for row, (_, kmer) in enumerate(raw):
raw[row].append(internal.get(kmer, 0))
raw.sort(reverse=True)
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
internal_total = sum(internal.values())
prefixes_total = sum(prefixes.values())
xs_p = []
xs_i = []
labels = []
for prefix_count, kmer, internal_count in raw:
xs_p.append(100 * prefix_count / prefixes_total)
xs_i.append(100 * internal_count / internal_total)
labels.append(kmer)
if k == 1:
width = 0.4
fontsize=20
elif k == 2:
width = 0.4
fontsize=10
else:
width = 0.4
fontsize=8
y_pos_p = np.arange(len(xs_p))
y_pos_i = y_pos_p + width
ax.barh(y_pos_p, xs_p, width, label='prefix')
ax.barh(y_pos_i, xs_i, width, label='internal')
ax.set_yticks((y_pos_i + y_pos_p)/2, labels=labels, fontsize=fontsize)
ax.invert_yaxis() # labels read top-to-bottom
ax.legend()
ax.set_title("k=%s" % k)
fig.savefig("kmer-prefix-frequency-bars.png", dpi=180)
plt.clf()