-
Notifications
You must be signed in to change notification settings - Fork 0
/
kegg_analysis.py
executable file
·236 lines (163 loc) · 8.6 KB
/
kegg_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env python
import pandas as pd
import argparse
import sys
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
def read_args(args):
parser = argparse.ArgumentParser()
parser.add_argument('-gap_roary',
'--gene_absence_presence_roary',
nargs = '?',
help = "The table of gene absence and presence from Roary.",
type = str)
parser.add_argument('-p',
'--pangenome',
nargs = '?',
help = 'The pangenome file from Roary.',
type = str)
parser.add_argument('-eggnog',
'--eggnog_annotation',
nargs = '?',
help = 'The annotation file from EggNOG.',
type = str)
parser.add_argument('-cd',
'--core_density',
help = 'proportion of isolates a gene must be in to be core. default [1]',
type = float,
default = 1.0)
parser.add_argument('-hmap',
'--heatmap',
help = 'plotting the heatmap',
action = 'store_true')
parser.add_argument('-o_fig',
'--output_figure',
help = 'Specify the output figure name.',
type = str)
parser.add_argument('-o_eggnog_subset',
'--output_eggnog_subset',
help = 'output eggnog file specific for unique genes',
type = str)
parser.add_argument('-o_unique_genes',
'--output_unique_genes',
help = 'output the unique genes in a file.',
type = str)
parser.add_argument('-s',
'--species',
help = 'Choose the species you want to check for unique genes. [TS_1, TS_2, Moralis]',
type = str)
return vars(parser.parse_args())
def gap_file_parser(gap_file):
gap_df = pd.read_csv(gap_file, header = 0)
return gap_df
species_genomes = {'calc_2082.bin.4': 'TS1', 'calc_2086.bin.1': 'TS1',
'calc_2084.bin.6': 'TS2', 'calc_2090.bin.4': 'TS2',
'calc_2091.bin.4': 'TS2', 'calc_2093.bin.1': 'TS2',
'calc_2094.bin.7': 'TS2', 'calc_2095.bin.8': 'TS2',
'calc_2096.bin.3': 'TS2', 'calc_2099.bin.5': 'TS2',
'GCA_001639275_SGB720': 'Moralis', 'GCF_000529525.1_SGB720': 'Moralis',
'GCF_900289035.1_PRJEB24872_SGB720': 'Moralis', 'GCF_902384065.1_SGB720': 'Moralis',
'calc_2102.bin.3': 'Moralis'}
def pangenome_paser(pangenome_file):
seqs = SeqIO.parse(pangenome_file, 'fasta')
seqs_2 = SeqIO.parse(pangenome_file, 'fasta')
label_link = {}
for seq in seqs:
labels = seq.description.split(' ')
label_link[labels[0]] = labels[1]
for seq in seqs_2:
labels = seq.description.split(' ')
label_link[labels[1]] = labels[0]
return label_link
def eggnog_parser(eggnog_annotation):
eggnog_annotation = pd.read_csv(eggnog_annotation, skiprows = 4, skipfooter = 3, sep = '\t', engine = 'python')
return eggnog_annotation
def select_ko_pathway_genes(eggnog_df):
selected_df = eggnog_df.loc[(eggnog_df['KEGG_ko'] != '-') & (eggnog_df['KEGG_Pathway'] != '-')]
return selected_df
def extracting_gap_file(gap_file, cut_off = 1.0):
# n_exclude: excluding genes which are present in N genomes
gap_file['No. isolates'] = gap_file['No. isolates'].astype(int)
gap_file['No. sequences'] = gap_file['No. sequences'].astype(int)
n_exclude = cut_off * 15
gap_file = gap_file.loc[(gap_file['No. isolates'] < n_exclude) & (gap_file['No. sequences'] < n_exclude)]
subset_df = gap_file[['Gene', 'calc_2082.bin.4', 'calc_2086.bin.1',
'calc_2084.bin.6', 'calc_2090.bin.4',
'calc_2091.bin.4', 'calc_2093.bin.1',
'calc_2094.bin.7', 'calc_2095.bin.8',
'calc_2096.bin.3', 'calc_2099.bin.5',
'GCA_001639275_SGB720', 'GCF_000529525.1_SGB720',
'GCF_900289035.1_PRJEB24872_SGB720', 'GCF_902384065.1_SGB720',
'calc_2102.bin.3']]
return subset_df
def heatmap_maker(df_, opt_fig):
sns.set(font_scale=1)
fig, ax = plt.subplots()
df_heatmap = df_
fig.set_size_inches(8, 12)
ax = sns.heatmap(df_heatmap, xticklabels=True, yticklabels=False)
ax.set(xlabel='', ylabel='KEGG')
fig.savefig(opt_fig, dpi=300, bbox_inches = 'tight')
def unique_gene_identifier(cleaned_gap_df, genomes_list):
# cleaned_gap_file: The gene absence and presence file
# genomes_list: a list of genomes to search for uniques to the rest
# eggnog_file: the annotation file from eggnog.
Moralis_genomes = ['GCA_001639275_SGB720', 'GCF_000529525.1_SGB720',
'GCF_900289035.1_PRJEB24872_SGB720', 'GCF_902384065.1_SGB720',
'calc_2102.bin.3']
remaining_genomes = [i for i in list(cleaned_gap_df) if i not in genomes_list + ['Gene']]
unique_genes = []
print('comparing to M. oralis')
print(genomes_list)
print(Moralis_genomes)
for index, row in cleaned_gap_df.iterrows():
if (row[genomes_list].isna().sum().sum() == 0) and (row[remaining_genomes].isna().sum().sum() == len(remaining_genomes)):
unique_genes.append(row['Gene'])
return unique_genes
if __name__ == '__main__':
pars = read_args(sys.argv)
gap_df = gap_file_parser(pars['gene_absence_presence_roary'])
gene_group_label = pangenome_paser(pars['pangenome'])
eggnog_annotations = eggnog_parser(pars['eggnog_annotation'])
genes_ko_pathway = list(select_ko_pathway_genes(eggnog_annotations)['#query']) # Genes identified with KEGG_ko and KEGG_pathway
gap_genes_ko_pathway = [gene_group_label[i] for i in genes_ko_pathway] # Converting gene label to group label
subset_gap_file = extracting_gap_file(gap_df, pars['core_density'])
abs_pre_in_ko_pathway_df = subset_gap_file[subset_gap_file.Gene.isin(gap_genes_ko_pathway)]
abs_pre_in_ko_pathway_df_4selecting = subset_gap_file[subset_gap_file.Gene.isin(gap_genes_ko_pathway)]
abs_pre_in_ko_pathway_df = abs_pre_in_ko_pathway_df.drop('Gene', axis = 1)
binary_gap_file = abs_pre_in_ko_pathway_df.notnull().astype('int')
if pars['heatmap']:
heatmap_maker(binary_gap_file, pars['output_figure'])
else:
print("Heatmap generating skipped.")
TS_1_genomes = ['calc_2082.bin.4', 'calc_2086.bin.1']
TS_2_genomes = ['calc_2084.bin.6', 'calc_2090.bin.4',
'calc_2091.bin.4', 'calc_2093.bin.1',
'calc_2094.bin.7', 'calc_2095.bin.8',
'calc_2096.bin.3', 'calc_2099.bin.5']
Moralis_genomes = ['GCA_001639275_SGB720', 'GCF_000529525.1_SGB720',
'GCF_900289035.1_PRJEB24872_SGB720', 'GCF_902384065.1_SGB720',
'calc_2102.bin.3']
species_genomes_link = {'TS_1': TS_1_genomes, 'TS_2': TS_2_genomes, 'Moralis': Moralis_genomes}
if pars['output_eggnog_subset'] and pars['species']:
print("We are searching for unique genes (identified with KEGG KO and KEGG Pathway) to {}".format(pars['species']))
u_gene = unique_gene_identifier(abs_pre_in_ko_pathway_df_4selecting, species_genomes_link[pars['species']])
u_gene_group = [gene_group_label[i] for i in u_gene]
eggnog_subset_tab = eggnog_annotations[eggnog_annotations['#query'].isin(u_gene_group)]
eggnog_subset_tab.to_csv(pars['output_eggnog_subset'], sep = '\t', index = False)
else:
print('Skip unique genes (identified with KEGG KO and KEGG Pathway) searching......')
if pars['output_unique_genes'] and pars['species']:
print("We are searching for unique genes to {}".format(pars['species']))
general_u_gene = unique_gene_identifier(subset_gap_file, species_genomes_link[pars['species']])
general_u_gene_label = [gene_group_label[i] for i in general_u_gene]
opt_unique_genes = open(pars['output_unique_genes'], 'w')
for g in general_u_gene_label:
opt_unique_genes.write(g + '\n')
opt_unique_genes.close()
else:
print('Skip general unique genes.......')