Skip to content

Commit

Permalink
Major fix in mapping boxes IDs and positions
Browse files Browse the repository at this point in the history
in orthologs array
Also changed default of "step" to 40
- KEGG's API can't handle much more
  • Loading branch information
iquasere committed May 26, 2023
1 parent 3d2e133 commit 07df6af
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 38 deletions.
64 changes: 36 additions & 28 deletions keggcharter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from keggpathway_map import KEGGPathwayMap, expand_by_list_column

__version__ = "0.5.0"
__version__ = "0.5.1"


def get_arguments():
Expand Down Expand Up @@ -56,7 +56,7 @@ def get_arguments():
"--resume", action="store_true", default=False,
help="If data inputed has already been analyzed by KEGGCharter.")
parser.add_argument(
"--step", default=150, type=int, help="Number of IDs to submit per request through the KEGG API.")
"--step", default=40, type=int, help="Number of IDs to submit per request through the KEGG API [40]")
parser.add_argument('-v', '--version', action='version', version='KEGGCharter ' + __version__)

required_named = parser.add_argument_group('required named arguments')
Expand Down Expand Up @@ -117,10 +117,15 @@ def read_input_file(file):
return pd.read_csv(file, sep='\t')


def further_information(data, kegg_column=None, ko_column=None, ec_column=None, step=150):
def further_information(data, output, kegg_column=None, ko_column=None, ec_column=None, step=150):
"""
Adds KEGG, KO and EC information to the input data
"""
data = get_cross_references(data, kegg_column=kegg_column, ko_column=ko_column, ec_column=ec_column, step=step)
main_column = kegg_column if kegg_column is not None else ko_column if ko_column is not None else ec_column
data = condense_data(data, main_column)
data.to_csv(output, sep='\t', index=False)
timed_message(f'Results saved to {output}')
return data, main_column


Expand All @@ -142,8 +147,8 @@ def id2id(input_ids, column, output_column, output_ids_type, step=150, desc=''):
try:
result = pd.concat([result, pd.read_csv(
kegg_link(output_ids_type, input_ids[i:j]), sep='\t', names=[column, output_column])])
except:
print(f'IDs conversion broke at index: {i}')
except Exception as e:
print(f'IDs conversion broke at index: {i}; Error: {e}')
if output_ids_type == 'ko':
result[output_column] = result[output_column].apply(lambda x: x.strip('ko:'))
result[column] = result[column].apply(lambda x: x.strip('ec:'))
Expand All @@ -154,11 +159,11 @@ def id2id(input_ids, column, output_column, output_ids_type, step=150, desc=''):


def ids_interconversion(data, column, ids_type='kegg', step=150):
ids = list(set(data[data[column].notnull()][column]))
ids = data[column].dropna().unique().tolist()
base_desc = 'Converting %i %s to %s through the KEGG API'
if ids_type == 'kegg':
# sometimes Kegg IDs come as mfc:BRM9_0145;mfi:DSM1535_1468; (e.g. from UniProt IDs mapping).
# Should be no problem since both IDs likely will always represent same functions
# Should be no problem to select the first one since both IDs likely will represent the same functions
trimmed_ids = [ide.split(';')[0] for ide in ids]
relational = pd.DataFrame([ids, trimmed_ids]).transpose()
new_ids = id2id(
Expand All @@ -171,9 +176,11 @@ def ids_interconversion(data, column, ids_type='kegg', step=150):
new_ids = id2id(
ids, column, 'EC number (KEGGCharter)', 'enzyme', desc=base_desc % (len(ids), 'KOs', 'EC numbers'),
step=step)
else: # ids_type == 'ec':
elif ids_type == 'ec':
new_ids = id2id(
ids, column, 'KO (KEGGCharter)', 'ko', desc=base_desc % (len(ids), 'EC numbers', 'KOs'), step=step)
else:
raise ValueError('ids_type must be one of: kegg, ko, ec')
return pd.merge(data, new_ids, on=column, how='left')


Expand Down Expand Up @@ -412,17 +419,6 @@ def read_input():
sys.exit(kegg_metabolic_maps().to_string())
data = read_input_file(args.file)
timed_message('Data successfully read.')
return args, data


def main():
args, data = read_input()

if not args.resume:
data, main_column = further_information(
data, kegg_column=args.kegg_column, ko_column=args.ko_column, ec_column=args.ec_column, step=args.step)
data.to_csv(f'{args.output}/KEGGCharter_results.tsv', sep='\t', index=False)
timed_message(f'Results saved to {args.output}/KEGGCharter_results.tsv')

if args.input_quantification:
data['Quantification (KEGGCharter)'] = [1] * len(data)
Expand All @@ -433,12 +429,23 @@ def main():
args.taxa_column = 'Taxon (KEGGCharter)'
args.taxa_list = args.input_taxonomy

metabolic_maps = args.metabolic_maps.split(',')
args.metabolic_maps = args.metabolic_maps.split(',')
args.genomic_columns = args.genomic_columns.split(',')
if args.transcriptomic_columns:
args.transcriptomic_columns = args.transcriptomic_columns.split(',')

ko_column = 'KO (KEGGCharter)' if not hasattr(args, 'ko_column') or not args.ko_column else args.ko_column
return args, data


def main():
args, data = read_input()

if not args.resume:
data, main_column = further_information(
data, f'{args.output}/KEGGCharter_results.tsv', kegg_column=args.kegg_column, ko_column=args.ko_column,
ec_column=args.ec_column, step=args.step)

ko_column = args.ko_column if args.ko_column else 'KO (KEGGCharter)'

if args.resume:
data = pd.read_csv(f'{args.output}/data_for_charting.tsv', sep='\t')
Expand All @@ -452,27 +459,28 @@ def main():
data.to_csv(f'{args.output}/data_for_charting.tsv', sep='\t', index=False)
if not args.input_taxonomy:
taxon_to_mmap_to_orthologs = download_resources(
data, args.resources_directory, args.taxa_column, metabolic_maps)
data, args.resources_directory, args.taxa_column, args.metabolic_maps)
h = open(f"{args.output}/taxon_to_mmap_to_orthologs.json", "w")
json.dump(taxon_to_mmap_to_orthologs, h)
else:
taxon_to_mmap_to_orthologs = None

# '00190': ['Keratinibaculum paraultunense']
mmaps2taxa = get_mmaps2taxa(taxon_to_mmap_to_orthologs) if not args.input_taxonomy else None
timed_message(f'Creating KEGG Pathway representations for {len(metabolic_maps)} metabolic pathways.')
for i in range(len(metabolic_maps)):
pathway, ec_list = get_pathway_and_ec_list(args.resources_directory, metabolic_maps[i])
timed_message(f'Creating KEGG Pathway representations for {len(args.metabolic_maps)} metabolic pathways.')
for i in range(len(args.metabolic_maps)):
pathway, ec_list = get_pathway_and_ec_list(args.resources_directory, args.metabolic_maps[i])
if pathway is not None and ec_list is not None:
timed_message(f'[{i + 1}/{len(metabolic_maps)}] {pathway.title}')
timed_message(f'[{i + 1}/{len(args.metabolic_maps)}] {pathway.title}')
chart_map(
f'{args.resources_directory}/kc_kgmls/ko{metabolic_maps[i]}.xml', ec_list, data,
f'{args.resources_directory}/kc_kgmls/ko{args.metabolic_maps[i]}.xml', ec_list, data,
taxon_to_mmap_to_orthologs, mmaps2taxa, output=args.output,
ko_column=ko_column, taxa_column=args.taxa_column,
genomic_columns=args.genomic_columns, transcriptomic_columns=args.transcriptomic_columns,
number_of_taxa=args.number_of_taxa,
grey_taxa=('Other taxa' if args.input_taxonomy is None else args.input_taxonomy))
else:
print(f'Analysis of map {metabolic_maps[i]} failed! Map might have been deleted,'
print(f'Analysis of map {args.metabolic_maps[i]} failed! Map might have been deleted, '
f'for more info raise an issue at https://github.com/iquasere/KEGGCharter/issues')
i += 1

Expand Down
22 changes: 12 additions & 10 deletions keggpathway_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,9 @@ def __init__(self, pathway, ec_list):
:param ec_list: (list) - list from ECs CSV
"""
self.pathway = pathway
self.ko_boxes = {}
self.set_pathway(ec_list)
self.ortho_ids_to_pos = {self.pathway.orthologs[i].id: i for i in range(len(self.pathway.orthologs))}

def __getattr__(self, item):
m = getattr(self.pathway, item, None)
Expand All @@ -208,18 +210,18 @@ def set_pathway(self, ec_list):
"""
Set pathway with KEGG Pathway ID
"""
self.ko_boxes = {}
for i in range(len(self.orthologs)):
set_bgcolor(self.orthologs[i], "#ffffff") # set all boxes to white
# self.set_fgcolor(self.pathway.orthologs[i], "#ffffff") # This might be helpful in the future, if an additional layer of marking is needed
orthologs_in_box = [ide[3:] for ide in self.orthologs[
i].name.split()] # 'ko:K16157 ko:K16158 ko:K16159' -> ['K16157', 'K16158', 'K16159']
for ortholog in orthologs_in_box:
if ortholog not in self.ko_boxes.keys():
self.ko_boxes[ortholog] = []
self.ko_boxes[ortholog].append(i) # {'K16157':[0,13,432], 'K16158':[4,13,545]}
self.ko_boxes[ortholog] = [self.orthologs[i].id]
else:
self.ko_boxes[ortholog].append(self.orthologs[i].id) # {'K16157':[0,13,432], 'K16158':[4,13,545]}

# Set name as EC number
# Set name as most abundant EC number, if no EC numbers are available use KO
ecs = ec_list[i].split(',')
if len(ecs) > 0:
self.orthologs[i].graphics[0].name = max(set(ecs), key=ecs.count).split(':')[1]
Expand Down Expand Up @@ -260,23 +262,24 @@ def pathway_box_list(self, taxa_in_box, dic_colors, maxshared=10):
:param maxshared: int - maximum number of taxa sharing one box
"""
for box in taxa_in_box.keys():
boxidx = self.ortho_ids_to_pos[box] # get box index
nrboxes = len(taxa_in_box[box])
if nrboxes > maxshared:
nrboxes = maxshared

paired = True if nrboxes % 2 == 0 else False
for i in range(nrboxes):
newrecord = create_box_heatmap(
self.orthologs[box], nrboxes, i * 2 - (nrboxes - 1) if paired else i - int(nrboxes / 2),
self.orthologs[boxidx], nrboxes, i * 2 - (nrboxes - 1) if paired else i - int(nrboxes / 2),
# if nrboxes = 8, i * 2 - (nrboxes - 1) = -7,-5,-3,-1,1,3,5,7
# if nrboxes = 9, i - int(nrboxes / 2) = -4,-3,-2,-1,0,1,2,3,4
paired=paired)
if newrecord != 1:
newrecord.bgcolor = dic_colors[taxa_in_box[box][i]]
self.orthologs[box].graphics.append(newrecord)
self.orthologs[boxidx].graphics.append(newrecord)
# TODO - must check more deeply why sometimes width is None
if self.orthologs[box].graphics[0].width is not None:
create_tile_box(self.orthologs[box])
if self.orthologs[boxidx].graphics[0].width is not None:
create_tile_box(self.orthologs[boxidx])

def pathway_boxes_differential(self, dataframe, log=True, colormap="coolwarm"):
"""
Expand Down Expand Up @@ -417,7 +420,6 @@ def genomic_potential_taxa(
box2taxon[box].append(grey_taxa)
else:
box2taxon[box] = [grey_taxa]

name = self.name.split(':')[-1]
name_pdf = f'{output_basename}_{name}.pdf'
self.to_pdf(name_pdf)
Expand Down Expand Up @@ -455,7 +457,7 @@ def differential_expression_sample(
"""
if mmaps2taxa is not None:
data = data[data[taxa_column].isin(mmaps2taxa[self.name.split('ko')[1]])]
df = data.groupby(ko_column)[samples + [ko_column]].sum()
df = data.groupby(ko_column)[samples + [ko_column]].sum(numeric_only=True)
df = df[df.any(axis=1)]
df['Boxes'] = [self.ko_boxes[ko] if ko in self.ko_boxes.keys() else np.nan for ko in df.index]
df = df[df['Boxes'].notnull()]
Expand Down

0 comments on commit 07df6af

Please sign in to comment.