Update workflow cmpb #8

draeger-lab · May 23, 2024 · 50c6d0d · 50c6d0d
1 parent 339ab0b
commit 50c6d0d
Show file tree

Hide file tree

Showing 3 changed files with 257 additions and 264 deletions.
diff --git a/dev/old_workflow_cmpb.py b/dev/old_workflow_cmpb.py
@@ -0,0 +1,228 @@
+###########
+# old stuff
+###########
+# @TODO:
+# despite some changes, this is still mainly the old version, 
+# which WILL NOT work properly with the new refinegems update - yet
+# address TODOs before release!!!!!
+
+def run_old(configpath=None):
+    """main function to run the program"""
+    print("RefineGEMs provides functions to curate and investigate genome-scale metabolic models!")
+    print("Author:", __author__)
+
+    config = save_cmpb_user_input(configpath)
+    today = date.today().strftime("%Y%m%d")
+    log_file = Path(config["out_path"],f'rg_{str(today)}.log')
+
+    # check if the output directory is already present, if not create it
+    if not os.path.isdir(config['out_path']):
+        logging.info('Given out_path is not yet a directory, creating ' + config['out_path'])
+        os.makedirs(config['out_path'])
+    if config['visualize']:
+        dir = Path(config['out_path'],'visualization')
+        if not os.path.isdir(dir): 
+            os.makedirs(dir) 
+
+    print(f'The following logs are saved to: {log_file}')
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s:%(name)s] %(message)s",
+        handlers=[
+            logging.FileHandler(log_file),
+            logging.StreamHandler()
+        ]
+    )
+    logging.getLogger('cobra').setLevel(logging.WARNING)
+    logging.getLogger('requests').setLevel(logging.WARNING)
+    logging.getLogger('urllib3').setLevel(logging.WARNING)
+    logging.info('----------- New run of refineGEMs -----------')
+
+    logging.info(f'Your output will be saved to: {config["out_path"]}')
+
+    if config['multiple']:
+        logging.info('Growth simulation for multiple models: ')
+        models_cobra = rg.utility.io.load_model(config['multiple_paths'], 'cobra')
+        growth_all = rg.analysis.growth.growth_analysis(model_cobra,
+                    config['mediapath'],
+                    namespace = config['namespace'],
+                    retrieve='report')
+        growth_all.save(Path(config['out_path']))
+        logging.info('Multiple model growth simulation results are saved to ' +  str(Path(config['out_path'], 'GrowthSimReport')))
+
+        ###########
+
+        # visualizations
+        # .....................................
+        # @TODO: more than half of the functions work differently now
+        if config['visualize']:
+            models_libsbml = rg.io.load_multiple_models(config['multiple_paths'], 'libsbml')
+            ini_plot = rg.comparison.plot_initial_analysis(models_libsbml).get_figure()
+            sbo_fig_all = rg.comparison.plot_rea_sbo_multiple(models_libsbml).get_figure()
+            venn_reac = rg.comparison.plot_venn(models_cobra, 'reaction', True).get_figure()
+            venn_metab = rg.comparison.plot_venn(models_cobra, 'metabolite', True).get_figure()
+            heatmap = rg.comparison.plot_heatmap_dt(growth_all[['model', 'medium', 'doubling_time [min]']])
+            native_heatmap = rg.comparison.plot_heatmap_native(growth_all)
+            # saving them
+            sbo_fig_all.savefig(config['out_path'] + 'visualization/' + 'all_ReacPerSBO_' + str(today) + '.png', bbox_inches='tight')
+            venn_reac.savefig(config['out_path'] + 'visualization/' + 'all_ReacOverlap_' + str(today) + '.png', bbox_inches='tight')
+            venn_metab.savefig(config['out_path'] + 'visualization/' + 'all_MetabOverlap_' + str(today) + '.png', bbox_inches='tight')
+            heatmap_dt_prefix = 'heatmap_dt_additives_anaerobic_' if config['anaerobic_growth'] else 'heatmap_dt_additives_'
+            heatmap.savefig(config['out_path'] + 'visualization/' + heatmap_dt_prefix + str(today) + '.png')
+            native_heatmap_prefix = 'heatmap_native_anaerobic_' if config['anaerobic_growth'] else 'heatmap_native_'
+            native_heatmap.savefig(config['out_path'] + 'visualization/' + native_heatmap_prefix + str(today) + '.png', bbox_inches='tight')
+            ini_plot.savefig(config['out_path'] + 'visualization/' + 'model_status_' + str(today) + '.png', bbox_inches='tight')
+        # .....................................
+
+    if config['single']:        
+        try:    
+            model_cobra, errors = cobra.io.sbml.validate_sbml_model(config['model'])
+            #logging.info(errors)
+        except (OSError):
+            model_cobra = None
+            logging.info('No or no valid model given, please enter a valid path in the model field in the config file.')
+
+        if config['keggpathways']:
+            model_libsbml, non_kegg = rg.curation.pathways.kegg_pathways(config['model'])
+            file = open(config['out_path'] + model_libsbml.getId() + '_reac_wo_kegg_' + str(today) + '.txt','w')
+            for reaction in non_kegg:
+                file.write(reaction+"\n")
+            file.close()
+            logging.info('Kegg Pathways were added to the model as groups. Reactions that have no KEGG annotation are denoted in ' + model_libsbml.getId() + '_reac_wo_kegg.txt')
+
+        else:
+            model_libsbml = rg.utility.io.load_model(config['model'],'libsbml')
+
+        if config['sboterms']:
+            if config['visualize']:
+                # .....................................
+                # @TODO: where us the functions now?
+                sbo_fig = rg.investigate.plot_rea_sbo_single(model_libsbml).get_figure()
+                # .....................................
+                # saving the created visualizations
+                sbo_fig.savefig(config['out_path'] + 'visualization/' + str(model_cobra.id) + '_ReacPerSBO_beforeUpdate_' + str(today) + '.png', bbox_inches='tight')
+            model_libsbml = rg.sboann.sbo_annotation(model_libsbml)
+            logging.info('SBO Terms updated for ' + model_libsbml.getId())
+
+        if config['charge_corr']:
+            model_libsbml, multiple_charges = rg.curation.charges.correct_charges_modelseed(model_libsbml)
+            pd.DataFrame.from_dict(multiple_charges, orient='index').to_csv(config['out_path'] + model_libsbml.getId() + '_mulchar_' + str(today) + '.csv', sep=',', header=False)
+            logging.info('Charges were corrected for ' + model_libsbml.getId() + '. A table with metabolites with multiple charges can be found under ' + model_libsbml.getId() + '_mulchar_' + str(today) + '.csv')
+
+        if config['man_cur']:
+            if config['man_cur_type'] == 'gapfill':
+                gapfill = rg.utility.io.load_manual_gapfill(config['man_cur_table'])
+                model_libsbml = rg.curation.curate.add_reactions_from_table(model_libsbml, gapfill, config['entrez_email'])
+                logging.info('Manual gap filling was done for ' + model_libsbml.getId())
+            elif config['man_cur_type'] == 'metabs':
+                man_ann = rg.utility.io.load_manual_annotations(config['man_cur_table'])
+                model_libsbml = rg.curation.curate.update_annotations_from_table(model_libsbml, man_ann)
+                model_libsbml = rg.curation.curate.update_annotations_from_others(model_libsbml)
+                logging.info('Manual update of annotations was done for ' + model_libsbml.getId())
+
+        if config['gap_analysis'] and config['gapfill_model']:
+            filename = f'{config["out_path"]}{model_libsbml.getId()}_gap_analysis_{str(today)}'
+            if config['gap_analysis_params'].get('db_to_compare') not in ('BioCyc', 'KEGG+BioCyc'):
+                logging.warning('Currently, only the result from the \'BioCyc\' or \'KEGG+BioCyc\' runs can be directly added to a model.')
+                gap_analysis = rg.curation.gapfill.gap_analysis(model_libsbml, config['gap_analysis_params'], filename)
+                logging.info(f'Gap analysis for {model_libsbml.getId()} with {config["gap_analysis_params"].get("db_to_compare")} was performed.')
+                logging.info(f'Complete Excel table is in file: {filename}.')
+            else:
+                gapfill = rg.curation.gapfill.gapfill(model_libsbml, config['gap_analysis_params'], filename)
+                gap_analysis_stats = gapfill[0][0]
+                logging.info(f'Statistics on missing entites for {model_libsbml.getId()}:')
+                logging.info(gap_analysis_stats)
+                logging.info(f'Complete Excel table is in file: {filename}.')
+                model_libsbml = gapfill[-1]
+                logging.info(f'Gaps were filled in {model_libsbml.getId()}.')
+        elif config['gap_analysis']:
+            filename = f'{config["out_path"]}{model_libsbml.getId()}_gap_analysis_{str(today)}'
+            gap_analysis = rg.curation.gapfill.gap_analysis(model_libsbml, config['gap_analysis_params'], filename)
+            logging.info(f'Gap analysis for {model_libsbml.getId()} with {config["gap_analysis_params"].get("db_to_compare")} was performed.')
+            if  config["gap_analysis_params"].get("db_to_compare") != 'KEGG':
+                logging.info(f'Statistics on missing entites for {model_libsbml.getId()}:')
+                logging.info(gap_analysis[0])
+            logging.info(f'Complete Excel table is in file: {filename}.')
+        elif config['gapfill_model']:
+            model_libsbml = rg.curation.gapfill.gapfill_model(model_libsbml, config['gap_analysis_file'])
+            logging.info(f'Gaps were filled in {model_libsbml.getId()}.')
+
+        if config['polish']:
+            model_libsbml = rg.curation.polish.polish(model_libsbml, config['entrez_email'], config['id_db'], config['gff_file'], 
+                                             config['protein_fasta'], config['lab_strain'], config['organismid'], config['out_path'])
+            logging.info(model_libsbml.getId() + ' has been polished')
+
+        if config['biomass']:
+            result = rg.curation.biomass.check_normalise_biomass(model_cobra)
+            if result:
+                model_libsbml = result
+                logging.info(model_libsbml.getId() + '\'s biomass function has been checked.')
+
+        mods = [config['keggpathways'], config['sboterms'], config['charge_corr'], config['man_cur'], config['gapfill_model'], config['polish'], config['biomass']]
+
+        if any(mods):
+            if config['model_out'] == 'stdout':   
+                config['model_out'] = config['out_path'] + model_libsbml.getId() + '_modified_' + str(today) + '.xml'
+
+            try:
+                rg.utility.io.write_model_to_file(model_libsbml, config['model_out'])
+            except (OSError) as e:
+                logging.info(e)
+                logging.info("Model could not be saved...")
+
+            if model_cobra:                                          
+                try:    
+                    model_cobra, errors = cobra.io.sbml.validate_sbml_model(config['model_out'])
+                    logging.info(errors)
+                except (OSError) as e:
+                    logging.info(e)
+                    model_cobra = None
+                    logging.info('Model was invalidated during curation steps.')
+
+        if model_cobra:
+            logging.info(model_cobra.id + ' will be investigated.')
+            # .....................................................
+            # @TODO: different !!!!!
+            name, reac, metab, genes = rg.analysis.investigate.initial_analysis(model_libsbml) 
+            orphans, deadends, disconnected = rg.investigate.get_orphans_deadends_disconnected(model_cobra)
+            mass_unbal, charge_unbal = rg.investigate.get_mass_charge_unbalanced(model_cobra)
+            egc = rg.investigate.get_egc(model_cobra)
+            if config['visualize']:
+                logging.info('All visualizations can be found in the subfolder "visualization".')
+                sbo_fig = rg.investigate.plot_rea_sbo_single(model_libsbml).get_figure()
+
+                # saving the created visualizations
+                sbo_fig.savefig(config['out_path'] + 'visualization/' + str(model_cobra.id) + '_ReacPerSBO_' + str(today) + '.png', bbox_inches='tight')
+            # .....................................................
+            # .....................................................
+            # @TODO different function
+            if config['memote']:
+                score = rg.investigate.get_memote_score(rg.investigate.run_memote(model_cobra))
+            # .....................................................
+            if config['modelseed']:
+                charge_mismatch, formula_mismatch = rg.curation.db_access.modelseed.compare_to_modelseed(model_cobra)
+            # .....................................................
+            # @TODO function changed
+            if config['media']:
+                growth_sim = rg.growth.get_growth_selected_media(model_cobra, config['media'], config['growth_basis'], config['anaerobic_growth'])
+            # .....................................................
+            if config['memote']:
+                information = [[name], [reac], [metab], [genes], [score], orphans, deadends, disconnected, mass_unbal, charge_unbal]
+                model_params = pd.DataFrame(information, ['model name', '#reactions', '#metabolites', '#genes', 'memote score', 'orphans', 'deadends', 'disconnected', 'mass unbalanced', 'charge unbalanced']).T
+            else:
+                information = [[name], [reac], [metab], [genes], orphans, deadends, disconnected, mass_unbal, charge_unbal]
+                model_params = pd.DataFrame(information, ['model name', '#reactions', '#metabolites', '#genes', 'orphans', 'deadends', 'disconnected', 'mass unbalanced', 'charge unbalanced']).T
+            with pd.ExcelWriter(config['out_path'] + name + '_' + str(today) + '.xlsx') as writer:  
+                model_params.to_excel(writer, sheet_name='model params', index=False)
+                growth_sim_name = 'anaerobic growth simulation' if config['anaerobic_growth'] else 'growth simulation'
+                if config['media']: #@TODO: needed???
+                    growth_sim.to_excel(writer, sheet_name=growth_sim_name, index=False)
+                egc.to_excel(writer, sheet_name='EGC test', index=False)
+                if config['modelseed']:
+                    charge_mismatch.to_excel(writer, sheet_name='charge mismatches', index=False)
+                    formula_mismatch.to_excel(writer, sheet_name='formula mismatches', index=False)
+                logging.info('Single model analyses results are saved to ' + name + '_analyses_' + str(today) + '.xlsx')
+        else:
+            logging.info('No valid model, investigation aborted!')
+