From 6a18bb7258e2992dd4137a73a104e60753b26cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gwendolyn=20O=2E=20D=C3=B6bel?= <81755070+GwennyGit@users.noreply.github.com> Date: Wed, 10 Jul 2024 15:57:53 +0200 Subject: [PATCH] Updated config.yaml & .rst for cmpb #5 #8 #11 --- docs/source/cmpb/cmpb-config.rst | 135 +++++++++++----------- src/specimen/data/config/cmpb_config.yaml | 74 ++++++------ 2 files changed, 105 insertions(+), 104 deletions(-) diff --git a/docs/source/cmpb/cmpb-config.rst b/docs/source/cmpb/cmpb-config.rst index c629c4b..7216b25 100644 --- a/docs/source/cmpb/cmpb-config.rst +++ b/docs/source/cmpb/cmpb-config.rst @@ -1,100 +1,101 @@ -CMPB Configuration File -======================= +``CMPB`` Configuration File +=========================== -Below, the configuration file with the underlying defaults, is listed. +Below, the configuration file with the underlying defaults, is displayed. .. code-block:: yaml # Configuration file for the SPECIMEN CMPB pipeline - # Explaination for default parameters: - # with the value __USER__ are required to be specified by the user - # with the value USER are required only under specific cases + # Meaning of the default parameters: + # The value __USER__ indicates parameters required to be specified by the user + # The value USER indicates parameters required only in specific cases - # meta info: + # Meta info: # model: USER # organism: USER # date: USER # author: USER - # input for the pipeline + # Input for the pipeline # ---------------------- input: - modelpath: NULL # optional, path to a model. - # If not given, runs CarveMe - annotated_genome: __USER__ # required, path to the annotated genome file - mediapath: __USER__ # path to a media config to tests growth with + modelpath: NULL # Optional, path to a model. + # If not given, runs CarveMe -> Future update! 
+ annotated_genome: __USER__ # Required, path to the annotated genome file + mediapath: __USER__ # Path to a media config to test growth with - # general options + # General options # --------------- general: - dir: './' # Path/Name of a directory to save output to - colours: 'YlGn' # set the colour scheme for the plots - # should be a valid matplotlib continuous color palette - namespace: BiGG # Namespace to use for the model - # Possible identifiers, currently: BiGG - save_all_models: True # save all models (models for each step) - memote_always_on: False # run memote after every step - stats_always_on: False # calculate the model statistics after every step - # below are options used by multiple steps - refseq_gff: USER # Path to RefSeq GFF file: Required for gap analysis with 'KEGG'. - # Can be optionally provided for cm-polish. - kegg_organism_id: USER # KEGG ID of the organism: Required for gap analysis with KEGG. - # Can be optionally provided for cm-polish. - - # part-specific options + dir: './' # Path/Name of a directory to save output to + colours: 'YlGn' # Set the colour scheme for the plots + # should be a valid matplotlib continuous color palette + namespace: BiGG # Namespace to use for the model + # Possible identifiers, currently: BiGG + save_all_models: True # Save a model per step + memote_always_on: False # Run MEMOTE after every step + stats_always_on: False # Calculate the model statistics after every step + + # Below are options used by multiple steps + refseq_gff: USER # Path to RefSeq GFF file: Required for gap analysis with 'KEGG'. + # Can be optionally provided for cm-polish. + kegg_organism_id: USER # KEGG ID of the organism: Required for gap analysis with 'KEGG'. + # Can be optionally provided for cm-polish. 
+ + # Part-specific options # --------------------- - # polish a CarveMe model - # only neccessary, if the mode will or has been build with CarveMe - # will only be used, if model is indeed a CarveMe model + # Polish a CarveMe model + # Only necessary, if the model will be or has been built with CarveMe + # Will only be used, if model is indeed a CarveMe model cm-polish: - email: USER # User Mail to use for Entrez - protein_fasta: USER # optional, except for is_lab_strain: True. - # The path to the protein FASTA used to create the CarveMe model. - is_lab_strain: False # whether the users strain originates from a lab - # Needs to be set to ensure that protein IDs get the 'bqbiol:isHomologTo' qualifier - # & to set the locus_tag to the ones obtained by the annotation + email: USER # User Mail to use for Entrez + protein_fasta: USER # Optional, except for 'is_lab_strain: True'. + # The path to the protein FASTA used to create the CarveMe model. + is_lab_strain: False # Whether the user's strain originates from a lab + # Needs to be set to ensure that protein IDs get the 'bqbiol:isHomologTo' qualifier + # & to set the locus_tag to the ones obtained by the annotation + # (Warning: Might cause issues if annotation was not performed with NCBI PGAP!) 
- # gapfilling, optional + # Filling gaps, optional gapfilling: - ### Automatic gap filling ### - # All parameters are required for all db_to_compare choices except biocyc_files which is not required for 'KEGG' - gap_fill_params: + ### Automatic gap filling ### + # All parameters are required for all 'db_to_compare' choices except 'biocyc_files' which is not required for 'KEGG' + gap_fill_params: db_to_compare: USER # One of the choices KEGG|BioCyc|KEGG+BioCyc biocyc_files: - - USER # Path to TXT file containing a SmartTable from BioCyc with the columns 'Accession-2' 'Reaction of gene' - - USER # Path to TXT file containing a SmartTable with all reaction relevant information (*) - - USER # Path to TXT file containing a SmartTable with all metabolite relevant information (+) - - USER # Path to protein FASTA file used as input for CarveMe (Required to get the protein IDs from the locus tags) - # (*) 'Reaction' 'Reactants of reaction' 'Products of reaction' 'EC-Number' 'KEGG Reaction' 'MetaNetX' 'Reaction-Direction' 'Spontaneous?' - # (+) 'Compound' 'Object ID' 'Chemical Formula' 'InChI-Key' 'ChEBI' - gap_fill_file: NULL # Path to Excel file with which gaps in model should be filled - # Either obtained by running gapfilling/Created by hand with the same structure as the result file from gapfilling - # Example Excel file to fill in by hand: data/modelName_gapfill_analysis_date_example.xlsx + - USER # Path to TXT file containing a SmartTable from BioCyc with the columns 'Accession-2' 'Reaction of gene' + - USER # Path to TXT file containing a SmartTable with all reaction relevant information (*) + - USER # Path to TXT file containing a SmartTable with all metabolite relevant information (+) + - USER # Path to protein FASTA file used as input for CarveMe (Required to get the protein IDs from the locus tags) + # (*) 'Reaction' 'Reactants of reaction' 'Products of reaction' 'EC-Number' 'KEGG Reaction' 'MetaNetX' 'Reaction-Direction' 'Spontaneous?' 
+ # (+) 'Compound' 'Object ID' 'Chemical Formula' 'InChI-Key' 'ChEBI' + gap_fill_file: NULL # Path to Excel file with which gaps in model should be filled + # Either obtained by running gapfilling/Created by hand with the same structure as the result file from gapfilling + # Example Excel file to fill in by hand: refinegems/src/refinegems/example/example_inputs/modelName_gapfill_analysis_date_example.xlsx - # add KEGG pathways as groups, optional - kegg_pathway_groups: True # decide, whether to run this or not + # Add KEGG pathways as groups, optional + kegg_pathway_groups: True - # resolve duplicates + # Resolve duplicates duplicates: - # three possible option for the resolvement of duplicates for the following model entities: - # - check: check for duplicates and simply report them - # - remove: check for and remove duplicates from the model (if possible) - # - skip: skip the resolvement - reactions: remove - metabolites: remove - # additional remove unused metabolites (reduces possible knowledge base) - remove_unused_metabs: False + # Three possible options for the resolvement of duplicates for the following model entities: + # - check: Check for duplicates and simply report them + # - remove: Check for and remove duplicates from the model (if possible) + # - skip: Skip the resolvement + reactions: remove + metabolites: remove + # Additionally, remove unused metabolites (possibly reduces knowledge-base) + remove_unused_metabs: False # BOFdat / Biomass objective function BOF: - run_bofdat: False - # if BOFdat should be run, - # fill out the params below - bofdat_params: - full_genome_sequence: USER # whole genome sequence + run_bofdat: False + # if BOFdat should be run, + # fill out the params below + bofdat_params: + full_genome_sequence: USER # Whole genome sequence dna_weight_fraction: USER # DNA weight fraction for the organism weight_fraction: USER # Ezyme/ion weight fractions for the organism - diff --git a/src/specimen/data/config/cmpb_config.yaml 
b/src/specimen/data/config/cmpb_config.yaml index d74bd08..576f354 100644 --- a/src/specimen/data/config/cmpb_config.yaml +++ b/src/specimen/data/config/cmpb_config.yaml @@ -1,58 +1,60 @@ # Configuration file for the SPECIMEN CMPB pipeline -# Explaination for default parameters: -# with the value __USER__ are required to be specified by the user -# with the value USER are required only under specific cases +# Meaning of the default parameters: +# The value __USER__ indicates parameters required to be specified by the user +# The value USER indicates parameters required only in specific cases -# meta info: +# Meta info: # model: USER # organism: USER # date: USER # author: USER -# input for the pipeline +# Input for the pipeline # ---------------------- input: - modelpath: NULL # optional, path to a model. - # If not given, runs CarveMe - annotated_genome: __USER__ # required, path to the annotated genome file - mediapath: __USER__ # path to a media config to tests growth with + modelpath: NULL # Optional, path to a model. + # If not given, runs CarveMe -> Future update! 
+ annotated_genome: __USER__ # Required, path to the annotated genome file + mediapath: __USER__ # Path to a media config to test growth with -# general options +# General options # --------------- general: dir: './' # Path/Name of a directory to save output to - colours: 'YlGn' # set the colour scheme for the plots + colours: 'YlGn' # Set the colour scheme for the plots # should be a valid matplotlib continuous color palette namespace: BiGG # Namespace to use for the model # Possible identifiers, currently: BiGG - save_all_models: True # save all models (models for each step) - memote_always_on: False # run memote after every step - stats_always_on: False # calculate the model statistics after every step - # below are options used by multiple steps + save_all_models: True # Save a model per step + memote_always_on: False # Run MEMOTE after every step + stats_always_on: False # Calculate the model statistics after every step + + # Below are options used by multiple steps refseq_gff: USER # Path to RefSeq GFF file: Required for gap analysis with 'KEGG'. # Can be optionally provided for cm-polish. - kegg_organism_id: USER # KEGG ID of the organism: Required for gap analysis with KEGG. + kegg_organism_id: USER # KEGG ID of the organism: Required for gap analysis with 'KEGG'. # Can be optionally provided for cm-polish. - -# part-specific options + +# Part-specific options # --------------------- -# polish a CarveMe model -# only neccessary, if the mode will or has been build with CarveMe -# will only be used, if model is indeed a CarveMe model +# Polish a CarveMe model +# Only necessary, if the model will be or has been built with CarveMe +# Will only be used, if model is indeed a CarveMe model cm-polish: email: USER # User Mail to use for Entrez - protein_fasta: USER # optional, except for is_lab_strain: True. + protein_fasta: USER # Optional, except for 'is_lab_strain: True'. # The path to the protein FASTA used to create the CarveMe model. 
- is_lab_strain: False # whether the users strain originates from a lab + is_lab_strain: False # Whether the user's strain originates from a lab # Needs to be set to ensure that protein IDs get the 'bqbiol:isHomologTo' qualifier # & to set the locus_tag to the ones obtained by the annotation + # (Warning: Might cause issues if annotation was not performed with NCBI PGAP!) -# gapfilling, optional +# Filling gaps, optional gapfilling: ### Automatic gap filling ### - # All parameters are required for all db_to_compare choices except biocyc_files which is not required for 'KEGG' + # All parameters are required for all 'db_to_compare' choices except 'biocyc_files' which is not required for 'KEGG' gap_fill_params: db_to_compare: USER # One of the choices KEGG|BioCyc|KEGG+BioCyc biocyc_files: @@ -64,20 +66,20 @@ gapfilling: # (+) 'Compound' 'Object ID' 'Chemical Formula' 'InChI-Key' 'ChEBI' gap_fill_file: NULL # Path to Excel file with which gaps in model should be filled # Either obtained by running gapfilling/Created by hand with the same structure as the result file from gapfilling - # Example Excel file to fill in by hand: data/modelName_gapfill_analysis_date_example.xlsx + # Example Excel file to fill in by hand: refinegems/src/refinegems/example/example_inputs/modelName_gapfill_analysis_date_example.xlsx -# add KEGG pathways as groups, optional -kegg_pathway_groups: True # decide, whether to run this or not +# Add KEGG pathways as groups, optional +kegg_pathway_groups: True -# resolve duplicates +# Resolve duplicates duplicates: - # three possible option for the resolvement of duplicates for the following model entities: - # - check: check for duplicates and simply report them - # - remove: check for and remove duplicates from the model (if possible) - # - skip: skip the resolvement + # Three possible options for the resolvement of duplicates for the following model entities: + # - check: Check for duplicates and simply report them + # - remove: Check for and remove 
duplicates from the model (if possible) + # - skip: Skip the resolvement reactions: remove metabolites: remove - # additional remove unused metabolites (reduces possible knowledge base) + # Additionally, remove unused metabolites (possibly reduces knowledge-base) remove_unused_metabs: False # BOFdat / Biomass objective function @@ -86,8 +88,6 @@ BOF: # if BOFdat should be run, # fill out the params below bofdat_params: - full_genome_sequence: USER # whole genome sequence + full_genome_sequence: USER # Whole genome sequence dna_weight_fraction: USER # DNA weight fraction for the organism weight_fraction: USER # Ezyme/ion weight fractions for the organism - -