SnpEff (#153)

* Help file * config file * config file * runners script * config file * test script * test * test * runners script * snake case * snake case * output parameters * modify argument formatting, container setup * fix buf with mv command * avoid boolean_false and fix bug with output files --------- Co-authored-by: Emma Rousseau <[email protected]>
viash-hub · Oct 15, 2024 · 86333c1 · 86333c1
1 parent add1252
commit 86333c1
Show file tree

Hide file tree

Showing 8 changed files with 672 additions and 0 deletions.
diff --git a/src/snpeff/config.vsh.yaml b/src/snpeff/config.vsh.yaml
@@ -0,0 +1,297 @@
+name: snpeff
+description: |
+  Genetic variant annotation, and functional effect prediction toolbox. 
+  It annotates and predicts the effects of genetic variants on genes and 
+  proteins (such as amino acid changes).
+keywords: [ "annotation", "effect prediction", "snp", "variant", "vcf"]
+
+links:
+  repository: https://github.com/pcingola/SnpEff
+  homepage: https://pcingola.github.io/SnpEff/
+  documentation: https://pcingola.github.io/SnpEff/
+references:
+  doi: 10.3389/fgene.2012.00035
+license: MIT
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        type: file
+        description: Input variants file.
+        example: test.vcf
+        required: true
+      - name: --genome_version
+        type: string
+        description: Reference genome version.
+        example: GRCh37.75
+        required: true
+  - name: Outputs
+    arguments:
+      - name: --output
+        type: file
+        description: The output file.
+        example: out.vcf
+        direction: output
+        required: true
+      - name: --summary
+        type: file
+        description: Summary file directory.
+        example: summary_dir
+        direction: output
+      - name: --genes
+        type: file
+        description: Txt file directory.
+        example: genes_dir
+        direction: output
+  - name: Options
+    arguments:
+      - name: --chr
+        type: string
+        description: |
+          Prepend 'string' to chromosome name (e.g. 'chr1' instead of '1'). Only on TXT output.
+      - name: --classic
+        type: boolean_true
+        description: Use old style annotations instead of Sequence Ontology and Hgvs.
+      - name: --csv_stats
+        type: file
+        description: Create CSV summary file.
+      - name: --download
+        type: boolean_true
+        description: Download reference genome if not available.
+      - name: --input_format
+        alternatives: [-i]
+        type: string
+        description: |
+          Input format [ vcf, bed ]. Default: VCF.
+          example: "VCF"
+      - name: --file_list
+        type: boolean_true
+        description: Input actually contains a list of files to process.
+      - name: --output_format
+        alternatives: [-o]
+        type: string
+        description: |
+          Output format [ vcf, gatk, bed, bedAnn ]. Default: VCF.
+        example: "VCF"
+      - name: --stats
+        alternatives: [-s, --htmlStats]
+        type: boolean_true
+        description: Create HTML summary file.
+      - name: --no_stats
+        type: boolean_true
+        description: Do not create stats (summary) file.
+  - name: Results filter options
+    arguments:
+      - name: --fi
+        alternatives: [--filterInterval]
+        type: file
+        description: |
+          Only analyze changes that intersect with the intervals 
+          specified in this file. This option can be used several times.
+      - name: --no_downstream 
+        type: boolean_true
+        description: Do not show DOWNSTREAM changes
+      - name: --no_intergenic
+        type: boolean_true
+        description: Do not show INTERGENIC changes.
+      - name: --no_intron
+        type: boolean_true
+        description: Do not show INTRON changes.
+      - name: --no_upstream
+        type: boolean_true
+        description: Do not show UPSTREAM changes.
+      - name: --no_utr
+        type: boolean_true
+        description: Do not show 5_PRIME_UTR or 3_PRIME_UTR changes.
+      - name: --no
+        type: string
+        description: |
+          Do not show 'EffectType'. This option can be used several times.
+  - name: Annotations options
+    arguments:
+      - name: --cancer 
+        type: boolean_true
+        description: Perform 'cancer' comparisons (Somatic vs Germline).
+      - name: --cancer_samples
+        type: file
+        description: Two column TXT file defining 'original \t derived' samples.
+      - name: --fastaprot
+        type: file
+        description: |
+          Create an output file containing the resulting protein sequences.
+      - name: --format_eff
+        type: boolean_true
+        description: |
+          Use 'EFF' field compatible with older versions (instead of 'ANN').
+      - name: --gene_id
+        type: boolean_true
+        description: Use gene ID instead of gene name (VCF output).
+      - name: --hgvs
+        type: boolean_true
+        description: Use HGVS annotations for amino acid sub-field.
+      - name: --hgvs_old
+        type: boolean_true
+        description: Use old HGVS notation.
+      - name: --hgvs1_letter_aa 
+        type: boolean_true
+        description: Use one letter Amino acid codes in HGVS notation.
+      - name: --hgvs_tr_id
+        type: boolean_true
+        description: Use transcript ID in HGVS notation.
+      - name: --lof
+        type: boolean_true
+        description: |
+          Add loss of function (LOF) and Nonsense mediated decay (NMD) tags.
+      - name: -no_hgvs
+        type: boolean_true
+        description: Do not add HGVS annotations.
+      - name: --no_lof
+        type: boolean_true
+        description: Do not add LOF and NMD annotations.
+      - name: --no_shift_hgvs
+        type: boolean_true
+        description: |
+          Do not shift variants according to HGVS notation (most 3prime end).
+      - name: --oicr
+        type: boolean_true
+        description: Add OICR tag in VCF file.
+      - name: --sequence_ontology
+        type: boolean_true
+        description: Use Sequence Ontology terms.
+  - name: Generic options
+    arguments:
+      - name: --config
+        alternatives: [-c]
+        type: file
+        description: Specify config file
+      - name: --config_option
+        type: string
+        description: Override a config file option (name=value).
+      - name: --debug
+        alternatives: [-d]
+        type: boolean_true
+        description: Debug mode (very verbose).
+      - name: --data_dir
+        type: file
+        description: Override data_dir parameter from config file.
+      - name: --no_download
+        type: boolean_true
+        description: Do not download a SnpEff database, if not available locally.
+      - name: --no_log
+        type: boolean_true
+        description: Do not report usage statistics to server.
+      - name: --quiet
+        alternatives: [-q]
+        type: boolean_true
+        description: Quiet mode (do not show any messages or errors)
+      - name: --verbose
+        alternatives: [-v]
+        type: boolean_true
+        description: Verbose mode.
+  - name: Database options
+    arguments:
+      - name: --canon 
+        type: boolean_true
+        description: Only use canonical transcripts.
+      - name: --canon_list
+        type: file
+        description: |
+          Only use canonical transcripts, replace some transcripts using the 'gene_id         
+          transcript_id' entries in <file>.
+      - name: --tag
+        type: string
+        description: |
+          Only use transcript having a tag 'tagName'. This option can be used multiple times.
+      - name: --no_tag
+        type: boolean_true
+        description: |
+          Filter out transcript having a tag 'tagName'. This option can be used multiple times.
+      - name: --interaction
+        type: boolean_true
+        description: Annotate using interactions (requires interaction database).
+      - name: --interval
+        type: file
+        description: |
+          Use a custom intervals in TXT/BED/BigBed/VCF/GFF file (you may use this option many times).
+      - name: --max_tsl
+        type: integer
+        description: Only use transcripts having Transcript Support Level lower than <TSL_number>.
+      - name: --motif 
+        type: boolean_true
+        description: Annotate using motifs (requires Motif database).
+      - name: --nextprot
+        type: boolean_true
+        description: Annotate using NextProt (requires NextProt database).
+      - name: --no_genome
+        type: boolean_true
+        description: Do not load any genomic database (e.g. annotate using custom files).
+      - name: --no_expand_iub
+        type: boolean_true
+        description: Disable IUB code expansion in input variants.
+      - name: --no_interaction
+        type: boolean_true
+        description: Disable inteaction annotations.
+      - name: --no_motif
+        type: boolean_true
+        description: Disable motif annotations.
+      - name: --no_nextprot
+        type: boolean_true
+        description: Disable NextProt annotations.
+      - name: --only_reg
+        type: boolean_true
+        description: Only use regulation tracks.
+      - name: --only_protein
+        type: boolean_true
+        description: Only use protein coding transcripts.
+      - name: --only_tr
+        type: file
+        description: |
+          Only use the transcripts in this file. Format: One transcript ID per line.
+        example: file.txt
+      - name: --reg
+        type: string
+        description: Regulation track to use (this option can be used add several times).
+      - name: --ss
+        alternatives: [--spliceSiteSize]
+        type: integer
+        description: |
+          Set size for splice sites (donor and acceptor) in bases. Default: 2.
+      - name: --splice_region_exon_size
+        type: integer
+        description: |
+          Set size for splice site region within exons. Default: 3 bases.
+      - name: --splice_region_intron_min
+        type: integer
+        description: |
+          Set minimum number of bases for splice site region within intron. Default: 3 bases.
+      - name: --splice_region_intron_max
+        type: integer
+        description: |
+          Set maximum number of bases for splice site region within intron. Default: 8 bases.
+      - name: --strict
+        type: boolean_true
+        description: Only use 'validated' transcripts (i.e. sequence has been checked).
+      - name: --ud
+        alternatives: [--upDownStreamLen]
+        type: integer
+        description: Set upstream downstream interval length (in bases).
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - type: file
+    path: test_data
+engines:
+  - type: docker
+    image: quay.io/staphb/snpeff:5.2a
+    setup:
+      - type: docker
+        run: |
+          version=$(snpEff -version) && \
+          version_trimmed=$(echo "$version" | awk '{print $1, $2}') && \
+          echo "$version_trimmed" > /var/software_versions.txt
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/snpeff/help.txt b/src/snpeff/help.txt
@@ -0,0 +1,79 @@
+Usage: snpEff [eff] [options] genome_version [input_file]
+
+        variants_file                   : Default is STDIN
+
+Options:
+        -chr <string>                   : Prepend 'string' to chromosome name (e.g. 'chr1' instead of '1'). Only on TXT output.
+        -classic                        : Use old style annotations instead of Sequence Ontology and Hgvs.
+        -csvStats <file>                : Create CSV summary file.
+        -download                       : Download reference genome if not available. Default: true
+        -i <format>                     : Input format [ vcf, bed ]. Default: VCF.
+        -fileList                       : Input actually contains a list of files to process.
+        -o <format>                     : Ouput format [ vcf, gatk, bed, bedAnn ]. Default: VCF.
+        -s , -stats, -htmlStats         : Create HTML summary file.  Default is 'snpEff_summary.html'
+        -noStats                        : Do not create stats (summary) file      
+
+Results filter options:
+        -fi , -filterInterval  <file>   : Only analyze changes that intersect with the intervals specified in this file (you may use this option many times)        
+        -no-downstream                  : Do not show DOWNSTREAM changes
+        -no-intergenic                  : Do not show INTERGENIC changes
+        -no-intron                      : Do not show INTRON changes
+        -no-upstream                    : Do not show UPSTREAM changes
+        -no-utr                         : Do not show 5_PRIME_UTR or 3_PRIME_UTR changes
+        -no <effectType>                : Do not show 'EffectType'. This option can be used several times.
+
+Annotations options:
+        -cancer                         : Perform 'cancer' comparisons (Somatic vs Germline). Default: false
+        -cancerSamples <file>           : Two column TXT file defining 'oringinal \t derived' samples.
+        -fastaProt <file>               : Create an output file containing the resulting protein sequences.
+        -formatEff                      : Use 'EFF' field compatible with older versions (instead of 'ANN').
+        -geneId                         : Use gene ID instead of gene name (VCF output). Default: false
+        -hgvs                           : Use HGVS annotations for amino acid sub-field. Default: true
+        -hgvsOld                        : Use old HGVS notation. Default: false   
+        -hgvs1LetterAa                  : Use one letter Amino acid codes in HGVS notation. Default: false
+        -hgvsTrId                       : Use transcript ID in HGVS notation. Default: false
+        -lof                            : Add loss of function (LOF) and Nonsense mediated decay (NMD) tags.
+        -noHgvs                         : Do not add HGVS annotations.
+        -noLof                          : Do not add LOF and NMD annotations.     
+        -noShiftHgvs                    : Do not shift variants according to HGVS notation (most 3prime end).
+        -oicr                           : Add OICR tag in VCF file. Default: false
+        -sequenceOntology               : Use Sequence Ontology terms. Default: true
+
+Generic options:
+        -c , -config                 : Specify config file
+        -configOption name=value     : Override a config file option
+        -d , -debug                  : Debug mode (very verbose).
+        -dataDir <path>              : Override data_dir parameter from config file.
+        -download                    : Download a SnpEff database, if not available locally. Default: true
+        -nodownload                  : Do not download a SnpEff database, if not available locally.
+        -h , -help                   : Show this help and exit
+        -noLog                       : Do not report usage statistics to server   
+        -q , -quiet                  : Quiet mode (do not show any messages or errors)
+        -v , -verbose                : Verbose mode
+        -version                     : Show version number and exit
+
+Database options:
+        -canon                       : Only use canonical transcripts.
+        -canonList <file>            : Only use canonical transcripts, replace some transcripts using the 'gene_id         transcript_id' entries in <file>.        
+        -tag <tagName>               : Only use transcript having a tag 'tagName'. This option can be used multiple times.
+        -notag <tagName>             : Filter out transcript having a tag 'tagName'. This option can be used multiple times.
+        -interaction                 : Annotate using interactions (requires interaction database). Default: true
+        -interval <file>             : Use a custom intervals in TXT/BED/BigBed/VCF/GFF file (you may use this option many times)
+        -maxTSL <TSL_number>         : Only use transcripts having Transcript Support Level lower than <TSL_number>.
+        -motif                       : Annotate using motifs (requires Motif database). Default: true
+        -nextProt                    : Annotate using NextProt (requires NextProt database).
+        -noGenome                    : Do not load any genomic database (e.g. annotate using custom files).
+        -noExpandIUB                 : Disable IUB code expansion in input variants
+        -noInteraction               : Disable inteaction annotations
+        -noMotif                     : Disable motif annotations.
+        -noNextProt                  : Disable NextProt annotations.
+        -onlyReg                     : Only use regulation tracks.
+        -onlyProtein                 : Only use protein coding transcripts. Default: false
+        -onlyTr <file.txt>           : Only use the transcripts in this file. Format: One transcript ID per line.
+        -reg <name>                  : Regulation track to use (this option can be used add several times).
+        -ss , -spliceSiteSize <int>  : Set size for splice sites (donor and acceptor) in bases. Default: 2
+        -spliceRegionExonSize <int>  : Set size for splice site region within exons. Default: 3 bases
+        -spliceRegionIntronMin <int> : Set minimum number of bases for splice site region within intron. Default: 3 bases
+        -spliceRegionIntronMax <int> : Set maximum number of bases for splice site region within intron. Default: 8 bases
+        -strict                      : Only use 'validated' transcripts (i.e. sequence has been checked). Default: false
+        -ud , -upDownStreamLen <int> : Set upstream downstream interval length (in bases)