annovar necessary changes

ghga-de · Apr 5, 2024 · 6ac42b6 · 6ac42b6
1 parent f6c84bf
commit 6ac42b6
Show file tree

Hide file tree

Showing 21 changed files with 112 additions and 571 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,4 +14,5 @@ test.xml
 test_output/
 tests/data/
 work/
-.github/CODEOWNERS-tmp
+.github/CODEOWNERS-tmp
+bin/vcfparser.pyc
diff --git a/assets/config/convertToStdVCF.json b/assets/config/convertToStdVCF.json
@@ -17,12 +17,12 @@
     "RE": {
       "number": "0",
       "type": "Flag",
-      "description": "variant in UCSC_27Sept2013_RepeatMasker.bed.gz region and/or SimpleTandemRepeats_chr.bed.gz region, downloaded from UCSC genome browser and/or variant in segmental duplication region, annotated by annovar"
+      "description": "variant in repeat_masker file region and/or simple_tandemrepeats file region, downloaded from UCSC genome browser and/or variant in segmental duplication region, annotated by annovar"
     },
     "BL": {
       "number": "0",
       "type": "Flag",
-      "description": "variant in DAC-Blacklist from ENCODE or in DUKE_EXCLUDED list, both downloaded from UCSC genome browser"
+      "description": "variant in dac_blacklist from ENCODE or in duke_excluded list, both downloaded from UCSC genome browser"
     },
     "DP": {
       "number": "0",
@@ -42,22 +42,22 @@
     "dbSNP": {
       "number": "0",
       "type": "Flag",
-      "description": "variant in dbSNP147"
+      "description": "variant in dbSNP database used: dbsnp_indel"
     },
     "DB": {
       "number": "0",
       "type": "Flag",
-      "description": "variant in 1000Genomes, ALL.wgs.phase1_integrated_calls.20101123.snps_chr.vcf.gz or dbSNP"
+      "description": "variant in 1000Genomes (k_genomes) or dbSNP (dbsnp_indel)"
     },
     "HSDEPTH": {
       "number": "0",
       "type": "Flag",
-      "description": "variant in HiSeqDepthTop10Pct_chr.bed.gz region, downloaded from UCSC genome browser"
+      "description": "variant in hiseq_depth region, downloaded from UCSC genome browser"
     },
     "MAP": {
       "number": "0",
       "type": "Flag",
-      "description": "variant overlaps a region from wgEncodeCrgMapabilityAlign100mer.bedGraph.gz:::--breakPointMode --aEndOffset=1 with a value below 0.5, punishment increases with a decreasing mapability"
+      "description": "variant overlaps a region from mapability_file"
     },
     "SBAF": {
       "number": "0",
@@ -539,25 +539,25 @@
     "Tumor_dpALT": {
       "number": "1",
       "type": "Integer",
-      "description": "DP of Tumor ALT",
+      "description": "Tumor_dpALT: DP of Tumor ALT",
       "new_info_id": "TDA"
     },
     "Control_dpALT": {
       "number": "1",
       "type": "Integer",
-      "description": "DP of Control ALT",
+      "description": "Control_dpALT: DP of Control ALT",
       "new_info_id": "CDA"
     },
     "Tumor_dp": {
       "number": "1",
       "type": "Integer",
-      "description": "DP of Tumor",
+      "description": "Tumor_dp: DP of Tumor",
       "new_info_id": "TDP"
     },
     "Control_dp": {
       "number": "1",
       "type": "Integer",
-      "description": "DP of Control",
+      "description": "Control_dp: DP of Control",
       "new_info_id": "CDP"
     },
     "GT_Classification": {
@@ -655,6 +655,12 @@
       "description": "GeneSymbol if variation overlaps gene",
       "new_info_id": "GENE"
     },
+    "ANNOVAR_FUNCTION": {
+      "number": "1",
+      "type": "String",
+      "description": "Functional classification of the variant by ANNOVAR",
+      "new_info_id": "ANNOVARFUN"
+    },
     "EXONIC_CLASSIFICATION": {
       "number": "1",
       "type": "String",
@@ -667,6 +673,18 @@
       "description": "Details of non-synonymous variation's impact on protein",
       "new_info_id": "ANNOVARTR"
     },
+    "SEGDUP": {
+      "number": ".",
+      "type": "String",
+      "description": "SEGDUP column from Annovar tool",
+      "new_info_id": "SEGDUP"
+    },    
+    "CYTOBAND": {
+      "number": ".",
+      "type": "String",
+      "description": "CYTOBAND column from Annovar tool",
+      "new_info_id": "CYTOBAND"
+    },
     "CONFIDENCE": {
       "number": "1",
       "type": "Integer",

diff --git a/bin/convertToStdVCF.py b/bin/convertToStdVCF.py
@@ -85,7 +85,7 @@ def convert_dict_to_str(dict_to_convert, pair_separator, key_val_separator):
     """
 
     return pair_separator.join(
-        key + key_val_separator + val
+        key + key_val_separator + val.replace(";", ",")
         for (key,val)
         in dict_to_convert.iteritems())
 

diff --git a/conf/dkfz_cluster_38.config b/conf/dkfz_cluster_38.config
@@ -32,6 +32,7 @@ params {
     runIndelVCFFilter          = true
     runTinda                   = true
     skip_multiqc               = false
+    standard_vcf               = true
 
     // Filtrations for Only tumor cases
     crit_exac_maxmaf           = 0        

diff --git a/conf/modules.config b/conf/modules.config
@@ -113,7 +113,7 @@ process {
             enabled: false
         ]
     }
-    withName: 'BCFTOOLS_REHEADER' {
+    withName: 'CREATE_CONTIGHEADER' {
         publishDir = [
             path: { "${params.outdir}/test" },
             enabled: false

diff --git a/docs/convertToStdVCF.README.md b/docs/convertToStdVCF.README.md
@@ -1,8 +1,8 @@
 # Convert DKFZ VCFs to standard-conform VCFs
 
-The VCFs produced by the SNVCallingWorkflow are not standard conform in that some values are not added as additional columns after a single variant column. By contrast, in the [standard](https://samtools.github.io/hts-specs/) format, additional columns should only be used to show variants occurring in additional samples.
+The VCFs generated by the nf-platypusindelcalling pipeline do not conform to the standard format. This is because annotations are added as additional columns after the variant genotype columns. By contrast, the standard format uses additional columns only to display variants occurring in additional samples, and places annotations in the INFO column.
 
-The `convertToStdVCF.py` script can be used to convert the DKFZ VCFs to standard VCFs (version 4.2).
+The `bin/convertToStdVCF.py` script can be used to convert the DKFZ VCFs to standard VCFs (version 4.2).
 
 ## Execution
 
@@ -73,6 +73,8 @@ The file `convertToStdVCF.json` specifies how the non-standard columns are conve
 
 In this file, entries starting with "__" are considered as comment and ignored by the script.
 
+Entries ending with "__ctrl" are added from the additional INFO column.
+
 The file contains three sections:
 
   * "FILTERS": Input columns mapped to key/value fields in the "filters" column.

diff --git a/modules/local/create_contigheader.nf b/modules/local/create_contigheader.nf
@@ -0,0 +1,29 @@
+process CREATE_CONTIGHEADER {
+    tag "$fasta"
+    label 'process_single'
+
+    conda     (params.enable_conda ? "" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+    'docker://kubran/odcf_platypusindelcalling:v1' :'kubran/odcf_platypusindelcalling:v1' }"
+
+    input:
+    tuple path(fasta), path(fai)
+
+    output:
+    path ("*.header")       , emit: header
+    path  "versions.yml"    , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    awk '{printf("##contig=<ID=%s,length=%d>\\n",\$1,\$2);}' $fai > ${fai}.header
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/sample_swap.nf b/modules/local/sample_swap.nf
@@ -19,17 +19,17 @@ process SAMPLE_SWAP {
     val chrprefix
 
     output:
-    tuple val(meta), path('indel_*.tinda.vcf')                           , emit: vcf  , optional: true
-    tuple val(meta), path('indel_*.swap.json')                           , emit: json , optional: true
-    path  "snvs_*.GTfiltered_raw.vcf"                                    , optional: true
-    path  "snvs_*.GTfiltered_gnomAD.vcf"                                 , optional: true
-    path  "snvs_*.GTfiltered_gnomAD.SomaticIn.vcf"                       , optional: true   
-    path  "snvs_*.GTfiltered_gnomAD.Germline.Rare.vcf"                   , optional: true
-    path  "snvs_*.GTfiltered_gnomAD.Germline.Rare.txt"                   , optional: true
-    path  "snvs_*.GTfiltered_gnomAD.Germline.Rare.Rescue.png"            , optional: true
-    path  "snvs_*.GTfiltered_gnomAD.Germline.Rare.Rescue.txt"            , optional: true
-    path  "indel_*.checkSampleSwap_TiN.log"                              , emit: log
-    path  "versions.yml"                                                 , emit: versions
+    tuple val(meta), path('*.tinda.vcf')                           , emit: vcf  , optional: true
+    tuple val(meta), path('*.swap.json')                           , emit: json , optional: true
+    path  "snvs_*.GTfiltered_raw.vcf"                              , optional: true
+    path  "snvs_*.GTfiltered_gnomAD.vcf"                           , optional: true
+    path  "snvs_*.GTfiltered_gnomAD.SomaticIn.vcf"                 , optional: true   
+    path  "snvs_*.GTfiltered_gnomAD.Germline.Rare.vcf"             , optional: true
+    path  "snvs_*.GTfiltered_gnomAD.Germline.Rare.txt"             , optional: true
+    path  "snvs_*.GTfiltered_gnomAD.Germline.Rare.Rescue.png"      , optional: true
+    path  "snvs_*.GTfiltered_gnomAD.Germline.Rare.Rescue.txt"      , optional: true
+    path  "*.checkSampleSwap_TiN.log"                              , emit: log
+    path  "versions.yml"                                           , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -57,9 +57,9 @@ process SAMPLE_SWAP {
             --sequenceType=${params.seqtype} \\
             --gene_model_bed=$genemodel \\
             --reference=$ref \\
-            --outfile_tindaVCF=indel_${prefix}.tinda.vcf \\
-            --outfile_swapJSON=indel_${prefix}.swap.json \\
-            2>&1 | tee indel_${prefix}.checkSampleSwap_TiN.log
+            --outfile_tindaVCF=smvs_${prefix}.tinda.vcf \\
+            --outfile_swapJSON=smvs_${prefix}.swap.json \\
+            2>&1 | tee smvs_${prefix}.checkSampleSwap_TiN.log
 
         cat <<-END_VERSIONS > versions.yml
         "${task.process}":
@@ -72,7 +72,7 @@ process SAMPLE_SWAP {
     }
     else {
         """
-        touch indel_empty.checkSampleSwap_TiN.log
+        touch smvs_empty.checkSampleSwap_TiN.log
         cat <<-END_VERSIONS > versions.yml
         "${task.process}":
             r-base: \$(echo \$(R --version 2>&1) | sed 's/^.*R version //; s/ .*\$//')

diff --git a/modules/nf-core/modules/bcftools/reheader/environment.yml b/modules/nf-core/modules/bcftools/reheader/environment.yml
diff --git a/modules/nf-core/modules/bcftools/reheader/main.nf b/modules/nf-core/modules/bcftools/reheader/main.nf
diff --git a/modules/nf-core/modules/bcftools/reheader/meta.yml b/modules/nf-core/modules/bcftools/reheader/meta.yml
diff --git a/modules/nf-core/modules/bcftools/reheader/tests/bcf.config b/modules/nf-core/modules/bcftools/reheader/tests/bcf.config