Skip to content

Commit

Permalink
annovar necessary changes
Browse files Browse the repository at this point in the history
  • Loading branch information
kubranarci committed Apr 5, 2024
1 parent f6c84bf commit 6ac42b6
Show file tree
Hide file tree
Showing 21 changed files with 112 additions and 571 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ test.xml
test_output/
tests/data/
work/
.github/CODEOWNERS-tmp
.github/CODEOWNERS-tmp
bin/vcfparser.pyc
38 changes: 28 additions & 10 deletions assets/config/convertToStdVCF.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
"RE": {
"number": "0",
"type": "Flag",
"description": "variant in UCSC_27Sept2013_RepeatMasker.bed.gz region and/or SimpleTandemRepeats_chr.bed.gz region, downloaded from UCSC genome browser and/or variant in segmental duplication region, annotated by annovar"
"description": "variant in repeat_masker file region and/or simple_tandemrepeats file region, downloaded from UCSC genome browser and/or variant in segmental duplication region, annotated by annovar"
},
"BL": {
"number": "0",
"type": "Flag",
"description": "variant in DAC-Blacklist from ENCODE or in DUKE_EXCLUDED list, both downloaded from UCSC genome browser"
"description": "variant in dac_blacklist from ENCODE or in duke_excluded list, both downloaded from UCSC genome browser"
},
"DP": {
"number": "0",
Expand All @@ -42,22 +42,22 @@
"dbSNP": {
"number": "0",
"type": "Flag",
"description": "variant in dbSNP147"
"description": "variant in dbSNP database used: dbsnp_indel"
},
"DB": {
"number": "0",
"type": "Flag",
"description": "variant in 1000Genomes, ALL.wgs.phase1_integrated_calls.20101123.snps_chr.vcf.gz or dbSNP"
"description": "variant in 1000Genomes (k_genomes) or dbSNP (dbsnp_indel)"
},
"HSDEPTH": {
"number": "0",
"type": "Flag",
"description": "variant in HiSeqDepthTop10Pct_chr.bed.gz region, downloaded from UCSC genome browser"
"description": "variant in hiseq_depth region, downloaded from UCSC genome browser"
},
"MAP": {
"number": "0",
"type": "Flag",
"description": "variant overlaps a region from wgEncodeCrgMapabilityAlign100mer.bedGraph.gz:::--breakPointMode --aEndOffset=1 with a value below 0.5, punishment increases with a decreasing mapability"
"description": "variant overlaps a region from mapability_file"
},
"SBAF": {
"number": "0",
Expand Down Expand Up @@ -539,25 +539,25 @@
"Tumor_dpALT": {
"number": "1",
"type": "Integer",
"description": "DP of Tumor ALT",
"description": "Tumor_dpALT: DP of Tumor ALT",
"new_info_id": "TDA"
},
"Control_dpALT": {
"number": "1",
"type": "Integer",
"description": "DP of Control ALT",
"description": "Control_dpALT: DP of Control ALT",
"new_info_id": "CDA"
},
"Tumor_dp": {
"number": "1",
"type": "Integer",
"description": "DP of Tumor",
"description": "Tumor_dp: DP of Tumor",
"new_info_id": "TDP"
},
"Control_dp": {
"number": "1",
"type": "Integer",
"description": "DP of Control",
"description": "Control_dp: DP of Control",
"new_info_id": "CDP"
},
"GT_Classification": {
Expand Down Expand Up @@ -655,6 +655,12 @@
"description": "GeneSymbol if variation overlaps gene",
"new_info_id": "GENE"
},
"ANNOVAR_FUNCTION": {
"number": "1",
"type": "String",
"description": "Functional classification of the variant by ANNOVAR",
"new_info_id": "ANNOVARFUN"
},
"EXONIC_CLASSIFICATION": {
"number": "1",
"type": "String",
Expand All @@ -667,6 +673,18 @@
"description": "Details of non-synonymous variation's impact on protein",
"new_info_id": "ANNOVARTR"
},
"SEGDUP": {
"number": ".",
"type": "String",
"description": "SEGDUP column from Annovar tool",
"new_info_id": "SEGDUP"
},
"CYTOBAND": {
"number": ".",
"type": "String",
"description": "CYTOBAND column from Annovar tool",
"new_info_id": "CYTOBAND"
},
"CONFIDENCE": {
"number": "1",
"type": "Integer",
Expand Down
2 changes: 1 addition & 1 deletion bin/convertToStdVCF.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def convert_dict_to_str(dict_to_convert, pair_separator, key_val_separator):
"""

return pair_separator.join(
key + key_val_separator + val
key + key_val_separator + val.replace(";", ",")
for (key,val)
in dict_to_convert.iteritems())

Expand Down
1 change: 1 addition & 0 deletions conf/dkfz_cluster_38.config
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ params {
runIndelVCFFilter = true
runTinda = true
skip_multiqc = false
standard_vcf = true

// Filtrations for Only tumor cases
crit_exac_maxmaf = 0
Expand Down
2 changes: 1 addition & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ process {
enabled: false
]
}
withName: 'BCFTOOLS_REHEADER' {
withName: 'CREATE_CONTIGHEADER' {
publishDir = [
path: { "${params.outdir}/test" },
enabled: false
Expand Down
6 changes: 4 additions & 2 deletions docs/convertToStdVCF.README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Convert DKFZ VCFs to standard-conform VCFs

The VCFs produced by the SNVCallingWorkflow are not standard conform in that some values are not added as additional columns after a single variant column. By contrast, in the [standard](https://samtools.github.io/hts-specs/) format, additional columns should only be used to show variants occurring in additional samples.
The VCFs generated by the nf-platypusindelcalling pipeline do not conform to the standard format. This is because annotations are added as additional columns after the variant genotype columns. The standard format, on the other hand, uses additional columns only to display variants occurring in additional samples, and places annotations in the INFO column.

The `convertToStdVCF.py` script can be used to convert the DKFZ VCFs to standard VCFs (version 4.2).
The `bin/convertToStdVCF.py` script can be used to convert the DKFZ VCFs to standard VCFs (version 4.2).

## Execution

Expand Down Expand Up @@ -73,6 +73,8 @@ The file `convertToStdVCF.json` specifies how the non-standard columns are conve

In this file, entries starting with "__" are considered as comment and ignored by the script.

Entries ending with "__ctrl" are added from the additional INFO column.

The file contains three sections:

* "FILTERS": Input columns mapped to key/value fields in the "filters" column.
Expand Down
29 changes: 29 additions & 0 deletions modules/local/create_contigheader.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// CREATE_CONTIGHEADER
//
// Writes one VCF-style "##contig=<ID=...,length=...>" header line per record
// of the FASTA index, using the first column of the index as the ID and the
// second column as the length.
//
// Input : tuple of the reference FASTA (used only for the task tag) and its index.
// Output: header   - "<index file name>.header" holding the contig lines
//         versions - versions.yml for software-version tracking
process CREATE_CONTIGHEADER {
    tag "$fasta"
    label 'process_single'

    conda (params.enable_conda ? "" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'docker://kubran/odcf_platypusindelcalling:v1' :'kubran/odcf_platypusindelcalling:v1' }"

    input:
    tuple path(fasta), path(fai)

    output:
    path ("*.header")   , emit: header
    path "versions.yml" , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // NOTE(review): only awk runs in this task, yet versions.yml records the
    // samtools version; kept as-is so the emitted versions file does not change.
    """
    awk '{printf("##contig=<ID=%s,length=%d>\\n",\$1,\$2);}' $fai > ${fai}.header

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
    END_VERSIONS
    """
}
30 changes: 15 additions & 15 deletions modules/local/sample_swap.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,17 @@ process SAMPLE_SWAP {
val chrprefix

output:
tuple val(meta), path('indel_*.tinda.vcf') , emit: vcf , optional: true
tuple val(meta), path('indel_*.swap.json') , emit: json , optional: true
path "snvs_*.GTfiltered_raw.vcf" , optional: true
path "snvs_*.GTfiltered_gnomAD.vcf" , optional: true
path "snvs_*.GTfiltered_gnomAD.SomaticIn.vcf" , optional: true
path "snvs_*.GTfiltered_gnomAD.Germline.Rare.vcf" , optional: true
path "snvs_*.GTfiltered_gnomAD.Germline.Rare.txt" , optional: true
path "snvs_*.GTfiltered_gnomAD.Germline.Rare.Rescue.png" , optional: true
path "snvs_*.GTfiltered_gnomAD.Germline.Rare.Rescue.txt" , optional: true
path "indel_*.checkSampleSwap_TiN.log" , emit: log
path "versions.yml" , emit: versions
tuple val(meta), path('*.tinda.vcf') , emit: vcf , optional: true
tuple val(meta), path('*.swap.json') , emit: json , optional: true
path "snvs_*.GTfiltered_raw.vcf" , optional: true
path "snvs_*.GTfiltered_gnomAD.vcf" , optional: true
path "snvs_*.GTfiltered_gnomAD.SomaticIn.vcf" , optional: true
path "snvs_*.GTfiltered_gnomAD.Germline.Rare.vcf" , optional: true
path "snvs_*.GTfiltered_gnomAD.Germline.Rare.txt" , optional: true
path "snvs_*.GTfiltered_gnomAD.Germline.Rare.Rescue.png" , optional: true
path "snvs_*.GTfiltered_gnomAD.Germline.Rare.Rescue.txt" , optional: true
path "*.checkSampleSwap_TiN.log" , emit: log
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when
Expand Down Expand Up @@ -57,9 +57,9 @@ process SAMPLE_SWAP {
--sequenceType=${params.seqtype} \\
--gene_model_bed=$genemodel \\
--reference=$ref \\
--outfile_tindaVCF=indel_${prefix}.tinda.vcf \\
--outfile_swapJSON=indel_${prefix}.swap.json \\
2>&1 | tee indel_${prefix}.checkSampleSwap_TiN.log
--outfile_tindaVCF=smvs_${prefix}.tinda.vcf \\
--outfile_swapJSON=smvs_${prefix}.swap.json \\
2>&1 | tee smvs_${prefix}.checkSampleSwap_TiN.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand All @@ -72,7 +72,7 @@ process SAMPLE_SWAP {
}
else {
"""
touch indel_empty.checkSampleSwap_TiN.log
touch smvs_empty.checkSampleSwap_TiN.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
r-base: \$(echo \$(R --version 2>&1) | sed 's/^.*R version //; s/ .*\$//')
Expand Down
7 changes: 0 additions & 7 deletions modules/nf-core/modules/bcftools/reheader/environment.yml

This file was deleted.

73 changes: 0 additions & 73 deletions modules/nf-core/modules/bcftools/reheader/main.nf

This file was deleted.

63 changes: 0 additions & 63 deletions modules/nf-core/modules/bcftools/reheader/meta.yml

This file was deleted.

4 changes: 0 additions & 4 deletions modules/nf-core/modules/bcftools/reheader/tests/bcf.config

This file was deleted.

Loading

0 comments on commit 6ac42b6

Please sign in to comment.