snakefmt

bihealth · Aug 22, 2024 · 1753b84 · 1753b84
1 parent a045a9d
commit 1753b84
Show file tree

Hide file tree

Showing 9 changed files with 572 additions and 346 deletions.
diff --git a/stemcnv_check/app/make_staticdata.py b/stemcnv_check/app/make_staticdata.py
@@ -180,7 +180,7 @@ def create_missing_staticdata(args):
                         ),
                     )
                     .dag(
-                        DAGSettings(targets=[use_vcf],
+                        DAGSettings(targets={use_vcf},
                                     force_incomplete=True)
                     )
                     .execute_workflow()

diff --git a/stemcnv_check/control_files/default_config.yaml b/stemcnv_check/control_files/default_config.yaml
@@ -392,6 +392,3 @@ tools:
   PennCNV:
     memory: 4000 # "4000MB"
     runtime: "30m"
-  make_cnv_vcf:
-    memory: 4000 # "4000MB"
-    runtime: "30m"
diff --git a/stemcnv_check/rules/SNP_processing.smk b/stemcnv_check/rules/SNP_processing.smk
@@ -3,82 +3,112 @@ import importlib.resources
 from stemcnv_check import STEM_CNV_CHECK
 
 
-
 rule filter_snp_vcf:
-    input: os.path.join(DATAPATH, "{sample_id}", "{sample_id}.unprocessed.vcf")
-    output: os.path.join(DATAPATH, "{sample_id}", "{sample_id}.processed-SNP-data.{filter}-filter.vcf")
-    threads: get_tool_resource('filter_snp_vcf', 'threads')
+    input:
+        os.path.join(DATAPATH, "{sample_id}", "{sample_id}.unprocessed.vcf"),
+    output:
+        os.path.join(
+            DATAPATH,
+            "{sample_id}",
+            "{sample_id}.processed-SNP-data.{filter}-filter.vcf",
+        ),
+    threads: get_tool_resource("filter_snp_vcf", "threads")
     resources:
-        runtime=get_tool_resource('filter_snp_vcf', 'runtime'),
-        mem_mb=get_tool_resource('filter_snp_vcf', 'memory'),
-        partition=get_tool_resource('filter_snp_vcf', 'partition')
+        runtime=get_tool_resource("filter_snp_vcf", "runtime"),
+        mem_mb=get_tool_resource("filter_snp_vcf", "memory"),
+        partition=get_tool_resource("filter_snp_vcf", "partition"),
     log:
         err=os.path.join(LOGPATH, "filter_snp_vcf", "{sample_id}", "{filter}.error.log"),
         #out=os.path.join(LOGPATH, "filter_snp_vcf", "{sample_id}", "{filter}.out.log")
     # conda:
     #     importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","general-R.yaml")
     # script:
     #     '../scripts/filter_snp_vcf.R'
-#FIXME: wrong, this was the wrong fasta file
-# vcfpy changes the vcf too much, i.e.
-# - add new contigs to header
-# - merges variants at same position into multi-allelic
     conda:
-        importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","python-vcf.yaml")
+        "../envs/python-vcf.yaml"
     script:
-        '../scripts/filter_snp_vcf.py'
-        
+        "../scripts/filter_snp_vcf.py"
+
 
 rule annotate_snp_vcf:
     input:
-      vcf = os.path.join(DATAPATH, "{sample_id}", "{sample_id}.processed-SNP-data.{filter}-filter.vcf"),
-      genomefasta = get_genome_fasta
-    output: os.path.join(DATAPATH, "{sample_id}", "{sample_id}.annotated-SNP-data.{filter}-filter.vcf.gz")
-    threads: get_tool_resource('VEP', 'threads')
+        vcf=os.path.join(
+            DATAPATH,
+            "{sample_id}",
+            "{sample_id}.processed-SNP-data.{filter}-filter.vcf",
+        ),
+        genomefasta=get_genome_fasta,
+    output:
+        os.path.join(
+            DATAPATH,
+            "{sample_id}",
+            "{sample_id}.annotated-SNP-data.{filter}-filter.vcf.gz",
+        ),
+    threads: get_tool_resource("VEP", "threads")
     resources:
-        threads=get_tool_resource('VEP', 'threads'),
-        runtime=get_tool_resource('VEP', 'runtime'),
-        mem_mb=get_tool_resource('VEP', 'memory'),
-        partition=get_tool_resource('VEP', 'partition')
+        threads=get_tool_resource("VEP", "threads"),
+        runtime=get_tool_resource("VEP", "runtime"),
+        mem_mb=get_tool_resource("VEP", "memory"),
+        partition=get_tool_resource("VEP", "partition"),
     log:
-        err=os.path.join(LOGPATH, "annotate_snp_vcf", "{sample_id}", "{filter}.error.log"),
-        out=os.path.join(LOGPATH, "annotate_snp_vcf", "{sample_id}", "{filter}.out.log")
+        err=os.path.join(
+            LOGPATH, "annotate_snp_vcf", "{sample_id}", "{filter}.error.log"
+        ),
+        out=os.path.join(LOGPATH, "annotate_snp_vcf", "{sample_id}", "{filter}.out.log"),
     params:
-        genomeversion = 'GRCh38' if config['genome_version'] in ('hg38', 'GRCh38') else 'GRCh37',
-        vep_cache_path = config['use_vep_cache']
+        genomeversion=(
+            "GRCh38" if config["genome_version"] in ("hg38", "GRCh38") else "GRCh37"
+        ),
+        vep_cache_path=config["use_vep_cache"],
     conda:
-      importlib.resources.files(STEM_CNV_CHECK).joinpath("envs", "vep-annotation.yaml")
+        importlib.resources.files(STEM_CNV_CHECK).joinpath(
+            "envs", "vep-annotation.yaml"
+        )
     shell:
-        'vep --verbose ' 
-        '--fasta {input.genomefasta} ' # only needed for HGVS
+        "vep --verbose "
+        "--fasta {input.genomefasta} "
+
+        "--input_file {input.vcf} "
+        "--output_file {output} "
+        "--compress_output bgzip "
+        "--format vcf "
+        "--vcf "
+        "--force_overwrite "
+        "--no_stats "
+        "--warning_file {log.err} "
+        "--skipped_variants_file {log.out} "
+        "--assembly {params.genomeversion} "
+        "--fork {resources.threads} "
+        "--cache "
+        "--dir_cache {params.vep_cache_path} "
+
+        "--total_length "
+
+        "--gencode_basic "
+        "--symbol "
+        "--terms SO "
+        "--hgvs "
+        "--pick "
+
+
+        "--check_existing --no_check_alleles "
+        "--af "
+        "--pubmed "
+
+        '--fields "Gene,SYMBOL,STRAND,Consequence,cDNA_position,CDS_position,Protein_position,HGVSc,HGVSp,Existing_variation,CLIN_SIG,SOMATIC,PHENO,AF,PUBMED" '
+
+        " >> {log.out} 2>> {log.err}"
+        # --fasta: only needed for HGVS
         # TODO: maybe omitting this helps w/ pipe issues?
-        '--input_file {input.vcf} '
-        '--output_file {output} '
-        '--compress_output bgzip '
-        '--format vcf ' #might help with pipe auto-detection issues
-        '--vcf '
-        '--force_overwrite '
-        '--no_stats '
-        '--warning_file {log.err} '
-        '--skipped_variants_file {log.out} '
-        '--assembly {params.genomeversion} '
-        '--fork {resources.threads} '
-        '--cache '
-        '--dir_cache {params.vep_cache_path} '
+        # --format vcf: might help with pipe auto-detection issues
         ## Annotation options
-        '--total_length ' 
         # Gene & protein annotation
-        '--gencode_basic ' #> limit to gencode transcripts
-        '--symbol ' #> add gene symbol
-        '--terms SO ' # how to write/format/annotate the consequence
-        '--hgvs ' #> add HGVS nomenclature (protein changes)
-        '--pick ' #> pick the most severe consequence (& gene) per variant
+        # --gencode_basic: limit to gencode transcripts
+        # --symbol: add gene symbol
+        # --terms SO: how to write/format/annotate the consequence
+        # --hgvs: add HGVS nomenclature (protein changes)
+        # --pick: pick the most severe consequence (& gene) per variant
         # will query databases for existing annotation at the same position
         # includes existing annotations from ClinVar, COSMIC etc
-        '--check_existing --no_check_alleles '
-        '--af ' #  global allele frequency (AF) from 1000 Genomes Phase 3 data 
-        '--pubmed '
+        # --af: global allele frequency (AF) from 1000 Genomes Phase 3 data
         # Select content of CSQ:
-        '--fields "Gene,SYMBOL,STRAND,Consequence,cDNA_position,CDS_position,Protein_position,HGVSc,HGVSp,Existing_variation,CLIN_SIG,SOMATIC,PHENO,AF,PUBMED" '
-
-        ' >> {log.out} 2>> {log.err}'