From 1753b8432bf7558ec378ee48fd1f5efd7905c4ac Mon Sep 17 00:00:00 2001 From: Nicolai-vKuegelgen Date: Thu, 22 Aug 2024 10:40:56 +0200 Subject: [PATCH] snakefmt --- stemcnv_check/app/make_staticdata.py | 2 +- .../control_files/default_config.yaml | 3 - stemcnv_check/rules/SNP_processing.smk | 138 ++++++----- stemcnv_check/rules/StemCNV-check.smk | 220 +++++++++++------- stemcnv_check/rules/common.smk | 72 +++--- .../rules/illumina_raw_processing.smk | 102 ++++---- stemcnv_check/rules/penncnv.smk | 186 ++++++++++----- stemcnv_check/rules/report_generation.smk | 77 +++--- stemcnv_check/rules/staticdata_creation.smk | 118 ++++++---- 9 files changed, 572 insertions(+), 346 deletions(-) diff --git a/stemcnv_check/app/make_staticdata.py b/stemcnv_check/app/make_staticdata.py index 6ad1929..717646f 100644 --- a/stemcnv_check/app/make_staticdata.py +++ b/stemcnv_check/app/make_staticdata.py @@ -180,7 +180,7 @@ def create_missing_staticdata(args): ), ) .dag( - DAGSettings(targets=[use_vcf], + DAGSettings(targets={use_vcf}, force_incomplete=True) ) .execute_workflow() diff --git a/stemcnv_check/control_files/default_config.yaml b/stemcnv_check/control_files/default_config.yaml index 45b00d7..ef6dc03 100644 --- a/stemcnv_check/control_files/default_config.yaml +++ b/stemcnv_check/control_files/default_config.yaml @@ -392,6 +392,3 @@ tools: PennCNV: memory: 4000 # "4000MB" runtime: "30m" - make_cnv_vcf: - memory: 4000 # "4000MB" - runtime: "30m" diff --git a/stemcnv_check/rules/SNP_processing.smk b/stemcnv_check/rules/SNP_processing.smk index e48f5dd..2d26068 100644 --- a/stemcnv_check/rules/SNP_processing.smk +++ b/stemcnv_check/rules/SNP_processing.smk @@ -3,15 +3,20 @@ import importlib.resources from stemcnv_check import STEM_CNV_CHECK - rule filter_snp_vcf: - input: os.path.join(DATAPATH, "{sample_id}", "{sample_id}.unprocessed.vcf") - output: os.path.join(DATAPATH, "{sample_id}", "{sample_id}.processed-SNP-data.{filter}-filter.vcf") - threads: get_tool_resource('filter_snp_vcf', 'threads') + input: + os.path.join(DATAPATH, "{sample_id}", "{sample_id}.unprocessed.vcf"), + output: + os.path.join( + DATAPATH, + "{sample_id}", + "{sample_id}.processed-SNP-data.{filter}-filter.vcf", + ), + threads: get_tool_resource("filter_snp_vcf", "threads") resources: - runtime=get_tool_resource('filter_snp_vcf', 'runtime'), - mem_mb=get_tool_resource('filter_snp_vcf', 'memory'), - partition=get_tool_resource('filter_snp_vcf', 'partition') + runtime=get_tool_resource("filter_snp_vcf", "runtime"), + mem_mb=get_tool_resource("filter_snp_vcf", "memory"), + partition=get_tool_resource("filter_snp_vcf", "partition"), log: err=os.path.join(LOGPATH, "filter_snp_vcf", "{sample_id}", "{filter}.error.log"), #out=os.path.join(LOGPATH, "filter_snp_vcf", "{sample_id}", "{filter}.out.log") @@ -19,66 +24,91 @@ rule filter_snp_vcf: # importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","general-R.yaml") # script: # '../scripts/filter_snp_vcf.R' -#FIXME: wrong, this was the wrong fasta file -# vcfpy changes the vcf too much, i.e. -# - add new contigs to header -# - merges variants at same position into multi-allelic conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","python-vcf.yaml") + "../envs/python-vcf.yaml" script: - '../scripts/filter_snp_vcf.py' - + "../scripts/filter_snp_vcf.py" + rule annotate_snp_vcf: input: - vcf = os.path.join(DATAPATH, "{sample_id}", "{sample_id}.processed-SNP-data.{filter}-filter.vcf"), - genomefasta = get_genome_fasta - output: os.path.join(DATAPATH, "{sample_id}", "{sample_id}.annotated-SNP-data.{filter}-filter.vcf.gz") - threads: get_tool_resource('VEP', 'threads') + vcf=os.path.join( + DATAPATH, + "{sample_id}", + "{sample_id}.processed-SNP-data.{filter}-filter.vcf", + ), + genomefasta=get_genome_fasta, + output: + os.path.join( + DATAPATH, + "{sample_id}", + "{sample_id}.annotated-SNP-data.{filter}-filter.vcf.gz", + ), + threads: get_tool_resource("VEP", "threads") resources: - threads=get_tool_resource('VEP', 'threads'), - runtime=get_tool_resource('VEP', 'runtime'), - mem_mb=get_tool_resource('VEP', 'memory'), - partition=get_tool_resource('VEP', 'partition') + threads=get_tool_resource("VEP", "threads"), + runtime=get_tool_resource("VEP", "runtime"), + mem_mb=get_tool_resource("VEP", "memory"), + partition=get_tool_resource("VEP", "partition"), log: - err=os.path.join(LOGPATH, "annotate_snp_vcf", "{sample_id}", "{filter}.error.log"), - out=os.path.join(LOGPATH, "annotate_snp_vcf", "{sample_id}", "{filter}.out.log") + err=os.path.join( + LOGPATH, "annotate_snp_vcf", "{sample_id}", "{filter}.error.log" + ), + out=os.path.join(LOGPATH, "annotate_snp_vcf", "{sample_id}", "{filter}.out.log"), params: - genomeversion = 'GRCh38' if config['genome_version'] in ('hg38', 'GRCh38') else 'GRCh37', - vep_cache_path = config['use_vep_cache'] + genomeversion=( + "GRCh38" if config["genome_version"] in ("hg38", "GRCh38") else "GRCh37" + ), + vep_cache_path=config["use_vep_cache"], conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs", "vep-annotation.yaml") + importlib.resources.files(STEM_CNV_CHECK).joinpath( + "envs", "vep-annotation.yaml" + ) shell: - 'vep --verbose ' - '--fasta {input.genomefasta} ' # only needed for HGVS + "vep --verbose " + "--fasta {input.genomefasta} " + + "--input_file {input.vcf} " + "--output_file {output} " + "--compress_output bgzip " + "--format vcf " + "--vcf " + "--force_overwrite " + "--no_stats " + "--warning_file {log.err} " + "--skipped_variants_file {log.out} " + "--assembly {params.genomeversion} " + "--fork {resources.threads} " + "--cache " + "--dir_cache {params.vep_cache_path} " + + "--total_length " + + "--gencode_basic " + "--symbol " + "--terms SO " + "--hgvs " + "--pick " + + + "--check_existing --no_check_alleles " + "--af " + "--pubmed " + + '--fields "Gene,SYMBOL,STRAND,Consequence,cDNA_position,CDS_position,Protein_position,HGVSc,HGVSp,Existing_variation,CLIN_SIG,SOMATIC,PHENO,AF,PUBMED" ' + + " >> {log.out} 2>> {log.err}" + # only needed for HGVS # TODO: maye omitting this helps w/ pipe issues? - '--input_file {input.vcf} ' - '--output_file {output} ' - '--compress_output bgzip ' - '--format vcf ' #might help with pipe auto-detection issues - '--vcf ' - '--force_overwrite ' - '--no_stats ' - '--warning_file {log.err} ' - '--skipped_variants_file {log.out} ' - '--assembly {params.genomeversion} ' - '--fork {resources.threads} ' - '--cache ' - '--dir_cache {params.vep_cache_path} ' + #might help with pipe auto-detection issues ## Annotation options - '--total_length ' # Gene & protein annotation - '--gencode_basic ' #> limit to gencode transcripts - '--symbol ' #> add gene symbol - '--terms SO ' # how to write/format/annotate the consequence - '--hgvs ' #> add HGVS nomenclature (protein changes) - '--pick ' #> pick the most severe consequence (& gene) per variant + #> limit to gencode transcripts + #> add gene symbol + # how to write/format/annotate the consequence + #> add HGVS nomenclature (protein changes) + #> pick the most severe consequence (& gene) per variant # will query databases for existing annotation at the same position # includes existing annotations from ClinVar, COSMIC etc - '--check_existing --no_check_alleles ' - '--af ' # global allele frequency (AF) from 1000 Genomes Phase 3 data - '--pubmed ' + # global allele frequency (AF) from 1000 Genomes Phase 3 data # Select content of CSQ: - '--fields "Gene,SYMBOL,STRAND,Consequence,cDNA_position,CDS_position,Protein_position,HGVSc,HGVSp,Existing_variation,CLIN_SIG,SOMATIC,PHENO,AF,PUBMED" ' - - ' >> {log.out} 2>> {log.err}' \ No newline at end of file diff --git a/stemcnv_check/rules/StemCNV-check.smk b/stemcnv_check/rules/StemCNV-check.smk index 8cec727..8c80367 100644 --- a/stemcnv_check/rules/StemCNV-check.smk +++ b/stemcnv_check/rules/StemCNV-check.smk @@ -6,7 +6,11 @@ from pathlib import Path import tempfile import ruamel.yaml as ruamel_yaml from stemcnv_check import STEM_CNV_CHECK -from stemcnv_check.helpers import read_sample_table, config_extract, collect_SNP_cluster_ids +from stemcnv_check.helpers import ( + read_sample_table, + config_extract, + collect_SNP_cluster_ids, +) from stemcnv_check.exceptions import SampleConstraintError, ConfigValueError SNAKEDIR = str(importlib.resources.files(STEM_CNV_CHECK)) @@ -22,169 +26,221 @@ else: # This a manual workaround around for passing the config as a snakemake object to R & python scripts # Save config to tempfile & use that instead, this ensures the combined default + user configs are used # It also archives all options passed by command line and allows them all to be displayed in report - f, CONFIGFILE = tempfile.mkstemp(suffix = '.yaml', text=True) + f, CONFIGFILE = tempfile.mkstemp(suffix=".yaml", text=True) os.close(f) removetempconfig = True -yaml = ruamel_yaml.YAML(typ='safe') -config['snakedir'] = SNAKEDIR -with open(CONFIGFILE, 'w') as yamlout: + + +yaml = ruamel_yaml.YAML(typ="safe") +config["snakedir"] = SNAKEDIR +with open(CONFIGFILE, "w") as yamlout: yaml.dump(config, yamlout) -SAMPLETABLE = config['sample_table'] if 'sample_table' in config else 'sample_table.tsv' # Defined by wrapper -BASEPATH = config['basedir'] if 'basedir' in config else os.getcwd() #Defined by wrapper -DATAPATH = config['data_path'] #if os.path.isabs(config['data_path']) else os.path.join(BASEPATH, config['data_path']) -LOGPATH = config['log_path'] #if os.path.isabs(config['log_path']) else os.path.join(BASEPATH, config['log_path']) -TARGET = config['target'] if 'target' in config else 'report' #Defined by wrapper -IDAT_INPUT = config['raw_data_folder'] +SAMPLETABLE = ( + config["sample_table"] if "sample_table" in config else "sample_table.tsv" +) # Defined by wrapper +BASEPATH = ( + config["basedir"] if "basedir" in config else os.getcwd() +) # Defined by wrapper +DATAPATH = config[ + "data_path" +] # if os.path.isabs(config['data_path']) else os.path.join(BASEPATH, config['data_path']) +LOGPATH = config[ + "log_path" +] # if os.path.isabs(config['log_path']) else os.path.join(BASEPATH, config['log_path']) +TARGET = config["target"] if "target" in config else "report" # Defined by wrapper +IDAT_INPUT = config["raw_data_folder"] + wildcard_constraints: - sample_id=config['wildcard_constraints']['sample_id'], + sample_id=config["wildcard_constraints"]["sample_id"], #sentrix_name=config['wildcard_constraints']['sentrix_name'], #sentrix_pos=config['wildcard_constraints']['sentrix_pos'], -#Never submit these to cluster + +# Never submit these to cluster localrules: relink_gencall, - all + all, + sample_data = read_sample_table(SAMPLETABLE) sample_data_full = read_sample_table(SAMPLETABLE, with_opt=True) -include: 'common.smk' -include: 'illumina_raw_processing.smk' + +include: "common.smk" +include: "illumina_raw_processing.smk" include: "SNP_processing.smk" include: "penncnv.smk" - include: "report_generation.smk" + # Rule function,s move to common? ======================================================================== + def get_cnv_vcf_output(mode): - addition = config['settings']['make_cnv_vcf']['name_addition'] - if mode == 'combined-calls': - return os.path.join(DATAPATH, "{sample_id}", f"{{sample_id}}.combined-cnv-calls{addition}.vcf") - elif mode == 'split-tools': - return [os.path.join(DATAPATH, "{sample_id}", f"{{sample_id}}.{tool}-cnv-calls{addition}.vcf") - for tool in config['settings']['CNV.calling.tools']] + addition = config["settings"]["make_cnv_vcf"]["name_addition"] + if mode == "combined-calls": + return os.path.join( + DATAPATH, "{sample_id}", f"{{sample_id}}.combined-cnv-calls{addition}.vcf" + ) + elif mode == "split-tools": + return [ + os.path.join( + DATAPATH, "{sample_id}", f"{{sample_id}}.{tool}-cnv-calls{addition}.vcf" + ) + for tool in config["settings"]["CNV.calling.tools"] + ] else: - raise ConfigValueError('Value not allowed for settings$make.cnv.vcf$mode: "{}"'.format(config['settings']['make_cnv_vcf']['mode'])) + raise ConfigValueError( + 'Value not allowed for settings$make.cnv.vcf$mode: "{}"'.format( + config["settings"]["make_cnv_vcf"]["mode"] + ) + ) -def get_target_files(target = TARGET): - #Target options: ('report', 'combined-cnv-calls', 'PennCNV', 'CBS', 'SNP-data'), + +def get_target_files(target=TARGET): + # Target options: ('report', 'combined-cnv-calls', 'PennCNV', 'CBS', 'SNP-data'), all_samples = [sample_id for sample_id, _, _, _, _ in sample_data] # complete - if target == 'complete': - return get_target_files('report') + get_target_files('combined-cnv-calls') + if target == "complete": + return get_target_files("report") + get_target_files("combined-cnv-calls") # Report - if target == 'report': + if target == "report": return expand( os.path.join(DATAPATH, "{sample_id}", "{sample_id}.{report_filetype}"), - sample_id = all_samples, - report_filetype = [rep+'.'+config['reports'][rep]['file_type'] for rep in config['reports'].keys() - if rep != '__default__']) #+ \ - # expand(os.path.join(DATAPATH,"{sample_id}","{sample_id}.summary-check.tsv"), - # sample_id = all_samples) + sample_id=all_samples, + report_filetype=[ + rep + "." + config["reports"][rep]["file_type"] + for rep in config["reports"].keys() + if rep != "__default__" + ], + ) # + \ + # expand(os.path.join(DATAPATH,"{sample_id}","{sample_id}.summary-check.tsv"), + # sample_id = all_samples) # Target Processed-calls - if target == 'combined-cnv-calls': - return expand([ - os.path.join(DATAPATH, "{sample_id}", "{sample_id}.combined-cnv-calls.tsv"), - os.path.join(DATAPATH, "{sample_id}", "{sample_id}.combined-cnv-calls.vcf.gz"), + if target == "combined-cnv-calls": + return expand( + [ + os.path.join( + DATAPATH, "{sample_id}", "{sample_id}.combined-cnv-calls.tsv" + ), + os.path.join( + DATAPATH, "{sample_id}", "{sample_id}.combined-cnv-calls.vcf.gz" + ), ], - sample_id = all_samples + sample_id=all_samples, ) # Target PennCNV - if target == 'PennCNV': + if target == "PennCNV": return expand( - os.path.join(DATAPATH, "{sample_id}", "{sample_id}.CNV_calls.penncnv.vcf.gz"), - sample_id = all_samples, + os.path.join( + DATAPATH, "{sample_id}", "{sample_id}.CNV_calls.penncnv.vcf.gz" + ), + sample_id=all_samples, ) # Target CBS - if target == 'CBS': + if target == "CBS": return expand( os.path.join(DATAPATH, "{sample_id}", "{sample_id}.CNV_calls.CBS.vcf.gz"), - sample_id = all_samples + sample_id=all_samples, ) # Target SNP-data - if target == 'SNP-data': - #TODO: update this + if target == "SNP-data": + # TODO: update this return expand( - os.path.join(DATAPATH,"{sample_id}","{sample_id}.processed-SNP-data.{filter}-filter.vcf"), + os.path.join( + DATAPATH, + "{sample_id}", + "{sample_id}.processed-SNP-data.{filter}-filter.vcf", + ), # os.path.join(DATAPATH, "{sample_id}", "{sample_id}.annotated-SNP-data.{filter}-filter.vcf.gz"), - sample_id = all_samples, - filter = config['settings']['default-filter-set'] - ) - + sample_id=all_samples, + filter=config["settings"]["default-filter-set"], + ) + + # Rules ======================================================================== + rule all: input: - get_target_files() + get_target_files(), run: if removetempconfig: os.remove(CONFIGFILE) - rule run_CBS: input: - vcf = cnv_vcf_input_function('CBS') + vcf=cnv_vcf_input_function("CBS"), output: - vcf = os.path.join(DATAPATH, "{sample_id}", "{sample_id}.CNV_calls.CBS.vcf.gz") - threads: get_tool_resource('CBS', 'threads') + vcf=os.path.join(DATAPATH, "{sample_id}", "{sample_id}.CNV_calls.CBS.vcf.gz"), + threads: get_tool_resource("CBS", "threads") resources: - runtime=get_tool_resource('CBS', 'runtime'), - mem_mb=get_tool_resource('CBS', 'memory'), - partition=get_tool_resource('CBS', 'partition') + runtime=get_tool_resource("CBS", "runtime"), + mem_mb=get_tool_resource("CBS", "memory"), + partition=get_tool_resource("CBS", "partition"), params: # SDundo = config['settings']['CBS']['SDundo'], # filter=get_tool_filter_settings('CBS'), - settings=config['settings']['CBS'] + settings=config["settings"]["CBS"], log: err=os.path.join(LOGPATH, "CBS", "{sample_id}", "error.log"), - out=os.path.join(LOGPATH, "CBS", "{sample_id}", "out.log") + out=os.path.join(LOGPATH, "CBS", "{sample_id}", "out.log"), conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs", "general-R.yaml") + "../envs/general-R.yaml" script: - '../scripts/run_CBS_DNAcopy.R' - # shell: - # "Rscript {SNAKEDIR}/scripts/run_CBS_DNAcopy.R -s {params.SDundo} {input.tsv} {output.tsv} {CONFIGFILE} {SAMPLETABLE} > {log.out} 2> {log.err}" + "../scripts/run_CBS_DNAcopy.R" +# shell: +# "Rscript {SNAKEDIR}/scripts/run_CBS_DNAcopy.R -s {params.SDundo} {input.tsv} {output.tsv} {CONFIGFILE} {SAMPLETABLE} > {log.out} 2> {log.err}" def get_processed_ref_data(wildcards): sample_id, ref_id = get_ref_id(wildcards) - return os.path.join(DATAPATH, f"{ref_id}", f"{ref_id}.combined-cnv-calls.tsv") if ref_id else [] + return ( + os.path.join(DATAPATH, f"{ref_id}", f"{ref_id}.combined-cnv-calls.tsv") + if ref_id + else [] + ) rule run_process_CNV_calls: input: - cnv_calls = expand( - os.path.join(DATAPATH, "{{sample_id}}", "{{sample_id}}.CNV_calls.{caller}.vcf.gz"), - caller = config['settings']['CNV.calling.tools'] + cnv_calls=expand( + os.path.join( + DATAPATH, "{{sample_id}}", "{{sample_id}}.CNV_calls.{caller}.vcf.gz" + ), + caller=config["settings"]["CNV.calling.tools"], ), - ref_data = get_processed_ref_data, - snp_vcf = cnv_vcf_input_function('settings:CNV_processing:call_processing') + ref_data=get_processed_ref_data, + snp_vcf=cnv_vcf_input_function("settings:CNV_processing:call_processing"), output: - tsv = os.path.join(DATAPATH, "{sample_id}", "{sample_id}.combined-cnv-calls.tsv"), - vcf = os.path.join(DATAPATH, "{sample_id}", "{sample_id}.combined-cnv-calls.vcf.gz") - threads: get_tool_resource('CNV.process', 'threads') + tsv=os.path.join(DATAPATH, "{sample_id}", "{sample_id}.combined-cnv-calls.tsv"), + vcf=os.path.join( + DATAPATH, "{sample_id}", "{sample_id}.combined-cnv-calls.vcf.gz" + ), + threads: get_tool_resource("CNV.process", "threads") resources: - runtime=get_tool_resource('CNV.process', 'runtime'), - mem_mb=get_tool_resource('CNV.process', 'memory'), - partition=get_tool_resource('CNV.process', 'partition') + runtime=get_tool_resource("CNV.process", "runtime"), + mem_mb=get_tool_resource("CNV.process", "memory"), + partition=get_tool_resource("CNV.process", "partition"), params: - settings=config['settings']['CNV_processing'] + settings=config["settings"]["CNV_processing"], log: err=os.path.join(LOGPATH, "CNV_process", "{sample_id}", "error.log"), # out=os.path.join(LOGPATH, "CNV_process", "{sample_id}", "out.log") conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","general-R.yaml") + "../envs/general-R.yaml" script: - '../scripts/process_CNV_calls.R' - # shell: - # "Rscript {SNAKEDIR}/scripts/process_CNV_calls.R {params.penncnv} {params.cbs} {DATAPATH} {wildcards.sample_id} {CONFIGFILE} {SAMPLETABLE} > {log.out} 2> {log.err}" + "../scripts/process_CNV_calls.R" + + +# shell: +# "Rscript {SNAKEDIR}/scripts/process_CNV_calls.R {params.penncnv} {params.cbs} {DATAPATH} {wildcards.sample_id} {CONFIGFILE} {SAMPLETABLE} > {log.out} 2> {log.err}" diff --git a/stemcnv_check/rules/common.smk b/stemcnv_check/rules/common.smk index 297bb24..4c0a00d 100644 --- a/stemcnv_check/rules/common.smk +++ b/stemcnv_check/rules/common.smk @@ -1,6 +1,6 @@ # Makestatic Data : # def fix_container_path(path_in, bound_to): -# +# # path_in = Path(path_in) # if bound_to == 'static': # rel_path = path_in.name @@ -10,9 +10,10 @@ # 'tmp': Path(DOWNLOAD_DIR) # }[bound_to].absolute() # rel_path = path_in.absolute().relative_to(local_target) -# +# # return Path('/outside/') / bound_to / rel_path + def get_ref_id(wildcards, get_sex=False): sample_id = wildcards.sample_id sex, ref_id = [(s, r) for sid, _, _, s, r in sample_data if sid == sample_id][0] @@ -20,12 +21,14 @@ def get_ref_id(wildcards, get_sex=False): if ref_id: try: - #Assume existing match -> wrapper should have done a check + # Assume existing match -> wrapper should have done a check ref_sex = [s for sid, _, _, s, _ in sample_data if sid == ref_id][0] ref_sex = ref_sex[0].lower() except IndexError: # Somehow no match - raise SampleConstraintError(f"Listed reference sample can not be found in sample-table: '{ref_id}'") + raise SampleConstraintError( + f"Listed reference sample can not be found in sample-table: '{ref_id}'" + ) else: ref_id = False ref_sex = False @@ -39,59 +42,62 @@ def get_ref_id(wildcards, get_sex=False): def fix_container_path(path_in, bound_to): path_in = Path(path_in) - if bound_to == 'static': + if bound_to == "static": rel_path = path_in.name else: local_target = { - 'data': Path(DATAPATH), - 'rawdata': Path(IDAT_INPUT), - 'logs': Path(LOGPATH), - 'snakedir': Path(importlib.resources.files(STEM_CNV_CHECK)), + "data": Path(DATAPATH), + "rawdata": Path(IDAT_INPUT), + "logs": Path(LOGPATH), + "snakedir": Path(importlib.resources.files(STEM_CNV_CHECK)), }[bound_to].absolute() rel_path = path_in.absolute().relative_to(local_target) - return Path('/outside/') / bound_to / rel_path + return Path("/outside/") / bound_to / rel_path + def get_tool_filter_settings(tool): - if tool.split(':')[0] == 'report': - report_settings = config['reports'][tool.split(':')[1]] - out = config_extract((tool.split(':')[2], 'filter-settings'), report_settings, config['reports']['__default__']) - elif tool.count(':') == 2 and tool.split(':')[1] == 'CNV_processing': - out = config['settings']['CNV_processing']['call_processing']['filter-settings'] + if tool.split(":")[0] == "report": + report_settings = config["reports"][tool.split(":")[1]] + out = config_extract( + (tool.split(":")[2], "filter-settings"), + report_settings, + config["reports"]["__default__"], + ) + elif tool.count(":") == 2 and tool.split(":")[1] == "CNV_processing": + out = config["settings"]["CNV_processing"]["call_processing"]["filter-settings"] else: - out = config['settings'][tool]['filter-settings'] - if out == '__default__': - out = config['settings']['default-filter-set'] + out = config["settings"][tool]["filter-settings"] + if out == "__default__": + out = config["settings"]["default-filter-set"] return out -def get_tool_resource(tool ,resource): - if not resource in ('threads', 'memory', 'runtime', 'partition', 'cmd-line-params'): +def get_tool_resource(tool, resource): + if not resource in ("threads", "memory", "runtime", "partition", "cmd-line-params"): raise KeyError(f"This resource can not be defined: {resource}") - if not tool in config['tools']: - return config['tools']['__default__'][resource] + if not tool in config["tools"] or not resource in config["tools"][tool]: + return config["tools"]["__default__"][resource] else: - if resource in config['tools'][tool]: - return config['tools'][tool][resource] - else: - return config['tools']['__default__'][resource] + return config["tools"][tool][resource] + def get_genome_fasta(wildcards): """Get the correct genome fasta file""" # #FIXME: future # chip = get_sample_info(wildcards.sample_id)['array_name'] # genome = config['array_definitions'][chip]['genome_version'] - if config['genome_version'] in ('hg38', 'GRCh38'): - return config['global_settings']['hg38_genome_fasta'] + if config["genome_version"] in ("hg38", "GRCh38"): + return config["global_settings"]["hg38_genome_fasta"] else: - return config['global_settings']['hg19_genome_fasta'] + return config["global_settings"]["hg19_genome_fasta"] def cnv_vcf_input_function(tool): return lambda wildcards: os.path.join( - DATAPATH, - wildcards.sample_id, - f"{wildcards.sample_id}.processed-SNP-data.{get_tool_filter_settings(tool)}-filter.vcf" + DATAPATH, + wildcards.sample_id, + f"{wildcards.sample_id}.processed-SNP-data.{get_tool_filter_settings(tool)}-filter.vcf", # VEP still has issues; skip for now # f"{wildcards.sample_id}.annotated-SNP-data.{get_tool_filter_settings(tool)}-filter.vcf.gz" - ) \ No newline at end of file + ) diff --git a/stemcnv_check/rules/illumina_raw_processing.smk b/stemcnv_check/rules/illumina_raw_processing.smk index b02c2a4..2acaf83 100644 --- a/stemcnv_check/rules/illumina_raw_processing.smk +++ b/stemcnv_check/rules/illumina_raw_processing.smk @@ -1,27 +1,36 @@ rule run_gencall: input: - bpm = config['static_data']['bpm_manifest_file'], - egt = config['static_data']['egt_cluster_file'], - idat_path = os.path.join(IDAT_INPUT, "{sentrix_name}"), + bpm=config["static_data"]["bpm_manifest_file"], + egt=config["static_data"]["egt_cluster_file"], + idat_path=os.path.join(IDAT_INPUT, "{sentrix_name}"), output: - os.path.join(DATAPATH, "gtc", "{sentrix_name}", "_done") - threads: get_tool_resource('GenCall', 'threads') + os.path.join(DATAPATH, "gtc", "{sentrix_name}", "_done"), + threads: get_tool_resource("GenCall", "threads") resources: - runtime=get_tool_resource('GenCall', 'runtime'), - mem_mb=get_tool_resource('GenCall', 'memory'), - partition=get_tool_resource('GenCall', 'partition') + runtime=get_tool_resource("GenCall", "runtime"), + mem_mb=get_tool_resource("GenCall", "memory"), + partition=get_tool_resource("GenCall", "partition"), params: - options = get_tool_resource('GenCall', 'cmd-line-params'), - outpath = lambda wildcards: fix_container_path(os.path.join(DATAPATH, "gtc", wildcards.sentrix_name), 'data'), - bpm = fix_container_path(config['static_data']['bpm_manifest_file'],'static'), - egt = fix_container_path(config['static_data']['egt_cluster_file'],'static'), - idat_path= lambda wildcards: fix_container_path(os.path.join(IDAT_INPUT, wildcards.sentrix_name),'rawdata'), - logerr = lambda wildcards: fix_container_path(os.path.join(LOGPATH,"GenCall",wildcards.sentrix_name,"error.log"),'logs'), - logout = lambda wildcards: fix_container_path(os.path.join(LOGPATH,"GenCall", wildcards.sentrix_name,"out.log"),'logs') + options=get_tool_resource("GenCall", "cmd-line-params"), + outpath=lambda wildcards: fix_container_path( + os.path.join(DATAPATH, "gtc", wildcards.sentrix_name), "data" + ), + bpm=fix_container_path(config["static_data"]["bpm_manifest_file"], "static"), + egt=fix_container_path(config["static_data"]["egt_cluster_file"], "static"), + idat_path=lambda wildcards: fix_container_path( + os.path.join(IDAT_INPUT, wildcards.sentrix_name), "rawdata" + ), + logerr=lambda wildcards: fix_container_path( + os.path.join(LOGPATH, "GenCall", wildcards.sentrix_name, "error.log"), + "logs", + ), + logout=lambda wildcards: fix_container_path( + os.path.join(LOGPATH, "GenCall", wildcards.sentrix_name, "out.log"), "logs" + ), log: - err = os.path.join(LOGPATH, "GenCall", "{sentrix_name}", "error.log"), - out = os.path.join(LOGPATH, "GenCall", "{sentrix_name}", "out.log"), + err=os.path.join(LOGPATH, "GenCall", "{sentrix_name}", "error.log"), + out=os.path.join(LOGPATH, "GenCall", "{sentrix_name}", "out.log"), container: # Not sure if we need to use a specific version here "docker://us.gcr.io/broad-gotc-prod/illumina-iaap-autocall:1.0.2-1.1.0-1629910298" @@ -30,51 +39,60 @@ rule run_gencall: # The iaap-cli will *always* generate filenames derived from the sentrix name & pos -def get_chip(wildcards, outtype='dir_path'): +def get_chip(wildcards, outtype="dir_path"): """Get the chip name from a sample_id Values for outtype: 'dirpath' | 'file'""" - chip_name, chip_pos = [(n, p) for sid, n, p, _, _ in sample_data if sid == wildcards.sample_id][0] - if outtype == 'dir_path': - return os.path.join(DATAPATH,'gtc',chip_name) - elif outtype == 'file': - return os.path.join(chip_name,chip_name + '_' + chip_pos + '.gtc') + chip_name, chip_pos = [ + (n, p) for sid, n, p, _, _ in sample_data if sid == wildcards.sample_id + ][0] + if outtype == "dir_path": + return os.path.join(DATAPATH, "gtc", chip_name) + elif outtype == "file": + return os.path.join(chip_name, chip_name + "_" + chip_pos + ".gtc") -#Note: +# Note: # the *.gtc output files from gencall will *always* match the idat_file names # --> maybe better to switch from Chip_Name & Chip_Pos to Folder_Name & IDAT_Name ?! rule relink_gencall: input: - lambda wildcards: os.path.join(get_chip(wildcards),'_done') + lambda wildcards: os.path.join(get_chip(wildcards), "_done"), output: - os.path.join(DATAPATH,"{sample_id}","{sample_id}.gencall.gtc") + os.path.join(DATAPATH, "{sample_id}", "{sample_id}.gencall.gtc"), params: - gtc_link_path=lambda wildcards: os.path.join('..','gtc',get_chip(wildcards,outtype='file')) + gtc_link_path=lambda wildcards: os.path.join( + "..", "gtc", get_chip(wildcards, outtype="file") + ), shell: 'ln -s "{params.gtc_link_path}" "{output}"' - -#TODO: input functions to get correct fasta (& later correct manifest files) + +# TODO: input functions to get correct fasta (& later correct manifest files) rule run_gtc2vcf_vcf: input: - bpm=config['static_data']['bpm_manifest_file'], - egt=config['static_data']['egt_cluster_file'], + bpm=config["static_data"]["bpm_manifest_file"], + egt=config["static_data"]["egt_cluster_file"], genome=get_genome_fasta, - gtc=os.path.join(DATAPATH, "{sample_id}", "{sample_id}.gencall.gtc") + gtc=os.path.join(DATAPATH, "{sample_id}", "{sample_id}.gencall.gtc"), output: - vcf = pipe(os.path.join(DATAPATH, "{sample_id}", "{sample_id}.unprocessed.vcf")), - metatxt = os.path.join(DATAPATH, "{sample_id}", "{sample_id}.stats.txt"), - threads: get_tool_resource('gtc2vcf', 'threads') + vcf=pipe(os.path.join(DATAPATH, "{sample_id}", "{sample_id}.unprocessed.vcf")), + metatxt=os.path.join(DATAPATH, "{sample_id}", "{sample_id}.stats.txt"), + threads: get_tool_resource("gtc2vcf", "threads") resources: - runtime=get_tool_resource('gtc2vcf', 'runtime'), - mem_mb=get_tool_resource('gtc2vcf', 'memory'), - partition=get_tool_resource('gtc2vcf', 'partition') + runtime=get_tool_resource("gtc2vcf", "runtime"), + mem_mb=get_tool_resource("gtc2vcf", "memory"), + partition=get_tool_resource("gtc2vcf", "partition"), params: - options = get_tool_resource('gtc2vcf', 'cmd-line-params'), - csv='--csv "{}"'.format(config['static_data']['csv_manifest_file']) if config['static_data']['csv_manifest_file'] else '', + options=get_tool_resource("gtc2vcf", "cmd-line-params"), + csv=( + '--csv "{}"'.format(config["static_data"]["csv_manifest_file"]) + if config["static_data"]["csv_manifest_file"] + else "" + ), log: err=os.path.join(LOGPATH, "gtc2vcf", "{sample_id}", "vcf.error.log"), #out=os.path.join(LOGPATH, "gtc2vcf", "{sample_id}", "vcf.out.log"), conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","gtc2vcf.yaml") - shell: 'bcftools plugin gtc2vcf {params.options} -O v --bpm "{input.bpm}" {params.csv} --egt "{input.egt}" --fasta-ref "{input.genome}" --extra {output.metatxt} {input.gtc} 2> {log.err} | bcftools sort -o {output.vcf} 2>> {log.err}' + "../envs/gtc2vcf.yaml" + shell: + 'bcftools plugin gtc2vcf {params.options} -O v --bpm "{input.bpm}" {params.csv} --egt "{input.egt}" --fasta-ref "{input.genome}" --extra {output.metatxt} {input.gtc} 2> {log.err} | bcftools sort -o {output.vcf} 2>> {log.err}' diff --git a/stemcnv_check/rules/penncnv.smk b/stemcnv_check/rules/penncnv.smk index 88bada1..75ff508 100644 --- a/stemcnv_check/rules/penncnv.smk +++ b/stemcnv_check/rules/penncnv.smk @@ -1,12 +1,23 @@ import os + rule prep_PennCNV_sexfile: - input: cnv_vcf_input_function('PennCNV') - output: temp(os.path.join(DATAPATH,"{sample_id}","{sample_id}.penncnv.sexfile.txt")) - log: os.path.join(LOGPATH,"PennCNV","{sample_id}","sexfile.log") - params: - sex_info = lambda wildcards: get_ref_id(wildcards, True), - sample_docker_path = lambda wildcards: fix_container_path(os.path.join(DATAPATH, wildcards.sample_id, f"{wildcards.sample_id}.penncnv.input.tsv"),'data'), + input: + cnv_vcf_input_function("PennCNV"), + output: + temp(os.path.join(DATAPATH, "{sample_id}", "{sample_id}.penncnv.sexfile.txt")), + log: + os.path.join(LOGPATH, "PennCNV", "{sample_id}", "sexfile.log"), + params: + sex_info=lambda wildcards: get_ref_id(wildcards, True), + sample_docker_path=lambda wildcards: fix_container_path( + os.path.join( + DATAPATH, + wildcards.sample_id, + f"{wildcards.sample_id}.penncnv.input.tsv", + ), + "data", + ), shell: #TODO check if clause 'echo -e "{params.sample_docker_path}\t{params.sex_info[2]}" > {output}; ' @@ -17,80 +28,133 @@ rule prep_PennCNV_sexfile: # - extract tsv SNP file from vcf rule prep_PennCNV_input: input: - vcf=cnv_vcf_input_function('PennCNV') + vcf=cnv_vcf_input_function("PennCNV"), output: - tsv=temp(os.path.join(DATAPATH,"{sample_id}","{sample_id}.penncnv.input.tsv")) - log: - os.path.join(LOGPATH,"PennCNV","{sample_id}","input.log") + tsv=temp(os.path.join(DATAPATH, "{sample_id}", "{sample_id}.penncnv.input.tsv")), + log: + os.path.join(LOGPATH, "PennCNV", "{sample_id}", "input.log"), conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","vembrane.yaml") + "../envs/vembrane.yaml" params: - filter = get_tool_filter_settings('PennCNV') - #TODO: need to remove pseudo-autosomal regions form X & Y + filter=get_tool_filter_settings("PennCNV"), + # TODO: need to remove pseudo-autosomal regions form X & Y shell: - 'vembrane filter \'"PASS" in FILTER\' {input.vcf} 2> {log}|' - 'vembrane table --header \'Name, Chr, Position, B Allele Freq, Log R Ratio\'' + "vembrane filter '\"PASS\" in FILTER' {input.vcf} 2> {log}|" + "vembrane table --header 'Name, Chr, Position, B Allele Freq, Log R Ratio'" ' --long \'ID, CHROM, POS, FORMAT["BAF"][SAMPLE], FORMAT["LRR"][SAMPLE]\' > {output.tsv} 2>> {log}' + + # # Alternative: # wrapper: # "v3.14.1/bio/vembrane/table" + # - run auto, x & Y calling rule run_PennCNV: input: - tsv=os.path.join(DATAPATH,"{sample_id}","{sample_id}.penncnv.input.tsv"), - sexfile=os.path.join(DATAPATH,"{sample_id}","{sample_id}.penncnv.sexfile.txt"), - pfb=config['static_data']['penncnv_pfb_file'], - gcmodel=config['static_data']['penncnv_GCmodel_file'] + tsv=os.path.join(DATAPATH, "{sample_id}", "{sample_id}.penncnv.input.tsv"), + sexfile=os.path.join(DATAPATH, "{sample_id}", "{sample_id}.penncnv.sexfile.txt"), + pfb=config["static_data"]["penncnv_pfb_file"], + gcmodel=config["static_data"]["penncnv_GCmodel_file"], output: - tsv=temp(os.path.join(DATAPATH,"{sample_id}","{sample_id}.penncnv-{chr}.tsv")), - err=os.path.join(LOGPATH,"PennCNV","{sample_id}","{chr}.error.log") - threads: get_tool_resource('PennCNV','threads') + tsv=temp(os.path.join(DATAPATH, "{sample_id}", "{sample_id}.penncnv-{chr}.tsv")), + err=os.path.join(LOGPATH, "PennCNV", "{sample_id}", "{chr}.error.log"), + threads: get_tool_resource("PennCNV", "threads") resources: - runtime=get_tool_resource('PennCNV','runtime'), - mem_mb=get_tool_resource('PennCNV','memory'), - partition=get_tool_resource('PennCNV','partition') + runtime=get_tool_resource("PennCNV", "runtime"), + mem_mb=get_tool_resource("PennCNV", "memory"), + partition=get_tool_resource("PennCNV", "partition"), wildcard_constraints: - chr='chrx|chry|auto' + chr="chrx|chry|auto", params: - filter=get_tool_filter_settings('PennCNV'), - chrom=lambda wildcards: '' if wildcards.chr == 'auto' else '-' + wildcards.chr, + filter=get_tool_filter_settings("PennCNV"), + chrom=lambda wildcards: "" if wildcards.chr == "auto" else "-" + wildcards.chr, #LOH calling is optional, also disabled for male sex chromosomes - do_loh=lambda wildcards: '' if (wildcards.chr != 'auto' and get_ref_id(wildcards,True)[2] == 'm' and - config['settings']['PennCNV']['enable_LOH_calls']) else '-loh', - snakedir = fix_container_path(SNAKEDIR, 'snakedir'), - tsvout=lambda wildcards: fix_container_path(os.path.join(DATAPATH, wildcards.sample_id, wildcards.sample_id+".penncnv-"+wildcards.chr+".tsv"),'data'), - tsvin=lambda wildcards: fix_container_path(os.path.join(DATAPATH, wildcards.sample_id, wildcards.sample_id+".penncnv.input.tsv"),'data'), - pfb=fix_container_path(config['static_data']['penncnv_pfb_file'],'static'), - sexfile=lambda wildcards: '' if wildcards.chr == 'auto' else '--sexfile ' + str(fix_container_path(os.path.join(DATAPATH, wildcards.sample_id, wildcards.sample_id+".penncnv.sexfile.txt"),'data')), - gcmodel=fix_container_path(config['static_data']['penncnv_GCmodel_file'],'static'), - logerr=lambda wildcards:fix_container_path(os.path.join(LOGPATH,"PennCNV", wildcards.sample_id, wildcards.chr+".error.log"),'logs'), - logout=lambda wildcards:fix_container_path(os.path.join(LOGPATH,"PennCNV", wildcards.sample_id, wildcards.chr+".out.log"),'logs') + do_loh=lambda wildcards: ( + "" + if ( + wildcards.chr != "auto" + and get_ref_id(wildcards, True)[2] == "m" + and config["settings"]["PennCNV"]["enable_LOH_calls"] + ) + else "-loh" + ), + snakedir=fix_container_path(SNAKEDIR, "snakedir"), + tsvout=lambda wildcards: fix_container_path( + os.path.join( + DATAPATH, + wildcards.sample_id, + wildcards.sample_id + ".penncnv-" + wildcards.chr + ".tsv", + ), + "data", + ), + tsvin=lambda wildcards: fix_container_path( + os.path.join( + DATAPATH, + wildcards.sample_id, + wildcards.sample_id + ".penncnv.input.tsv", + ), + "data", + ), + pfb=fix_container_path(config["static_data"]["penncnv_pfb_file"], "static"), + sexfile=lambda wildcards: ( + "" + if wildcards.chr == "auto" + else "--sexfile " + + str( + fix_container_path( + os.path.join( + DATAPATH, + wildcards.sample_id, + wildcards.sample_id + ".penncnv.sexfile.txt", + ), + "data", + ) + ) + ), + gcmodel=fix_container_path( + config["static_data"]["penncnv_GCmodel_file"], "static" + ), + logerr=lambda wildcards: fix_container_path( + os.path.join( + LOGPATH, "PennCNV", wildcards.sample_id, wildcards.chr + ".error.log" + ), + "logs", + ), + logout=lambda wildcards: fix_container_path( + os.path.join( + LOGPATH, "PennCNV", wildcards.sample_id, wildcards.chr + ".out.log" + ), + "logs", + ), log: - err=os.path.join(LOGPATH,"PennCNV","{sample_id}","{chr}.error.log"), - out=os.path.join(LOGPATH,"PennCNV","{sample_id}","{chr}.out.log") + err=os.path.join(LOGPATH, "PennCNV", "{sample_id}", "{chr}.error.log"), + out=os.path.join(LOGPATH, "PennCNV", "{sample_id}", "{chr}.out.log"), container: "docker://genomicslab/penncnv" shell: - '/home/user/PennCNV/detect_cnv.pl -test {params.do_loh} {params.chrom} -confidence ' - '-hmm {params.snakedir}/supplemental-files/hhall_loh.hmm -pfb {params.pfb} -gcmodel {params.gcmodel} ' - '{params.sexfile} {params.tsvin} -out {params.tsvout} > {params.logout} 2> {params.logerr}' + "/home/user/PennCNV/detect_cnv.pl -test {params.do_loh} {params.chrom} -confidence " + "-hmm {params.snakedir}/supplemental-files/hhall_loh.hmm -pfb {params.pfb} -gcmodel {params.gcmodel} " + "{params.sexfile} {params.tsvin} -out {params.tsvout} > {params.logout} 2> {params.logerr}" + +def get_penncnv_output(wildcards, files="tsv"): + _, _, sex, _ = get_ref_id(wildcards, True) + chrs = ["auto", "chrx"] + if sex == "m": + chrs.append("chry") -def get_penncnv_output(wildcards, files = 'tsv'): - _, _, sex, _ = get_ref_id(wildcards, True) - chrs = ['auto','chrx'] - if sex == 'm': chrs.append('chry') - - if files == 'tsv': + if files == "tsv": return expand( - os.path.join(DATAPATH,"{sample_id}","{sample_id}.penncnv-{chr}.tsv"), - chr=chrs, sample_id=wildcards.sample_id + os.path.join(DATAPATH, "{sample_id}", "{sample_id}.penncnv-{chr}.tsv"), + chr=chrs, + sample_id=wildcards.sample_id, ) - elif files == 'log': + elif files == "log": return expand( - os.path.join(LOGPATH,"PennCNV","{sample_id}","{chr}.error.log"), - chr=chrs, sample_id=wildcards.sample_id + os.path.join(LOGPATH, "PennCNV", "{sample_id}", "{chr}.error.log"), + chr=chrs, + sample_id=wildcards.sample_id, ) else: raise ValueError("Invalid file type requested: {}".format(files)) @@ -99,16 +163,18 @@ def get_penncnv_output(wildcards, files = 'tsv'): # - combine calls and write out as vcf rule combined_PennCNV_output: input: - vcf=cnv_vcf_input_function('PennCNV'), + vcf=cnv_vcf_input_function("PennCNV"), tsvs=get_penncnv_output, #logs=get_penncnv_output(wildcards, 'log') output: - vcf=os.path.join(DATAPATH,"{sample_id}","{sample_id}.CNV_calls.PennCNV.vcf.gz"), + vcf=os.path.join( + DATAPATH, "{sample_id}", "{sample_id}.CNV_calls.PennCNV.vcf.gz" + ), # TODO: how to best handle this? > separate rule ?! # stats=os.path.join(DATAPATH,"{sample_id}","{sample_id}.CNV_calls.penncnv.stats.tsv") log: - err=os.path.join(LOGPATH,"PennCNV","{sample_id}","combine.error.log"), + err=os.path.join(LOGPATH, "PennCNV", "{sample_id}", "combine.error.log"), conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","general-R.yaml") - script: - "../scripts/combine_penncnv.R" \ No newline at end of file + "../envs/general-R.yaml" + script: + "../scripts/combine_penncnv.R" diff --git a/stemcnv_check/rules/report_generation.smk b/stemcnv_check/rules/report_generation.smk index 878c7f6..67ea1f9 100644 --- a/stemcnv_check/rules/report_generation.smk +++ b/stemcnv_check/rules/report_generation.smk @@ -1,45 +1,67 @@ - def get_report_sample_input(wildcards): sample_id, ref_id, sex, ref_sex = get_ref_id(wildcards, True) - report_settings = config['reports'][wildcards.report] + report_settings = config["reports"][wildcards.report] - sample_files = expand([ + sample_files = expand( + [ os.path.join(DATAPATH, "{ids}", "{ids}.combined-cnv-calls.tsv"), os.path.join(DATAPATH, "{ids}", "{ids}.stats.txt"), - os.path.join(LOGPATH, "PennCNV", "{ids}","{chrs}.error.log"), - #TODO: replace with annotated VCF - os.path.join(DATAPATH, "{ids}", "{ids}.processed-SNP-data.{filter}-filter.vcf") + os.path.join(LOGPATH, "PennCNV", "{ids}", "{chrs}.error.log"), + # TODO: replace with annotated VCF + os.path.join( + DATAPATH, "{ids}", "{ids}.processed-SNP-data.{filter}-filter.vcf" + ), # os.path.join(DATAPATH, "{ids}", "{ids}.processed-data.tsv"), # os.path.join(DATAPATH, "{ids}", "{ids}.filtered-data-{filter}.tsv") ], - ids = (sample_id, ref_id) if ref_id else (sample_id,), - chrs = ['auto', 'chrx'] + (['chry'] if sex == 'm' else []), - filter = get_tool_filter_settings(f"report:{wildcards.report}:call.data.and.plots"), + ids=(sample_id, ref_id) if ref_id else (sample_id,), + chrs=["auto", "chrx"] + (["chry"] if sex == "m" else []), + filter=get_tool_filter_settings( + f"report:{wildcards.report}:call.data.and.plots" + ), ) - incl_sections = config_extract(('include_sections', ), report_settings, config['reports']['__default__']) - excl_sections = config_extract(('exclude_sections', ), report_settings, config['reports']['__default__']) - do_snp_clustering = (incl_sections == '__all__' or 'SNP.dendrogram' in incl_sections) and not 'SNP.dendrogram' in excl_sections + incl_sections = config_extract( + ("include_sections",), report_settings, config["reports"]["__default__"] + ) + excl_sections = config_extract( + ("exclude_sections",), report_settings, config["reports"]["__default__"] + ) + do_snp_clustering = ( + incl_sections == "__all__" or "SNP.dendrogram" in incl_sections + ) and not "SNP.dendrogram" in excl_sections if do_snp_clustering: - extra_sample_def = config_extract(('SNP_comparison', 'extra_samples'), report_settings, config['reports']['__default__']) + extra_sample_def = config_extract( + ("SNP_comparison", "extra_samples"), + report_settings, + config["reports"]["__default__"], + ) ids = collect_SNP_cluster_ids(sample_id, extra_sample_def, sample_data_full) # VCF has both filtered & unfiltered SNPs now # if not config_extract(('SNP_comparison', 'ignore_filter'), report_settings, config['reports']['__default__']): sample_files += expand( - [os.path.join(DATAPATH, "{ids}", "{ids}.processed-SNP-data.{filter}-filter.vcf")], - ids=ids, filter = get_tool_filter_settings(f"report:{wildcards.report}:SNP_comparison") + [ + os.path.join( + DATAPATH, "{ids}", "{ids}.processed-SNP-data.{filter}-filter.vcf" + ) + ], + ids=ids, + filter=get_tool_filter_settings( + f"report:{wildcards.report}:SNP_comparison" + ), ) - if wildcards.ext == 'pdf': + if wildcards.ext == "pdf": sample_files += [os.path.join(LOGPATH, "report", "_latex_installation_check")] return sample_files + rule check_latex_installation: output: - os.path.join(LOGPATH, "report", "_latex_installation_check") + os.path.join(LOGPATH, "report", "_latex_installation_check"), conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","general-R.yaml") + "../envs/general-R.yaml" shell: """ Rscript - << 'EOF' @@ -60,26 +82,25 @@ EOF rule knit_report: input: - get_report_sample_input + get_report_sample_input, output: - report=os.path.join(DATAPATH, "{sample_id}", "{sample_id}.{report}.{ext}") , + report=os.path.join(DATAPATH, "{sample_id}", "{sample_id}.{report}.{ext}"), plots=directory(os.path.join(DATAPATH, "{sample_id}", "{report}-{ext}_images")), # -> this extra output is optional now! #summary=os.path.join(DATAPATH, "{sample_id}", "{sample_id}.summary-check.tsv") wildcard_constraints: - ext="(pdf|html)" - threads: get_tool_resource('knitr', 'threads') + ext="(pdf|html)", + threads: get_tool_resource("knitr", "threads") # params: # config = get_report_config resources: - runtime=get_tool_resource('knitr', 'runtime'), - mem_mb=get_tool_resource('knitr', 'memory'), - partition=get_tool_resource('knitr', 'partition') + runtime=get_tool_resource("knitr", "runtime"), + mem_mb=get_tool_resource("knitr", "memory"), + partition=get_tool_resource("knitr", "partition"), log: err=os.path.join(LOGPATH, "report", "{sample_id}", "{report}-{ext}.error.log"), - out=os.path.join(LOGPATH, "report", "{sample_id}", "{report}-{ext}.out.log") + out=os.path.join(LOGPATH, "report", "{sample_id}", "{report}-{ext}.out.log"), conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","general-R.yaml") + "../envs/general-R.yaml" shell: "Rscript {SNAKEDIR}/scripts/knit_report.R {wildcards.sample_id} {wildcards.report} {CONFIGFILE} > {log.out} 2> {log.err}" - diff --git a/stemcnv_check/rules/staticdata_creation.smk b/stemcnv_check/rules/staticdata_creation.smk index c085ac4..625d0a0 100644 --- a/stemcnv_check/rules/staticdata_creation.smk +++ b/stemcnv_check/rules/staticdata_creation.smk @@ -5,26 +5,28 @@ from pathlib import Path import tempfile from stemcnv_check import STEM_CNV_CHECK -DOWNLOAD_DIR = config['TMPDIR'] if 'TMPDIR' in config else tempfile.mkdtemp() -GENOME = config['genome'] +DOWNLOAD_DIR = config["TMPDIR"] if "TMPDIR" in config else tempfile.mkdtemp() +GENOME = config["genome"] # ================================================================ + def fix_container_path(path_in, bound_to): path_in = Path(path_in) - if bound_to == 'static': + if bound_to == "static": rel_path = path_in.name else: local_target = { - 'snakedir': Path(importlib.resources.files(STEM_CNV_CHECK)), - 'tmp': Path(DOWNLOAD_DIR) + "snakedir": Path(importlib.resources.files(STEM_CNV_CHECK)), + "tmp": Path(DOWNLOAD_DIR), }[bound_to].absolute() rel_path = path_in.absolute().relative_to(local_target) - return Path('/outside/') / bound_to / rel_path + return Path("/outside/") / bound_to / rel_path + -# +# # rule all: # input: # config['genomeInfo_file'], @@ -35,14 +37,17 @@ def fix_container_path(path_in, bound_to): # config['genome_gtf_file'] -#Note: PennCNV does not seem to work with UCSC chromosome style in PFB file +# Note: PennCNV does not seem to work with UCSC chromosome style in PFB file rule create_pfb_from_vcf: - input: config['vcf_input_file'] - output: config['penncnv_pfb_file'] + input: + config["vcf_input_file"], + output: + config["penncnv_pfb_file"], conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","general-R.yaml") - #shell: "Rscript {SNAKEDIR}/scripts/make_PFB_from_vcf.R {input} {output}" - shell: """ + "../envs/general-R.yaml" + # shell: "Rscript {SNAKEDIR}/scripts/make_PFB_from_vcf.R {input} {output}" + shell: + """ Rscript - << 'EOF' suppressMessages(library(tidyverse)) suppressMessages(library(vcfR)) @@ -79,14 +84,17 @@ vcf.info %>% EOF """ -#FUTURE: -#also get the gc5base.bw file from UCSCand make the GC model from that? + +# FUTURE: +# also get the gc5base.bw file from UCSCand make the GC model from that? # -> PennCNV comes with a wig2gc5base python script (though that has a hard coded 'source' file in it? rule create_gcmodel_file: - input: config['penncnv_pfb_file'] - output: config['penncnv_GCmodel_file'] + input: + config["penncnv_pfb_file"], + output: + config["penncnv_GCmodel_file"], params: - download_path = fix_container_path(DOWNLOAD_DIR, 'tmp') + download_path=fix_container_path(DOWNLOAD_DIR, "tmp"), container: "docker://genomicslab/penncnv" shell: @@ -99,19 +107,20 @@ rule create_gcmodel_file: /home/user/PennCNV/cal_gc_snp.pl {params.download_path}/{GENOME}.gc5Base.txt {input} -out {output} """ + rule create_array_info_file: input: - pfb = config['penncnv_pfb_file'], - chromInfo = config['genomeInfo_file'] + pfb=config["penncnv_pfb_file"], + chromInfo=config["genomeInfo_file"], output: - density = config['array_density_file'], - gaps = config['array_gaps_file'] + density=config["array_density_file"], + gaps=config["array_gaps_file"], params: - min_gap_size = config['min_gap_size'], - density_windows = config['density_windows'], - genome = config['genome'] + min_gap_size=config["min_gap_size"], + density_windows=config["density_windows"], + genome=config["genome"], conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","general-R.yaml") + "../envs/general-R.yaml" shell: """ Rscript - << 'EOF' @@ -163,18 +172,25 @@ write_bed(density, '{output.density}') EOF """ + rule gencode_v45_gtf_download: - output: config['genome_gtf_file'] + output: + config["genome_gtf_file"], # Source gtf GRCh38: params: - ftp_base = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/", - release_path = "gencode.v45.basic.annotation.gtf.gz" if GENOME == "hg38" else "GRCh37_mapping/gencode.v45lift37.basic.annotation.gtf.gz" + ftp_base="https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/", + release_path=( + "gencode.v45.basic.annotation.gtf.gz" + if GENOME == "hg38" + else "GRCh37_mapping/gencode.v45lift37.basic.annotation.gtf.gz" + ), shell: """ wget {params.ftp_base}/{params.release_path} -O {output} 2> /dev/null # gunzip {output}.gz """ + # #FIXME: the gencode fa.gz files are gzip not bgzip compressed # rule gencode_v45_genomeFasta_download: # output: config['genome_fasta_file'] @@ -187,10 +203,12 @@ rule gencode_v45_gtf_download: # wget {params.ftp_base}/{params.release_path} -O {output} 2> /dev/null # """ + rule ucsc_goldenpath_download: - output: temp("{DOWNLOAD_DIR}/{genome}.{filename}.txt") + output: + temp("{DOWNLOAD_DIR}/{genome}.{filename}.txt"), wildcard_constraints: - genome = "hg19|hg38" + genome="hg19|hg38", shell: """ wget 'https://hgdownload.cse.ucsc.edu/goldenPath/{wildcards.genome}/database/{wildcards.filename}.txt.gz' -O {output}.gz 2> /dev/null @@ -200,14 +218,15 @@ rule ucsc_goldenpath_download: rule create_genome_info_file: input: - cytobands = ancient(os.path.join(DOWNLOAD_DIR, f"{GENOME}.cytoBand.txt")), + cytobands=ancient(os.path.join(DOWNLOAD_DIR, f"{GENOME}.cytoBand.txt")), #cytoBandIdeo.txt has info for non-assmebled chromosomes #centromer = os.path.join(DOWNLOAD_DIR, f"{GENOME}.centromeres.txt"), #Only exists for hg38 - chrominfo = ancient(os.path.join(DOWNLOAD_DIR, f"{GENOME}.chromInfo.txt")) + chrominfo=ancient(os.path.join(DOWNLOAD_DIR, f"{GENOME}.chromInfo.txt")), #Note: the chromAlias.txt file might be useful if people use strange chr-/seqnames - output: config['genomeInfo_file'] + output: + config["genomeInfo_file"], conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","general-R.yaml") + "../envs/general-R.yaml" shell: """ Rscript - << 'EOF' @@ -239,25 +258,38 @@ write_tsv(chrominfo, "{output}") EOF """ + rule download_vep_fasta: output: - os.path.join(config['vep_fasta_path'], 'homo_sapiens', '112_{genome}', 'Homo_sapiens.{genome}.dna.toplevel.fa.gz') + os.path.join( + config["vep_fasta_path"], + "homo_sapiens", + "112_{genome}", + "Homo_sapiens.{genome}.dna.toplevel.fa.gz", + ), conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","vep-annotation.yaml") + importlib.resources.files(STEM_CNV_CHECK).joinpath( + "envs", "vep-annotation.yaml" + ) params: - fasta_path = config['vep_fasta_path'] + fasta_path=config["vep_fasta_path"], shell: "vep_install -a f -s homo_sapiens -y {wildcards.genome} -c {params.fasta_path}" + rule download_vep_cache: output: - done = os.path.join(config['vep_cache_path'], '.{genome}.done'), - folder = directory(os.path.join(config['vep_cache_path'], 'homo_sapiens', '112_{genome}')) + done=os.path.join(config["vep_cache_path"], ".{genome}.done"), + folder=directory( + os.path.join(config["vep_cache_path"], "homo_sapiens", "112_{genome}") + ), conda: - importlib.resources.files(STEM_CNV_CHECK).joinpath("envs","vep-annotation.yaml") + importlib.resources.files(STEM_CNV_CHECK).joinpath( + "envs", "vep-annotation.yaml" + ) params: - cache_path = config['vep_cache_path'] + cache_path=config["vep_cache_path"], shell: # -r {params.cache_path}/plugins; check if needed or if it follows -c # -a f > automatically get ensembl genome fasta; can replace fasta & gtf - "vep_install -a cp -g DosageSensitivity -s homo_sapiens -y {wildcards.genome} -c {params.cache_path} --CONVERT && touch {output.done}" \ No newline at end of file + "vep_install -a cp -g DosageSensitivity -s homo_sapiens -y {wildcards.genome} -c {params.cache_path} --CONVERT && touch {output.done}"