From e8663fb61eef73d1bffa107dde73ee353e3a0b26 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Tue, 14 Jan 2020 06:31:57 -0700 Subject: [PATCH] Add file extension for SnakeMake (#3953) * Add file extension for SnakeMake Previously a file name was defined for [SnakeMake[(snakemake-wrappers.readthedocs.io): https://github.com/github/linguist/pull/1834 Currently, the canonical extension is `smk` (see [this discussion](https://groups.google.com/forum/#!topic/Snakemake/segLE-RlV_s) with the author (@johanneskoester) of SnakeMake, and the [FAQ](http://snakemake.readthedocs.io/en/stable/project_info/faq.html#how-do-i-enable-syntax-highlighting-in-vim-for-snakefiles)). * Adding two Snakemake (smk) example files --- lib/linguist/languages.yml | 1 + samples/Python/snakemake-calling.smk | 68 +++++++++++++++++++++ samples/Python/snakemake-mapping.smk | 90 ++++++++++++++++++++++++++++ 3 files changed, 159 insertions(+) create mode 100644 samples/Python/snakemake-calling.smk create mode 100644 samples/Python/snakemake-mapping.smk diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml index 4dea6c60d4..f3488c959c 100755 --- a/lib/linguist/languages.yml +++ b/lib/linguist/languages.yml @@ -4140,6 +4140,7 @@ Python: - ".pyt" - ".pyw" - ".rpy" + - ".smk" - ".spec" - ".tac" - ".wsgi" diff --git a/samples/Python/snakemake-calling.smk b/samples/Python/snakemake-calling.smk new file mode 100644 index 0000000000..886d51a1b5 --- /dev/null +++ b/samples/Python/snakemake-calling.smk @@ -0,0 +1,68 @@ +# Source: https://raw.githubusercontent.com/snakemake-workflows/dna-seq-gatk-variant-calling/master/rules/calling.smk +# Accessed: Jan 10 2020 by Nils Homer +# License: MIT (https://github.com/snakemake-workflows/dna-seq-gatk-variant-calling/blob/master/LICENSE) + +if "restrict-regions" in config["processing"]: + rule compose_regions: + input: + config["processing"]["restrict-regions"] + output: + "called/{contig}.regions.bed" + conda: + "../envs/bedops.yaml" + shell: + "bedextract {wildcards.contig} {input} > {output}" + + +rule call_variants: + input: + bam=get_sample_bams, + ref=config["ref"]["genome"], + known=config["ref"]["known-variants"], + regions="called/{contig}.regions.bed" if config["processing"].get("restrict-regions") else [] + output: + gvcf=protected("called/{sample}.{contig}.g.vcf.gz") + log: + "logs/gatk/haplotypecaller/{sample}.{contig}.log" + params: + extra=get_call_variants_params + wrapper: + "0.27.1/bio/gatk/haplotypecaller" + + +rule combine_calls: + input: + ref=config["ref"]["genome"], + gvcfs=expand("called/{sample}.{{contig}}.g.vcf.gz", sample=samples.index) + output: + gvcf="called/all.{contig}.g.vcf.gz" + log: + "logs/gatk/combinegvcfs.{contig}.log" + wrapper: + "0.27.1/bio/gatk/combinegvcfs" + + +rule genotype_variants: + input: + ref=config["ref"]["genome"], + gvcf="called/all.{contig}.g.vcf.gz" + output: + vcf=temp("genotyped/all.{contig}.vcf.gz") + params: + extra=config["params"]["gatk"]["GenotypeGVCFs"] + log: + "logs/gatk/genotypegvcfs.{contig}.log" + wrapper: + "0.27.1/bio/gatk/genotypegvcfs" + + +rule merge_variants: + input: + ref=get_fai(), # fai is needed to calculate aggregation over contigs below + vcfs=lambda w: expand("genotyped/all.{contig}.vcf.gz", contig=get_contigs()), + output: + vcf="genotyped/all.vcf.gz" + log: + "logs/picard/merge-genotyped.log" + wrapper: + "0.40.2/bio/picard/mergevcfs" diff --git a/samples/Python/snakemake-mapping.smk b/samples/Python/snakemake-mapping.smk new file mode 100644 index 0000000000..e48bb27fb7 --- /dev/null +++ b/samples/Python/snakemake-mapping.smk @@ -0,0 +1,90 @@ +# Source: https://raw.githubusercontent.com/snakemake-workflows/dna-seq-gatk-variant-calling/master/rules/mapping.smk +# Accessed: Jan 10 2020 by Nils Homer +# License: MIT (https://github.com/snakemake-workflows/dna-seq-gatk-variant-calling/blob/master/LICENSE) + +rule trim_reads_se: + input: + unpack(get_fastq) + output: + temp("trimmed/{sample}-{unit}.fastq.gz") + params: + extra="", + **config["params"]["trimmomatic"]["se"] + log: + "logs/trimmomatic/{sample}-{unit}.log" + wrapper: + "0.30.0/bio/trimmomatic/se" + + +rule trim_reads_pe: + input: + unpack(get_fastq) + output: + r1=temp("trimmed/{sample}-{unit}.1.fastq.gz"), + r2=temp("trimmed/{sample}-{unit}.2.fastq.gz"), + r1_unpaired=temp("trimmed/{sample}-{unit}.1.unpaired.fastq.gz"), + r2_unpaired=temp("trimmed/{sample}-{unit}.2.unpaired.fastq.gz"), + trimlog="trimmed/{sample}-{unit}.trimlog.txt" + params: + extra=lambda w, output: "-trimlog {}".format(output.trimlog), + **config["params"]["trimmomatic"]["pe"] + log: + "logs/trimmomatic/{sample}-{unit}.log" + wrapper: + "0.30.0/bio/trimmomatic/pe" + + +rule map_reads: + input: + reads=get_trimmed_reads + output: + temp("mapped/{sample}-{unit}.sorted.bam") + log: + "logs/bwa_mem/{sample}-{unit}.log" + params: + index=config["ref"]["genome"], + extra=get_read_group, + sort="samtools", + sort_order="coordinate" + threads: 8 + wrapper: + "0.27.1/bio/bwa/mem" + + +rule mark_duplicates: + input: + "mapped/{sample}-{unit}.sorted.bam" + output: + bam=temp("dedup/{sample}-{unit}.bam"), + metrics="qc/dedup/{sample}-{unit}.metrics.txt" + log: + "logs/picard/dedup/{sample}-{unit}.log" + params: + config["params"]["picard"]["MarkDuplicates"] + wrapper: + "0.26.1/bio/picard/markduplicates" + + +rule recalibrate_base_qualities: + input: + bam=get_recal_input(), + bai=get_recal_input(bai=True), + ref=config["ref"]["genome"], + known=config["ref"]["known-variants"] + output: + bam=protected("recal/{sample}-{unit}.bam") + params: + extra=get_regions_param() + config["params"]["gatk"]["BaseRecalibrator"] + log: + "logs/gatk/bqsr/{sample}-{unit}.log" + wrapper: + "0.27.1/bio/gatk/baserecalibrator" + + +rule samtools_index: + input: + "{prefix}.bam" + output: + "{prefix}.bam.bai" + wrapper: + "0.27.1/bio/samtools/index"