-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvarcall.Snakefile
325 lines (299 loc) · 11.9 KB
/
varcall.Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# @author: Vivek Bhardwaj (@vivekbhr)
# @date: October 31, 2016
# @desc: Snakemake pipeline for https://www.broadinstitute.org/gatk/guide/article?id=3891
# Based off of code by https://github.com/slowkow/snakefiles/blob/master/star/star.snakefile
#
# Usage: snakemake --snakefile Snakefile.varcall --cores 20 --jobs 3 -c "SlurmEasy -t {threads} -n {rule}"
## needs, STAR, picard, GATK
from os.path import join, dirname
from subprocess import check_output
# Globals ---------------------------------------------------------------------
THREADS=16
# Full path to genome fasta.
GENOME = config['genome_fasta']
# Full path to gene model annotations for splice aware alignment.
GTF = config['gtf']
# Full path to a folder that holds all of your FASTQ files.
FASTQ_DIR = config['indir']
# Full path to output folder.
OUTPUT_DIR = config['outdir']
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
# A snakemake regular expression matching the forward mate FASTQ files.
SAMPLES, = glob_wildcards(join(FASTQ_DIR, '{sample,[^/]+}_R1.fastq.gz'))
print(SAMPLES)
# Patterns for the 1st mate and the 2nd mate using the 'sample' wildcard.
PATTERN_R1 = '{sample}_R1.fastq.gz'
PATTERN_R2 = '{sample}_R2.fastq.gz'
#rules -----------------------------------------------------------------------
rule all:
input:
expand(join(OUTPUT_DIR, '{sample}', 'pass1', 'Aligned.out.sam'), sample = SAMPLES),
expand(join(OUTPUT_DIR, '{sample}', 'pass2', 'Aligned.out.sam'), sample = SAMPLES),
expand(join(OUTPUT_DIR, '{sample}', 'Aligned.out.dedupped.bam'), sample = SAMPLES),
expand(join(OUTPUT_DIR, '{sample}', 'Aligned.out.split.bam'), sample = SAMPLES),
expand(join(OUTPUT_DIR, '{sample}', 'call.filtered.vcf'), sample = SAMPLES),
join(dirname(GENOME), 'star')
# Make an index of the genome for STAR.
rule star_index:
input:
genome = GENOME
output:
index = join(dirname(GENOME), 'star')
log:
join(dirname(GENOME), 'star', 'star.index.log')
threads:
THREADS
run:
# Write stderr and stdout to the log file.
shell('STAR'
' --runThreadN {threads}'
' --runMode genomeGenerate'
' --genomeDir ' + join(dirname(GENOME), 'star') +
' --genomeFastaFiles {input.genome}'
' > {log} 2>&1')
# 1. Map paired-end RNA-seq reads to the genome.
# 2. Count the number of reads supporting each splice junction.
rule star_pass1:
input:
r1 = join(FASTQ_DIR, PATTERN_R1),
r2 = join(FASTQ_DIR, PATTERN_R2),
gtf = GTF
output:
sam = join(OUTPUT_DIR, '{sample}', 'pass1', 'Aligned.out.sam'),
sj = join(OUTPUT_DIR, '{sample}', 'pass1', 'SJ.out.tab')
log:
join(OUTPUT_DIR, '{sample}', 'pass1', 'star.map.log')
threads:
THREADS
run:
# Map reads with STAR.
shell('STAR'
' --runThreadN {threads}'
' --genomeDir ' + join(dirname(GENOME), 'star') +
' --sjdbGTFfile {input.gtf}'
' --readFilesCommand zcat'
' --readFilesIn {input.r1} {input.r2}'
# By default, this prefix is "./".
' --outFileNamePrefix ' + join(OUTPUT_DIR, '{wildcards.sample}', 'pass1') + '/'
# If exceeded, the read is considered unmapped.
' --outFilterMultimapNmax 20'
# Minimum overhang for unannotated junctions.
' --alignSJoverhangMin 8'
# Minimum overhang for annotated junctions.
' --alignSJDBoverhangMin 1'
# Maximum number of mismatches per pair.
' --outFilterMismatchNmax 999'
# Minimum intron length.
' --alignIntronMin 1'
# Maximum intron length.
' --alignIntronMax 1000000'
# Maximum genomic distance between mates.
' --alignMatesGapMax 1000000'
' > {log} 2>&1')
# 1. Map paired-end RNA-seq reads to the genome.
# 2. Make a coordinate sorted BAM with genomic coordinates.
# 3. Count the number of reads mapped to each gene.
# 4. Count the number of reads supporting each splice junction.
rule star_pass2:
input:
r1 = join(FASTQ_DIR, PATTERN_R1),
r2 = join(FASTQ_DIR, PATTERN_R2),
genomeDir = dirname(rules.star_index.output.index),
gtf = GTF,
sjs = expand(join(OUTPUT_DIR, '{sample}', 'pass1', 'SJ.out.tab'), sample = SAMPLES)
output:
sam = join(OUTPUT_DIR, '{sample}', 'pass2', 'Aligned.out.sam'),
counts = join(OUTPUT_DIR, '{sample}', 'pass2', 'ReadsPerGene.out.tab'),
sj = join(OUTPUT_DIR, '{sample}', 'pass2', 'SJ.out.tab')
log:
join(OUTPUT_DIR, '{sample}', 'pass2', 'star.map.log')
threads:
THREADS
run:
# Map reads with STAR.
shell('STAR'
' --runThreadN {threads}'
' --genomeDir ' + join(dirname(GENOME), 'star') +
' --sjdbGTFfile {input.gtf}'
' --readFilesCommand zcat'
' --readFilesIn {input.r1} {input.r2}'
' --sjdbFileChrStartEnd {input.sjs}'
# BAM file in transcript coords, in addition to genomic BAM file.
' --quantMode GeneCounts'
# Basic 2-pass mapping, with all 1st pass junctions inserted
# into the genome indices on the fly.
' --twopassMode Basic'
# By default, this prefix is "./".
' --outFileNamePrefix ' + join(OUTPUT_DIR, '{wildcards.sample}', 'pass2') + '/'
# If exceeded, the read is considered unmapped.
' --outFilterMultimapNmax 20'
# Minimum overhang for unannotated junctions.
' --alignSJoverhangMin 8'
# Minimum overhang for annotated junctions.
' --alignSJDBoverhangMin 1'
# Maximum number of mismatches per pair.
' --outFilterMismatchNmax 999'
# Minimum intron length.
' --alignIntronMin 1'
# Maximum intron length.
' --alignIntronMax 1000000'
# Maximum genomic distance between mates.
' --alignMatesGapMax 1000000'
#
' --limitSjdbInsertNsj 10000000'
' > {log} 2>&1')
# add read groups, sort
rule picard_cleanstar:
input:
sam =rules.star_pass2.output.sam
output:
bam = join(OUTPUT_DIR, '{sample}', 'Aligned.out.bam'),
log:
join(OUTPUT_DIR, '{sample}', 'picard.clean.log')
run:
shell('picard'
' AddOrReplaceReadGroups'
' I={input.sam}'
' O={output}'
' SORT_ORDER=coordinate CREATE_INDEX=false RGID=id RGLB=library RGPL=platform RGPU=machine RGSM=sample'
' > {log} 2>&1')
# mark duplicates
rule picard_markdup:
input:
bam =rules.picard_cleanstar.output.bam
output:
dupmarked = join(OUTPUT_DIR, '{sample}', 'Aligned.out.dedupped.bam'),
metrics = join(OUTPUT_DIR, '{sample}', 'Aligned.out.dedupped.metrics'),
log:
join(OUTPUT_DIR, '{sample}', 'picard.markdup.log')
run:
shell('picard'
' MarkDuplicates'
' I={input.bam}'
' O={output.dupmarked}'
' CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M={output.metrics}'
' > {log} 2>&1')
rule gatk_SplitNCigarReads:
input:
dupmarked =rules.picard_markdup.output.dupmarked,
genome = GENOME
output:
split = join(OUTPUT_DIR, '{sample}', 'Aligned.out.split.bam')
log:
join(OUTPUT_DIR, '{sample}', 'gatk.splitcigar.log')
run:
shell('java -Xmx16g -jar GenomeAnalysisTK.jar'
' -T SplitNCigarReads'
' -nt {threads}'
' -R {input.genome}'
' -I {input.dupmarked}'
' -o {output.split}'
' -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS'
' > {log} 2>&1')
# # indel realignment (OPTIONAL, ONLY TO SQEEZE A FEW MORE SNPS)
#rule gatk_realign_info:
# input:
# "mapping/{reference}/{prefix}.bam.bai",
# ref=GENOME,
# bam="mapping/{reference}/{prefix}.bam"
# output:
# temp("mapping/{reference,[^/]+}/{prefix}.realign.intervals")
# params:
# custom=config.get("params_gatk", "")
# log:
# "mapping/log/{reference}/{prefix}.realign_info.log"
# threads: 8
# shell:
# "module load GATK/3.5; java -jar /package/GenomeAnalysisTK-3.5/GenomeAnalysisTK.jar -T RealignerTargetCreator -R {input.ref} {params.custom} "
# "-nt {threads} "
# "-I {input.bam} -known {config[known_variants][dbsnp]} "
# "-o {output} >& {log}"
#rule gatk_realign_bam:
# input:
# ref=GENOME,
# bam="mapping/{reference}/{prefix}.bam",
# intervals="mapping/{reference}/{prefix}.realign.intervals"
# output:
# "mapping/{reference,[^/]+}/{prefix}.realigned.bam"
# params:
# custom=config.get("params_gatk", "")
# log:
# "mapping/log/{reference}/{prefix}.realign.log"
# shell:
# "module load GATK/3.5; java -jar /package/GenomeAnalysisTK-3.5/GenomeAnalysisTK.jar -T IndelRealigner -R {input.ref} {params.custom} "
# "-nt {threads} --disable_bam_indexing "
# "-I {input.bam} -targetIntervals {input.intervals} "
# "-o {output} >& {log}"
# base recalibration
#rule gatk_recalibrate_info:
# input:
# "mapping/{reference}/{prefix}.bam.bai",
# ref=GENOME,
# bam="mapping/{reference}/{prefix}.bam"
# output:
# temp("mapping/{reference,[^/]+}/{prefix}.recalibrate.grp")
# params:
# custom=config.get("params_gatk", "")
# log:
# "mapping/log/{reference}/{prefix}.recalibrate_info.log"
# threads: 8
# shell:
# "module load GATK/3.5; java -jar /package/GenomeAnalysisTK-3.5/GenomeAnalysisTK.jar -T BaseRecalibrator -R {input.ref} {params.custom} "
# "-nct {threads} "
# "-I {input.bam} -knownSites {config[known_variants][dbsnp]} "
# "-o {output} >& {log}"
#rule gatk_recalibrate_bam:
# input:
# ref=GENOME,
# bam="mapping/{reference}/{prefix}.bam",
# grp="mapping/{reference}/{prefix}.recalibrate.grp"
# output:
# "mapping/{reference,[^/]+}/{prefix}.recalibrated.bam"
# params:
# custom=config.get("params_gatk", "")
# log:
# "mapping/log/{reference}/{prefix}.recalibrate.log"
# threads: 8
# shell:
# "module load GATK/3.5; java -jar /package/GenomeAnalysisTK-3.5/GenomeAnalysisTK.jar -T PrintReads -R {input.ref} {params.custom} "
# "-nct {threads} "
# "--disable_bam_indexing "
# "-I {input.bam} -BQSR {input.grp} "
# "-o {output} >& {log}"
# variant calling
rule gatk_HaplotypeCaller:
input:
bam =rules.gatk_SplitNCigarReads.output.split,
genome = GENOME
output:
vcf = join(OUTPUT_DIR, '{sample}', 'call.vcf')
log:
join(OUTPUT_DIR, '{sample}', 'gatk.haplotypecaller.log')
run:
shell('java -Xmx16g -jar GenomeAnalysisTK.jar'
' -T HaplotypeCaller'
' -nct {threads}'
' -R {input.genome}'
' -I {input.bam}'
' -dontUseSoftClippedBases -stand_call_conf 20.0 -stand_emit_conf 20.0'
' -o {output.vcf}'
' > {log} 2>&1')
# variant filtering
rule gatk_VariantFiltration:
input:
vcf =rules.gatk_HaplotypeCaller.output.vcf,
genome = GENOME
output:
vcf = join(OUTPUT_DIR, '{sample}', 'call.filtered.vcf')
log:
join(OUTPUT_DIR, '{sample}', 'gatk.varfilter.log')
run:
shell('GenomeAnalysisTK.jar'
' -T VariantFiltration'
' -nt {threads}'
' -R {input.genome}'
' -V {input.vcf}'
' -window 35 -cluster 3 -filterName FS -filter "FS > 30.0" -filterName QD -filter "QD < 2.0"'
' -o {output.vcf}'
' > {log} 2>&1')