#!/usr/bin/env snakemake
import os
import pandas
import pyfaidx
from snakemake.utils import min_version
min_version("7")
# Load config file
configfile: "config/cappseq_snv_pipeline.yaml"
configpath = "config/cappseq_snv_pipeline.yaml"
# Check that the config file has all the required parameters
pathkeys = {"samplelist", "base_dir", "ref_genome", "hotspots_vcf", "capture_space", "ensembl", "unmatched_normal", "custom_enst", "vep_data", "fgbio_jar"}
for ckey, attribute in config["cappseq_snv_pipeline"].items():
if attribute == "__UPDATE__":
# Placeholder value in config. Warn user
raise AttributeError(f"\'__UPDATE__\' found for \'{ckey}\' in config file \'{configpath}\'. Please ensure the config file is updated with parameters relevant for your analysis")
# Check that required filepaths exist
if ckey in pathkeys:
if not os.path.exists(attribute):
raise AttributeError(f"Unable to locate \'{attribute}\' for key \'{ckey}\' in config file \'{configpath}\': No such file or directory")
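# For reference, a minimal sketch of the expected config layout (all values below are
# hypothetical placeholders; the real file lives at config/cappseq_snv_pipeline.yaml):
# cappseq_snv_pipeline:
#     samplelist: "config/samples.tsv"
#     base_dir: "results"
#     ref_genome: "ref/genome.fa"
#     ref_genome_ver: "GRCh38"
#     hotspots_vcf: "ref/hotspots.vcf.gz"
#     capture_space: "ref/panel_regions.bed"
#     ensembl: "ref/ensembl_data"
#     unmatched_normal: "ref/unmatched_normal.bam"
#     custom_enst: "ref/custom_enst.txt"
#     vep_data: "ref/vep_cache"
#     fgbio_jar: "tools/fgbio.jar"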
# Load input files
samplelist = config["cappseq_snv_pipeline"]["samplelist"]
samples = []
samplepaths = {}
normals = {}
normal_ids = {}
sample_uncons = {}
with open(samplelist) as f:
i = 0
for line in f:
        # Assuming a four-column sample file: sample name, consensus BAM/CRAM path,
        # pre-consensus (unmerged) BAM path, and matched normal BAM/CRAM path
i += 1
# Ignore comment lines
if line.startswith("#"):
continue
line = line.rstrip("\n").rstrip("\r")
cols = line.split("\t")
try:
sample_name = cols[0]
sample_path = cols[1]
unmerged_path = cols[2]
normal_path = cols[3]
except IndexError as e:
            raise AttributeError(f"Unable to parse line {i} of sample file '{samplelist}': Expected four tab-separated columns specifying sample name, consensus BAM path, pre-consensus BAM path, and matched normal BAM path") from e
        # Sanity check that the filepaths exist
        if not os.path.exists(sample_path):
            raise AttributeError(f"Unable to locate BAM/CRAM file '{sample_path}' for sample '{sample_name}': No such file or directory")
        if not os.path.exists(normal_path):
            raise AttributeError(f"Unable to locate normal BAM/CRAM file '{normal_path}' for sample '{sample_name}': No such file or directory")
        # Check for duplicate sample names
        if sample_name in samples:
            raise AttributeError(f"Duplicate sample name '{sample_name}' detected in sample file '{samplelist}'")
samples.append(sample_name)
samplepaths[sample_name] = sample_path
sample_uncons[sample_name] = unmerged_path
normals[sample_name] = normal_path
        # Store the sample name of the normal (as this will differ from the ctDNA sample name).
normal_name = os.path.basename(normal_path)
normal_name = normal_name.split(".")[0]
normal_ids[sample_name] = normal_name
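# An example samplelist row might look like this (tab-separated; paths are hypothetical):
# SAMPLE-01    /data/SAMPLE-01.consensus.bam    /data/SAMPLE-01.preconsensus.bam    /data/NORMAL-01.bam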
# Run variant calling
rule run_sage:
input:
tbam = lambda w: samplepaths[w.sample],
nbam = lambda w: normals[w.sample],
output:
vcf = config["cappseq_snv_pipeline"]["base_dir"] + "/01-SAGE/{sample}/{sample}.sage.vcf"
params:
ref_genome = config["cappseq_snv_pipeline"]["ref_genome"],
ref_genome_version = "38" if config["cappseq_snv_pipeline"]["ref_genome_ver"] == "GRCh38" else "37", # Should make more robust
# Panel regions
hotspots_vcf = config["cappseq_snv_pipeline"]["hotspots_vcf"],
panel_regions = config["cappseq_snv_pipeline"]["capture_space"],
ensembl = config["cappseq_snv_pipeline"]["ensembl"],
# Miscellaneous
normal_name = lambda w: normal_ids[w.sample],
max_depth = config["cappseq_snv_pipeline"]["max_depth"],
min_map = config["cappseq_snv_pipeline"]["min_map_qual"],
hard_vaf_cutoff = config["cappseq_snv_pipeline"]["tumor_min_vaf"],
soft_vaf_cutoff = config["cappseq_snv_pipeline"]["tumor_soft_min_vaf"],
min_norm_depth = 7
threads: 4
log:
config["cappseq_snv_pipeline"]["base_dir"] + "/logs/{sample}.sage_run.log"
conda:
"envs/sage.yaml"
shell:
"""SAGE -tumor {wildcards.sample} -tumor_bam {input.tbam} \
-out {output.vcf} -ref_genome {params.ref_genome} \
-ref_genome_version {params.ref_genome_version} \
-hotspots {params.hotspots_vcf} -panel_bed {params.panel_regions} \
-high_confidence_bed {params.panel_regions} \
-ensembl_data_dir {params.ensembl} \
-reference_bam {input.nbam} -reference {params.normal_name} \
-max_read_depth {params.max_depth} -min_map_quality {params.min_map} \
-hard_min_tumor_vaf {params.hard_vaf_cutoff} -hard_min_tumor_raw_alt_support 4 \
-panel_min_tumor_vaf {params.soft_vaf_cutoff} -panel_min_germline_depth {params.min_norm_depth} \
-hotspot_min_germline_depth {params.min_norm_depth} \
-high_confidence_min_tumor_vaf {params.soft_vaf_cutoff} \
        -bqr_min_map_qual {params.min_map} -threads {threads} > {log} 2>&1
"""
rule filter_sage:
input:
vcf = rules.run_sage.output.vcf
output:
vcf = config["cappseq_snv_pipeline"]["base_dir"] + "/02-vcfs/{sample}.sage.passed.vcf"
conda:
"envs/bcftools.yaml"
shell:
"""
        bcftools view -f PASS {input.vcf} -O v -o {output.vcf}
"""
# Flag positions with a high incidence of masked bases
rule flag_masked_pos:
input:
bam = lambda w: samplepaths[w.sample]
output:
bed_raw = temp(config["cappseq_snv_pipeline"]["base_dir"] + "/03-masked_pos/{sample}.maskedpos.bed"),
bed = config["cappseq_snv_pipeline"]["base_dir"] + "/03-masked_pos/{sample}.maskedpos.bed.gz"
params:
script = config["cappseq_snv_pipeline"]["mask_script"],
n_threshold = config["cappseq_snv_pipeline"]["mask_threshold"],
min_count = config["cappseq_snv_pipeline"]["mask_count"],
panel_regions = config["cappseq_snv_pipeline"]["capture_space"]
conda:
"envs/bcftools.yaml"
log:
config["cappseq_snv_pipeline"]["base_dir"] + "/logs/{sample}.maskpos.log"
shell:
"""
{params.script} --input {input.bam} --regions {params.panel_regions} --output {output.bed_raw} --count {params.min_count} --fraction {params.n_threshold} > {log} &&
bgzip -c {output.bed_raw} > {output.bed} && tabix -p bed {output.bed} >> {log}
"""
# Restrict to the captured regions and remove masked positions
rule restrict_to_capture:
input:
vcf = rules.filter_sage.output.vcf,
bed = rules.flag_masked_pos.output.bed
output:
vcf = config["cappseq_snv_pipeline"]["base_dir"] + "/04-capturespace/{sample}.capspace.vcf"
params:
panel_regions = config["cappseq_snv_pipeline"]["capture_space"]
conda:
"envs/bcftools.yaml"
shell:
"""
        bedtools intersect -a {input.vcf} -header -b {params.panel_regions} | bedtools intersect -a - -header -b {input.bed} -v | awk -F '\\t' '$4 !~ /N/ && $5 !~ /N/' | bcftools norm -m +any -O v -o {output.vcf}
"""
# Generate review BAM files containing only the reads supporting these variants
rule review_consensus_reads:
input:
bam_cons = lambda w: samplepaths[w.sample],
bam_uncons = lambda w: sample_uncons[w.sample],
vcf = rules.restrict_to_capture.output.vcf
output:
tmp_sort = temp(config["cappseq_snv_pipeline"]["base_dir"] + "/06-supportingreads/{sample}/{sample}.umigrouped.sort.bam"),
tmp_index = temp(config["cappseq_snv_pipeline"]["base_dir"] + "/06-supportingreads/{sample}/{sample}.umigrouped.sort.bam.bai"),
consensus_bam = config["cappseq_snv_pipeline"]["base_dir"] + "/06-supportingreads/{sample}/{sample}.consensus.bam",
grouped_bam = config["cappseq_snv_pipeline"]["base_dir"] + "/06-supportingreads/{sample}/{sample}.grouped.bam"
threads: 2
params:
fgbio = config["cappseq_snv_pipeline"]["fgbio_jar"],
ref_genome = config["cappseq_snv_pipeline"]["ref_genome"],
outdir = config["cappseq_snv_pipeline"]["base_dir"] + "/06-supportingreads/{sample}/"
conda:
"envs/fgbio.yaml"
log:
config["cappseq_snv_pipeline"]["base_dir"] + "/logs/{sample}.reviewconsensusvariant.log"
shell:
"""
samtools sort -@ 2 {input.bam_uncons} > {output.tmp_sort} && samtools index -@ 2 {output.tmp_sort} &&
        java -jar {params.fgbio} ReviewConsensusVariants --input {input.vcf} --consensus {input.bam_cons} --grouped-bam {output.tmp_sort} --ref {params.ref_genome} --output {params.outdir}/{wildcards.sample} --sample {wildcards.sample} > {log} 2>&1
"""
rule vcf2maf_annotate:
input:
vcf = rules.restrict_to_capture.output.vcf
output:
vep_vcf = temp(config["cappseq_snv_pipeline"]["base_dir"] + "/04-capturespace/{sample}.capspace.vep.vcf"),
maf = config["cappseq_snv_pipeline"]["base_dir"] + "/05-MAFs/{sample}.sage.maf"
params:
custom_enst = config["cappseq_snv_pipeline"]["custom_enst"],
vep_data = config["cappseq_snv_pipeline"]["vep_data"],
centre = config["cappseq_snv_pipeline"]["centre"],
normal_name = lambda w: normal_ids[w.sample],
ref_fasta = config["cappseq_snv_pipeline"]["ref_genome"],
ref_ver = config["cappseq_snv_pipeline"]["ref_genome_ver"]
threads: 2
conda:
"envs/vcf2maf.yaml"
log:
config["cappseq_snv_pipeline"]["base_dir"] + "/logs/{sample}.vcf2maf.log"
shell:
"""
vcf2maf.pl --input-vcf {input.vcf} --output-maf {output.maf} \
--tumor-id {wildcards.sample} --normal-id {params.normal_name} \
--vep-path $CONDA_PREFIX/bin/ \
--vep-data {params.vep_data} --vep-forks {threads} \
--custom-enst {params.custom_enst} --ref-fasta {params.ref_fasta} \
--species homo_sapiens --ncbi-build {params.ref_ver} \
        --maf-center {params.centre} > {log} 2>&1
"""
rule augment_ssm:
input:
maf = rules.vcf2maf_annotate.output.maf,
bam = lambda w: samplepaths[w.sample]
output:
maf = config["cappseq_snv_pipeline"]["base_dir"] + "/08-augmentssm/{sample}.sage.augment.maf"
threads: 2
params:
script = "src/augment_ssm.py"
shell: """
python {params.script} \
--bam {input.bam} \
--maf {input.maf} \
--output {output.maf} \
--threads {threads}
"""
rule filter_alt_supp:
input:
maf = rules.augment_ssm.output.maf
output:
maf = config["cappseq_snv_pipeline"]["base_dir"] + "/09-altsupp/{sample}.sage.filtered.maf"
params:
min_alt_depth = config["cappseq_snv_pipeline"]["min_alt_depth"],
min_germline_depth = config["cappseq_snv_pipeline"]["min_germline_depth"],
tumor_af_diff = config["cappseq_snv_pipeline"]["tumor_af_diff"]
shell: """
        awk -F '\t' '/^#/ || $1 == "Hugo_Symbol" || ($42>={params.min_alt_depth} && $43>{params.min_germline_depth})' {input.maf} | \
        awk -F '\t' '$1 != "DNMT3A" || $42/$40 >= {params.tumor_af_diff} * $45/$43' | \
        grep -v ^HLA | grep -v ^FCGR > {output.maf}
"""
rule filter_repetitive_seq:
"""
Remove mutations which are adjacent to a repetitive sequence.
"""
input:
maf = rules.filter_alt_supp.output.maf
output:
maf = config["cappseq_snv_pipeline"]["base_dir"] + "/10-filter_repeat/{sample}.sage.repeat_filt.maf"
params:
max_repeat_len = 6,
ref_fasta = config["cappseq_snv_pipeline"]["ref_genome"]
run:
refseq = pyfaidx.Fasta(params.ref_fasta)
with open(input.maf) as f, open(output.maf, "w") as o:
for line in f:
if line.startswith("#") or line.startswith("Hugo_Symbol"):
                    # Pass header lines through unchanged.
o.write(line)
continue
cols = line.split("\t")
                # Get the alternate allele (the reference allele for deletions).
                alt = cols[12].upper()
                if alt == "-":
                    alt = cols[10].upper()
                if len(alt) > 1:  # For long indels, only check the first base
                    alt = alt[0]
                # Position of the last variant base (cols[6] is End_Position; the minus 1
                # converts from 1-based MAF coordinates to pyfaidx's 0-based indexing).
                pos = int(cols[6]) - 1
chrom = cols[4]
# Determine length of repeat via reference sequence.
repeat_len = 0
while True:
pos += 1
                    base = str(refseq[chrom][pos]).upper()  # Upper-case to handle soft-masked reference bases
                    if base != alt or repeat_len > params.max_repeat_len:
break
repeat_len += 1
if repeat_len < params.max_repeat_len:
# Not a repeat (or not sufficiently long)
o.write(line)
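# Worked example (hypothetical): for a C>A call where the reference reads AAAAAA
# immediately downstream, the scan counts six matching bases, repeat_len reaches
# max_repeat_len (6), and the variant is dropped; with only five downstream A's
# the variant would be kept.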
# Filter output using gnomAD population allele frequencies and a position blacklist
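# Example blacklist line (tab-separated; values hypothetical):
# chr1:123456    recurrent_artifact_seen_in_pool_of_normals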
rule filter_maf:
input:
maf = rules.filter_repetitive_seq.output.maf
output:
maf = config["cappseq_snv_pipeline"]["base_dir"] + "/99-final/{sample}.sage.blacklist.maf"
params:
exac_freq = float(config["cappseq_snv_pipeline"]["exac_max_freq"]),
blacklist = config["cappseq_snv_pipeline"]["blacklist"],
filter_cols = ["gnomAD_AF", "gnomAD_AFR_AF", "gnomAD_AMR_AF", "gnomAD_ASJ_AF", "gnomAD_EAS_AF", "gnomAD_FIN_AF", "gnomAD_NFE_AF", "gnomAD_OTH_AF", "gnomAD_SAS_AF"]
run:
# Load blacklist
blacklist_pos = []
with open(params.blacklist) as f:
for line in f:
if line.startswith("#"):
continue # Ignore comment lines
# Assuming the first column of this file is chrom:pos
                line = line.rstrip("\n").rstrip("\r")
pos = line.split("\t")[0]
blacklist_pos.append(pos)
blacklist_pos = set(blacklist_pos)
# Load variants
        in_maf = pandas.read_csv(input.maf, sep="\t", comment="#")
        if in_maf.shape[0] == 0:  # Empty input MAF file
            in_maf.to_csv(output.maf, sep="\t", header=True, index=False)
else:
# Filter based on allele frequencies
# This is slow and inefficient, but is probably fine for this implementation
filtered_maf = in_maf
for f_col in params.filter_cols:
                filtered_maf = filtered_maf[(filtered_maf[f_col].isna()) | (filtered_maf[f_col] < params.exac_freq)]
            # Filter out any variants at positions overlapping the blacklist
            keys = filtered_maf["Chromosome"].astype(str) + ":" + filtered_maf["Start_Position"].astype(str)
            blacklist_maf = filtered_maf[~keys.isin(blacklist_pos)]
            blacklist_maf.to_csv(output.maf, sep="\t", header=True, index=False)
# Generate IGV screenshots of these variants
rule igv_screenshot_variants:
input:
tbam = lambda w: samplepaths[w.sample],
nbam = lambda w: normals[w.sample],
maf = rules.filter_maf.output.maf
output:
html = config["cappseq_snv_pipeline"]["base_dir"] + "/07-IGV/{sample}_report.html"
params:
refgenome = config["cappseq_snv_pipeline"]["ref_genome"],
cytoband = config["cappseq_snv_pipeline"]["cytoband"],
genes = config["cappseq_snv_pipeline"]["genetrack"],
sample_name = lambda w: w.sample
conda:
"envs/igv.yaml"
log:
config["cappseq_snv_pipeline"]["base_dir"] + "/logs/{sample}.igv.log"
shell:
"""
        create_report --fasta {params.refgenome} --type mutation --tracks {input.tbam} {input.nbam} {params.genes} --flanking 1500 --output {output.html} --standalone --title {params.sample_name} --ideogram {params.cytoband} {input.maf} > {log} 2>&1
"""
rule all:
input:
expand(rules.filter_maf.output.maf, sample = samples),
expand(rules.igv_screenshot_variants.output.html, sample=samples)
default_target: True