-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
129 lines (108 loc) · 4.33 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import pandas as pd
shell.executable("bash")
# Settings ------------------------------------------------------------------------------------------------------------------
workdir: config["workdir"]
ssheet = pd.read_csv(config["samples"], index_col=None, sep = "\t", engine="python",
dtype = {'sample': str, 'taxid': int, 'read_count': 'int'})
# Functions -----------------------------------------------------------------------------------------------------------------
def get_samples():
return [str(e) for e in ssheet['sample'].unique()]
def get_R1_names(wildcards):
taxids = ssheet[ssheet["sample"] == wildcards.sample]["taxid"]
return ["fastq/" + wildcards.sample + "_" + str(txd) + "_R1.fq" for txd in taxids]
def get_R2_names(wildcards):
taxids = ssheet[ssheet["sample"] == wildcards.sample]["taxid"]
return ["fastq/" + wildcards.sample + "_" + str(txd) + "_R2.fq" for txd in taxids]
def get_count(wildcards):
sub_df = ssheet[ssheet["sample"] == wildcards.sample]
return int(sub_df[sub_df["taxid"] == int(wildcards.taxid)]["read_count"])
def get_sim_args(wildcards):
if any([config["quality_profile_read1"] == "None", config["quality_profile_read2"] == None]):
return "--seqSys " + config["seq_system"]
else:
return "--qprof1 "+ config["quality_profile_read1"] + " --qprof2 " + config["quality_profile_read2"]
# Input rule ----------------------------------------------------------------------------------------------------------------
rule all:
input:
expand("samples/{sample}_S1_L001_R{N}_001.fastq.gz", sample = get_samples(), N = [1, 2])
# Workflow ------------------------------------------------------------------------------------------------------------------
rule extract_fasta:
output:
"fasta/{taxid}.fa"
params:
blast_db = config["blast_db"],
taxdb = config["taxdb"]
message: "Extracting fasta for taxid {wildcards.taxid}"
conda: "envs/blast.yaml"
shell:
"""
export BLASTDB={params.taxdb}
blastdbcmd -db {params.blast_db} -taxids {wildcards.taxid} -out {output} \
-outfmt '%f'
if [ $(grep -c '^>' {output}) -ne 1 ]; then
echo "Expected one sequence for taxid {wildcards.taxid}, got $(grep -c '^>' {output})"
exit 1
fi
"""
rule generate_reads:
input:
"fasta/{taxid}.fa"
output:
temp(multiext("fastq/{sample}_{taxid}", "_R1.fq", "_R2.fq"))
params:
rcount = get_count,
length = config["read_length"],
mflen = config["mean_fragment_size"],
qshift2 = config["quality_shift_R2"],
sdev = config["fragment_size_sdev"],
simargs = get_sim_args
message: "Generating taxid {wildcards.taxid} reads for samples {wildcards.sample}"
conda: "envs/art.yaml"
shell:
"""
len=$(tail -n+2 {input} | tr -d '[:space:]' | wc -c)
if [ {params.mflen} -gt $len ]; then
mflen=$len
else
mflen={params.mflen}
fi
if [ {params.length} -gt $len ]; then
length=$((mflen-1))
else
length={params.length}
fi
art_illumina --amplicon \
--rcount {params.rcount} \
--in {input} \
--len $length \
--mflen $mflen \
--noALN \
--out fastq/{wildcards.sample}_{wildcards.taxid}_R \
--paired \
--qShift2 {params.qshift2} \
--sdev {params.sdev} \
{params.simargs}
"""
rule mix_reads:
input:
r1 = get_R1_names,
r2 = get_R2_names
output:
r1 = temp("samples/{sample}_S1_L001_R1_001.fastq"),
r2 = temp("samples/{sample}_S1_L001_R2_001.fastq")
message: "Combining reads for sample {wildcards.sample}"
shell:
"""
cat {input.r1} > {output.r1}
cat {input.r2} > {output.r2}
"""
rule compress:
input:
"samples/{sample}_S1_L001_R{N}_001.fastq"
output:
"samples/{sample}_S1_L001_R{N}_001.fastq.gz"
message: "Compressing file {wildcards.sample}_S1_L001_R{wildcards.N}_001.fastq"
shell:
"""
gzip -c {input} > {output}
"""