-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmRNA.variant_seq.sh
319 lines (305 loc) · 7.99 KB
/
mRNA.variant_seq.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#!/bin/bash
# @author: Slim Fourati
# load apps
module load STAR/2.5.3a
module load picard/2.11
module load gatk/3.8
module load samtools/1.8
module load java/8u121
# read arguments
genome="Mmul_8"
pairEnd=false
while getopts d:g: option
do
case "$option" in
d) dirData=$OPTARG;;
g) genome=$OPTARG;;
esac
done
# set global variables for the script
bin="/mnt/projects/SOM_PATH_RXS745U/bin"
seqDependencies="/mnt/projects/SOM_PATH_RXS745U/genome/$genome"
genomeFasta="$seqDependencies/Sequence/genome.fa"
maxProc=8
genomeDir="$seqDependencies/ggNoOverhang"
# Path to reference genome and Index files.
flag=true
if $flag
then
currentDate=$(date +"%Y-%m-%d %X")
echo -ne "$currentDate: alignment of reads..."
sampleID=$(find $dirData -name "*_[1-2].fq.gz")
sampleID=$(echo $sampleID | sed -r 's/_[1-2].fq.gz//g')
sampleID=( $(echo $sampleID | tr ' ' '\n' | sort | uniq) )
for sample in ${sampleID[@]}
do
STAR --genomeDir $genomeDir \
--readFilesIn ${sample}_1.fq.gz \
${sample}_2.fq.gz \
--readFilesCommand zcat \
--runThreadN $maxProc \
--outSAMtype BAM Unsorted \
--outSAMstrandField intronMotif \
--twopassMode Basic \
--outFileNamePrefix ${sample}_star \
--outSAMattrRGline ID:RhCMV \
CN:Gale_lab \
LB:PairedEnd \
PL:Illumina \
PU:Unknown \
SM:${sample} &>/dev/null
if [ $? != 0 ]
then
echo -ne "error\n unable to aligned read in directory $sample"
exit 1
fi
# remove file
rm ${sample}_1.fq.gz
rm ${sample}_2.fq.gz
rm ${sample}_starLog.out
rm ${sample}_starLog.progress.out
rm ${sample}_starSJ.out.tab
rm -r ${sample}_star_STARgenome
rm -r ${sample}_star_STARpass1
done
echo "done"
fi
# Order BAM file by position
flag=true
if $flag
then
currentDate=$(date +"%Y-%m-%d %X")
echo -ne "$currentDate: sorting bam file by position..."
for sample in $(find $dirData -name "*_starAligned.out.bam")
do
sampleID=$(echo $sample | sed -r 's/(.?)_starAligned.out.bam$/\1/')
$bin/sambamba-0.6.6/sambamba_v0.6.6 sort \
-o ${sampleID}.sorted.bam \
-p \
-t $maxProc \
$sample &>/dev/null
# remove file
rm $sample
done
echo -ne "done\n"
fi
# Index BAM file
flag=true
if $flag
then
currentDate=$(date +"%Y-%m-%d %X")
echo -ne "$currentDate: indexing bam file..."
for sample in $(find $dirData -name "*.sorted.bam")
do
$bin/sambamba-0.6.6/sambamba_v0.6.6 index \
-p \
-t $maxProc \
$sample &>/dev/null
done
echo -ne "done\n"
fi
# Mark duplicates using Picard
flag=true
if $flag
then
currentDate=$(date +"%Y-%m-%d %X")
echo -ne "$currentDate: marking duplicates..."
for sample in $(find $dirData -name "*.sorted.bam")
do
sampleID=$(echo $sample | sed -r 's/(.?).sorted.bam$/\1/')
java -jar $PICARD MarkDuplicates \
I=$sample \
O=$sampleID.dupMarked.bam \
M=$sampleID.dup.metrics \
CREATE_INDEX=true \
VALIDATION_STRINGENCY=SILENT 2>$sampleID.markDuplicates.log
if [ $? != 0 ]
then
echo -ne "error\n unable to mark duplicates"
exit 1
fi
# remove files
rm $sample
rm $sample.bai
done
echo -ne "done\n"
fi
# SplitNCigarReads
flag=true
if $flag
then
currentDate=$(date +"%Y-%m-%d %X")
echo -ne "$currentDate: splitting reads..."
for sample in $(find $dirData -name "*.dupMarked.bam")
do
sampleID=$(echo $sample | sed -r 's/(.?).dupMarked.bam$/\1/')
java -d64 -jar $GATK \
-T SplitNCigarReads \
-R $genomeFasta \
-I $sample \
-o $sampleID.split.bam \
-rf ReassignOneMappingQuality \
-RMQF 255 \
-RMQT 60 \
-U ALLOW_N_CIGAR_READS 2>$sampleID.splitNCigarReads.log
# remove files
rm $sample
rm $sampleID.dupMarked.bai
rm $sampleID.dup.metrics
done
echo -ne "done\n"
fi
# Create targets for indel realignment
flag=true
if $flag
then
currentDate=$(date +"%Y-%m-%d %X")
echo -ne "$currentDate: creating targets for realignment..."
for sample in $(find $dirData -name "*.split.bam")
do
sampleID=$(echo $sample | sed -r 's/(.?).split.bam$/\1/')
java -d64 -jar $GATK \
-T RealignerTargetCreator \
-R $genomeFasta \
-I $sample \
-o $sampleID.intervals \
-nt $maxProc 2>$sampleID.indel.log
# -known $seqDependencies/Annotation/Mills_and_1000G_gold_standard.indels.hg38.vcf
# -known $seqDependencies/Annotation/Homo_sapiens_assembly38.known_indels.vcf
done
echo -ne "done\n"
fi
# Perform indel realignment
flag=true
if $flag
then
currentDate=$(date +"%Y-%m-%d %X")
echo -ne "$currentDate: perform indel realignment..."
for sample in $(find $dirData -name "*.split.bam")
do
sampleID=$(echo $sample | sed -r 's/(.?).split.bam$/\1/')
java -d64 -Xmx32G -jar $GATK \
-T IndelRealigner \
-R $genomeFasta \
-targetIntervals $sampleID.intervals \
-I $sample \
-o $sampleID.processed.bam 2>$sampleID.indel2.log
# -known $seqDependencies/Annotation/Mills_and_1000G_gold_standard.indels.hg38.vcf
# -known $seqDependencies/Annotation/Homo_sapiens_assembly38.known_indels.vcf
if [ $? != 0 ]
then
echo -ne "error\n unable to perform indel realignment"
exit 1
fi
# remove files
rm $sample
rm $sampleID.split.bai
rm $sampleID.intervals
done
echo -ne "done\n"
fi
# Generate recalibration table
flag=true
if $flag
then
currentDate=$(date +"%Y-%m-%d %X")
echo -ne "$currentDate: generate recalibration table..."
for sample in $(find $dirData -name "*.processed.bam")
do
sampleID=$(echo $sample | sed -r 's/(.?).processed.bam$/\1/')
java -d64 -jar $GATK \
-T BaseRecalibrator \
-I $sample \
-R $genomeFasta \
-knownSites $seqDependencies/Annotation/variant.vcf \
-o $sampleID.recal.table 2>$sampleID.genBaseRecalib.log
done
echo -ne "done\n"
fi
# base recalibration
flag=true
if $flag
then
currentDate=$(date +"%Y-%m-%d %X")
echo -ne "$currentDate: printing recalibrated reads..."
for sample in $(find $dirData -name "*.processed.bam")
do
sampleID=$(echo $sample | sed -r 's/(.?).processed.bam$/\1/')
java -d64 -Xmx32G -jar $GATK \
-T PrintReads \
-R $genomeFasta \
-I $sample \
-nct $maxProc \
-BQSR $sampleID.recal.table \
-o $sampleID.recal.bam 2>$sampleID.baseRecalib.log
if [ $? != 0 ]
then
echo -ne "error\n unable to print recalibrated reads"
exit 1
fi
# remove files
rm $sample
rm $sampleID.processed.bai
done
echo -ne "done\n"
fi
# Variant calling
flag=true
if $flag
then
currentDate=$(date +"%Y-%m-%d %X")
echo -ne "$currentDate: calling variants..."
for sample in $(find $dirData -name "*.recal.bam")
do
sampleID=$(echo $sample | sed -r 's/(.?).recal.bam$/\1/')
java -d64 -Xmx32G \
-jar $GATK \
-T HaplotypeCaller \
-R $genomeFasta \
-I $sample \
-dontUseSoftClippedBases \
-stand_call_conf 20 \
-ERC GVCF \
-variant_index_type LINEAR \
-variant_index_parameter 128000 \
-nct $maxProc \
-o $sampleID.vcf 2>$sampleID.haploCaller.log
if [ $? != 0 ]
then
echo -ne "error\n unable to call variants"
exit 1
fi
# -stand_emit_conf 20 is obsolete
# remove files
rm $sample
rm $sampleID.recal.bai
rm $sampleID.recal.table
done
echo "done"
fi
# filtering variant calling
flag=true
if $flag
then
currentDate=$(date +"%Y-%m-%d %X")
echo -ne "$currentDate: filtering variants..."
for sample in $(find $dirData -name "*.vcf")
do
sampleID=$(echo $sample | sed -r 's/(.?).vcf$/\1/')
java -d64 -jar $GATK \
-T VariantFiltration \
-R $genomeFasta \
-V $sample \
-window 35 \
-cluster 3 \
--filterName FS \
--filterExpression "FS > 30.0" \
--filterName QD \
--filterExpression "QD < 2.0" \
-o $sampleID.filtered.vcf 2>$sampleID.variantFilter.log
# remove files
rm $sample
rm $sampleID.vcf.idx
done
echo "done"
fi