-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathalign_reads.sh
executable file
·188 lines (153 loc) · 7.99 KB
/
align_reads.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/bin/bash
#align_reads.sh
#script to align paired end reads to a reference using bwa, prepare alignments for SNP calling, and qc alignment.
#Megan Supple
#15 April 2013
#usage align_reads.sh <align.input>
#<align.input> is a text file with entries as in example input file (see example.align.input or example at end of this script)
#requires BWA, samtools, picard, GATK, mutt
#produces output in current directory:
#align.out -- output from stdout and stderr
#gatk_qc -- directory containing alignment assessment for each sample
#for each sample
#<id>.mapped.sort.bam -- bwa mapped reads
#<id>.mapped.sort.bam.bai
#<id>.markdup.intervals
#<id>.markdup.realign.fixed.reorder.bai
#<id>.markdup.realign.fixed.reorder.bam -- ***alignments prepped for SNP calling***
#<id>.markdups.metrics
#preparing files for SNP calling is based on:
#Best Practice Variant Detection with the GATK v3
#http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3
#Better: sample-level realignment: dedup and realign
#no dbSNP, so cannot recalibrate
#each sample consists of a single library. If not, additional steps need to be added
###Let's begin###
#read in input file
source $1
#set output file for stdout
exec &>align.log
#create reference variable from ref_path and ref_file
reference=$ref_path/$ref_file
#print some header information
echo "Reference Alignment Pipeline"
echo -e "running $0 from working directory `pwd`\n"
echo "using input parameters in $1:"
cat $1
echo -e "\nBeginning Pipeline..."
echo -e "aligning reads from ${#id[@]} samples to the reference genome: $reference \n"
#need a BWA index for the reference
#check to see if one already exists (test if one of the files is there, and if so, assume the other are as well)
if (test -e $reference.bwt)
then echo "skipping indexing...index already exists"
else echo "creating index"
bwa index $reference
fi
#align reads and prep for SNP calling
for ((a=0; a<${#id[@]}; a++))
do
#align reads to reference with bwa
echo -e "\n\naligning sample ${id[a]} with bwa"
echo "bwa aln $q -t $t -n $n -o $o -e $e -l $l -k $k $reference $files/${file1[a]} > ${id[a]}_1.sai"
bwa aln $q -t $t -n $n -o $o -e $e -l $l -k $k $reference $files/${file1[a]} > ${id[a]}_1.sai
echo "bwa aln $q -t $t -n $n -o $o -e $e -l $l -k $k $reference $files/${file2[a]} > ${id[a]}_2.sai"
bwa aln $q -t $t -n $n -o $o -e $e -l $l -k $k $reference $files/${file2[a]} > ${id[a]}_2.sai
echo "bwa sampe -f ${id[a]}.sam -r "@RG\tID:${id[a]}\tSM:${sample[a]}\tPL:$platform" $reference ${id[a]}_1.sai ${id[a]}_2.sai $files/${file1[a]} $files/${file2[a]}"
bwa sampe -f ${id[a]}.sam -r "@RG\tID:${id[a]}\tSM:${sample[a]}\tPL:$platform" $reference ${id[a]}_1.sai ${id[a]}_2.sai $files/${file1[a]} $files/${file2[a]}
#create mapped only, index, sorted bam
echo -e "\nfiltering alignments"
echo "samtools view -Sb -F 4 -o ${id[a]}.mapped.bam ${id[a]}.sam"
samtools view -Sb -F 4 -o ${id[a]}.mapped.bam ${id[a]}.sam
echo "samtools sort ${id[a]}.mapped.bam ${id[a]}.mapped.sort"
samtools sort ${id[a]}.mapped.bam ${id[a]}.mapped.sort
echo "samtools index ${id[a]}.mapped.sort.bam"
samtools index ${id[a]}.mapped.sort.bam
#remove intermediate files
echo "remove intermediate bwa files"
rm ${id[a]}_1.sai ${id[a]}_2.sai ${id[a]}.sam ${id[a]}.mapped.bam
#prep alignments with picard and gatk
echo -e "\nprep alignment files for SNP calling"
#mark duplicates for each library/sample
echo "mark duplicates for sample ${id[a]}"
java -jar $picard/MarkDuplicates.jar INPUT=${id[a]}.mapped.sort.bam OUTPUT=${id[a]}.markdup.bam METRICS_FILE=${id[a]}.markdups.metrics VALIDATION_STRINGENCY=SILENT ASSUME_SORTED=true CREATE_INDEX=true
#realign around indels
echo -e "\nfind realign region"s
java -jar $gatk -T RealignerTargetCreator -R $reference -I ${id[a]}.markdup.bam -o ${id[a]}.markdup.intervals
echo -e "\nrealign"
java -jar $gatk -T IndelRealigner -R $reference -I ${id[a]}.markdup.bam -targetIntervals ${id[a]}.markdup.intervals -o ${id[a]}.markdup.realign.bam
#fix pair info
echo -e "\nfix pair info"
java -jar $picard/FixMateInformation.jar INPUT=${id[a]}.markdup.realign.bam OUTPUT=${id[a]}.markdup.realign.fixed.bam CREATE_INDEX=true SO=coordinate VALIDATION_STRINGENCY=SILENT
#reorder
echo -e "\nreorder"
java -jar $picard/ReorderSam.jar INPUT=${id[a]}.markdup.realign.fixed.bam OUTPUT=${id[a]}.markdup.realign.fixed.reorder.bam REFERENCE=$reference VALIDATION_STRINGENCY=SILENT
#index
echo -e "\nindex"
java -jar $picard/BuildBamIndex.jar I=${id[a]}.markdup.realign.fixed.reorder.bam VALIDATION_STRINGENCY=SILENT
#remove intermediate files
echo -e "\nremove intermediate picard/gatk files"
rm ${id[a]}.markdup.bam ${id[a]}.markdup.bai ${id[a]}.markdup.realign.bam ${id[a]}.markdup.realign.bai ${id[a]}.markdup.realign.fixed.bam ${id[a]}.markdup.realign.fixed.bai
done
echo | mutt -s "alignments from $1 complete" $email
echo -e "\ndone aligning and preparing for SNP calling"
echo -e "beginning quality assesment of alignments\n"
#qc alignments
#create directory for qc output
mkdir gatk_qc
for ((a=0; a<${#id[@]}; a++))
do
echo -e "\nassessing alignments for sample ${id[a]}"
java -jar $gatk -T FlagStat -R $reference -I ${id[a]}.markdup.realign.fixed.reorder.bam -o gatk_qc/FlagStat_${id[a]}.out
java -jar $gatk -T CountLoci -R $reference -I ${id[a]}.markdup.realign.fixed.reorder.bam -o gatk_qc/CountLoci_${id[a]}.out
java -jar $gatk -T CallableLoci -R $reference -I ${id[a]}.markdup.realign.fixed.reorder.bam -o gatk_qc/Callable_${id[a]}.out -summary gatk_qc/Callable_summary_${id[a]}.out -mmq 10 -mbq 20 -minDepth $mincov -maxDepth $maxcov
java -jar $gatk -T CoarseCoverage -R $reference -I ${id[a]}.markdup.realign.fixed.reorder.bam -o gatk_qc/CoarseCoverage_${id[a]}.out --granularity 1000
done
echo | mutt -s "alignment and qc script from $1 complete" $email
echo -e "\nDONE!!!"
#example input file to align samples to a reference (April 2013)
##input fastq files (make sure left and right match up in the order)
##path to fastq files
#files=/PATH/TO/FASTQ/DIR/
##left reads
#file1=( sampleA.R1.fastq.gz sampleB.R1.fastq.gz sampleC.R1.fastq.gz )
##right reads
#file2=( sampleA.R2.fastq.gz sampleB.R2.fastq.gz sampleC.R2.fastq.gz )
##quality score encoding
##if your reads are not phred+33/sanger/Illumina 1.8, uncomment this line so bwa uses phred+64 and converts to the standard phred+33 for alignment files
##q="-I"
#
##sam read group tags
#id=( sampleA sampleB sampleC )
#sample=( sampleA sampleB sampleC )
#platform=ILLUMINA
#
##location of reference genome
#ref_path=/PATH/TO/REFERENCE/DIR/
#ref_file=reference.fasta
#
##location of program executables (assumes BWA and samtools are already in your PATH)
#picard=/PATH/TO/PICARD/DIR/picard-tools-1.53
#gatk=/PATH/TO/GATK/DIR/GenomeAnalysisTK-1.2-4-gd9ea764/GenomeAnalysisTK.jar
#
##BWA mapping parameters (set for 100 bp reads and sanger quality scores)
##Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]
#n=8
##Maximum number of gap opens [1]
#o=2
##Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1]
#e=3
##Take the first INT subsequence as seed. If INT is larger than the query sequence, seeding will be disabled. For long reads, this option is typically ranged from 25 to 35 for .-k 2.. [inf]
#l=35
##Maximum edit distance in the seed [2]
#k=2
##number threads
#t=16
#
##gatk alignment qc parameters
##minimum and maximum coverage per position to be considered callable
#mincov=10
#maxcov=100
#
##your email address to notify you when the alignments are complete and when the script finishes
#email="[email protected]"
#