forked from Sentieon/sentieon-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRNAseq-calling.sh
137 lines (114 loc) · 6.2 KB
/
RNAseq-calling.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/sh
# Copyright (c) 2016-2020 Sentieon Inc. All rights reserved
# *******************************************
# Script to perform RNA variant calling
# using a single sample with fastq files
# named 1.fastq.gz and 2.fastq.gz
# *******************************************
set -eu
# Update with the fullpath location of your sample fastq
SM="sample" #sample name
RGID="rg_$SM" #read group ID
PL="ILLUMINA" #or other sequencing platform
READ_LENGTH=101 #the read length for the sample
FASTQ_FOLDER="/home/pipeline/samples"
FASTQ_1="$FASTQ_FOLDER/rna1.fastq.gz"
FASTQ_2="$FASTQ_FOLDER/rna2.fastq.gz" #If using Illumina paired data
# Update with the location of the reference data files
FASTA_DIR="/home/regression/references/b37/"
FASTA="$FASTA_DIR/human_g1k_v37_decoy.fasta"
KNOWN_DBSNP="$FASTA_DIR/dbsnp_138.b37.vcf.gz"
KNOWN_INDELS="$FASTA_DIR/1000G_phase1.indels.b37.vcf.gz"
KNOWN_MILLS="$FASTA_DIR/Mills_and_1000G_gold_standard.indels.b37.vcf.gz"
#comment if STAR should generate a new genomeDir
STAR_FASTA="$FASTA_DIR/human_g1k_v37_decoy.fasta.genomeDir"
#uncomment if you would like to use a bed file
#INTERVAL_FILE="$FASTA_DIR/TruSeq_exome_targeted_regions.b37.bed"
# Update with the location of the Sentieon software package and license file
SENTIEON_INSTALL_DIR=/home/release/sentieon-genomics-|release_version|
export SENTIEON_LICENSE=/home/Licenses/Sentieon.lic #or using licsrvr: c1n11.sentieon.com:5443
# Other settings
NT=$(nproc) #number of threads to use in computation, set to number of cores in the server
START_DIR="$PWD/test/RNAseq" #Determine where the output files will be stored
# You do not need to modify any of the lines below unless you want to tweak the pipeline
# ************************************************************************************************************************************************************************
# ******************************************
# 0. Setup
# ******************************************
WORKDIR="$START_DIR/${SM}"
mkdir -p $WORKDIR
LOGFILE=$WORKDIR/run.log
exec >$LOGFILE 2>&1
cd $WORKDIR
DRIVER_INTERVAL_OPTION="${INTERVAL_FILE:+--interval $INTERVAL_FILE}"
# ******************************************
# 1. Mapping reads with STAR
# ******************************************
if [ -z "$STAR_FASTA" ]; then
STAR_FASTA="genomeDir"
# The genomeDir generation could be reused
mkdir $STAR_FASTA
$SENTIEON_INSTALL_DIR/bin/sentieon STAR --runMode genomeGenerate \
--genomeDir $STAR_FASTA --genomeFastaFiles $FASTA --runThreadN $NT || \
{ echo "STAR index failed"; exit 1; }
fi
#perform the actual alignment and sorting
( $SENTIEON_INSTALL_DIR/bin/sentieon STAR --twopassMode Basic --genomeDir $STAR_FASTA \
--runThreadN $NT --outStd BAM_Unsorted --outSAMtype BAM Unsorted \
--outBAMcompression 0 --twopass1readsN -1 --sjdbOverhang `expr "$READ_LENGTH" - 1` \
--readFilesIn $FASTQ_1 $FASTQ_2 --readFilesCommand "zcat" \
--outSAMattrRGline ID:$RGID SM:$SM PL:$PL || { echo -n 'STAR error'; exit 1; } ) | \
$SENTIEON_INSTALL_DIR/bin/sentieon util sort -r $FASTA -o sorted.bam \
-t $NT --bam_compression 1 -i - || { echo "STAR alignment failed"; exit 1; }
# ******************************************
# 2. Metrics
# ******************************************
$SENTIEON_INSTALL_DIR/bin/sentieon driver $DRIVER_INTERVAL_OPTION -r $FASTA -t $NT \
-i sorted.bam --algo MeanQualityByCycle mq_metrics.txt --algo QualDistribution \
qd_metrics.txt --algo GCBias --summary gc_summary.txt gc_metrics.txt \
--algo AlignmentStat --adapter_seq '' aln_metrics.txt --algo InsertSizeMetricAlgo \
is_metrics.txt || { echo "Metrics failed"; exit 1; }
$SENTIEON_INSTALL_DIR/bin/sentieon plot GCBias -o gc-report.pdf gc_metrics.txt
$SENTIEON_INSTALL_DIR/bin/sentieon plot QualDistribution -o qd-report.pdf qd_metrics.txt
$SENTIEON_INSTALL_DIR/bin/sentieon plot MeanQualityByCycle -o mq-report.pdf mq_metrics.txt
$SENTIEON_INSTALL_DIR/bin/sentieon plot InsertSizeMetricAlgo -o is-report.pdf is_metrics.txt
# ******************************************
# 3. Remove Duplicate Reads. It is possible
# to remove instead of mark duplicates
# by adding the --rmdup option in Dedup
# ******************************************
$SENTIEON_INSTALL_DIR/bin/sentieon driver -t $NT -i sorted.bam --algo LocusCollector \
--fun score_info score.txt || { echo "LocusCollector failed"; exit 1; }
$SENTIEON_INSTALL_DIR/bin/sentieon driver -t $NT -i sorted.bam --algo Dedup \
--score_info score.txt --metrics dedup_metrics.txt deduped.bam || \
{ echo "Dedup failed"; exit 1; }
# ******************************************
# 2a. Coverage metrics
# ******************************************
$SENTIEON_INSTALL_DIR/bin/sentieon driver -r $FASTA -t $NT -i deduped.bam \
--algo CoverageMetrics coverage_metrics || { echo "CoverageMetrics failed"; exit 1; }
# ******************************************
# 4. Split reads at Junction
# ******************************************
$SENTIEON_INSTALL_DIR/bin/sentieon driver -r $FASTA -t $NT -i deduped.bam \
--algo RNASplitReadsAtJunction --reassign_mapq 255:60 splitted.bam || \
{ echo "RNASplitReadsAtJunction failed"; exit 1; }
# ******************************************
# 6. Base recalibration
# ******************************************
$SENTIEON_INSTALL_DIR/bin/sentieon driver $DRIVER_INTERVAL_OPTION -r $FASTA -t $NT \
-i splitted.bam --algo QualCal -k $KNOWN_DBSNP -k $KNOWN_MILLS -k $KNOWN_INDELS \
recal_data.table
$SENTIEON_INSTALL_DIR/bin/sentieon driver $DRIVER_INTERVAL_OPTION -r $FASTA -t $NT \
-i splitted.bam -q recal_data.table --algo QualCal -k $KNOWN_DBSNP -k $KNOWN_MILLS \
-k $KNOWN_INDELS recal_data.table.post
$SENTIEON_INSTALL_DIR/bin/sentieon driver -t $NT --algo QualCal --plot \
--before recal_data.table --after recal_data.table.post recal.csv
$SENTIEON_INSTALL_DIR/bin/sentieon plot QualCal -o recal_plots.pdf recal.csv
# ******************************************
# 7. HC Variant caller for RNA
# ******************************************
$SENTIEON_INSTALL_DIR/bin/sentieon driver $DRIVER_INTERVAL_OPTION -r $FASTA -t $NT \
-i splitted.bam -q recal_data.table --algo Haplotyper -d $KNOWN_DBSNP \
--trim_soft_clip --emit_conf=20 --call_conf=20 output-hc-rna.vcf.gz || \
{ echo "Haplotyper failed"; exit 1; }