-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathimargi_wrapper.sh
executable file
·237 lines (196 loc) · 10.4 KB
/
imargi_wrapper.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#!/usr/bin/env bash
set -e
PROGNAME=$0
usage() {
cat << EOF >&2
---------------------------------------------------------------------------
Usage: $PROGNAME [-r <ref_name>] [-N <base_name>] [-g <ref_fasta>]
[-c <chromSize_file>] [-i <bwa_index>] [-R <restrict_sites>]
[-Q <min_mapq>] [-G <max_inter_align_gap>]
[-O <offset_restriction_site>] [-M <max_ligation_size>] [-t <threads>]
[-1 <fastq.gz_R1>] [-2 <fastq.gz_R2>]
[-o <output_dir>]
Dependency: seqtk, samtools, bwa, pairtools, pbgzip
This is an all-in-one wrapper script of iMARGI Pipeline.
-r : Reference assembly name, such as "hg38"
-N : Base name for ouput result. Such as -N HEK_iMARGI, then the output file will be renamed using the base name,
such as final_HEK_iMARGI.paris.gz, HEK_iMARGI.bam.
-g : Reference genome fasta file. It will be used to generate ref files of '-c', '-i' and '-R'. If any of them are
not supplied. It will be generated from the reference fasta file.
-c : Chromosome size file.
-i : bwa index
-R : DNA restriction enzyme digestion fragments bed file.
-Q : Min MAPQ value for parsing as unique mapping. Default 1.
-G : Max inter align gap for pairtools parsing. Default 20. It will allow R1 5' end clipping.
-O : Max offset bases for filtering pairs based on R2 5' end positions to restriction sites. Default 3.
-M : Max size of ligation fragment for sequencing. It's used for filtering unligated DNA sequence.
-t : Max CPU threads for parallelized processing, at least 4. (Default 8)
-1 : R1 fastq.gz file, if there are multiple files, just separated with space or use wildcard,
such as '-1 lane1_R1.fq.gz lane2_R1.fq.gz', or '-1 lane*_R1.fq.gz'.
-2 : R2 fastq.gz file, if there are multiple files, just separated with space or use wildcard,
such as '-2 lane1_R2.fq.gz lane2_R2.fq.gz', or '-2 lane*_R2.fq.gz'.
-o : Output directoy
-h : Show usage help
---------------------------------------------------------------------------
EOF
exit 1
}
parameter_error() {
echo "!!!!!!!!!!!!!!!! Pipeline exit with parameter ERROR at $(date '+%Y-%m-%d %H:%M:%S %Z') !!!!!!!!!!!!!!!!"
usage
}
while getopts :r:N:g:c:i:R:Q:G:O:M:t:1:2:o:h opt; do
case $opt in
r) ref_name=${OPTARG};;
N) base_name=${OPTARG};;
g) ref_fa=${OPTARG};;
c) chromsize=${OPTARG};;
i) bwa_index=${OPTARG};;
R) rsites=${OPTARG};;
Q) mapq=${OPTARG};;
G) gap=${OPTARG};;
O) offset=${OPTARG};;
M) max_ligation_size=${OPTARG};;
t) threads=${OPTARG};;
1) R1=("${OPTARG}")
until [[ $(eval "echo \${$OPTIND}") =~ ^-.* ]] || [ -z $(eval "echo \${$OPTIND}") ]; do
R1+=($(eval "echo \${$OPTIND}"))
OPTIND=$((OPTIND + 1))
done;;
2) R2=("${OPTARG}")
until [[ $(eval "echo \${$OPTIND}") =~ ^-.* ]] || [ -z $(eval "echo \${$OPTIND}") ]; do
R2+=($(eval "echo \${$OPTIND}"))
OPTIND=$((OPTIND + 1))
done;;
o) output_dir=${OPTARG};;
h) usage;;
esac
done
echo "==================== iMARGI Pipeline BEGIN at $(date '+%Y-%m-%d %H:%M:%S %Z') ===================="
# check parameters
[ -z "$ref_name" ] && echo "Error!! Please provide reference genome name with -r" && parameter_error
[ -z "$base_name" ] && echo "Error!! Please provide base name for result files with -N" && parameter_error
[ ! -d "$output_dir" ] && echo "Error!! Output directory not exist: "$output_dir && parameter_error
[ -z "$mapq" ] && echo "Use default min mapq 1 for pairtools parsing." && mapq=1
if ! [[ "$mapq" =~ ^[0-9]+$ ]]; then
echo "Error!! Only integer number is acceptable for -Q" && parameter_error
fi
[ -z "$gap" ] && echo "Use default max inter align gap for pairtools parsing." && gap=20
if ! [[ "$gap" =~ ^[0-9]+$ ]]; then
echo "Error!! Only integer number is acceptable for -G" && parameter_error
fi
[ -z "$offset" ] && echo "Use default offset 3'." && offset=3
if ! [[ "$offset" =~ ^[0-9]+$ ]]; then
echo "Error!! Only integer number is acceptable for -O" && parameter_error
fi
[ -z "$max_ligation_size" ] && echo "Use default max ligation size 1000'." && max_ligation_size=1000
if ! [[ "$max_ligation_size" =~ ^[0-9]+$ ]]; then
echo "Error!! Only integer number is acceptable for -M" && parameter_error
fi
[ -z "$threads" ] && echo "Use default thread number 8'." && threads=8
if ! [[ "$threads" =~ ^[0-9]+$ ]]; then
echo "Error!! Only integer number is acceptable for -t" && parameter_error
fi
# check input parameter of fastq files
[ -z "$R1" ] && echo "Error!! Please provide fastq.gz files of R1 with -1" && parameter_error
[ -z "$R2" ] && echo "Error!! Please provide fastq.gz files of R2 with -2" && parameter_error
R1_str=" ${R1[*]} "
R2_str=" ${R2[*]} "
echo $R1_str
echo $R2_str
for item in ${R1[@]}; do
if [[ $R2_str =~ " $item " ]] ; then
echo "Error!! R1 and R2 fastq.gz files supplied by -1 and -2 are the same one:"
echo "R1: "$R1_str
echo "R2: "$R2_str
parameter_error
fi
done
for i in ${R1[@]}; do
[ ! -f "$i" ] && echo "Error!! Fastq R1 not exist: "$i && parameter_error
done
for i in ${R2[@]}; do
[ ! -f "$R2" ] && echo "Error!! Fastq R2 not exist: "$i && parameter_error
done
# check reference files
if [ ! -z "$ref_fa" ]; then
[ ! -f "$ref_fa" ] && echo "Error!! Reference genome fasta file not exist: "$ref_fa && parameter_error
else
[ ! -f "$chromsize" ] && echo "Error!! Chomosome size file not exist: "$chromsize && parameter_error
[ -z "$bwa_index" ] && echo "Error!! Please provide correct bwa index with -i" && parameter_error
[ ! -f "$rsites" ] && echo "Error!! Resitriction sites file not exist: "$rsites && parameter_error
fi
if [ ! -z "$ref_fa" ]; then
if [ -z "$chromsize" ]; then
echo ">>>>>>>>>>>>>>>> [$(date '+%m-%d %H:%M:%S')] Ref: no chromsize parameter. Start generating chromsize file from "$ref_fa" ..."
[ ! -f "$ref_fa" ] && echo "Error!! Reference genome fasta file not exist: "$ref_fa && parameter_error
ref_dir=$(dirname "$ref_fa")
samtools faidx $ref_fa
awk 'BEGIN{OFS="\t"}{print $1,$2}' "$ref_fa".fai > $ref_dir/chromsize.$ref_name.txt
chromsize=$ref_dir"/chromsize.$ref_name.txt"
else
[ ! -f "$chromsize" ] && echo "Error!! Chomosome size file not exist: "$chromsize && parameter_error
echo ">>>>>>>>>>>>>>>>[$(date '+%m-%d %H:%M:%S')] Ref: chromsize parameter exist. Directly use the chromsize file "$chromsize
fi
if [ -z "$bwa_index" ]; then
date
echo ">>>>>>>>>>>>>>>>[$(date '+%m-%d %H:%M:%S')] Ref: no bwa index parameter. Start generating bwa index from "$ref_fa" ..."
[ ! -f "$ref_fa" ] && echo "Error!! Reference genome fasta file not exist: "$ref_fa && parameter_error
ref_dir=$(dirname "$ref_fa")
mkdir $ref_dir/bwa_index
bwa index -p $ref_dir/bwa_index/bwa_index_$ref_name $ref_fa
bwa_index=$ref_dir"/bwa_index/bwa_index_"$ref_name
else
[ ! -f "${bwa_index}.amb" ] && echo "Error!! BWA index files not correct: "$bwa_index && parameter_error
[ ! -f "${bwa_index}.ann" ] && echo "Error!! BWA index files not correct: "$bwa_index && parameter_error
[ ! -f "${bwa_index}.bwt" ] && echo "Error!! BWA index files not correct: "$bwa_index && parameter_error
[ ! -f "${bwa_index}.pac" ] && echo "Error!! BWA index files not correct: "$bwa_index && parameter_error
[ ! -f "${bwa_index}.sa" ] && echo "Error!! BWA index files not correct: "$bwa_index && parameter_error
echo ">>>>>>>>>>>>>>>>[$(date '+%m-%d %H:%M:%S')] Ref: bwa index parameter exist. Directly use the bwa index "$bwa_index
fi
if [ -z "$rsites" ]; then
echo ">>>>>>>>>>>>>>>>[$(date '+%m-%d %H:%M:%S')] Ref: no restriction fragment parameter. Start generating from "$ref_fa" ..."
[ ! -f "$ref_fa" ] && echo "Error!! Reference genome fasta file not exist: "$ref_fa && parameter_error
ref_dir=$(dirname "$ref_fa")
imargi_rsfrags.sh -r $ref_fa -c $chromsize -e AluI -C 2 -o $ref_dir/AluI_frags.bed.gz
rsites=$ref_dir"/AluI_frags.bed.gz"
else
[ ! -f "$rsites" ] && echo "Error!! Ref: Restriction fragment file not exist: "$rsites && parameter_error
echo ">>>>>>>>>>>>>>>>[$(date '+%m-%d %H:%M:%S')] Ref: Restriction fragment file parameter exist. Directly use "$rsites
fi
fi
[ ! -d "$output_dir/clean_fastq" ] && mkdir $output_dir/clean_fastq
[ ! -d "$output_dir/bwa_output" ] && mkdir $output_dir/bwa_output
[ ! -d "$output_dir/parse_temp" ] && mkdir $output_dir/parse_temp
echo ">>>>>>>>>>>>>>>>[$(date '+%m-%d %H:%M:%S')] Start cleaning fastq ..."
# R1_str=''
# for i in ${R1[@]};do
# R1_str=$R1_str' '$i
# done
# R2_str=''
# for i in ${R2[@]};do
# R2_str=$R2_str' '$i
# done
imargi_clean.sh -1 $R1_str -2 $R2_str -N $base_name -o $output_dir/clean_fastq -t $threads
echo ">>>>>>>>>>>>>>>>[$(date '+%m-%d %H:%M:%S')] Start BWA mem mapping ..."
bwa mem -t $threads -SP5M $bwa_index \
$output_dir/clean_fastq/clean_${base_name}_R1.fastq.gz \
$output_dir/clean_fastq/clean_${base_name}_R2.fastq.gz \
2>$output_dir/bwa_output/bwa_log_$base_name.txt | \
samtools view -@ $threads -Shb - > $output_dir/bwa_output/$base_name.bam
bwa_flag=$( tail -n 3 $output_dir/bwa_output/bwa_log_$base_name.txt |
awk 'BEGIN{flag="ERROR"}{if($0~ /^\[main\] Real time/){flag="OK";}}END{print flag;}' )
if [[ "$bwa_flag" == "OK" ]]; then
tail -n 3 $output_dir/bwa_output/bwa_log_$base_name.txt
echo "BWA mapping finished."
else
echo "BWA mapping ERROR."
tail -n 3 $output_dir/bwa_output/bwa_log_$base_name.txt
echo "Check more error log info in "$output_dir/bwa_output/bwa_log_$base_name.txt
echo "It's usually caused by out of memory or incorrect index files."
echo "!!!!!!!!!!!!!!!! Pipeline exit with BWA Error at $(date '+%Y-%m-%d %H:%M:%S %Z') !!!!!!!!!!!!!!!!" && usage
fi
echo ">>>>>>>>>>>>>>>>[$(date '+%m-%d %H:%M:%S')] Start parsing, deduplicating, and filtering RNA-DNA interactions ..."
imargi_parse.sh -r $ref_name -c $chromsize -R $rsites -b $output_dir/bwa_output/$base_name.bam \
-o $output_dir -Q $mapq -G $gap -O $offset -M $max_ligation_size -d false -D $output_dir/parse_temp -t $threads
echo "==================== iMARGI Pipeline END at at $(date '+%Y-%m-%d %H:%M:%S %Z') ===================="