-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrun_rnacmap.sh
executable file
·466 lines (412 loc) · 22.3 KB
/
run_rnacmap.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
#!/bin/bash
start=`date +%s`
input="$(cd "$(dirname "$1")"; pwd)/$(basename "$1")"
input_dir=$(dirname $input)
seq_id=$(basename $(basename $input) | cut -d. -f1)
program_dir=$(dirname $(readlink -f $0))
path_blastn_database=/home/jaswinder/Documents/project4/database/nt
path_infernal_database=/home/jaswinder/Documents/project4/database/nt
#path_blastn_database=$program_dir/nt_database/nt # set path to the formatted NCBI's database file without extension
#path_infernal_database=$program_dir/nt_database/nt # set path to the NCBI's database database file
mkdir -p $input_dir/${seq_id}_features #&& mkdir -p $input_dir/${seq_id}_outputs
echo ">"$seq_id > $input_dir/${seq_id}_features/$seq_id.fasta
awk -i inplace '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' $input
tail -n1 $input >> $input_dir/${seq_id}_features/$seq_id.fasta
echo "" >> $input_dir/${seq_id}_features/$seq_id.fasta
feature_dir=$input_dir/${seq_id}_features
output_dir=$input_dir/${seq_id}_outputs
#exit 1
if [ ! -f $path_blastn_database ]; then
echo ""
echo "========================================================================================"
echo " Looks like nt database doesn't exists in the path $path_blastn_database. "
echo " If you want to download the database now, please make sure you have enough "
echo " space in mounted directory and internet connection have enough bandwidth as "
echo " file is of size 270 GBs after unzip. It may take forever to download if "
echo " internet is slow! "
echo "========================================================================================"
echo ""
echo -n "Type 'y' for download or any other key to exit: "
read userinput
if [[ $(echo $userinput | tr '[A-Z]' '[a-z]') == 'y' ]]; then
echo ""
echo "=============================================================================================="
echo " Downloading NCBI's database form ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz link. "
echo " May take few hours to download. "
echo "=============================================================================================="
echo ""
wget -c "ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz" -O $program_dir/nt_database/nt.gz
if [[ $? -eq 0 ]]; then
echo ""
echo "======================================================================="
echo " nt database is completed successfully. "
echo "======================================================================="
echo ""
else
echo ""
echo "======================================================================="
echo " Error! Unable to download database sucessfully. "
echo " Check wget command or internet connection. "
echo "======================================================================="
echo ""
exit 1
fi
echo ""
echo "======================================================================"
echo " Unziping the downloaded nt database. "
echo " May take few hours as size of unzipped file is around 270 GBs. "
echo "======================================================================"
echo ""
############ unzip the nt data base file ############
gunzip $program_dir/nt_database/nt.gz
if [[ $? -eq 0 ]]; then
echo ""
echo "======================================================================="
echo " nt database unzip completed successfully. "
echo "======================================================================="
echo ""
else
echo ""
echo "======================================================================="
echo " Error! unable to unzip database sucessfully. "
echo " Please check if gunzip program exists! "
echo "======================================================================="
echo ""
exit 1
fi
else
echo ""
echo "==========================================================="
echo " Exiting the program because nt database is missing! "
echo "==========================================================="
echo ""
exit 1
fi
fi
###### check if aligned homologous sequences file already exists ############
if [ -f $feature_dir/$seq_id.a2m ]; then
echo ""
echo "======================================================================"
echo " MSA file $feature_dir/$seq_id.a2m from Infernal Pipeline already "
echo " exists for query sequence $feature_dir/$seq_id.fasta. "
echo " "
echo " Delete existing $feature_dir/$seq_id.a2m if want to generate new "
echo " alignment file "
echo "======================================================================"
echo ""
else
#### check if formatted nt database exists or not #####
if [[ ! -f "$path_blastn_database.nal" ]]; then
echo ""
echo "====================================================================="
echo " Nucleotide database file $path_database/nt need to formated "
echo " formated to use with 'makeblastdb' program in BLAST-N program. "
echo ""
echo " Formatting may take 2-3 hours as size of file is around 270 GBs. "
echo "====================================================================="
echo ""
makeblastdb -in $path_database/nt -dbtype nucl
if [[ $? -eq 0 ]]; then
echo ""
echo "======================================================="
echo " nt database formatted successfully. "
echo "======================================================="
echo ""
else
echo ""
echo "=================================================================="
echo " Error occured while formatting the nt database. "
echo ""
echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA-2D/issues'"
echo "=================================================================="
echo ""
exit 1
fi
fi
#################### check if blastn alignment file ready exists ######################
if [ -f $feature_dir/$seq_id.bla ]; then
echo ""
echo "======================================================================="
echo " MSA-1 file $feature_dir/$seq_id.bla from Infernal Pipeline already "
echo " exists for query sequence $feature_dir/$seq_id.fasta. "
echo " "
echo " Delete existing $feature_dir/$seq_id.a2m if want to generate new "
echo " alignment file. "
echo "======================================================================="
echo ""
else
echo ""
echo "==========================================================================================================================="
echo " Running BLASTN for first round of homologous sequence search for query sequence $feature_dir/$seq_id.fasta. "
echo " May take 5 mins to few hours depending on sequence length and no. of homologous sequences in database. "
echo "==========================================================================================================================="
echo ""
blastn -db $path_blastn_database -query $feature_dir/$seq_id.fasta -out $feature_dir/$seq_id.bla -evalue 0.001 -num_descriptions 1 -num_threads 8 -line_length 1000 -num_alignments 50000
fi
if [ $? -eq 0 ]; then
echo ""
echo "==========================================================="
echo " First round of MSA-1 search completed successfully. "
echo "==========================================================="
echo ""
else
echo ""
echo "=================================================================="
echo " Error occured while formatting the nt database. "
echo ""
echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA-2D/issues'"
echo "=================================================================="
echo ""
exit 1
fi
######## reformat the output ################
echo ""
echo "========================================================================================"
echo " Converting $feature_dir/$seq_id.bla from BLASTN to $feature_dir/$seq_id.sto. "
echo "========================================================================================"
echo ""
$program_dir/utils/parse_blastn_local.pl $feature_dir/$seq_id.bla $feature_dir/$seq_id.fasta $feature_dir/$seq_id.aln
$program_dir/utils/reformat.pl fas sto $feature_dir/$seq_id.aln $feature_dir/$seq_id.sto
if [ $? -eq 0 ]; then
echo ""
echo "=========================================="
echo " Converison completed successfully. "
echo "=========================================="
echo ""
else
echo ""
echo "============================================================================================="
echo " Error occured while Converting $feature_dir/$seq_id.bla to $feature_dir/$seq_id.sto "
echo " "
echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA-2D/issues' "
echo "============================================================================================="
echo ""
exit 1
fi
######## predict secondary structure from RNAfold ################
echo ""
echo "==============================================================================================================================="
echo " Predicting Consensus Secondary Structure (CSS) of query sequence $feature_dir/$seq_id.fasta using RNAfold predictor. "
echo "==============================================================================================================================="
echo ""
RNAfold $feature_dir/$seq_id.fasta | awk '{print $1}' | tail -n +3 > $feature_dir/$seq_id.db
################ reformat ss with according to gaps in reference sequence of .sto file from blastn ################
for i in `awk '{print $2}' $feature_dir/$seq_id.sto | head -n5 | tail -n1 | grep -b -o - | sed 's/..$//'`; do sed -i "s/./&-/$i" $feature_dir/$seq_id.db; done
######### add reformated ss from last step to .sto file of blastn ##############
head -n -1 $feature_dir/$seq_id.sto > $feature_dir/temp.sto
echo "#=GC SS_cons "`cat $feature_dir/$seq_id.db` > $feature_dir/temp.txt
cat $feature_dir/temp.sto $feature_dir/temp.txt > $feature_dir/$seq_id.sto
echo "//" >> $feature_dir/$seq_id.sto
if [ $? -eq 0 ]; then
echo ""
echo "=================================================================="
echo " Consensus Secondary Structure (CSS) generated successfully. "
echo "=================================================================="
echo ""
else
echo ""
echo "=============================================================================="
echo " Error occured while generating structure from RNAfold. "
echo " "
echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA-2D/issues'"
echo "=============================================================================="
echo ""
exit 1
fi
######## run infernal ################
echo ""
echo "=============================================================================================================="
echo " Building Covariance Model from BLASTN alignment (with SS from SPOT-RNA) from $feature_dir/$seq_id.sto file. "
echo "=============================================================================================================="
echo ""
cmbuild --hand -F $feature_dir/$seq_id.cm $feature_dir/$seq_id.sto
if [ $? -eq 0 ]; then
echo ""
echo "============================================================================"
echo " Covariance Model (CM) built successfully from $feature_dir/$seq_id.sto. "
echo "============================================================================"
echo ""
else
echo ""
echo "==============================================================================================="
echo " Error occured while building Covariance Model (CM) from cmbuild. "
echo " "
echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA-2D/issues'"
echo "==============================================================================================="
echo ""
exit 1
fi
echo ""
echo "===================================================================="
echo " Calibrating the Covariance Model $feature_dir/$seq_id.cm. "
echo "===================================================================="
echo ""
cmcalibrate $feature_dir/$seq_id.cm
if [ $? -eq 0 ]; then
echo ""
echo "==========================================================="
echo " CM calibrated $feature_dir/$seq_id.cm successfully. "
echo "==========================================================="
echo ""
else
echo ""
echo "==============================================================="
echo " Error occured while calibrating $feature_dir/$seq_id.cm. "
echo " "
echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA-2D/issues'"
echo "==============================================================="
echo ""
exit 1
fi
echo ""
echo "======================================================================================================================"
echo " Second round of homologous sequences search using the calibrated covariance model $feature_dir/$seq_id.cm. "
echo " May take 15 mins to few hours for this step. "
echo "======================================================================================================================"
echo ""
cmsearch -o $feature_dir/$seq_id.out -A $feature_dir/$seq_id.msa --cpu 24 --incE 10.0 $feature_dir/$seq_id.cm $path_infernal_database
if [ $? -eq 0 ]; then
echo ""
echo "==========================================================="
echo " Second round of MSA-2 search completed successfully. "
echo "==========================================================="
echo ""
else
echo ""
echo "===================================================================================="
echo " Error occured during the second round search using CM $feature_dir/$seq_id.cm. "
echo " "
echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA-2D/issues'"
echo "===================================================================================="
echo ""
exit 1
fi
######### reformat the alignment without gaps and dashes ###############
echo ""
echo "======================================================================="
echo " Reformatting the output alignment $feature_dir/$seq_id.msa "
echo " for PSSM and DCA features by removing the gaps and dashes. "
echo "======================================================================="
echo ""
##### check if .msa is not empty #########
if [[ -s $feature_dir/$seq_id.msa ]]
then
esl-reformat --replace acgturyswkmbdhvn:................ a2m $feature_dir/$seq_id.msa > $feature_dir/temp.a2m
else
cat $feature_dir/$seq_id.fasta > $feature_dir/temp.a2m
cat $feature_dir/$seq_id.fasta >> $feature_dir/temp.a2m
sed -i '$ s/.$/./' $feature_dir/temp.a2m
fi
if [ $? -eq 0 ]; then
echo ""
echo "==========================================================="
echo " Reformatted the $feature_dir/$seq_id.msa successfully. "
echo "==========================================================="
echo ""
else
echo ""
echo "========================================================================================"
echo " Error occured during the refomatting the alignment file $feature_dir/$seq_id.msa. "
echo " "
echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA-2D/issues'"
echo "========================================================================================"
echo ""
exit 1
fi
######### remove duplicates sequences from the alignment ###############
echo ""
echo "======================================================================="
echo " Removing duplicates from the alignment. "
echo "======================================================================="
echo ""
$program_dir/utils/seqkit rmdup -s $feature_dir/temp.a2m > $feature_dir/$seq_id.a2m
if [ $? -eq 0 ]; then
echo ""
echo "==============================================="
echo " Duplicate sequences removed successfully. "
echo "==============================================="
echo ""
else
echo ""
echo "========================================================================================"
echo " Error occured during the removel of duplicates from MSA-2. "
echo " "
echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA-2D/issues'"
echo "========================================================================================"
echo ""
exit 1
fi
############# multiline fasta to single line fasta file #############
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < $feature_dir/$seq_id.a2m | sed '/^$/d' > $feature_dir/temp.a2m
############# add query sequence at the top of MSA file #############
cat $feature_dir/$seq_id.fasta $feature_dir/temp.a2m > $feature_dir/$seq_id.a2m
fi
############# check if pssm file already exists otherwise generate from alignment file #############
if [ -f $feature_dir/$seq_id.pssm ]; then
echo ""
echo "=============================================================================================================================================="
echo " PSSM feature file $feature_dir/$seq_id.pssm already exists for query sequence $feature_dir/$seq_id.fasta. "
echo "=============================================================================================================================================="
echo ""
else
echo ""
echo "======================================================================================"
echo " Extracting PSSM features from the alignment $feature_dir/$seq_id.a2m. "
echo "======================================================================================"
echo ""
$program_dir/utils/getpssm.pl $feature_dir/$seq_id.fasta $feature_dir/$seq_id.a2m $feature_dir/$seq_id.pssm
if [ $? -eq 0 ]; then
echo ""
echo "==============================================================="
echo " PSSM extracted successfully from $feature_dir/$seq_id.a2m. "
echo "==============================================================="
echo ""
else
echo ""
echo "========================================================================="
echo " Error occured while extracting PSSM from $feature_dir/$seq_id.a2m. "
echo " "
echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA-2D/issues'"
echo "========================================================================="
echo ""
exit 1
fi
fi
############# check if dca file already exists otherwise generate from alignment file #############
if [ -f $feature_dir/$seq_id.dca ]; then
echo ""
echo "==============================================================="
echo " GRELMLIN feature file $feature_dir/$seq_id.dca already "
echo " exists for query sequence $feature_dir/$seq_id.fasta. "
echo " "
echo " Delete the existing file if want to generate new dca file. "
echo "==============================================================="
echo ""
else
echo ""
echo "============================================================================"
echo " Running PLMC for DCA features. "
echo "============================================================================"
echo ""
$program_dir/plmc/bin/plmc -c $feature_dir/$seq_id.dca -a -.ACGUNX -le 20 -lh 0.01 -m 50 $feature_dir/$seq_id.a2m &> $feature_dir/$seq_id.log_plmc
if [ $? -eq 0 ]; then
echo ""
echo "===================================================="
echo " DCA features successfully obtained from PLMC. "
echo "===================================================="
echo ""
else
echo ""
echo "============================================================================="
echo " Error occured while running PLMC. "
echo " "
echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA-2D/issues'"
echo "============================================================================="
echo ""
exit 1
fi
fi
cp $input_dir/${seq_id}_features/$seq_id.fasta $input_dir/${seq_id}_features/$seq_id
end=`date +%s`
runtime=$((end-start))
echo -e "\ncomputation time = "$runtime" seconds"