forked from baduelp/public
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSPLITREADER-beta2.5_part2.sh
405 lines (275 loc) · 24.3 KB
/
SPLITREADER-beta2.5_part2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
#!/bin/bash
#############################################
# #
# SPLITREADER #
# beta2.5 - part 2 #
# Baduel et al. 2020 #
# #
#############################################
##Questions or comments to quadrana(ar)bio.ens.psl.eu
in=$1 # bam file name
cohort=$2 # name of cohort of genomes analyzed
workdir=$3 # working directory
GenomeFile=$4 # directory with genome annotation
TEannotfile=$5 # TE annotation gff
#############################################################
#IN THE FOLLOWING VARIABLE YOU CAN PROVIDE THE MINIMUM READ LEGTH. By default this is 100nt
LENGTH=100
#### If not specified, the program will calculate the longest read
#############################################################
#IN THE FOLLOWING VARIABLE YOU CAN PROVIDE THE MINIMUM NUMBER OF SPLIT-READS IN EACH EXTRIMITY OF THE TE. By default is 5 reads
READS=2
maxcov=100 # maximum coverage
LS=500 # library size
pe=TRUE
#### If not specified, the program estimate them. To this end, the program calculates the minimum number of reads as 3 standard deviation under the mean whole genome coverage
####This value should be at least 3, if not, it is forced to be 3
#############################################################
#############################################################
#IN THE FOLLOWING VARIABLE YOU CAN EXPLICITE THE NUMBER OF THREADS YOU WANT TO USE FOR MAPPING. By default is 2 threads
CORES=2
#############################################################
########################### edit the following paths !!! ####################
InputDir=/$workdir/$cohort/${in}/part1 #TE-sam from part1
OutputDir=/$workdir/$cohort/${in}/part2
if [ -e $OutputDir/log.txt ]
then
rm $OutputDir/log.txt
fi
if [ -e $OutputDir/$in-insertion-sites.bed ]
then
rm $OutputDir/$in-insertion-sites.bed
fi
#path to list of TEs to analyze (TE-information-all.txt)
#TE-> should be indicated in the first column of the TE-information.txt file located in listDir
#TSD -> should be indicated in the second column of the TE-information.txt file located in listDir
listDir=/$workdir/TE_sequence
TEannot=/$workdir/TE_sequence/$TEannotfile.gff
#Bowtie2 index for reference genome
GenomeIndexFile=/$workdir/$GenomeFile
#Bowtie2 executable path
Bowtie2Dir=/usr/local/bin
# #samtools executable path
samtoolsDir=/usr/local/bin
#bedtools executable path
bedtoolsdir=/usr/local/bin
#picard tools executable path
picardDir=/usr/share/java/picard.jar
# PID of this batch
IDPID=$$
# Temporary directory
TmpDir=/localtmp/PB-$IDPID
mkdir -p $TmpDir
##
if [ ! -d "$OutputDir" ]; then
mkdir -p $OutputDir
fi
############################################################################
function cleanup {
rm -f $TmpDir/*
rm -r -f $TmpDir
rm -f $TmpResultsDir/*
rm -r -f $TmpResultsDir
}
trap cleanup EXIT
echo "["$(date +"%y-%m-%d %T")"] Running SPLITREADER beta1.5 part 2" | tee -a $TmpDir/log.txt
if [ ! -e $InputDir/$in-TE.bam ]
then
if [ -e $InputDir/$in-TE.sam ]
then
echo "convert sam to bam" | tee -a $TmpDir/log.txt
$samtoolsDir/samtools view -Sb $InputDir/$in-TE.sam > $InputDir/$in-TE.bam
rm $InputDir/$in-TE.sam
fi
fi
#Extracting superfamily reads from remapped TE sam
echo "["$(date +"%y-%m-%d %T")"] Extracting superfamily reads from ${in}" | tee -a $TmpDir/log.txt
end=`wc -l $listDir/TE-information-all.txt | awk '{print $1}'`
###Starting the SPLITREADER pipeline for each TE in the TE-information.txt file
for ((l=1; $l<=$end; l=$l+1)); do
STARTTIME=$(date +%s)
IDPID2=$PPID
TmpResultsDir=$TmpDir/results-$IDPID2
mkdir -p $TmpResultsDir
TE=`sed -n "${l}p" $listDir/TE-information-all.txt | awk '{print $1}'`
TSD=`sed -n "${l}p" $listDir/TE-information-all.txt | awk '{print $2}'`
#TE=ATCOPIA93
#TSD=5
echo -e "\n"
echo "["$(date +"%y-%m-%d %T")"] ##### RUNNING SPLIT-READ ANALYSIS ON $TE ######" | tee -a $TmpDir/log.txt
echo ""
#############
# Selecting split-reads by mapping the unmapped reads over TE extremities
#############################################################
###filter soft-clipped reads with at least 20nt softclipped at 5' or 3' read's end
echo "["$(date +"%y-%m-%d %T")"] Selecting split-reads" | tee -a $TmpDir/log.txt
$samtoolsDir/samtools view -F 4 $InputDir/$in-TE.bam | grep -w $TE | awk '$6~/^[2-8][0-9]S/ || $6~/[2-8][0-9]S$/ || $6~/^1[0-9][0-9]S/ || $6~/1[0-9][0-9]S$/ {print $1"\t"$10"\t"$11}' | sort -k1,1 -k2,2 -u | awk '{print "@"$1"\n"$2"\n+\n"$3}' > $TmpResultsDir/$in-$TE-split.fastq 2>> $TmpDir/log.txt
#############################################################
###filtering pair ends of unmapped reads
echo "["$(date +"%y-%m-%d %T")"] Selecting discordant-reads" | tee -a $TmpDir/log.txt
$samtoolsDir/samtools view -F 8 $InputDir/$in-TE.bam | grep -w $TE | awk '{print $1}' >> $TmpResultsDir/reads.name.disc 2>> $TmpDir/log.txt
Nsplitreads=`wc -l $TmpResultsDir/$in-$TE-split.fastq | awk '{print $1/4}'`
Ndiscreads=`sort -k1,1 $TmpResultsDir/reads.name.disc | uniq | wc -l | awk '{print $1}'`
echo "$Nsplitreads splitreads and $Ndiscreads discordant reads identified on $TE" | tee -a $TmpDir/log.txt
if [ $(($Nsplitreads+$Ndiscreads)) -eq 0 ]
then
echo "No reads identified: end analysis on $TE" | tee -a $TmpDir/log.txt
else
if [ $Ndiscreads -gt 0 ]
then
# grab first in pair
$samtoolsDir/samtools view -f 64 -u $InputDir/$in-TE.bam > $TmpResultsDir/$in-TE-disc-first.bam 2>> $TmpDir/log.txt
# grab second in pair
$samtoolsDir/samtools view -f 128 -u $InputDir/$in-TE.bam > $TmpResultsDir/$in-TE-disc-second.bam 2>> $TmpDir/log.txt
java -Xmx10g -XX:+UseSerialGC -Djava.io.tmpdir=$TmpDir/javatemp -cp $picardDir net.sf.picard.sam.FilterSamReads INPUT=$TmpResultsDir/$in-TE-disc-first.bam FILTER=includeReadList READ_LIST_FILE=$TmpResultsDir/reads.name.disc OUTPUT=$TmpResultsDir/$in-$TE-selected-disc-first.sam TMP_DIR=$TmpDir/javatemp 2>> $TmpDir/log.txt
java -Xmx10g -XX:+UseSerialGC -Djava.io.tmpdir=$TmpDir/javatemp -cp $picardDir net.sf.picard.sam.FilterSamReads INPUT=$TmpResultsDir/$in-TE-disc-second.bam FILTER=includeReadList READ_LIST_FILE=$TmpResultsDir/reads.name.disc OUTPUT=$TmpResultsDir/$in-$TE-selected-disc-second.sam TMP_DIR=$TmpDir/javatemp 2>> $TmpDir/log.txt
cat $TmpResultsDir/$in-$TE-selected-disc-first.sam | awk '$1!~/^@/ {print $1"\t"$10"\t"$11}' | sort -u -k1,1 -k2,2 | awk '{print "@"$1"|1\n"$2"\n+\n"$3}' > $TmpResultsDir/$in-$TE-disc.fastq 2>> $TmpDir/log.txt
cat $TmpResultsDir/$in-$TE-selected-disc-second.sam | awk '$1!~/^@/ {print $1"\t"$10"\t"$11}' | sort -u -k1,1 -k2,2 | awk '{print "@"$1"|2\n"$2"\n+\n"$3}' >> $TmpResultsDir/$in-$TE-disc.fastq 2>> $TmpDir/log.txt
fi
rm -f $TmpResultsDir/reads.name
################################
###Estimating max read size (If necessary)
if [ -z "$LENGTH" ]
then
LENGTH=`head -5000 $TmpResultsDir/$in-$TE-split.fastq | awk 'NR%4 == 2 {print length($0)}' | sort | tail -1 `
length=$((LENGTH-20))
echo "["$(date +"%y-%m-%d %T")"] Maximum Read length: $LENGTH [Estimated] " | tee -a $TmpDir/log.txt
else
length=$((LENGTH-20))
echo "["$(date +"%y-%m-%d %T")"] Maximum Read length: $LENGTH [User defined] " | tee -a $TmpDir/log.txt
fi
###Recursive split-reads mapping
# step 1 for 3' read extremity: begining the loop.
echo "["$(date +"%y-%m-%d %T")"] Analyzing split-reads" | tee -a $TmpDir/log.txt
if [ $Nsplitreads -gt 0 ]
then
$Bowtie2Dir/bowtie2 -x $GenomeIndexFile -U $TmpResultsDir/$in-$TE-split.fastq -S $TmpResultsDir/$in-$TE-local.sam --local --very-sensitive --threads $CORES --quiet
$samtoolsDir/samtools view -H -S $TmpResultsDir/$in-$TE-local.sam > $TmpResultsDir/$in-$TE-split-local-up.sam
cat $TmpResultsDir/$in-$TE-split-local-up.sam > $TmpResultsDir/$in-$TE-split-local-down.sam
##extracting split reads upstream insertion
$samtoolsDir/samtools view -F 4 -q 5 -S $TmpResultsDir/$in-$TE-local.sam | awk '$6~/^[0-9][0-9]S/ || $6~/^1[0-9][0-9]S/ {print $0}' >> $TmpResultsDir/$in-$TE-split-local-down.sam
$samtoolsDir/samtools view -F 4 -q 5 -S $TmpResultsDir/$in-$TE-local.sam | awk '$6~/[0-9][0-9]S$/ || $6~/1[0-9][0-9]S$/ {print $0}' >> $TmpResultsDir/$in-$TE-split-local-up.sam
$samtoolsDir/samtools view -Sbu $TmpResultsDir/$in-$TE-split-local-down.sam | $samtoolsDir/samtools sort - -o $TmpResultsDir/$in-$TE-splitjunction-down.bam
$samtoolsDir/samtools view -Sbu $TmpResultsDir/$in-$TE-split-local-up.sam | $samtoolsDir/samtools sort - -o $TmpResultsDir/$in-$TE-splitjunction-up.bam
rm -f $TmpResultsDir/$in-$TE-split-local-up.sam
rm -f $TmpResultsDir/$in-$TE-split-local-down.sam
############################################
##Refining insertion sites
length=$(($((LENGTH/2))-1))
echo "["$(date +"%y-%m-%d %T")"] Refining insertion sites" | tee -a $TmpDir/log.txt
echo -e "\n"
echo -n "Progression: ["
$Bowtie2Dir/bowtie2 -x $GenomeIndexFile -U $TmpResultsDir/$in-$TE-split.fastq -S $TmpResultsDir/$in-$TE-splitjunction-5-$length.sam --un $TmpResultsDir/$in-$TE-split-5-$length -5 $length --very-sensitive --threads $CORES --local --quiet 2>> $TmpDir/log.txt
$samtoolsDir/samtools view -H $TmpResultsDir/$in-$TE-splitjunction-5-$length.sam > $TmpResultsDir/$in-$TE-splitjunction-5-$length-down.sam 2>> $TmpDir/log.txt
cat $TmpResultsDir/$in-$TE-splitjunction-5-$length-down.sam > $TmpResultsDir/$in-$TE-splitjunction-5-$length-up.sam 2>> $TmpDir/log.txt
cat $TmpResultsDir/$in-$TE-splitjunction-5-$length-down.sam > $TmpResultsDir/$in-$TE-splitjunction-3-$length-down.sam 2>> $TmpDir/log.txt
cat $TmpResultsDir/$in-$TE-splitjunction-5-$length-down.sam > $TmpResultsDir/$in-$TE-splitjunction-3-$length-up.sam 2>> $TmpDir/log.txt
$samtoolsDir/samtools view -F 4 -q 5 -S $TmpResultsDir/$in-$TE-splitjunction-5-$length.sam | awk '$6~/^[0-9][0-9]S/ || $6~/^1[0-9][0-9]S/ {print $0}' >> $TmpResultsDir/$in-$TE-splitjunction-5-$length-down.sam 2>> $TmpDir/log.txt
$samtoolsDir/samtools view -F 4 -q 5 -S $TmpResultsDir/$in-$TE-splitjunction-5-$length.sam | awk '$6~/[0-9][0-9]S$/ || $6~/^1[0-9][0-9]S/ {print $0}' >> $TmpResultsDir/$in-$TE-splitjunction-5-$length-up.sam 2>> $TmpDir/log.txt
$samtoolsDir/samtools view -Sbu $TmpResultsDir/$in-$TE-splitjunction-5-$length-down.sam | $samtoolsDir/samtools sort - -o $TmpResultsDir/$in-$TE-splitjunction-5-$length-down.bam 2>> $TmpDir/log.txt
$samtoolsDir/samtools view -Sbu $TmpResultsDir/$in-$TE-splitjunction-5-$length-up.sam | $samtoolsDir/samtools sort - -o $TmpResultsDir/$in-$TE-splitjunction-5-$length-up.bam 2>> $TmpDir/log.txt
rm -f $TmpResultsDir/$in-$TE-splitjunction-5-$length.sam
rm -f $TmpResultsDir/$in-$TE-splitjunction-5-$length-down.sam
rm -f $TmpResultsDir/$in-$TE-splitjunction-5-$length-up.sam
$Bowtie2Dir/bowtie2 -x $GenomeIndexFile -U $TmpResultsDir/$in-$TE-split.fastq -S $TmpResultsDir/$in-$TE-splitjunction-3-$length.sam --un $TmpResultsDir/$in-$TE-split-3-$length -3 $length --very-sensitive --threads $CORES --local --quiet 2>> $TmpDir/log.txt
$samtoolsDir/samtools view -F 4 -q 5 -S $TmpResultsDir/$in-$TE-splitjunction-3-$length.sam | awk '$6~/^[0-9][0-9]S/ || $6~/^1[0-9][0-9]S/ {print $0}' >> $TmpResultsDir/$in-$TE-splitjunction-3-$length-down.sam 2>> $TmpDir/log.txt
$samtoolsDir/samtools view -F 4 -q 5 -S $TmpResultsDir/$in-$TE-splitjunction-3-$length.sam | awk '$6~/[0-9][0-9]S$/ || $6~/^1[0-9][0-9]S/ {print $0}' >> $TmpResultsDir/$in-$TE-splitjunction-3-$length-up.sam 2>> $TmpDir/log.txt
$samtoolsDir/samtools view -Sbu $TmpResultsDir/$in-$TE-splitjunction-3-$length-down.sam | $samtoolsDir/samtools sort - -o $TmpResultsDir/$in-$TE-splitjunction-3-$length-down.bam 2>> $TmpDir/log.txt
$samtoolsDir/samtools view -Sbu $TmpResultsDir/$in-$TE-splitjunction-3-$length-up.sam | $samtoolsDir/samtools sort - -o $TmpResultsDir/$in-$TE-splitjunction-3-$length-up.bam 2>> $TmpDir/log.txt
rm -f $TmpResultsDir/$in-$TE-splitjunction-3-$length.sam
rm -f $TmpResultsDir/$in-$TE-splitjunction-3-$length-down.sam
rm -f $TmpResultsDir/$in-$TE-splitjunction-3-$length-up.sam
echo -n "]"
echo -e "\n"
fi
###########################
if [ -n "$pe" ]
then
if [ $Ndiscreads -gt 0 ]
then
echo "["$(date +"%y-%m-%d %T")"] Analyzing discordant reads" | tee -a $TmpDir/log.txt
# increase mapping penalities for mismatches and indels to prevent mismapped discordant reads to break-down the clusters
$Bowtie2Dir/bowtie2 -x $GenomeIndexFile -U $TmpResultsDir/$in-$TE-disc.fastq -S $TmpResultsDir/$in-$TE-endtoend.sam --very-sensitive --threads $CORES --quiet --mp 13 --rdg 8,5 --rfg 8,5
$samtoolsDir/samtools view -Sbu -f 16 -q 5 $TmpResultsDir/$in-$TE-endtoend.sam | $samtoolsDir/samtools sort - -o $TmpResultsDir/$in-$TE-disc-down.bam
$samtoolsDir/samtools view -Sbu -F 16 -q 5 $TmpResultsDir/$in-$TE-endtoend.sam | $samtoolsDir/samtools sort - -o $TmpResultsDir/$in-$TE-disc-up.bam
rm -f $TmpResultsDir/$in-$TE-local.sam
rm -f $TmpResultsDir/$in-$TE-endtoend.sam
fi
fi
############################################
# Post-treatment:
echo "Merge the 5' and 3' clusters to create the downstream and upstream cluster" | tee -a $TmpDir/log.txt
$samtoolsDir/samtools merge -f -u $TmpResultsDir/$in-$TE-up.bam $TmpResultsDir/$in-$TE-splitjunction-5-$length-up.bam $TmpResultsDir/$in-$TE-splitjunction-3-$length-up.bam $TmpResultsDir/$in-$TE-splitjunction-up.bam 2>> $TmpDir/log.txt
$samtoolsDir/samtools merge -f -u $TmpResultsDir/$in-$TE-down.bam $TmpResultsDir/$in-$TE-splitjunction-5-$length-down.bam $TmpResultsDir/$in-$TE-splitjunction-3-$length-down.bam $TmpResultsDir/$in-$TE-splitjunction-down.bam 2>> $TmpDir/log.txt
$samtoolsDir/samtools sort $TmpResultsDir/$in-$TE-down.bam -o $TmpResultsDir/$in-$TE-down 2>> $TmpDir/log.txt
$samtoolsDir/samtools sort $TmpResultsDir/$in-$TE-up.bam -o $TmpResultsDir/$in-$TE-up 2>> $TmpDir/log.txt
echo "Calculate the coverage over mapped regions - filter regions according to minimum and maximum read-depth" | tee -a $TmpDir/log.txt
$samtoolsDir/samtools depth $TmpResultsDir/$in-$TE-up.bam | awk -v M=$maxcov '$3<(M) {print $1 "\t" $2 "\t"$2"\t"$3}' | sort -k 1,1 -k2,2n | $bedtoolsdir/mergeBed -i stdin -c 4 -o max | awk -v l=$LENGTH '($3-$2)<=l {print $0}' > $TmpResultsDir/$in-$TE-up-merge.bed 2>> $TmpDir/log.txt
$samtoolsDir/samtools depth $TmpResultsDir/$in-$TE-down.bam | awk -v M=$maxcov ' $3<(M) {print $1 "\t" $2 "\t"$2"\t"$3}' | sort -k 1,1 -k2,2n | $bedtoolsdir/mergeBed -i stdin -c 4 -o max | awk -v l=$LENGTH '($3-$2)<=l {print $0}' > $TmpResultsDir/$in-$TE-down-merge.bed 2>> $TmpDir/log.txt
#grep -w $TE $SequencesDir/TAIR10_TE.bed | awk '{print $1"\t"$2-5000"\t"$3+5000}' > $TmpResultsDir/disc-excluding.tmp
teid=`echo $TE | sed 's/\@/\t/' | awk '{print $1}'`
grep -w $teid $TEannot | awk '{print $1"\t"$4"\t"$5}' > $TmpResultsDir/disc-excluding.tmp
echo "merge cluster of discordant-reads" | tee -a $TmpDir/log.txt
if [ -n "$pe" ]
then
echo "["$(date +"%y-%m-%d %T")"] Searching for discordant-reads clusters..." | tee -a $TmpDir/log.txt
$samtoolsDir/samtools depth $TmpResultsDir/$in-$TE-disc-down.bam | awk -v M=$maxcov '$3<(M) {print $1 "\t" $2 "\t"$2 "\t"$3}' | sort -k1,1 -k2,2n | $bedtoolsdir/mergeBed -i stdin -d $LS -c 4 -o max | sort -k1,1 -k2,2n > $TmpResultsDir/$in-$TE-disc-down.tmp 2>> $TmpDir/log.txt
$samtoolsDir/samtools depth $TmpResultsDir/$in-$TE-disc-up.bam | awk -v M=$maxcov '$3<(M) {print $1 "\t" $2 "\t"$2 "\t"$3}' | sort -k1,1 -k2,2n | $bedtoolsdir/mergeBed -i stdin -d $LS -c 4 -o max | sort -k1,1 -k2,2n > $TmpResultsDir/$in-$TE-disc-up.tmp 2>> $TmpDir/log.txt
#filtering discordat-read clusters that overlap by a legth >than TSD+15bp
$bedtoolsdir/intersectBed -a $TmpResultsDir/$in-$TE-disc-up.tmp -b $TmpResultsDir/$in-$TE-disc-down.tmp -wa -wb | awk -v l=$LENGTH '($3-$6)>l {print $1"\t"$2"\t"$7}' >> $TmpResultsDir/disc-excluding.tmp 2>> $TmpDir/log.txt
sort -o $TmpResultsDir/disc-excluding.tmp -k1,1 -k2,2n $TmpResultsDir/disc-excluding.tmp 2>> $TmpDir/log.txt
# $bedtoolsdir/intersectBed -a $TmpResultsDir/$in-$TE-disc-up.tmp -b $TmpResultsDir/$in-$TE-disc-down.tmp -wa -wb | awk -v l=$LENGTH '($3-$6)>l {print $1"\t"$2"\t"$7}' >> $TmpResultsDir/disc-excluding.tmp 2>> $TmpDir/log.txt
$bedtoolsdir/intersectBed -a $TmpResultsDir/$in-$TE-disc-up.tmp -b $TmpResultsDir/disc-excluding.tmp -sorted -v | awk '{print $1 "\t" $2 "\t"$3}' > $TmpResultsDir/$in-$TE-disc-up.tmp2 2>> $TmpDir/log.txt
$bedtoolsdir/intersectBed -a $TmpResultsDir/$in-$TE-disc-down.tmp -b $TmpResultsDir/disc-excluding.tmp -sorted -v | awk '{print $1 "\t" $2 "\t"$3}' > $TmpResultsDir/$in-$TE-disc-down.tmp2 2>> $TmpDir/log.txt
$bedtoolsdir/intersectBed -b $TmpResultsDir/$in-$TE-disc-up.bam -a $TmpResultsDir/$in-$TE-disc-up.tmp2 -bed -c | sort -k1,1 -k2,2n > $TmpResultsDir/$in-$TE-disc-up.bed 2>> $TmpDir/log.txt
$bedtoolsdir/intersectBed -b $TmpResultsDir/$in-$TE-disc-down.bam -a $TmpResultsDir/$in-$TE-disc-down.tmp2 -bed -c | sort -k1,1 -k2,2n > $TmpResultsDir/$in-$TE-disc-down.bed 2>> $TmpDir/log.txt
rm -f $TmpResultsDir/$in-$TE-disc-down.tmp*
rm -f $TmpResultsDir/$in-$TE-disc-up.tmp*
fi
# echo "Defining TE insertions based on splitreads only: searching for overlapping clusters meeting the expected TSD size and number of supporting reads
echo "["$(date +"%y-%m-%d %T")"] Defining insertions..." | tee -a $TmpDir/log.txt
$bedtoolsdir/intersectBed -a $TmpResultsDir/$in-$TE-up-merge.bed -b $TmpResultsDir/$in-$TE-down-merge.bed -wo | awk -v tsd=$TSD -v te=$TE '($6-$2)>10 && ($7-$3)>10 && $9>=tsd && $9<(tsd+15) {print $1 "\t" $6 "\t" $3 "\t"te"\t" ($9-1)"\t"$4"\t"$8}' | $bedtoolsdir/intersectBed -a stdin -b $TmpResultsDir/disc-excluding.tmp -v >> $TmpResultsDir/$in-insertion-sites.1 2>> $TmpDir/log.txt
if [ -s $TmpResultsDir/$in-insertion-sites.1 ]
then
awk -v l=$LS '{print $1"\t"$2"\t"$3+l"\t"$4}' $TmpResultsDir/$in-$TE-disc-up.bed | $bedtoolsdir/intersectBed -a $TmpResultsDir/$in-insertion-sites.1 -b stdin -wa -wb | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$11}' > $TmpResultsDir/$in-insertion-sites.2 2>> $TmpDir/log.txt
awk -v l=$LS '{print $1"\t"$2"\t"$3+l"\t"$4}' $TmpResultsDir/$in-$TE-disc-up.bed | $bedtoolsdir/intersectBed -a $TmpResultsDir/$in-insertion-sites.1 -b stdin -v | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t""0"}' >> $TmpResultsDir/$in-insertion-sites.2 2>> $TmpDir/log.txt
awk -v l=$LS ' $2-l>0 {print $1"\t"$2-l"\t"$3"\t"$4}' $TmpResultsDir/$in-$TE-disc-down.bed | $bedtoolsdir/intersectBed -a $TmpResultsDir/$in-insertion-sites.2 -b stdin -wa -wb | awk '($6+$7)>=r {print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$12}' >> $OutputDir/$in-insertion-sites.bed 2>> $TmpDir/log.txt
awk -v l=$LS '$2-l>0 {print $1"\t"$2-l"\t"$3"\t"$4}' $TmpResultsDir/$in-$TE-disc-down.bed | $bedtoolsdir/intersectBed -a $TmpResultsDir/$in-insertion-sites.2 -b stdin -v | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t""0"}' >> $OutputDir/$in-insertion-sites.bed 2>> $TmpDir/log.txt
else
echo "No split-reads clusters identified... skeeping this step" | tee -a $TmpDir/log.txt
fi
################################
#Defining TE insertions based in discordant reads and supported for at least one split-read:
if [ -n "$pe" ]
then
echo "selecting cluster of discordant reads that were not called by split-reads" | tee -a $TmpDir/log.txt
awk -v l=$LS '{print $1"\t"$2"\t"$3+l"\t"$4}' $TmpResultsDir/$in-$TE-disc-up.bed | $bedtoolsdir/intersectBed -b $TmpResultsDir/$in-insertion-sites.1 -a stdin -v > $TmpResultsDir/$in-$TE-disc-up-outer.bed 2>> $TmpDir/log.txt
awk -v l=$LS '$2>l {print $1"\t"$2-l"\t"$3"\t"$4}' $TmpResultsDir/$in-$TE-disc-down.bed | $bedtoolsdir/intersectBed -b $TmpResultsDir/$in-insertion-sites.1 -a stdin -v > $TmpResultsDir/$in-$TE-disc-down-outer.bed 2>> $TmpDir/log.txt
#selecting overlapping upstream and downstream clusters of discordant reads
$bedtoolsdir/intersectBed -a $TmpResultsDir/$in-$TE-disc-up-outer.bed -b $TmpResultsDir/$in-$TE-disc-down-outer.bed -wa -wb | awk -v l=$LS '$3>l {print $1"\t"$2"\t"$7"\t"$4"\t"$8"\t"$3-l"\t"$6+l}' > $TmpResultsDir/$in-$TE-disc-cluster.0 2>> $TmpDir/log.txt
awk '$6<=$7 {print $1"\t"$6-4"\t"$7+4"\t"$4"\t"$5"\t"$2"\t"$3}' $TmpResultsDir/$in-$TE-disc-cluster.0 > $TmpResultsDir/$in-$TE-disc-cluster.1 2>> $TmpDir/log.txt
awk '$6>$7 {print $1"\t"$7-4"\t"$6+4"\t"$4"\t"$5"\t"$2"\t"$3}' $TmpResultsDir/$in-$TE-disc-cluster.0 >> $TmpResultsDir/$in-$TE-disc-cluster.1 2>> $TmpDir/log.txt
if [[ -s $TmpResultsDir/$in-$TE-disc-cluster.1 ]] ; then
$bedtoolsdir/intersectBed -a $TmpResultsDir/$in-$TE-disc-cluster.1 -b $TmpResultsDir/$in-$TE-up-merge.bed -wa -wb | awk '{print $1"\t"$2"\t"$10"\t"$4"\t"$5"\t"$11"\t"$6"\t"$7}' | sort -k1,1 -k2,2n | $bedtoolsdir/mergeBed -i stdin -c 4,5,6,7,8 -o max,max,max,max,max > $TmpResultsDir/$in-$TE-disc-cluster.3 2>> $TmpDir/log.txt
$bedtoolsdir/intersectBed -a $TmpResultsDir/$in-$TE-disc-cluster.1 -b $TmpResultsDir/$in-$TE-up-merge.bed -v | awk '{print $1"\t"$2"\t"$3-4"\t"$4"\t"$5"\t""0""\t"$6"\t"$7}' >> $TmpResultsDir/$in-$TE-disc-cluster.3 2>> $TmpDir/log.txt
$bedtoolsdir/intersectBed -a $TmpResultsDir/$in-$TE-disc-cluster.3 -b $TmpResultsDir/$in-$TE-down-merge.bed -wa -wb | awk -v te=$TE '{print $1"\t"$10"\t"$3"\t"te"\t""NA""\t"$6"\t"$12"\t"$4"\t"$5"\t"$7"\t"$8}' | sort -k1,1 -k2,2n | $bedtoolsdir/mergeBed -i stdin -c 4,5,6,7,8,9,10,11 -o distinct,distinct,distinct,max,max,max,min,max >> $OutputDir/$in-insertion-sites.bed 2>> $TmpDir/log.txt
$bedtoolsdir/intersectBed -a $TmpResultsDir/$in-$TE-disc-cluster.3 -b $TmpResultsDir/$in-$TE-down-merge.bed -v | awk -v te=$TE '{print $1"\t"$2+4"\t"$3"\t"te"\t""NA""\t"$6"\t""0""\t"$4"\t"$5"\t"$7"\t"$8}' | sort -k1,1 -k2,2n | $bedtoolsdir/mergeBed -i stdin -c 4,5,6,7,8,9,10,11 -o distinct,distinct,distinct,max,max,max,min,max >> $OutputDir/$in-insertion-sites.bed 2>> $TmpDir/log.txt
fi
fi
INSERTIONS=`grep -w $TE $OutputDir/$in-insertion-sites.bed | wc -l | awk '{print $1}'`
echo "["$(date +"%y-%m-%d %T")"] Split-read analyis done: $INSERTIONS putative insertions identified..." | tee -a $TmpDir/log.txt
###merging bam files and moving them to the output folder
$samtoolsDir/samtools merge -f $TmpResultsDir/$in-$TE-split.bam $TmpResultsDir/$in-$TE-up.bam $TmpResultsDir/$in-$TE-down.bam $TmpResultsDir/$in-$TE-disc-down.bam $TmpResultsDir/$in-$TE-disc-up.bam
$samtoolsDir/samtools sort $TmpResultsDir/$in-$TE-split.bam -o $TmpResultsDir/$in-$TE-split-sorted.bam
$samtoolsDir/samtools view -H $TmpResultsDir/$in-$TE-split-sorted.bam | grep ^@SQ > $TmpResultsDir/$in-$TE-split-sorted.sam
$samtoolsDir/samtools view -H $TmpResultsDir/$in-$TE-split-sorted.bam | grep ^@PG | head -1 >> $TmpResultsDir/$in-$TE-split-sorted.sam
$samtoolsDir/samtools view $TmpResultsDir/$in-$TE-split-sorted.bam >> $TmpResultsDir/$in-$TE-split-sorted.sam
$samtoolsDir/samtools view -hb $TmpResultsDir/$in-$TE-split-sorted.sam > $OutputDir/$in-$TE-split.bam
$samtoolsDir/samtools index $OutputDir/$in-$TE-split.bam
ENDTIME=$(date +%s)
echo "["$(date +"%y-%m-%d %T")"] It takes $((ENDTIME-STARTTIME)) seconds to analyse $TE" | tee -a $TmpDir/log.txt
fi
rm -r -f $TmpResultsDir
done
echo "["$(date +"%y-%m-%d %T")"] SPLITREADER PART2 COMPLETED" | tee -a $TmpDir/log.txt
cp $TmpDir/log.txt $OutputDir/