-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnfrAnaAll
executable file
·408 lines (353 loc) · 19 KB
/
nfrAnaAll
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
#!/bin/bash
#PBS -l nodes=1:ppn=4
OUTDIR="nfr";
RANDOMDIR="nfr_random";
MINCLUSTERHEIGHT=20
MINBLOCKHEIGHT=20
DISTANCE=70
SCALE="0.6"
BLOCKHEIGHT="abs"
MINNFRLENGTH=20
MAXNFRLENGTH=1000
NFR_THRESHOLD="0.05"
GENOME="mm9"
SHUFFLECOUNT="100000"
EXTEND_REP1=0
EXTEND_REP2=0
#### usage ####
usage() {
echo Program: "nfrAnaAll (determine nucleosome free regions using histone marks (two replicates))"
echo Author: BRIC, University of Copenhagen, Denmark
echo Version: 1.0
echo Contact: [email protected]
echo "Usage: nfrAnaAll -i <file> -j <file> -k <file> [OPTIONS]"
echo "Options:"
echo " -i <file> [mapped reads in BAM format (replicate 1)]"
echo " -j <file> [mapped reads in BAM format (replicate 2)]"
echo " -k <file> [optimal histone peaks region (regionPeak file)]"
echo " [NOTE: all the required input files should be provided as ABSOLUTE path]"
echo "[OPTIONS]"
echo " -o <dir> [output directory to store results (default: nfr)"
echo " -r <dir> [output directory to store results for randomly shuffled NFRs (default: nfr_random)"
echo " -f <file> [optimal TF (cebpa) peaks summit (summit file)]"
echo " [used to optimize -c and -e parameters]"
echo " -m <string> [genome (default: mm9)]"
echo " -p [run in parallel]"
echo " -d [shift 3' of reads to accomodate for fragment length]"
echo " -c <int> [minimum number of read in the block group (default: 20)]"
echo " -e <int> [minimum number of read in the block (default: 20)]"
echo " -x <int> [maximum distance between the blocks (default: 70)]"
echo " -s <float> [scale to define blocks (default: 0.6)]"
echo " -g <int> [block height (abs or rel) (default: abs)]"
echo " -n <int> [minimum length of nucleosome free region (default: 20)]"
echo " -v <int> [maximum length of nucleosome free region (default: 1000)]"
echo " -t <float> [FDR at which to consider a NFR as significant (default: 0.05)]"
echo " -u <int> [number of times NFR regions should be shuffled to compute p-values (default: 100000)]"
echo " -h [help]"
echo
exit 0
}
#### parse options ####
while getopts i:j:k:o:r:f:m:pdc:e:x:s:g:n:v:t:u:h ARG; do
case "$ARG" in
i) REP1=$OPTARG;;
j) REP2=$OPTARG;;
k) PEAKREGION=$OPTARG;;
o) OUTDIR=$OPTARG;;
r) RANDOMDIR=$OPTARG;;
f) TFSUMMIT=$OPTARG;;
m) GENOME=$OPTARG;;
p) PARALLEL=1;;
d) EXTEND=1;;
c) MINCLUSTERHEIGHT=$OPTARG;;
e) MINBLOCKHEIGHT=$OPTARG;;
x) DISTANCE=$OPTARG;;
s) SCALE=$OPTARG;;
g) BLOCKHEIGHT=$OPTARG;;
n) MINNFRLENGTH=$OPTARG;;
v) MAXNFRLENGTH=$OPTARG;;
t) NFR_THRESHOLD=$OPTARG;;
u) SHUFFLECOUNT=$OPTARG;;
h) HELP=1;;
esac
done
echo
if [ "$HELP" ]; then
usage
fi
echo
echo -n "Check, if all required parameters and files are provided (`date`).. "
## usage, if necessary file and directories are given/exist
if [ ! -f "$REP1" -o ! -f "$REP2" -o ! -f "$PEAKREGION" ]; then
echo
echo "Error: one or more required paramter values not provided"
echo
usage
fi
echo "done"
###################
#helperfunction
function wait_for_jobs_to_finish {
for job in `jobs -p`
do
echo $job
wait $job
done
echo $1
}
###############
echo -n "Create directory structure (`date`).. "
if [ ! -d "$OUTDIR" ]; then
mkdir $OUTDIR
fi
if [ ! -d "$OUTDIR/rep1" ]; then
mkdir $OUTDIR/rep1/
mkdir $OUTDIR/rep2/
mkdir $OUTDIR/logs/
mkdir $OUTDIR/$RANDOMDIR
fi
echo "done"
echo -n "Populating files based on input genome, $GENOME (`date`).. "
if [ "$GENOME" == "mm9" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.genome"
GENOME_MACS2="mm"
REPEAT_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.simpleRepeat.gz"
elif [ "$GENOME" == "hg19" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/human.hg19.genome"
GENOME_MACS2="hs"
REPEAT_FILE="/home/pundhir/project/genome_annotations/human.hg19.simpleRepeat.gz"
else
echo "Presently the program only support analysis for mm9 or hg19"
echo
usage
fi
echo done
echo -n "Initialize input bam files (`date`).. "
if [ ! -s "$OUTDIR/logs/histone_Rep1.bam" ]; then
scp $REP1 $OUTDIR/logs/histone_Rep1.bam
fi
if [ ! -s "$OUTDIR/logs/histone_Rep2.bam" ]; then
scp $REP2 $OUTDIR/logs/histone_Rep2.bam
fi
REP1="$OUTDIR/logs/histone_Rep1.bam"
REP2="$OUTDIR/logs/histone_Rep2.bam"
echo "done"
echo -n "Determine number of bases by which to extend the 3' end of reads (`date`).. "
if [ ! -z "$EXTEND" -a ! -s "$OUTDIR/extendReads" ]; then
macs2 predictd -i $REP1 -g $GENOME_MACS2 --outdir $OUTDIR/logs/ 2>$OUTDIR/logs/predictd.rep1 &
macs2 predictd -i $REP2 -g $GENOME_MACS2 --outdir $OUTDIR/logs/ 2>$OUTDIR/logs/predictd.rep2 &
wait
FRAGMENTLENGTH_REP1=`cat $OUTDIR/logs/predictd.rep1 | grep "predicted fragment length" | perl -ane 'print $F[scalar(@F)-2];'`
EXTEND_REP1=`echo $FRAGMENTLENGTH_REP1 | perl -ane 'printf("%0.0f", $_/2);'`
#EXTEND_REP1=`cat $OUTDIR/logs/predictd.rep1 | grep "tag size is determined" | perl -ane '$tag=$F[scalar(@F)-2]; printf("%d", '$EXTEND_REP1'-$tag);'`
#EXTEND_REP1=`samtools view -s 0.00001 $REP1 | cut -f 6 | perl -ane 'BEGIN { $max=0; } $_=~s/[a-zA-Z]+//g; if($_>$max){$max=$_;} END { printf("%d", '$EXTEND_REP1'-$max); }'`
FRAGMENTLENGTH_REP2=`cat $OUTDIR/logs/predictd.rep2 | grep "predicted fragment length" | perl -ane 'print $F[scalar(@F)-2];'`
EXTEND_REP2=`echo $FRAGMENTLENGTH_REP2 | perl -ane 'printf("%0.0f", $_/2);'`
#EXTEND_REP2=`cat $OUTDIR/logs/predictd.rep2 | grep "tag size is determined" | perl -ane '$tag=$F[scalar(@F)-2]; printf("%d", '$EXTEND_REP2'-$tag);'`
#EXTEND_REP2=`samtools view -s 0.00001 $REP2 | cut -f 6 | perl -ane 'BEGIN { $max=0; } $_=~s/[a-zA-Z]+//g; if($_>$max){$max=$_;} END { printf("%d", '$EXTEND_REP2'-$max); }'`
echo -e "#extend (rep1)\textend (rep2)\tfragmentLength (rep1)\tfragmentLength (rep2)
$EXTEND_REP1\t$EXTEND_REP2\t$FRAGMENTLENGTH_REP1\t$FRAGMENTLENGTH_REP2" > $OUTDIR/extendReads
elif [ ! -z "$EXTEND" ]; then
echo -n "initialize extend parameters (nfr/extendReads exists).. "
EXTEND_REP1=`grep -v "^\#" $OUTDIR/extendReads | perl -ane 'print "$F[0]";'`;
EXTEND_REP2=`grep -v "^\#" $OUTDIR/extendReads | perl -ane 'print "$F[1]";'`;
else
echo -n "extension parameter (-d) not set. doing no extension of reads.. "
fi
echo "done"
## auto-compute the threshold for minimum number of reads in a block group
## initialize -c, -e parameters, if their optimal values are computed
if [ ! -s "$OUTDIR/optimizeThreshold/blockbuster_threshold.txt" ]; then
echo -n "Optimize the threshold for max length and min number of reads in a block group (`date`).. "
OPTION=0
if [ -f "$TFSUMMIT" ]; then
blockbuster_threshold_nfr -i $REP1 -j $REP2 -k $PEAKREGION -l $TFSUMMIT -o $OUTDIR/optimizeThreshold -n $OUTDIR -g $GENOME -p $OPTION -c $EXTEND_REP1 -d $EXTEND_REP2 &>$OUTDIR/blockbuster_threshold_nfr.log
else
blockbuster_threshold_nfr -i $REP1 -j $REP2 -k $PEAKREGION -o $OUTDIR/optimizeThreshold -n $OUTDIR -g $GENOME -p $OPTION -c $EXTEND_REP1 -d $EXTEND_REP2 &>$OUTDIR/blockbuster_threshold_nfr.log
fi
HEIGHT_THRESHOLD=`grep -v "^\#" $OUTDIR/optimizeThreshold/blockbuster_threshold.txt | perl -ane 'print "$F[3]";'`;
MINCLUSTERHEIGHT=$HEIGHT_THRESHOLD
MINBLOCKHEIGHT=$HEIGHT_THRESHOLD
echo "done"
elif [ -s "$OUTDIR/optimizeThreshold/blockbuster_threshold.txt" ]; then
echo -n "initialize -c, -e parameters ($OUTDIR/optimizeThreshold/blockbuster_threshold.txt exists) (`date`).. "
HEIGHT_THRESHOLD=`grep -v "^\#" $OUTDIR/optimizeThreshold/blockbuster_threshold.txt | perl -ane 'print "$F[3]";'`;
MINCLUSTERHEIGHT=$HEIGHT_THRESHOLD
MINBLOCKHEIGHT=$HEIGHT_THRESHOLD
echo "done"
fi
## print chosen parameters to file
DATE=`date`
echo "#timestamp: $DATE
#input BAM file (Rep1): $REP1
#input BAM file (Rep2): $REP2
#input histone peak region file: $PEAKREGION
#output directory: $OUTDIR
#output directory for randomly shuffled NFR: $OUTDIR/$RANDOMDIR
#minimum reads in block group: $MINCLUSTERHEIGHT
#minimum reads in block: $MINBLOCKHEIGHT
#minimum distance between the blocks: $DISTANCE
#scale to define blocks: $SCALE
#block height: $BLOCKHEIGHT
#minimum length of NFR: $MINNFRLENGTH
#maximum length of NFR: $MAXNFRLENGTH
#FDR at which to select significant NFR: $NFR_THRESHOLD
#number of times NFR regions should be shuffled: $SHUFFLECOUNT
#optimal TF peak summit file: $TFSUMMIT
#reference genome: $GENOME
#extend 3' end of reads (Rep1): $EXTEND_REP1
#extend 3' end of reads (Rep2): $EXTEND_REP2" > $OUTDIR/PARAMETERS
<<"COMMENT1"
COMMENT1
## index bam files and estimate size factors
echo -n "Create index of input BAM files (`date`).. "
if [ ! -e "$REP1.bai" ]; then
samtools index $REP1
fi
if [ ! -e "$REP2.bai" ]; then
samtools index $REP2
fi
echo "done"
echo -n "Compute size factor for each replicate (`date`).. "
if [ ! -e "$OUTDIR/sizeFactor" ]; then
estimateSizeFactor.pl -o b -b $REP1,$REP2 -x $PEAKREGION -r $OUTDIR/sizeFactorCount -e $EXTEND_REP1,$EXTEND_REP2 -g $GENOME
estimateSizeFactor.pl -o c -r $OUTDIR/sizeFactorCount > $OUTDIR/sizeFactor
fi
echo "done"
SIZEFACTOR_REP1=`head -n 1 $OUTDIR/sizeFactor | cut -f 2`;
SIZEFACTOR_REP2=`head -n 2 $OUTDIR/sizeFactor | tail -n 1 | cut -f 2`;
## convert input bam file into bed
echo -n "convert input bam file into bed (`date`).. "
ID_REP1=`echo $REP1 | perl -ane '$_=~s/^.*\///g; $_=~s/\.gz$//g; print $_;'`;
ID_REP2=`echo $REP2 | perl -ane '$_=~s/^.*\///g; $_=~s/\.gz$//g; print $_;'`;
if [ ! -s "$OUTDIR/$ID_REP1.bed" ]; then
bedtools bamtobed -i $REP1 | perl -ane 'if($F[0]!~/^chr[0-9a-zA-Z]+$/) { next; } $F[4]=sprintf("%0.2f", 1/'$SIZEFACTOR_REP1'); print "$F[0]\t$F[1]\t$F[2]\tTAG\t$F[4]\t$F[5]\n";' | sortBed -i stdin | uniq | bedtools slop -i stdin -g $GENOME_FILE -s -l 0 -r $EXTEND_REP1 | perl -ane 'print "$F[0]\t$F[1]\t$F[2]\t$F[3]\t$F[4]\t+\n";' | sortBed -i stdin > $OUTDIR/$ID_REP1.bed &
else
echo -n "($OUTDIR/$ID_REP1.bed already exists).. "
fi
REP1_BED="$OUTDIR/$ID_REP1.bed"
if [ ! -s "$OUTDIR/$ID_REP2.bed" ]; then
bedtools bamtobed -i $REP2 | perl -ane 'if($F[0]!~/^chr[0-9a-zA-Z]+$/) { next; } $F[4]=sprintf("%0.2f", 1/'$SIZEFACTOR_REP2'); print "$F[0]\t$F[1]\t$F[2]\tTAG\t$F[4]\t$F[5]\n";' | sortBed -i stdin | uniq | bedtools slop -i stdin -g $GENOME_FILE -s -l 0 -r $EXTEND_REP2 | perl -ane 'print "$F[0]\t$F[1]\t$F[2]\t$F[3]\t$F[4]\t+\n";' | sortBed -i stdin > $OUTDIR/$ID_REP2.bed &
else
echo -n "($OUTDIR/$ID_REP2.bed already exists).. "
fi
REP2_BED="$OUTDIR/$ID_REP2.bed"
wait_for_jobs_to_finish "convert input bam file into bed"
if [ ! -s "$OUTDIR/$ID_REP1.bed" -o ! -s "$OUTDIR/$ID_REP2.bed" ]; then
echo
echo "Error: bed files $OUTDIR/$ID_REP1.bed or $OUTDIR/$ID_REP2.bed not defined correctly"
exit
fi
echo "done"
## input parameters are ready. start NFR analysis
if [ -z "$PARALLEL" ]; then
echo -n "Predict nucleosome free regions (NFR) for replicate 1 (`date`).. "
NFRFILE_REP1=`findNFRAll.pl -s $REP1_BED -b $REP1 -o $OUTDIR/rep1/ -z $SIZEFACTOR_REP1 -c $MINCLUSTERHEIGHT -k $MINBLOCKHEIGHT -x $DISTANCE -l $SCALE -g $BLOCKHEIGHT -n $MINNFRLENGTH -v $MAXNFRLENGTH -e $EXTEND_REP1 -y $GENOME`;
echo "done"
echo -n "Predict nucleosome free regions (NFR) for replicate 2 (`date`).. "
NFRFILE_REP2=`findNFRAll.pl -s $REP2_BED -b $REP2 -o $OUTDIR/rep2/ -z $SIZEFACTOR_REP2 -c $MINCLUSTERHEIGHT -k $MINBLOCKHEIGHT -x $DISTANCE -l $SCALE -g $BLOCKHEIGHT -n $MINNFRLENGTH -v $MAXNFRLENGTH -e $EXTEND_REP2 -y $GENOME`;
echo "done"
echo -n "Determine common NFR between replicate 1 and 2 (`date`).. "
commonNFR.pl -i $NFRFILE_REP1 -j $NFRFILE_REP2 -k $REP1 -l $REP2 -m $SIZEFACTOR_REP1 -n $SIZEFACTOR_REP2 -o $OUTDIR -g $MINNFRLENGTH -c $EXTEND_REP1 -d $EXTEND_REP2 -y $GENOME
echo "done"
else
echo -n "Split summit file(s) into multiple smaller files (`date`).. "
if [ ! -d "$OUTDIR/parallel" ]; then
mkdir $OUTDIR/parallel
mkdir $OUTDIR/parallel/rep1
mkdir $OUTDIR/parallel/rep2
mkdir $OUTDIR/common/
fi
indexBed.sh -i $REP1_BED -o $OUTDIR/parallel/rep1 -x x
indexBed.sh -i $REP2_BED -o $OUTDIR/parallel/rep2 -x x
echo "done"
echo -n "Predict nucleosome free regions (NFR) for replicate 1 and 2 (`date`).. "
i=0;
for file in `ls $OUTDIR/parallel/rep1/x*`; do
FILE_SUFFIX+=( $(echo $file | sed 's/^.*\///g') );
findNFRAll.pl -s $file -b $REP1 -o $OUTDIR/rep1/ -z $SIZEFACTOR_REP1 -f ${FILE_SUFFIX[$i]} -c $MINCLUSTERHEIGHT -k $MINBLOCKHEIGHT -x $DISTANCE -l $SCALE -g $BLOCKHEIGHT -n $MINNFRLENGTH -v $MAXNFRLENGTH -e $EXTEND_REP1 -y $GENOME -p a && \
findNFRAll.pl -s $file -b $REP1 -o $OUTDIR/rep1/ -z $SIZEFACTOR_REP1 -f ${FILE_SUFFIX[$i]} -c $MINCLUSTERHEIGHT -k $MINBLOCKHEIGHT -x $DISTANCE -l $SCALE -g $BLOCKHEIGHT -n $MINNFRLENGTH -v $MAXNFRLENGTH -e $EXTEND_REP1 -y $GENOME -p b && \
findNFRAll.pl -s $file -b $REP1 -o $OUTDIR/rep1/ -z $SIZEFACTOR_REP1 -f ${FILE_SUFFIX[$i]} -c $MINCLUSTERHEIGHT -k $MINBLOCKHEIGHT -x $DISTANCE -l $SCALE -g $BLOCKHEIGHT -n $MINNFRLENGTH -v $MAXNFRLENGTH -e $EXTEND_REP1 -y $GENOME -p c &
i=$((i+1))
done
i=0;
for file in `ls $OUTDIR/parallel/rep2/x*`; do
FILE_SUFFIX+=( $(echo $file | sed 's/^.*\///g') );
findNFRAll.pl -s $file -b $REP2 -o $OUTDIR/rep2/ -z $SIZEFACTOR_REP2 -f ${FILE_SUFFIX[$i]} -c $MINCLUSTERHEIGHT -k $MINBLOCKHEIGHT -x $DISTANCE -l $SCALE -g $BLOCKHEIGHT -n $MINNFRLENGTH -v $MAXNFRLENGTH -e $EXTEND_REP2 -y $GENOME -p a && \
findNFRAll.pl -s $file -b $REP2 -o $OUTDIR/rep2/ -z $SIZEFACTOR_REP2 -f ${FILE_SUFFIX[$i]} -c $MINCLUSTERHEIGHT -k $MINBLOCKHEIGHT -x $DISTANCE -l $SCALE -g $BLOCKHEIGHT -n $MINNFRLENGTH -v $MAXNFRLENGTH -e $EXTEND_REP2 -y $GENOME -p b && \
findNFRAll.pl -s $file -b $REP2 -o $OUTDIR/rep2/ -z $SIZEFACTOR_REP2 -f ${FILE_SUFFIX[$i]} -c $MINCLUSTERHEIGHT -k $MINBLOCKHEIGHT -x $DISTANCE -l $SCALE -g $BLOCKHEIGHT -n $MINNFRLENGTH -v $MAXNFRLENGTH -e $EXTEND_REP2 -y $GENOME -p c &
i=$((i+1))
done
wait_for_jobs_to_finish "Predict nucleosome free regions (NFR) for replicate 1 and 2"
echo "done"
echo -n "Determine common NFR between replicate 1 and 2 (`date`).. "
ID_REP1=`echo $REP1 | perl -ane '$_=~s/^.*\///g; $_=~s/\.gz$//g; print $_;'`;
ID_REP2=`echo $REP2 | perl -ane '$_=~s/^.*\///g; $_=~s/\.gz$//g; print $_;'`;
for (( i=0; i<${#FILE_SUFFIX[@]}; i++ )); do
commonNFR.pl -i $OUTDIR/rep1/$ID_REP1.nfr.uniq${FILE_SUFFIX[$i]} -j $OUTDIR/rep2/$ID_REP2.nfr.uniq${FILE_SUFFIX[$i]} -k $REP1 -l $REP2 -m $SIZEFACTOR_REP1 -n $SIZEFACTOR_REP2 -o $OUTDIR/common/ -g $MINNFRLENGTH -c $EXTEND_REP1 -d $EXTEND_REP2 -y $GENOME -f ${FILE_SUFFIX[$i]} &
done
wait_for_jobs_to_finish "Determine common NFR between replicate 1 and 2"
echo "done"
echo -n "Concatenate all result files into one file (`date`).. "
ID=`echo $REP1 | perl -ane '$_=~s/^.*\///g; $_=~s/Rep.*$//g; $_=~s/\_$//g; print $_;'`;
zless $OUTDIR/common/$ID.All.nfrx* > $OUTDIR/$ID.All.nfr
nfr2ucsc.pl -i $OUTDIR/$ID.All.nfr > $OUTDIR/$ID.All.nfr.ucsc
echo "done"
fi
## nfr analysis for randomly distributed nfr regions (associate p values)
echo -n "check if size factor files already exist (`date`).. "
if [ ! -d "$OUTDIR/$RANDOMDIR" ]; then
mkdir $OUTDIR/$RANDOMDIR
fi
if [ -s "$OUTDIR/sizeFactor" ]; then
scp $OUTDIR/sizeFactor $OUTDIR/$RANDOMDIR/sizeFactor
scp $OUTDIR/sizeFactorCount $OUTDIR/$RANDOMDIR/sizeFactorCount
fi
echo "done"
if [ ! -s "$OUTDIR/$RANDOMDIR/INCLREGION.BED" ]; then
echo -n "create file containing genomic coordinates within which to randomly shuffle the NFRs (`date`).. "
ID_REP1=`echo $REP1 | perl -ane '$_=~s/^.*\///g; $_=~s/\.gz$//g; print $_;'`;
ID_REP2=`echo $REP2 | perl -ane '$_=~s/^.*\///g; $_=~s/\.gz$//g; print $_;'`;
#cat $OUTDIR/rep1/$ID_REP1.tmp* $OUTDIR/rep2/$ID_REP2.tmp* | cut -f 4 | perl -ane '@coor=split(/[\:\-]+/,$_); print "$coor[0]\t$coor[1]\t$coor[2]";' | sortBed -i stdin | mergeBed -i stdin > $OUTDIR/$RANDOMDIR/INCLREGION.BED
cat $OUTDIR/rep1/$ID_REP1.tmp* $OUTDIR/rep2/$ID_REP2.tmp* | cut -f 4 | perl -ane '@coor=split(/[\:\-]+/,$_); print "$coor[0]\t$coor[1]\t$coor[2]";' | sortBed -i stdin | mergeBed -i stdin -d $MAXNFRLENGTH > $OUTDIR/$RANDOMDIR/INCLREGION.BED
#cat $OUTDIR/rep1/$ID_REP1.nfrx* $OUTDIR/rep2/$ID_REP2.nfrx* | cut -f 4 | perl -ane '@coor=split(/[\:\-]+/,$_); print "$coor[0]\t$coor[1]\t$coor[2]";' | sortBed -i stdin | mergeBed -i stdin > $OUTDIR/$RANDOMDIR/INCLREGION.BED
echo "track name=\"Defined blocks ($ID_REP1.blocks)\" description=\"Defined blocks ($ID_REP1.blocks)\" itemRgb=\"On\"" > $OUTDIR/$ID_REP1.blocks.ucsc
cat $OUTDIR/rep1/$ID_REP1.tmp* >> $OUTDIR/$ID_REP1.blocks.ucsc
echo "track name=\"Defined blocks ($ID_REP2.blocks)\" description=\"Defined blocks ($ID_REP2.blocks)\" itemRgb=\"On\"" > $OUTDIR/$ID_REP2.blocks.ucsc
cat $OUTDIR/rep2/$ID_REP2.tmp* >> $OUTDIR/$ID_REP2.blocks.ucsc
echo "done"
else
echo "randomly shuffled file ($OUTDIR/$RANDOMDIR/INCLREGION.BED) already exist (`date`).. done"
fi
echo -n "nfr analysis for randomly distributed nfr regions (`date`).. "
ID=`echo $REP1 | perl -ane '$_=~s/^.*\///g; $_=~s/Rep.*$//g; $_=~s/\_$//g; print $_;'`;
if [ ! -s "$OUTDIR/$RANDOMDIR/RANDOM_NFRREGION.BED" -o ! -s "$OUTDIR/$RANDOMDIR/RANDOM_NFRREGION.BED.SCORE" ]; then
randomNfrAna -i $REP1 -j $REP2 -k $PEAKREGION -l $OUTDIR/$ID.All.nfr -m $GENOME -o $OUTDIR/$RANDOMDIR -p -f $OUTDIR/$RANDOMDIR/INCLREGION.BED -n $SHUFFLECOUNT -c $EXTEND_REP1 -d $EXTEND_REP2 &>$OUTDIR/nfrAna_random.log
else
echo "($OUTDIR/$RANDOMDIR/RANDOM_NFRREGION.BED and $OUTDIR/$RANDOMDIR/RANDOM_NFRREGION.BED.SCORE already exists).. "
fi
if [ ! -s "$OUTDIR/$RANDOMDIR/RANDOM_NFRREGION.BED" -o ! -s "$OUTDIR/$RANDOMDIR/RANDOM_NFRREGION.BED.SCORE" ]; then
echo -n "(failed using 100,000 regions, try using 10000 regions).. "
ID=`echo $REP1 | perl -ane '$_=~s/^.*\///g; $_=~s/Rep.*$//g; $_=~s/\_$//g; print $_;'`;
SHUFFLECOUNT=10000
randomNfrAna -i $REP1 -j $REP2 -k $PEAKREGION -l $OUTDIR/$ID.All.nfr -m $GENOME -o $OUTDIR/$RANDOMDIR -p -f $OUTDIR/$RANDOMDIR/INCLREGION.BED -n $SHUFFLECOUNT -c $EXTEND_REP1 -d $EXTEND_REP2 &>$OUTDIR/nfrAna_random.log
fi
Rscript /home/pundhir/software/myScripts/PredictNFR_v0.01/randomNfrAna.R $OUTDIR/$ID.All.nfr $OUTDIR/$RANDOMDIR/RANDOM_NFRREGION.BED.SCORE $OUTDIR/$ID.All.nfrP
perl -ane 'if($F[15]<'$NFR_THRESHOLD') { print $_; }' $OUTDIR/$ID.All.nfrP > $OUTDIR/$ID.All.nfr.sig
nfr2ucsc.pl -i $OUTDIR/$ID.All.nfr.sig > $OUTDIR/$ID.All.nfr.sig.ucsc
echo "done"
echo -n "convert input bam to bigWig format to visualize in UCSC browser (`date`).. "
ID_REP1=`echo $REP1 | perl -ane '$_=~s/\.bam.*//g; print $_;'`;
ID_REP2=`echo $REP2 | perl -ane '$_=~s/\.bam.*//g; print $_;'`;
if [ ! -s "$ID_REP1.bw" ]; then
bam2bwForChIP -i $REP1 -c $GENOME_FILE -e $EXTEND_REP1 &
else
echo -n "($ID_REP1.bw already exists..) "
fi
if [ ! -s "$ID_REP2.bw" ]; then
bam2bwForChIP -i $REP2 -c $GENOME_FILE -e $EXTEND_REP2 &
else
echo -n "($ID_REP2.bw already exists..) "
fi
wait
echo "All done. Bye"