-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy path5.phase_loops.pbs
executable file
·241 lines (181 loc) · 6.17 KB
/
5.phase_loops.pbs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/bin/bash
# Set parameters for the cluster
#PBS -l nodes=1:ppn=20
#PBS -l walltime=20:00:00
#PBS -l mem=60GB
#PBS -l vmem=60GB
#PBS -j oe
# ChIA-PIPE
# Step 5: Phase the peaks and loops
# 2018
# The Jackson Laboratory for Genomic Medicine
# The help message:
function usage
{
echo -e "usage: qsub -F \"--conf ${conf} --out_dir ${out_dir}\"
5.phase_loops.pbs
"
}
# Parse arguments from the command line
while [ "$1" != "" ]; do
case $1 in
-c | --conf ) shift
conf=$1
;;
-o | --out_dir ) shift
out_dir=$1
;;
-h | --help ) usage
exit
;;
* ) usage
exit 1
esac
shift
done
# Put output files in subdirectory named after the run
cd ${PBS_O_WORKDIR}
source ${conf}
cd ${out_dir}
# Add dependency dir to path
export PATH=${dep_dir}:${PATH}
## Create name of the log file
log_file=5.${run}.phase_loops.log
# Print arguments to ensure correct parsing
echo "
Arguments:
run=${run}
fasta=${fasta}
snp_file=${snp_file}
out_dir=${out_dir}
bin_dir=${bin_dir}
" >> ${log_file}
# Create file names
bam_file=${run}.for.BROWSER.bam
loop_file=${run}.e500.clusters.cis.BE3
pileup_file=${run}.total_snp_pileup.txt
if [ ${peak_caller} == "spp" ] || [ ${peak_caller} == "SPP" ]
then
peak_file=${run}.for.BROWSER.spp.z6.broadPeak
else
peak_file=${run}.no_input_all_peaks.narrowPeak
fi
# Separate the SNP file into one file for each chromosome
# (named after the chromosome)
echo -e "`date` Splitting SNP file by chromosome..\n" >> ${log_file}
awk '{ print >> "temp_"$1 }' ${snp_file}
# Count the SNP alleles for each chromosome
echo -e "`date` Counting SNP alleles on each chromosome..\n" >> ${log_file}
### Get chromosome names from the fasta file
chroms=$(grep ">" ${fasta} | sed -e "s/>//g")
echo -e "`date` Detected the following chroms in the FASTA file..\n" \
>> ${log_file}
for chrom in ${chroms}
do
echo -e "\t${chrom}" >> ${log_file}
done
### Pileup chromosomes
echo -e "`date` Performing mpileup on chromosome:\n" \
>> ${log_file}
job_ids=()
for chrom in ${chroms}
do
# Skip chrM
if [ ${chrom} == 'chrM' ] || [ ${chrom} == 'chrY' ]
then
echo -e "\t (Skipping ${chrom})" >> ${log_file}
continue
fi
# Create a separate PBS job for each chromosome
# to count SNP alleles with samtools
echo -e "\t ${chrom}" >> ${log_file}
job_name="temp_job_${chrom}"
# Change to output directory and load modules
pbs="cd ${out_dir};"
pbs+="module load samtools/0.1.19;"
# Samtools command
# By default, samtools/0.1.19 has BAQ filtering ON
# It is important to retain this if changing to a different version
pbs+="samtools mpileup -f ${fasta} "
pbs+="-l temp_${chrom} ${bam_file} "
pbs+="> temp_counts.${chrom}.mileup"
# Submit job
jid=$( echo -e ${pbs} 2>> ${log_file} | \
qsub -l nodes=1:ppn=${n_thread},mem=${mem},walltime=8:00:00 -N ${job_name})
sleep 0.5
job_ids+=" ${jid}"
done
# Wait for all chromosome jobs to finish
echo -e "`date` Waiting for job completion... \n" >> ${log_file}
for jid in ${job_ids}
do
echo -e "\t${jid}" >> ${log_file}
while [ -n "$( qstat | grep ${jid} )" ]
do
sleep 1
done
done
# Report completion of counting SNP alleles
echo -e "`date` Completed counting SNP alleles..\n" >> ${log_file}
### Merge SNP allele counts from all chromosomes
echo -e "`date` Merging SNP allele counts from all chroms..\n" >> ${log_file}
cat *mileup | awk 'BEGIN{FS="\t";OFS="\t"}{chrom=$1;pos=$2;ref=$3;\
num=$4+$7+$10;bases=$5""$8""$11;qual=$6""$9""$12;\
print chrom,pos,ref,num,bases,qual}' | awk 'NF==6{print}' \
> ${pileup_file}
echo -e "`date` Completed merging SNP allele counts..\n" >> ${log_file}
# Remove temporary files
echo -e "`date` Removing temp files..\n" >> ${log_file}
#rm temp_*
echo -e "`date` Completed removing temp files..\n" >> ${log_file}
### Count phased SNPs
# Do binomial test to identify significantly biased SNPs
# Output file name: phased_snp_qvalue.txt
echo -e "`date` Counting SNP allele bias and assessing significance.." \
>> ${log_file}
# Allele-biased SNPs (R)
Rscript ${bin_dir}/compute_phased_SNP.R \
${run} ${pileup_file} ${snp_file} TRUE 0.1
# Create BED file of All SNPs
cat ${run}.snp_qvalues.txt | grep -v biased | awk -v OFS='\t' \
'{ print $1, $2, $2+200, $7, "+", $4, $5, $6, "153,153,153"}' \
> ${run}.snp_qvalues.bed
# Create BED file of Phased SNPs
cat ${run}.snp_qvalues.txt | grep Yes | awk -v OFS='\t' \
'{
if ($4 >= $5)
print $1, $2, $2+200, $7, "+", $4, $5, $6, "26,152,80"
else
print $1, $2, $2+200, $7, "+", $4, $5, $6, "215,48,39"
}' \
> ${run}.snp_qvalues.phased.bed
# Make bedgraph of Paternal allele counts
cat phased_snp_qvalue.txt | grep -v biased | \
awk -v OFS='\t' '{ print $1, $2, $2+100, $4 }' \
> ${run}.snp_coverage.Paternal.100.bedgraph
# Make bedgraph of maternal allele counts
cat phased_snp_qvalue.txt | grep -v biased | \
awk -v OFS='\t' '{ print $1, $2, $2+10, $5 }' \
> ${run}.snp_coverage.Maternal.10.bedgraph
echo -e "`date` Completed counting SNP allele bias..\n" >> ${log_file}
#######
cat phased_snp_qvalue.Yes.txt | grep -v biased | \
awk -v OFS='\t' '{ print $1, $2, $2+1, $4 }' \
> ${run}.snp_coverage.Yes.Paternal.1.bedgraph
########
### Compute phased interactions
# phased_snp_qvalue.txx is generated by last step
# (non-overlapping)
echo "`date` Identifying biased loops (from raw loops and biased SNPs)..
" >> ${log_file}
perl ${bin_dir}/compute_phased_loops.pl \
phased_snp_qvalue.txt ${peak_file} ${loop_file}
# Create two subset BED files
# BED file of only phased peaks
cat ${peak_file}.SNP_Phased.browser.bed | grep -v Unphased \
> ${peak_file}.SNP_Phased.Paternal_or_Maternal.bed
# BED file of only unphased peaks
cat ${peak_file}.SNP_Phased.browser.bed | grep Unphased \
> ${peak_file}.SNP_Phased.Unphased.bed
echo -e "`date` Completed identifying biased loops..\n" >> ${log_file}
echo "`date` $0 done" >> ${log_file}