-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathimargi_parse.sh
executable file
·228 lines (197 loc) · 8.36 KB
/
imargi_parse.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#!/usr/bin/env bash
set -e
PROGNAME=$0
usage() {
cat << EOF >&2
Usage: $PROGNAME [-r <ref_name>] [-c <chromSize_file>] [-R <restrict_sites>] [-b <bam_file>] [-o <output_dir>]
[-Q <min_mapq>] [-G <max_inter_align_gap>] [-O <offset_restriction_site>] [-M <max_ligation_size>]
[-d <drop>] [-D <intermediate_dir>] [-t <threads>]
Dependency: pairtools pbgzip
This script will use pairtools to parse the BAM alignments to interaction read pairs in .pairs format, and apply
de-duplication and filtering.
-r : Reference assembly name, such as "hg38"
-c : Chromosome size file.
-R : DNA restriction enzyme digestion sites bed file.
-b : BAM file generated by "bwa mem -SP5M" mapping of iMARGI data.
-o : Output directoy
-Q : Min MAPQ value, default 1.
-G : Max inter align gap for pairtools parsing. Default 20. It will allow R1 5' end clipping.
-O : Max mis-offset bases for filtering pairs based on R2 5' end positions to restriction sites. Default 3.
-M : Max size of ligation fragment for sequencing. It's used for filtering unligated DNA sequence. Default 1000.
-d : Flag of dropping. Default is false, i.e., output all the intermediate results.
-D : Directory for intermediate results. Works when -d false. Default is a sub-folder "intermediate_results"
in output directory.
-t : Max CPU threads for parallelized processing, at least 4. (Default 8)
-h : Show usage help
EOF
exit 1
}
while getopts :r:c:R:b:o:Q:G:O:M:d:D:t:h opt; do
case $opt in
r) ref_name=${OPTARG};;
c) chromsize=${OPTARG};;
R) rsites=${OPTARG};;
b) bamfile=${OPTARG};;
o) output_dir=${OPTARG};;
Q) mapq=${OPTARG};;
G) gap=${OPTARG};;
O) offset=${OPTARG};;
M) max_ligation_size=${OPTARG};;
d) dflag=${OPTARG};;
D) inter_dir=${OPTARG};;
t) threads=${OPTARG};;
h) usage;;
esac
done
# threshold of: (#paired_unique_mapping + #single_side_unique_mapping) / #total_read_pairs
pass_mapping=0.25
warn_mapping=0.5
# threshold of: #final_valid_pairs / #paired_unique_mapping
pass_valid=0.25
warn_valid=0.5
[ -z "$ref_name" ] && echo "Error!! Please provide reference genome name with -r" && usage
[ ! -f "$chromsize" ] && echo "Error!! Chomosome size file not exist: "$chromsize && usage
[ ! -f "$rsites" ] && echo "Error!! Resitriction sites file not exist: "$rsites && usage
[ ! -f "$bamfile" ] && echo "Error!! BAM file not exist: "$bamfile && usage
[ ! -d "$output_dir" ] && echo "Error!! Output directory not exist: "$output_dir && usage
[ -z "$mapq" ] && echo "Use default min mapq 1 for pairtools parsing." && mapq=1
if ! [[ "$mapq" =~ ^[0-9]+$ ]]; then
echo "Error!! Only integer number is acceptable for -Q" && usage
fi
[ -z "$gap" ] && echo "Use default max inter align gap for pairtools parsing." && gap=20
if ! [[ "$gap" =~ ^[0-9]+$ ]]; then
echo "Error!! Only integer number is acceptable for -g" && usage
fi
[ -z "$offset" ] && echo "Use default offset 3'." && offset=3
if ! [[ "$offset" =~ ^[0-9]+$ ]]; then
echo "Error!! Only integer number is acceptable for -O" && usage
fi
[ -z "$max_ligation_size" ] && echo "Use default max ligation size 1000'." && max_ligation_size=1000
if ! [[ "$max_ligation_size" =~ ^[0-9]+$ ]]; then
echo "Error!! Only integer number is acceptable for -M" && usage
fi
[ -z "$dflag" ] && echo "Use default setting '-d false'." && dflag="false"
if [[ "$dflag" != "false" ]] && [[ "$dflag" != "true" ]]; then
echo "Error!! Only true or false is acceptable for -d." && usage
fi
if [[ "$dflag" == "false" ]] ; then
[ -z "$inter_dir" ] && \
echo "Use default directory for intermediate result files: "$output_dir"/intermediate_results" && \
inter_dir="$output_dir/intermediate_results" && mkdir $inter_dir
[ ! -d "$inter_dir" ] && echo "Error!! Directory for intermediate result files not exist: "$inter_dir && usage
else
inter_dir=$output_dir"/inter_bam2pairs_"$RANDOM""$RANDOM
mkdir $inter_dir
fi
[ -z "$threads" ] && echo "Use default thread number 8'." && threads=8
if ! [[ "$threads" =~ ^[0-9]+$ ]]; then
echo "Error!! Only integer number is acceptable for -t" && usage
fi
filebase=$(basename ${bamfile%.*})
all_pairs=$inter_dir"/all_"$filebase".pairs.gz"
sorted_all_pairs=$inter_dir"/sorted_all_"$filebase".pairs.gz"
dedup_pairs=$inter_dir"/dedup_"$filebase".pairs.gz"
ummapped_pairs=$inter_dir"/unmapped_"$filebase".pairs.gz"
duplication_pairs=$inter_dir"/duplication_"$filebase".pairs.gz"
final_pairs=$output_dir"/final_"$filebase".pairs.gz"
drop_pairs=$inter_dir"/drop_"$filebase".pairs.gz"
stats=$inter_dir"/stats_final_"$filebase".txt"
date
echo "Start parsing ..."
pairtools parse -c $chromsize \
--assembly $ref_name \
--add-columns mapq,cigar \
--no-flip \
--min-mapq $mapq \
--max-inter-align-gap $gap \
--drop-sam \
--report-alignment-end 5 \
--walks-policy 5any \
--nproc-in $(($threads/3+1)) \
--nproc-out $(($threads-$threads/3-1)) \
--output $all_pairs \
$bamfile
date
echo "Start sort ..."
pairtools sort \
--nproc $(($threads/2 + 1)) \
--tmpdir $inter_dir \
--nproc-in $(($threads/3+1)) \
--nproc-out $(($threads-$threads/3-1)) \
--output $sorted_all_pairs \
$all_pairs
date
echo "Start de-duplication ..."
pairtools dedup \
--mark-dups \
--nproc-in $(($threads/3+1)) \
--output-stats $inter_dir"/stats_dedup_"$filebase".txt" \
--output-dups $duplication_pairs \
--output-unmapped $ummapped_pairs \
$sorted_all_pairs |\
imargi_restrict.py \
--frags $rsites \
--output $dedup_pairs \
--nproc-out $(($threads-$threads/3-1))
date
echo "Start filtering ... "
select_str="regex_match(pair_type, \"[UuR][UuR]\") and \
dist2_rsite != \"!\" and \
(abs(int(dist2_rsite)) <= "$offset") and \
(not (chrom1 == chrom2 and \
abs(int(dist1_rsite)) <= "$offset" and \
strand1 != strand2 and \
((strand1 == \"+\" and strand2 == \"-\" and int(frag1_start) <= int(frag2_start) and \
abs(int(frag2_end) - int(frag1_start)) <= "$max_ligation_size") or \
(strand1 == \"-\" and strand2 == \"+\" and int(frag1_start) >= int(frag2_start) and \
abs(int(frag1_end) - int(frag2_start)) <= "$max_ligation_size"))))"
# echo $select_str
pairtools select "$select_str" \
--chrom-subset $chromsize \
--output-rest $drop_pairs \
--nproc-in $(($threads/3+1)) \
--nproc-out $(($threads-$threads/3-1)) \
--output $final_pairs \
$dedup_pairs
# pairix $final_pairs
date
echo "Generating final stats ... "
pairtools stats \
--nproc-in $(($threads/3+1)) \
--nproc-out $(($threads-$threads/3-1)) \
--output $stats \
$final_pairs
rm $all_pairs
awk -v pass_mapping=$pass_mapping -v warn_mapping=$warn_mapping \
-v pass_valid=$pass_valid -v warn_valid=$warn_valid \
'BEGIN{
FS="\t"; OFS="\t"
}FNR==NR{
if(FNR<7){count_raw[$1]=$2};
}FNR!=NR{
if(FNR<9){count[$1]=$2}else{exit};
}END{
qc_mapping=(count_raw["total_single_sided_mapped"] + count_raw["total_mapped"])/count_raw["total"];
qc_valid=count["total"]/count_raw["total_nodups"];
if(qc_mapping >= pass_mapping && qc_valid >= pass_valid){
warn_message="";
if(qc_mapping < warn_mapping || qc_valid < warn_valid){
warn_message=" (The sequence mapping rates are lower than average. Experimental repetition or improvements are recommended.)"
};
print "Sequence mapping QC\tpassed"warn_message;
}else{print "Sequence mapping QC\tfailed"};
print "(#unique_mapped_pairs + #single_side_unique_mapped)/#total_read_pairs", qc_mapping;
print "#total_valid_interactions/#nondup_unique_mapped_pairs", qc_valid;
print "total_read_pairs", count_raw["total"];
print "single_side_unique_mapped", count_raw["total_single_sided_mapped"];
print "unique_mapped_pairs", count_raw["total_mapped"];
print "nondup_unique_mapped_pairs", count_raw["total_nodups"];
print "total_valid_interactions", count["total"];
print "inter_chr", count["trans"];
print "intra_chr", count["cis"];
}' $inter_dir/stats_dedup_$filebase.txt $stats > $output_dir/pipelineStats_$filebase.log
date
echo "Parsing and filtering finished."
if [[ "$dflag" == "true" ]]; then
rm -rf $inter_dir
fi