-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmgbl_miRNA_cov.pl
executable file
·149 lines (141 loc) · 4.99 KB
/
mgbl_miRNA_cov.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/perl
use strict;
use Getopt::Std;
use FindBin;use lib $FindBin::Bin;
my $usage = q/Usage:
mgbl_miRNA_cov.pl [-P] [-v <max_mms>] [-g <maxgaps>] [-f <valid_overlaps>] \
[-o <output_cov_data>] <mgblast_output.tab>
Filters the overlaps resulted from a mgblast search against a database of
known miRNAs and estimates expression level (for mature miRNAs).
Options:
-P precursor miRNAs were used for the mgblast search
-g discard overlaps with more than <maxgaps> gaps (default 2)
-v discard overlaps with more than <max_mms> mismatched bases
at the 3' end of the read (default 4 at the 3' end of the read);
-U unique mappings of reads to miRNAs: assuming the hits are sorted
by score, only use the first hit for a read and discard the rest
-R append column with all mapped reads for each miRNA
-B consider aligments on both strands (normally only Plus strand
alignments are kept)
Other implied restrictions:
* only 1 base mismatch is allowed at the 5' end of the read;
* in case of mature miRNAs, an overlap should involve at least 90% of the
read length and at least 70% of the mature miRNA length
* in case of precursor miRNAs, an overlap should involve at least 90% of
the read length
/;
umask 0002;
getopts('BRUPf:g:v:o:') || die($usage."\n");
my $outfile=$Getopt::Std::opt_o;
my $uniqmap=$Getopt::Std::opt_U;
my $rshow=$Getopt::Std::opt_R;
my $bothstrands=$Getopt::Std::opt_B;
my $fflt=$Getopt::Std::opt_f;
my $maxgaps=$Getopt::Std::opt_g || 2;
my $maxovh=$Getopt::Std::opt_v || 4;
my $premiRNA=$Getopt::Std::opt_P;
my $minmcov=70;
my $minrcov=90;
#my $totalreads=$Getopt::Std::opt_n;
#die($usage."Error: total number of reads in the sample is required! (-n parameter)\n")
# unless $totalreads>1;
if ($outfile) {
open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
select(OUTF);
}
if ($fflt) {
open(FLT, '>'.$fflt) || die("Error creating output file $fflt\n");
}
my %mdata; # mirna => [$cov, $read1, $read2,...]
my %rd; # reads assigned uniquely to miRNAs - first come first served,
# assuming first hit is the best one!
while (<>) {
my $mgline=$_;
chomp;
my ($read, $rlen, $rstart, $rend, $mirna, $mlen, $mstart, $mend,
$pid, $bitscore, $pvalue, $strand, $rgaps, $mgaps)=split(/\t/);
next unless $read && $mirna;
next if $uniqmap && exists($rd{$read}); #read already assigned to a mature mRNA
next if ($strand eq '-' && !$bothstrands);
my ($read_mult)=($read=~m/_x(\d+)$/); # read multiplicity
print STDERR ">checking $mgline";
print STDERR "read_multiplier=$read_mult, pid=$pid\n";
next if ($read_mult>=3 && $pid<100);
($rstart, $rend)=($rend, $rstart) if $rend<$rstart;
next if ($rend-$rstart+1)*100<$minrcov*$rlen;
unless ($premiRNA) {
next if ($mend-$mstart+1)*100<$minmcov*$mlen;
}
my ($rovh3, $rovh5)=($strand eq '-') ? ($rstart-1, $rlen-$rend) : ($rlen-$rend, $rstart-1);
my ($movh3, $movh5)=($mlen-$mend, $mstart-1);
my $mm3=($rovh3<$movh3) ? $rovh3 : $movh3;
my $mm5=($rovh5<$movh5) ? $rovh5 : $movh5;
my $maxmm=$maxovh;
$maxmm-- if $pid<100;
next if $mm3>$maxmm || $mm5>1;
my $numgaps=0;
if ($rgaps) {
my @rg=split(/\,/, $rgaps);
foreach my $g (@rg) {
$numgaps+= ($g=~m/\+(\d+)$/) ? $1 : 1;
}
next if $numgaps>2;
}
if ($mgaps) {
my @rg=split(/\,/, $mgaps);
foreach my $g (@rg) {
$numgaps+= ($g=~m/\+(\d+)$/) ? $1 : 1;
}
next if $numgaps>2;
}
next if ($read_mult>=3 && $numgaps>0);
# -- here we are, valid overlap, keep it:
print FLT $mgline if $fflt;
$rd{$read}=$mirna;
my ($rmul)=($read=~m/_x(\d+)$/); #read multiplicity
my $md=$mdata{$mirna};
unless ($md) {
my @mc=(0) x $mlen;
@mc[($mstart-1)..($mend-1)]=($rmul) x ($mend-$mstart+1);
# ------------- 0 1 2 3 4 5
$mdata{$mirna}=[$rmul, $mlen, $mstart, $mend, [@mc], $read];
next;
}
# add to existing miRNA
$md->[0]+=$rmul;
$$md[2]=$mstart if ($$md[2]>$mstart);
$$md[3]=$mend if ($$md[3]<$mend);
my $mc=$$md[4];
for (my $i=$mstart-1;$i<$mend;$i++) {
$$mc[$i]+=$rmul;
}
push(@$md, $read);
} #while input lines
# report coverage per known miRNA
my $k=keys(%mdata);
my ($maxmirna, $maxcov);
while (my ($mirna, $md)=each(%mdata)) {
my @r=@$md;
my $mcov=shift(@r);
my $mlen=shift(@r);
my $mcovstart=shift(@r);
my $mcovend=shift(@r);
my $covdata=shift(@r); #coverage per base
# find maximum coverage for this miRNA
my $maxc=$$covdata[0];
foreach my $c (@$covdata) { $maxc=$c if $c>$maxc; }
my $mlencov = ($mcovstart==1 && $mcovend==$mlen)? 'full' : sprintf('%.2f',($mcovend-$mcovstart)/$mlen);
($maxcov, $maxmirna)=($mcov, $mirna) if $maxcov<$mcov;
#print join("\t",$mirna, sprintf('%.4f',$mcov/$totalreads), $mcov.'x',
my $rlist=$rshow ? "\t".join(',',@r) : '';
print join("\t",$mirna, $mcov, $maxc, $mlencov)."$rlist\n";
}
print STDERR "Done. $k miRNAs covered.\n".
"Maximum coverage was found for $maxmirna at ${maxcov}x.\n";
#- script ending here:
close(FLT) if $fflt;
if ($outfile) {
select(STDOUT);
close(OUTF);
}
#======== Subroutines ============#