-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfsize
executable file
·66 lines (66 loc) · 1.99 KB
/
fsize
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/perl
#
#pipe a multi-fasta into this and get the sequence size for each fasta record
#
use strict;
use Getopt::Std;
use Digest::MD5 qw(md5_hex);
my $usage=q/
Reports the FASTA sequence length for each record in a multi-FASTA file
The output line format is (tab delimited):
<seq_name> <seq_len> [<MD5>|<seq_description>]
..where <seq_description> is the rest of the text following
the sequence name in the defline, if any
Options:
-S print a summary of the sequence data found
-M print a MD5 checksum instead of the sequence description
(MD5 is computed on the uppercase sequence, soft-masking it ignored)
-C print soft-masked bases count, N count and MD5 checksum instead
of the sequence description
/;
die $usage."\n" if ($ARGV[0]=~m/^\-+h/);
getopts('CSM') || die($usage."\n");
my $summary=$Getopt::Std::opt_S;
my $checksum=$Getopt::Std::opt_M;
my $seqck=$Getopt::Std::opt_C;
local $/="\n>";
my ($minlen, $min, $max, $maxlen, $total, $totlc, $totn, $avg, $std);
$minlen=2000000000;
my @v; #
while (<>) {
s/^>//;
chomp;
my ($seqname, $ann, $seq)=(m/^(\S+)[ \t\x01]*([^\n]*)\n?(.*)/s);
my @nr=split(/\x01/, $ann, 2);
$ann=$nr[0] if (@nr>1);
$seq =~ tr/\t \n\r//d;
my $len=length($seq);
my $lc=($seq=~tr/a-z//);
my $nb=($seq=~tr/Nn//);
my @ol=($seqname, $len);
if ($checksum) {
my $md5=md5_hex(uc($seq));
push(@ol, $md5);
} elsif ($seqck) {
my $md5=md5_hex(uc($seq));
push(@ol, $lc, $nb, $md5);
}
else {
push(@ol,$ann) if $ann;
}
push(@v, $len);
$total+=$len;
$totlc+=$lc;
$totn+=$nb;
($minlen, $min)=($len, $seqname) if ($len<$minlen);
($maxlen, $max)=($len, $seqname) if ($len>$maxlen);
print join("\t", @ol)."\n";
}
my $numseqs=@v;
if ($summary) {
#TODO: calculate variance
print STDERR "Total $total bases in $numseqs sequences ($totlc lower-case, $totn Ns).\n";
print STDERR "Max length: $maxlen (sequence $max)\n";
print STDERR "Min length: $minlen (sequence $min)\n";
print STDERR "Average : ".int($total/$numseqs)."\n";
}