-
Notifications
You must be signed in to change notification settings - Fork 66
/
Copy pathseq-len
executable file
·71 lines (64 loc) · 1.85 KB
/
seq-len
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env perl
# Created: 17 Jan 2021
# Author: Thomas Hackl, [email protected]
use warnings;
use strict;
use Getopt::Long qw(:config no_ignore_case bundling);
use Data::Dumper;
GetOptions (
"out|o=s" => sub { '-' ne $_[1] and open(STDOUT, '>', $_[1]) || die $! },
"help|h!" => \(my $help),
"debug|D!" => \(my $debug),
) or die("Error in command line arguments\n");
if ($help){
print "Usage: seq-len < in > out\n";
printf " %-19s %s\n", "-o/--out", "write to this file [STDOUT]";
printf " %-19s %s\n", "-h/--help", "show this help";
printf " %-19s %s\n", "-D/--debug", "show debug messages";
print "Reads FASTA, GFF3 and Genbank.\n";
print "Generates three tab-separated columns: seq_id, seq_desc and length\n";
exit 0;
}
# determine format based on first line
my $first = <>;
my @r;
my $seq_reg = 0;
# fasta
if ($first =~ /^>(\S+)(?:\s([^\n\r]*))?/){ # .* captures \r in some cases
@r = ($1, $2 // "", 0);
while (<>) {
tr/\r\n//d; # chomp() fails on \r\n;
if (/^>(\S+)(?:\s(.*))?/) {
print join("\t", @r),"\n";
@r = ($1, $2 // "", 0);
} else {
$r[2] += length($_)
}
}
print join("\t", @r),"\n";
}
# gbk
elsif ($first=~ /^LOCUS\s+(\S+)\s+(\d+)/) {
print $1,"\t\t",$2,"\n";
while (<>) {
if (/^LOCUS\s+(\S+)\s+(\d+)/){
print $1,"\t\t",$2,"\n";
}
}
}
# gff
elsif ($first =~ /^##gff/) {
while (<>){
$first = undef;
if (/^##sequence-region\s(\S+)\s(\d+)\s(\d+)/){
$seq_reg++;
print $1,"\t\t",$3-$2+1,"\n";
}
};
unless ($seq_reg) {
die "Cannot parse sequence lengths: ##sequence-region directives are missing.\n";
}
}
else {
die "Unknown format, can read fasta, gbk and gff with proper ##gff-version and ##sequence-region directives\n";
}