-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathncbitax2bcp.pl
executable file
·103 lines (96 loc) · 2.7 KB
/
ncbitax2bcp.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/perl
use strict;
#use Getopt::Std;
use FindBin;use lib $FindBin::Bin;
# Help text printed for -h/--help style arguments and when nodes.dmp is
# missing. It must be qq{} rather than q{}: q{} does not interpolate, so the
# text used to show a literal "$0" instead of this script's name.
my $usage = qq{Usage:
Download ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
And unpack it in the current directory
Then run $0 which will create
the bcp files for loading into the database.
};
# Only the "other" write bit is masked, so created files stay group-writable.
umask 0002;
# Filled from merged.dmp: key is the tax_id that is looked up while reading
# nodes.dmp, value is the list of first-column ids from merged.dmp that get
# an extra copy of that node's row.
my %merged; #insert "merged" entries too, even though their data is duplicate
#getopts('o:') || die($usage."\n");
#my $outfile=$Getopt::Std::opt_o;
# Show the usage when the first argument is a dash followed only by the
# characters "-", "h", "e", "l", "p" (e.g. -h, -help, --help, or a bare "-").
# The @ARGV guard keeps the match from running against undef when the script
# is started without arguments.
die("$usage\n") if @ARGV && $ARGV[0]=~m/^\-[\-help]*$/;
die("$usage Error: cannot find nodes.dmp!\n") unless -f 'nodes.dmp';
# Collect the distinct values of tab-delimited column 5 of nodes.dmp (the
# rank column, which the nodes.dmp pass later looks up in %ranks) through an
# external cut|sort pipeline, number them 1..N in the order the pipeline
# emits them, and write one 'R' row per rank to taxon_cat.bcp.
my $cmd='cut -f5 nodes.dmp | sort -u |';
# TAXC is deliberately left open here: the division rows are appended to the
# same file further down, where the handle is closed.
open(TAXC, '>taxon_cat.bcp') || die "Error creating taxon_cat.bcp file!\n";
my %ranks; #rank_name => rank_id
open(RANK, $cmd) || die "Error opening $cmd pipe!\n";
my ($rank_id, $rank_name);
while (<RANK>) {
    chomp;
    s/^\s+//;s/\s+$//;
    next unless $_;
    $rank_id++;
    $rank_name=$_;
    $ranks{$rank_name}=$rank_id;
    # Row layout: type tag, id, empty column, name, empty column.
    print TAXC join("\t", 'R', $rank_id, '', $rank_name, '')."\n";
}
# open() on a shell pipeline succeeds as soon as the fork does, so a failing
# command only becomes visible here: close() on a pipe waits for the child
# and returns false on a non-zero exit status ($! is 0 in that case and the
# status is in $?). Without this check a broken pipeline would silently
# leave %ranks empty or incomplete.
close(RANK)
    || die "Error reading from $cmd pipe: "
         .($! ? $! : "exit status ".($? >> 8))."\n";
# Load merged.dmp into %merged: every record contributes its first column
# to the list stored under its second column.
open(my $merged_fh, '<', 'merged.dmp') || die("Error opening merged.dmp!\n");
while (my $record = <$merged_fh>) {
    chomp $record;
    next unless $record;
    # Strip the trailing "<TAB>|" terminator, then split on "<TAB>|<TAB>".
    $record =~ s/\t\|$//;
    my @cols = split(/\t\|\t/, $record);
    my $target_list = ($merged{$cols[1]} ||= []);
    push(@$target_list, $cols[0]);
}
close($merged_fh);
# Append one 'D' row per division.dmp record (its first four columns) to
# taxon_cat.bcp through the already-open TAXC handle, then finish that file.
open(my $div_fh, '<', 'division.dmp') || die("Error opening division.dmp!\n");
while (my $line = <$div_fh>) {
    chomp $line;
    next unless $line;
    # Strip the trailing "<TAB>|" terminator, then split on "<TAB>|<TAB>".
    $line =~ s/\t\|$//;
    my @t=split(/\t\|\t/, $line);
    print TAXC join("\t", 'D', @t[0..3])."\n";
}
close($div_fh);
# Output is buffered, so a write failure (e.g. a full disk) may only be
# reported by close(); checking it avoids leaving a silently truncated file.
close(TAXC) || die("Error writing taxon_cat.bcp: $!\n");
# Copy names.dmp into taxon_names.bcp (same columns, re-joined with plain
# tabs) and, along the way, remember one scientific and one common name per
# tax_id in %names for the later pass over nodes.dmp.
open(my $names_in, '<', 'names.dmp') || die("Error opening names.dmp!\n");
open(my $names_out, '>', 'taxon_names.bcp') || die("Error creating taxon_names.bcp!\n");
my %names; # tax_id => [sci_name, com_name]
while (my $line = <$names_in>) {
    chomp $line;
    next unless $line;
    # Strip the trailing "<TAB>|" terminator, then split on "<TAB>|<TAB>".
    $line =~ s/\t\|$//;
    my @t=split(/\t\|\t/, $line);
    print $names_out join("\t", @t)."\n";
    # Column 0 is the tax_id, column 1 the name text, column 3 the name class.
    my ($tax_id, $name, $name_class) = @t[0, 1, 3];
    if ($name_class=~/\bcommon name/) {
        # A new entry starts with an empty scientific-name slot. When a
        # taxon has several matching rows, the last one read wins.
        my $entry = ($names{$tax_id} ||= ['']);
        $entry->[1] = $name;
    }
    elsif ($name_class=~/\bscientific name/) {
        my $entry = ($names{$tax_id} ||= ['']);
        $entry->[0] = $name;
    }
}
close($names_in);
# Output is buffered, so a write failure (e.g. a full disk) may only be
# reported by close(); checking it avoids leaving a silently truncated file.
close($names_out) || die("Error writing taxon_names.bcp: $!\n");
# Write taxon.bcp: one row per nodes.dmp record, carrying columns 0 and 1 of
# the record, the numeric rank id, columns 4 and 5, and the scientific and
# common names collected from names.dmp. Every id that merged.dmp maps onto
# the node gets an additional row with the same data under that id.
# Dies on a rank missing from %ranks or a node without a scientific name.
open(my $tax_out, '>', 'taxon.bcp') || die("Error creating taxon.bcp!\n");
# This message used to name names.dmp although nodes.dmp is the file opened.
open(my $nodes_in, '<', 'nodes.dmp') || die("Error opening nodes.dmp!\n");
while (my $line = <$nodes_in>) {
    chomp $line;
    next unless $line;
    # Strip the trailing "<TAB>|" terminator, then split on "<TAB>|<TAB>".
    $line =~ s/\t\|$//;
    my @t=split(/\t\|\t/, $line);
    my $rank_id=$ranks{$t[2]};
    die("Error: invalid rank $t[2] from nodes.dmp line:\n$line\n")
        unless defined($rank_id);
    my $d=$names{$t[0]};
    die("Error: names not found for taxon $t[0] at nodes.dmp line:\n$line\n")
        unless $d && $$d[0];
    my $sci_name=$$d[0];
    my $com_name=$$d[1] || '';
    # Everything after the leading id is identical for the node itself and
    # for the merged ids that point at it.
    my @shared=($t[1], $rank_id, $t[4], $t[5], $sci_name, $com_name);
    # "|| []" avoids creating an empty %merged entry for unmerged nodes.
    foreach my $tid ($t[0], @{ $merged{$t[0]} || [] }) {
        print $tax_out join("\t", $tid, @shared)."\n";
    }
}
close($nodes_in);
# Output is buffered, so a write failure (e.g. a full disk) may only be
# reported by close(); checking it avoids leaving a silently truncated file.
close($tax_out) || die("Error writing taxon.bcp: $!\n");