-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #36 from broadinstitute/fn_update_docker
Updating Docker with Lydia's Perl Scripts
- Loading branch information
Showing
20 changed files
with
2,711 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
158 changes: 158 additions & 0 deletions
158
taxon_id_scripts/add_column_with_superkingdom_of_taxon_id.pl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
#!/usr/bin/env perl

# Reads in column containing taxon id and adds column containing the superkingdom of
# that taxon id.
#
# The superkingdom of a taxon id is found by following child -> parent links read from
# nodes.dmp, starting at the taxon id itself, until one of the superkingdom taxon ids
# (Archaea, Bacteria, Eukaryota, Viruses) is reached. Rows whose taxon id is missing,
# not present in nodes.dmp, or not below any superkingdom get the value "NA".

# Usage:
# perl add_column_with_superkingdom_of_taxon_id.pl [table]
# [title of column containing taxon ids] [nodes.dmp file from NCBI]

# Prints to console. To print to file, use
# perl add_column_with_superkingdom_of_taxon_id.pl [table]
# [title of column containing taxon ids] [nodes.dmp file from NCBI] > [output table path]


use strict;
use warnings;


my $table = $ARGV[0];                 # tab-delimited table; first non-empty row holds column titles
my $taxon_id_column_title = $ARGV[1]; # title of the column containing taxon ids
my $nodes_file = $ARGV[2]; # nodes.dmp file from NCBI: ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz


my $NO_DATA = "NA";
my $NEWLINE = "\n";
my $DELIMITER = "\t";
my $TAXONDUMP_DELIMITER = qr/\t[|]\t/; # field separator in nodes.dmp

my $SUPERKINGDOM_COLUMN_TITLE = "superkingdom"; # column to add

# superkingdoms
my %TAXON_ID_TO_SUPERKINGDOM = (
	2157 => "Archaea",
	2 => "Bacteria",
	2759 => "Eukaryota",
	10239 => "Viruses");
my $ROOT_TAXON_ID = 1;

# nodes.dmp columns
my $TAXONID_COLUMN = 0;
my $PARENTID_COLUMN = 1;


# verifies that all inputs are provided and that input files exist and are non-empty
if(!$nodes_file or !-e $nodes_file or -z $nodes_file)
{
	die "Error: nodes.dmp file not provided, does not exist, or empty:\n\t"
		.(defined $nodes_file ? $nodes_file : "")."\nExiting.\n";
}
if(!$table or !-e $table or -z $table)
{
	die "Error: input table not provided, does not exist, or is empty:\n\t"
		.(defined $table ? $table : "")."\nExiting.\n";
}
if(!defined $taxon_id_column_title)
{
	die "Error: title of column containing taxon ids not provided.\nExiting.\n";
}


# reads in nodes file
my %taxonid_to_parent = (); # key: taxon id -> value: taxon id of parent taxon
# "or" (not "||") so that the die applies to open() rather than to the file name
open(my $nodes_fh, "<", $nodes_file)
	or die "Could not open $nodes_file to read: $!\n";
while(my $nodes_line = <$nodes_fh>)
{
	chomp $nodes_line;
	next unless $nodes_line =~ /\S/;

	# only the first two fields are needed; the limit avoids splitting the rest of the line
	my @items = split($TAXONDUMP_DELIMITER, $nodes_line, 3);
	my $taxonid = $items[$TAXONID_COLUMN];
	my $parent_taxonid = $items[$PARENTID_COLUMN];

	# skips malformed lines that do not contain both a taxon id and a parent id
	next unless defined $taxonid and defined $parent_taxonid;

	$taxonid_to_parent{$taxonid} = $parent_taxonid;
}
close $nodes_fh;


# reads in taxon id column of table and adds superkingdom column
my $first_line = 1;
my $taxon_id_column = -1;
open(my $table_fh, "<", $table)
	or die "Could not open $table to read: $!\n";
while(my $line = <$table_fh>) # for each row in the file
{
	chomp $line;
	next unless $line =~ /\S/; # skips empty rows

	# limit of -1 keeps trailing empty columns
	my @items_in_line = split($DELIMITER, $line, -1);
	if($first_line) # column titles
	{
		# identifies column containing taxon ids; if the title appears more than
		# once, warns and uses the last occurrence
		my $column = 0;
		foreach my $column_title(@items_in_line)
		{
			if(defined $column_title and $column_title eq $taxon_id_column_title)
			{
				if($taxon_id_column != -1)
				{
					print STDERR "Warning: column title ".$taxon_id_column_title
						." appears more than once in input table:\n\t".$table."\n";
				}
				$taxon_id_column = $column;
			}
			$column++;
		}

		# verifies that we have found the taxon id column
		if($taxon_id_column == -1)
		{
			die "Warning: column title ".$taxon_id_column_title
				." not found in input table:\n\t".$table."\nExiting.\n";
		}
		$first_line = 0; # next line is not column titles

		# prints line as is, followed by title of new superkingdom column
		print $line.$DELIMITER.$SUPERKINGDOM_COLUMN_TITLE.$NEWLINE;
	}
	else # column values (not column titles)
	{
		# retrieves taxon id; undefined if the row has fewer columns than the header
		my $taxon_id = $items_in_line[$taxon_id_column];

		# retrieves superkingdom
		my $superkingdom = $NO_DATA;
		if(defined $taxon_id)
		{
			# surrounding whitespace (e.g. a carriage return) is ignored for the lookup only
			my $ancestor_taxon_id = $taxon_id;
			$ancestor_taxon_id =~ s/^\s+|\s+$//g;

			# walks up the tree until a superkingdom is found, the root is reached,
			# or the current taxon has no known parent; %visited stops the walk if
			# the nodes file contains a cycle
			my %visited = ();
			while($superkingdom eq $NO_DATA
				and $taxonid_to_parent{$ancestor_taxon_id}
				and $ancestor_taxon_id ne $taxonid_to_parent{$ancestor_taxon_id}
				and $ancestor_taxon_id ne $ROOT_TAXON_ID
				and !$visited{$ancestor_taxon_id}++)
			{
				if($TAXON_ID_TO_SUPERKINGDOM{$ancestor_taxon_id})
				{
					$superkingdom = $TAXON_ID_TO_SUPERKINGDOM{$ancestor_taxon_id};
				}
				$ancestor_taxon_id = $taxonid_to_parent{$ancestor_taxon_id};
			}
		}

		# prints line as is, followed by superkingdom column value
		print $line.$DELIMITER.$superkingdom.$NEWLINE;
	}
}
close $table_fh;


# April 4, 2023
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#!/usr/bin/env perl

# Adds column with specified title and specified value for all values.
#
# The first non-empty row of the table is treated as the column titles and gets the
# new column title appended; every other non-empty row gets the new column value
# appended. Empty rows are dropped from the output.

# Usage:
# perl add_one_value_column.pl [table to add column to] "[title of column to add]"
# "[value of column to add]"

# Prints to console. To print to file, use
# perl add_one_value_column.pl [table to add column to] "[title of column to add]"
# "[value of column to add]" > [output table path]


use strict;
use warnings;


my $table = $ARGV[0];
# a missing title or value is treated as an empty string, so the added column is
# simply empty instead of raising an uninitialized-value warning on every row
my $title_of_column_to_add = defined $ARGV[1] ? $ARGV[1] : "";
my $value_of_column_to_add = defined $ARGV[2] ? $ARGV[2] : "";

my $NEWLINE = "\n";
my $DELIMITER = "\t";


# verifies that input table exists and is not empty
if(!$table or !-e $table or -z $table)
{
	die "Error: table to add column to not provided, does not exist, or empty:\n\t"
		.(defined $table ? $table : "")."\nExiting.\n";
}


# reads in table and appends the new column to each non-empty row
my $first_line = 1;
# "or" (not "||") so that the die applies to open() rather than to the file name
open(my $table_fh, "<", $table)
	or die "Could not open $table to read: $!\n";
while(my $line = <$table_fh>) # for each row in the file
{
	chomp $line;
	next unless $line =~ /\S/; # skips empty rows

	if($first_line) # column titles
	{
		# prints line as is, followed by title of new column
		print $line.$DELIMITER.$title_of_column_to_add.$NEWLINE;
		$first_line = 0;
	}
	else # column values (not column titles)
	{
		# prints line as is, followed by value of new column
		print $line.$DELIMITER.$value_of_column_to_add.$NEWLINE;
	}
}
close $table_fh;


# September 26, 2021
# November 8, 2021
91 changes: 91 additions & 0 deletions
91
taxon_id_scripts/download_fasta_sequences_from_accession_numbers.pl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
#!/usr/bin/env perl

# Given a list of accession numbers, one per line, downloads and prints fasta sequences
# corresponding to each accession number.
#
# Accession numbers are requested from NCBI efetch in batches (one curl call per
# batch). The output of each call is printed with blank lines removed. Requires curl
# to be available on the PATH.

# Based on option 2 in https://edwards.flinders.edu.au/ncbi-sequence-or-fasta-batch-download-using-entrez/


# Usage:
# perl download_fasta_sequences_from_accession_numbers.pl
# [path of file with list of accession numbers, one per line]
# [database (nucleotide by default)]

# Prints to console. To print to file, use
# perl download_fasta_sequences_from_accession_numbers.pl
# [path of file with list of accession numbers, one per line]
# [database (nucleotide by default)] > [output fasta file path]

use strict;
use warnings;


my $accession_numbers_file = $ARGV[0]; # list of accession numbers, one per line
my $database = $ARGV[1]; # nucleotide by default
if(!$database)
{
	$database = "nucleotide";
}


my $NEWLINE = "\n";
my $MAXIMUM_NUMBER_ACCESSION_NUMBERS_IN_ONE_URL = 400;
my $EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi";


# verifies that list of accession numbers exists and is not empty
if(!$accession_numbers_file or !-e $accession_numbers_file or -z $accession_numbers_file)
{
	die "Error: file with accession numbers not provided, does not exist, or empty:\n\t"
		.(defined $accession_numbers_file ? $accession_numbers_file : "")."\nExiting.\n";
}


# reads in accession numbers and groups them into comma-separated batches, one batch
# per URL
my @accession_numbers_lists = ();
my @current_accession_numbers = ();
# "or" (not "||") so that the die applies to open() rather than to the file name
open(my $accession_numbers_fh, "<", $accession_numbers_file)
	or die "Could not open $accession_numbers_file to read: $!\n";
while(my $accession_number = <$accession_numbers_fh>)
{
	chomp $accession_number;

	# removes surrounding whitespace (e.g. a carriage return), which must not end up
	# in the URL
	$accession_number =~ s/^\s+|\s+$//g;
	next if $accession_number eq "";

	push(@current_accession_numbers, $accession_number);
	if(scalar @current_accession_numbers >= $MAXIMUM_NUMBER_ACCESSION_NUMBERS_IN_ONE_URL)
	{
		push(@accession_numbers_lists, join(",", @current_accession_numbers));
		@current_accession_numbers = ();
	}
}
close $accession_numbers_fh;

# saves the last, partially filled batch; skipped when it is empty so that no request
# with an empty id list is sent
if(@current_accession_numbers)
{
	push(@accession_numbers_lists, join(",", @current_accession_numbers));
}


# downloads and prints fasta sequences, one request per batch
# example URL: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=D90600.1
foreach my $accession_numbers_list(@accession_numbers_lists)
{
	my $url = $EFETCH_URL."?db=".$database."&rettype=fasta&retmode=text&id="
		.$accession_numbers_list;

	# list-form pipe open runs curl directly, without a shell, so nothing in the URL
	# is interpreted by a shell and no temporary file is needed
	open(my $curl_output, "-|", "curl", $url)
		or die "Could not run curl: $!\n";
	while(my $fasta_line = <$curl_output>)
	{
		chomp $fasta_line;
		if($fasta_line =~ /\S/)
		{
			print $fasta_line.$NEWLINE;
		}
	}

	# close on a pipe returns false if curl exited with a non-zero status; stops
	# here rather than silently producing incomplete output
	if(!close $curl_output)
	{
		die "Error: curl failed (exit status ".($? >> 8).") while downloading:\n\t"
			.$url."\nExiting.\n";
	}
}


# December 27, 2022
# December 29, 2022
Oops, something went wrong.