Skip to content

Commit

Permalink
Merge pull request #36 from broadinstitute/fn_update_docker
Browse files Browse the repository at this point in the history
Updating Docker with Lydia's Perl Scripts
  • Loading branch information
golu099 authored Nov 16, 2023
2 parents 948a783 + 0a8164f commit 64f63a2
Show file tree
Hide file tree
Showing 20 changed files with 2,711 additions and 2 deletions.
3 changes: 1 addition & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ ENV VIRAL_CLASSIFY_PATH=$INSTALL_PATH/viral-classify \
PATH="$PATH:$MINICONDA_PATH/envs/env2/bin"

COPY requirements-conda.txt requirements-conda-env2.txt $VIRAL_CLASSIFY_PATH/

# install most dependencies to the main environment
RUN $VIRAL_NGS_PATH/docker/install-conda-dependencies.sh $VIRAL_CLASSIFY_PATH/requirements-conda.txt

Expand All @@ -22,7 +21,7 @@ RUN CONDA_PREFIX="$MINICONDA_PATH/envs/env2"; \
COPY . $VIRAL_CLASSIFY_PATH

# Link key bits of python code into the path
RUN ln -s $VIRAL_CLASSIFY_PATH/metagenomics.py $VIRAL_CLASSIFY_PATH/taxon_filter.py $VIRAL_CLASSIFY_PATH/kmer_utils.py $VIRAL_CLASSIFY_PATH/classify $VIRAL_NGS_PATH
RUN ln -s $VIRAL_CLASSIFY_PATH/taxon_id_scripts/* $VIRAL_CLASSIFY_PATH/metagenomics.py $VIRAL_CLASSIFY_PATH/taxon_filter.py $VIRAL_CLASSIFY_PATH/kmer_utils.py $VIRAL_CLASSIFY_PATH/classify $VIRAL_NGS_PATH

# This not only prints the current version string, but it
# also saves it to the VERSION file for later use and also
Expand Down
158 changes: 158 additions & 0 deletions taxon_id_scripts/add_column_with_superkingdom_of_taxon_id.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#!/usr/bin/env perl

# Reads in column containing taxon id and adds column containing the superkingdom of
# that taxon id.

# Usage:
# perl add_column_with_superkingdom_of_taxon_id.pl [table]
# [title of column containing taxon ids] [nodes.dmp file from NCBI]

# Prints to console. To print to file, use
# perl add_column_with_superkingdom_of_taxon_id.pl [table]
# [title of column containing taxon ids] [nodes.dmp file from NCBI] > [output table path]


use strict;
use warnings;


my $table = $ARGV[0];
my $taxon_id_column_title = $ARGV[1];
my $nodes_file = $ARGV[2]; # nodes.dmp file from NCBI: ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz


my $NO_DATA = "NA";
my $NEWLINE = "\n";
my $DELIMITER = "\t";
my $TAXONDUMP_DELIMITER = "\t[|]\t"; # nodes.dmp and names.dmp

my $SUPERKINGDOM_COLUMN_TITLE = "superkingdom"; # column to add

# superkingdoms
my %TAXON_ID_TO_SUPERKINGDOM = (
2157 => "Archaea",
2 => "Bacteria",
2759 => "Eukaryota",
10239 => "Viruses");
my $ROOT_TAXON_ID = 1;

# nodes.dmp and names.dmp
my $TAXONID_COLUMN = 0; # both
my $PARENTID_COLUMN = 1; # nodes.dmp
my $RANK_COLUMN = 2; # nodes.dmp
my $NAMES_COLUMN = 1; # names.dmp
my $NAME_TYPE_COLUMN = 3; # names.dmp


# verifies that all input files exist and are non-empty
if(!$nodes_file or !-e $nodes_file or -z $nodes_file)
{
print STDERR "Error: nodes.dmp file not provided, does not exist, or empty:\n\t"
.$nodes_file."\nExiting.\n";
die;
}
if(!$table or !-e $table or -z $table)
{
print STDERR "Error: input table not provided, does not exist, or is empty:\n\t"
.$table."\nExiting.\n";
die;
}


# reads in nodes file
my %taxonid_to_parent = (); # key: taxon id -> value: taxon id of parent taxon
my %taxonid_to_rank = (); # key: taxon id -> value: rank of taxon
open NODES_FILE, "<$nodes_file" || die "Could not open $nodes_file to read\n";
while(<NODES_FILE>)
{
chomp;
if($_ =~ /\S/)
{
my @items = split($TAXONDUMP_DELIMITER, $_);
my $taxonid = $items[$TAXONID_COLUMN];
my $parent_taxonid = $items[$PARENTID_COLUMN];
my $rank = $items[$RANK_COLUMN];

$taxonid_to_parent{$taxonid} = $parent_taxonid;
$taxonid_to_rank{$taxonid} = $rank;
}
}
close NODES_FILE;


# reads in taxon id column of table and adds superkingdom column
my $first_line = 1;
my $taxon_id_column = -1;
open TABLE, "<$table" || die "Could not open $table to read; terminating =(\n";
while(<TABLE>) # for each row in the file
{
chomp;
my $line = $_;
if($line =~ /\S/) # if row not empty
{
my @items_in_line = split($DELIMITER, $line, -1);
if($first_line) # column titles
{
# identifies column to merge by and columns to save
my $column = 0;
foreach my $column_title(@items_in_line)
{
if(defined $column_title and $column_title eq $taxon_id_column_title)
{
if($taxon_id_column != -1)
{
print STDERR "Warning: column title ".$taxon_id_column_title
." appears more than once in input table:\n\t".$table."\n";
}
$taxon_id_column = $column;
}
$column++;
}

# verifies that we have found column to merge by
if($taxon_id_column == -1)
{
print STDERR "Warning: column title ".$taxon_id_column_title
." not found in input table:\n\t".$table."\nExiting.\n";
die;
}
$first_line = 0; # next line is not column titles

# prints line as is
print $line;

# prints titles of new superkingdom column
print $DELIMITER.$SUPERKINGDOM_COLUMN_TITLE.$NEWLINE;
}
else # column values (not column titles)
{
# retrieves taxon id
my $taxon_id = $items_in_line[$taxon_id_column];

# retrieves superkingdom
my $superkingdom = $NO_DATA;
my $ancestor_taxon_id = $taxon_id;
while($superkingdom eq $NO_DATA
and $taxonid_to_parent{$ancestor_taxon_id}
and $ancestor_taxon_id ne $taxonid_to_parent{$ancestor_taxon_id}
and $ancestor_taxon_id ne $ROOT_TAXON_ID)
{
if($TAXON_ID_TO_SUPERKINGDOM{$ancestor_taxon_id})
{
$superkingdom = $TAXON_ID_TO_SUPERKINGDOM{$ancestor_taxon_id};
}
$ancestor_taxon_id = $taxonid_to_parent{$ancestor_taxon_id}
}

# prints line as is
print $line;

# prints superkingdom column value
print $DELIMITER.$superkingdom.$NEWLINE;
}
}
}
close TABLE;


# April 4, 2023
72 changes: 72 additions & 0 deletions taxon_id_scripts/add_one_value_column.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env perl

# Adds column with specified title and specified value for all values.

# Usage:
# perl add_one_value_column.pl [table to add column to] "[title of column to add]"
# "[value of column to add]"

# Prints to console. To print to file, use
# perl add_one_value_column.pl [table to add column to] "[title of column to add]"
# "[value of column to add]" > [output table path]


use strict;
use warnings;


my $table = $ARGV[0];
my $title_of_column_to_add = $ARGV[1];
my $value_of_column_to_add = $ARGV[2];

my $NEWLINE = "\n";
my $DELIMITER = "\t";


# verifies that input table exists and is not empty
if(!$table or !-e $table or -z $table)
{
print STDERR "Error: table to add column to not provided, does not exist, or empty:\n\t"
.$table."\nExiting.\n";
die;
}


# reads in and adds column to table to add columns to
my $first_line = 1;
open TABLE, "<$table" || die "Could not open $table to read; terminating =(\n";
while(<TABLE>) # for each row in the file
{
chomp;
my $line = $_;
if($line =~ /\S/) # if row not empty
{
if($first_line) # column titles
{
# prints line as is
print $line;

# prints title of new column
print $DELIMITER;
print $title_of_column_to_add;
print $NEWLINE;

$first_line = 0;
}
else # column values (not column titles)
{
# prints line as is
print $line;

# prints value of new column
print $DELIMITER;
print $value_of_column_to_add;
print $NEWLINE;
}
}
}
close TABLE;


# September 26, 2021
# November 8, 2021
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env perl

# Given a list of accession numbers, one per line, downloads and prints fasta sequences
# corresponding to each accession number.

# Based on option 2 in https://edwards.flinders.edu.au/ncbi-sequence-or-fasta-batch-download-using-entrez/


# Usage:
# perl download_fasta_sequences_from_accession_numbers.pl
# [path of file with list of accession numbers, one per line]
# [database (nucleotide by default)]

# Prints to console. To print to file, use
# perl download_fasta_sequences_from_accession_numbers.pl
# [path of file with list of accession numbers, one per line]
# [database (nucleotide by default)] > [output fasta file path]

use strict;
use warnings;


my $accession_numbers_file = $ARGV[0]; # list of accession numbers, one per line
my $database = $ARGV[1]; # nucleotide by default
if(!$database)
{
$database = "nucleotide";
}


my $NEWLINE = "\n";
my $MAXIMUM_NUMBER_ACCESSION_NUMBERS_IN_ONE_URL = 400;
my $TEMP_FILE_EXTENSION = "_temp.txt";


# reads in accession numbers and retrieves corresponding fasta sequences
my @accession_numbers_lists = ();
my $current_accession_numbers_command_list = "";
my $current_number_accession_numbers = 0;
open ACCESSION_NUMBERS, "<$accession_numbers_file" || die "Could not open $accession_numbers_file to read\n";
while(<ACCESSION_NUMBERS>)
{
chomp;
if($_ =~ /\S/)
{
my $accession_number = $_;
if($current_accession_numbers_command_list)
{
$current_accession_numbers_command_list .= ",";
}
$current_accession_numbers_command_list .= $accession_number;
$current_number_accession_numbers++;

if($current_number_accession_numbers >= $MAXIMUM_NUMBER_ACCESSION_NUMBERS_IN_ONE_URL)
{
push(@accession_numbers_lists, $current_accession_numbers_command_list);
$current_number_accession_numbers = 0;
$current_accession_numbers_command_list = "";
}
}
}
close ACCESSION_NUMBERS;
push(@accession_numbers_lists, $current_accession_numbers_command_list);


# builds and runs command to download fasta file
# example URL: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=D90600.1
my $temp_file = $accession_numbers_file.$TEMP_FILE_EXTENSION;
foreach my $accession_numbers_list(@accession_numbers_lists)
{
# print STDERR $accession_numbers_list."\n";
my $command = "curl https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db="
.$database."\\&rettype=fasta\\&retmode=text\\&id=".$accession_numbers_list; # ." > ".$output_fasta;
`$command > $temp_file`;

open FASTA_SEQUENCES, "<$temp_file" || die "Could not open $temp_file to read\n";
while(<FASTA_SEQUENCES>)
{
chomp;
if($_ =~ /\S/)
{
print $_.$NEWLINE;
}
}
close FASTA_SEQUENCES;
}
`rm $temp_file`;


# December 27, 2022
# December 29, 2022
Loading

0 comments on commit 64f63a2

Please sign in to comment.