Merge pull request #36 from broadinstitute/fn_update_docker

Updating Docker with Lydia's Perl Scripts
broadinstitute · Nov 16, 2023 · 64f63a2 · 64f63a2
2 parents 948a783 + 0a8164f
commit 64f63a2
Show file tree

Hide file tree

Showing 20 changed files with 2,711 additions and 2 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -6,7 +6,6 @@ ENV VIRAL_CLASSIFY_PATH=$INSTALL_PATH/viral-classify \
 	PATH="$PATH:$MINICONDA_PATH/envs/env2/bin"
 
 COPY requirements-conda.txt requirements-conda-env2.txt $VIRAL_CLASSIFY_PATH/
-
 # install most dependencies to the main environment
 RUN $VIRAL_NGS_PATH/docker/install-conda-dependencies.sh $VIRAL_CLASSIFY_PATH/requirements-conda.txt 
 
@@ -22,7 +21,7 @@ RUN CONDA_PREFIX="$MINICONDA_PATH/envs/env2"; \
 COPY . $VIRAL_CLASSIFY_PATH
 
 # Link key bits of python code into the path
-RUN ln -s $VIRAL_CLASSIFY_PATH/metagenomics.py $VIRAL_CLASSIFY_PATH/taxon_filter.py $VIRAL_CLASSIFY_PATH/kmer_utils.py $VIRAL_CLASSIFY_PATH/classify $VIRAL_NGS_PATH
+RUN ln -s $VIRAL_CLASSIFY_PATH/taxon_id_scripts/* $VIRAL_CLASSIFY_PATH/metagenomics.py $VIRAL_CLASSIFY_PATH/taxon_filter.py $VIRAL_CLASSIFY_PATH/kmer_utils.py $VIRAL_CLASSIFY_PATH/classify $VIRAL_NGS_PATH
 
 # This not only prints the current version string, but it
 # also saves it to the VERSION file for later use and also

diff --git a/taxon_id_scripts/add_column_with_superkingdom_of_taxon_id.pl b/taxon_id_scripts/add_column_with_superkingdom_of_taxon_id.pl
@@ -0,0 +1,158 @@
+#!/usr/bin/env perl
+
+# Reads in column containing taxon id and adds column containing the superkingdom of
+# that taxon id.
+
+# Usage:
+# perl add_column_with_superkingdom_of_taxon_id.pl [table]
+# [title of column containing taxon ids] [nodes.dmp file from NCBI]
+
+# Prints to console. To print to file, use
+# perl add_column_with_superkingdom_of_taxon_id.pl [table]
+# [title of column containing taxon ids] [nodes.dmp file from NCBI] > [output table path]
+
+
+use strict;
+use warnings;
+
+
+my $table = $ARGV[0];
+my $taxon_id_column_title = $ARGV[1];
+my $nodes_file = $ARGV[2]; # nodes.dmp file from NCBI: ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
+
+
+my $NO_DATA = "NA";
+my $NEWLINE = "\n";
+my $DELIMITER = "\t";
+my $TAXONDUMP_DELIMITER = "\t[|]\t"; # nodes.dmp and names.dmp
+
+my $SUPERKINGDOM_COLUMN_TITLE = "superkingdom"; # column to add
+
+# superkingdoms
+my %TAXON_ID_TO_SUPERKINGDOM = (
+	2157 => "Archaea",
+	2 => "Bacteria",
+	2759 => "Eukaryota",
+	10239 => "Viruses");
+my $ROOT_TAXON_ID = 1;
+
+# nodes.dmp and names.dmp
+my $TAXONID_COLUMN = 0;	# both
+my $PARENTID_COLUMN = 1;	# nodes.dmp
+my $RANK_COLUMN = 2;		# nodes.dmp
+my $NAMES_COLUMN = 1;		# names.dmp
+my $NAME_TYPE_COLUMN = 3;	# names.dmp
+
+
+# verifies that all input files exist and are non-empty
+if(!$nodes_file or !-e $nodes_file or -z $nodes_file)
+{
+	print STDERR "Error: nodes.dmp file not provided, does not exist, or empty:\n\t"
+		.$nodes_file."\nExiting.\n";
+	die;
+}
+if(!$table or !-e $table or -z $table)
+{
+	print STDERR "Error: input table not provided, does not exist, or is empty:\n\t"
+		.$table."\nExiting.\n";
+	die;
+}
+
+
+# reads in nodes file
+my %taxonid_to_parent = (); # key: taxon id -> value: taxon id of parent taxon
+my %taxonid_to_rank = (); # key: taxon id -> value: rank of taxon
+open NODES_FILE, "<$nodes_file" || die "Could not open $nodes_file to read\n";
+while(<NODES_FILE>)
+{
+	chomp;
+	if($_ =~ /\S/)
+	{
+		my @items = split($TAXONDUMP_DELIMITER, $_);
+		my $taxonid = $items[$TAXONID_COLUMN];
+		my $parent_taxonid = $items[$PARENTID_COLUMN];
+		my $rank = $items[$RANK_COLUMN];
+
+		$taxonid_to_parent{$taxonid} = $parent_taxonid;
+		$taxonid_to_rank{$taxonid} = $rank;
+	}
+}
+close NODES_FILE;
+
+
+# reads in taxon id column of table and adds superkingdom column
+my $first_line = 1;
+my $taxon_id_column = -1;
+open TABLE, "<$table" || die "Could not open $table to read; terminating =(\n";
+while(<TABLE>) # for each row in the file
+{
+	chomp;
+	my $line = $_;
+	if($line =~ /\S/) # if row not empty
+	{
+		my @items_in_line = split($DELIMITER, $line, -1);
+		if($first_line) # column titles
+		{
+			# identifies column to merge by and columns to save
+			my $column = 0;
+			foreach my $column_title(@items_in_line)
+			{
+				if(defined $column_title and $column_title eq $taxon_id_column_title)
+				{
+					if($taxon_id_column != -1)
+					{
+						print STDERR "Warning: column title ".$taxon_id_column_title
+							." appears more than once in input table:\n\t".$table."\n";
+					}
+					$taxon_id_column = $column;
+				}
+				$column++;
+			}
+
+			# verifies that we have found column to merge by
+			if($taxon_id_column == -1)
+			{
+				print STDERR "Warning: column title ".$taxon_id_column_title
+					." not found in input table:\n\t".$table."\nExiting.\n";
+				die;
+			}
+			$first_line = 0; # next line is not column titles
+
+			# prints line as is
+			print $line;
+
+			# prints titles of new superkingdom column
+			print $DELIMITER.$SUPERKINGDOM_COLUMN_TITLE.$NEWLINE;
+		}
+		else # column values (not column titles)
+		{
+			# retrieves taxon id
+			my $taxon_id = $items_in_line[$taxon_id_column];
+
+			# retrieves superkingdom
+			my $superkingdom = $NO_DATA;
+			my $ancestor_taxon_id = $taxon_id;
+			while($superkingdom eq $NO_DATA
+				and $taxonid_to_parent{$ancestor_taxon_id}
+				and $ancestor_taxon_id ne $taxonid_to_parent{$ancestor_taxon_id}
+				and $ancestor_taxon_id ne $ROOT_TAXON_ID)
+			{
+				if($TAXON_ID_TO_SUPERKINGDOM{$ancestor_taxon_id})
+				{
+					$superkingdom = $TAXON_ID_TO_SUPERKINGDOM{$ancestor_taxon_id};
+				}
+				$ancestor_taxon_id = $taxonid_to_parent{$ancestor_taxon_id}
+			}
+
+			# prints line as is
+			print $line;
+
+			# prints superkingdom column value
+			print $DELIMITER.$superkingdom.$NEWLINE;
+		}
+	}
+}
+close TABLE;
+
+
+# April 4, 2023
diff --git a/taxon_id_scripts/add_one_value_column.pl b/taxon_id_scripts/add_one_value_column.pl
@@ -0,0 +1,72 @@
+#!/usr/bin/env perl
+
+# Adds column with specified title and specified value for all values.
+
+# Usage:
+# perl add_one_value_column.pl [table to add column to] "[title of column to add]"
+# "[value of column to add]"
+
+# Prints to console. To print to file, use
+# perl add_one_value_column.pl [table to add column to] "[title of column to add]"
+# "[value of column to add]" > [output table path]
+
+
+use strict;
+use warnings;
+
+
+my $table = $ARGV[0];
+my $title_of_column_to_add = $ARGV[1];
+my $value_of_column_to_add = $ARGV[2];
+
+my $NEWLINE = "\n";
+my $DELIMITER = "\t";
+
+
+# verifies that input table exists and is not empty
+if(!$table or !-e $table or -z $table)
+{
+	print STDERR "Error: table to add column to not provided, does not exist, or empty:\n\t"
+		.$table."\nExiting.\n";
+	die;
+}
+
+
+# reads in and adds column to table to add columns to
+my $first_line = 1;
+open TABLE, "<$table" || die "Could not open $table to read; terminating =(\n";
+while(<TABLE>) # for each row in the file
+{
+	chomp;
+	my $line = $_;
+	if($line =~ /\S/) # if row not empty
+	{
+		if($first_line) # column titles
+		{
+			# prints line as is
+			print $line;
+
+			# prints title of new column
+			print $DELIMITER;
+			print $title_of_column_to_add;
+			print $NEWLINE;
+
+			$first_line = 0;
+		}
+		else # column values (not column titles)
+		{
+			# prints line as is
+			print $line;
+
+			# prints value of new column
+			print $DELIMITER;
+			print $value_of_column_to_add;
+			print $NEWLINE;
+		}
+	}
+}
+close TABLE;
+
+
+# September 26, 2021
+# November 8, 2021
diff --git a/taxon_id_scripts/download_fasta_sequences_from_accession_numbers.pl b/taxon_id_scripts/download_fasta_sequences_from_accession_numbers.pl
@@ -0,0 +1,91 @@
+#!/usr/bin/env perl
+
+# Given a list of accession numbers, one per line, downloads and prints fasta sequences
+# corresponding to each accession number.
+
+# Based on option 2 in https://edwards.flinders.edu.au/ncbi-sequence-or-fasta-batch-download-using-entrez/
+
+
+# Usage:
+# perl download_fasta_sequences_from_accession_numbers.pl
+# [path of file with list of accession numbers, one per line]
+# [database (nucleotide by default)]
+
+# Prints to console. To print to file, use
+# perl download_fasta_sequences_from_accession_numbers.pl
+# [path of file with list of accession numbers, one per line]
+# [database (nucleotide by default)] > [output fasta file path]
+
+use strict;
+use warnings;
+
+
+my $accession_numbers_file = $ARGV[0]; # list of accession numbers, one per line
+my $database = $ARGV[1]; # nucleotide by default
+if(!$database)
+{
+	$database = "nucleotide";
+}
+
+
+my $NEWLINE = "\n";
+my $MAXIMUM_NUMBER_ACCESSION_NUMBERS_IN_ONE_URL = 400;
+my $TEMP_FILE_EXTENSION = "_temp.txt";
+
+
+# reads in accession numbers and retrieves corresponding fasta sequences
+my @accession_numbers_lists = ();
+my $current_accession_numbers_command_list = "";
+my $current_number_accession_numbers = 0;
+open ACCESSION_NUMBERS, "<$accession_numbers_file" || die "Could not open $accession_numbers_file to read\n";
+while(<ACCESSION_NUMBERS>)
+{
+	chomp;
+	if($_ =~ /\S/)
+	{
+		my $accession_number = $_;
+		if($current_accession_numbers_command_list)
+		{
+			$current_accession_numbers_command_list .= ",";
+		}
+		$current_accession_numbers_command_list .= $accession_number;
+		$current_number_accession_numbers++;
+
+		if($current_number_accession_numbers >= $MAXIMUM_NUMBER_ACCESSION_NUMBERS_IN_ONE_URL)
+		{
+			push(@accession_numbers_lists, $current_accession_numbers_command_list);
+			$current_number_accession_numbers = 0;
+			$current_accession_numbers_command_list = "";
+		}
+	}
+}
+close ACCESSION_NUMBERS;
+push(@accession_numbers_lists, $current_accession_numbers_command_list);
+
+
+# builds and runs command to download fasta file
+# example URL: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=D90600.1
+my $temp_file = $accession_numbers_file.$TEMP_FILE_EXTENSION;
+foreach my $accession_numbers_list(@accession_numbers_lists)
+{
+# 	print STDERR $accession_numbers_list."\n";
+	my $command = "curl https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db="
+		.$database."\\&rettype=fasta\\&retmode=text\\&id=".$accession_numbers_list; # ." > ".$output_fasta;
+	`$command > $temp_file`;
+
+	open FASTA_SEQUENCES, "<$temp_file" || die "Could not open $temp_file to read\n";
+	while(<FASTA_SEQUENCES>)
+	{
+		chomp;
+		if($_ =~ /\S/)
+		{
+			print $_.$NEWLINE;
+		}
+	}
+	close FASTA_SEQUENCES;
+}
+`rm $temp_file`;
+
+
+# December 27, 2022
+# December 29, 2022