Merge pull request #45 from broadinstitute/fn_add_perl_script

Adding kraken output perl scripts
broadinstitute · Mar 18, 2024 · 0dff10a · 0dff10a
2 parents 486dd5b + 3cbaed6
commit 0dff10a
Show file tree

Hide file tree

Showing 23 changed files with 964 additions and 1,660 deletions.
diff --git a/taxon_id_scripts/LCA_table_to_kraken_output_format.pl b/taxon_id_scripts/LCA_table_to_kraken_output_format.pl
@@ -0,0 +1,250 @@
+#!/usr/bin/env perl
+
+# Converts LCA output table to kraken output format. Treats paired reads as separate reads.
+
+# Input table is output of retrieve_top_blast_hits_LCA_for_each_sequence.pl, with
+# column titles (tab-separated):
+# - sequence_name
+# - LCA_taxon_id
+# - LCA_taxon_rank
+# - LCA_taxon_species
+# - LCA_taxon_genus
+# - LCA_taxon_family
+# - evalue_of_top_hits
+# - lowest_pident_of_top_hits
+# - mean_pident_of_top_hits
+# - highest_pident_of_top_hits
+# - lowest_qcovs_of_top_hits
+# - mean_qcovs_of_top_hits
+# - highest_qcovs_of_top_hits
+# - number_top_hits
+
+# Kraken output format:
+# https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#standard-kraken-output-format
+
+
+# Usage:
+# perl LCA_table_to_kraken_output_format.pl
+# [output of retrieve_top_blast_hits_LCA_for_each_sequence.pl for one blast search]
+# [fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences)] 
+
+
+# Prints to console. To print to file, use
+# perl LCA_table_to_kraken_output_format.pl
+# [output of retrieve_top_blast_hits_LCA_for_each_sequence.pl for one blast search]
+# [fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences)]
+# > [output kraken format table]
+
+
+use strict;
+use warnings;
+
+
+my $LCA_matches = $ARGV[0]; # output of retrieve_top_blast_hits_LCA_for_each_sequence.pl
+my $fasta_file = $ARGV[1]; # fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences)
+
+my $NO_DATA = "NA";
+my $NEWLINE = "\n";
+my $DELIMITER = "\t";
+
+my $INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH = 1;
+
+# blast LCA table columns:
+my $sequence_name_column = 0;
+my $LCA_taxon_id_column = 1;
+my $LCA_taxon_rank_column = 2;
+my $LCA_taxon_species_column = 3;
+my $LCA_taxon_genus_column = 4;
+my $LCA_taxon_family_column = 5;
+my $LCA_taxon_superkingdom_column = 6;
+my $evalue_of_top_hits_column = 7;
+my $lowest_pident_of_top_hits_column = 8;
+my $mean_pident_of_top_hits_column = 9;
+my $highest_pident_of_top_hits_column = 10;
+my $lowest_qcovs_of_top_hits_column = 11;
+my $mean_qcovs_of_top_hits_column = 12;
+my $highest_qcovs_of_top_hits_column = 13;
+my $number_top_hits_column = 14;
+
+
+# reads in sequence names and lengths from fasta file
+my %sequence_name_to_length = (); # key: sequence name -> value: length of sequence
+my %sequence_name_appears_in_LCA_table = (); # key: sequence name from fasta file -> value: 1 if sequence appears in LCA table, 0 if not
+open FASTA_FILE, "<$fasta_file" || die "Could not open $fasta_file to read; terminating =(\n";
+my $sequence_name = "";
+my $sequence = "";
+while(<FASTA_FILE>) # for each line in the file
+{
+	chomp;
+	if($_ =~ /^>(.*)$/) # header line
+	{
+		# processes previous sequence
+		$sequence_name_to_length{$sequence_name} = sequence_length($sequence);
+		$sequence_name_appears_in_LCA_table{$sequence_name} = 0;
+
+		# starts processing this sequence
+		$sequence_name = $1;
+		$sequence = "";
+	}
+	elsif($_ =~ /\S/)
+	{
+		$sequence .= $_;
+	}
+}
+close FASTA_FILE;
+if($sequence and $sequence_name)
+{
+	# processes last sequence
+	$sequence_name_to_length{$sequence_name} = sequence_length($sequence);
+	$sequence_name_appears_in_LCA_table{$sequence_name} = 0;
+}
+
+
+# reads LCA table and prints rows for reads with hits
+my $first_row = 1;
+open LCA_MATCHES, "<$LCA_matches" || die "Could not open $LCA_matches to read\n";
+while(<LCA_MATCHES>)
+{
+	chomp;
+	my $line = $_;
+	if($line =~ /\S/)
+	{
+		if($first_row)
+		{
+			# ignore header line
+			$first_row = 0;
+		}
+		else
+		{
+			# reads in relevant lines in row
+			my @items = split($DELIMITER, $line);
+			my $sequence_name = $items[$sequence_name_column];
+			my $assigned_taxon_id = $items[$LCA_taxon_id_column];
+
+			# records that we have seen this sequence
+			$sequence_name_appears_in_LCA_table{$sequence_name} = 1;
+
+			# prints kraken format row for unclassified sequence
+			# "C"/"U": a one letter code indicating that the sequence was either classified
+			# or unclassified.
+			if($assigned_taxon_id == 0)
+			{
+				print "U".$DELIMITER;
+			}
+			else
+			{
+				print "C".$DELIMITER;
+			}
+
+			# The sequence ID, obtained from the FASTA/FASTQ header.
+			print $sequence_name.$DELIMITER;
+
+			# The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence
+			# is unclassified.
+			print $assigned_taxon_id.$DELIMITER;
+
+			# The length of the sequence in bp. In the case of paired read data, this will be
+			# a string containing the lengths of the two sequences in bp, separated by a pipe
+			# character, e.g. "98|94".
+			print $sequence_name_to_length{$sequence_name}.$DELIMITER;
+
+			# A space-delimited list indicating the LCA mapping of each k-mer in the sequence(s)
+			print $NO_DATA.$NEWLINE;
+		}
+	}
+}
+close LCA_MATCHES;
+
+
+# prints rows for reads without hits
+foreach my $sequence_name(sort keys %sequence_name_appears_in_LCA_table)
+{
+	if($sequence_name and !$sequence_name_appears_in_LCA_table{$sequence_name})
+	{
+		# prints kraken format row for unclassified sequence
+		# "C"/"U": a one letter code indicating that the sequence was either classified
+		# or unclassified.
+		print "U".$DELIMITER;
+
+		# The sequence ID, obtained from the FASTA/FASTQ header.
+		print $sequence_name.$DELIMITER;
+
+		# The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence
+		# is unclassified.
+		print "0".$DELIMITER;
+
+		# The length of the sequence in bp. In the case of paired read data, this will be
+		# a string containing the lengths of the two sequences in bp, separated by a pipe
+		# character, e.g. "98|94".
+		print $sequence_name_to_length{$sequence_name}.$DELIMITER;
+
+		# A space-delimited list indicating the LCA mapping of each k-mer in the sequence(s)
+		print $NO_DATA.$NEWLINE;
+	}
+}
+
+
+# returns sequence length
+sub sequence_length
+{
+	my $sequence = $_[0];
+
+	# capitalize sequence
+	$sequence = uc($sequence);
+
+	# counts number bases or unambiguous bases in this sequence
+	my $sequence_length = 0;
+	foreach my $base(split //, $sequence)
+	{
+		if(!$INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH and is_unambiguous_base($base))
+		{
+			$sequence_length++;
+		}
+		elsif($INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH and is_base($base))
+		{
+			$sequence_length++;
+		}
+	}
+	return $sequence_length;
+}
+
+# returns 1 if base is A, T, C, G; returns 0 if not
+# input base must be capitalized
+sub is_unambiguous_base
+{
+	my $base = $_[0]; # must be capitalized
+	if($base eq "A" or $base eq "T" or $base eq "C" or $base eq "G")
+	{
+		return 1;
+	}
+	return 0;
+}
+
+# returns 1 if base is not gap, 0 if base is a gap
+sub is_base
+{
+	my $base = $_[0];
+
+	# empty value
+	if(!$base)
+	{
+		return 0;
+	}
+
+	# only whitespace
+	if($base !~ /\S/)
+	{
+		return 0;
+	}
+
+	# gap
+	if($base eq "-")
+	{
+		return 0;
+	}
+
+	# base
+	return 1;
+}
+
+# September 29, 2023