Skip to content

Commit

Permalink
Merge pull request #45 from broadinstitute/fn_add_perl_script
Browse files Browse the repository at this point in the history
Adding kraken output perl scripts
  • Loading branch information
golu099 authored Mar 18, 2024
2 parents 486dd5b + 3cbaed6 commit 0dff10a
Show file tree
Hide file tree
Showing 23 changed files with 964 additions and 1,660 deletions.
250 changes: 250 additions & 0 deletions taxon_id_scripts/LCA_table_to_kraken_output_format.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
#!/usr/bin/env perl

# Converts LCA output table to kraken output format. Treats paired reads as separate reads.

# Input table is output of retrieve_top_blast_hits_LCA_for_each_sequence.pl, with
# column titles (tab-separated):
# - sequence_name
# - LCA_taxon_id
# - LCA_taxon_rank
# - LCA_taxon_species
# - LCA_taxon_genus
# - LCA_taxon_family
# - evalue_of_top_hits
# - lowest_pident_of_top_hits
# - mean_pident_of_top_hits
# - highest_pident_of_top_hits
# - lowest_qcovs_of_top_hits
# - mean_qcovs_of_top_hits
# - highest_qcovs_of_top_hits
# - number_top_hits

# Kraken output format:
# https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#standard-kraken-output-format


# Usage:
# perl LCA_table_to_kraken_output_format.pl
# [output of retrieve_top_blast_hits_LCA_for_each_sequence.pl for one blast search]
# [fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences)]


# Prints to console. To print to file, use
# perl LCA_table_to_kraken_output_format.pl
# [output of retrieve_top_blast_hits_LCA_for_each_sequence.pl for one blast search]
# [fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences)]
# > [output kraken format table]


use strict;
use warnings;


my $LCA_matches = $ARGV[0]; # output of retrieve_top_blast_hits_LCA_for_each_sequence.pl
my $fasta_file = $ARGV[1]; # fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences)

my $NO_DATA = "NA";
my $NEWLINE = "\n";
my $DELIMITER = "\t";

my $INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH = 1;

# blast LCA table columns:
my $sequence_name_column = 0;
my $LCA_taxon_id_column = 1;
my $LCA_taxon_rank_column = 2;
my $LCA_taxon_species_column = 3;
my $LCA_taxon_genus_column = 4;
my $LCA_taxon_family_column = 5;
my $LCA_taxon_superkingdom_column = 6;
my $evalue_of_top_hits_column = 7;
my $lowest_pident_of_top_hits_column = 8;
my $mean_pident_of_top_hits_column = 9;
my $highest_pident_of_top_hits_column = 10;
my $lowest_qcovs_of_top_hits_column = 11;
my $mean_qcovs_of_top_hits_column = 12;
my $highest_qcovs_of_top_hits_column = 13;
my $number_top_hits_column = 14;


# reads in sequence names and lengths from fasta file
my %sequence_name_to_length = (); # key: sequence name -> value: length of sequence
my %sequence_name_appears_in_LCA_table = (); # key: sequence name from fasta file -> value: 1 if sequence appears in LCA table, 0 if not
open FASTA_FILE, "<$fasta_file" || die "Could not open $fasta_file to read; terminating =(\n";
my $sequence_name = "";
my $sequence = "";
while(<FASTA_FILE>) # for each line in the file
{
chomp;
if($_ =~ /^>(.*)$/) # header line
{
# processes previous sequence
$sequence_name_to_length{$sequence_name} = sequence_length($sequence);
$sequence_name_appears_in_LCA_table{$sequence_name} = 0;

# starts processing this sequence
$sequence_name = $1;
$sequence = "";
}
elsif($_ =~ /\S/)
{
$sequence .= $_;
}
}
close FASTA_FILE;
if($sequence and $sequence_name)
{
# processes last sequence
$sequence_name_to_length{$sequence_name} = sequence_length($sequence);
$sequence_name_appears_in_LCA_table{$sequence_name} = 0;
}


# reads LCA table and prints rows for reads with hits
my $first_row = 1;
open LCA_MATCHES, "<$LCA_matches" || die "Could not open $LCA_matches to read\n";
while(<LCA_MATCHES>)
{
chomp;
my $line = $_;
if($line =~ /\S/)
{
if($first_row)
{
# ignore header line
$first_row = 0;
}
else
{
# reads in relevant lines in row
my @items = split($DELIMITER, $line);
my $sequence_name = $items[$sequence_name_column];
my $assigned_taxon_id = $items[$LCA_taxon_id_column];

# records that we have seen this sequence
$sequence_name_appears_in_LCA_table{$sequence_name} = 1;

# prints kraken format row for unclassified sequence
# "C"/"U": a one letter code indicating that the sequence was either classified
# or unclassified.
if($assigned_taxon_id == 0)
{
print "U".$DELIMITER;
}
else
{
print "C".$DELIMITER;
}

# The sequence ID, obtained from the FASTA/FASTQ header.
print $sequence_name.$DELIMITER;

# The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence
# is unclassified.
print $assigned_taxon_id.$DELIMITER;

# The length of the sequence in bp. In the case of paired read data, this will be
# a string containing the lengths of the two sequences in bp, separated by a pipe
# character, e.g. "98|94".
print $sequence_name_to_length{$sequence_name}.$DELIMITER;

# A space-delimited list indicating the LCA mapping of each k-mer in the sequence(s)
print $NO_DATA.$NEWLINE;
}
}
}
close LCA_MATCHES;


# prints rows for reads without hits
foreach my $sequence_name(sort keys %sequence_name_appears_in_LCA_table)
{
if($sequence_name and !$sequence_name_appears_in_LCA_table{$sequence_name})
{
# prints kraken format row for unclassified sequence
# "C"/"U": a one letter code indicating that the sequence was either classified
# or unclassified.
print "U".$DELIMITER;

# The sequence ID, obtained from the FASTA/FASTQ header.
print $sequence_name.$DELIMITER;

# The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence
# is unclassified.
print "0".$DELIMITER;

# The length of the sequence in bp. In the case of paired read data, this will be
# a string containing the lengths of the two sequences in bp, separated by a pipe
# character, e.g. "98|94".
print $sequence_name_to_length{$sequence_name}.$DELIMITER;

# A space-delimited list indicating the LCA mapping of each k-mer in the sequence(s)
print $NO_DATA.$NEWLINE;
}
}


# returns sequence length
sub sequence_length
{
my $sequence = $_[0];

# capitalize sequence
$sequence = uc($sequence);

# counts number bases or unambiguous bases in this sequence
my $sequence_length = 0;
foreach my $base(split //, $sequence)
{
if(!$INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH and is_unambiguous_base($base))
{
$sequence_length++;
}
elsif($INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH and is_base($base))
{
$sequence_length++;
}
}
return $sequence_length;
}

# returns 1 if base is A, T, C, G; returns 0 if not
# input base must be capitalized
sub is_unambiguous_base
{
my $base = $_[0]; # must be capitalized
if($base eq "A" or $base eq "T" or $base eq "C" or $base eq "G")
{
return 1;
}
return 0;
}

# returns 1 if base is not gap, 0 if base is a gap
sub is_base
{
my $base = $_[0];

# empty value
if(!$base)
{
return 0;
}

# only whitespace
if($base !~ /\S/)
{
return 0;
}

# gap
if($base eq "-")
{
return 0;
}

# base
return 1;
}

# September 29, 2023
Loading

0 comments on commit 0dff10a

Please sign in to comment.