-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #45 from broadinstitute/fn_add_perl_script
Adding kraken output perl scripts
- Loading branch information
Showing
23 changed files
with
964 additions
and
1,660 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,250 @@ | ||
#!/usr/bin/env perl | ||
|
||
# Converts LCA output table to kraken output format. Treats paired reads as separate reads. | ||
|
||
# Input table is output of retrieve_top_blast_hits_LCA_for_each_sequence.pl, with | ||
# column titles (tab-separated): | ||
# - sequence_name | ||
# - LCA_taxon_id | ||
# - LCA_taxon_rank | ||
# - LCA_taxon_species | ||
# - LCA_taxon_genus | ||
# - LCA_taxon_family | ||
# - evalue_of_top_hits | ||
# - lowest_pident_of_top_hits | ||
# - mean_pident_of_top_hits | ||
# - highest_pident_of_top_hits | ||
# - lowest_qcovs_of_top_hits | ||
# - mean_qcovs_of_top_hits | ||
# - highest_qcovs_of_top_hits | ||
# - number_top_hits | ||
|
||
# Kraken output format: | ||
# https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#standard-kraken-output-format | ||
|
||
|
||
# Usage: | ||
# perl LCA_table_to_kraken_output_format.pl | ||
# [output of retrieve_top_blast_hits_LCA_for_each_sequence.pl for one blast search] | ||
# [fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences)] | ||
|
||
|
||
# Prints to console. To print to file, use | ||
# perl LCA_table_to_kraken_output_format.pl | ||
# [output of retrieve_top_blast_hits_LCA_for_each_sequence.pl for one blast search] | ||
# [fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences)] | ||
# > [output kraken format table] | ||
|
||
|
||
use strict; | ||
use warnings; | ||
|
||
|
||
my $LCA_matches = $ARGV[0]; # output of retrieve_top_blast_hits_LCA_for_each_sequence.pl | ||
my $fasta_file = $ARGV[1]; # fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences) | ||
|
||
my $NO_DATA = "NA"; | ||
my $NEWLINE = "\n"; | ||
my $DELIMITER = "\t"; | ||
|
||
my $INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH = 1; | ||
|
||
# blast LCA table columns: | ||
my $sequence_name_column = 0; | ||
my $LCA_taxon_id_column = 1; | ||
my $LCA_taxon_rank_column = 2; | ||
my $LCA_taxon_species_column = 3; | ||
my $LCA_taxon_genus_column = 4; | ||
my $LCA_taxon_family_column = 5; | ||
my $LCA_taxon_superkingdom_column = 6; | ||
my $evalue_of_top_hits_column = 7; | ||
my $lowest_pident_of_top_hits_column = 8; | ||
my $mean_pident_of_top_hits_column = 9; | ||
my $highest_pident_of_top_hits_column = 10; | ||
my $lowest_qcovs_of_top_hits_column = 11; | ||
my $mean_qcovs_of_top_hits_column = 12; | ||
my $highest_qcovs_of_top_hits_column = 13; | ||
my $number_top_hits_column = 14; | ||
|
||
|
||
# reads in sequence names and lengths from fasta file | ||
my %sequence_name_to_length = (); # key: sequence name -> value: length of sequence | ||
my %sequence_name_appears_in_LCA_table = (); # key: sequence name from fasta file -> value: 1 if sequence appears in LCA table, 0 if not | ||
open FASTA_FILE, "<$fasta_file" || die "Could not open $fasta_file to read; terminating =(\n"; | ||
my $sequence_name = ""; | ||
my $sequence = ""; | ||
while(<FASTA_FILE>) # for each line in the file | ||
{ | ||
chomp; | ||
if($_ =~ /^>(.*)$/) # header line | ||
{ | ||
# processes previous sequence | ||
$sequence_name_to_length{$sequence_name} = sequence_length($sequence); | ||
$sequence_name_appears_in_LCA_table{$sequence_name} = 0; | ||
|
||
# starts processing this sequence | ||
$sequence_name = $1; | ||
$sequence = ""; | ||
} | ||
elsif($_ =~ /\S/) | ||
{ | ||
$sequence .= $_; | ||
} | ||
} | ||
close FASTA_FILE; | ||
if($sequence and $sequence_name) | ||
{ | ||
# processes last sequence | ||
$sequence_name_to_length{$sequence_name} = sequence_length($sequence); | ||
$sequence_name_appears_in_LCA_table{$sequence_name} = 0; | ||
} | ||
|
||
|
||
# reads LCA table and prints rows for reads with hits | ||
my $first_row = 1; | ||
open LCA_MATCHES, "<$LCA_matches" || die "Could not open $LCA_matches to read\n"; | ||
while(<LCA_MATCHES>) | ||
{ | ||
chomp; | ||
my $line = $_; | ||
if($line =~ /\S/) | ||
{ | ||
if($first_row) | ||
{ | ||
# ignore header line | ||
$first_row = 0; | ||
} | ||
else | ||
{ | ||
# reads in relevant lines in row | ||
my @items = split($DELIMITER, $line); | ||
my $sequence_name = $items[$sequence_name_column]; | ||
my $assigned_taxon_id = $items[$LCA_taxon_id_column]; | ||
|
||
# records that we have seen this sequence | ||
$sequence_name_appears_in_LCA_table{$sequence_name} = 1; | ||
|
||
# prints kraken format row for unclassified sequence | ||
# "C"/"U": a one letter code indicating that the sequence was either classified | ||
# or unclassified. | ||
if($assigned_taxon_id == 0) | ||
{ | ||
print "U".$DELIMITER; | ||
} | ||
else | ||
{ | ||
print "C".$DELIMITER; | ||
} | ||
|
||
# The sequence ID, obtained from the FASTA/FASTQ header. | ||
print $sequence_name.$DELIMITER; | ||
|
||
# The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence | ||
# is unclassified. | ||
print $assigned_taxon_id.$DELIMITER; | ||
|
||
# The length of the sequence in bp. In the case of paired read data, this will be | ||
# a string containing the lengths of the two sequences in bp, separated by a pipe | ||
# character, e.g. "98|94". | ||
print $sequence_name_to_length{$sequence_name}.$DELIMITER; | ||
|
||
# A space-delimited list indicating the LCA mapping of each k-mer in the sequence(s) | ||
print $NO_DATA.$NEWLINE; | ||
} | ||
} | ||
} | ||
close LCA_MATCHES; | ||
|
||
|
||
# prints rows for reads without hits | ||
foreach my $sequence_name(sort keys %sequence_name_appears_in_LCA_table) | ||
{ | ||
if($sequence_name and !$sequence_name_appears_in_LCA_table{$sequence_name}) | ||
{ | ||
# prints kraken format row for unclassified sequence | ||
# "C"/"U": a one letter code indicating that the sequence was either classified | ||
# or unclassified. | ||
print "U".$DELIMITER; | ||
|
||
# The sequence ID, obtained from the FASTA/FASTQ header. | ||
print $sequence_name.$DELIMITER; | ||
|
||
# The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence | ||
# is unclassified. | ||
print "0".$DELIMITER; | ||
|
||
# The length of the sequence in bp. In the case of paired read data, this will be | ||
# a string containing the lengths of the two sequences in bp, separated by a pipe | ||
# character, e.g. "98|94". | ||
print $sequence_name_to_length{$sequence_name}.$DELIMITER; | ||
|
||
# A space-delimited list indicating the LCA mapping of each k-mer in the sequence(s) | ||
print $NO_DATA.$NEWLINE; | ||
} | ||
} | ||
|
||
|
||
# returns sequence length | ||
sub sequence_length | ||
{ | ||
my $sequence = $_[0]; | ||
|
||
# capitalize sequence | ||
$sequence = uc($sequence); | ||
|
||
# counts number bases or unambiguous bases in this sequence | ||
my $sequence_length = 0; | ||
foreach my $base(split //, $sequence) | ||
{ | ||
if(!$INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH and is_unambiguous_base($base)) | ||
{ | ||
$sequence_length++; | ||
} | ||
elsif($INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH and is_base($base)) | ||
{ | ||
$sequence_length++; | ||
} | ||
} | ||
return $sequence_length; | ||
} | ||
|
||
# returns 1 if base is A, T, C, G; returns 0 if not | ||
# input base must be capitalized | ||
sub is_unambiguous_base | ||
{ | ||
my $base = $_[0]; # must be capitalized | ||
if($base eq "A" or $base eq "T" or $base eq "C" or $base eq "G") | ||
{ | ||
return 1; | ||
} | ||
return 0; | ||
} | ||
|
||
# returns 1 if base is not gap, 0 if base is a gap | ||
sub is_base | ||
{ | ||
my $base = $_[0]; | ||
|
||
# empty value | ||
if(!$base) | ||
{ | ||
return 0; | ||
} | ||
|
||
# only whitespace | ||
if($base !~ /\S/) | ||
{ | ||
return 0; | ||
} | ||
|
||
# gap | ||
if($base eq "-") | ||
{ | ||
return 0; | ||
} | ||
|
||
# base | ||
return 1; | ||
} | ||
|
||
# September 29, 2023 |
Oops, something went wrong.