diff --git a/CHANGELOG.md b/CHANGELOG.md index 15cea2a..c759533 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +### 1.7.1 + +#### Bugfix + +Fixing some issues with blast report generation when using newer BLAST databases. Source of the problemis still unclear but appears to be linked to new realeses of the BLAST db and concerns only specific taxids or sequences. + ### 1.7.0 #### Breaking changes diff --git a/VERSION b/VERSION index 9dbb0c0..081af9a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.7.0 \ No newline at end of file +1.7.1 \ No newline at end of file diff --git a/workflow/envs/blast.yaml b/workflow/envs/blast.yaml index ade741e..093c60c 100644 --- a/workflow/envs/blast.yaml +++ b/workflow/envs/blast.yaml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - blast=2.12.0 + - blast=2.16.0 diff --git a/workflow/rules/blast.smk b/workflow/rules/blast.smk index f956bbe..abddfa7 100644 --- a/workflow/rules/blast.smk +++ b/workflow/rules/blast.smk @@ -155,6 +155,7 @@ rule blast_otus: -qcov_hsp_perc {params.qcov} $masking \ -outfmt '6 qseqid sseqid evalue pident bitscore sacc staxid length mismatch gaps stitle' \ -num_threads {threads} \ + -mt_mode 1 \ 2> {log} sed -i '1 i\query\tsubject\tevalue\tidentity\tbitscore\tsubject_acc\tsubject_taxid\talignment_length\tmismatch\tgaps\tsubject_name' {output.report} @@ -235,49 +236,52 @@ rule blast_stats: "{sample}/reports/{sample}_blast_stats.tsv", params: bit_diff=config["bit_score_diff"], + sample=lambda w: w.sample, message: "[{wildcards.sample}][assignement] collecting BLAST stats" conda: "../envs/pandas.yaml" log: "logs/{sample}/blast_stats.log", - shell: - """ - exec 2> {log} - - if [ -s {input.blast} ] - then - # Get list of all OTUs - OTUs=$(grep "^>" {input.otus} | cut -d";" -f1 | tr -d '>' | sort -u) - - for otu in $OTUs - do - size=$(grep -E "^>${{otu}}\>" {input.otus} | cut -d"=" -f2) - bhits=$(grep -c -E "^${{otu}};" {input.blast} || true) - if [ $bhits -eq 0 ] - then - # When there is no blast hit - echo "{wildcards.sample}\t$otu\t$size\t0\t0\t0\t0\t0\t-\t-\t-\t- (1.0)\t../{input.blast}\t../{input.filtered}" >> {output} - else - # Otherwise collect and print stats to file - bit_best=$(grep -E "^${{otu}};" {input.blast} | cut -f5 | cut -d. -f1 | sort -rn | head -n1) - bit_low=$(grep -E "^${{otu}};" {input.blast} | cut -f5 | cut -d. -f1 | sort -n | head -n1) - bit_thr=$(($bit_best - {params.bit_diff})) - shits=$(grep -c -E "^${{otu}}\>" {input.filtered}) - cons=$(grep -E "^${{otu}}\>" {input.lca} | cut -d'\t' -f2-5) - - echo "{wildcards.sample}\t$otu\t$size\t$bhits\t$bit_best\t$bit_low\t$bit_thr\t$shits\t$cons\t../{input.blast}\t../{input.filtered}" >> {output} - fi - done - # Sort by size and add header (just to get hits on top) - sort -k3,3nr -o {output} {output} - sed -i '1 i\Sample\tQuery\tCount\tBlast hits\tBest bit-score\tLowest bit-score\tBit-score threshold\tSaved Blast hits\tConsensus\tRank\tTaxid\tDisambiguation\tlink_report\tlink_filtered' {output} - - else - echo "{wildcards.sample}\t-\t-\t0\t0\t0\t0\t0\t-\t-\t-\t-\t../{input.blast}\t../{input.filtered}" > {output} - sed -i '1 i\Sample\tQuery\tCount\tBlast hits\tBest bit-score\tLowest bit-score\tBit-score threshold\tSaved Blast hits\tConsensus\tRank\tTaxid\tDisambiguation\tlink_report\tlink_filtered' {output} - fi - """ + script: + "../scripts/blast_stats.py" + # shell: + # """ + # exec 2> {log} + + # if [ -s {input.blast} ] + # then + # # Get list of all OTUs + # OTUs=$(grep "^>" {input.otus} | cut -d";" -f1 | tr -d '>' | sort -u) + + # for otu in $OTUs + # do + # size=$(grep -E "^>${{otu}}\>" {input.otus} | cut -d"=" -f2) + # bhits=$(grep -c -E "^${{otu}};" {input.blast} || true) + # if [ $bhits -eq 0 ] + # then + # # When there is no blast hit + # echo "{wildcards.sample}\t$otu\t$size\t0\t0\t0\t0\t0\t-\t-\t-\t- (1.0)\t../{input.blast}\t../{input.filtered}" >> {output} + # else + # # Otherwise collect and print stats to file + # bit_best=$(grep -E "^${{otu}};" {input.blast} | cut -f5 | cut -d. -f1 | sort -rn | head -n1) + # bit_low=$(grep -E "^${{otu}};" {input.blast} | cut -f5 | cut -d. -f1 | sort -n | head -n1) + # bit_thr=$(($bit_best - {params.bit_diff})) + # shits=$(grep -c -E "^${{otu}}\>" {input.filtered}) + # cons=$(grep -E "^${{otu}}\>" {input.lca} | cut -d'\t' -f2-5) + + # echo "{wildcards.sample}\t$otu\t$size\t$bhits\t$bit_best\t$bit_low\t$bit_thr\t$shits\t$cons\t../{input.blast}\t../{input.filtered}" >> {output} + # fi + # done + # # Sort by size and add header (just to get hits on top) + # sort -k3,3nr -o {output} {output} + # sed -i '1 i\Sample\tQuery\tCount\tBlast hits\tBest bit-score\tLowest bit-score\tBit-score threshold\tSaved Blast hits\tConsensus\tRank\tTaxid\tDisambiguation\tlink_report\tlink_filtered' {output} + + # else + # echo "{wildcards.sample}\t-\t-\t0\t0\t0\t0\t0\t-\t-\t-\t-\t../{input.blast}\t../{input.filtered}" > {output} + # sed -i '1 i\Sample\tQuery\tCount\tBlast hits\tBest bit-score\tLowest bit-score\tBit-score threshold\tSaved Blast hits\tConsensus\tRank\tTaxid\tDisambiguation\tlink_report\tlink_filtered' {output} + # fi + # """ rule collect_blast_stats: diff --git a/workflow/scripts/blast_stats.py b/workflow/scripts/blast_stats.py new file mode 100644 index 0000000..bd7f3f2 --- /dev/null +++ b/workflow/scripts/blast_stats.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +import sys + + +sys.stderr = open(snakemake.log[0], "w") + + +import os +import pandas as pd + + +HEADER = "\t".join([ + "Sample", + "Query", + "Count", + "Blast hits", + "Best bit-score", + "Lowest bit-score", + "Bit-score threshold", + "Saved Blast hits", + "Consensus", + "Rank", + "Taxid", + "Disambiguation", + "link_report", + "link_filtered" + ]) + + +def parse_fasta(fasta): + ids, sizes = [], [] + with open(fasta, "r") as fi: + for l in fi: + if l[0] ==">": + ids.append(l.split(";")[0][1:].strip()) + sizes.append(l.split("=")[1].strip()) + return ids, sizes + + +def count_blasthits(id, report): + i = 0 + with open(report, "r") as fi: + for l in fi: + if l.split(";")[0] == id: + i += 1 + return i + + +def main(otus_in, blast_in, filtered_in, lca_in, report_out, bit_diff, sample): + with open(report_out, "w") as fo: + fo.write(f"{HEADER}\n") + if not os.path.isfile(blast_in) or os.stat(blast_in).st_size == 0: + with open(report_out, "a") as fo: + fo.write("\t".join([ + sample, + "-", + "-", + "0", + "0", + "0", + "0", + "0", + "-", + "-", + "-", + "-", + f"../{blast_in}", + f"../{filtered_in}" + ])+"\n") + + else: + otus, sizes = parse_fasta(otus_in) + blast_df = pd.read_csv(blast_in, sep="\t") + filtered_df = pd.read_csv(filtered_in, sep="\t") + lca_df = pd.read_csv(lca_in, sep="\t") + + for otu, size in zip(otus, sizes): + bhits = count_blasthits(otu, blast_in) + if bhits == 0: + with open(report_out, "a") as fo: + fo.write("\t".join([ + sample, + otu, + str(size), + "0", + "0", + "0", + "0", + "0", + "-", + "-", + "-", + "- (1.0)", + f"../{blast_in}", + f"../{filtered_in}" + ])+"\n") + else: + bit_best = blast_df["bitscore"].max() + bit_low = blast_df["bitscore"].min() + bit_thr = bit_best - bit_diff + shits = filtered_df["query"].str.count(otu).sum() + cons = lca_df[lca_df["queryID"] == otu] + + with open(report_out, "a") as fo: + fo.write("\t".join([ + sample, + otu, + str(size), + str(bhits), + str(bit_best), + str(bit_low), + str(bit_thr), + str(shits), + cons["Consensus"].values[0], + cons["Rank"].values[0], + str(cons["Taxid"].values[0]), + cons["Disambiguation"].values[0], + f"../{blast_in}", + f"../{filtered_in}" + ])+"\n") + + +if __name__ == '__main__': + main( + snakemake.input['otus'], + snakemake.input['blast'], + snakemake.input['filtered'], + snakemake.input['lca'], + snakemake.output[0], + snakemake.params['bit_diff'], + snakemake.params['sample'], + )