From ee349d5b7f5599c2975e1ef40e532cb7824a612d Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Wed, 5 Jun 2024 14:09:31 -0400 Subject: [PATCH] Esearch input (#47) * Add genomes (#45) (#46) * Corynebacterium diphtheriae * added Bifidobacterium adolenscentis * replaced S. enterica IIIa; Added hops (Humulus lupulus) * added a Citrobacter species * m * replaced repressed genome accession for B. faecium * remove random single quotes * bump version * helpful log messages * v5.6.3 * make symlink to avoid naming mistakes * check whether taxonkit is loaded * use efetch -input * fix tr bug --- bin/buildKraken1.sh | 8 ++++++++ bin/downloadKalamari.pl | 26 ++++++++++++-------------- bin/filterTaxonomy.sh | 3 +++ src/plasmids.tsv | 4 ++-- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/bin/buildKraken1.sh b/bin/buildKraken1.sh index d51f638..851ff75 100755 --- a/bin/buildKraken1.sh +++ b/bin/buildKraken1.sh @@ -22,11 +22,15 @@ cp -rv $TAXDIR $DB/taxonomy # Make --add-to-library more efficient with # concatenated fasta files +export nl=$'\n' find $SRC -name '*.fasta.gz' | \ xargs -n 100 -P 1 bash -c ' for i in "$@"; do gzip -cd $i done > $tmpfile + echo -ne "ADDING to library:\n " + zgrep "^>" $tmpfile | sed "s/^>//" | tr "$nl" " " + echo "^^ contents of $tmpfile ^^" kraken-build --db $DB --add-to-library $tmpfile ' @@ -35,3 +39,7 @@ kraken-build --db $DB --build --threads 1 # Reduce the size of the database kraken-build --db $DB --clean + +if [ ! -e "$sharedir/kalamari-kraken1" ]; then + ln -sv kalamari-kraken "$sharedir/kalamari-kraken1" +fi diff --git a/bin/downloadKalamari.pl b/bin/downloadKalamari.pl index 314a3de..34526a5 100755 --- a/bin/downloadKalamari.pl +++ b/bin/downloadKalamari.pl @@ -11,7 +11,7 @@ use IO::Compress::Gzip; use version 0.77; -our $VERSION = version->parse("5.6.0"); +our $VERSION = version->parse("5.7.0"); use threads; @@ -167,27 +167,25 @@ sub downloadEntries{ my $numEntries = scalar(@$entries); my @acc = map{$$_{nuccoreAcc}} @$entries; logmsg "Downloading ".scalar(@acc)." accessions"; - my $queryArg = join("[accession] OR ", sort(@acc))."[accession]"; my $dir = tempdir("download.XXXXXX", DIR=>$$settings{tempdir}); + # Make the input file for efetch + my $inputAcc = "$dir/input.acc"; + open(my $fh, ">", $inputAcc) or die "ERROR: could not write to $inputAcc: $!"; + print $fh join("\n", @acc)."\n"; + close $fh; + # Accessions that had errors my @err; - # Get the esearch xml in place for at least one downstream query - my $esearchXml = "$dir/esearch.xml"; - my $esearchCmd = "esearch -db nuccore -query '$queryArg' > $esearchXml"; - command($esearchCmd); + # Get started on the comprehensive assembly file + my $outfile = "$dir/all.fasta"; + logmsg "Downloading all accessions to $outfile using input accessions in $inputAcc"; + command("efetch -db nuccore -input $inputAcc -format fasta > $dir/all.fasta"); if($?){ - die "ERROR running: $esearchCmd: $!"; + die "ERROR: could not download all accessions"; } - # Get started on the assembly file - my $outfile = "$dir/all.fasta"; - - # Main query: efetch - my $efetchCmd = "cat $esearchXml | efetch -format fasta > $outfile"; - system($efetchCmd); - my $seqsWithVersion = readSeqs($outfile); my $seqs = {}; while(my($acc, $seq) = each(%$seqsWithVersion)){ diff --git a/bin/filterTaxonomy.sh b/bin/filterTaxonomy.sh index 48fbb3f..1af4912 100755 --- a/bin/filterTaxonomy.sh +++ b/bin/filterTaxonomy.sh @@ -2,6 +2,9 @@ set -eu +# Check for dependencies +which taxonkit + thisdir=$(dirname $0) thisfile=$(basename $0) KALAMARI_VER=$(downloadKalamari.pl --version) diff --git a/src/plasmids.tsv b/src/plasmids.tsv index e9656a5..c2dafa6 100644 --- a/src/plasmids.tsv +++ b/src/plasmids.tsv @@ -2964,7 +2964,7 @@ Rickettsia CP015014 780 33988 Rickettsia CP010970 780 33988 Onion yellows phytoplasma AB480166 100379 85620 Onion yellows phytoplasma AB479509 100379 85620 -'Brassica napus' phytoplasma HQ637382 469009 85620 +Brassica napus phytoplasma HQ637382 469009 85620 Candidatus Phytoplasma FJ905104 33926 2146 Candidatus Phytoplasma KF801472 33926 2146 Onion yellows phytoplasma AB479513 100379 2146 @@ -2986,7 +2986,7 @@ Paulownia witches'-broom phytoplasma EF426472 39647 85620 Paulownia witches'-broom phytoplasma EF426473 39647 85620 Periwinkle little leaf phytoplasma JN835187 137854 85635 Rice orange leaf phytoplasma KY086101 146897 85635 -'Catharanthus roseus' aster yellows phytoplasma CP035950 1193712 85620 +Catharanthus roseus aster yellows phytoplasma CP035950 1193712 85620 Bacillus thuringiensis CP016196 1428 85620 Bacillus sp. BS98 CP043831 2608254 185979 Bacillus CP009595 1386 185979