Skip to content

Commit

Permalink
Add genomes on server functionality (#164)
Browse files Browse the repository at this point in the history
* Add new option for nucl database - Genome on server
* Adjust Tests
* test corrected and further changes by @wm75
* Update tools/ncbi_blast_plus/ncbi_makeblastdb.xml
* Update Version-Suffix and README
* Add missing README doc message for VERSION-SUFFIX 1

Co-authored-by: Wolfgang Maier <[email protected]>
  • Loading branch information
elischberg and wm75 authored Feb 22, 2024
1 parent 2dd12dc commit 028e3e8
Show file tree
Hide file tree
Showing 8 changed files with 150 additions and 27 deletions.
3 changes: 3 additions & 0 deletions test-data/all_fasta.loc
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#<value> <dbkey> <display_name> <file_path>
#
three_human_mRNA thmRNA Three-Human-mRNAs ${__HERE__}/three_human_mRNA.fasta
Binary file modified test-data/three_human_mRNA.fasta.gz
Binary file not shown.
4 changes: 4 additions & 0 deletions test-data/tool_data_table_conf.xml.test
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,8 @@
<columns>value, name, path</columns>
<file path="${__HERE__}/blastdb_d.loc" />
</table>
<table name="all_fasta" comment_char="#">
<columns>value, dbkey, name, path</columns>
<file path="${__HERE__}/all_fasta.loc" />
</table>
</tables>
18 changes: 18 additions & 0 deletions tool-data/all_fasta.loc.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#This file lists the locations and dbkeys of all the fasta files
#under the "genome" directory (a directory that contains a directory
#for each build). The script extract_fasta.py will generate the file
#all_fasta.loc. This file has the format (white space characters are
#TAB characters):
#
#<unique_build_id> <dbkey> <display_name> <file_path>
#
#So, all_fasta.loc could look something like this:
#
#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa
#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa
#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa
#
#Your all_fasta.loc file should contain an entry for each individual
#fasta file. So there will be multiple fasta files for each build,
#such as with hg19 above.
#
4 changes: 4 additions & 0 deletions tool-data/tool_data_table_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,8 @@
<columns>value, name, path</columns>
<file path="tool-data/blastdb_d.loc" />
</table>
<!-- Cached genome FASTA files offered by the makeblastdb "Genome on server" option.
     References the live all_fasta.loc (same convention as blastdb_d.loc above);
     all_fasta.loc.sample is only the template that the live file is created from. -->
<table name="all_fasta" comment_char="#">
<columns>value, dbkey, name, path</columns>
<file path="tool-data/all_fasta.loc" />
</table>
</tables>
4 changes: 4 additions & 0 deletions tools/ncbi_blast_plus/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ a galaxy specific suffix which gets reset to zero with each new BLAST version:
============== ===============================================================
Version Changes
-------------- ---------------------------------------------------------------
2.14.1+galaxy2 - Add usage of genome FASTA files on the Galaxy server with
``makeblastdb`` (contribution from Wolfgang Maier and
Elischa Berger)
2.14.1+galaxy1 - Fix for get_species_taxids
2.14.1+galaxy0 - Updated for NCBI BLAST+ 2.14.1 release.
2.10.1+galaxy3 - Silenced ``deltablast`` warning about using ``-num_threads``
with ``--subject`` (i.e. FASTA file from your history).
Expand Down
2 changes: 1 addition & 1 deletion tools/ncbi_blast_plus/ncbi_macros.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<macros>
<token name="@TOOL_VERSION@">2.14.1</token>
<token name="@VERSION_SUFFIX@">1</token>
<token name="@VERSION_SUFFIX@">2</token>
<token name="@PROFILE@">16.10</token>
<xml name="parallelism">
<!-- If job splitting is enabled, break up the query file into parts -->
Expand Down
142 changes: 116 additions & 26 deletions tools/ncbi_blast_plus/ncbi_makeblastdb.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,36 @@
<requirement type="package" version="3.9">python</requirement>
</expand>
<command detect_errors="aggressive" strict="true"><![CDATA[
#set $inputs = []
#set $input_compression = []
#for r in $input.selection:
#if $input.type == "protein":
#silent $inputs.append($r.input_file)
#silent $input_compression.append($r.input_file.is_of_type('fasta.gz'))
#elif $r.nuc_choice.source == "history":
#silent $inputs.append($r.nuc_choice.input_file)
#silent $input_compression.append($r.nuc_choice.input_file.is_of_type('fasta.gz'))
#else:
#silent $inputs.append($r.nuc_choice.input_file.fields.path)
#silent $input_compression.append(False)
#end if
#end for
python $__tool_directory__/check_no_duplicates.py
##First check for duplicates (since BLAST+ 2.2.28 fails to do so)
##and abort (via the ampersand ampersand trick) if any are found.
#for i in $input_file#'${i}' #end for#
#for i in $inputs#'$i' #end for#
&&
##makeblastdb does not like input redirects of the sort
##makeblastdb -in <(gunzip -c gzipped_fasta_file)
##therefore we're cramming everything
##into a single cat command below
cat
#for i in $input_file:
#if $i.is_of_type('fasta.gz') and $i.ext != "fasta":
<(gunzip -c ${i})
#for i, is_gzipped in zip($inputs, $input_compression):
#if $is_gzipped:
<(gunzip -c '$i')
#else:
${i}
'$i'
#end if
#end for
| makeblastdb -out '${os.path.join($outfile.files_path, "blastdb")}'
Expand All @@ -36,7 +51,12 @@ $hash_index
##Would default to being based on the cryptic Galaxy filenames, which is unhelpful
-title 'BLAST Database'
#end if
-dbtype $dbtype
-dbtype
#if $input.type == "protein":
prot
#else:
nucl
#end if
## --------------------------------------------------------------------
## Masking
## --------------------------------------------------------------------
Expand All @@ -60,15 +80,39 @@ $hash_index
> '$outfile'
]]></command>
<inputs>
<param argument="-dbtype" type="select" display="radio" label="Molecule type of input">
<option value="prot">protein</option>
<option value="nucl">nucleotide</option>
</param>
<!-- TODO Allow merging of existing BLAST databases (conditional on the database type)?
NOTE Double check the new database would be self contained first
-->
<!-- Note this is a mandatory parameter - default should be most recent FASTA file -->
<param name="input_file" argument="-in" type="data" multiple="true" optional="false" format="fasta,fasta.gz" label="Input FASTA files(s)" help="One or more FASTA files" />
<conditional name="input">
<param argument="-dbtype" name="type" type="select" label="Molecule type of input">
<option value="protein">protein</option>
<option value="nucleotide">nucleotide</option>
</param>
<!-- TODO Allow merging of existing BLAST databases (conditional on the database type)?
NOTE Double check the new database would be self contained first
-->
<when value="protein">
<repeat name="selection" title="Select input" min="1" default="1">
<!-- Note this is a mandatory parameter - default should be most recent FASTA file -->
<param name="input_file" argument="-in" type="data" format="fasta,fasta.gz" label="FASTA input" help="FASTA file with one or more sequences to add to the database" />
</repeat>
</when>
<when value="nucleotide">
<repeat name="selection" title="Select input" min="1" default="1">
<conditional name="nuc_choice">
<param name="source" type="select" label="Input is a">
<option value="history">Dataset in history</option>
<option value="cached">Genome on server</option>
</param>
<when value="history">
<param name="input_file" argument="-in" type="data" format="fasta,fasta.gz" label="FASTA input" help="FASTA file with one or more sequences to add to the database" />
</when>
<when value="cached">
<param name="input_file" type="select" label="Installed genome">
<options from_data_table="all_fasta"/>
</param>
</when>
</conditional>
</repeat>
</when>
</conditional>
<param argument="-title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" />
<param argument="-parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="false" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '|' symbols" />
<param argument="-hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." />
Expand All @@ -95,15 +139,16 @@ $hash_index
<when value="map">
<param name="taxmap" argument="-taxid_map" type="data" format="txt" label="Seq ID : Tax ID mapping file" help="Format: SequenceId TaxonomyId" />
</when>
-->
</conditional>
</inputs>
<outputs>
<!-- If we only accepted one FASTA file, we could use its human name here... -->
<data name="outfile" format="data" label="${dbtype.value_label} BLAST database from ${on_string}">
<data name="outfile" format="data" label="${input.type} BLAST database from ${on_string}">
<change_format>
<when input="dbtype" value="nucl" format="blastdbn" />
<when input="dbtype" value="prot" format="blastdbp" />
<when input="input.type" value="nucleotide" format="blastdbn" />
<when input="input.type" value="protein" format="blastdbp" />
</change_format>
</data>
</outputs>
Expand All @@ -115,8 +160,12 @@ $hash_index
With and without the taxid the only real difference is in the *.phr file.
-->
<test>
<param name="dbtype" value="prot" />
<param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
<conditional name="input">
<param name="type" value="protein"/>
<repeat name="selection">
<param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
</repeat>
</conditional>
<param name="title" value="Just 4 human proteins" />
<param name="parse_seqids" value="" />
<param name="hash_index" value="true" />
Expand All @@ -132,8 +181,12 @@ $hash_index
</output>
</test>
<test>
<param name="dbtype" value="prot" />
<param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
<conditional name="input">
<param name="type" value="protein"/>
<repeat name="selection">
<param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
</repeat>
</conditional>
<param name="title" value="Just 4 human proteins" />
<param name="parse_seqids" value="" />
<param name="hash_index" value="true" />
Expand All @@ -151,8 +204,12 @@ $hash_index
</output>
</test>
<test>
<param name="dbtype" value="prot" />
<param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
<conditional name="input">
<param name="type" value="protein"/>
<repeat name="selection">
<param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
</repeat>
</conditional>
<param name="title" value="Just 4 human proteins" />
<param name="parse_seqids" value="" />
<param name="hash_index" value="true" />
Expand All @@ -169,8 +226,41 @@ $hash_index
</output>
</test>
<test>
<param name="dbtype" value="nucl" />
<param name="input_file" value="three_human_mRNA.fasta.gz" ftype="fasta.gz" />
<conditional name="input">
<param name="type" value="nucleotide"/>
<repeat name="selection">
<conditional name="nuc_choice">
<param name="source" value="history"/>
<param name="input_file" value="three_human_mRNA.fasta.gz" ftype="fasta.gz" />
</conditional>
</repeat>
</conditional>
<param name="title" value="Just 3 human mRNA sequences" />
<param name="parse_seqids" value="" />
<param name="hash_index" value="true" />
<param name="taxselect" value="id" />
<param name="taxid" value="9606" />
<output name="outfile" compare="contains" file="three_human_mRNA.fasta.log.txt" ftype="blastdbn">
<extra_files type="file" value="three_human_mRNA.fasta.nhr" name="blastdb.nhr" />
<extra_files type="file" value="three_human_mRNA.fasta.nin" name="blastdb.nin" compare="sim_size" delta="8" />
<extra_files type="file" value="three_human_mRNA.fasta.nsq" name="blastdb.nsq" />
<extra_files type="file" value="three_human_mRNA.fasta.nog" name="blastdb.nog" />
<extra_files type="file" value="three_human_mRNA.fasta.nhd" name="blastdb.nhd" />
<extra_files type="file" value="three_human_mRNA.fasta.nhi" name="blastdb.nhi" />
<extra_files type="file" value="three_human_mRNA.fasta.nsd" name="blastdb.nsd" />
<extra_files type="file" value="three_human_mRNA.fasta.nsi" name="blastdb.nsi" />
</output>
</test>
<test>
<conditional name="input">
<param name="type" value="nucleotide"/>
<repeat name="selection">
<conditional name="nuc_choice">
<param name="source" value="cached"/>
<param name="input_file" value="three_human_mRNA" />
</conditional>
</repeat>
</conditional>
<param name="title" value="Just 3 human mRNA sequences" />
<param name="parse_seqids" value="" />
<param name="hash_index" value="true" />
Expand Down

0 comments on commit 028e3e8

Please sign in to comment.