Skip to content

Commit

Permalink
Add bbmap_bbsplit (#138)
Browse files Browse the repository at this point in the history
* initial commit dedup

* Revert "initial commit dedup"

This reverts commit 38f586b.

* initial commit, complete config file, add test data

* complete config file, adjusted script and tests, not functional

* update changelog, hep.txt, functional test, large test data

* smaller test data

* remove test resource from config

* modify paths in test script

* Arguments closer to original tool's

* Extra arg to allow use of bbmap args
  • Loading branch information
emmarousseau authored Oct 18, 2024
1 parent 86333c1 commit 7fb67a9
Show file tree
Hide file tree
Showing 5 changed files with 484 additions and 0 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@
* `bedtools`:
- `bedtools_getfasta`: extract sequences from a FASTA file for each of the
intervals defined in a BED/GFF/VCF file (PR #59).

* `bbmap`:
- `bbmap_bbsplit`: Split sequencing reads by mapping them to multiple references simultaneously (PR #138).



Expand Down
162 changes: 162 additions & 0 deletions src/bbmap_bbsplit/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
namespace: "bbmap"
name: "bbmap_bbsplit"
description: Split sequencing reads by mapping them to multiple references simultaneously.
links:
homepage: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/
documentation: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmap-guide/
repository: https://github.com/BioInfoTools/BBMap/blob/master/sh/bbsplit.sh

license: BBTools Copyright (c) 2014

argument_groups:
- name: "Input"
arguments:
- name: "--id"
type: string
description: Sample ID
- name: "--paired"
type: boolean_true
description: Paired fastq files or not?
- name: "--input"
type: file
multiple: true
description: Input fastq files, either one or two (paired), separated by ";".
example: reads.fastq
- name: "--ref"
type: file
multiple: true
description: Reference FASTA files, separated by ";". The primary reference should be specified first.
- name: "--only_build_index"
type: boolean_true
description: If set, only builds the index. Otherwise, mapping is performed.
- name: "--build"
type: string
description: |
Designate index to use. Corresponds to the number specified when building the index.
If building the index, this will be the build's id. If multiple references are indexed
in the same directory, each needs a unique build ID. Default: 1.
example: "1"
- name: "--qin"
type: string
description: |
Set to 33 or 64 to specify input quality value ASCII offset. Automatically detected if
not specified.
- name: "--interleaved"
type: boolean_true
description: |
True forces paired/interleaved input; false forces single-ended mapping.
If not specified, interleaved status will be autodetected from read names.
- name: "--maxindel"
type: integer
description: |
Don't look for indels longer than this. Lower is faster. Set to >=100k for RNA-seq.
example: 20
- name: "--minratio"
type: double
description: |
Fraction of max alignment score required to keep a site. Higher is faster.
example: 0.56
- name: "--minhits"
type: integer
description: |
Minimum number of seed hits required for candidate sites. Higher is faster.
example: 1
- name: "--ambiguous"
type: string
description: |
Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations).
* best Use the first best site (Default)
* toss Consider unmapped
* random Select one top-scoring site randomly
* all Retain all top-scoring sites. Does not work yet with SAM output
choices: [best, toss, random, all]
example: best
- name: "--ambiguous2"
type: string
description: |
Set behavior only for reads that map ambiguously to multiple different references.
Normal 'ambiguous=' controls behavior on all ambiguous reads;
Ambiguous2 excludes reads that map ambiguously within a single reference.
* best Use the first best site (Default)
* toss Consider unmapped
* all Write a copy to the output for each reference to which it maps
* split Write a copy to the AMBIGUOUS_ output for each reference to which it maps
choices: [best, toss, all, split]
example: best
- name: "--qtrim"
type: string
description: |
Quality-trim ends to Q5 before mapping. Options are 'l' (left), 'r' (right), and 'lr' (both).
choices: [l, r, lr]
- name: "--untrim"
type: boolean_true
description: Undo trimming after mapping. Untrimmed bases will be soft-clipped in cigar strings.


- name: "Output"
arguments:
- name: "--fastq_1"
type: file
description: |
Output file for read 1.
direction: output
example: read_out1.fastq
- name: "--fastq_2"
type: file
description: |
Output file for read 2.
direction: output
example: read_out2.fastq
- name: "--sam2bam"
alternatives: ["--bs"]
type: file
description: |
Write a shell script to 'file' that will turn the sam output into a sorted, indexed bam file.
direction: output
example: script.sh
- name: "--scafstats"
type: file
description: |
Write statistics on how many reads mapped to which scaffold to this file.
direction: output
example: scaffold_stats.txt
- name: "--refstats"
type: file
description: |
Write statistics on how many reads were assigned to which reference to this file.
Unmapped reads whose mate mapped to a reference are considered assigned and will be counted.
direction: output
example: reference_stats.txt
- name: "--nzo"
type: boolean_true
description: Only print lines with nonzero coverage.
- name: "--bbmap_args"
type: string
description: |
Additional arguments from BBMap to pass to BBSplit.
resources:
- type: bash_script
path: script.sh

test_resources:
- type: bash_script
path: test.sh

engines:
- type: docker
image: ubuntu:22.04
setup:
- type: docker
run: |
apt-get update && \
apt-get install -y build-essential openjdk-17-jdk wget tar && \
wget --no-check-certificate https://sourceforge.net/projects/bbmap/files/BBMap_39.01.tar.gz && \
tar xzf BBMap_39.01.tar.gz && \
cp -r bbmap/* /usr/local/bin
- type: docker
run: |
bbsplit.sh --version 2>&1 | awk '/BBMap version/{print "BBMAP:", $NF}' > /var/software_versions.txt
runners:
- type: executable
- type: nextflow
83 changes: 83 additions & 0 deletions src/bbmap_bbsplit/help.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
```
bbsplit.sh
```

BBSplit
Written by Brian Bushnell, from Dec. 2010 - present
Last modified June 11, 2018

Description: Maps reads to multiple references simultaneously.
Outputs reads to a file for the reference they best match, with multiple options for dealing with ambiguous mappings.

To index: bbsplit.sh build=<1> ref_x=<reference fasta> ref_y=<another reference fasta>
To map: bbsplit.sh build=<1> in=<reads> out_x=<output file> out_y=<another output file>

To be concise, and do everything in one command:
bbsplit.sh ref=x.fa,y.fa in=reads.fq basename=o%.fq

that is equivalent to
bbsplit.sh build=1 in=reads.fq ref_x=x.fa ref_y=y.fa out_x=ox.fq out_y=oy.fq

By default paired reads will yield interleaved output, but you can use the # symbol to produce twin output files.
For example, basename=o%_#.fq will produce ox_1.fq, ox_2.fq, oy_1.fq, and oy_2.fq.


Indexing Parameters (required when building the index):
ref=<file,file> A list of references, or directories containing fasta files.
ref_<name>=<ref.fa> Alternate, longer way to specify references. e.g., ref_ecoli=ecoli.fa
These can also be comma-delimited lists of files; e.g., ref_a=a1.fa,a2.fa,a3.fa
build=<1> If multiple references are indexed in the same directory, each needs a unique build ID.
path=<.> Specify the location to write the index, if you don't want it in the current working directory.

Input Parameters:
build=<1> Designate index to use. Corresponds to the number specified when building the index.
in=<reads.fq> Primary reads input; required parameter.
in2=<reads2.fq> For paired reads in two files.
qin=<auto> Set to 33 or 64 to specify input quality value ASCII offset.
interleaved=<auto> True forces paired/interleaved input; false forces single-ended mapping.
If not specified, interleaved status will be autodetected from read names.

Mapping Parameters:
maxindel=<20> Don't look for indels longer than this. Lower is faster. Set to >=100k for RNA-seq.
minratio=<0.56> Fraction of max alignment score required to keep a site. Higher is faster.
minhits=<1> Minimum number of seed hits required for candidate sites. Higher is faster.
ambiguous=<best> Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations).
best (use the first best site)
toss (consider unmapped)
random (select one top-scoring site randomly)
all (retain all top-scoring sites. Does not work yet with SAM output)
ambiguous2=<best> Set behavior only for reads that map ambiguously to multiple different references.
Normal 'ambiguous=' controls behavior on all ambiguous reads;
Ambiguous2 excludes reads that map ambiguously within a single reference.
best (use the first best site)
toss (consider unmapped)
all (write a copy to the output for each reference to which it maps)
split (write a copy to the AMBIGUOUS_ output for each reference to which it maps)
qtrim=<true> Quality-trim ends to Q5 before mapping. Options are 'l' (left), 'r' (right), and 'lr' (both).
untrim=<true> Undo trimming after mapping. Untrimmed bases will be soft-clipped in cigar strings.

Output Parameters:
out_<name>=<file> Output reads that map to the reference <name> to <file>.
basename=prefix%suffix Equivalent to multiple out_%=prefix%suffix expressions, in which each % is replaced by the name of a reference file.
bs=<file> Write a shell script to 'file' that will turn the sam output into a sorted, indexed bam file.
scafstats=<file> Write statistics on how many reads mapped to which scaffold to this file.
refstats=<file> Write statistics on how many reads were assigned to which reference to this file.
Unmapped reads whose mate mapped to a reference are considered assigned and will be counted.
nzo=t Only print lines with nonzero coverage.

***** Notes *****
Almost all BBMap parameters can be used; run bbmap.sh for more details.
Exceptions include the 'nodisk' flag, which BBSplit does not support.
BBSplit is recommended for fastq and fasta output, not for sam/bam output.
When the reference sequences are shorter than read length, use Seal instead of BBSplit.

Java Parameters:
-Xmx This will set Java's memory usage, overriding autodetection.
-Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
The max is typically 85% of physical memory.
-eoom This flag will cause the process to exit if an
out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.

This list is not complete. For more information, please consult /readme.txt
Please contact Brian Bushnell at [email protected] if you encounter any problems.
91 changes: 91 additions & 0 deletions src/bbmap_bbsplit/script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/bin/bash

## VIASH START
## VIASH END

set -eo pipefail

function clean_up {
rm -rf "$tmpdir"
}
trap clean_up EXIT

unset_if_false=( par_paired par_only_build_index par_interleaved par_untrim par_nzo)

for var in "${unset_if_false[@]}"; do
if [ -z "${!var}" ]; then
unset $var
fi
done

if [ ! -d "$par_build" ]; then
IFS=";" read -ra ref_files <<< "$par_ref"
primary_ref="${ref_files[0]}"
refs=()
for file in "${ref_files[@]:1}"
do
name=$(basename "$file" | sed 's/\.[^.]*$//')
refs+=("ref_$name=$file")
done
fi

if $par_only_build_index; then
if [ ${#refs[@]} -gt 1 ]; then
bbsplit.sh \
--ref_primary="$primary_ref" \
"${refs[@]}" \
path=$par_build
else
echo "ERROR: Please specify at least two reference fasta files."
fi
else
IFS=";" read -ra input <<< "$par_input"
tmpdir=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXXXX")
index_files=''
if [ -d "$par_build" ]; then
index_files="path=$par_build"
elif [ ${#refs[@]} -gt 0 ]; then
index_files="--ref_primary=$primary_ref ${refs[*]}"
else
echo "ERROR: Please either specify a BBSplit index as input or at least two reference fasta files."
fi

extra_args=""
if [ -n "$par_refstats" ]; then extra_args+=" --refstats $par_refstats"; fi
if [ -n "$par_ambiguous" ]; then extra_args+=" --ambiguous $par_ambiguous"; fi
if [ -n "$par_ambiguous2" ]; then extra_args+=" --ambiguous2 $par_ambiguous2"; fi
if [ -n "$par_minratio" ]; then extra_args+=" --minratio $par_minratio"; fi
if [ -n "$par_minhits" ]; then extra_args+=" --minhits $par_minhits"; fi
if [ -n "$par_maxindel" ]; then extra_args+=" --maxindel $par_maxindel"; fi
if [ -n "$par_qin" ]; then extra_args+=" --qin $par_qin"; fi
if [ -n "$par_qtrim" ]; then extra_args+=" --qtrim $par_qtrim"; fi
if [ "$par_interleaved" = true ]; then extra_args+=" --interleaved"; fi
if [ "$par_untrim" = true ]; then extra_args+=" --untrim"; fi
if [ "$par_nzo" = true ]; then extra_args+=" --nzo"; fi

if [ -n "$par_bbmap_args" ]; then extra_args+=" $par_bbmap_args"; fi


if $par_paired; then
bbsplit.sh \
$index_files \
in=${input[0]} \
in2=${input[1]} \
basename=${tmpdir}/%_#.fastq \
$extra_args
read1=$(find $tmpdir/ -iname primary_1*)
read2=$(find $tmpdir/ -iname primary_2*)
cp $read1 $par_fastq_1
cp $read2 $par_fastq_2
else
bbsplit.sh \
$index_files \
in=${input[0]} \
basename=${tmpdir}/%.fastq \
$extra_args
read1=$(find $tmpdir/ -iname primary*)
cp $read1 $par_fastq_1
fi
fi

exit 0
Loading

0 comments on commit 7fb67a9

Please sign in to comment.