Add bbmap_bbsplit (#138)

* initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586b. * initial commit, complete config file, add test data * complete config file, adjusted script and tests, not functional * update changelog, hep.txt, functional test, large test data * smaller test data * remove test resource from config * modify paths in test script * Arguments closer to original tool's * Extra arg to allow use of bbmap args
viash-hub · Oct 18, 2024 · 7fb67a9 · 7fb67a9
1 parent 86333c1
commit 7fb67a9
Show file tree

Hide file tree

Showing 5 changed files with 484 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -184,6 +184,9 @@
 * `bedtools`:
     - `bedtools_getfasta`: extract sequences from a FASTA file for each of the
                            intervals defined in a BED/GFF/VCF file (PR #59).
+
+* `bbmap`:
+    - `bbmap_bbsplit`: Split sequencing reads by mapping them to multiple references simultaneously (PR #138).
 
 
 

diff --git a/src/bbmap_bbsplit/config.vsh.yaml b/src/bbmap_bbsplit/config.vsh.yaml
@@ -0,0 +1,162 @@
+namespace: "bbmap"
+name: "bbmap_bbsplit"
+description: Split sequencing reads by mapping them to multiple references simultaneously.
+links:
+  homepage: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/
+  documentation: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmap-guide/
+  repository: https://github.com/BioInfoTools/BBMap/blob/master/sh/bbsplit.sh
+
+license: BBTools Copyright (c) 2014
+
+argument_groups:
+- name: "Input"
+  arguments:
+  - name: "--id"
+    type: string
+    description: Sample ID
+  - name: "--paired"
+    type: boolean_true
+    description: Paired fastq files or not?
+  - name: "--input"
+    type: file
+    multiple: true
+    description: Input fastq files, either one or two (paired), separated by ";".
+    example: reads.fastq
+  - name: "--ref"
+    type: file
+    multiple: true
+    description: Reference FASTA files, separated by ";". The primary reference should be specified first.
+  - name: "--only_build_index"
+    type: boolean_true
+    description: If set, only builds the index. Otherwise, mapping is performed.
+  - name: "--build"
+    type: string
+    description: |
+      Designate index to use. Corresponds to the number specified when building the index.
+      If building the index, this will be the build's id. If multiple references are indexed
+      in the same directory, each needs a unique build ID. Default: 1.
+    example: "1"
+  - name: "--qin"
+    type: string
+    description: |
+      Set to 33 or 64 to specify input quality value ASCII offset. Automatically detected if
+      not specified.
+  - name: "--interleaved"
+    type: boolean_true
+    description: |
+      True forces paired/interleaved input; false forces single-ended mapping.
+      If not specified, interleaved status will be autodetected from read names.
+  - name: "--maxindel"
+    type: integer
+    description: |
+      Don't look for indels longer than this. Lower is faster. Set to >=100k for RNA-seq.
+    example: 20
+  - name: "--minratio"
+    type: double
+    description: |
+      Fraction of max alignment score required to keep a site. Higher is faster.
+    example: 0.56
+  - name: "--minhits"
+    type: integer
+    description: |
+      Minimum number of seed hits required for candidate sites. Higher is faster.
+    example: 1
+  - name: "--ambiguous"
+    type: string
+    description: |
+      Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations).
+        * best    Use the first best site (Default)
+        * toss    Consider unmapped
+        * random  Select one top-scoring site randomly
+        * all     Retain all top-scoring sites.  Does not work yet with SAM output
+    choices: [best, toss, random, all]
+    example: best
+  - name: "--ambiguous2"
+    type: string
+    description: |
+      Set behavior only for reads that map ambiguously to multiple different references.
+      Normal 'ambiguous=' controls behavior on all ambiguous reads;
+      Ambiguous2 excludes reads that map ambiguously within a single reference.
+        * best    Use the first best site (Default)
+        * toss    Consider unmapped
+        * all     Write a copy to the output for each reference to which it maps
+        * split   Write a copy to the AMBIGUOUS_ output for each reference to which it maps
+    choices: [best, toss, all, split]
+    example: best
+  - name: "--qtrim"
+    type: string
+    description: |
+      Quality-trim ends to Q5 before mapping. Options are 'l' (left), 'r' (right), and 'lr' (both).
+    choices: [l, r, lr]
+  - name: "--untrim"
+    type: boolean_true
+    description: Undo trimming after mapping. Untrimmed bases will be soft-clipped in cigar strings.
+
+
+- name: "Output"
+  arguments:
+  - name: "--fastq_1"
+    type: file
+    description: |
+      Output file for read 1.
+    direction: output
+    example: read_out1.fastq
+  - name: "--fastq_2"
+    type: file
+    description: |
+      Output file for read 2.
+    direction: output
+    example: read_out2.fastq
+  - name: "--sam2bam"
+    alternatives: ["--bs"]
+    type: file
+    description: |
+      Write a shell script to 'file' that will turn the sam output into a sorted, indexed bam file.
+    direction: output
+    example: script.sh
+  - name: "--scafstats"
+    type: file
+    description: |
+      Write statistics on how many reads mapped to which scaffold to this file.
+    direction: output
+    example: scaffold_stats.txt
+  - name: "--refstats"
+    type: file
+    description: |
+      Write statistics on how many reads were assigned to which reference to this file.
+      Unmapped reads whose mate mapped to a reference are considered assigned and will be counted.
+    direction: output
+    example: reference_stats.txt
+  - name: "--nzo"
+    type: boolean_true
+    description: Only print lines with nonzero coverage.
+  - name: "--bbmap_args"
+    type: string
+    description: |
+      Additional arguments from BBMap to pass to BBSplit.
+    
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+
+engines:
+- type: docker
+  image: ubuntu:22.04
+  setup:
+    - type: docker
+      run: | 
+        apt-get update && \
+        apt-get install -y build-essential openjdk-17-jdk wget tar && \
+        wget --no-check-certificate https://sourceforge.net/projects/bbmap/files/BBMap_39.01.tar.gz && \
+        tar xzf BBMap_39.01.tar.gz && \
+        cp -r bbmap/* /usr/local/bin
+    - type: docker
+      run: |
+        bbsplit.sh --version 2>&1 | awk '/BBMap version/{print "BBMAP:", $NF}' > /var/software_versions.txt
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/bbmap_bbsplit/help.txt b/src/bbmap_bbsplit/help.txt
@@ -0,0 +1,83 @@
+```
+bbsplit.sh
+```
+
+BBSplit
+Written by Brian Bushnell, from Dec. 2010 - present
+Last modified June 11, 2018
+
+Description:  Maps reads to multiple references simultaneously.
+Outputs reads to a file for the reference they best match, with multiple options for dealing with ambiguous mappings.
+
+To index:     bbsplit.sh build=<1> ref_x=<reference fasta> ref_y=<another reference fasta>
+To map:       bbsplit.sh build=<1> in=<reads> out_x=<output file> out_y=<another output file>
+
+To be concise, and do everything in one command:
+bbsplit.sh ref=x.fa,y.fa in=reads.fq basename=o%.fq
+
+that is equivalent to
+bbsplit.sh build=1 in=reads.fq ref_x=x.fa ref_y=y.fa out_x=ox.fq out_y=oy.fq
+
+By default paired reads will yield interleaved output, but you can use the # symbol to produce twin output files.
+For example, basename=o%_#.fq will produce ox_1.fq, ox_2.fq, oy_1.fq, and oy_2.fq.
+
+
+Indexing Parameters (required when building the index):
+ref=<file,file>     A list of references, or directories containing fasta files.
+ref_<name>=<ref.fa> Alternate, longer way to specify references. e.g., ref_ecoli=ecoli.fa
+                    These can also be comma-delimited lists of files; e.g., ref_a=a1.fa,a2.fa,a3.fa
+build=<1>           If multiple references are indexed in the same directory, each needs a unique build ID.
+path=<.>            Specify the location to write the index, if you don't want it in the current working directory.
+
+Input Parameters:
+build=<1>           Designate index to use.  Corresponds to the number specified when building the index.
+in=<reads.fq>       Primary reads input; required parameter.
+in2=<reads2.fq>     For paired reads in two files.
+qin=<auto>          Set to 33 or 64 to specify input quality value ASCII offset.
+interleaved=<auto>  True forces paired/interleaved input; false forces single-ended mapping.
+                    If not specified, interleaved status will be autodetected from read names.
+
+Mapping Parameters:
+maxindel=<20>       Don't look for indels longer than this.  Lower is faster.  Set to >=100k for RNA-seq.
+minratio=<0.56>     Fraction of max alignment score required to keep a site.  Higher is faster.
+minhits=<1>         Minimum number of seed hits required for candidate sites.  Higher is faster.
+ambiguous=<best>    Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations).
+                       best   (use the first best site)
+                       toss   (consider unmapped)
+                       random   (select one top-scoring site randomly)
+                       all   (retain all top-scoring sites.  Does not work yet with SAM output)
+ambiguous2=<best>   Set behavior only for reads that map ambiguously to multiple different references.
+                    Normal 'ambiguous=' controls behavior on all ambiguous reads;
+                    Ambiguous2 excludes reads that map ambiguously within a single reference.
+                       best   (use the first best site)
+                       toss   (consider unmapped)
+                       all   (write a copy to the output for each reference to which it maps)
+                       split   (write a copy to the AMBIGUOUS_ output for each reference to which it maps)
+qtrim=<true>        Quality-trim ends to Q5 before mapping.  Options are 'l' (left), 'r' (right), and 'lr' (both).
+untrim=<true>       Undo trimming after mapping.  Untrimmed bases will be soft-clipped in cigar strings.
+
+Output Parameters:
+out_<name>=<file>   Output reads that map to the reference <name> to <file>.
+basename=prefix%suffix     Equivalent to multiple out_%=prefix%suffix expressions, in which each % is replaced by the name of a reference file.
+bs=<file>           Write a shell script to 'file' that will turn the sam output into a sorted, indexed bam file.
+scafstats=<file>    Write statistics on how many reads mapped to which scaffold to this file.
+refstats=<file>     Write statistics on how many reads were assigned to which reference to this file.
+                    Unmapped reads whose mate mapped to a reference are considered assigned and will be counted.
+nzo=t               Only print lines with nonzero coverage.
+
+***** Notes *****
+Almost all BBMap parameters can be used; run bbmap.sh for more details.
+Exceptions include the 'nodisk' flag, which BBSplit does not support.
+BBSplit is recommended for fastq and fasta output, not for sam/bam output.
+When the reference sequences are shorter than read length, use Seal instead of BBSplit.
+
+Java Parameters:
+-Xmx                This will set Java's memory usage, overriding autodetection.
+                    -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
+                    The max is typically 85% of physical memory.
+-eoom               This flag will cause the process to exit if an
+                    out-of-memory exception occurs.  Requires Java 8u92+.
+-da                 Disable assertions.
+
+This list is not complete.  For more information, please consult /readme.txt
+Please contact Brian Bushnell at [email protected] if you encounter any problems.
diff --git a/src/bbmap_bbsplit/script.sh b/src/bbmap_bbsplit/script.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+set -eo pipefail
+
+function clean_up {
+    rm -rf "$tmpdir"
+}
+trap clean_up EXIT 
+
+unset_if_false=( par_paired par_only_build_index par_interleaved par_untrim par_nzo)
+
+for var in "${unset_if_false[@]}"; do
+    if [ -z "${!var}" ]; then
+        unset $var
+    fi
+done
+
+if [ ! -d "$par_build" ]; then
+    IFS=";" read -ra ref_files <<< "$par_ref"
+    primary_ref="${ref_files[0]}"
+    refs=()
+    for file in "${ref_files[@]:1}"
+    do
+        name=$(basename "$file" | sed 's/\.[^.]*$//')
+        refs+=("ref_$name=$file")
+    done
+fi
+
+if $par_only_build_index; then
+    if [ ${#refs[@]} -gt 1 ]; then
+        bbsplit.sh \
+            --ref_primary="$primary_ref" \
+            "${refs[@]}" \
+            path=$par_build
+    else
+        echo "ERROR: Please specify at least two reference fasta files."
+    fi
+else
+    IFS=";" read -ra input <<< "$par_input"
+    tmpdir=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXXXX")
+    index_files=''
+    if [ -d "$par_build" ]; then
+        index_files="path=$par_build"
+    elif [ ${#refs[@]} -gt 0 ]; then
+        index_files="--ref_primary=$primary_ref ${refs[*]}"
+    else
+        echo "ERROR: Please either specify a BBSplit index as input or at least two reference fasta files."
+    fi
+
+    extra_args=""
+    if [ -n "$par_refstats" ]; then extra_args+=" --refstats $par_refstats"; fi
+    if [ -n "$par_ambiguous" ]; then extra_args+=" --ambiguous $par_ambiguous"; fi
+    if [ -n "$par_ambiguous2" ]; then extra_args+=" --ambiguous2 $par_ambiguous2"; fi
+    if [ -n "$par_minratio" ]; then extra_args+=" --minratio $par_minratio"; fi
+    if [ -n "$par_minhits" ]; then extra_args+=" --minhits $par_minhits"; fi
+    if [ -n "$par_maxindel" ]; then extra_args+=" --maxindel $par_maxindel"; fi
+    if [ -n "$par_qin" ]; then extra_args+=" --qin $par_qin"; fi
+    if [ -n "$par_qtrim" ]; then extra_args+=" --qtrim $par_qtrim"; fi
+    if [ "$par_interleaved" = true ]; then extra_args+=" --interleaved"; fi
+    if [ "$par_untrim" = true ]; then extra_args+=" --untrim"; fi
+    if [ "$par_nzo" = true ]; then extra_args+=" --nzo"; fi
+
+    if [ -n "$par_bbmap_args" ]; then extra_args+=" $par_bbmap_args"; fi
+
+
+    if $par_paired; then
+        bbsplit.sh \
+            $index_files \
+            in=${input[0]} \
+            in2=${input[1]} \
+            basename=${tmpdir}/%_#.fastq \
+            $extra_args
+        read1=$(find $tmpdir/ -iname primary_1*)
+        read2=$(find $tmpdir/ -iname primary_2*)
+        cp $read1 $par_fastq_1
+        cp $read2 $par_fastq_2
+    else
+        bbsplit.sh \
+            $index_files \
+            in=${input[0]} \
+            basename=${tmpdir}/%.fastq \
+            $extra_args
+        read1=$(find $tmpdir/ -iname primary*)
+        cp $read1 $par_fastq_1
+    fi
+fi
+
+exit 0