Merge pull request #4 from CenterForMedicalGeneticsGhent/dev

Release PR v1.0.0
nf-cmgg · Sep 7, 2023 · 1394aa1 · 1394aa1
2 parents 6c40deb + daccb87
commit 1394aa1
Show file tree

Hide file tree

Showing 46 changed files with 355 additions and 1,011 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,16 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## v1.0.0 - Moonwalking Mamba - [7 September 2023]
+
+### `Changed`
+
+1. Removed the alignment in favor of supplying BAM or CRAM files as input. This makes sure the annotation is closely related to the actual data it needs to be used on.
+
+### `Fixed`
+
+1. Improved handling of duplicate filenames
+
 ## v0.1.0 - Dancing Panda - [4 July 2023]
 
 Initial release of CenterForMedicalGeneticsGhent/nf-cmgg-qdnaseq, created with the [nf-core](https://nf-co.re/) template.

diff --git a/README.md b/README.md
@@ -8,11 +8,9 @@
 
 **CenterForMedicalGeneticsGhent/nf-cmgg-qdnaseq** is a bioinformatics pipeline for creating qDNAseq annotations
 
-1. Trim FASTQ files to read lengths of 50 with Trimmomatic
-2. Align the reads with BWA (aln and samse/sampe)
-3. Create a mappability WIG file with GenMap
-4. Convert the WIG to BigWig with UCSC WigToBigWig
-5. Create the annotations using a custom R script
+1. Create a mappability WIG file with GenMap
+2. Convert the WIG to BigWig with UCSC WigToBigWig
+3. Create the annotations using a custom R script
 
 ## Usage
 

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -1,5 +1,5 @@
 report_comment: >
-  This report has been generated by the <a href="https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-qdnaseq/0.1.0" target="_blank">CenterForMedicalGeneticsGhent/nf-cmgg-qdnaseq</a>
+  This report has been generated by the <a href="https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-qdnaseq/1.0.0" target="_blank">CenterForMedicalGeneticsGhent/nf-cmgg-qdnaseq</a>
   analysis pipeline.
 report_section_order:
   "CenterForMedicalGeneticsGhent-nf-cmgg-qdnaseq-methods-description":

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,3 +1,3 @@
-sample,fastq_1,fastq_2
-test1,https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/test_R1.fastq.gz,https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/test_R2.fastq.gz
-test2,https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/test_R1.fastq.gz,
+cram,crai
+https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/raw/main/data/genomics/homo_sapiens/illumina/cram/test.cram,https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/raw/main/data/genomics/homo_sapiens/illumina/cram/test.cram.crai
+https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/raw/main/data/genomics/homo_sapiens/illumina/cram/test2.cram,
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -7,23 +7,19 @@
     "items": {
         "type": "object",
         "properties": {
-            "sample": {
-                "type": "string",
-                "meta": ["id"]
-            },
-            "fastq_1": {
+            "cram": {
                 "type": "string",
                 "format": "file-path",
                 "exists": true,
-                "pattern": "^.*\\.fastq(\\.gz)?$"
+                "pattern": "^.*\\.(b|cr)am$"
             },
-            "fastq_2": {
+            "crai": {
                 "type": "string",
                 "format": "file-path",
                 "exists": true,
-                "pattern": "^.*\\.fastq(\\.gz)?$"
+                "pattern": "^.*\\.(b|cr)ai$"
             }
         },
-        "required": ["fastq_1", "sample"]
+        "required": ["cram"]
     }
 }
diff --git a/conf/modules.config b/conf/modules.config
@@ -18,14 +18,6 @@ if(!params.annotation_genome) {
 
 process {
 
-    withName: TRIMGALORE {
-        ext.args   = "--hardtrim5 50"
-    }
-
-    withName: BWA_ALN {
-        ext.args   = "-n 2 -q 40"
-    }
-
     withName: GAWK {
         ext.suffix = "sizes"
         ext.args2  = '\'{print $1"\t"$2}\''
@@ -37,6 +29,7 @@ process {
     }
 
     withName: CREATE_ANNOTATIONS {
+        stageInMode = "copy" // Because qdnaseq tries to fetch the indices from the link source
         publishDir  = [
             overwrite: true,
             enabled: true,

diff --git a/conf/nf_test.config b/conf/nf_test.config
@@ -31,4 +31,5 @@ params {
     // Genome references
     genome    = 'hg38'
     bin_sizes = "10,5"
+    species   = "Hsapiens"
 }
diff --git a/conf/test.config b/conf/test.config
@@ -28,4 +28,5 @@ params {
     // Genome references
     genome    = 'hg38'
     bin_sizes = "10,5"
+    species   = "Hsapiens"
 }
diff --git a/docs/parameters.md b/docs/parameters.md
@@ -22,9 +22,9 @@ Reference genome related files and options required for the workflow.
 | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ |
 | `genome`            | Name of the genome.                                                                                                                                                                                                                                                               | `string`  |         | True     |        |
 | `annotation_genome` | The name of the genome used to create the annotations. This will default to the value supplied with --genome.                                                                                                                                                                     | `string`  | None    |          |        |
+| `species`           | Name of the species. Needs to be in this format: Hsapiens (First name as a capital letter and last name as all lowercase letters)                                                                                                                                                 | `string`  |         | True     |        |
 | `fasta`             | Path to FASTA genome file. <details><summary>Help</summary><small>This parameter is _mandatory_ if `--genome` is not specified.</small></details>                                                                                                                                 | `string`  |         |          |        |
 | `fai`               | Path to FASTA genome index file.                                                                                                                                                                                                                                                  | `string`  |         |          |        |
-| `bwa`               | The BWA index.                                                                                                                                                                                                                                                                    | `string`  |         |          |        |
 | `blacklist`         | The blacklist BED file.                                                                                                                                                                                                                                                           | `string`  |         |          |        |
 | `igenomes_base`     | Directory / URL base for iGenomes references.                                                                                                                                                                                                                                     | `string`  |         |          | True   |
 | `igenomes_ignore`   | Do not load the iGenomes reference config. <details><summary>Help</summary><small>Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`.</small></details> | `boolean` |         |          | True   |

diff --git a/docs/usage.md b/docs/usage.md
@@ -12,39 +12,18 @@ You will need to create a samplesheet with information about the samples you wou
 --input '[path to samplesheet file]'
 ```
 
-### Multiple runs of the same sample
-
-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
-
-```console
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
-```
-
 ### Full samplesheet
 
-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
-
-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
-
 ```console
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+cram,crai
+test.cram,test.cram.crai
+test2.cram,
 ```
 
-| Column    | Description                                                                                                                                                                            |
-| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`  | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
+| Column | Description                                          |
+| ------ | ---------------------------------------------------- |
+| `cram` | An input BAM or CRAM file to use for bin calculation |
+| `crai` | The index for the BAM or CRAM file.                  |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 

diff --git a/main.nf b/main.nf
@@ -17,7 +17,6 @@ nextflow.enable.dsl = 2
 
 params.fasta  = WorkflowMain.getGenomeAttribute(params, 'fasta')
 params.fai    = WorkflowMain.getGenomeAttribute(params, 'fai')
-params.bwa    = WorkflowMain.getGenomeAttribute(params, 'bwa')
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/modules.json b/modules.json
@@ -5,32 +5,9 @@
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {
-                    "bwa/aln": {
-                        "branch": "master",
-                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-                        "installed_by": ["modules"],
-                        "patch": "modules/nf-core/bwa/aln/bwa-aln.diff"
-                    },
-                    "bwa/index": {
-                        "branch": "master",
-                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-                        "installed_by": ["modules"]
-                    },
-                    "bwa/sampe": {
-                        "branch": "master",
-                        "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220",
-                        "installed_by": ["modules"],
-                        "patch": "modules/nf-core/bwa/sampe/bwa-sampe.diff"
-                    },
-                    "bwa/samse": {
-                        "branch": "master",
-                        "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220",
-                        "installed_by": ["modules"],
-                        "patch": "modules/nf-core/bwa/samse/bwa-samse.diff"
-                    },
                     "custom/dumpsoftwareversions": {
                         "branch": "master",
-                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4",
                         "installed_by": ["modules"]
                     },
                     "gawk": {
@@ -50,7 +27,7 @@
                     },
                     "multiqc": {
                         "branch": "master",
-                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "git_sha": "a6e11ac655e744f7ebc724be669dd568ffdc0e80",
                         "installed_by": ["modules"]
                     },
                     "samtools/convert": {
@@ -69,24 +46,20 @@
                         "installed_by": ["modules"],
                         "patch": "modules/nf-core/samtools/index/samtools-index.diff"
                     },
-                    "tabix/bgzip": {
+                    "samtools/merge": {
                         "branch": "master",
-                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-                        "installed_by": ["modules"]
+                        "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013",
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/samtools/merge/samtools-merge.diff"
                     },
-                    "trimgalore": {
+                    "tabix/bgzip": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                         "installed_by": ["modules"]
                     },
                     "ucsc/wigtobigwig": {
                         "branch": "master",
-                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-                        "installed_by": ["modules"]
-                    },
-                    "untar": {
-                        "branch": "master",
-                        "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e",
+                        "git_sha": "66290981ab6038ea86177ade40b9449bc790b0ce",
                         "installed_by": ["modules"]
                     }
                 }

diff --git a/modules/local/create_annotations/Dockerfile b/modules/local/create_annotations/Dockerfile
@@ -1,14 +1,19 @@
 FROM mambaorg/micromamba:1.4-focal
 
-LABEL version="0.0.1" maintainer="Nicolas Vannieuwkerke <[email protected]>"
+LABEL version="0.0.3" maintainer="Nicolas Vannieuwkerke <[email protected]>"
 
 RUN micromamba install -y --name base -c conda-forge -c bioconda -c defaults \
     bioconductor-qdnaseq==1.34.0 \
     bioconductor-biobase==2.58.0 \
+    bioconductor-bsgenome==1.66.3 \
     ucsc-bigwigaverageoverbed==377 \
     r-biocmanager==1.30.21 \
     r-xml==3.99_0.14 \
     r-restfulr==0.0.15 \
     bioconductor-rtracklayer==1.58.0 \
     r-r.cache==0.16.0 \
+    r-lsr==0.5.2 \
     && micromamba clean --all --yes
+
+ARG MAMBA_DOCKERFILE_ACTIVATE=1
+ENV PATH "$MAMBA_ROOT_PREFIX/bin:$PATH"
diff --git a/modules/local/create_annotations/main.nf b/modules/local/create_annotations/main.nf
@@ -1,14 +1,15 @@
 process CREATE_ANNOTATIONS {
     tag "$bin_size"
-    label 'process_single'
+    label 'process_medium'
 
-    container "quay.io/cmgg/qdnaseq:0.0.1"
+    container "cmgg/qdnaseq:0.0.4"
 
     input:
     val(bin_size)
     tuple val(meta), path(bams, stageAs:"bams/*"), path(bais, stageAs:"bams/*")
     tuple val(meta2), path(bigwig)
     tuple val(meta3), path(blacklist)
+    path(genomes)
 
     output:
     tuple val(meta), path("*.rda"), emit: annotation
@@ -24,7 +25,7 @@ process CREATE_ANNOTATIONS {
     def prefix = task.ext.prefix ?: "${params.annotation_genome}.${bin_size}kbp"
 
     """
-    touch ${prefix}.rda
+    touch ${params.annotation_genome}.${bin_size}kbp.rda
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/local/create_annotations/templates/create_annotations.R b/modules/local/create_annotations/templates/create_annotations.R
@@ -6,12 +6,11 @@ library(BiocManager)
 library(QDNAseq)
 library(future)
 
-BiocManager::install("BSgenome.Hsapiens.UCSC.${params.annotation_genome}")
-library(BSgenome.Hsapiens.UCSC.${params.annotation_genome})
+library($genomes, lib.loc="$genomes")
 
 binsize <- ${bin_size}
 
-bins <- createBins(bsgenome=BSgenome.Hsapiens.UCSC.${params.annotation_genome}, binSize=binsize)
+bins <- createBins(bsgenome=$genomes, binSize=binsize)
 bins\$mappability <- calculateMappability(
     bins,
     bigWigFile="${bigwig}",
@@ -24,7 +23,7 @@ bins\$residual <- NA
 bins\$use <- bins\$bases > 0
 
 #
-tg <- binReadCounts(bins, path="bams")
+tg <- binReadCounts(bins, path="bams", chunkSize=1E7)
 
 bins\$residual <- iterateResiduals(tg)
 

diff --git a/modules/local/get_bsgenome/main.nf b/modules/local/get_bsgenome/main.nf
@@ -0,0 +1,31 @@
+process GET_BSGENOME { // Runs the get_bsgenome.R template and emits a BSgenome.<species>.UCSC.<genome> directory
+    tag "$genome"
+    label 'process_single'
+
+    container "cmgg/qdnaseq:0.0.4"
+
+    input:
+    val(genome) // genome name; forms the last part of the BSgenome directory name
+    val(species) // species name; forms the second part of the BSgenome directory name
+    env R_LIBS_USER // NOTE(review): presumably points R at a user library for the task - confirm against the caller
+
+    output:
+    path("BSgenome.${species}.UCSC.${genome}")  , emit: genome
+    path "versions.yml"                         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when // runs unless task.ext.when is set to a falsy value
+
+    script:
+    template "get_bsgenome.R"
+
+    stub: // creates an empty output directory and a versions.yml; NOTE(review): confirm the hard-coded r-biocmanager 3.17 string is intended
+    """
+    mkdir BSgenome.${species}.UCSC.${genome}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        r-biocmanager: 3.17
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/get_bsgenome/templates/get_bsgenome.R b/modules/local/get_bsgenome/templates/get_bsgenome.R
@@ -0,0 +1,12 @@
+#!/usr/bin/env Rscript
+
+# Nextflow template: installs the requested BSgenome package into a
+# task-local library directory and records the BiocManager version in
+# versions.yml.
+
+# load required packages
+library(BiocManager)
+
+# Package name; the library directory it is installed into uses the same name.
+bsgenome_pkg <- "BSgenome.${species}.UCSC.${genome}"
+
+dir.create(bsgenome_pkg, showWarnings = FALSE)
+
+install(bsgenome_pkg, lib = bsgenome_pkg)
+
+# install() may report a failed installation only as a warning, so check the
+# result explicitly instead of leaving an empty library directory behind.
+if (!file.exists(file.path(bsgenome_pkg, bsgenome_pkg, "DESCRIPTION"))) {
+    stop("Installation of ", bsgenome_pkg, " failed", call. = FALSE)
+}
+
+# Write the versions file in a single call (no sink left open). The key is the
+# interpolated process name, and the version is read from the loaded package
+# rather than hard-coded.
+biocmanager_version <- as.character(packageVersion("BiocManager"))
+writeLines(
+    c(
+        '"${task.process}":',
+        paste0("    r-biocmanager: ", biocmanager_version)
+    ),
+    "versions.yml"
+)