naobservatory · simonleandergrimm · Feb 8, 2025 · Feb 8, 2025 · Feb 8, 2025 · Feb 8, 2025
diff --git a/configs/containers.config b/configs/containers.config
@@ -115,6 +115,12 @@ process {
     withLabel: python {
         container = "community.wave.seqera.io/library/python:3.13.1--d00663700fcc8bcf"
     }
+    withLabel: minimap2 {
+        container = "staphb/minimap2:2.28"
+    }
+    withLabel: samtools {
+        container = "staphb/samtools:1.21"
+    }
     withLabel: coreutils_file {
         container = "community.wave.seqera.io/library/coreutils_file:ccfe471e6d036f54"
         // Build with Seqera Containers

diff --git a/configs/run.config b/configs/run.config
@@ -8,6 +8,9 @@ params {
     // Sequencing platform
     ont = <TRUE OR FALSE BASED ON SEQUENCING PLATFORM> // Whether the sequencing is ONT (true) or Illumina (false)
 
+    // Human filtering
+    human_read_filtering = false // Whether to filter human reads. Only applicable to ONT.
+
     // Directories
     base_dir = <PATH TO YOUR DIRECTORY> // Parent for working and output directories (can be S3)
     ref_dir = <PATH TO YOUR ORGS REFERENCE DIRECTORY> // Reference/index directory (generated by index workflow)

diff --git a/configs/run_dev_se.config b/configs/run_dev_se.config
@@ -8,6 +8,9 @@ params {
     // Sequencing platform
     ont = false // Whether the sequencing is ONT (true) or Illumina (false)
 
+    // Human filtering
+    human_read_filtering = false // Whether to filter human reads. Only applicable to ONT.
+
     // Directories
     base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)

diff --git a/modules/local/minimap2/main.nf b/modules/local/minimap2/main.nf
@@ -0,0 +1,33 @@
+// Align ONT reads against a minimap2 reference/index and emit the alignments as SAM.
+// Inputs:
+//   - tuple (sample, reads): sample ID and gzipped FASTQ of reads (decompressed with zcat)
+//   - contaminant_ref: minimap2 reference (e.g. a prebuilt .mmi index) to align against
+//   - suffix: label embedded in the output filename
+// Outputs:
+//   - sam: tuple (sample, "<sample>_<suffix>_minimap2.sam")
+//   - input: tuple (sample, symlink to the input reads), exposed for testing
+process MINIMAP2_ONT {
+    label "large"
+    label "minimap2"
+    input:
+        tuple val(sample), path(reads)
+        path(contaminant_ref)
+        val(suffix)
+    output:
+        tuple val(sample), path("${sample}_${suffix}_minimap2.sam"), emit: sam
+        tuple val(sample), path("${sample}_in.fastq.gz"), emit: input
+    shell:
+        '''
+        # Fail the task if any stage of the pipeline fails (e.g. zcat on a corrupt
+        # archive), rather than only reporting the exit status of the last command
+        set -o pipefail
+        # Define output and reference
+        o=!{sample}_!{suffix}_minimap2.sam
+        ref=!{contaminant_ref}
+        # Run minimap2 on the decompressed reads (read from stdin), using the CPUs
+        # allocated to this task instead of minimap2's built-in default thread count
+        zcat !{reads} | minimap2 -a -t !{task.cpus} ${ref} /dev/fd/0 > ${o}
+        # Link input to output for testing
+        ln -s !{reads} !{sample}_in.fastq.gz
+        '''
+}
diff --git a/modules/local/samtools/main.nf b/modules/local/samtools/main.nf
@@ -0,0 +1,69 @@
+// Return reads that did not align to reference as FASTQ (streamed version)
+// Inputs:
+//   - tuple (sample, sam): sample ID and SAM file of alignments
+//   - suffix: label embedded in the output filename
+// Outputs:
+//   - reads: tuple (sample, "<sample>_<suffix>.fastq.gz") holding the unmapped reads
+//   - input: tuple (sample, symlink to the input SAM), exposed for testing
+process SAMTOOLS_FILTER {
+    label "samtools"
+    label "small"
+    input:
+        tuple val(sample), path(sam)
+        val(suffix)
+    output:
+        tuple val(sample), path("${sample}_${suffix}.fastq.gz"), emit: reads
+        tuple val(sample), path("${sample}_in.sam"), emit: input
+    shell:
+        '''
+        # Fail the task if samtools fails, not only if the final gzip fails
+        set -o pipefail
+        # Define output
+        out=!{sample}_!{suffix}.fastq.gz
+        # -f 4 keeps only unmapped reads; -n leaves read names without /1 or /2 suffixes
+        var="fastq -n -f 4"
+        # Execute samtools
+        samtools ${var} !{sam} | gzip > ${out}
+        # Link input to output for testing
+        ln -s !{sam} !{sample}_in.sam
+        '''
+}
+
+// Return aligned and unaligned reads separately as FASTQs
+// Inputs:
+//   - tuple (sample, sam): sample ID and SAM file of alignments
+//   - suffix: label embedded in the output filenames
+// Outputs:
+//   - match: tuple (sample, "<sample>_<suffix>_samtools_match.fastq.gz") holding mapped reads
+//   - nomatch: tuple (sample, "<sample>_<suffix>_samtools_nomatch.fastq.gz") holding unmapped reads
+//   - input: tuple (sample, symlink to the input SAM), exposed for testing
+process SAMTOOLS_SEPARATE {
+    label "samtools"
+    label "small"
+    input:
+        tuple val(sample), path(sam)
+        val(suffix)
+    output:
+        tuple val(sample), path("${sample}_${suffix}_samtools_match.fastq.gz"), emit: match
+        tuple val(sample), path("${sample}_${suffix}_samtools_nomatch.fastq.gz"), emit: nomatch
+        tuple val(sample), path("${sample}_in.sam"), emit: input
+    shell:
+        '''
+        # Fail the task if samtools fails, not only if the final gzip fails
+        set -o pipefail
+        # Define output
+        om=!{sample}_!{suffix}_samtools_match.fastq.gz
+        on=!{sample}_!{suffix}_samtools_nomatch.fastq.gz
+        # Mapped reads: exclude unmapped (0x4) plus secondary (0x100) and supplementary
+        # (0x800) records. Passing -F replaces samtools fastq's default filter of 0x900,
+        # so those bits must be restated or multi-alignment reads are emitted repeatedly
+        om_var="fastq -n -F 0x904"
+        # Unmapped reads only (the default -F 0x900 still applies)
+        on_var="fastq -n -f 4"
+        # Execute samtools
+        samtools ${om_var} !{sam} | gzip > ${om}
+        samtools ${on_var} !{sam} | gzip > ${on}
+        # Link input to output for testing
+        ln -s !{sam} !{sample}_in.sam
+        '''
+}
diff --git a/nf-test.config b/nf-test.config
@@ -9,6 +9,7 @@ config {
         load "[email protected]"
         load "[email protected]"
         load "[email protected]"
+        load "[email protected]"
     }
 
 }
diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf
@@ -16,6 +16,11 @@ include { ADD_FIXED_COLUMN as ADD_BRACKEN_NORIBO } from "../../../modules/local/
 include { CONCATENATE_TSVS as CONCATENATE_KRAKEN } from "../../../modules/local/concatenateTsvs"
 include { CONCATENATE_TSVS as CONCATENATE_BRACKEN } from "../../../modules/local/concatenateTsvs"
 
+// Included unconditionally, as in subworkflows/local/subsetTrim: the processes only run when
+// params.ont is true, and Nextflow's strict syntax requires includes to be top-level declarations
+include { MINIMAP2_ONT as MINIMAP2_RIBO } from "../../../modules/local/minimap2"
+include { SAMTOOLS_SEPARATE } from "../../../modules/local/samtools"
+
 /****************
 | MAIN WORKFLOW |
 ****************/
@@ -27,13 +32,19 @@ workflow PROFILE {
         ref_dir
         min_kmer_fraction
         k
-        bbduk_suffix
+        ribo_suffix
         bracken_threshold
         single_end
     main:
         // Separate ribosomal reads
-        ribo_path = "${ref_dir}/results/ribo-ref-concat.fasta.gz"
-        ribo_ch = BBDUK(reads_ch, ribo_path, min_kmer_fraction, k, bbduk_suffix, !single_end)
+        if (params.ont) {
+            ribo_path = "s3://nao-mgs-simon/ont-indices/2024-12-14/minimap2-ribo-index/ribo-ref-concat-unique.mmi"
+            mapped_ch = MINIMAP2_RIBO(reads_ch, ribo_path, ribo_suffix)
+            ribo_ch = SAMTOOLS_SEPARATE(mapped_ch, ribo_suffix)
+        } else {
+            ribo_path = "${ref_dir}/results/ribo-ref-concat.fasta.gz"
+            ribo_ch = BBDUK(reads_ch, ribo_path, min_kmer_fraction, k, ribo_suffix, !single_end)
+        }
         // Run taxonomic profiling separately on ribo and non-ribo reads
         tax_ribo_ch = TAXONOMY_RIBO(ribo_ch.match, kraken_db_ch, "D", bracken_threshold, single_end)
         tax_noribo_ch = TAXONOMY_NORIBO(ribo_ch.nomatch, kraken_db_ch, "D", bracken_threshold, single_end)

diff --git a/subworkflows/local/subsetTrim/main.nf b/subworkflows/local/subsetTrim/main.nf
@@ -6,8 +6,11 @@ include { SUBSET_READS_SINGLE_TARGET as SUBSET_SINGLE } from "../../../modules/l
 include { SUBSET_READS_PAIRED_TARGET as SUBSET_PAIRED } from "../../../modules/local/subsetReads"
 include { FASTP } from "../../../modules/local/fastp"
 include { FILTLONG } from "../../../modules/local/filtlong"
+include { MINIMAP2_ONT as MINIMAP2_HUMAN } from "../../../modules/local/minimap2"
+include { SAMTOOLS_FILTER } from "../../../modules/local/samtools"
 include { INTERLEAVE_FASTQ } from "../../../modules/local/interleaveFastq"
 
+
 /***********
 | WORKFLOW |
 ***********/
@@ -19,6 +22,7 @@ workflow SUBSET_TRIM {
       adapter_path
       single_end
       ont
+      human_read_filtering
       random_seed
     main:
         if (single_end) {
@@ -30,6 +34,11 @@ workflow SUBSET_TRIM {
         }
         if (ont) {
             cleaned_ch = FILTLONG(inter_ch)
+            if (human_read_filtering) {
+                minimap2_human_index = "s3://nao-mgs-simon/ont-indices/2024-12-14/minimap2-human-index/chm13v2.0.mmi"
+                minimap2_ch = MINIMAP2_HUMAN(cleaned_ch, minimap2_human_index, "human")
+                cleaned_ch = SAMTOOLS_FILTER(minimap2_ch.sam, "no-human")
+            }
         } else {
             cleaned_ch = FASTP(inter_ch, adapter_path, !single_end)
         }

diff --git a/tests/modules/local/minimap2/main.nf.test b/tests/modules/local/minimap2/main.nf.test
@@ -0,0 +1,69 @@
+nextflow_process {
+
+    name "Test process MINIMAP2_ONT"
+    script "modules/local/minimap2/main.nf"
+    process "MINIMAP2_ONT"
+    config "tests/run_dev_se.config"
+    tag "module"
+    tag "minimap2"
+
+    setup { // Run LOAD_SAMPLESHEET on the ONT test samplesheet; its output feeds input[0] of each test
+        run("LOAD_SAMPLESHEET") {
+            script "subworkflows/local/loadSampleSheet/main.nf"
+            process {
+                """
+                input[0] = "${projectDir}/test-data/ont-samplesheet.csv"
+                input[1] = true
+                """
+            }
+        }
+    }
+
+    test("When run against human index, should run without failures and return properly formatted SAM") {
+        tag "expect_success"
+        tag "single_end"
+        when {
+            params {}
+            process {
+                '''
+                input[0] = LOAD_SAMPLESHEET.out.samplesheet
+                input[1] = "s3://nao-mgs-simon/ont-indices/2024-12-14/minimap2-human-index/chm13v2.0.mmi"
+                input[2] = "human"
+                '''
+            }
+        }
+        then {
+            // The process should complete successfully
+            assert process.success
+            // Count @SQ header lines and non-header (alignment) lines in the emitted SAM; both must be non-zero
+            def nHeaders = ["bash", "-c", "cat " + process.out.sam[0][1] + " | grep -c '^@SQ'"].execute().text.trim() as Integer
+            def nAlignments = ["bash", "-c", "cat " + process.out.sam[0][1] + " | grep -v '^@' | wc -l"].execute().text.trim() as Integer
+            assert nHeaders > 0
+            assert nAlignments > 0
+        }
+    }
+
+    test("When run against ribo index, should run without failures and return properly formatted SAM") {
+        tag "expect_success"
+        tag "single_end"
+        when {
+            params {}
+            process {
+                '''
+                input[0] = LOAD_SAMPLESHEET.out.samplesheet
+                input[1] = "s3://nao-mgs-simon/ont-indices/2024-12-14/minimap2-ribo-index/ribo-ref-concat-unique.mmi"
+                input[2] = "ribo"
+                '''
+            }
+        }
+        then {
+            // The process should complete successfully
+            assert process.success
+            // Count @SQ header lines and non-header (alignment) lines in the emitted SAM; both must be non-zero
+            def nHeaders = ["bash", "-c", "cat " + process.out.sam[0][1] + " | grep -c '^@SQ'"].execute().text.trim() as Integer
+            def nAlignments = ["bash", "-c", "cat " + process.out.sam[0][1] + " | grep -v '^@' | wc -l"].execute().text.trim() as Integer
+            assert nHeaders > 0
+            assert nAlignments > 0
+        }
+    }
+}
diff --git a/tests/modules/local/samtools/filter.nf.test b/tests/modules/local/samtools/filter.nf.test
@@ -0,0 +1,61 @@
+nextflow_process {
+
+    name "Test process SAMTOOLS_FILTER"
+    script "modules/local/samtools/main.nf"
+    process "SAMTOOLS_FILTER"
+    config "tests/run_dev_se.config"
+    tag "module"
+    tag "samtools"
+
+    setup { // Build the SAM fixture: load the ONT test samplesheet, then align it with MINIMAP2_ONT
+        run("LOAD_SAMPLESHEET") {
+            script "subworkflows/local/loadSampleSheet/main.nf"
+            process {
+                """
+                input[0] = "${projectDir}/test-data/ont-samplesheet.csv"
+                input[1] = true
+                """
+            }
+        }
+        run("MINIMAP2_ONT") {
+            script "modules/local/minimap2/main.nf"
+            process {
+                """
+                input[0] = LOAD_SAMPLESHEET.out.samplesheet
+                input[1] = "s3://nao-mgs-simon/ont-indices/2024-12-14/minimap2-human-index/chm13v2.0.mmi"
+                input[2] = "human"
+                """
+            }
+        }
+    }
+
+    test("When run on SAM file, should only return unaligned reads") {
+        tag "expect_success"
+        tag "single_end"
+        when {
+            params {}
+            process {
+                '''
+                input[0] = MINIMAP2_ONT.out.sam
+                input[1] = "no-human"
+                '''
+            }
+        }
+        then {
+            // The process should complete successfully
+            assert process.success
+
+            // The set of read IDs in the output FASTQ must equal the set of unmapped read IDs in the input SAM
+            def fastq_out = path(process.out.reads[0][1]).fastq
+            def read_ids_out = fastq_out.readNames.toSet()
+
+            def samlines = sam(process.out.input[0][1]).getSamLines() // SAM emitted on the process's "input" channel
+            def unmapped_read_ids = samlines
+                .findAll { line -> line.split('\t')[1] == '4' }  // Keep lines whose second (FLAG) field is exactly 4
+                .collect { line -> line.split('\t')[0] }         // Take the first field (read ID)
+                .toSet()                                  // Convert to Set to remove duplicates
+
+            assert unmapped_read_ids == read_ids_out
+        }
+    }
+}
diff --git a/tests/modules/local/samtools/separate.nf.test b/tests/modules/local/samtools/separate.nf.test
@@ -0,0 +1,71 @@
+nextflow_process {
+
+    name "Test process SAMTOOLS_SEPARATE"
+    script "modules/local/samtools/main.nf"
+    process "SAMTOOLS_SEPARATE"
+    config "tests/run_dev_se.config"
+    tag "module"
+    tag "samtools"
+
+    setup { // Build the SAM fixture: load the ONT test samplesheet, then align it with MINIMAP2_ONT
+        run("LOAD_SAMPLESHEET") {
+            script "subworkflows/local/loadSampleSheet/main.nf"
+            process {
+                """
+                input[0] = "${projectDir}/test-data/ont-samplesheet.csv"
+                input[1] = true
+                """
+            }
+        }
+        run("MINIMAP2_ONT") {
+            script "modules/local/minimap2/main.nf"
+            process {
+                """
+                input[0] = LOAD_SAMPLESHEET.out.samplesheet
+                input[1] = "s3://nao-mgs-simon/ont-indices/2024-12-14/minimap2-human-index/chm13v2.0.mmi"
+                input[2] = "human"
+                """
+            }
+        }
+    }
+
+    test("When run on a SAM file, should return aligned and unaligned reads separately") {
+        tag "expect_success"
+        tag "single_end"
+        when {
+            params {}
+            process {
+                '''
+                input[0] = MINIMAP2_ONT.out.sam
+                input[1] = "no-human"
+                '''
+            }
+        }
+        then {
+            // The process should complete successfully
+            assert process.success
+
+            // The match/nomatch FASTQs must hold exactly the mapped/unmapped read IDs of the input SAM, respectively
+            def fastq_match = path(process.out.match[0][1]).fastq
+            def fastq_nomatch = path(process.out.nomatch[0][1]).fastq
+            def read_ids_match = fastq_match.readNames.toSet()
+            def read_ids_nomatch = fastq_nomatch.readNames.toSet()
+
+            def samlines = sam(process.out.input[0][1]).getSamLines() // SAM emitted on the process's "input" channel
+            def unmapped_read_ids = samlines
+                .findAll { line -> line.split('\t')[1] == '4' }  // Keep lines whose second (FLAG) field is exactly 4
+                .collect { line -> line.split('\t')[0] }         // Take the first field (read ID)
+                .toSet()                                  // Convert to Set to remove duplicates
+            def mapped_read_ids = samlines
+                .findAll { line -> line.split('\t')[1] != '4' }  // Keep lines whose second (FLAG) field is not 4
+                .collect { line -> line.split('\t')[0] }         // Take the first field (read ID)
+                .toSet() // Convert to Set to remove duplicates
+
+            assert read_ids_nomatch == unmapped_read_ids
+            assert read_ids_match == mapped_read_ids
+
+            // No read ID may appear in both the match and nomatch FASTQs
+            assert read_ids_match.intersect(read_ids_nomatch).size() == 0
+        }
+    }
+}
diff --git a/tests/run.config b/tests/run.config
@@ -10,6 +10,9 @@ params {
     // Sequencing platform
     ont = false // Whether the sequencing is ONT (true) or Illumina (false)
 
+    // Human filtering
+    human_read_filtering = false // Whether to filter human reads. Only applicable to ONT.
+
     // Directories
     base_dir = "./" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-testing/index/20250130/output/" // Reference/index directory (generated by index workflow)