-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Making PROFILE take in ONT data #199
base: dev
Are you sure you want to change the base?
Changes from 23 commits
bb278f3
0aab9dd
79cff17
1ae7470
9ff3db1
2989cde
d8c6790
acdafe9
49af743
ca671f5
edde7be
5950650
9707939
7652019
a730568
3924195
259caec
5b44d38
3808e9c
347cb04
cb3afbb
5bd7854
1e9f933
2730c11
683bd95
6714709
666b0bb
af8eff0
6eb0ba2
64d649f
085d8bd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
// Detection and removal of contaminant reads, using indices created for ONT cDNA data | ||
process MINIMAP2_ONT { | ||
label "large" | ||
label "minimap2" | ||
input: | ||
tuple val(sample), path(reads) | ||
path(contaminant_ref) | ||
val(suffix) | ||
output: | ||
tuple val(sample), path("${sample}_${suffix}_minimap2.sam"), emit: sam | ||
tuple val(sample), path("${sample}_in.fastq.gz"), emit: input | ||
shell: | ||
''' | ||
# Define input | ||
o=!{sample}_!{suffix}_minimap2.sam | ||
ref=!{contaminant_ref} | ||
# Run minimap2 | ||
zcat !{reads} | minimap2 -a ${ref} /dev/fd/0 > ${o} | ||
# Link input to output for testing | ||
ln -s !{reads} !{sample}_in.fastq.gz | ||
''' | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
// Return reads that did not align to reference as FASTQ (streamed version) | ||
process SAMTOOLS_FILTER { | ||
label "samtools" | ||
label "small" | ||
input: | ||
tuple val(sample), path(sam) | ||
val(suffix) | ||
output: | ||
tuple val(sample), path("${sample}_${suffix}.fastq.gz"), emit: reads | ||
tuple val(sample), path("${sample}_in.sam"), emit: input | ||
shell: | ||
''' | ||
# Define output | ||
out=!{sample}_!{suffix}.fastq.gz | ||
var="fastq -n -f 4" | ||
# Execute samtools | ||
cat !{sam} | samtools ${var} - | gzip > ${out} | ||
# Link input to output for testing | ||
ln -s !{sam} !{sample}_in.sam | ||
''' | ||
} | ||
|
||
// Return aligned and unaligned reads separately as FASTQs | ||
process SAMTOOLS_SEPARATE { | ||
label "samtools" | ||
input: | ||
tuple val(sample), path(sam) | ||
val(suffix) | ||
output: | ||
tuple val(sample), path("${sample}_${suffix}_samtools_match.fastq.gz"), emit: match | ||
tuple val(sample), path("${sample}_${suffix}_samtools_nomatch.fastq.gz"), emit: nomatch | ||
tuple val(sample), path("${sample}_in.sam"), emit: input | ||
shell: | ||
''' | ||
# Define output | ||
om=!{sample}_!{suffix}_samtools_match.fastq.gz | ||
on=!{sample}_!{suffix}_samtools_nomatch.fastq.gz | ||
om_var="fastq -n -F 4" | ||
on_var="fastq -n -f 4" | ||
# Execute samtools | ||
cat !{sam} | samtools ${om_var} - | gzip > ${om} | ||
cat !{sam} | samtools ${on_var} - | gzip > ${on} | ||
# Link input to output for testing | ||
ln -s !{sam} !{sample}_in.sam | ||
''' | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ config { | |
load "[email protected]" | ||
load "[email protected]" | ||
load "[email protected]" | ||
load "[email protected]" | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,11 @@ include { ADD_FIXED_COLUMN as ADD_BRACKEN_NORIBO } from "../../../modules/local/ | |
include { CONCATENATE_TSVS as CONCATENATE_KRAKEN } from "../../../modules/local/concatenateTsvs" | ||
include { CONCATENATE_TSVS as CONCATENATE_BRACKEN } from "../../../modules/local/concatenateTsvs" | ||
|
||
if (params.ont) { | ||
include { MINIMAP2_ONT as MINIMAP2_RIBO } from "../../../modules/local/minimap2" | ||
include { SAMTOOLS_SEPARATE } from "../../../modules/local/samtools" | ||
} | ||
|
||
/**************** | ||
| MAIN WORKFLOW | | ||
****************/ | ||
|
@@ -27,13 +32,19 @@ workflow PROFILE { | |
ref_dir | ||
min_kmer_fraction | ||
k | ||
bbduk_suffix | ||
ribo_suffix | ||
bracken_threshold | ||
single_end | ||
main: | ||
// Separate ribosomal reads | ||
ribo_path = "${ref_dir}/results/ribo-ref-concat.fasta.gz" | ||
ribo_ch = BBDUK(reads_ch, ribo_path, min_kmer_fraction, k, bbduk_suffix, !single_end) | ||
if (params.ont) { | ||
ribo_path = "s3://nao-mgs-simon/ont-indices/2024-12-14/minimap2-ribo-index/ribo-ref-concat-unique.mmi" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is another reference that needs to be created in the index workflow and referenced in the index |
||
mapped_ch = MINIMAP2_RIBO(reads_ch, ribo_path, ribo_suffix) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Speaking from ignorance here, but why should BBDuk not work on ONT data? It's calculating a fraction of matching k-mers so read length per se shouldn't be fatal. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @simonleandergrimm I'd still like this to be addressed before we move forward here, I remain uneasy about switching to an alignment-based approach without having a good understanding of how this might affect the results. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @willbradshaw Can you say more about what your worry is? Mostly that minimap2 is more resource-intensive? I think the files are small enough that this doesn't matter that much. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On priors I'd assume minimap2 to be better than bbduk (I'd assume the latter needs a bunch of playing around to find the fraction of matching k-mers that gives equivalent performance to minimap2). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Finally, I'd be pretty keen to get the ONT version of the pipeline up and running fast and do optimization later (especially because this part of the pipeline shouldn't affect the Illumina runs). |
||
ribo_ch = SAMTOOLS_SEPARATE(mapped_ch, ribo_suffix) | ||
} else { | ||
ribo_path = "${ref_dir}/results/ribo-ref-concat.fasta.gz" | ||
ribo_ch = BBDUK(reads_ch, ribo_path, min_kmer_fraction, k, ribo_suffix, !single_end) | ||
} | ||
// Run taxonomic profiling separately on ribo and non-ribo reads | ||
tax_ribo_ch = TAXONOMY_RIBO(ribo_ch.match, kraken_db_ch, "D", bracken_threshold, single_end) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI I expect Kraken to do a bad job on generating a taxonomic profile here But I still think this PR is in the right direction: if we want to do better taxonomic profiling for ONT data that should be a separately-prioritized followup |
||
tax_noribo_ch = TAXONOMY_NORIBO(ribo_ch.nomatch, kraken_db_ch, "D", bracken_threshold, single_end) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
nextflow_process { | ||
|
||
name "Test process MINIMAP2_ONT" | ||
script "modules/local/minimap2/main.nf" | ||
process "MINIMAP2_ONT" | ||
config "tests/run_dev_se.config" | ||
tag "module" | ||
tag "minimap2" | ||
|
||
setup { | ||
run("LOAD_SAMPLESHEET") { | ||
script "subworkflows/local/loadSampleSheet/main.nf" | ||
process { | ||
""" | ||
input[0] = "${projectDir}/test-data/ont-samplesheet.csv" | ||
input[1] = true | ||
""" | ||
} | ||
} | ||
} | ||
|
||
test("When run against human index, should run without failures and return properly formatted SAM") { | ||
tag "expect_success" | ||
tag "single_end" | ||
when { | ||
params {} | ||
process { | ||
''' | ||
input[0] = LOAD_SAMPLESHEET.out.samplesheet | ||
input[1] = "s3://nao-mgs-simon/ont-indices/2024-12-14/minimap2-human-index/chm13v2.0.mmi" | ||
input[2] = "human" | ||
''' | ||
} | ||
} | ||
then { | ||
// Should run without failures | ||
assert process.success | ||
// Both @SQ headers and alignments should be present | ||
def nHeaders = ["bash", "-c", "cat " + process.out.sam[0][1] + " | grep -c '^@SQ'"].execute().text.trim() as Integer | ||
def nAlignments = ["bash", "-c", "cat " + process.out.sam[0][1] + " | grep -v '^@' | wc -l"].execute().text.trim() as Integer | ||
assert nHeaders > 0 | ||
assert nAlignments > 0 | ||
} | ||
} | ||
|
||
test("When run against ribo index, should run without failures and return properly formatted SAM") { | ||
tag "expect_success" | ||
tag "single_end" | ||
when { | ||
params {} | ||
process { | ||
''' | ||
input[0] = LOAD_SAMPLESHEET.out.samplesheet | ||
input[1] = "s3://nao-mgs-simon/ont-indices/2024-12-14/minimap2-ribo-index/ribo-ref-concat-unique.mmi" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
input[2] = "ribo" | ||
''' | ||
} | ||
} | ||
then { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As commented in another PR, I'd prefer a test of the actual content, not just that there is content. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is now fixed. |
||
// Should run without failures | ||
assert process.success | ||
// Both @SQ headers and alignments should be present | ||
def nHeaders = ["bash", "-c", "cat " + process.out.sam[0][1] + " | grep -c '^@SQ'"].execute().text.trim() as Integer | ||
def nAlignments = ["bash", "-c", "cat " + process.out.sam[0][1] + " | grep -v '^@' | wc -l"].execute().text.trim() as Integer | ||
assert nHeaders > 0 | ||
assert nAlignments > 0 | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
nextflow_process { | ||
|
||
name "Test process SAMTOOLS_FILTER" | ||
script "modules/local/samtools/main.nf" | ||
process "SAMTOOLS_FILTER" | ||
config "tests/run_dev_se.config" | ||
tag "module" | ||
tag "samtools" | ||
|
||
setup { | ||
run("LOAD_SAMPLESHEET") { | ||
script "subworkflows/local/loadSampleSheet/main.nf" | ||
process { | ||
""" | ||
input[0] = "${projectDir}/test-data/ont-samplesheet.csv" | ||
input[1] = true | ||
""" | ||
} | ||
} | ||
run("MINIMAP2_ONT") { | ||
script "modules/local/minimap2/main.nf" | ||
process { | ||
""" | ||
input[0] = LOAD_SAMPLESHEET.out.samplesheet | ||
input[1] = "s3://nao-mgs-simon/ont-indices/2024-12-14/minimap2-human-index/chm13v2.0.mmi" | ||
input[2] = "human" | ||
""" | ||
} | ||
} | ||
} | ||
|
||
test("When run on SAM file, should only return unaligned reads") { | ||
tag "expect_success" | ||
tag "single_end" | ||
when { | ||
params {} | ||
process { | ||
''' | ||
input[0] = MINIMAP2_ONT.out.sam | ||
input[1] = "no-human" | ||
''' | ||
} | ||
} | ||
then { | ||
// Should run without failures | ||
assert process.success | ||
|
||
// Output FASTQ ids should be identical to unmapped read ids in input SAM | ||
def fastq_out = path(process.out.reads[0][1]).fastq | ||
def read_ids_out = fastq_out.readNames.toSet() | ||
|
||
def samlines = sam(process.out.input[0][1]).getSamLines() | ||
def unmapped_read_ids = samlines | ||
.findAll { line -> line.split('\t')[1] == '4' } // Only keep lines where flag = 4 | ||
.collect { line -> line.split('\t')[0] } // Get read IDs | ||
.toSet() // Convert to Set to remove duplicates | ||
|
||
assert unmapped_read_ids == read_ids_out | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
nextflow_process { | ||
|
||
name "Test process SAMTOOLS_SEPARATE" | ||
script "modules/local/samtools/main.nf" | ||
process "SAMTOOLS_SEPARATE" | ||
config "tests/run_dev_se.config" | ||
tag "module" | ||
tag "samtools" | ||
|
||
setup { | ||
run("LOAD_SAMPLESHEET") { | ||
script "subworkflows/local/loadSampleSheet/main.nf" | ||
process { | ||
""" | ||
input[0] = "${projectDir}/test-data/ont-samplesheet.csv" | ||
input[1] = true | ||
""" | ||
} | ||
} | ||
run("MINIMAP2_ONT") { | ||
script "modules/local/minimap2/main.nf" | ||
process { | ||
""" | ||
input[0] = LOAD_SAMPLESHEET.out.samplesheet | ||
input[1] = "s3://nao-mgs-simon/ont-indices/2024-12-14/minimap2-human-index/chm13v2.0.mmi" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
input[2] = "human" | ||
""" | ||
} | ||
} | ||
} | ||
|
||
test("When run on a SAM file, should return aligned and unaligned reads separately") { | ||
tag "expect_success" | ||
tag "single_end" | ||
when { | ||
params {} | ||
process { | ||
''' | ||
input[0] = MINIMAP2_ONT.out.sam | ||
input[1] = "no-human" | ||
''' | ||
} | ||
} | ||
then { | ||
// Should run without failures | ||
assert process.success | ||
|
||
// The two FASTQ output files should contain read ids of mapped and unmapped reads, respectively | ||
def fastq_match = path(process.out.match[0][1]).fastq | ||
def fastq_nomatch = path(process.out.nomatch[0][1]).fastq | ||
def read_ids_match = fastq_match.readNames.toSet() | ||
def read_ids_nomatch = fastq_nomatch.readNames.toSet() | ||
|
||
def samlines = sam(process.out.input[0][1]).getSamLines() | ||
def unmapped_read_ids = samlines | ||
.findAll { line -> line.split('\t')[1] == '4' } // Only keep lines where flag = 4 | ||
.collect { line -> line.split('\t')[0] } // Get read IDs | ||
.toSet() // Convert to Set to remove duplicates | ||
def mapped_read_ids = samlines | ||
.findAll { line -> line.split('\t')[1] != '4' } // Only keep lines where flag != 4 | ||
.collect { line -> line.split('\t')[0] } // Get read IDs | ||
.toSet() | ||
|
||
assert read_ids_nomatch == unmapped_read_ids | ||
assert read_ids_match == mapped_read_ids | ||
|
||
// The two FASTQ files should not contain any overlapping read ids | ||
assert read_ids_match.intersect(read_ids_nomatch).size() == 0 | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This process is a superset of SAMTOOLS_FILTER. You don't need both.
(I would implement this as two calls to a general SAMTOOLS_FASTQ process with different input parameters, followed by a channel merge.)