diff --git a/assets/samplesheet_s3.csv b/assets/samplesheet_s3.csv
index ff9641b..1ac6410 100644
--- a/assets/samplesheet_s3.csv
+++ b/assets/samplesheet_s3.csv
@@ -3,5 +3,5 @@ mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_d
 mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4%231.subset.fastq.gz,
 mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/hic/35528_2%231.subset.cram,
 mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/ont/PAE35587_pass_1f1f0707_115.subset.fastq.gz,
-mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200910_173211.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam,
+mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200910_173211.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam,uli
 mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200911_174739.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.fastq.gz,
diff --git a/conf/base.config b/conf/base.config
index ca753f3..5ee5555 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -46,6 +46,11 @@ process {
         memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
     }
 
+    withName: 'PACBIO_PBMARKDUP' {
+        time = { check_max( 4.hour * task.attempt, 'time' ) }
+        memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
+    }
+
     withName: 'SAMTOOLS_SORMADUP' {
         cpus = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
         memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) }
diff --git a/conf/modules.config b/conf/modules.config
index 44d6cf0..7e5fe6c 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -41,6 +41,10 @@ process {
         ext.args = { (params.use_work_dir_as_temp ? "-T." : "") }
     }
 
+    withName: PACBIO_PBMARKDUP {
+        ext.args = { "--rmdup" }
+    }
+
     withName: BLAST_BLASTN {
         ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6'
     }
@@ -50,11 +54,11 @@ process {
         ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index"
     }
 
-    // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp.
-    // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes
-    // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp
-    // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values.
-    // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes
+    // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp.
+    // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes
+    // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp
+    // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values.
+    // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes
     withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' {
         ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' }
     }
diff --git a/modules/local/pbmarkdup.nf b/modules/local/pbmarkdup.nf
new file mode 100644
index 0000000..e5eed00
--- /dev/null
+++ b/modules/local/pbmarkdup.nf
@@ -0,0 +1,42 @@
+// Run pbmarkdup on one PacBio read file; extra command-line flags come from task.ext.args
+process PACBIO_PBMARKDUP {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::pbmarkdup==1.0.3--h9ee0642_0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pbmarkdup:1.0.3--h9ee0642_0' :
+        'biocontainers/pbmarkdup:1.0.3--h9ee0642_0' }"
+
+    input:
+    tuple val(meta), path(input)
+
+    output:
+    // Capture exactly the file the script writes, instead of a glob that mixed in a regex literal
+    tuple val(meta), path("${prefix}.${suffix}"), emit: output
+    path "versions.yml"                         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+
+    // No `def` here: prefix and suffix must stay visible to the output block above
+    prefix = task.ext.prefix ?: "${meta.id}"
+    // Keep compound extensions such as fastq.gz; getExtension() alone would return just "gz"
+    def fname = input.getName()
+    suffix = fname.endsWith('.gz') ? fname.tokenize('.').takeRight(2).join('.') : input.getExtension()
+
+    """
+    pbmarkdup \\
+        $input \\
+        ${prefix}.${suffix} \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pbmarkdup: \$(echo \$(pbmarkdup --version 2>&1) | awk 'BEGIN{FS=" "}{print \$2}')
+    END_VERSIONS
+    """
+}
diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf
index f472a6c..a012065 100644
--- a/subworkflows/local/align_pacbio.nf
+++ b/subworkflows/local/align_pacbio.nf
@@ -2,10 +2,9 @@
 // Align PacBio read files against the genome
 //
 
-include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio'
-include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main'
-include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
-
+include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio'
+include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main'
+include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
 
 workflow ALIGN_PACBIO {
     take:
diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf
index acb21fa..58727fa 100644
--- a/subworkflows/local/filter_pacbio.nf
+++ b/subworkflows/local/filter_pacbio.nf
@@ -7,6 +7,7 @@ include { SAMTOOLS_VIEW as SAMTOOLS_CONVERT } from '../../modules/nf-core/samtoo
 include { SAMTOOLS_COLLATETOFASTA } from '../../modules/local/samtools_collatetofasta'
 include { BLAST_BLASTN } from '../../modules/nf-core/blast/blastn/main'
 include { PACBIO_FILTER } from '../../modules/local/pacbio_filter'
+include { PACBIO_PBMARKDUP } from '../../modules/local/pbmarkdup'
 include { SAMTOOLS_FILTERTOFASTQ } from '../../modules/local/samtools_filtertofastq'
 include { SEQKIT_FQ2FA } from '../../modules/nf-core/seqkit/fq2fa'
 include { SEQTK_SUBSEQ } from '../../modules/nf-core/seqtk/subseq'
@@ -22,8 +23,26 @@ workflow FILTER_PACBIO {
     ch_versions = Channel.empty()
 
 
-    // Check file types and branch
+    // Branch for handling ultra low-input (uli) libraries
     reads
+    | branch {
+        meta, reads ->
+            uli : meta.library == "uli"
+            other : true
+    }
+    | set { ch_reads_branched }
+
+    // Mark/remove duplicates in the uli read sets only
+    PACBIO_PBMARKDUP ( ch_reads_branched.uli )
+    ch_versions = ch_versions.mix ( PACBIO_PBMARKDUP.out.versions.first() )
+
+    PACBIO_PBMARKDUP.out.output
+    | mix ( ch_reads_branched.other )
+    | set { ch_reads_all }
+
+
+    // Check file types and branch (pbmarkdup output for uli reads plus all other reads)
+    ch_reads_all
     | branch {
         meta, reads ->
             fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ }
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 3465448..f123b4d 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -39,6 +39,7 @@ def create_data_channel ( LinkedHashMap row, datafile, stats ) {
     def meta = [:]
     meta.id = row.sample
     meta.datatype = row.datatype
+    meta.library = row.library
 
     if ( meta.datatype == "hic" || meta.datatype == "illumina" ) {
         platform = "ILLUMINA"