Skip to content

Commit

Permalink
Merge pull request #17 from sanger-tol/update_hic_mapping
Browse files Browse the repository at this point in the history
Update hic mapping
  • Loading branch information
priyanka-surana authored Oct 3, 2023
2 parents 3bf48e2 + 839c2a5 commit e9d81e2
Show file tree
Hide file tree
Showing 32 changed files with 521 additions and 556 deletions.
39 changes: 39 additions & 0 deletions bin/generate_cram_csv.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash

#
# Based on https://github.com/sanger-tol/treeval/blob/80554a803903183613d49690d5770eeadb3c42c9/bin/generate_cram_csv.sh
# from Sanger TOL treeval pipeline
#
# Usage: generate_cram_csv.sh <file1.cram> [<file2.cram> ...]
#
# For every CRAM file given on the command line, print CSV rows to stdout:
#
#   <cram path>,<crai path>,<from>,<to>,<basename>,<chunk id>,<read group line>
#
# The number of lines in the gzip-compressed "<cram>.crai" index is used as the
# container count; it is split into ranges advancing by chunk_size, and one row
# is printed per range. The chunk id keeps increasing across all input files,
# so it is unique within one invocation.
#

chunk_size=10000
chunkn=0

for cram in "$@"; do

    # Read-group header of the CRAM. Only genuine @RG records are kept
    # (a bare "RG" pattern would also match e.g. @PG command lines).
    # Tabs are turned into a literal "\t" and single quotes are removed.
    # Should several @RG records exist they are joined with a single space so
    # that every CSV row stays on one line.
    rgline=$(samtools view -H "$cram" | grep '^@RG' | sed 's/\t/\\t/g' | sed "s/'//g" | paste -s -d ' ' -)

    crampath=$(readlink -f "$cram")
    crai="${crampath}.crai"

    ncontainers=$(zcat "$crai" | wc -l)
    base=$(basename "$cram" .cram)

    from=0
    to=$chunk_size

    # Full-size ranges: emitted while the upper bound is below the count.
    while [ "$to" -lt "$ncontainers" ]; do
        printf '%s\n' "${crampath},${crai},${from},${to},${base},${chunkn},${rgline}"
        from=$((to + 1))
        to=$((to + chunk_size))
        chunkn=$((chunkn + 1))
    done

    # Remainder: whatever is left between the last range and the count.
    if [ "$from" -le "$ncontainers" ]; then
        printf '%s\n' "${crampath},${crai},${from},${ncontainers},${base},${chunkn},${rgline}"
        chunkn=$((chunkn + 1))
    fi
done

exit 0
72 changes: 22 additions & 50 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -491,104 +491,76 @@ process {
}
// End of Set up of the polishing pipeline

// Set up of the Hi-C read mapping pipeline
withName: BWAMEM2_MEM {
// Set read group in the output file header as specified in meta;
// If the data is HiC then set for split alignment, take the alignment
// with the smallest coordinate as primary, skip mate rescue,
// skip pairing, perform smart pairing,
// and append FASTQ comment to SAM output
ext.args = { ( "${meta.datatype}" == "hic" ) ? "-5SPCp -R ${meta.read_group}" : "-R ${meta.read_group}" }
}

withName: SAMTOOLS_FASTQ {
// Filter out reads which fail platform/vendor quality checks;
// do not append /1 and /2 to the read name;
// copy RG, BC and QT tags to the FASTQ header line
ext.args = '-F0xB00 -nt'
}

withName: SAMTOOLS_SORT {
ext.prefix = { "${meta.id}.sorted" }
}

withName: SAMTOOLS_COLLATE {
ext.prefix = { "${meta.id}.collated" }
}

withName: SAMTOOLS_FIXMATE {
// Add mate score tag
ext.args = '-mp'
ext.prefix = { "${meta.id}.fixmate" }
}
// Set up of the scaffolding pipeline

withName: SAMTOOLS_MARKDUP {
ext.prefix = { "${meta.id}.markdup" }
withName: '.*HIC_MAPPING:SAMTOOLS_MARKDUP_HIC_MAPPING' {
ext.prefix = { "${meta.id}_mkdup" }
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: SAMTOOLS_VIEW {
ext.args = "--output-fmt cram"
withName: '.*HIC_MAPPING:SAMTOOLS_MERGE_HIC_MAPPING' {
ext.prefix = { "${meta.id}_merged" }
}

withName: '.*ALIGN_SHORT:MARKDUP_STATS:SAMTOOLS_VIEW_MARKDUP' {
// Return compressed BAM output
// Filter out reads classified as not primary alignment (0x100)
// read fails platform/vendor quality checks (0x200)
// read is PCR or optical duplicate (0x400)
// supplementary alignment (0x800)
ext.args = "-u -F0xf00 -e 'mapq>=10' --output-fmt bam"
withName: CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
ext.args = ''
ext.args1 = '-F0xB00 -nt'
ext.args2 = { "-5SPCp -H'${rglines}'" }
ext.args3 = '-mpu'
ext.args4 = { "-F0xf00 -e 'mapq>=10'" }
ext.args5 = { '--write-index -l1' }
}

withName: '.*ALIGN_SHORT:MARKDUP_STATS:BED_SORT' {
withName: '.*HIC_MAPPING:BAMTOBED_SORT' {
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: '.*ALIGN_SHORT:MARKDUP_STATS:CONVERT_STATS:SAMTOOLS_INDEX' {

withName: '.*HIC_MAPPING:CONVERT_STATS:SAMTOOLS_VIEW' {
ext.args = "--output-fmt cram"
}

withName: '.*HIC_MAPPING:CONVERT_STATS:SAMTOOLS_INDEX' {
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: '.*ALIGN_SHORT:MARKDUP_STATS:CONVERT_STATS:SAMTOOLS_STATS' {
withName: '.*HIC_MAPPING:CONVERT_STATS:SAMTOOLS_STATS' {
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: '.*ALIGN_SHORT:MARKDUP_STATS:CONVERT_STATS:SAMTOOLS_FLAGSTAT' {
withName: '.*HIC_MAPPING:CONVERT_STATS:SAMTOOLS_FLAGSTAT' {
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: '.*ALIGN_SHORT:MARKDUP_STATS:CONVERT_STATS:SAMTOOLS_IDXSTATS' {
withName: '.*HIC_MAPPING:CONVERT_STATS:SAMTOOLS_IDXSTATS' {
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: GNU_SORT {
ext.args = '-k4,4'
}
// End of Set up of the Hi-C read mapping pipeline

// Set up of the scaffolding pipeline
withName: 'YAHS' {
ext.prefix = 'out'
Expand Down
25 changes: 25 additions & 0 deletions conf/test_full_gsMetZobe1.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running full-size tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a full size pipeline test.
Use as follows:
nextflow run sanger-tol/genomeassembly -profile test_full,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

// Parameters applied when this test configuration is loaded.
params {
// Human-readable name and description of this profile
config_profile_name = 'Full test profile'
config_profile_description = 'Full test dataset to check pipeline function'

// Resource limits for this profile
// NOTE(review): presumably upper bounds enforced by the pipeline's base config - confirm
max_cpus = 28
max_memory = '100.GB'
max_time = '24.h'

// Input data for full size test
input = 'assets/test_gsMetZobe1.yaml'
// Feature switches enabled for this test
// NOTE(review): names suggest the polishing step and Hi-C mode of hifiasm - confirm against the workflow
polishing_on = true
hifiasm_hic_on = true
}
18 changes: 4 additions & 14 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,7 @@
},
"bwamem2/index": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"bwamem2/mem": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"git_sha": "bfed129da5134b4439b1821c917972570d44d39c",
"installed_by": ["modules"]
},
"cat/cat": {
Expand Down Expand Up @@ -178,12 +173,7 @@
},
"samtools/faidx": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"samtools/fastq": {
"branch": "master",
"git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
"git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe",
"installed_by": ["modules"]
},
"samtools/fixmate": {
Expand All @@ -208,12 +198,12 @@
},
"samtools/markdup": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"git_sha": "9e51255c4f8ec69fb6ccf68593392835f14fecb8",
"installed_by": ["modules"]
},
"samtools/merge": {
"branch": "master",
"git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
"git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013",
"installed_by": ["modules"]
},
"samtools/sort": {
Expand Down
47 changes: 47 additions & 0 deletions modules/local/bamtobed_sort.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
//
// Copied from https://github.com/sanger-tol/treeval/blob/28309b7a1faf3aee5627f497c7cfa62d12ac65b8/modules/local/bamtobed_sort.nf
// from Sanger TOL treeval pipeline
//


// Convert a BAM to BED and sort it on BED column 4.
//
// Input : tuple (meta, bam)
// Output: sorted_bed - tuple (meta, "<prefix>_merged_sorted.bed")
//         versions   - versions.yml with the samtools and bedtools versions
//
// <prefix> is task.ext.prefix when configured, otherwise meta.id.
process BAMTOBED_SORT {
    tag "$meta.id"
    label "process_high"

    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/mulled-v2-9d3a458f6420e5712103ae2af82c94d26d63f059:60b54b43045e8cf39ba307fd683c69d4c57240ce-0' :
        'biocontainers/mulled-v2-9d3a458f6420e5712103ae2af82c94d26d63f059:60b54b43045e8cf39ba307fd683c69d4c57240ce-0' }"

    input:
    tuple val(meta), path(bam)

    output:
    tuple val(meta), path("*.bed"), emit: sorted_bed
    path "versions.yml"           , emit: versions

    script:
    // Read the prefix from the task configuration; the previous `args.ext.prefix`
    // did not refer to the task and so never picked up a configured prefix.
    def prefix     = task.ext.prefix ?: "${meta.id}"
    // samtools view gets at most 4 threads
    def st_cores   = task.cpus > 4 ? 4 : "${task.cpus}"
    // Half of the task memory for the sort buffer, as a whole number of GB
    // (at least 1) so that `sort -S` never receives a fractional size such as 2.5G.
    def buffer_mem = Math.max(1L, (task.memory.toGiga().intdiv(2)) as long)
    """
    samtools view -@${st_cores} -u -F0x400 ${bam} | bamToBed | sort -k4 --parallel=${task.cpus} -S ${buffer_mem}G > ${prefix}_merged_sorted.bed

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
        bedtools: \$(bedtools --version | sed -e "s/bedtools v//g")
    END_VERSIONS
    """

    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    touch ${prefix}_merged_sorted.bed

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
        bedtools: \$(bedtools --version | sed -e "s/bedtools v//g")
    END_VERSIONS
    """
}
62 changes: 62 additions & 0 deletions modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
//
// Copied from https://github.com/sanger-tol/treeval/blob/28309b7a1faf3aee5627f497c7cfa62d12ac65b8/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
// from Sanger TOL treeval pipeline
//

// Extract one chunk of a CRAM, align it with bwa-mem2 and write a sorted BAM.
//
// The whole chain runs as a single shell pipeline:
//   cram_filter -> samtools fastq -> bwa-mem2 mem -> samtools fixmate
//               -> samtools view -> samtools sort
//
// Input : tuple (meta, cramfile, cramindex, from, to, base, chunkid, rglines, bwaprefix)
//         from/to   - range passed to `cram_filter -n`
//         base      - used in the temporary and output file names
//         chunkid   - used in the temporary and output file names
//         rglines   - header text handed to bwa-mem2 via -H
//         bwaprefix - bwa-mem2 index prefix
// Output: mappedbam - tuple (meta, "<prefix>_<base>_<chunkid>_mem.bam")
//         versions  - versions.yml with the samtools and bwa-mem2 versions
process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
    tag "$meta.id"
    label "process_high"

    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/mulled-v2-50d89b457e04ed90fa0cbf8ebc3ae1b9ffbc836b:caf993da1689e8d42f5e4c113ffc9ef81d26df96-0' :
        'biocontainers/mulled-v2-50d89b457e04ed90fa0cbf8ebc3ae1b9ffbc836b:caf993da1689e8d42f5e4c113ffc9ef81d26df96-0' }"

    input:
    tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), val(bwaprefix)

    output:
    tuple val(meta), path("*.bam"), emit: mappedbam
    path "versions.yml"           , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // NOTE: ext.args and ext.args2 are read here but are not interpolated into
    // the command below; the bwa-mem2 options are fixed in the pipeline itself.
    def args   = task.ext.args   ?: ''
    def args1  = task.ext.args1  ?: ''   // samtools fastq
    def args2  = task.ext.args2  ?: ''
    def args3  = task.ext.args3  ?: ''   // samtools fixmate
    def args4  = task.ext.args4  ?: ''   // samtools view
    def args5  = task.ext.args5  ?: ''   // samtools sort
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Please be aware one of the tools here required mem = 28 * reference size!!!
    """
    cram_filter -n ${from}-${to} ${cramfile} - | \\
        samtools fastq ${args1} | \\
        bwa-mem2 mem -p ${bwaprefix} -t${task.cpus} -5SPCp -H'${rglines}' - | \\
        samtools fixmate ${args3} - - | \\
        samtools view -bh ${args4} - | \\
        samtools sort ${args5} -@${task.cpus} -T ${base}_${chunkid}_sort_tmp -o ${prefix}_${base}_${chunkid}_mem.bam -

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
        bwa-mem2: \$(bwa-mem2 version | sed 's/bwa-mem2 //g')
    END_VERSIONS
    """
    // temp removal staden_io_lib: \$(echo \$(staden_io_lib --version 2>&1) | sed 's/^.*staden_io_lib //; s/Using.*\$//') CAUSES ERROR

    stub:
    // The stub names its output from the real `base` and `chunkid` inputs so
    // that each chunk yields a distinct file, exactly like the script section.
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    touch ${prefix}_${base}_${chunkid}_mem.bam

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
        bwa-mem2: \$(bwa-mem2 version | sed 's/bwa-mem2 //g')
    END_VERSIONS
    """
}
43 changes: 43 additions & 0 deletions modules/local/generate_cram_csv.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
//
// Based on https://github.com/sanger-tol/treeval/blob/28309b7a1faf3aee5627f497c7cfa62d12ac65b8/modules/local/generate_cram_csv.nf
// from Sanger TOL treeval pipeline
//

// Run generate_cram_csv.sh on the staged CRAM paths and capture its stdout
// as a CSV file.
//
// Input : tuple (meta, crampaths) - every path is staged into its own
//         sub-directory (stageAs "?/*")
// Output: csv      - tuple (meta, "<prefix>_cram.csv")
//         versions - versions.yml with the samtools version
//
// <prefix> is task.ext.prefix when configured, otherwise meta.id.
process GENERATE_CRAM_CSV {
    tag "${meta.id}"
    label 'process_low'

    conda "bioconda::samtools=1.17"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
        'biocontainers/samtools:1.17--h00cdaf9_0' }"

    input:
    tuple val(meta), path(crampaths, stageAs: "?/*")

    output:
    tuple val(meta), path('*.csv'), emit: csv
    path "versions.yml"           , emit: versions

    script:
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    generate_cram_csv.sh ${crampaths} > ${prefix}_cram.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
    END_VERSIONS
    """

    stub:
    // Same file name as the script section, so stub runs feed downstream
    // steps with an identically named CSV.
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    touch ${prefix}_cram.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
    END_VERSIONS
    """
}
Loading

0 comments on commit e9d81e2

Please sign in to comment.