diff --git a/modules/ktest/mapping/.gitignore b/modules/ktest/mapping/.gitignore new file mode 100644 index 0000000..731e034 --- /dev/null +++ b/modules/ktest/mapping/.gitignore @@ -0,0 +1 @@ +bin/__pycache__ \ No newline at end of file diff --git a/modules/ktest/mapping/README.md b/modules/ktest/mapping/README.md new file mode 100644 index 0000000..f143b90 --- /dev/null +++ b/modules/ktest/mapping/README.md @@ -0,0 +1,102 @@ +# Mapping module +## 1. Input channels + + + +++++ + + + + + + + + + + + + + + + + + + + + +
ChannelValueExample
Data channel- rg_id: Group ID / Sample ID
- sample_name: Name of sample
- library_id: Unique ID of library of sample
- lane: Lane on sequencer
- platform: Platform of sequencer (Illumina/MGI)
- machine: Name of the sequencer, e.g. Hiseq X
- orient: The orientation, forward or reverse (1 or 2)
- object: Organism the sequences come from, e.g. human, shrimp, ...
- path: Path of fastq files
[rg_id, sample_name, library_id, lane, platform, machine, orient, object, path]

Data channel
Reference channel- bwa: Folder containing the index set of the reference genome (indexed with BWA)[bwa]

Reference channel
+ + +## 2. Output channels + + +++++ + + + + + + + + + + + + + + +
ChannelValueExample
from_mapping- object: Organism the sequences come from, e.g. human, shrimp, ...
- rg_id: Group ID / Sample ID
- library_id: Unique ID of library of sample
- dedup_bam: BAM file after duplicate marking
- dedup_bai: Bai file, index of bam file
[val(object), val(rg_id), val(library_id), path(dedup_bam), path(dedup_bai)]

+
+ +## 3. Processes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ProcessInput ChannelOutput ChannelDescription
CAT_FILE[val(object), val(rg_id), val(library_id), val(platform), val(machine), val(orient), [val(path1), val(path2)]][val(object), val(rg_id), val(library_id), val(platform), val(machine), val(orient), path(cat_orient_fastq_gz)]Concatenate FASTQ files that come from the same sample
FASTQC[val(rg_id), val(library_id), path(fastq_path)]Coming soon...Run fastqc
MAP_BAM[val(object), val(rg_id), val(library_id), val(platform), val(machine), path(fastq_path), path(bwa_ref)][val(object), val(rg_id), val(library_id), path(pe_sorted_bam)]Mapping and sorting
BAM_INDEX[val(object), val(rg_id), val(library_id), path(dedup_bam)][val(object), val(rg_id), val(library_id), path(dedup_bai)]Indexing bam file
diff --git a/modules/ktest/mapping/conf/base.config b/modules/ktest/mapping/conf/base.config
new file mode 100755
index 0000000..b7b2a79
--- /dev/null
+++ b/modules/ktest/mapping/conf/base.config
@@ -0,0 +1,5 @@
+// Parameters shared by every module.
+params{
+    // Directory used as singularity.cacheDir in nextflow.config
+    cache_sing_folder = "/home/ktest/pipeline_env/software/truongphi"
+}
\ No newline at end of file
diff --git a/modules/ktest/mapping/conf/input.config b/modules/ktest/mapping/conf/input.config
new file mode 100755
index 0000000..9e788c7
--- /dev/null
+++ b/modules/ktest/mapping/conf/input.config
@@ -0,0 +1,5 @@
+// Input parameters.
+params{
+    // Path of the tab-separated sample sheet read by main.nf
+    samplesheet = ""
+}
\ No newline at end of file
diff --git a/modules/ktest/mapping/conf/mapping.config b/modules/ktest/mapping/conf/mapping.config
new file mode 100755
index 0000000..67b7a18
--- /dev/null
+++ b/modules/ktest/mapping/conf/mapping.config
@@ -0,0 +1,7 @@
+// Defaults for the mapping module.
+params {
+    // Folder searched by main.nf for the reference FASTA and its index files
+    ref_folder = '/home/ktest/project/trucle/LCWGS-4/LCWGS-28/reference'
+    // Sample sheet; this file is included after conf/input.config in nextflow.config
+    samplesheet = '/home/ktest/project/trucle/LCWGS-8/LCWGS-18/data_test/sample_sheet1.tsv'
+}
\ No newline at end of file
diff --git a/modules/ktest/mapping/conf/test/test1.config b/modules/ktest/mapping/conf/test/test1.config
new file mode 100644
index 0000000..ba16095
--- /dev/null
+++ b/modules/ktest/mapping/conf/test/test1.config
@@ -0,0 +1,4 @@
+// Parameters loaded by the `test1` profile of nextflow.config.
+params{
+    samplesheet = "$projectDir/tests/test_sample_sheet.tsv"
+}
\ No newline at end of file
diff --git a/modules/ktest/mapping/main.nf b/modules/ktest/mapping/main.nf
new file mode 100644
index 0000000..82da729
--- /dev/null
+++ b/modules/ktest/mapping/main.nf
@@ -0,0 +1,25 @@
+#!/usr/bin/env nextflow
+include { MAPPING } from "./mapping.nf"
+
+// Entry workflow: build the sample and reference channels, then run MAPPING.
+workflow{
+    //INPUT CHANNEL
+    input_channel = Channel.fromPath("$params.samplesheet")
+                        .splitCsv(skip: 1, sep: '\t')
+    // [rg_id, sample_name, library_id, lane, platform, machine, orient, object, path]
+
+    params.ref_pattern = "$params.ref_folder/*.{fa,fa.amb,fa.ann,fa.bwt,fa.fai,fa.pac,fa.sa}"
+
+    // Gather every matching reference/index file into one list item
+    reference_channel = Channel.fromPath(params.ref_pattern).collect().map{[it]} // [bwa_ref]
+
+    //MAPPING
+    MAPPING(
+        input_channel,
+        reference_channel
+    )
+
+    //EMIT
+    //MAPPING.out.from_mapping // [val(object), val(rg_id), val(library_id), path(dedup_bam), path(dedup_bai)]
+
+}
diff --git a/modules/ktest/mapping/mapping.nf b/modules/ktest/mapping/mapping.nf
new file mode 100644
index 0000000..e090e18
--- /dev/null
+++ b/modules/ktest/mapping/mapping.nf
@@ -0,0 +1,66 @@
+include { FASTQC } from "./modules/fastqc.nf"
+include { CAT_FILE } from "./modules/cat_file.nf"
+include { MAP_BAM } from "./modules/map_bam.nf"
+include { MARKDUPLICATES } from "./modules/markduplicates.nf"
+include { BAM_INDEX } from "./modules/bam_index.nf"
+
+// Sub-workflow: concatenate FASTQ files that share a sample/library/orientation,
+// run FastQC, map with BWA, mark duplicates and index the resulting BAM.
+workflow MAPPING{
+    take:
+    input_channel       // [rg_id, sample_name, library_id, lane, platform, machine, orient, object, path]
+    reference_channel   // [bwa_ref]
+
+    main:
+
+    // Cat fastq file
+    input_channel.map{ [it[7],it[0],it[2],it[4],it[5],it[6],it[-1]] } // [object, rg_id, library_id, platform, machine, orient, path]
+        .groupTuple(by: [0,1,2,3,4,5], sort: true)
+        .branch{
+            cat: it[-1].size() > 1 // [object, rg_id, library_id, platform, machine, orient, [path1, path2]]
+            non_cat: true
+                return it.flatten() // [object, rg_id, library_id, platform, machine, orient, path]
+        }
+        .set{cat_filter}
+
+
+    CAT_FILE(
+        cat_filter.cat
+    )
+
+    // Combine raw input
+    raw_input = cat_filter.non_cat
+                    .concat(CAT_FILE.out) // [object, rg_id, library_id, platform, machine, orient, path]
+
+    // Fastqc
+    FASTQC(
+        raw_input.map{
+            [it[1], it[2], it[-1]]
+        } // [val(rg_id), val(library_id), path(fastq_path)]
+    )
+
+
+    MAP_BAM(
+        raw_input.groupTuple(by: [0,1,2,3,4], sort:true) // [object, rg_id, library_id, platform, machine, orient, path]
+            .map{
+                it[0..4] + [it[-1]] // [object, rg_id, library_id, platform, machine, [path1, path2]]
+            }.combine(reference_channel) // [object, rg_id, library_id, platform, machine, [path1, path2], bwa_ref]
+
+
+    )
+
+    MARKDUPLICATES(
+        MAP_BAM.out.sorted // [val(object), val(rg_id), val(library_id), path("${rg_id}_${library_id}.pe.sorted.bam")]
+    )
+
+    BAM_INDEX(
+        MARKDUPLICATES.out.dedup_bam // [val(object), val(rg_id), val(library_id), path("${rg_id}_${library_id}.dedup.bam")]
+    )
+
+    emit:
+    from_mapping = MARKDUPLICATES.out.dedup_bam
+                        .combine(
+                            BAM_INDEX.out.dedup_bai, by: [0,1,2]
+                        ) // [val(object), val(rg_id), val(library_id), path(dedup_bam), path(dedup_bai)]
+
+}
\ No newline at end of file
diff --git a/modules/ktest/mapping/modules/bam_index.nf b/modules/ktest/mapping/modules/bam_index.nf
new file mode 100644
index 0000000..53bfa8e
--- /dev/null
+++ b/modules/ktest/mapping/modules/bam_index.nf
@@ -0,0 +1,22 @@
+// Build the index of a duplicate-marked BAM with Picard BuildBamIndex.
+process BAM_INDEX{
+    tag "$rg_id"
+
+    container "quay.io/biocontainers/picard:3.1.1--hdfd78af_0"
+    memory { 30.GB * task.attempt }
+    cpus { 5 * task.attempt }
+
+    input:
+    tuple val(object), val(rg_id), val(library_id), path(dedup_bam)
+
+    output:
+    tuple val(object), val(rg_id), val(library_id), path("${rg_id}_${library_id}.dedup.bai"), emit: dedup_bai
+
+    script:
+    // Name the index explicitly so it always matches the declared output path
+    """
+    picard BuildBamIndex \
+        INPUT=$dedup_bam \
+        OUTPUT=${rg_id}_${library_id}.dedup.bai
+    """
+}
\ No newline at end of file
diff --git a/modules/ktest/mapping/modules/cat_file.nf b/modules/ktest/mapping/modules/cat_file.nf
new file mode 100644
index 0000000..4a9a3fd
--- /dev/null
+++ b/modules/ktest/mapping/modules/cat_file.nf
@@ -0,0 +1,36 @@
+// Concatenate the FASTQ files that share sample, library and orientation into one file.
+process CAT_FILE{
+    tag "$rg_id"
+
+    container "ubuntu:rolling"
+    memory { 20.GB * task.attempt }
+    cpus { 4 * task.attempt }
+
+    input:
+    tuple val(object),
+            val(rg_id),
+            val(library_id),
+            val(platform),
+            val(machine),
+            val(orient),
+            path(fastq_files)
+
+    output:
+    tuple val(object),
+            val(rg_id),
+            val(library_id),
+            val(platform),
+            val(machine),
+            val(orient),
+            path("${library_id}_${rg_id}_cat_${orient}.fastq.gz")
+
+    """
+    cat ${fastq_files} > ${library_id}_${rg_id}_cat_${orient}.fastq.gz
+
+    # Log the read count (line count / 4) of every .gz file in the task directory
+    for filename in `ls *.gz`; do
+        read_num=\$(zcat \$filename | echo \$((`wc -l`/4)))
+        echo "\$filename has: \$read_num reads"
+    done
+    """
+}
\ No newline at end of file
diff --git a/modules/ktest/mapping/modules/fastqc.nf b/modules/ktest/mapping/modules/fastqc.nf
new file mode 100644
index 0000000..7c0b201
--- /dev/null
+++ b/modules/ktest/mapping/modules/fastqc.nf
@@ -0,0 +1,17 @@
+// Run FastQC on one FASTQ file. No output is declared, so nothing is passed downstream.
+process FASTQC{
+    tag "$rg_id"
+
+    container "phinguyen2000/fastqc_v0.12.1:v0.1.0"
+    memory { 30.GB * task.attempt }
+    cpus { 16 * task.attempt }
+
+
+    input:
+    tuple val(rg_id), val(library_id), path(fastq_path)
+
+
+    """
+    fastqc --threads ${task.cpus} $fastq_path
+    """
+}
\ No newline at end of file
diff --git a/modules/ktest/mapping/modules/map_bam.nf b/modules/ktest/mapping/modules/map_bam.nf
new file mode 100644
index 0000000..07b9511
--- /dev/null
+++ b/modules/ktest/mapping/modules/map_bam.nf
@@ -0,0 +1,27 @@
+// Map reads with `bwa mem` and pipe the alignments into `samtools sort`.
+process MAP_BAM{
+    tag "$object:$rg_id"
+
+    container "phinguyen2000/mapping:v0.1.0"
+    memory { 30.GB * task.attempt }
+    cpus { 8 * task.attempt }
+
+    input:
+    tuple val(object), val(rg_id), val(library_id), val(platform), val(machine), path(fastq_path), path(bwa_ref)
+
+    output:
+    tuple val(object), val(rg_id), val(library_id), path("${rg_id}_${library_id}.pe.sorted.bam"), emit: sorted
+
+    script:
+    // The collected reference files arrive in no guaranteed order: select the
+    // FASTA (index prefix) by its .fa suffix, falling back to the first file.
+    def ref_fasta = bwa_ref.find { it.toString().endsWith('.fa') } ?: bwa_ref[0]
+    """
+    threads=${task.cpus}
+    bwa mem -t \$threads\
+        -R "@RG\\tID:${rg_id}\\tLB:${library_id}\\tPL:${platform}\\tPM:${machine}\\tSM:${rg_id}"\
+        -M ${ref_fasta} \
+        $fastq_path | samtools sort -@\$threads -o ${rg_id}_${library_id}.pe.sorted.bam
+
+    """
+}
\ No newline at end of file
diff --git a/modules/ktest/mapping/modules/markduplicates.nf b/modules/ktest/mapping/modules/markduplicates.nf
new file mode 100644
index 0000000..7c2918a
--- /dev/null
+++ b/modules/ktest/mapping/modules/markduplicates.nf
@@ -0,0 +1,21 @@
+// Run GATK MarkDuplicates on a sorted BAM; metrics are written to a separate text file.
+process MARKDUPLICATES{
+    tag "$object:$rg_id"
+
+    container "phinguyen2000/gatk_tabix:v0.1.0"
+    memory { 30.GB * task.attempt }
+    cpus { 10 * task.attempt }
+
+    input:
+    tuple val(object), val(rg_id), val(library_id), path(pe_sorted_bam)
+
+    output:
+    tuple val(object), val(rg_id), val(library_id), path("${rg_id}_${library_id}.dedup.bam"), emit: dedup_bam
+
+    """
+    gatk MarkDuplicates \
+        -I $pe_sorted_bam \
+        -O ${rg_id}_${library_id}.dedup.bam \
+        -M ${rg_id}_${library_id}.dedup.metrics.txt
+    """
+}
\ No newline at end of file
diff --git a/modules/ktest/mapping/nextflow.config b/modules/ktest/mapping/nextflow.config
new file mode 100755
index 0000000..ae16b24
--- /dev/null
+++ b/modules/ktest/mapping/nextflow.config
@@ -0,0 +1,44 @@
+// Load params for inputs
+includeConfig 'conf/input.config'
+
+// Load params common for all modules
+includeConfig 'conf/base.config'
+
+// Load config for modules
+includeConfig 'conf/mapping.config'
+
+nextflow.enable.dsl = 2
+
+tower {
+    enabled = true
+    accessToken = "$TOWER_ACCESS_TOKEN"
+    workspaceId = '222915005021784'
+}
+
+
+profiles{
+    cluster {
+        executor{
+            name = 'slurm'
+            queueSize = 30
+        }
+    }
+
+    test1 { includeConfig 'conf/test/test1.config' }
+}
+
+singularity{
+    enabled = true
+    cacheDir = "$params.cache_sing_folder"
+    runOptions = "--bind /home"
+}
+
+process{
+
+    // Retry only on exit codes 137-140; any other failure terminates the run
+    errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+    maxRetries = 3
+
+    queue = 'dev'
+    maxForks = 30
+}
\ No newline at end of file
diff --git a/modules/ktest/mapping/tests/test_sample_sheet.tsv b/modules/ktest/mapping/tests/test_sample_sheet.tsv
new file mode 100644
index 0000000..ae31c73
--- /dev/null
+++ b/modules/ktest/mapping/tests/test_sample_sheet.tsv
@@ -0,0 +1,3 @@
+rg_id	sample_name	library_id	lane	platform	machine	orient	object	path
+SRR14775139	A01	RANDOM	S1	ILLUMINA	Novaseq	1	pig	/home/ktest/project/trucle/LCWGS-8/LCWGS-18/data_test/SRR14775139_1.fastq.gz
+SRR14775139	A01	RANDOM	S1	ILLUMINA	Novaseq	2	pig	/home/ktest/project/trucle/LCWGS-8/LCWGS-18/data_test/SRR14775139_2.fastq.gz