From 51a3579097132d77adeccde252860c0604d3fb9c Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Sat, 6 Jul 2024 16:37:02 +0000 Subject: [PATCH] Truongphikt add module calling from KTest-VN/calling@e461b84 --- modules/ktest/calling/.gitignore | 1 + modules/ktest/calling/README.md | 167 ++++++++++++++++++ modules/ktest/calling/calling.nf | 74 ++++++++ modules/ktest/calling/conf/base.config | 3 + modules/ktest/calling/conf/calling.config | 5 + modules/ktest/calling/conf/input.config | 3 + .../ktest/calling/conf/ktest_cluster.config | 19 ++ modules/ktest/calling/conf/test.config | 4 + .../ktest/calling/conf/test/test_human.config | 3 + .../ktest/calling/conf/test/test_pig.config | 6 + modules/ktest/calling/main.nf | 21 +++ modules/ktest/calling/modules/apply_bqsr.nf | 28 +++ .../calling/modules/base_recalibrator.nf | 32 ++++ .../ktest/calling/modules/call_variants.nf | 24 +++ .../ktest/calling/modules/draft_calling.nf | 25 +++ modules/ktest/calling/modules/draft_join.nf | 36 ++++ modules/ktest/calling/modules/joining.nf | 34 ++++ modules/ktest/calling/modules/split_chr.nf | 21 +++ modules/ktest/calling/nextflow.config | 26 +++ .../ktest/calling/tests/test_sample_human.tsv | 4 + .../ktest/calling/tests/test_sample_pig.tsv | 2 + 21 files changed, 538 insertions(+) create mode 100644 modules/ktest/calling/.gitignore create mode 100644 modules/ktest/calling/README.md create mode 100644 modules/ktest/calling/calling.nf create mode 100644 modules/ktest/calling/conf/base.config create mode 100644 modules/ktest/calling/conf/calling.config create mode 100644 modules/ktest/calling/conf/input.config create mode 100644 modules/ktest/calling/conf/ktest_cluster.config create mode 100644 modules/ktest/calling/conf/test.config create mode 100644 modules/ktest/calling/conf/test/test_human.config create mode 100644 modules/ktest/calling/conf/test/test_pig.config create mode 100644 modules/ktest/calling/main.nf create mode 100644 modules/ktest/calling/modules/apply_bqsr.nf create mode 
100644 modules/ktest/calling/modules/base_recalibrator.nf create mode 100644 modules/ktest/calling/modules/call_variants.nf create mode 100644 modules/ktest/calling/modules/draft_calling.nf create mode 100644 modules/ktest/calling/modules/draft_join.nf create mode 100644 modules/ktest/calling/modules/joining.nf create mode 100644 modules/ktest/calling/modules/split_chr.nf create mode 100755 modules/ktest/calling/nextflow.config create mode 100644 modules/ktest/calling/tests/test_sample_human.tsv create mode 100755 modules/ktest/calling/tests/test_sample_pig.tsv diff --git a/modules/ktest/calling/.gitignore b/modules/ktest/calling/.gitignore new file mode 100644 index 0000000..731e034 --- /dev/null +++ b/modules/ktest/calling/.gitignore @@ -0,0 +1 @@ +bin/__pycache__ \ No newline at end of file diff --git a/modules/ktest/calling/README.md b/modules/ktest/calling/README.md new file mode 100644 index 0000000..7288076 --- /dev/null +++ b/modules/ktest/calling/README.md @@ -0,0 +1,167 @@ +# Calling module + +Calling variants from mark duplicated bam file ([PRS-62](https://ktest-dattn.atlassian.net/browse/PRS-62)). + +------------------ +# 1. Usages +Input params (view `conf/input.config`): +- `from_mapping_csv` path of sample sheet + +Calling params (view `conf/calling.config`): + +- `folder_ref`: the path of the folder contains reference genome +- `genome_name`: file name of fasta genome reference laid in folder_ref +- `human_knownsite_vcf`: Known-sites reference for BASE_RECALIBRATOR process in module Calling. + + +# 2. Channels +## 2.1 Input channels + + +++++ + + + + + + + + + + + + + +
ChannelValueExample
from_mapping + - key: key from higher hierarchical structure (e.g. null, 'arrayA_batch4', ...)
+ - object: Which object's sequence? e.g. human, shrimp, ...
+ - rg_id: Group ID / Sample ID
+ - dedup_bam: Path of the BAM file after duplicate marking
+ - dedup_bai: Path of the *.bai file, the index of the BAM file.
+
([val(key), val(object), val(rg_id), path(dedup_bam), path(dedup_bai)])
+
+ +## 2.2 Reference channels + + +++++ + + + + + + + + + + + + + + + +
ChannelValueExample
calling_reference + - folder_ref: Folder containing the reference genome and its index files *.{fa, fa.fai, dict}.
+ - genome_ref: File name of the reference genome FASTA located in folder_ref (e.g. Homo_sapiens.GRCh38.dna.primary_assembly.fa).
+ - human_knownsite_vcf: Known-sites reference for BASE_RECALIBRATOR process in module Calling. +
+ ([folder_ref, genome_ref, human_knownsite_vcf]) +
+ +## 2.3. Output channels + + +++++ + + + + + + + + + + + + + + +
ChannelValueExample
split_vcf + - key: key from higher hierarchical structure (e.g. null, 'arrayA_batch4', ...) + - object: Which object's sequence? e.g. human, shrimp, ...
+ - chr: Chromosome number (1→22) + - split_vcf: Cohort VCF file split by chromosome, together with its index
+
[val(key), val(object), val(chr), [path(split_vcf), path(split_vcf_tbi)]]
+ +## 2.4. Processes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ProcessInput ChannelOutput ChannelDescription
DRAFT_CALLING[val(key), val(object), val(rg_id), path(dedup_bam), path(dedup_bai), path(folder_ref), val(genome_name)][val(key), val(object), val(rg_id), path(raw_variants_vcf)]Draft variant calling
DRAFT_JOIN[val(key), val(object), val(rg_ids), path(raw_variants_vcf), path(folder_ref), val(genome_name)][val(key), val(object), path(joint_genotyped_draft{vcf.gz,vcf.gz.tbi})]Join draft variant VCF files into a draft cohort VCF file
BASE_RECALIBRATOR[val(key), val(object), val(rg_id), path(dedup_bam), path(dedup_bai), path(joint_genotyped_draft_vcf), path(folder_ref), val(genome_name), path(human_knownsite_vcf)][val(key), val(object), val(rg_id), path(recal_data_table)]Build the base quality score recalibration table from the BAM file, the reference genome and the known-sites VCF
APPLY_BQSR[val(key), val(object), val(sample_id), path(dedup_bam), path(dedup_bai), path(recal_data_table), path(folder_ref), val(genome_name)][val(key), val(object), val(sample_id), path(recal_bam)]Apply base quality score recalibration
CALL_VARIANTS[val(key), val(object), val(rg_id), path(recal_bam), path(folder_ref), val(genome_name)][val(key), val(object), val(rg_id), path(variants_recal_vcf{vcf.gz,vcf.gz.tbi})]Call germline SNPs and indels via local re-assembly of haplotypes
JOINING[val(key), val(object), val(rg_ids), path(variants_vcfs), path(variants_vcf_tbis), path(folder_ref), val(genome_name)][val(key), val(object), path(joint_genotyped{vcf.gz,vcf.gz.tbi})]Join per-sample variant VCF files into a cohort VCF file
SPLIT_CHR[val(key), val(object), path(joint_genotyped), val(chr)][val(key), val(object), val(chr), path(split_vcf{vcf.gz,vcf.gz.tbi})]Split the cohort vcf file by chromosome
diff --git a/modules/ktest/calling/calling.nf b/modules/ktest/calling/calling.nf
new file mode 100644
index 0000000..fe068a0
--- /dev/null
+++ b/modules/ktest/calling/calling.nf
@@ -0,0 +1,78 @@
+include { DRAFT_CALLING } from "./modules/draft_calling.nf"
+include { DRAFT_JOIN } from "./modules/draft_join.nf"
+include { BASE_RECALIBRATOR } from "./modules/base_recalibrator.nf"
+include { APPLY_BQSR } from "./modules/apply_bqsr.nf"
+include { CALL_VARIANTS } from "./modules/call_variants.nf"
+include { JOINING } from "./modules/joining.nf"
+include { SPLIT_CHR } from "./modules/split_chr.nf"
+
+// CALLING: per-sample base recalibration and GVCF calling, joint genotyping, then a per-chromosome split.
+// Samples whose object is 'human' carry a /dev/null placeholder so that BASE_RECALIBRATOR uses the
+// supplied known-sites VCF; every other object first goes through DRAFT_CALLING/DRAFT_JOIN to build
+// a draft cohort VCF that BASE_RECALIBRATOR uses as known sites instead.
+workflow CALLING{
+    take:
+    from_mapping        // ([val(key), val(object), val(rg_id), path(dedup_bam), path(dedup_bai)])
+    calling_reference   // ([path(folder_ref), val(genome_ref_name), [human_knownsite_vcf, human_knownsite_vcf_tbi]])
+
+    main:
+    // Split branch human object and others
+    from_mapping.branch{
+        human: it[1] == 'human'
+            return it + [["/dev/null"]] // ([val(key), val(object), val(rg_id), path(dedup_bam), path(dedup_bai), [path(null_file)]])
+        others: true // ([val(key), val(object), val(rg_id), path(dedup_bam), path(dedup_bai)])
+    }.set{ human_selector }
+
+    //===============IF NOT BEING HUMAN OBJECT==================
+    DRAFT_CALLING(
+        human_selector.others // ([val(key), val(object), val(rg_id), path(dedup_bam), path(dedup_bai)])
+            .combine(calling_reference.map{it[0,1]}) // ([val(key), val(object), val(rg_id), path(dedup_bam), path(dedup_bai), path(folder_ref), val(genome_name)])
+    )
+
+    DRAFT_JOIN(
+        DRAFT_CALLING.out.raw_variants_vcf // ([val(key), val(object), val(rg_id), path(raw_variants_vcf_gz)])
+            .groupTuple(by: [0,1]) // ([val(key), val(object), [val(rg_id), ...], [path(raw_variants_vcf_gz), ...]])
+            .combine(calling_reference.map{it[0,1]}) // ([val(key), val(object), [val(rg_id), ...], [path(raw_variants_vcf_gz),...], path(folder_ref), val(genome_name)])
+    )
+
+
+    non_human_pkg = from_mapping.combine(
+        DRAFT_JOIN.out.joint_genotyped_draft,
+        by: [0,1] // ([val(key), val(object), val(rg_id), path(dedup.bam), path(dedup.bai), [path(cohort_draft_vcf), path(cohort_draft_vcf_idx)]])
+    )
+    //==========================================================
+    BASE_RECALIBRATOR(
+        human_selector.human
+            .concat(non_human_pkg) // ([val(key), val(object), val(rg_id), path(dedup.bam), path(dedup.bai), [path(cohort_draft_vcf), path(cohort_draft_vcf_idx)]])
+            .combine(calling_reference.map{it[0..2]}) // ([val(key), val(object), val(rg_id), path(dedup.bam), path(dedup.bai), [path(cohort_draft_vcf), path(cohort_draft_vcf_idx)], path(folder_ref), val(genome_name), path(human_knownsite_vcf)])
+
+    )
+
+    APPLY_BQSR(
+        from_mapping.combine(
+            BASE_RECALIBRATOR.out.recal_data_table, by:[0,1,2] // ([val(key), val(object), val(rg_id), path(dedup.bam), path(dedup.bai), path(recal_data_table)])
+        ).combine(calling_reference.map{it[0,1]}) // ([val(key), val(object), val(rg_id), path(dedup.bam), path(dedup.bai), path(recal_data_table), path(folder_ref), val(genome_name)])
+    )
+
+    CALL_VARIANTS(
+        APPLY_BQSR.out.recal_bam // ([val(key), val(object), val(rg_id), path(recal_bam)])
+            .combine(calling_reference.map{it[0,1]}) // ([val(key), val(object), val(rg_id), path(recal_bam), path(folder_ref), val(genome_name)])
+    )
+
+    JOINING(
+        CALL_VARIANTS.out.variants_recal_vcf // ([val(key), val(object), val(rg_id), [path(variants_recal_vcf_gz), path(variants_recal_vcf_gz_tbi)]])
+            .map{it.flatten()} // ([val(key), val(object), val(rg_id), path(variants_recal_vcf_gz), path(variants_recal_vcf_gz_tbi)])
+            .groupTuple(by: [0,1]) // ([val(key), val(object), [rg_id1, rg_id2, ...], [path(variants_recal_vcf_gz),...)], [path(variants_recal_vcf_gz_tbi), ...]])
+            .combine(calling_reference.map{it[0,1]}) // ([val(key), val(object), [rg_id1, rg_id2, ...], [path(variants_recal_vcf_gz),...)], [path(variants_recal_vcf_gz_tbi), ...], path(folder_ref), val(genome_name)])
+
+    )
+
+    // Split by chromosomes (hard-coded 1..22; NOTE(review): assumes the reference names its contigs 1-22 - confirm for non-human objects)
+    SPLIT_CHR(
+        JOINING.out.cohort_vcf // ([val(key), val(object), [path(joint_genotyped_vcf_gz), path(joint_genotyped_vcf_gz_tbi)]])
+            .combine(Channel.of(1..22).map{it.toString()}) // ([val(key), val(object), [path(joint_genotyped_vcf_gz), path(joint_genotyped_vcf_gz_tbi)], val(chr)])
+    )
+
+    emit:
+    split_vcf = SPLIT_CHR.out.split_vcf // ([val(key), val(object), val(chr), [path(split_vcf), path(split_vcf_tbi)]])
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/conf/base.config b/modules/ktest/calling/conf/base.config
new file mode 100644
index 0000000..0c1a014
--- /dev/null
+++ b/modules/ktest/calling/conf/base.config
@@ -0,0 +1,4 @@
+// Singularity image cache folder; read by conf/ktest_cluster.config as singularity.cacheDir
+params {
+    cache_sing_folder = "/home/ktest/pipeline_env/software/truongphi"
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/conf/calling.config b/modules/ktest/calling/conf/calling.config
new file mode 100644
index 0000000..d297b36
--- /dev/null
+++ b/modules/ktest/calling/conf/calling.config
@@ -0,0 +1,6 @@
+// Default reference inputs; human_knownsite_vcf is a glob meant to match the known-sites VCF and its .tbi index
+params {
+    folder_ref = "/home/ktest/pipeline_env/database/Variant_Calling/hg38"
+    genome_name = "Homo_sapiens.GRCh38.dna.primary_assembly.fa"
+    human_knownsite_vcf = "/home/ktest2/project/PRS/PRS-54/PRS-96/*.{vcf.gz,vcf.gz.tbi}"
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/conf/input.config b/modules/ktest/calling/conf/input.config
new file mode 100644
index 0000000..96f4cf4
--- /dev/null
+++ b/modules/ktest/calling/conf/input.config
@@ -0,0 +1,6 @@
+params{
+    // Sample sheet read by main.nf (tab-separated columns: key, object, rg_id, dedup.bam, dedup.bai)
+    from_mapping_tsv = ""
+    // Kept for backward compatibility; main.nf does not read this parameter
+    from_mapping_csv = ""
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/conf/ktest_cluster.config b/modules/ktest/calling/conf/ktest_cluster.config
new file mode 100644
index 0000000..82694ac
--- /dev/null
+++ b/modules/ktest/calling/conf/ktest_cluster.config
@@ -0,0 +1,20 @@
+executor{
+    name = 'slurm'
+    queueSize = 30
+}
+
+process{
+
+    // Retry (up to maxRetries times) only on exit codes 137-140; any other failure terminates the run
+    errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+    maxRetries = 5
+
+    queue = 'prod'
+    maxForks = 30
+}
+
+singularity{
+    enabled = true
+    cacheDir = "$params.cache_sing_folder"
+    runOptions = "--bind /home/"
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/conf/test.config b/modules/ktest/calling/conf/test.config
new file mode 100644
index 0000000..ec45229
--- /dev/null
+++ b/modules/ktest/calling/conf/test.config
@@ -0,0 +1,5 @@
+// NOTE(review): not included by nextflow.config, and main.nf reads params.from_mapping_tsv rather than these params
+params {
+    from_mapping = "/home/ktest/project/truongphi/PRS/PRS-21/PRS-66/from_mapping.tsv"
+    bam_folder = "/home/ktest/share/Working_folder/TRUONGPHI/test_data/test_calling"
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/conf/test/test_human.config b/modules/ktest/calling/conf/test/test_human.config
new file mode 100644
index 0000000..d6ebb9c
--- /dev/null
+++ b/modules/ktest/calling/conf/test/test_human.config
@@ -0,0 +1,3 @@
+params {
+    from_mapping_tsv = "$projectDir/tests/test_sample_human.tsv"
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/conf/test/test_pig.config b/modules/ktest/calling/conf/test/test_pig.config
new file mode 100644
index 0000000..07517b1
--- /dev/null
+++ b/modules/ktest/calling/conf/test/test_pig.config
@@ -0,0 +1,6 @@
+params {
+    from_mapping_tsv = "$projectDir/tests/test_sample_pig.tsv"
+
+    folder_ref = "/home/ktest2/project/PRS/PRS-62/PRS-199/test_data/pig_reference"
+    genome_name = "GCA_000003025.6_Sscrofa11.1_genomic.fa"
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/main.nf b/modules/ktest/calling/main.nf
new file mode 100644
index 0000000..c4ee652
--- /dev/null
+++ b/modules/ktest/calling/main.nf
@@ -0,0 +1,21 @@
+#!/usr/bin/env nextflow
+include { CALLING } from "./calling.nf"
+
+workflow{
+    //INPUT CHANNEL
+    from_mapping = channel.fromPath("$params.from_mapping_tsv")
+        .splitCsv(skip: 1, sep: '\t') // ([val(key), val(object), val(rg_id), path(dedup_bam), path(dedup_bai)])
+
+    calling_reference = channel.fromPath("${params.folder_ref}")
+        .combine(channel.of("${params.genome_name}"))
+        .combine(channel.fromPath("${params.human_knownsite_vcf}").collect().map{[it]}) // ([path(folder_ref), val(genome_ref_name), [human_knownsite_vcf, human_knownsite_vcf_tbi]])
+    //CALLING
+    CALLING(
+        from_mapping,
+        calling_reference
+    )
+
+    //EMIT
+    //CALLING.split_vcf // ([val(key), val(object), val(chr), [path(split_vcf), path(split_vcf_tbi)]])
+
+}
diff --git a/modules/ktest/calling/modules/apply_bqsr.nf b/modules/ktest/calling/modules/apply_bqsr.nf
new file mode 100644
index 0000000..a2191b5
--- /dev/null
+++ b/modules/ktest/calling/modules/apply_bqsr.nf
@@ -0,0 +1,30 @@
+// APPLY_BQSR: apply the recalibration table to the deduplicated BAM with `gatk ApplyBQSR`.
+// Emits {key_string}_{object}_{sample_id}.recal.bam on the `recal_bam` channel.
+process APPLY_BQSR{
+
+    tag "$key:$object:$sample_id"
+
+    container "phinguyen2000/gatk_tabix:v0.1.0"
+    memory { 20.GB * task.attempt }
+    cpus { 5 * task.attempt }
+
+    input:
+    tuple val(key), val(object), val(sample_id), path(dedup_bam), path(dedup_bai), path(recal_data_table), path(folder_ref), val(genome_name)
+
+
+    output:
+    tuple val(key), val(object), val(sample_id), path("${key_string}_${object}_${sample_id}.recal.bam"), emit: recal_bam
+
+    script:
+    key_string = key ? key.join("-") : key
+
+    """
+    gatk ApplyBQSR \
+    -R "$folder_ref/$genome_name" \
+    -I $dedup_bam \
+    -bqsr $recal_data_table\
+    -O "${key_string}_${object}_${sample_id}.recal.bam"
+
+    """
+
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/modules/base_recalibrator.nf b/modules/ktest/calling/modules/base_recalibrator.nf
new file mode 100644
index 0000000..4eca852
--- /dev/null
+++ b/modules/ktest/calling/modules/base_recalibrator.nf
@@ -0,0 +1,35 @@
+// BASE_RECALIBRATOR: build the recalibration table with `gatk BaseRecalibrator`.
+// Known sites come from human_knownsite_vcf when the draft-VCF slot holds the /dev/null
+// placeholder (staged file name 'null'); otherwise the draft cohort VCF is used.
+process BASE_RECALIBRATOR{
+    tag "$key:$object:$rg_id"
+
+    container "phinguyen2000/gatk_tabix:v0.1.0"
+    memory { 20.GB * task.attempt }
+    cpus { 5 * task.attempt }
+
+    input:
+    tuple val(key), val(object),
+        val(rg_id), path(dedup_bam),
+        path(dedup_bai), path(joint_genotyped_draft_vcf),
+        path(folder_ref), val(genome_name),
+        path(human_knownsite_vcf)
+    output:
+    tuple val(key), val(object), val(rg_id), path("${key_string}_${rg_id}.recal_data.table"), emit: recal_data_table
+
+    script:
+    if (joint_genotyped_draft_vcf.getName() == 'null'){
+        known_site = "${human_knownsite_vcf[0]}"
+    }else{
+        known_site = joint_genotyped_draft_vcf[0]
+    }
+    key_string = key ? key.join("-") : key
+
+    """
+    gatk BaseRecalibrator \
+    -I $dedup_bam \
+    -R "$folder_ref/$genome_name" \
+    --known-sites $known_site \
+    -O ${key_string}_${rg_id}.recal_data.table
+    """
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/modules/call_variants.nf b/modules/ktest/calling/modules/call_variants.nf
new file mode 100644
index 0000000..35f2932
--- /dev/null
+++ b/modules/ktest/calling/modules/call_variants.nf
@@ -0,0 +1,26 @@
+// CALL_VARIANTS: run HaplotypeCaller in GVCF mode (-ERC GVCF) on the recalibrated BAM.
+// Emits the .vcf.gz file together with its .tbi index on the `variants_recal_vcf` channel.
+process CALL_VARIANTS{
+    tag "$key:$object:$rg_id"
+
+    container "phinguyen2000/gatk_tabix:v0.1.0"
+    memory { 20.GB * task.attempt }
+    cpus { 5 * task.attempt }
+    input:
+    tuple val(key), val(object), val(rg_id), path(recal_bam), path(folder_ref), val(genome_name)
+
+
+    output:
+    tuple val(key), val(object), val(rg_id), path("${key_string}_${object}_${rg_id}.variants.recal.vcf.{gz,gz.tbi}"), emit: variants_recal_vcf
+
+    script:
+    key_string = key ? key.join("-") : key
+
+    """
+    gatk --java-options "-Xmx4g" HaplotypeCaller \
+    -R "$folder_ref/$genome_name" \
+    -I "$recal_bam" \
+    -O "${key_string}_${object}_${rg_id}.variants.recal.vcf.gz"\
+    -ERC GVCF
+    """
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/modules/draft_calling.nf b/modules/ktest/calling/modules/draft_calling.nf
new file mode 100644
index 0000000..cc8a09f
--- /dev/null
+++ b/modules/ktest/calling/modules/draft_calling.nf
@@ -0,0 +1,27 @@
+// DRAFT_CALLING: first-pass HaplotypeCaller run (GVCF mode) on the deduplicated BAM.
+// Emits {key_string}_{rg_id}.raw_variants.vcf.gz on the `raw_variants_vcf` channel.
+process DRAFT_CALLING{
+    tag "$key:$object:$rg_id"
+
+    container "phinguyen2000/gatk_tabix:v0.1.0"
+    memory { 20.GB * task.attempt }
+    cpus { 5 * task.attempt }
+
+    input:
+    tuple val(key), val(object), val(rg_id), path(dedup_bam), path(dedup_bai), path(folder_ref), val(genome_name)
+
+
+    output:
+    tuple val(key), val(object), val(rg_id), path("${key_string}_${rg_id}.raw_variants.vcf.gz"), emit: raw_variants_vcf
+
+    script:
+    key_string = key ? key.join("-") : key
+
+    """
+    gatk --java-options "-Xmx4g" HaplotypeCaller \
+    -R "$folder_ref/$genome_name" \
+    -I "$dedup_bam"\
+    -O "${key_string}_${rg_id}.raw_variants.vcf.gz"\
+    -ERC GVCF
+    """
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/modules/draft_join.nf b/modules/ktest/calling/modules/draft_join.nf
new file mode 100644
index 0000000..e7960b8
--- /dev/null
+++ b/modules/ktest/calling/modules/draft_join.nf
@@ -0,0 +1,41 @@
+// DRAFT_JOIN: index the per-sample draft GVCFs, merge them with CombineGVCFs and joint-genotype
+// the merged file into {key_string}_joint_genotyped.draft.vcf.gz (+ .tbi) on `joint_genotyped_draft`.
+process DRAFT_JOIN{
+    tag "$key:$object: #sample::${num_vcf_file}"
+
+    container "phinguyen2000/gatk_tabix:v0.1.0"
+    memory { 20.GB * task.attempt }
+    cpus { 5 * task.attempt }
+
+    input:
+    tuple val(key), val(object), val(rg_ids), path(raw_variants_vcf), path(folder_ref), val(genome_name)
+
+    output:
+    tuple val(key), val(object), path("${key_string}_joint_genotyped.draft.vcf.{gz,gz.tbi}"), emit: joint_genotyped_draft
+
+    script:
+    variant_option = ""
+    for (file in raw_variants_vcf){
+        variant_option += "--variant " + file.getName() + " "
+    }
+    num_vcf_file = rg_ids.size()
+    key_string = key ? key.join("-") : key
+
+    """
+    # Index each GVCF on its own: IndexFeatureFile takes a single input, while the
+    # grouped GVCFs are rendered as one blank-separated list.
+    for gvcf in $raw_variants_vcf; do
+        gatk IndexFeatureFile -I "\$gvcf"
+    done
+
+    gatk CombineGVCFs \
+    -R "$folder_ref/$genome_name" \
+    $variant_option \
+    -O ${key_string}_cohort.draft.vcf.gz
+
+    gatk GenotypeGVCFs \
+    -R "$folder_ref/$genome_name" \
+    -V ${key_string}_cohort.draft.vcf.gz \
+    -O ${key_string}_joint_genotyped.draft.vcf.gz
+    """
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/modules/joining.nf b/modules/ktest/calling/modules/joining.nf
new file mode 100644
index 0000000..76560f2
--- /dev/null
+++ b/modules/ktest/calling/modules/joining.nf
@@ -0,0 +1,36 @@
+// JOINING: merge the per-sample recalibrated GVCFs with CombineGVCFs, then joint-genotype them
+// into {key_string}_{object}_joint_genotyped.vcf.gz (+ .tbi) on the `cohort_vcf` channel.
+process JOINING{
+    tag "$key:$object:#sample::${num_vcf_file}"
+
+    container "phinguyen2000/gatk_tabix:v0.1.0"
+    memory { 20.GB * task.attempt }
+    cpus { 5 * task.attempt }
+
+    input:
+    tuple val(key), val(object), val(rg_ids), path(variants_vcfs), path(variants_vcf_tbis), path(folder_ref), val(genome_name)
+
+
+    output:
+    tuple val(key), val(object), path("${key_string}_${object}_joint_genotyped.vcf.{gz,gz.tbi}"), emit: cohort_vcf
+
+    script:
+    variant_option = ""
+    for (file in variants_vcfs){
+        variant_option += "--variant " + file.getName() + " "
+    }
+
+    num_vcf_file = rg_ids.size()
+    key_string = key ? key.join("-") : key
+
+    """
+    gatk CombineGVCFs \
+    -R "$folder_ref/$genome_name" \
+    $variant_option \
+    -O ${key_string}_${object}_cohort.vcf.gz
+    gatk GenotypeGVCFs \
+    -R "$folder_ref/$genome_name" \
+    -V ${key_string}_${object}_cohort.vcf.gz \
+    -O ${key_string}_${object}_joint_genotyped.vcf.gz
+    """
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/modules/split_chr.nf b/modules/ktest/calling/modules/split_chr.nf
new file mode 100644
index 0000000..a1c3b00
--- /dev/null
+++ b/modules/ktest/calling/modules/split_chr.nf
@@ -0,0 +1,23 @@
+// SPLIT_CHR: extract region `chr` from the cohort VCF with `bcftools view` and index the result.
+// Emits {key_string}_{object}_chr{chr}_split.vcf.gz (+ .tbi) on the `split_vcf` channel.
+process SPLIT_CHR{
+    tag "$key:$object:chr$chr"
+
+    container "phinguyen2000/bcftools:v0.1.0"
+    memory { 20.GB * task.attempt }
+    cpus { 5 * task.attempt }
+
+    input:
+    tuple val(key), val(object), path(joint_genotyped), val(chr)
+
+    output:
+    tuple val(key), val(object), val(chr), path("${key_string}_${object}_chr${chr}_split.vcf.{gz,gz.tbi}"), emit: split_vcf
+
+    script:
+    key_string = key ? key.join("-") : key
+
+    """
+    bcftools view --regions "$chr" -O z -o "${key_string}_${object}_chr${chr}_split.vcf.gz" ${joint_genotyped[0]}
+    bcftools index -t ${key_string}_${object}_chr${chr}_split.vcf.gz
+    """
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/nextflow.config b/modules/ktest/calling/nextflow.config
new file mode 100755
index 0000000..a144cd5
--- /dev/null
+++ b/modules/ktest/calling/nextflow.config
@@ -0,0 +1,26 @@
+// Load params for inputs
+includeConfig 'conf/input.config'
+
+// Load params common for all modules
+includeConfig 'conf/base.config'
+
+// Load config for modules
+includeConfig 'conf/calling.config'
+
+
+nextflow.enable.dsl = 2
+
+tower {
+    enabled = true
+    accessToken = "$TOWER_ACCESS_TOKEN"
+    workspaceId = '222915005021784'
+}
+
+
+profiles{
+    ktest_cluster { includeConfig 'conf/ktest_cluster.config' }
+
+    // test profiles
+    test_human { includeConfig 'conf/test/test_human.config' }
+    test_pig { includeConfig 'conf/test/test_pig.config' }
+}
\ No newline at end of file
diff --git a/modules/ktest/calling/tests/test_sample_human.tsv b/modules/ktest/calling/tests/test_sample_human.tsv
new file mode 100644
index 0000000..8e79859
--- /dev/null
+++ b/modules/ktest/calling/tests/test_sample_human.tsv
@@ -0,0 +1,4 @@
+key	object	rg_id	dedup.bam	dedup.bai
+null	human	HG00096	/home/ktest/share/Working_folder/TRUONGPHI/test_data/test_calling/HG00096_Capture.dedup.bam	/home/ktest/share/Working_folder/TRUONGPHI/test_data/test_calling/HG00096_Capture.dedup.bai
+null	human	HG00102	/home/ktest/share/Working_folder/TRUONGPHI/test_data/test_calling/HG00102_Capture.dedup.bam	/home/ktest/share/Working_folder/TRUONGPHI/test_data/test_calling/HG00102_Capture.dedup.bai
+null	human	NA12878	/home/ktest/share/Working_folder/TRUONGPHI/test_data/test_calling/NA12878_Capture.dedup.bam	/home/ktest/share/Working_folder/TRUONGPHI/test_data/test_calling/NA12878_Capture.dedup.bai
diff --git a/modules/ktest/calling/tests/test_sample_pig.tsv b/modules/ktest/calling/tests/test_sample_pig.tsv
new file mode 100755
index 0000000..0d0ffc8
--- /dev/null
+++ b/modules/ktest/calling/tests/test_sample_pig.tsv
@@ -0,0 +1,2 @@
+key	object	rg_id	dedup.bam	dedup.bai
+null	pig	SRR5336868	/home/ktest2/project/PRS/PRS-62/PRS-199/test_data/subsampled_SRR5336868.bam	/home/ktest2/project/PRS/PRS-62/PRS-199/test_data/subsampled_SRR5336868.bam.bai
\ No newline at end of file