Skip to content

Commit

Permalink
Merge pull request nf-core#75 from LouisLeNezet/phasing2
Browse files Browse the repository at this point in the history
Fix phasing and add --freq
  • Loading branch information
LouisLeNezet authored Jun 7, 2024
2 parents f7fbeb8 + 78b7e18 commit cf7a7fc
Show file tree
Hide file tree
Showing 22 changed files with 166 additions and 70 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Initial release of nf-core/phaseimpute, created with the [nf-core](https://nf-co
- [#15](https://github.com/nf-core/phaseimpute/pull/15) - Changed test csv files to point to nf-core repository
- [#16](https://github.com/nf-core/phaseimpute/pull/16) - Removed outdir from test config files
- [#65](https://github.com/nf-core/phaseimpute/pull/65) - Separate stitch output by individuals
- [#75](https://github.com/nf-core/phaseimpute/pull/75) - Make the allele frequency computation by the VCFFIXUP process optional with `--compute_freq`. Use GLIMPSE2_CHUNK on the panel VCF to compute the chunks instead of BEDTOOLS_MAKEWINDOWS on the fasta.

### `Dependencies`

Expand Down
2 changes: 1 addition & 1 deletion conf/steps/imputation_glimpse1.config
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ process {
// Impute the variants
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_IMPUTE_GLIMPSE1:GLIMPSE_PHASE' {
ext.args = ["--impute-reference-only-variants"].join(' ')
ext.prefix = { "${meta.id}_${meta.region.replace(':','_')}_phase" }
ext.prefix = { "${meta.id}_${meta.chunk.replace(':','_')}_phase" }
ext.suffix = "bcf"
publishDir = [ enabled: false ]
}
Expand Down
2 changes: 1 addition & 1 deletion conf/steps/imputation_glimpse2.config
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ process {
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_IMPUTE_GLIMPSE2:GLIMPSE2_PHASE' {
ext.prefix = { "${meta.id}_${meta.region.replace(':','_')}_glimpse2" }
ext.prefix = { "${meta.id}_${meta.chunk.replace(':','_')}_glimpse2" }
ext.args = "--keep-monomorphic-ref-sites"
ext.suffix = "vcf.gz"
publishDir = [ enabled: false ]
Expand Down
4 changes: 2 additions & 2 deletions conf/steps/imputation_quilt.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ process {

// Impute quilt
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_QUILT:QUILT_QUILT' {
ext.prefix = { "${meta.id}_C${meta.chr}.impute" }
ext.prefix = { "${meta.id}_${meta.chr}_${meta.chunk}.impute" }
publishDir = [enabled: false]
}

Expand All @@ -33,7 +33,7 @@ process {
// Annotate quilt imputed VCFs
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_QUILT:BCFTOOLS_ANNOTATE' {
ext.args = "--set-id '%CHROM:%POS:%REF:%ALT' -Oz"
ext.prefix = { "${meta.id}_C${meta.chr}.impute.annotate" }
ext.prefix = { "${meta.id}_${meta.chr}_${meta.chunk}.impute.annotate" }
publishDir = [ enabled: false ]
}

Expand Down
3 changes: 1 addition & 2 deletions conf/steps/panel_prep.config
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,8 @@ process {
]
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:BEDTOOLS_MAKEWINDOWS' {
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:GLIMPSE2_CHUNK' {
ext.prefix = { "${meta.id}_chunks" }
ext.args = ['-w 30000', '-s 20000'].join(' ')
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:SHAPEIT5_PHASECOMMON' {
Expand Down
23 changes: 19 additions & 4 deletions conf/test_all.config
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,29 @@ params {
depth = 1

// Genome references
fasta = params.pipelines_testdata_base_path + "reference_genome/21_22/hs38DH.chr21_22.fa"
panel = "${projectDir}/tests/csv/panel.csv"
phased = false
map = "${projectDir}/tests/csv/map.csv"
fasta = params.pipelines_testdata_base_path + "reference_genome/21_22/hs38DH.chr21_22.fa"
panel = "${projectDir}/tests/csv/panel.csv"
phased = false
compute_freq = false
//map = "${projectDir}/tests/csv/map.csv"

// Pipeline steps
steps = "all"

// Impute tools
tools = "glimpse1,glimpse2,stitch,quilt"
}

// Test-profile overrides of the command-line arguments (ext.args) handed to the chunking processes.
// NOTE(review): presumably these small window/buffer values are chosen to suit the reduced
// test dataset — confirm against the test data.
process {
// GLIMPSE_CHUNK as invoked from the VCF_CHUNK_GLIMPSE subworkflow
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:GLIMPSE_CHUNK' {
ext.args = ["--window-size 10000", "--window-count 400", "--buffer-size 5000", "--buffer-count 30"].join(' ')
}

// GLIMPSE2_CHUNK as invoked from the VCF_CHUNK_GLIMPSE subworkflow
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:GLIMPSE2_CHUNK' {
ext.args = ["--window-mb 0.01", "--window-cm 0.01", "--window-count 200", "--buffer-mb 0.005", "--buffer-cm 0.005", "--buffer-count 30"].join(' ')
}

// GLIMPSE2_CHUNK as invoked from the VCF_PHASE_SHAPEIT5 subworkflow (same arguments as the selector above)
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:GLIMPSE2_CHUNK' {
ext.args = ["--window-mb 0.01", "--window-cm 0.01", "--window-count 200", "--buffer-mb 0.005", "--buffer-cm 0.005", "--buffer-count 30"].join(' ')
}
}
11 changes: 9 additions & 2 deletions conf/test_glimpse2.config
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ params {
tools = "glimpse2"
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:GLIMPSE_CHUNK' {
ext.args = ["--window-size 10000", "--window-count 400", "--buffer-size 5000", "--buffer-count 30"].join(' ')
// Test-profile overrides of the command-line arguments (ext.args) handed to the chunking processes.
// The selectors must sit inside a `process` scope to be applied as process configuration.
process {
// GLIMPSE_CHUNK as invoked from the VCF_CHUNK_GLIMPSE subworkflow
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:GLIMPSE_CHUNK' {
ext.args = ["--window-size 10000", "--window-count 400", "--buffer-size 5000", "--buffer-count 30"].join(' ')
}

// GLIMPSE2_CHUNK as invoked from the VCF_CHUNK_GLIMPSE subworkflow
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:GLIMPSE2_CHUNK' {
ext.args = ["--window-mb 0.01", "--window-cm 0.01", "--window-count 200", "--buffer-mb 0.005", "--buffer-cm 0.005", "--buffer-count 30"].join(' ')
}
}

6 changes: 6 additions & 0 deletions conf/test_panelprep.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,9 @@ params {
// Pipeline steps
steps = "panelprep"
}

// Test-profile override of the arguments (ext.args) handed to the chunking step of the phasing subworkflow.
// NOTE(review): presumably these small window/buffer values are chosen to suit the reduced
// test dataset — confirm against the test data.
process {
// GLIMPSE2_CHUNK as invoked from the VCF_PHASE_SHAPEIT5 subworkflow
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:GLIMPSE2_CHUNK' {
ext.args = ["--window-mb 0.01", "--window-cm 0.01", "--window-count 200", "--buffer-mb 0.005", "--buffer-cm 0.005", "--buffer-count 30"].join(' ')
}
}
29 changes: 29 additions & 0 deletions conf/test_panelprep_fullchr.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test
of the panel preparation step on full chromosomes.
Use as follows:
nextflow run nf-core/phaseimpute -profile test_panelprep,<docker/singularity> --outdir <OUTDIR>
NOTE(review): the command above names the `test_panelprep` profile, whereas this file is
conf/test_panelprep_fullchr.config — confirm the profile name under which this file is loaded.
----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check panel preparation steps on full chromosomes'

// Limit resources so that this can run on GitHub Actions
// NOTE(review): 24 CPUs / 50 GB may exceed what standard GitHub-hosted runners provide —
// confirm the runner this profile is intended for.
max_cpus = 24
max_memory = '50.GB'
max_time = '4.h'

// Genome references
// Reference fasta is fetched from the nf-core/test-datasets repository (phaseimpute branch)
fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa"
// Samplesheet listing the full-chromosome reference panel files
panel = "${projectDir}/tests/csv/panel_fullchr.csv"
// The panel is declared as not phased
phased = false

// Pipeline steps
// Only the panel preparation step is run
steps = "panelprep"
}
4 changes: 1 addition & 3 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,6 @@ Note that the pipeline will create the following files in your working directory
work # Directory containing the nextflow working files
<OUTDIR> # Finished results in specified location (defined with --outdir)
.nextflow_log # Log file from Nextflow
work # Directory containing the nextflow working files
<OUTDIR> # Finished results in specified location (defined with --outdir)
.nextflow_log # Log file from Nextflow
# Other nextflow hidden files, eg. history of pipeline runs and old logs.
```

Expand Down Expand Up @@ -161,6 +158,7 @@ The required flags for this mode are:
- `--steps panelprep`: The steps to run.
- `--panel reference.csv`: The samplesheet containing the reference panel files in `vcf.gz` format.
- `--phased`: (optional) Whether the reference panel is phased (true|false).
- `--compute_freq`: (optional) Whether the allele frequency (AC/AN fields) of each variant needs to be computed (true|false). This can be necessary if the frequency is absent from the reference panel or if individuals have been removed.
- `--remove_samples`: (optional) A comma-separated list of samples to remove from the reference.

You can find an overview of the results produced by this steps in the [Output](output.md).
Expand Down
8 changes: 2 additions & 6 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@
"shapeit5/phasecommon": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["vcf_phase_shapeit5"]
"installed_by": ["vcf_phase_shapeit5"],
"patch": "modules/nf-core/shapeit5/phasecommon/shapeit5-phasecommon.diff"
},
"stitch": {
"branch": "master",
Expand Down Expand Up @@ -213,11 +214,6 @@
"branch": "master",
"git_sha": "7e56daae390ff896b292ddc70823447683a79936",
"installed_by": ["subworkflows"]
},
"vcf_phase_shapeit5": {
"branch": "master",
"git_sha": "dcf17cc0ed8fd5ea57e61a13e0147cddb5c1ee30",
"installed_by": ["subworkflows"]
}
}
}
Expand Down
3 changes: 1 addition & 2 deletions modules/nf-core/shapeit5/phasecommon/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions modules/nf-core/shapeit5/phasecommon/shapeit5-phasecommon.diff

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ params {
// Panel preparation
panel = null
phased = null
compute_freq = true
rename_chr = false
remove_samples = null

Expand Down
5 changes: 5 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,11 @@
"type": "boolean",
"pattern": "true|false"
},
"compute_freq": {
"description": "Should the allele frequency for each variant (AC/AN fields necessary for Glimpse1 and the validation step) be computed using VCFFIXUP tool. This can be necessary if the fields are absent from the panel or if samples have been removed.",
"type": "boolean",
"pattern": "true|false"
},
"binaryref": {
"type": "string",
"description": "Whether to generate a binary reference file to be used with GLIMPSE2"
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/bam_impute_quilt/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ workflow BAM_IMPUTE_QUILT {
.map {
metaIC, bam, bai, metaPC, hap, legend, chr, start, end, ngen, buffer, gmap ->
[
metaIC.subMap("id") + ["panel": metaPC.id, "chr": metaPC.chr],
metaIC.subMap("id") + ["panel": metaPC.id, "chr": metaPC.chr, "chunk": start + "-" + end],
bam, bai, hap, legend, chr, start, end, ngen, buffer, gmap
]
}
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/vcf_impute_glimpse1/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ workflow VCF_IMPUTE_GLIMPSE1 {
.combine(ch_chunks_panel, by: 0)
.combine(gmap_file)
.map{ metaPC, metaIPC, bam, bai, samples, regionin, regionout, panel, panel_index, gmap ->
[metaIPC + ["region": regionin],
[metaIPC + ["chunk": regionout],
bam, bai, samples, regionin, regionout, panel, panel_index, gmap]
}

Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/vcf_impute_glimpse2/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ workflow VCF_IMPUTE_GLIMPSE2 {
.combine(ch_chunks_panel)
.combine(gmap_file)
.map{ metaI, bam, bai, samples, metaPC, regionin, regionout, panel, panel_index, gmap ->
[metaI + metaPC + ["region": regionin],
[metaI + metaPC + ["chunk": regionout],
bam, bai, samples, regionin, regionout, panel, panel_index, gmap]
}

Expand Down
28 changes: 17 additions & 11 deletions subworkflows/local/vcf_normalize_bcftools/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -49,29 +49,35 @@ workflow VCF_NORMALIZE_BCFTOOLS {
BCFTOOLS_INDEX_3(BCFTOOLS_DEL_SPL.out.vcf)
ch_versions = ch_versions.mix(BCFTOOLS_INDEX_3.out.versions)

ch_biallelic_vcf_tbi = BCFTOOLS_DEL_SPL.out.vcf.join(BCFTOOLS_INDEX_3.out.tbi)
ch_biallelic_vcf_tbi_spl = BCFTOOLS_DEL_SPL.out.vcf.join(BCFTOOLS_INDEX_3.out.tbi)
} else {
ch_biallelic_vcf_tbi_spl = ch_biallelic_vcf_tbi
}

// Fix panel (AC/AN INFO fields in VCF are inconsistent with GT field)
VCFLIB_VCFFIXUP(ch_biallelic_vcf_tbi)
ch_versions = ch_versions.mix(VCFLIB_VCFFIXUP.out.versions)
if (params.compute_freq == true) {
// Fix panel (AC/AN INFO fields in VCF are inconsistent with GT field)
VCFLIB_VCFFIXUP(ch_biallelic_vcf_tbi_spl)
ch_versions = ch_versions.mix(VCFLIB_VCFFIXUP.out.versions)

// Index fixed panel
BCFTOOLS_INDEX_4(VCFLIB_VCFFIXUP.out.vcf)
ch_versions = ch_versions.mix(BCFTOOLS_INDEX_4.out.versions)
// Index fixed panel
BCFTOOLS_INDEX_4(VCFLIB_VCFFIXUP.out.vcf)
ch_versions = ch_versions.mix(BCFTOOLS_INDEX_4.out.versions)

// Join fixed vcf and tbi
ch_biallelic_vcf_tbi = VCFLIB_VCFFIXUP.out.vcf.join(BCFTOOLS_INDEX_4.out.tbi)
// Join fixed vcf and tbi
ch_biallelic_vcf_tbi_freq = VCFLIB_VCFFIXUP.out.vcf.join(BCFTOOLS_INDEX_4.out.tbi)
} else {
ch_biallelic_vcf_tbi_freq = ch_biallelic_vcf_tbi_spl
}

// Convert VCF to Hap and Legend files
BCFTOOLS_CONVERT(ch_biallelic_vcf_tbi, ch_fasta, [])
BCFTOOLS_CONVERT(ch_biallelic_vcf_tbi_freq, ch_fasta, [])
ch_versions = ch_versions.mix(BCFTOOLS_CONVERT.out.versions)

// Output hap and legend files
ch_hap_legend = BCFTOOLS_CONVERT.out.hap.join(BCFTOOLS_CONVERT.out.legend)

emit:
vcf_tbi = ch_biallelic_vcf_tbi // channel: [ [id, chr], vcf, tbi ]
vcf_tbi = ch_biallelic_vcf_tbi_freq // channel: [ [id, chr], vcf, tbi ]
hap_legend = ch_hap_legend // channel: [ [id, chr], '.hap', '.legend' ]
versions = ch_versions // channel: [ versions.yml ]
}
68 changes: 41 additions & 27 deletions subworkflows/local/vcf_phase_shapeit5/main.nf
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
include { BEDTOOLS_MAKEWINDOWS } from '../../../modules/nf-core/bedtools/makewindows/main.nf'
include { SHAPEIT5_PHASECOMMON } from '../../../modules/nf-core/shapeit5/phasecommon/main'
include { SHAPEIT5_LIGATE } from '../../../modules/nf-core/shapeit5/ligate/main'
include { BCFTOOLS_INDEX as VCF_BCFTOOLS_INDEX_1 } from '../../../modules/nf-core/bcftools/index/main.nf'
include { BCFTOOLS_INDEX as VCF_BCFTOOLS_INDEX_2 } from '../../../modules/nf-core/bcftools/index/main.nf'
include { GLIMPSE2_CHUNK } from '../../../modules/nf-core/glimpse2/chunk'
include { SHAPEIT5_PHASECOMMON } from '../../../modules/nf-core/shapeit5/phasecommon'
include { SHAPEIT5_LIGATE } from '../../../modules/nf-core/shapeit5/ligate'
include { BCFTOOLS_INDEX as VCF_BCFTOOLS_INDEX_1 } from '../../../modules/nf-core/bcftools/index'
include { BCFTOOLS_INDEX as VCF_BCFTOOLS_INDEX_2 } from '../../../modules/nf-core/bcftools/index'

workflow VCF_PHASE_SHAPEIT5 {

Expand All @@ -11,40 +11,54 @@ workflow VCF_PHASE_SHAPEIT5 {
ch_region // channel (optional) : [ [chr, region], region ]
ch_ref // channel (optional) : [ [id, chr], ref, csi ]
ch_scaffold // channel (optional) : [ [id, chr], scaffold, csi ]
ch_map // channel (optional) : [ [id, chr], map]
ch_map // channel (optional) : [ [chr], map]

main:

ch_versions = Channel.empty()

// It is needed to generate a file containing the region to phase in a Chr \tab Start \tab End format
// Make chunks with Glimpse2 (does not work with "sequential" mode)
chunk_model = "recursive"

// Create the File in bed format and use the meta id for the file name
ch_region_file = ch_region
.collectFile(newLine: true) { metaCR, region -> ["${metaCR.chr}.bed", region.replace(":","\t").replace("-","\t")]}
.map { file -> [[id: file.getBaseName(), chr:file.getBaseName()], file] }

BEDTOOLS_MAKEWINDOWS(ch_region_file)
ch_versions = ch_versions.mix(BEDTOOLS_MAKEWINDOWS.out.versions.first())

ch_chunk_output = BEDTOOLS_MAKEWINDOWS.out.bed
.splitCsv(header: ['Chr', 'Start', 'End'], sep: "\t", skip: 0)
.map { meta, it -> [meta.subMap("chr"), it["Chr"]+":"+it["Start"]+"-"+it["End"]]}

ch_chunks_number = BEDTOOLS_MAKEWINDOWS.out.bed
.map { meta, bed -> [meta.subMap("chr"), bed.countLines().intValue()]}
// Chunk with Glimpse2
ch_input_glimpse2 = ch_vcf
.map{
metaIC, vcf, csi, pedigree -> [metaIC.subMap("chr"), metaIC, vcf, csi]
}
.combine(ch_region.map{ metaCR, region -> [metaCR.subMap("chr"), region]}, by:0)
.join(ch_map)
.map{
metaC, metaIC, vcf, csi, region, gmap -> [metaIC, vcf, csi, region, gmap]
}
GLIMPSE2_CHUNK ( ch_input_glimpse2, chunk_model )
ch_versions = ch_versions.mix( GLIMPSE2_CHUNK.out.versions.first() )

// Rearrange channels
ch_chunks_glimpse2 = GLIMPSE2_CHUNK.out.chunk_chr
.splitCsv(
header: [
'ID', 'Chr', 'RegionBuf', 'RegionCnk', 'WindowCm',
'WindowMb', 'NbTotVariants', 'NbComVariants'
], sep: "\t", skip: 0
)
.map { metaIC, it -> [metaIC, it["RegionBuf"], it["RegionCnk"]]}

ch_chunks_number = GLIMPSE2_CHUNK.out.chunk_chr
.map { meta, chunk -> [meta.subMap("chr"), chunk.countLines().intValue()]}

ch_phase_input = ch_vcf
.map { metaIC, vcf, index, pedigree ->
[metaIC.subMap("chr"), metaIC, vcf, index, pedigree] }
.combine(ch_chunk_output, by:0)
.map { metaC, meta, vcf, index, pedigree, chunk ->
[meta + [chunk: chunk], vcf, index, pedigree, chunk]
.combine(ch_chunks_glimpse2, by:0)
.map{
metaIC, vcf, csi, pedigree, regionbuf, regioncnk -> [metaIC.subMap("chr"), metaIC, vcf, csi, pedigree, regionbuf, regioncnk]
}
.combine(ch_map, by:0)
.map { metaC, metaIC, vcf, index, pedigree, regionbuf, regioncnk, gmap ->
[metaIC + [chunk: regioncnk], vcf, index, pedigree, regionbuf, gmap]
}

SHAPEIT5_PHASECOMMON (
ch_phase_input, ch_ref,
ch_scaffold, ch_map
ch_scaffold
)
ch_versions = ch_versions.mix(SHAPEIT5_PHASECOMMON.out.versions.first())

Expand Down
Loading

0 comments on commit cf7a7fc

Please sign in to comment.