Skip to content

Commit

Permalink
Handle new reference naming (#173)
Browse files Browse the repository at this point in the history
* More default memory for plink, needed for a bigger reference (gnomAD).

* More generalized file pattern for relatedness files in a reference ("GRCh3#_*.king.cutoff.out.id") with a single meta ID.

ToDo: rename the deg2 relatedness files in setup_resource.nf / bootstrap_ancestry.nf

* Update report.qmd to handle other colours

* give king cutoff files consistent names

* fix test

* add version check to extract_database

* add version check to extract_database

---------

Co-authored-by: Benjamin Wingfield <[email protected]>
  • Loading branch information
smlmbrt and nebfield authored Oct 2, 2023
1 parent 16cf42e commit eb97ffd
Show file tree
Hide file tree
Showing 15 changed files with 99 additions and 65 deletions.
27 changes: 19 additions & 8 deletions assets/report/report.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -293,12 +293,23 @@ if(params$run_ancestry == TRUE){
```{r colour_palette, echo = FALSE, eval=params$run_ancestry}
# source: https://github.com/PGScatalog/PGS_Catalog/blob/master/catalog/static/catalog/pgs.scss#L2493-L2520
# $ancestry_colours
thousand_genomes_colours <- c("#FFD900", "#E41A1C", "#B15928", "#4DAF4A",
"#377EB8", "#00CED1", "#984EA3", "#A6CEE3",
"#FF7F00", "#BBB", "#999")
names(thousand_genomes_colours) <- c("AFR", "AMR", "ASN", "EAS", "EUR", "GME",
"SAS", "MAE", "MAO", "NR", "OTH")
thousand_genomes_palette <- scale_colour_manual(name = "Populations", values = thousand_genomes_colours)
if({params$reference_panel_name} == '1000G'){
thousand_genomes_colours <- c("#FFD900", "#E41A1C", "#B15928", "#4DAF4A",
"#377EB8", "#00CED1", "#984EA3", "#A6CEE3",
"#FF7F00", "#BBB", "#999")
names(thousand_genomes_colours) <- c("AFR", "AMR", "ASN", "EAS",
"EUR", "GME", "SAS", "MAE",
"MAO", "NR", "OTH")
current_population_palette <- scale_colour_manual(name = "Populations", values = thousand_genomes_colours)
} else if({params$reference_panel_name} == 'HGDP+1kGP'){
gnomAD_pop_colours <- c("#97519d", "#e42523", "#f67e1e", "#48b24b",
"#3280bb", "#a65528", "#9a9c9b")
names(gnomAD_pop_colours) <- c("AFR", "AMR", "CSA", "EAS",
"EUR", "MID", "OCE")
current_population_palette <- scale_colour_manual(name = "Populations", values = gnomAD_pop_colours)
} else{
current_population_palette <- scale_colour_brewer(palette="Set3")
}
```

```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
Expand All @@ -321,7 +332,7 @@ for(pc in seq.int(1,5,2)){
if (pcX %in% colnames(popsim)){
p_pca <- ggplot(popsim[popsim$REFERENCE == TRUE,], aes(x=!!sym(pcX), y=!!sym(pcY))) + geom_point(aes(colour=SuperPop, shape=slabel), alpha=0.25)
p_pca <- p_pca + geom_point(data=popsim[popsim$REFERENCE != TRUE,], aes(color=MostSimilarPop, shape=slabel))
p_pca <- p_pca + theme_bw() + thousand_genomes_palette + scale_shape_manual(values=map_shapes, name='sampleset')
p_pca <- p_pca + theme_bw() + current_population_palette + scale_shape_manual(values=map_shapes, name='sampleset')
print(p_pca)
}
}
Expand Down Expand Up @@ -492,4 +503,4 @@ For scores from the PGS Catalog, please remember to cite the original publicatio

> PGS Catalog Calculator (in development). PGS Catalog Team. `https://github.com/PGScatalog/pgsc_calc`
> Lambert et al. (2021) The Polygenic Score Catalog as an open database for reproducibility and systematic evaluation. Nature Genetics. 53:420–425 doi:10.1038/s41588-021-00783-5.
> Lambert et al. (2021) The Polygenic Score Catalog as an open database for reproducibility and systematic evaluation. Nature Genetics. 53:420–425 doi:10.1038/s41588-021-00783-5.
3 changes: 3 additions & 0 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,7 @@ process {
withName:DUMPSOFTWAREVERSIONS {
cache = false
}
withLabel:plink2{
memory = { check_max( 16.GB * task.attempt, 'memory' ) }
}
}
8 changes: 8 additions & 0 deletions modules/local/ancestry/bootstrap/make_database.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ process MAKE_DATABASE {

input:
path '*'
tuple val(grch37_king_meta), path(grch37_king)
tuple val(grch38_king_meta), path(grch38_king)
path checksums

output:
Expand All @@ -26,6 +28,12 @@ process MAKE_DATABASE {
echo $workflow.manifest.version > meta.txt
# can't use meta variables in stageAs
# don't want to use renameTo because it's destructive for the input
cp -L $grch37_king ${grch37_king_meta.build}_${grch37_king_meta.id}.king.cutoff.out.id
cp -L $grch38_king ${grch38_king_meta.build}_${grch38_king_meta.id}.king.cutoff.out.id
rm $grch37_king $grch38_king
tar --dereference -acf pgsc_calc.tar.zst *
cat <<-END_VERSIONS > versions.yml
Expand Down
17 changes: 13 additions & 4 deletions modules/local/ancestry/extract_database.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,27 @@ process EXTRACT_DATABASE {

output:
tuple val(meta38), path("GRCh38_*_ALL.pgen"), path("GRCh38_*_ALL.psam"), path("GRCh38_*_ALL.pvar.zst"), emit: grch38, optional: true
tuple val(meta38), path("deg2_hg38.king.cutoff.out.id"), emit: grch38_king, optional: true
tuple val(meta38), path("GRCh38_*.king.cutoff.out.id"), emit: grch38_king, optional: true
tuple val(meta37), path("GRCh37_*_ALL.pgen"), path("GRCh37_*_ALL.psam"), path("GRCh37_*_ALL.pvar.zst"), emit: grch37, optional: true
tuple val(meta37), path("deg2_phase3.king.cutoff.out.id"), emit: grch37_king, optional: true
tuple val(meta37), path("GRCh37_*.king.cutoff.out.id"), emit: grch37_king, optional: true
path "versions.yml", emit: versions

script:
meta38 = ['build': 'GRCh38']
meta37 = ['build': 'GRCh37']
def king = params.target_build == "GRCh37" ? "deg2_phase3.king.cutoff.out.id" : "deg2_hg38.king.cutoff.out.id"

"""
tar -xvf $reference --wildcards "${params.target_build}*" $king
tar -xf $reference --wildcards "${params.target_build}*" meta.txt 2> /dev/null
DB_VERSION=\$(cat meta.txt)
if [ "\$DB_VERSION" != "2.0.0-alpha.3" ]; then
echo "Old reference database version detected, please redownload the latest version and try again"
echo "See https://pgsc-calc.readthedocs.io/en/latest/how-to/ancestry.html"
exit 1
else
echo "Database version good"
fi
cat <<-END_VERSIONS > versions.yml
${task.process.tokenize(':').last()}:
Expand Down
18 changes: 9 additions & 9 deletions modules/local/plink2_relabelbim.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ process PLINK2_RELABELBIM {
label "plink2" // controls conda, docker, + singularity options

tag "$meta.id chromosome $meta.chrom"
storeDir ( params.genotypes_cache ? "$params.genotypes_cache/${meta.id}/${params.target_build}/${meta.chrom}" :
"$workDir/genomes/${meta.id}/${params.target_build}/${meta.chrom}/")
storeDir ( params.genotypes_cache ? "$params.genotypes_cache/${meta.id}/${meta.build}/${meta.chrom}" :
"$workDir/genomes/${meta.id}/${meta.build}/${meta.chrom}/")

conda "${task.ext.conda}"

Expand All @@ -20,9 +20,9 @@ process PLINK2_RELABELBIM {
tuple val(meta), path(geno), path(variants), path(pheno)

output:
tuple val(meta), path("*.bed"), emit: geno
tuple val(meta), path("*.zst"), emit: variants
tuple val(meta), path("*.fam"), emit: pheno
tuple val(meta), path("${meta.build}_*.bed"), emit: geno
tuple val(meta), path("${meta.build}_*.zst"), emit: variants
tuple val(meta), path("${meta.build}_*.fam"), emit: pheno
tuple val(meta), path("*.vmiss.gz"), emit: vmiss
path "versions.yml" , emit: versions

Expand All @@ -33,7 +33,7 @@ process PLINK2_RELABELBIM {
script:
def args = task.ext.args ?: ''
def compressed = variants.getName().endsWith("zst") ? 'vzs' : ''
def prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}_" : "${meta.id}_"
def prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}"
def mem_mb = task.memory.toMega() // plink is greedy
// if dropping multiallelic variants, set a generic ID that won't match
def set_ma_missing = params.keep_multiallelic ? '' : '--var-id-multi @:#'
Expand All @@ -48,11 +48,11 @@ process PLINK2_RELABELBIM {
$set_ma_missing \\
--bfile ${geno.baseName} $compressed \\
--make-just-bim zs \\
--out ${params.target_build}_${prefix}${meta.chrom}
--out ${meta.build}_${prefix}_${meta.chrom}
# cross platform (mac, linux) method of preserving symlinks
cp -a $geno ${params.target_build}_${prefix}${meta.chrom}.bed
cp -a $pheno ${params.target_build}_${prefix}${meta.chrom}.fam
cp -a $geno ${meta.build}_${prefix}_${meta.chrom}.bed
cp -a $pheno ${meta.build}_${prefix}_${meta.chrom}.fam
gzip *.vmiss
cat <<-END_VERSIONS > versions.yml
Expand Down
19 changes: 10 additions & 9 deletions modules/local/plink2_relabelpvar.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ process PLINK2_RELABELPVAR {

tag "$meta.id chromosome $meta.chrom"

storeDir ( params.genotypes_cache ? "$params.genotypes_cache/${meta.id}/${params.target_build}/${meta.chrom}" :
"$workDir/genomes/${meta.id}/${params.target_build}/${meta.chrom}/")
storeDir ( params.genotypes_cache ? "$params.genotypes_cache/${meta.id}/${meta.build}/${meta.chrom}" :
"$workDir/genomes/${meta.id}/${meta.build}/${meta.chrom}/")

conda "${task.ext.conda}"

Expand All @@ -21,9 +21,9 @@ process PLINK2_RELABELPVAR {
tuple val(meta), path(geno), path(pheno), path(variants)

output:
tuple val(meta), path("*.pgen"), emit: geno
tuple val(meta), path("*.zst") , emit: variants
tuple val(meta), path("*.psam"), emit: pheno
tuple val(meta), path("${meta.build}_*.pgen"), emit: geno
tuple val(meta), path("${meta.build}_*.pvar.zst") , emit: variants
tuple val(meta), path("${meta.build}_*.psam"), emit: pheno
tuple val(meta), path("*.vmiss.gz"), emit: vmiss
path "versions.yml" , emit: versions

Expand All @@ -34,7 +34,7 @@ process PLINK2_RELABELPVAR {
script:
def args = task.ext.args ?: ''
def compressed = variants.getName().endsWith("zst") ? 'vzs' : ''
def prefix = task.ext.suffix ? "${meta.id}_${task.ext.suffix}_" : "${meta.id}_"
def prefix = task.ext.suffix ? "${meta.id}_${task.ext.suffix}" : "${meta.id}"
def mem_mb = task.memory.toMega() // plink is greedy
// if dropping multiallelic variants, set a generic ID that won't match
def set_ma_missing = params.keep_multiallelic ? '' : '--var-id-multi @:#'
Expand All @@ -49,11 +49,12 @@ process PLINK2_RELABELPVAR {
$set_ma_missing \\
--pfile ${geno.baseName} $compressed \\
--make-just-pvar zs \\
--out ${params.target_build}_${prefix}${meta.chrom}
--out ${meta.build}_${prefix}_${meta.chrom}
# cross platform (mac, linux) method of preserving symlinks
cp -a $geno ${params.target_build}_${prefix}${meta.chrom}.pgen
cp -a $pheno ${params.target_build}_${prefix}${meta.chrom}.psam
cp -a $geno ${meta.build}_${prefix}_${meta.chrom}.pgen
cp -a $pheno ${meta.build}_${prefix}_${meta.chrom}.psam
gzip *.vmiss
cat <<-END_VERSIONS > versions.yml
Expand Down
16 changes: 8 additions & 8 deletions modules/local/plink2_vcf.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ process PLINK2_VCF {

tag "$meta.id chromosome $meta.chrom"

storeDir ( params.genotypes_cache ? "$params.genotypes_cache/${meta.id}/${params.target_build}/${meta.chrom}" :
"$workDir/genomes/${meta.id}/${params.target_build}/${meta.chrom}/")
storeDir ( params.genotypes_cache ? "$params.genotypes_cache/${meta.id}/${meta.build}/${meta.chrom}" :
"$workDir/genomes/${meta.id}/${meta.build}/${meta.chrom}/")

conda "${task.ext.conda}"

Expand All @@ -20,15 +20,15 @@ process PLINK2_VCF {
tuple val(meta), path(vcf)

output:
tuple val(newmeta), path("*.pgen"), emit: pgen
tuple val(newmeta), path("*.psam"), emit: psam
tuple val(newmeta), path("*.zst") , emit: pvar
tuple val(newmeta), path("*.vmiss.gz"), emit: vmiss
tuple val(newmeta), path("${meta.build}_*.pgen"), emit: pgen
tuple val(newmeta), path("${meta.build}_*.psam"), emit: psam
tuple val(newmeta), path("${meta.build}_*.zst") , emit: pvar
tuple val(newmeta), path("${meta.build}_*.vmiss.gz"), emit: vmiss
path "versions.yml" , emit: versions

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}_"
def prefix = task.ext.suffix ? "${meta.id}_${task.ext.suffix}" : "${meta.id}"
def mem_mb = task.memory.toMega()
def dosage_options = meta.vcf_import_dosage ? 'dosage=DS' : ''
// rewriting genotypes, so use --max-alleles instead of using generic ID
Expand All @@ -48,7 +48,7 @@ process PLINK2_VCF {
--vcf $vcf $dosage_options \\
--allow-extra-chr $chrom_filter \\
--make-pgen vzs \\
--out ${params.target_build}_${prefix}${meta.chrom}
--out ${meta.build}_${prefix}_${meta.chrom}_vcf
gzip *.vmiss
Expand Down
24 changes: 13 additions & 11 deletions subworkflows/local/ancestry/bootstrap_ancestry.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Create a database containing reference data required for ancestry inference
//
include { SETUP_RESOURCE } from '../../../modules/local/ancestry/bootstrap/setup_resource'
include { PLINK2_RELABELPVAR } from '../../../modules/local/plink2_relabelpvar'
include { PLINK2_RELABELPVAR as BOOTSTRAP_RELABEL } from '../../../modules/local/plink2_relabelpvar'
include { MAKE_DATABASE } from '../../../modules/local/ancestry/bootstrap/make_database'

workflow BOOTSTRAP_ANCESTRY {
Expand Down Expand Up @@ -33,11 +33,11 @@ workflow BOOTSTRAP_ANCESTRY {

SETUP_RESOURCE.out.plink.dump( tag: 'ref_setup' )

PLINK2_RELABELPVAR( SETUP_RESOURCE.out.plink )
ch_versions = ch_versions.mix(PLINK2_RELABELPVAR.out.versions.first())
BOOTSTRAP_RELABEL( SETUP_RESOURCE.out.plink )
ch_versions = ch_versions.mix(BOOTSTRAP_RELABEL.out.versions.first())

PLINK2_RELABELPVAR.out.geno
.concat(PLINK2_RELABELPVAR.out.pheno, PLINK2_RELABELPVAR.out.variants)
BOOTSTRAP_RELABEL.out.geno
.concat(BOOTSTRAP_RELABEL.out.pheno, BOOTSTRAP_RELABEL.out.variants)
.dump(tag: 'ancestry_relabelled')
.set { relabelled }

Expand All @@ -47,12 +47,14 @@ workflow BOOTSTRAP_ANCESTRY {
.groupTuple(size: 3)
.dump(tag: 'ancestry_relabelled_grouped')
.map { drop_meta_keys(it).flatten() }
.set{ relabelled_flat }
.set{ relabelled_flat }

ref.king
.map { drop_meta_keys(it) }
// dropping meta keys simplifies the join
.join( relabelled_flat )
ref.king.branch {
GRCh37: it[0].build == "GRCh37"
GRCh38: it[0].build == "GRCh38"
}.set { ch_king }

relabelled_flat
.flatten()
.filter(Path)
.collect()
Expand All @@ -62,7 +64,7 @@ workflow BOOTSTRAP_ANCESTRY {
Channel.fromPath(params.ancestry_checksums, checkIfExists: true)
.set { ch_checksums }

MAKE_DATABASE( ch_raw_ref, ch_checksums )
MAKE_DATABASE( ch_raw_ref, ch_king.GRCh37, ch_king.GRCh38, ch_checksums )
ch_versions = ch_versions.mix(MAKE_DATABASE.out.versions)

emit:
Expand Down
2 changes: 1 addition & 1 deletion tests/modules/plink2/relabelbim/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ workflow testrelabelbim {
bim = file('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim')
bed = file('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bed')
fam = file('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.fam')
def meta = [id: 'test', is_bfile: true]
def meta = [id: 'test', build: 'GRCh37', is_bfile: true, chrom: 22]

PLINK2_RELABELBIM( Channel.of([meta, bed, bim, fam]) )
}
6 changes: 3 additions & 3 deletions tests/modules/plink2/relabelbim/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
- fast
- module
files:
- path: output/plink2/GRCh37_test_null.bed
- path: output/plink2/GRCh37_test_22.bed
md5sum: a8be76ae3301d395563784fcbd571ae2
- path: output/plink2/GRCh37_test_null.bim.zst
- path: output/plink2/GRCh37_test_null.fam
- path: output/plink2/GRCh37_test_22.bim.zst
- path: output/plink2/GRCh37_test_22.fam
md5sum: 8915d48959a21e827d1db1b192422ba1
- path: output/plink2/versions.yml
contains:
Expand Down
2 changes: 1 addition & 1 deletion tests/modules/plink2/relabelpvar/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ include { PLINK2_RELABELPVAR } from '../../../../modules/local/plink2_relabelpva

workflow testrelabelpvar {
vcf = file('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.vcf.gz')
def meta = [id: 'test', chrom: 22]
def meta = [id: 'test', 'build': 'GRCh37', chrom: 22]

PLINK2_VCF(Channel.of([meta, vcf]))

Expand Down
6 changes: 3 additions & 3 deletions tests/modules/plink2/relabelpvar/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
- fast
- module
files:
- path: output/plink2/GRCh37_test_22.psam
- path: output/plink2/GRCh37_test_22_vcf.psam
md5sum: 90f1430b71153d59bc14e9499b0366f4
- path: output/plink2/GRCh37_test_22.pgen
- path: output/plink2/GRCh37_test_22_vcf.pgen
md5sum: be32a51a5509111327a5deb6a3610b2d
- path: output/plink2/GRCh37_test_22.pvar.zst
- path: output/plink2/GRCh37_test_22_vcf.pvar.zst
- path: output/plink2/versions.yml
contains:
- "plink2: 2.00a3.3"
2 changes: 1 addition & 1 deletion tests/modules/plink2/vcf/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ include { PLINK2_VCF } from '../../../../modules/local/plink2_vcf'

workflow testvcf {
vcf = file('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.vcf.gz')
def meta = [id: 'test', is_vcf: true, chrom: '22']
def meta = [id: 'test', is_vcf: true, build: 'GRCh37', chrom: '22']

PLINK2_VCF(Channel.of([meta, vcf]))

Expand Down
6 changes: 3 additions & 3 deletions tests/modules/plink2/vcf/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
- plink2
- fast
files:
- path: output/plink2/GRCh37_vcf_22.pgen
- path: output/plink2/GRCh37_vcf_22.psam
- path: output/plink2/GRCh37_vcf_22.pvar.zst
- path: output/plink2/GRCh37_test_22_vcf.pgen
- path: output/plink2/GRCh37_test_22_vcf.psam
- path: output/plink2/GRCh37_test_22_vcf.pvar.zst
- path: output/plink2/versions.yml
contains:
- "plink2: 2.00a3.3"
8 changes: 4 additions & 4 deletions tests/subworkflows/test_make_compatible.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
files:
- path: output/samplesheet/out.json
- path: output/combine/scorefiles.txt.gz
- path: output/plink2/GRCh37_vcf_22.pgen
- path: output/plink2/GRCh37_vcf_22.pvar.zst
- path: output/plink2/GRCh37_vcf_22.psam
- path: output/plink2/GRCh37_vcf_22.vmiss.gz
- path: output/plink2/GRCh37_cineca_22_vcf.pgen
- path: output/plink2/GRCh37_cineca_22_vcf.pvar.zst
- path: output/plink2/GRCh37_cineca_22_vcf.psam
- path: output/plink2/GRCh37_cineca_22_vcf.vmiss.gz

- name: test make compatible subworkflow with pfile
command: nextflow run main.nf --only_compatible -c ./tests/config/nextflow.config
Expand Down

0 comments on commit eb97ffd

Please sign in to comment.