diff --git a/.github/workflows/ancestry-conda.yml b/.github/workflows/ancestry-conda.yml new file mode 100644 index 00000000..532f6ae1 --- /dev/null +++ b/.github/workflows/ancestry-conda.yml @@ -0,0 +1,74 @@ +name: Run ancestry test with mamba profile + +on: + workflow_call: + inputs: + ancestry-cache-key: + type: string + required: true + +jobs: + test_mamba_ancestry: + runs-on: ubuntu-latest + defaults: + run: + shell: bash -el {0} + + steps: + - name: Check out pipeline code + uses: actions/checkout@v3 + + - name: Set environment variables + run: | + echo "ANCESTRY_REF_DIR=$RUNNER_TEMP" >> $GITHUB_ENV + echo "ANCESTRY_TARGET_DIR=$RUNNER_TEMP" >> $GITHUB_ENV + + - name: Restore reference data + uses: actions/cache/restore@v3 + with: + path: | + ${{ env.ANCESTRY_TARGET_DIR }}/GRCh38_HAPNEST_TARGET_ALL.pgen + ${{ env.ANCESTRY_TARGET_DIR }}/GRCh38_HAPNEST_TARGET_ALL.psam + ${{ env.ANCESTRY_TARGET_DIR }}/GRCh38_HAPNEST_TARGET_ALL.pvar.zst + ${{ env.ANCESTRY_REF_DIR }}/GRCh38_HAPNEST_reference.tar.zst + key: ${{ inputs.ancestry-cache-key }} + fail-on-cache-miss: true + + - uses: conda-incubator/setup-miniconda@v2 + with: + channels: conda-forge,bioconda,defaults + miniforge-variant: Mambaforge + miniforge-version: latest + python-version: "3.10" + + - uses: actions/setup-java@v3 + with: + distribution: 'corretto' + java-version: '17' + + - name: install nxf + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - name: Set up test requirements + uses: actions/setup-python@v3 + with: + python-version: '3.10' + cache: 'pip' + + - run: pip install -r ${{ github.workspace }}/tests/requirements.txt + + - name: Run ancestry test + run: TMPDIR=~ PROFILE=mamba pytest --kwdof --symlink --git-aware --wt 2 --tag "ancestry" --ignore tests/bin + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v3 + with: + name: logs-conda-ancestry + path: | + /home/runner/pytest_workflow_*/*/.nextflow.log + /home/runner/pytest_workflow_*/*/log.out + /home/runner/pytest_workflow_*/*/log.err + /home/runner/pytest_workflow_*/*/output/* diff --git a/.github/workflows/ancestry-vcf.yml b/.github/workflows/ancestry-vcf.yml new file mode 100644 index 00000000..5454433d --- /dev/null +++ b/.github/workflows/ancestry-vcf.yml @@ -0,0 +1,160 @@ +name: Run ancestry test with singularity or docker profiles with VCF input + +on: + workflow_call: + inputs: + container-cache-key: + type: string + required: true + ancestry-cache-key: + type: string + required: true + docker: + type: boolean + singularity: + type: boolean + +env: + NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/singularity + SINGULARITY_VERSION: 3.8.3 + +jobs: + docker: + if: ${{ inputs.docker }} + runs-on: ubuntu-latest + + steps: + - name: Set environment variables + run: | + echo "ANCESTRY_REF_DIR=$RUNNER_TEMP" >> $GITHUB_ENV + echo "ANCESTRY_TARGET_DIR=$RUNNER_TEMP" >> $GITHUB_ENV + + - name: Check out pipeline code + uses: actions/checkout@v3 + + - uses: nf-core/setup-nextflow@v1 + + - name: Restore docker images + id: restore-docker + uses: actions/cache/restore@v3 + with: + path: ${{ runner.temp }}/docker + key: ${{ inputs.container-cache-key }} + fail-on-cache-miss: true + + - name: Load docker images from cache + run: | + find $HOME -name '*.tar' + find ${{ runner.temp }}/docker/ -name '*.tar' -exec sh -c 'docker load < {}' \; + + - name: Restore reference data + uses: actions/cache/restore@v3 + with: + path: | + ${{ env.ANCESTRY_TARGET_DIR }}/GRCh38_HAPNEST_TARGET_ALL.pgen + ${{ 
env.ANCESTRY_TARGET_DIR }}/GRCh38_HAPNEST_TARGET_ALL.psam
+          ${{ env.ANCESTRY_TARGET_DIR }}/GRCh38_HAPNEST_TARGET_ALL.pvar.zst
+          ${{ env.ANCESTRY_REF_DIR }}/GRCh38_HAPNEST_reference.tar.zst
+        key: ${{ inputs.ancestry-cache-key }}
+        fail-on-cache-miss: true
+
+      - name: Install plink2 to recode
+        run: sudo apt-get install -y plink2
+
+      - name: Recode VCF
+        run: plink2 --pfile ${ANCESTRY_TARGET_DIR}/GRCh38_HAPNEST_TARGET_ALL vzs --export vcf bgz --out ${ANCESTRY_TARGET_DIR}/GRCh38_HAPNEST_TARGET_ALL
+
+      - name: Set up test requirements
+        uses: actions/setup-python@v3
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+
+      - run: pip install -r ${{ github.workspace }}/tests/requirements.txt
+
+      - name: Run ancestry test
+        run: TMPDIR=~ PROFILE=docker pytest --kwdof --symlink --git-aware --wt 2 --tag "ancestry vcf" --ignore tests/bin
+
+      - name: Upload logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v3
+        with:
+          name: logs-docker-ancestry
+          path: |
+            /home/runner/pytest_workflow_*/*/.nextflow.log
+            /home/runner/pytest_workflow_*/*/log.out
+            /home/runner/pytest_workflow_*/*/log.err
+            /home/runner/pytest_workflow_*/*/output/*
+
+  singularity:
+    if: ${{ inputs.singularity }}
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Set environment variables
+        run: |
+          echo "ANCESTRY_REF_DIR=$RUNNER_TEMP" >> $GITHUB_ENV
+          echo "ANCESTRY_TARGET_DIR=$RUNNER_TEMP" >> $GITHUB_ENV
+
+      - name: Check out pipeline code
+        uses: actions/checkout@v3
+
+      - uses: nf-core/setup-nextflow@v1
+
+      - name: Restore singularity setup
+        id: restore-singularity-setup
+        uses: actions/cache@v3
+        with:
+          path: /opt/hostedtoolcache/singularity/${{ env.SINGULARITY_VERSION }}/x64
+          key: ${{ runner.os }}-singularity-${{ env.SINGULARITY_VERSION }}
+          fail-on-cache-miss: true
+
+      - name: Add singularity to path
+        run: |
+          echo "/opt/hostedtoolcache/singularity/${{ env.SINGULARITY_VERSION }}/x64/bin" >> $GITHUB_PATH
+
+      - name: Restore singularity container images
+        id: restore-singularity
+        uses: actions/cache@v3
+        with:
+          path: ${{ env.NXF_SINGULARITY_CACHEDIR }}
+          key: ${{ inputs.container-cache-key }}
+
+      - name: Restore reference data
+        uses: actions/cache/restore@v3
+        with:
+          path: |
+            ${{ env.ANCESTRY_TARGET_DIR }}/GRCh38_HAPNEST_TARGET_ALL.pgen
+            ${{ env.ANCESTRY_TARGET_DIR }}/GRCh38_HAPNEST_TARGET_ALL.psam
+            ${{ env.ANCESTRY_TARGET_DIR }}/GRCh38_HAPNEST_TARGET_ALL.pvar.zst
+            ${{ env.ANCESTRY_REF_DIR }}/GRCh38_HAPNEST_reference.tar.zst
+          key: ${{ inputs.ancestry-cache-key }}
+          fail-on-cache-miss: true
+
+      - name: Install plink2 to recode
+        run: sudo apt-get install -y plink2
+
+      - name: Recode VCF
+        run: plink2 --pfile ${ANCESTRY_TARGET_DIR}/GRCh38_HAPNEST_TARGET_ALL vzs --export vcf bgz --out ${ANCESTRY_TARGET_DIR}/GRCh38_HAPNEST_TARGET_ALL
+
+      - name: Set up test requirements
+        uses: actions/setup-python@v3
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+
+      - run: pip install -r ${{ github.workspace }}/tests/requirements.txt
+
+      - name: Run ancestry test
+        run: TMPDIR=~ PROFILE=singularity pytest --kwdof --symlink --git-aware --wt 2 --tag "ancestry vcf" --ignore tests/bin
+
+      - name: Upload logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v3
+        with:
+          name: logs-singularity-ancestry
+          path: |
+            /home/runner/pytest_workflow_*/*/.nextflow.log
+            /home/runner/pytest_workflow_*/*/log.out
+            /home/runner/pytest_workflow_*/*/log.err
+            /home/runner/pytest_workflow_*/*/output/*
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ee98360f..b5acc420 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,6 +9,7 @@ on:
     branches:
       - dev
       - main
+      - fix_vcf
   release:
     types: [published]
 
@@ -123,3 +124,19 @@ jobs:
       container-cache-key: ${{ needs.preload_singularity.outputs.cache-key }}
       ancestry-cache-key: ${{ needs.preload_ancestry.outputs.cache-key }}
       singularity: true
+
+  ancestry_vcf_docker:
+    needs: [preload_ancestry, preload_docker]
+    uses: ./.github/workflows/ancestry-vcf.yml
+    with:
+      container-cache-key: ${{ needs.preload_docker.outputs.cache-key }}
+      ancestry-cache-key: ${{ needs.preload_ancestry.outputs.cache-key }}
+      docker: true
+
+  ancestry_vcf_singularity:
+    needs: [preload_ancestry, preload_singularity]
+    uses: ./.github/workflows/ancestry-vcf.yml
+    with:
+      container-cache-key: ${{ needs.preload_singularity.outputs.cache-key }}
+      ancestry-cache-key: ${{ needs.preload_ancestry.outputs.cache-key }}
+      singularity: true
diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml
index 87e77667..8b627f5a 100644
--- a/.github/workflows/conda.yml
+++ b/.github/workflows/conda.yml
@@ -1,4 +1,4 @@
-name: test conda on publish
+name: test conda profiles on demand and on publish
 
 on:
   release:
@@ -6,33 +6,50 @@ on:
   workflow_dispatch:
 
 jobs:
+  preload_ancestry:
+    uses: ./.github/workflows/preload-reference.yml
+
+  test_mamba_ancestry:
+    uses: ./.github/workflows/ancestry-conda.yml
+    needs: [preload_ancestry]
+    with:
+      ancestry-cache-key: ${{ needs.preload_ancestry.outputs.cache-key }}
+
   test_mamba_standard:
     runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -el {0}
     strategy:
       fail-fast: false
       matrix:
         test_profile: ["test"]
         profile: ["mamba"]
-        nxf_ver: ["22.10.0", "latest"]
+        nxf_ver: ["22.10.0", ""]
+
+    env:
+      NXF_VER: ${{ matrix.nxf_ver }}
 
     steps:
      - name: Check out pipeline code
        uses: actions/checkout@v3
 
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          channels: conda-forge,bioconda,defaults
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          python-version: "3.10"
+
       - uses: actions/setup-java@v3
         with:
           distribution: 'corretto'
           java-version: '17'
 
-      - uses: nf-core/setup-nextflow@v1
-        with:
-          version: ${{ matrix.nxf_ver }}
-
-      - uses: conda-incubator/setup-miniconda@v2
-        with:
-          miniforge-variant: Mambaforge
-          miniforge-version: latest
-          channels: conda-forge,bioconda,defaults
+      - name: install nxf
+        run: |
+          wget -qO- get.nextflow.io | bash
+          sudo mv nextflow /usr/local/bin/
 
       - name: Run pipeline with test data
         run: |
diff --git a/.github/workflows/standard-test.yml b/.github/workflows/standard-test.yml
index f8f61872..881f68ec 100644
--- a/.github/workflows/standard-test.yml
+++ b/.github/workflows/standard-test.yml
@@ -14,6 +14,7 @@ on:
 env:
   NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/singularity
   SINGULARITY_VERSION: 3.8.3
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
 jobs:
   docker:
diff --git a/RELEASE-CHECKLIST.md b/RELEASE-CHECKLIST.md
index 2abc6f5b..b12a8d27 100644
--- a/RELEASE-CHECKLIST.md
+++ b/RELEASE-CHECKLIST.md
@@ -31,6 +31,10 @@
 - [ ] Has the changelog been updated?
 - [ ] Update the nextflow schema
 
+# Reference panels
+- [ ] Did anything change in the modules for creating the reference panel? Bump ref_format_version in nextflow.config (see the sketch below)
+  - [ ] Publish new reference panels to FTP, update any documentation.
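For context, the version pin that checklist item refers to is sketched below; the parameter name and default value come from the nextflow.config hunk later in this diff, while the comment is illustrative rather than the repository's own:

    params {
        // bump whenever the reference panel creation modules change, so that
        // EXTRACT_DATABASE can detect and reject stale reference archives
        ref_format_version = "v0.1"
    }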
+
 # Tests
 - [ ] Make sure unit tests pass on singularity, docker, and conda (CI)
diff --git a/assets/examples/samplesheet.csv b/assets/examples/samplesheet.csv
index e4221e60..65fd9dc1 100644
--- a/assets/examples/samplesheet.csv
+++ b/assets/examples/samplesheet.csv
@@ -1,2 +1,2 @@
 sampleset,path_prefix,chrom,format
-cineca,assets/examples/target_genomes/cineca_synthetic_subset,22,pfile
\ No newline at end of file
+cineca,target_genomes/cineca_synthetic_subset,22,pfile
\ No newline at end of file
diff --git a/assets/examples/samplesheet_bfile.csv b/assets/examples/samplesheet_bfile.csv
index 0ac2241d..19f83aba 100644
--- a/assets/examples/samplesheet_bfile.csv
+++ b/assets/examples/samplesheet_bfile.csv
@@ -1,2 +1,2 @@
 sampleset,path_prefix,chrom,format
-cineca,assets/examples/target_genomes/cineca_synthetic_subset,22,bfile
\ No newline at end of file
+cineca,target_genomes/cineca_synthetic_subset,22,bfile
\ No newline at end of file
diff --git a/assets/examples/samplesheet_vcf.csv b/assets/examples/samplesheet_vcf.csv
index 553e907a..80642835 100644
--- a/assets/examples/samplesheet_vcf.csv
+++ b/assets/examples/samplesheet_vcf.csv
@@ -1,2 +1,2 @@
 sampleset,path_prefix,chrom,format
-cineca,assets/examples/target_genomes/cineca_synthetic_subset,22,vcf
\ No newline at end of file
+cineca,target_genomes/cineca_synthetic_subset,22,vcf
\ No newline at end of file
diff --git a/assets/report/report.qmd b/assets/report/report.qmd
index 3001f6a0..dda84749 100644
--- a/assets/report/report.qmd
+++ b/assets/report/report.qmd
@@ -293,12 +293,23 @@ if(params$run_ancestry == TRUE){
 ```{r colour_palette, echo = FALSE, eval=params$run_ancestry}
 # source: https://github.com/PGScatalog/PGS_Catalog/blob/master/catalog/static/catalog/pgs.scss#L2493-L2520
 # $ancestry_colours
-thousand_genomes_colours <- c("#FFD900", "#E41A1C", "#B15928", "#4DAF4A",
-                              "#377EB8", "#00CED1", "#984EA3", "#A6CEE3",
-                              "#FF7F00", "#BBB", "#999")
-names(thousand_genomes_colours) <- c("AFR", "AMR", "ASN", "EAS", "EUR", "GME",
-                                     "SAS", "MAE", "MAO", "NR", "OTH")
-thousand_genomes_palette <- scale_colour_manual(name = "Populations", values = thousand_genomes_colours)
+if (params$reference_panel_name == '1000G') {
+  thousand_genomes_colours <- c("#FFD900", "#E41A1C", "#B15928", "#4DAF4A",
+                                "#377EB8", "#00CED1", "#984EA3", "#A6CEE3",
+                                "#FF7F00", "#BBB", "#999")
+  names(thousand_genomes_colours) <- c("AFR", "AMR", "ASN", "EAS",
+                                       "EUR", "GME", "SAS", "MAE",
+                                       "MAO", "NR", "OTH")
+  current_population_palette <- scale_colour_manual(name = "Populations", values = thousand_genomes_colours)
+} else if (params$reference_panel_name == 'HGDP+1kGP') {
+  gnomAD_pop_colours <- c("#97519d", "#e42523", "#f67e1e", "#48b24b",
+                          "#3280bb", "#a65528", "#9a9c9b")
+  names(gnomAD_pop_colours) <- c("AFR", "AMR", "CSA", "EAS",
+                                 "EUR", "MID", "OCE")
+  current_population_palette <- scale_colour_manual(name = "Populations", values = gnomAD_pop_colours)
+} else {
+  current_population_palette <- scale_colour_brewer(palette="Set3")
+}
 ```
 
 ```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
@@ -321,7 +332,7 @@ for(pc in seq.int(1,5,2)){
   if (pcX %in% colnames(popsim)){
     p_pca <- ggplot(popsim[popsim$REFERENCE == TRUE,], aes(x=!!sym(pcX), y=!!sym(pcY))) + geom_point(aes(colour=SuperPop, shape=slabel), alpha=0.25)
     p_pca <- p_pca + geom_point(data=popsim[popsim$REFERENCE != TRUE,], aes(color=MostSimilarPop, shape=slabel))
-    p_pca <- p_pca + theme_bw() + thousand_genomes_palette + scale_shape_manual(values=map_shapes, name='sampleset')
+    p_pca <- p_pca + theme_bw() + current_population_palette + scale_shape_manual(values=map_shapes, name='sampleset')
     print(p_pca)
   }
 }
@@ -492,4 +503,4 @@ For scores from the PGS Catalog, please remember to cite the original publicatio
 
 > PGS Catalog Calculator (in development). PGS Catalog Team. `https://github.com/PGScatalog/pgsc_calc`
 
-> Lambert et al. (2021) The Polygenic Score Catalog as an open database for reproducibility and systematic evaluation. Nature Genetics. 53:420–425 doi:10.1038/s41588-021-00783-5.
\ No newline at end of file
+> Lambert et al. (2021) The Polygenic Score Catalog as an open database for reproducibility and systematic evaluation. Nature Genetics. 53:420–425 doi:10.1038/s41588-021-00783-5.
diff --git a/assets/schemas/samplesheet.json b/assets/schemas/samplesheet.json
index 87afd724..d1fe85f1 100644
--- a/assets/schemas/samplesheet.json
+++ b/assets/schemas/samplesheet.json
@@ -11,8 +11,8 @@
     "properties": {
       "sampleset": {
         "type": "string",
-        "pattern": "^\\S+$",
-        "description": "Sampleset name must be provided and cannot contain spaces"
+        "pattern": "^[a-zA-Z0-9]+$",
+        "description": "Sampleset name must be provided and cannot contain spaces or reserved characters ('_' or '.')"
       },
       "path": {
         "description": "A list of resolved target genome file paths",
diff --git a/conf/base.config b/conf/base.config
index d2a485ce..c4a972a2 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -51,4 +51,7 @@ process {
     withName:DUMPSOFTWAREVERSIONS {
         cache = false
     }
+    withLabel:plink2 {
+        memory = { check_max( 16.GB * task.attempt, 'memory' ) }
+    }
 }
diff --git a/conf/modules.config b/conf/modules.config
index 2d0e267e..eef253af 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -38,8 +38,8 @@ process {
         ext.conda = "$projectDir/environments/pgscatalog_utils/environment.yml"
         ext.docker = 'ghcr.io/pgscatalog/pgscatalog_utils'
         ext.singularity = 'oras://ghcr.io/pgscatalog/pgscatalog_utils'
-        ext.docker_version = ':v0.4.1'
-        ext.singularity_version = ':v0.4.1-singularity'
+        ext.docker_version = ':v0.4.2'
+        ext.singularity_version = ':v0.4.2-singularity'
     }
 
     withLabel: plink2 {
diff --git a/conf/test.config b/conf/test.config
index 4ebee4d6..e64e0d95 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -19,13 +19,10 @@ params {
     max_memory = '6.GB'
     max_time = '6.h'
 
-    input = "https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/input_v2.json"
-    format = "json"
-    scorefile = "https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/PGS001229_22.txt"
+    input = "$projectDir/assets/examples/samplesheet.csv"
+    format = "csv"
+    scorefile = "$projectDir/assets/examples/scorefiles/PGS001229_22.txt"
 
-    // TODO: fix local tests with CSV
-    // input = "$projectDir/assets/examples/samplesheet.csv"
-    // scorefile = "$projectDir/assets/examples/scorefiles/PGS001229_22.txt"
     outdir = "$projectDir/results"
     target_build = "GRCh37"
 }
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 29ba8192..4b24045a 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -8,6 +8,26 @@ will only occur in major versions with changes noted in this changelog.
 
 .. _`semantic versioning`: https://semver.org/
 
+pgsc_calc v2.0.0-alpha.3 (2023-10-02)
+-------------------------------------
+
+Improvements:
+
+* Automatically retry scoring with more RAM on larger datasets
+* Describe scoring precision in docs
+* Change handling of VCFs to reduce errors when recoding
+* Internal changes to improve support for custom reference panels
+
+Bug fixes:
+
+* Fix VCF input to ancestry projection subworkflow (thanks `@frahimov`_ and `@AWS-crafter`_ for patiently debugging)
+* Fix scoring options when reading allelic frequencies from a reference panel (thanks `@raimondsre`_ for reporting the changes from v1.3.2 -> 2.0.0-alpha)
+* Fix conda profile action
+
+.. _`@frahimov`: https://github.com/PGScatalog/pgsc_calc/issues/172
+.. _`@AWS-crafter`: https://github.com/PGScatalog/pgsc_calc/issues/155
+.. _`@raimondsre`: https://github.com/PGScatalog/pgsc_calc/pull/139#issuecomment-1736313211
+
 pgsc_calc v2.0.0-alpha.1 (2023-08-11)
 -------------------------------------
diff --git a/docs/explanation/output.rst b/docs/explanation/output.rst
index 7d8c26eb..c9216b47 100644
--- a/docs/explanation/output.rst
+++ b/docs/explanation/output.rst
@@ -34,6 +34,10 @@ If you have run the pipeline **without** using ancestry information the followin
 - ``AVG``: normalizes ``SUM`` by the ``DENOM`` field (displayed when you calculate the PGS on a small
   sample size n<50 to avoid using unreliable allele frequency estimates for missing genotypes in the
   target sample).
 
+.. note:: The PGS ``SUM`` & ``AVG`` are rounded to a precision of 6 decimal places in the output of the PLINK2_SCORE
+   commands; however, the calculation of the PGS is based on the full precision of the effect_weight value in the
+   scoring file.
+
 If you have run the pipeline **using ancestry information** (``--run_ancestry``) the following columns
 may be present depending on the ancestry adjustments that were run (see :ref:`norm` for more details):
diff --git a/docs/how-to/prepare.rst b/docs/how-to/prepare.rst
index eb2fe4eb..d74427bc 100644
--- a/docs/how-to/prepare.rst
+++ b/docs/how-to/prepare.rst
@@ -16,8 +16,8 @@ Target genome data requirements
 - Only human chromosomes 1 -- 22, X, Y, and XY are supported by the pipeline,
   although sex chromosomes are rarely used in scoring files.
 
-- If input data contain other chromosomes (e.g. pseudoautosomal regions) then
-  the pipeline will probably complain loudly and stop calculating.
+- If input data contain other chromosomes (e.g. patch regions) then
+  the pipeline may complain loudly and stop calculating.
 
 Supported file formats
@@ -41,11 +41,17 @@ VCF from an imputation server
     plink2 --vcf \
       --allow-extra-chr \
      --chr 1-22, X, Y, XY \
-      --make-pgen --out <1000G>_axy
+      --make-pgen --out _axy
+
+.. note:: Non-standard chromosomes/patches should not cause errors in versions >v2.0.0-alpha.3;
+   however, they will be filtered out of the list of variants available for PGS scoring.
 
 VCF from WGS
 ------------
 
+See https://github.com/PGScatalog/pgsc_calc/discussions/123 for a discussion of tools
+to convert VCF files into ones suitable for calculating PGS.
+
 ``plink`` binary fileset (bfile)
 --------------------------------
diff --git a/docs/how-to/samplesheet.rst b/docs/how-to/samplesheet.rst
index 05b9a77f..b5f74063 100644
--- a/docs/how-to/samplesheet.rst
+++ b/docs/how-to/samplesheet.rst
@@ -25,11 +25,11 @@ download here <../../assets/examples/samplesheet.csv>`.
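For reference, the bundled example samplesheet referred to in the hunk above now reads as follows (copied from assets/examples/samplesheet.csv as updated in this diff; path_prefix is now resolved relative to the samplesheet's location rather than the project root)::

    sampleset,path_prefix,chrom,format
    cineca,target_genomes/cineca_synthetic_subset,22,pfile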
 There are four mandatory columns:
 
-- **sampleset**: A text string referring to the name of a :term:`target dataset`
-  of genotyping data containing at least one sample/individual (however cohort
-  datasets will often contain many individuals with combined genotyped/imputed
-  data). Data from a sampleset may be input as a single file, or split across
-  chromosomes into multiple files. Scores generated from files with the same
+- **sampleset**: A text string (no spaces or reserved characters ['.' or '_']) referring
+  to the name of a :term:`target dataset` of genotyping data containing at least one
+  sample/individual (however cohort datasets will often contain many individuals with
+  combined genotyped/imputed data). Data from a sampleset may be input as a single file,
+  or split across chromosomes into multiple files. Scores generated from files with the same
   sampleset name are combined in later stages of the analysis.
 
 .. danger::
diff --git a/environments/pgscatalog_utils/environment.yml b/environments/pgscatalog_utils/environment.yml
index a6827249..b2be6204 100644
--- a/environments/pgscatalog_utils/environment.yml
+++ b/environments/pgscatalog_utils/environment.yml
@@ -3,4 +3,4 @@ dependencies:
   - python=3.10
   - pip
   - pip:
-      - pgscatalog_utils==0.4.1
+      - pgscatalog_utils==0.4.2
diff --git a/lib/Utils.groovy b/lib/Utils.groovy
index 28567bd7..8d030f4e 100755
--- a/lib/Utils.groovy
+++ b/lib/Utils.groovy
@@ -21,19 +21,26 @@ class Utils {
         }
 
         // Check that all channels are present
-        def required_channels = ['conda-forge', 'bioconda', 'defaults']
-        def conda_check_failed = !required_channels.every { ch -> ch in channels }
+        // This channel list is ordered by required channel priority.
+        def required_channels_in_order = ['conda-forge', 'bioconda', 'defaults']
+        def channels_missing = ((required_channels_in_order as Set) - (channels as Set)) as Boolean
 
         // Check that they are in the right order
-        conda_check_failed |= !(channels.indexOf('conda-forge') < channels.indexOf('bioconda'))
-        conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults'))
+        def channel_priority_violation = false
+        def n = required_channels_in_order.size()
+        for (int i = 0; i < n - 1; i++) {
+            channel_priority_violation |= !(channels.indexOf(required_channels_in_order[i]) < channels.indexOf(required_channels_in_order[i+1]))
+        }
 
-        if (conda_check_failed) {
+        if (channels_missing | channel_priority_violation) {
             log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
                 " There is a problem with your Conda configuration!\n\n" +
                 " You will need to set-up the conda-forge and bioconda channels correctly.\n" +
-                " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" +
-                " NB: The order of the channels matters!\n" +
+                " Please refer to https://bioconda.github.io/\n" +
+                " The observed channel order is \n" +
+                " ${channels}\n" +
+                " but the following channel order is required:\n" +
+                " ${required_channels_in_order}\n" +
                 "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
         }
     }
diff --git a/modules/local/ancestry/bootstrap/make_database.nf b/modules/local/ancestry/bootstrap/make_database.nf
index dca93503..e6d2aa16 100644
--- a/modules/local/ancestry/bootstrap/make_database.nf
+++ b/modules/local/ancestry/bootstrap/make_database.nf
@@ -14,6 +14,8 @@ process MAKE_DATABASE {
     input:
     path '*'
+    tuple val(grch37_king_meta), path(grch37_king)
+    tuple val(grch38_king_meta), path(grch38_king)
     path checksums
 
     output:
@@ -24,7
+26,13 @@ process MAKE_DATABASE { """ md5sum -c $checksums - echo $workflow.manifest.version > meta.txt + echo ${params.ref_format_version} > meta.txt + + # can't use meta variables in stageAs + # don't want to use renameTo because it's destructive for the input + cp -L $grch37_king ${grch37_king_meta.build}_${grch37_king_meta.id}.king.cutoff.out.id + cp -L $grch38_king ${grch38_king_meta.build}_${grch38_king_meta.id}.king.cutoff.out.id + rm $grch37_king $grch38_king tar --dereference -acf pgsc_calc.tar.zst * diff --git a/modules/local/ancestry/extract_database.nf b/modules/local/ancestry/extract_database.nf index 6778b28a..b7c2fc1b 100644 --- a/modules/local/ancestry/extract_database.nf +++ b/modules/local/ancestry/extract_database.nf @@ -17,18 +17,27 @@ process EXTRACT_DATABASE { output: tuple val(meta38), path("GRCh38_*_ALL.pgen"), path("GRCh38_*_ALL.psam"), path("GRCh38_*_ALL.pvar.zst"), emit: grch38, optional: true - tuple val(meta38), path("deg2_hg38.king.cutoff.out.id"), emit: grch38_king, optional: true + tuple val(meta38), path("GRCh38_*.king.cutoff.out.id"), emit: grch38_king, optional: true tuple val(meta37), path("GRCh37_*_ALL.pgen"), path("GRCh37_*_ALL.psam"), path("GRCh37_*_ALL.pvar.zst"), emit: grch37, optional: true - tuple val(meta37), path("deg2_phase3.king.cutoff.out.id"), emit: grch37_king, optional: true + tuple val(meta37), path("GRCh37_*.king.cutoff.out.id"), emit: grch37_king, optional: true path "versions.yml", emit: versions script: meta38 = ['build': 'GRCh38'] meta37 = ['build': 'GRCh37'] - def king = params.target_build == "GRCh37" ? "deg2_phase3.king.cutoff.out.id" : "deg2_hg38.king.cutoff.out.id" """ - tar -xvf $reference --wildcards "${params.target_build}*" $king + tar -xf $reference --wildcards "${params.target_build}*" meta.txt 2> /dev/null + + DB_VERSION=\$(cat meta.txt) + + if [ "\$DB_VERSION" != "${params.ref_format_version}" ]; then + echo "Old reference database version detected, please redownload the latest version and try again" + echo "See https://pgsc-calc.readthedocs.io/en/latest/how-to/ancestry.html" + exit 1 + else + echo "Database version good" + fi cat <<-END_VERSIONS > versions.yml ${task.process.tokenize(':').last()}: diff --git a/modules/local/match_combine.nf b/modules/local/match_combine.nf index 7484001b..1b84709f 100644 --- a/modules/local/match_combine.nf +++ b/modules/local/match_combine.nf @@ -1,6 +1,7 @@ process MATCH_COMBINE { // labels are defined in conf/modules.config label 'process_medium' + label 'error_retry' label 'pgscatalog_utils' // controls conda, docker, + singularity options // first element of tag must be sampleset diff --git a/modules/local/plink2_relabelbim.nf b/modules/local/plink2_relabelbim.nf index d495960f..beb487da 100644 --- a/modules/local/plink2_relabelbim.nf +++ b/modules/local/plink2_relabelbim.nf @@ -5,8 +5,8 @@ process PLINK2_RELABELBIM { label "plink2" // controls conda, docker, + singularity options tag "$meta.id chromosome $meta.chrom" - storeDir ( params.genotypes_cache ? "$params.genotypes_cache/${meta.id}/${params.target_build}/${meta.chrom}" : - "$workDir/genomes/${meta.id}/${params.target_build}/${meta.chrom}/") + storeDir ( params.genotypes_cache ? 
"$params.genotypes_cache/${meta.id}/${meta.build}/${meta.chrom}" : + "$workDir/genomes/${meta.id}/${meta.build}/${meta.chrom}/") conda "${task.ext.conda}" @@ -20,9 +20,9 @@ process PLINK2_RELABELBIM { tuple val(meta), path(geno), path(variants), path(pheno) output: - tuple val(meta), path("*.bed"), emit: geno - tuple val(meta), path("*.zst"), emit: variants - tuple val(meta), path("*.fam"), emit: pheno + tuple val(meta), path("${meta.build}_*.bed"), emit: geno + tuple val(meta), path("${meta.build}_*.zst"), emit: variants + tuple val(meta), path("${meta.build}_*.fam"), emit: pheno tuple val(meta), path("*.vmiss.gz"), emit: vmiss path "versions.yml" , emit: versions @@ -33,7 +33,7 @@ process PLINK2_RELABELBIM { script: def args = task.ext.args ?: '' def compressed = variants.getName().endsWith("zst") ? 'vzs' : '' - def prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}_" : "${meta.id}_" + def prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}" def mem_mb = task.memory.toMega() // plink is greedy // if dropping multiallelic variants, set a generic ID that won't match def set_ma_missing = params.keep_multiallelic ? '' : '--var-id-multi @:#' @@ -48,11 +48,11 @@ process PLINK2_RELABELBIM { $set_ma_missing \\ --bfile ${geno.baseName} $compressed \\ --make-just-bim zs \\ - --out ${params.target_build}_${prefix}${meta.chrom} + --out ${meta.build}_${prefix}_${meta.chrom} # cross platform (mac, linux) method of preserving symlinks - cp -a $geno ${params.target_build}_${prefix}${meta.chrom}.bed - cp -a $pheno ${params.target_build}_${prefix}${meta.chrom}.fam + cp -a $geno ${meta.build}_${prefix}_${meta.chrom}.bed + cp -a $pheno ${meta.build}_${prefix}_${meta.chrom}.fam gzip *.vmiss cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/plink2_relabelpvar.nf b/modules/local/plink2_relabelpvar.nf index 67671808..6de23ec3 100644 --- a/modules/local/plink2_relabelpvar.nf +++ b/modules/local/plink2_relabelpvar.nf @@ -6,8 +6,8 @@ process PLINK2_RELABELPVAR { tag "$meta.id chromosome $meta.chrom" - storeDir ( params.genotypes_cache ? "$params.genotypes_cache/${meta.id}/${params.target_build}/${meta.chrom}" : - "$workDir/genomes/${meta.id}/${params.target_build}/${meta.chrom}/") + storeDir ( params.genotypes_cache ? "$params.genotypes_cache/${meta.id}/${meta.build}/${meta.chrom}" : + "$workDir/genomes/${meta.id}/${meta.build}/${meta.chrom}/") conda "${task.ext.conda}" @@ -21,9 +21,9 @@ process PLINK2_RELABELPVAR { tuple val(meta), path(geno), path(pheno), path(variants) output: - tuple val(meta), path("*.pgen"), emit: geno - tuple val(meta), path("*.zst") , emit: variants - tuple val(meta), path("*.psam"), emit: pheno + tuple val(meta), path("${meta.build}_*.pgen"), emit: geno + tuple val(meta), path("${meta.build}_*.pvar.zst") , emit: variants + tuple val(meta), path("${meta.build}_*.psam"), emit: pheno tuple val(meta), path("*.vmiss.gz"), emit: vmiss path "versions.yml" , emit: versions @@ -34,7 +34,7 @@ process PLINK2_RELABELPVAR { script: def args = task.ext.args ?: '' def compressed = variants.getName().endsWith("zst") ? 'vzs' : '' - def prefix = task.ext.suffix ? "${meta.id}_${task.ext.suffix}_" : "${meta.id}_" + def prefix = task.ext.suffix ? "${meta.id}_${task.ext.suffix}" : "${meta.id}" def mem_mb = task.memory.toMega() // plink is greedy // if dropping multiallelic variants, set a generic ID that won't match def set_ma_missing = params.keep_multiallelic ? 
'' : '--var-id-multi @:#'
@@ -49,11 +49,12 @@
         $set_ma_missing \\
         --pfile ${geno.baseName} $compressed \\
         --make-just-pvar zs \\
-        --out ${params.target_build}_${prefix}${meta.chrom}
+        --out ${meta.build}_${prefix}_${meta.chrom}
 
     # cross platform (mac, linux) method of preserving symlinks
-    cp -a $geno ${params.target_build}_${prefix}${meta.chrom}.pgen
-    cp -a $pheno ${params.target_build}_${prefix}${meta.chrom}.psam
+    cp -a $geno ${meta.build}_${prefix}_${meta.chrom}.pgen
+    cp -a $pheno ${meta.build}_${prefix}_${meta.chrom}.psam
+
     gzip *.vmiss
 
     cat <<-END_VERSIONS > versions.yml
diff --git a/modules/local/plink2_score.nf b/modules/local/plink2_score.nf
index f8b21fc3..2da01f11 100644
--- a/modules/local/plink2_score.nf
+++ b/modules/local/plink2_score.nf
@@ -3,6 +3,7 @@ process PLINK2_SCORE {
     // labels are defined in conf/modules.config
     label 'process_low'
     label 'process_long'
+    label 'error_retry'
     label 'plink2' // controls conda, docker, + singularity options
 
     tag "$meta.id chromosome $meta.chrom effect type $scoremeta.effect_type $scoremeta.n"
@@ -40,22 +41,23 @@ process PLINK2_SCORE {
     // custom args2
     def maxcol = (scoremeta.n_scores.toInteger() + 2) // id + effect allele = 2 cols
-    // if we load allelic frequencies, don't do mean imputation
-    def no_imputation = (ref_afreq.name == 'NO_FILE') ? "no-mean-imputation" : ""
-    // if no-mean-imputation, be more efficient
+    // only skip mean imputation when frequency estimates would be unreliable:
+    // no reference allele frequencies loaded and too few samples (n < 50)
+    def no_imputation = ((ref_afreq.name == 'NO_FILE') && (meta.n_samples.toInteger() < 50)) ? "no-mean-imputation" : ""
     def error_on_freq_calc = (no_imputation == "no-mean-imputation") ? "--error-on-freq-calc" : ""
-    def cols = (meta.n_samples.toInteger() < 50) ? 'header-read cols=+scoresums,+denom,-fid' : 'header-read cols=+scoresums,+denom,-fid'
+    def cols = 'header-read cols=+scoresums,+denom,-fid'
     def recessive = (scoremeta.effect_type == 'recessive') ? ' recessive ' : ''
     def dominant = (scoremeta.effect_type == 'dominant') ? ' dominant ' : ''
     args2 = [args2, cols, 'list-variants', no_imputation, recessive, dominant, error_on_freq_calc].join(' ')
 
+    // speed up the calculation by only considering scoring-file variants for allele frequency calculation (--extract)
     if (scoremeta.n_scores.toInteger() == 1)
         """
         plink2 \
             --threads $task.cpus \
             --memory $mem_mb \
             --seed 31 \
+            --extract $scorefile \
             $load_afreq \
             $args \
             --score $scorefile $args2 \
@@ -73,6 +75,7 @@
             --threads $task.cpus \
             --memory $mem_mb \
             --seed 31 \
+            --extract $scorefile \
             $load_afreq \
             $args \
             --score $scorefile $args2 \
diff --git a/modules/local/plink2_vcf.nf b/modules/local/plink2_vcf.nf
index 10ceb515..d13201d4 100644
--- a/modules/local/plink2_vcf.nf
+++ b/modules/local/plink2_vcf.nf
@@ -6,8 +6,8 @@ process PLINK2_VCF {
 
     tag "$meta.id chromosome $meta.chrom"
 
-    storeDir ( params.genotypes_cache ? "$params.genotypes_cache/${meta.id}/${params.target_build}/${meta.chrom}" :
-               "$workDir/genomes/${meta.id}/${params.target_build}/${meta.chrom}/")
+    storeDir ( params.genotypes_cache ?
"$params.genotypes_cache/${meta.id}/${meta.build}/${meta.chrom}" : + "$workDir/genomes/${meta.id}/${meta.build}/${meta.chrom}/") conda "${task.ext.conda}" @@ -20,19 +20,20 @@ process PLINK2_VCF { tuple val(meta), path(vcf) output: - tuple val(newmeta), path("*.pgen"), emit: pgen - tuple val(newmeta), path("*.psam"), emit: psam - tuple val(newmeta), path("*.zst") , emit: pvar - tuple val(meta), path("*.vmiss.gz"), emit: vmiss + tuple val(newmeta), path("${meta.build}_*.pgen"), emit: pgen + tuple val(newmeta), path("${meta.build}_*.psam"), emit: psam + tuple val(newmeta), path("${meta.build}_*.zst") , emit: pvar + tuple val(newmeta), path("${meta.build}_*.vmiss.gz"), emit: vmiss path "versions.yml" , emit: versions script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}_" + def prefix = task.ext.suffix ? "${meta.id}_${task.ext.suffix}" : "${meta.id}" def mem_mb = task.memory.toMega() def dosage_options = meta.vcf_import_dosage ? 'dosage=DS' : '' // rewriting genotypes, so use --max-alleles instead of using generic ID def set_ma_missing = params.keep_multiallelic ? '' : '--max-alleles 2' + def chrom_filter = meta.chrom == "ALL" ? "--chr 1-22, X, Y, XY" : "--chr ${meta.chrom}" // filter to canonical/stated chromosome newmeta = meta.clone() // copy hashmap for updating... newmeta.is_pfile = true // now it's converted to a pfile :) @@ -45,8 +46,9 @@ process PLINK2_VCF { --missing vcols=fmissdosage,fmiss \\ $args \\ --vcf $vcf $dosage_options \\ + --allow-extra-chr $chrom_filter \\ --make-pgen vzs \\ - --out ${params.target_build}_${prefix}${meta.chrom} + --out ${meta.build}_${prefix}_${meta.chrom}_vcf gzip *.vmiss diff --git a/nextflow.config b/nextflow.config index 279c6779..5892f786 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,7 +22,7 @@ params { efo_direct = false // reference params - run_ancestry = null // path to reference database + run_ancestry = null // path to reference database TODO: replace with NO_FILE ancestry_checksums = "$projectDir/assets/ancestry/checksums.txt" // if you want to liftover --scorefiles, set the chain files hg19_chain = null // "https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz" @@ -37,7 +37,7 @@ params { ld_grch38 = "$projectDir/assets/ancestry/high-LD-regions-hg38-GRCh38.txt" // ancestry params - ancestry_params_file = null + ref_format_version = "v0.1" ref_samplesheet = "$projectDir/assets/ancestry/reference.csv" projection_method = "oadp" ancestry_method = "RandomForest" @@ -47,7 +47,6 @@ params { n_normalization = 4 // compatibility params - compat_params_file = null liftover = false target_build = null min_lift = 0.95 @@ -80,7 +79,7 @@ params { help = false validate_params = true show_hidden_params = false - schema_ignore_params = 'only_bootstrap,only_input,only_compatible,only_match,only_projection,only_score,skip_ancestry,igenomes_ignore' + schema_ignore_params = 'only_bootstrap,only_input,only_compatible,only_match,only_projection,only_score,skip_ancestry' // Max resource options // Defaults only, expecting to be overwritten diff --git a/subworkflows/local/ancestry/ancestry_project.nf b/subworkflows/local/ancestry/ancestry_project.nf index 8fcae1fc..4420ea2e 100644 --- a/subworkflows/local/ancestry/ancestry_project.nf +++ b/subworkflows/local/ancestry/ancestry_project.nf @@ -71,7 +71,7 @@ workflow ANCESTRY_PROJECT { // ch_genomes - .join(vmiss) + .join(vmiss, failOnMismatch: true) .combine( ch_db.map{ it.tail() } ) // (drop hashmap) .flatten() .buffer(size: 8) diff --git 
a/subworkflows/local/ancestry/bootstrap_ancestry.nf b/subworkflows/local/ancestry/bootstrap_ancestry.nf
index 49fd9a07..2dfdbc4a 100644
--- a/subworkflows/local/ancestry/bootstrap_ancestry.nf
+++ b/subworkflows/local/ancestry/bootstrap_ancestry.nf
@@ -2,7 +2,7 @@
 // Create a database containing reference data required for ancestry inference
 //
 include { SETUP_RESOURCE } from '../../../modules/local/ancestry/bootstrap/setup_resource'
-include { PLINK2_RELABELPVAR } from '../../../modules/local/plink2_relabelpvar'
+include { PLINK2_RELABELPVAR as BOOTSTRAP_RELABEL } from '../../../modules/local/plink2_relabelpvar'
 include { MAKE_DATABASE } from '../../../modules/local/ancestry/bootstrap/make_database'
 
 workflow BOOTSTRAP_ANCESTRY {
@@ -33,11 +33,11 @@ workflow BOOTSTRAP_ANCESTRY {
 
     SETUP_RESOURCE.out.plink.dump( tag: 'ref_setup' )
 
-    PLINK2_RELABELPVAR( SETUP_RESOURCE.out.plink )
-    ch_versions = ch_versions.mix(PLINK2_RELABELPVAR.out.versions.first())
+    BOOTSTRAP_RELABEL( SETUP_RESOURCE.out.plink )
+    ch_versions = ch_versions.mix(BOOTSTRAP_RELABEL.out.versions.first())
 
-    PLINK2_RELABELPVAR.out.geno
-        .concat(PLINK2_RELABELPVAR.out.pheno, PLINK2_RELABELPVAR.out.variants)
+    BOOTSTRAP_RELABEL.out.geno
+        .concat(BOOTSTRAP_RELABEL.out.pheno, BOOTSTRAP_RELABEL.out.variants)
         .dump(tag: 'ancestry_relabelled')
         .set { relabelled }
 
@@ -47,12 +47,14 @@ workflow BOOTSTRAP_ANCESTRY {
         .groupTuple(size: 3)
         .dump(tag: 'ancestry_relabelled_grouped')
         .map { drop_meta_keys(it).flatten() }
-        .set{ relabelled_flat }
+        .set{ relabelled_flat }
 
-    ref.king
-        .map { drop_meta_keys(it) }
-        // dropping meta keys simplifies the join
-        .join( relabelled_flat )
+    ref.king.branch {
+        GRCh37: it[0].build == "GRCh37"
+        GRCh38: it[0].build == "GRCh38"
+    }.set { ch_king }
+
+    relabelled_flat
         .flatten()
         .filter(Path)
         .collect()
@@ -62,7 +64,7 @@ workflow BOOTSTRAP_ANCESTRY {
     Channel.fromPath(params.ancestry_checksums, checkIfExists: true)
         .set { ch_checksums }
 
-    MAKE_DATABASE( ch_raw_ref, ch_checksums )
+    MAKE_DATABASE( ch_raw_ref, ch_king.GRCh37, ch_king.GRCh38, ch_checksums )
     ch_versions = ch_versions.mix(MAKE_DATABASE.out.versions)
 
     emit:
diff --git a/tests/ancestry/samplesheet_vcf.csv b/tests/ancestry/samplesheet_vcf.csv
new file mode 100644
index 00000000..19f13eb0
--- /dev/null
+++ b/tests/ancestry/samplesheet_vcf.csv
@@ -0,0 +1,2 @@
+sampleset,path_prefix,chrom,format
+test,ANCESTRY_TARGET_DIR/GRCh38_HAPNEST_TARGET_ALL,,vcf
\ No newline at end of file
diff --git a/tests/ancestry/test.yml b/tests/ancestry/test_ancestry.yml
similarity index 81%
rename from tests/ancestry/test.yml
rename to tests/ancestry/test_ancestry.yml
index b31c2e81..5cf5370e 100644
--- a/tests/ancestry/test.yml
+++ b/tests/ancestry/test_ancestry.yml
@@ -27,4 +27,11 @@
       - "AFR,100 (33.33%)"
       - "EAS,100 (33.33%)"
       - "EUR,100 (33.33%)"
+    - path: output/plink2/test_ALL_additive_0.log
+      contains:
+        - "--read-freq"
+        - "--extract"
+      must_not_contain:
+        - "no-mean-imputation"
+        - "error-on-freq-calc"
 
diff --git a/tests/ancestry/test_ancestry_vcf.yml b/tests/ancestry/test_ancestry_vcf.yml
new file mode 100644
index 00000000..991fcbfb
--- /dev/null
+++ b/tests/ancestry/test_ancestry_vcf.yml
@@ -0,0 +1,38 @@
+# ancestry test notes:
+# need to stage reference in $ANCESTRY_REF_DIR
+# extract target in $ANCESTRY_TARGET_DIR
+# need to convert target to VCF
+
+- name: test ancestry projection and scoring with VCF input
+  command: >
+    bash -c "
+      sed \"s|ANCESTRY_TARGET_DIR|$ANCESTRY_TARGET_DIR|\" tests/ancestry/samplesheet_vcf.csv > samplesheet.csv;
+      nextflow run main.nf -c ./tests/config/nextflow.config \
+        --input samplesheet.csv \
+        --run_ancestry $ANCESTRY_REF_DIR/GRCh38_HAPNEST_reference.tar.zst \
+        --target_build GRCh38 \
+        --pgs_id PGS001229 \
+        --min_overlap 0.50 \
+        --scorefile false
+    "
+  tags:
+    - ancestry vcf
+    - slow
+  stdout:
+    contains:
+      - "Pipeline completed successfully"
+  files:
+    - path: "output/test/score/pop_summary.csv"
+      contains:
+        - "AFR,100 (33.33%)"
+        - "EAS,100 (33.33%)"
+        - "EUR,100 (33.33%)"
+    - path: output/plink2/test_ALL_additive_0.log
+      contains:
+        - "--read-freq"
+        - "--extract"
+      must_not_contain:
+        - "no-mean-imputation"
+        - "error-on-freq-calc"
+
diff --git a/tests/config/nextflow.config b/tests/config/nextflow.config
index 3c530d3c..bfb8aeb7 100644
--- a/tests/config/nextflow.config
+++ b/tests/config/nextflow.config
@@ -25,13 +25,16 @@ process {
 }
 
 def platform = "$PROFILE" == 'arm' ? '--platform linux/arm64' : '--platform linux/amd64'
-def mount_home = "-v $HOME:$HOME"
+def mount_home = "-v $HOME:$HOME -v /private/var/folders/"
 
 if ("$PROFILE" == "singularity") {
     singularity.enabled = true
     singularity.autoMounts = true
 } else if ("$PROFILE" == "conda") {
     conda.enabled = true
+} else if ("$PROFILE" == "mamba") {
+    conda.enabled = true
+    conda.useMamba = true
 } else if ("$PROFILE" == "arm") {
     docker.enabled = true
     docker.userEmulation = false
diff --git a/tests/modules/combine/test.yml b/tests/modules/combine/test.yml
index abd151cf..9f5d708d 100644
--- a/tests/modules/combine/test.yml
+++ b/tests/modules/combine/test.yml
@@ -15,6 +15,6 @@
       - "effect_type"
   - path: output/combine/versions.yml
     contains:
-      - "pgscatalog_utils: 0.4.1"
+      - "pgscatalog_utils: 0.4.2"
 
diff --git a/tests/modules/download/test.yml b/tests/modules/download/test.yml
index 79ced86c..e6a341fe 100644
--- a/tests/modules/download/test.yml
+++ b/tests/modules/download/test.yml
@@ -8,7 +8,7 @@
     - path: output/download/PGS000001_hmPOS_GRCh37.txt.gz
     - path: output/download/versions.yml
       contains:
-        - "pgscatalog_utils: 0.4.1"
+        - "pgscatalog_utils: 0.4.2"
 
 - name: pgscatalog test --efo_trait --pgp_id and --pgs_id
   command: nextflow run ./tests/modules/download -entry testmultipleaccessions -c ./tests/config/nextflow.config
@@ -24,7 +24,7 @@
     - path: output/download/PGS002054_hmPOS_GRCh37.txt.gz
     - path: output/download/versions.yml
       contains:
-        - "pgscatalog_utils: 0.4.1"
+        - "pgscatalog_utils: 0.4.2"
 
 - name: pgscatalog test bad accession
   command: nextflow run ./tests/modules/download -entry testbadaccession -c ./tests/config/nextflow.config
@@ -44,4 +44,4 @@
     - path: output/download/PGS000001_hmPOS_GRCh38.txt.gz
     - path: output/download/versions.yml
       contains:
-        - "pgscatalog_utils: 0.4.1"
+        - "pgscatalog_utils: 0.4.2"
diff --git a/tests/modules/match/test.yml b/tests/modules/match/test.yml
index 689353ff..191d190e 100644
--- a/tests/modules/match/test.yml
+++ b/tests/modules/match/test.yml
@@ -8,7 +8,7 @@
   files:
     - path: output/test/match/versions.yml
       contains:
-        - "pgscatalog_utils: 0.4.1"
+        - "pgscatalog_utils: 0.4.2"
 
 - name: test match combine module
   command: nextflow run ./tests/modules/match -entry testmatchcombine -c ./tests/config/nextflow.config
@@ -20,7 +20,7 @@
   files:
     - path: output/combine/versions.yml
       contains:
-        - "pgscatalog_utils: 0.4.1"
+        - "pgscatalog_utils: 0.4.2"
     - path: output/combine/scorefiles.txt.gz
       contains:
         - "effect_allele"
diff --git a/tests/modules/plink2/relabelbim/main.nf b/tests/modules/plink2/relabelbim/main.nf
index 4ae36f09..9d530119 100644
--- a/tests/modules/plink2/relabelbim/main.nf
+++
b/tests/modules/plink2/relabelbim/main.nf @@ -8,7 +8,7 @@ workflow testrelabelbim { bim = file('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim') bed = file('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bed') fam = file('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.fam') - def meta = [id: 'test', is_bfile: true] + def meta = [id: 'test', build: 'GRCh37', is_bfile: true, chrom: 22] PLINK2_RELABELBIM( Channel.of([meta, bed, bim, fam]) ) } diff --git a/tests/modules/plink2/relabelbim/test.yml b/tests/modules/plink2/relabelbim/test.yml index cd6bbab4..6338f261 100644 --- a/tests/modules/plink2/relabelbim/test.yml +++ b/tests/modules/plink2/relabelbim/test.yml @@ -5,10 +5,10 @@ - fast - module files: - - path: output/plink2/GRCh37_test_null.bed + - path: output/plink2/GRCh37_test_22.bed md5sum: a8be76ae3301d395563784fcbd571ae2 - - path: output/plink2/GRCh37_test_null.bim.zst - - path: output/plink2/GRCh37_test_null.fam + - path: output/plink2/GRCh37_test_22.bim.zst + - path: output/plink2/GRCh37_test_22.fam md5sum: 8915d48959a21e827d1db1b192422ba1 - path: output/plink2/versions.yml contains: diff --git a/tests/modules/plink2/relabelpvar/main.nf b/tests/modules/plink2/relabelpvar/main.nf index e460bafd..8cfa13b8 100644 --- a/tests/modules/plink2/relabelpvar/main.nf +++ b/tests/modules/plink2/relabelpvar/main.nf @@ -7,7 +7,7 @@ include { PLINK2_RELABELPVAR } from '../../../../modules/local/plink2_relabelpva workflow testrelabelpvar { vcf = file('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.vcf.gz') - def meta = [id: 'test', chrom: 22] + def meta = [id: 'test', 'build': 'GRCh37', chrom: 22] PLINK2_VCF(Channel.of([meta, vcf])) diff --git a/tests/modules/plink2/relabelpvar/test.yml b/tests/modules/plink2/relabelpvar/test.yml index a13895cc..0b8341b0 100644 --- a/tests/modules/plink2/relabelpvar/test.yml +++ b/tests/modules/plink2/relabelpvar/test.yml @@ -5,11 +5,11 @@ - fast - module files: - - path: output/plink2/GRCh37_test_22.psam + - path: output/plink2/GRCh37_test_22_vcf.psam md5sum: 90f1430b71153d59bc14e9499b0366f4 - - path: output/plink2/GRCh37_test_22.pgen + - path: output/plink2/GRCh37_test_22_vcf.pgen md5sum: be32a51a5509111327a5deb6a3610b2d - - path: output/plink2/GRCh37_test_22.pvar.zst + - path: output/plink2/GRCh37_test_22_vcf.pvar.zst - path: output/plink2/versions.yml contains: - "plink2: 2.00a3.3" diff --git a/tests/modules/plink2/score/test.yml b/tests/modules/plink2/score/test.yml index 4dddd189..b7098e10 100644 --- a/tests/modules/plink2/score/test.yml +++ b/tests/modules/plink2/score/test.yml @@ -11,6 +11,8 @@ - "PGS001229_22_SUM" - path: output/plink2/test_null_null_null.log contains: + - "--extract" + must_not_contain: - "no-mean-imputation" - "error-on-freq-calc" - path: output/plink2/versions.yml @@ -32,6 +34,7 @@ contains: - "no-mean-imputation" - "error-on-freq-calc" + - "--extract" - path: output/plink2/versions.yml contains: - "plink2: 2.00a3.3" @@ -50,9 +53,11 @@ - "second_score_SUM" - path: output/plink2/test_null_null_null.log contains: + - "--score-col-nums" + - "--extract" + must_not_contain: - "error-on-freq-calc" - "no-mean-imputation" - - "--score-col-nums" - path: output/plink2/versions.yml contains: - "plink2: 2.00a3.3" @@ -74,6 +79,7 @@ - "no-mean-imputation" - "error-on-freq-calc" - "--score-col-nums" + - "--extract" - path: output/plink2/versions.yml contains: 
- "plink2: 2.00a3.3" diff --git a/tests/modules/plink2/vcf/main.nf b/tests/modules/plink2/vcf/main.nf index 612715a6..b078074f 100644 --- a/tests/modules/plink2/vcf/main.nf +++ b/tests/modules/plink2/vcf/main.nf @@ -6,7 +6,7 @@ include { PLINK2_VCF } from '../../../../modules/local/plink2_vcf' workflow testvcf { vcf = file('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.vcf.gz') - def meta = [id: 'test', is_vcf: true] + def meta = [id: 'test', is_vcf: true, build: 'GRCh37', chrom: '22'] PLINK2_VCF(Channel.of([meta, vcf])) diff --git a/tests/modules/plink2/vcf/test.yml b/tests/modules/plink2/vcf/test.yml index 7174a174..f9440e26 100644 --- a/tests/modules/plink2/vcf/test.yml +++ b/tests/modules/plink2/vcf/test.yml @@ -5,9 +5,9 @@ - plink2 - fast files: - - path: output/plink2/GRCh37_vcf_null.pgen - - path: output/plink2/GRCh37_vcf_null.psam - - path: output/plink2/GRCh37_vcf_null.pvar.zst + - path: output/plink2/GRCh37_test_22_vcf.pgen + - path: output/plink2/GRCh37_test_22_vcf.psam + - path: output/plink2/GRCh37_test_22_vcf.pvar.zst - path: output/plink2/versions.yml contains: - "plink2: 2.00a3.3" diff --git a/tests/subworkflows/test_liftover_run.yml b/tests/subworkflows/test_liftover_run.yml index ddf0d2ef..9c1a8f3c 100644 --- a/tests/subworkflows/test_liftover_run.yml +++ b/tests/subworkflows/test_liftover_run.yml @@ -9,7 +9,7 @@ - path: output/combine/scorefiles.txt.gz - path: output/combine/versions.yml contains: - - "pgscatalog_utils: 0.4.1" + - "pgscatalog_utils: 0.4.2" - name: test input check subworkflow with liftover 37to38 command: nextflow run main.nf --only_input --pgs_id PGS001229 --liftover --target_build GRCh38 -c ./tests/config/nextflow.config --hg19_chain https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz --hg38_chain https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz @@ -22,4 +22,4 @@ - path: output/combine/scorefiles.txt.gz - path: output/combine/versions.yml contains: - - "pgscatalog_utils: 0.4.1" + - "pgscatalog_utils: 0.4.2" diff --git a/tests/subworkflows/test_make_compatible.yml b/tests/subworkflows/test_make_compatible.yml index caa68411..11499f92 100644 --- a/tests/subworkflows/test_make_compatible.yml +++ b/tests/subworkflows/test_make_compatible.yml @@ -21,10 +21,10 @@ files: - path: output/samplesheet/out.json - path: output/combine/scorefiles.txt.gz - - path: output/plink2/GRCh37_vcf_22.pgen - - path: output/plink2/GRCh37_vcf_22.pvar.zst - - path: output/plink2/GRCh37_vcf_22.psam - - path: output/plink2/GRCh37_vcf_22.vmiss.gz + - path: output/plink2/GRCh37_cineca_22_vcf.pgen + - path: output/plink2/GRCh37_cineca_22_vcf.pvar.zst + - path: output/plink2/GRCh37_cineca_22_vcf.psam + - path: output/plink2/GRCh37_cineca_22_vcf.vmiss.gz - name: test make compatible subworkflow with pfile command: nextflow run main.nf --only_compatible -c ./tests/config/nextflow.config diff --git a/workflows/pgscalc.nf b/workflows/pgscalc.nf index 60304a07..6c7242aa 100644 --- a/workflows/pgscalc.nf +++ b/workflows/pgscalc.nf @@ -128,7 +128,7 @@ if (params.only_projection) { run_ancestry_bootstrap = true run_input_check = true run_make_compatible = true - run_match = false + run_match = true run_ancestry_assign = true run_apply_score = false run_report = false