From 4c83c94a1fcd7568b38fbf215b90ab94a294fe42 Mon Sep 17 00:00:00 2001 From: Shadi Date: Thu, 4 Apr 2024 17:08:02 -0400 Subject: [PATCH] `magenpy==0.1.0` release --- .github/workflows/ci-docs.yml | 35 + .github/workflows/ci-linux.yml | 39 + .github/workflows/ci-osx.yml | 39 + .github/workflows/ci-windows.yml | 39 + .github/workflows/wheels.yml | 71 + .gitignore | 5 + CHANGELOG.md | 31 + CITATION.md | 21 + LICENSE | 2 +- MANIFEST.in | 3 +- Makefile | 29 +- README.md | 728 +-------- bin/magenpy_ld | 97 +- bin/magenpy_simulate | 111 +- docs/api/AnnotationMatrix.md | 1 + docs/api/GWADataLoader.md | 1 + docs/api/GenotypeMatrix.md | 1 + docs/api/LDMatrix.md | 1 + docs/api/SampleTable.md | 1 + docs/api/SumstatsTable.md | 1 + docs/api/overview.md | 45 + docs/api/parsers/annotation_parsers.md | 1 + docs/api/parsers/misc_parsers.md | 1 + docs/api/parsers/plink_parsers.md | 1 + docs/api/parsers/sumstats_parsers.md | 1 + docs/api/plot/gwa.md | 1 + docs/api/plot/ld.md | 1 + .../simulation/AnnotatedPhenotypeSimulator.md | 1 + .../MultiCohortPhenotypeSimulator.md | 1 + docs/api/simulation/PhenotypeSimulator.md | 1 + docs/api/stats/gwa/utils.md | 1 + docs/api/stats/h2/ldsc.md | 1 + docs/api/stats/ld/estimator.md | 1 + docs/api/stats/ld/utils.md | 1 + docs/api/stats/score/utils.md | 1 + docs/api/stats/transforms/genotype.md | 1 + docs/api/stats/transforms/phenotype.md | 1 + docs/api/stats/variant/utils.md | 1 + docs/api/utils/compute_utils.md | 1 + docs/api/utils/data_utils.md | 1 + docs/api/utils/executors.md | 1 + docs/api/utils/model_utils.md | 1 + docs/api/utils/system_utils.md | 1 + docs/citation.md | 21 + docs/commandline/magenpy_ld.md | 82 + docs/commandline/magenpy_simulate.md | 68 + docs/commandline/overview.md | 14 + docs/faq.md | 0 docs/features.md | 385 +++++ docs/getting_started.md | 121 ++ docs/index.md | 42 + docs/installation.md | 60 + docs/tutorials/overview.md | 0 examples/simulate_phenotype.sh | 12 +- magenpy/AnnotationMatrix.py | 77 +- magenpy/GWADataLoader.py | 235 ++- magenpy/GenotypeMatrix.py | 487 +++++- magenpy/LDMatrix.py | 1454 +++++++++++++++++ magenpy/LDMatrix.pyx | 658 -------- magenpy/SampleTable.py | 154 +- magenpy/SumstatsTable.py | 335 +++- magenpy/__init__.py | 17 +- magenpy/parsers/annotation_parsers.py | 19 +- magenpy/parsers/misc_parsers.py | 21 +- magenpy/parsers/plink_parsers.py | 15 +- magenpy/parsers/sumstats_parsers.py | 292 +++- magenpy/plot/__init__.py | 0 magenpy/{plot.py => plot/gwa.py} | 87 +- magenpy/plot/ld.py | 41 + ...ator.py => AnnotatedPhenotypeSimulator.py} | 26 +- ...or.py => MultiCohortPhenotypeSimulator.py} | 25 +- ...{GWASimulator.py => PhenotypeSimulator.py} | 129 +- magenpy/stats/gwa/utils.py | 243 ++- magenpy/stats/h2/ldsc.py | 50 +- magenpy/stats/ld/c_utils.pyx | 231 ++- magenpy/stats/ld/estimator.py | 306 +++- magenpy/stats/ld/utils.py | 925 +++-------- magenpy/stats/score/score.hpp | 267 +++ magenpy/stats/score/score_cpp.pyx | 63 + magenpy/stats/score/utils.py | 31 +- magenpy/stats/transforms/genotype.py | 6 +- magenpy/stats/transforms/phenotype.py | 65 +- magenpy/stats/variant/utils.py | 25 +- magenpy/utils/compute_utils.py | 15 +- magenpy/utils/data_utils.py | 6 +- magenpy/utils/executors.py | 58 +- magenpy/utils/model_utils.py | 113 +- magenpy/utils/system_utils.py | 51 +- mkdocs.yml | 102 ++ pyproject.toml | 6 + requirements-docs.txt | 4 + requirements-optional.txt | 2 +- requirements-test.txt | 1 + requirements.txt | 5 +- setup.py | 30 +- tests/conda_manual_testing.sh | 48 + tests/test_gdl.py | 77 + tests/test_ld.py | 115 ++
tests/test_simulation.py | 44 + 99 files changed, 6352 insertions(+), 2737 deletions(-) create mode 100644 .github/workflows/ci-docs.yml create mode 100644 .github/workflows/ci-linux.yml create mode 100644 .github/workflows/ci-osx.yml create mode 100644 .github/workflows/ci-windows.yml create mode 100644 .github/workflows/wheels.yml create mode 100644 CITATION.md create mode 100644 docs/api/AnnotationMatrix.md create mode 100644 docs/api/GWADataLoader.md create mode 100644 docs/api/GenotypeMatrix.md create mode 100644 docs/api/LDMatrix.md create mode 100644 docs/api/SampleTable.md create mode 100644 docs/api/SumstatsTable.md create mode 100644 docs/api/overview.md create mode 100644 docs/api/parsers/annotation_parsers.md create mode 100644 docs/api/parsers/misc_parsers.md create mode 100644 docs/api/parsers/plink_parsers.md create mode 100644 docs/api/parsers/sumstats_parsers.md create mode 100644 docs/api/plot/gwa.md create mode 100644 docs/api/plot/ld.md create mode 100644 docs/api/simulation/AnnotatedPhenotypeSimulator.md create mode 100644 docs/api/simulation/MultiCohortPhenotypeSimulator.md create mode 100644 docs/api/simulation/PhenotypeSimulator.md create mode 100644 docs/api/stats/gwa/utils.md create mode 100644 docs/api/stats/h2/ldsc.md create mode 100644 docs/api/stats/ld/estimator.md create mode 100644 docs/api/stats/ld/utils.md create mode 100644 docs/api/stats/score/utils.md create mode 100644 docs/api/stats/transforms/genotype.md create mode 100644 docs/api/stats/transforms/phenotype.md create mode 100644 docs/api/stats/variant/utils.md create mode 100644 docs/api/utils/compute_utils.md create mode 100644 docs/api/utils/data_utils.md create mode 100644 docs/api/utils/executors.md create mode 100644 docs/api/utils/model_utils.md create mode 100644 docs/api/utils/system_utils.md create mode 100644 docs/citation.md create mode 100644 docs/commandline/magenpy_ld.md create mode 100644 docs/commandline/magenpy_simulate.md create mode 100644 docs/commandline/overview.md create mode 100644 docs/faq.md create mode 100644 docs/features.md create mode 100644 docs/getting_started.md create mode 100644 docs/index.md create mode 100644 docs/installation.md create mode 100644 docs/tutorials/overview.md create mode 100644 magenpy/LDMatrix.py delete mode 100644 magenpy/LDMatrix.pyx create mode 100644 magenpy/plot/__init__.py rename magenpy/{plot.py => plot/gwa.py} (59%) create mode 100644 magenpy/plot/ld.py rename magenpy/simulation/{AnnotatedGWASimulator.py => AnnotatedPhenotypeSimulator.py} (82%) rename magenpy/simulation/{MulticohortGWASimulator.py => MultiCohortPhenotypeSimulator.py} (88%) rename magenpy/simulation/{GWASimulator.py => PhenotypeSimulator.py} (67%) create mode 100644 magenpy/stats/score/score.hpp create mode 100644 magenpy/stats/score/score_cpp.pyx create mode 100644 mkdocs.yml create mode 100644 requirements-docs.txt create mode 100644 requirements-test.txt create mode 100644 tests/conda_manual_testing.sh create mode 100644 tests/test_gdl.py create mode 100644 tests/test_ld.py create mode 100644 tests/test_simulation.py diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml new file mode 100644 index 0000000..aee4e51 --- /dev/null +++ b/.github/workflows/ci-docs.yml @@ -0,0 +1,35 @@ +name: ci Docs +on: + push: + branches: + - master + - main +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + steps: + + - uses: actions/checkout@v4 + + - name: Configure Git Credentials + run: | + git config user.name github-actions[bot] + git 
config user.email 41898282+github-actions[bot]@users.noreply.github.com + + - uses: actions/setup-python@v5 + with: + python-version: 3.x + + - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + + - uses: actions/cache@v4 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + + - run: python -m pip install -r requirements-docs.txt + - run: mkdocs gh-deploy --force \ No newline at end of file diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml new file mode 100644 index 0000000..fa7cfcc --- /dev/null +++ b/.github/workflows/ci-linux.yml @@ -0,0 +1,39 @@ +name: magenpy Linux-CI + +on: [push, pull_request] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up miniconda + uses: conda-incubator/setup-miniconda@v3.0.1 + with: + channels: conda-forge + python-version: ${{ matrix.python-version }} + + - name: Set up Conda environment + shell: "bash -l {0}" + run: > + conda create --name "magenpy_ci" -c conda-forge -c anaconda + python=${{matrix.python-version}} pip wheel compilers openblas -y + + - name: Install magenpy + shell: "bash -l {0}" + run: | + conda activate magenpy_ci + python -m pip install -v -e .[test] + + - name: Run tests + shell: "bash -l {0}" + run: | + conda activate magenpy_ci + pytest -v \ No newline at end of file diff --git a/.github/workflows/ci-osx.yml b/.github/workflows/ci-osx.yml new file mode 100644 index 0000000..4171572 --- /dev/null +++ b/.github/workflows/ci-osx.yml @@ -0,0 +1,39 @@ +name: magenpy OSX-CI + +on: [push, pull_request] + +jobs: + build: + + runs-on: macos-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up miniconda + uses: conda-incubator/setup-miniconda@v3.0.1 + with: + channels: conda-forge + python-version: ${{ matrix.python-version }} + + - name: Set up Conda environment + shell: "bash -l {0}" + run: > + conda create --name "magenpy_ci" -c conda-forge -c anaconda + python=${{matrix.python-version}} pip wheel compilers openblas -y + + - name: Install magenpy + shell: "bash -l {0}" + run: | + conda activate magenpy_ci + python -m pip install -v -e .[test] + + - name: Run tests + shell: "bash -l {0}" + run: | + conda activate magenpy_ci + pytest -v \ No newline at end of file diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml new file mode 100644 index 0000000..a160509 --- /dev/null +++ b/.github/workflows/ci-windows.yml @@ -0,0 +1,39 @@ +name: magenpy Windows-CI + +on: [push, pull_request] + +jobs: + build: + + runs-on: windows-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up miniconda + uses: conda-incubator/setup-miniconda@v3.0.1 + with: + channels: conda-forge + python-version: ${{ matrix.python-version }} + + - name: Set up Conda environment + shell: "bash -l {0}" + run: > + conda create --name "magenpy_ci" -c conda-forge -c anaconda + python=${{matrix.python-version}} pip wheel compilers openblas -y + + - name: Install magenpy + shell: "bash -l {0}" + run: | + conda activate magenpy_ci + python -m pip install -v -e .[test] + + - name: Run tests + shell: "bash -l {0}" + run: | + conda activate magenpy_ci + pytest -v \ No newline at end of file diff --git 
a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml new file mode 100644 index 0000000..3eac899 --- /dev/null +++ b/.github/workflows/wheels.yml @@ -0,0 +1,71 @@ +name: Build magenpy and upload to PyPI + +on: + workflow_dispatch: + pull_request: + push: + branches: + - main + release: + types: + - published + +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + # macos-13 is an intel runner, macos-14 is apple silicon + os: [ubuntu-latest, windows-latest, macos-13, macos-14] + + steps: + - uses: actions/checkout@v4 + + - name: Build wheels + uses: pypa/cibuildwheel@v2.17.0 + + - uses: actions/upload-artifact@v4 + with: + name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} + path: ./wheelhouse/*.whl + + build_sdist: + name: Build source distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Build sdist + run: pipx run build --sdist + + - name: Test install + run: pip install dist/magenpy*.tar.gz + + - uses: actions/upload-artifact@v4 + with: + name: cibw-sdist + path: dist/*.tar.gz + + upload_pypi: + needs: [build_wheels, build_sdist] + runs-on: ubuntu-latest + environment: pypi + permissions: + id-token: write + if: github.event_name == 'release' && github.event.action == 'published' + # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this) + # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + steps: + - uses: actions/download-artifact@v4 + with: + # unpacks all CIBW artifacts into dist/ + pattern: cibw-* + path: dist + merge-multiple: true + + - uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} + # To test: repository-url: https://test.pypi.org/legacy/ \ No newline at end of file diff --git a/.gitignore b/.gitignore index 9c2c1c4..d1b674d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ # C extensions *.c +*.cpp *.so # Distribution / packaging @@ -18,6 +19,7 @@ lib/ lib64/ parts/ sdist/ +site/ var/ *.egg-info/ .installed.cfg @@ -34,6 +36,9 @@ pip-delete-this-directory.txt # custom .idea/ +.vscode/ +.tox/ +.pytest_cache/ *.html *.zarr *.npz diff --git a/CHANGELOG.md b/CHANGELOG.md index 4158490..b79f9ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,37 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.0] - 2024-04-04 + +A large-scale restructuring of the code base to improve efficiency and usability. + +### Changed + +- Bug fixes across the entire code base. +- Simulator classes have been renamed from `GWASimulator` to `PhenotypeSimulator`. +- Moved plotting code to its own module. +- Updated some method names / commandline flags to be consistent throughout. + +### Added + +- Basic integration testing with `pytest` and GitHub workflows. +- Documentation for the entire package using `mkdocs`. +- Automated building and publishing of wheels and source distributions with GitHub workflows. +- New implementation of the LD matrix that uses CSR matrix data structures. + - Quantization / float precision specification when storing LD matrices. + - Allow user to specify compressor / compression options for Zarr storage. +- New implementation of `magenpy_simulate` script. + - Allow users to set random seed.
+ - Now accept `--prop-causal` instead of specifying full mixing proportions. +- Tried to incorporate `genome_build` into various data structures. This will be useful in the +future to ensure consistent genome builds across different data types. +- Allow user to pass various metadata to `magenpy_ld` to save information about dataset +characteristics. +- New sumstats parsers: + - SAIGE sumstats format. + - plink1.9 sumstats format. + - GWAS Catalog sumstats format. +- Chained transform function for transforming phenotypes. ## [0.0.12] - 2023-02-12 diff --git a/CITATION.md b/CITATION.md new file mode 100644 index 0000000..4b82e4c --- /dev/null +++ b/CITATION.md @@ -0,0 +1,21 @@ +If you use `magenpy` in your research, please cite the following paper(s): + +> Zabad, S., Gravel, S., & Li, Y. (2023). **Fast and accurate Bayesian polygenic risk modeling with variational inference.** +The American Journal of Human Genetics, 110(5), 741–761. https://doi.org/10.1016/j.ajhg.2023.03.009 + +## BibTeX records + +```bibtex +@article{ZABAD2023741, + title = {Fast and accurate Bayesian polygenic risk modeling with variational inference}, + journal = {The American Journal of Human Genetics}, + volume = {110}, + number = {5}, + pages = {741-761}, + year = {2023}, + issn = {0002-9297}, + doi = {https://doi.org/10.1016/j.ajhg.2023.03.009}, + url = {https://www.sciencedirect.com/science/article/pii/S0002929723000939}, + author = {Shadi Zabad and Simon Gravel and Yue Li} +} +``` diff --git a/LICENSE b/LICENSE index 33c23aa..d049465 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2022 Shadi Zabad, McGill University +Copyright (c) 2024 Shadi Zabad, McGill University. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/MANIFEST.in b/MANIFEST.in index 6fd080c..947a8b4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,12 +2,13 @@ include MANIFEST.in include requirements*.txt include magenpy/config/*.ini include LICENSE -include README.md +include *.md include setup.py graft magenpy global-exclude *.c +global-exclude *.cpp global-exclude *.so global-exclude *.pyd global-exclude *.pyc diff --git a/Makefile b/Makefile index cde740f..67a558d 100644 --- a/Makefile +++ b/Makefile @@ -1,23 +1,40 @@ -.PHONY: build dist redist install install-from-source clean uninstall +.PHONY: build build-inplace test test-inplace dist redist install install-from-source clean uninstall publish-test publish build: python3 setup.py build +build-inplace: + python3 setup.py build_ext --inplace + +test-inplace: + PYTHONPATH=. pytest + +test: + python -m pytest + dist: - python3 setup.py sdist bdist_wheel + python setup.py sdist bdist_wheel redist: clean dist install: - pip install . + python -m pip install . install-from-source: dist - pip install dist/magenpy-*.tar.gz + python -m pip install dist/magenpy-*.tar.gz clean: $(RM) -r build dist *.egg-info - $(RM) -r magenpy/LDMatrix.c magenpy/utils/c_utils.c + $(RM) -r magenpy/stats/ld/*.c magenpy/stats/score/*.cpp + $(RM) -r magenpy/stats/ld/*.so magenpy/stats/score/*.so + $(RM) -r .pytest_cache .tox temp output find . 
-name __pycache__ -exec rm -r {} + uninstall: - pip uninstall magenpy \ No newline at end of file + python -m pip uninstall magenpy + +publish-test: + python -m twine upload -r testpypi dist/* --verbose + +publish: + python -m twine upload dist/* --verbose diff --git a/README.md b/README.md index ae57846..4c45351 100644 --- a/README.md +++ b/README.md @@ -4,716 +4,32 @@ [![PyPI version fury.io](https://badge.fury.io/py/magenpy.svg)](https://pypi.python.org/pypi/magenpy/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -This repository includes modules and scripts for loading, manipulating, and simulating with genotype data. -The software works mainly with `plink`'s `.bed` file format, with the hope that we will extend this to -other genotype data formats in the future. -The features and functionalities that this package supports are: +[![Linux CI](https://github.com/shz9/magenpy/actions/workflows/ci-linux.yml/badge.svg)](https://github.com/shz9/magenpy/actions/workflows/ci-linux.yml) +[![MacOS CI](https://github.com/shz9/magenpy/actions/workflows/ci-osx.yml/badge.svg)](https://github.com/shz9/magenpy/actions/workflows/ci-osx.yml) +[![Windows CI](https://github.com/shz9/magenpy/actions/workflows/ci-windows.yml/badge.svg)](https://github.com/shz9/magenpy/actions/workflows/ci-windows.yml) +[![Docs Build](https://github.com/shz9/magenpy/actions/workflows/ci-docs.yml/badge.svg)](https://github.com/shz9/magenpy/actions/workflows/ci-docs.yml) +[![Binary wheels](https://github.com/shz9/magenpy/actions/workflows/wheels.yml/badge.svg)](https://github.com/shz9/magenpy/actions/workflows/wheels.yml) -- Efficient LD matrix construction and storage in [Zarr](https://zarr.readthedocs.io/en/stable/) array format. -- Data structures for harmonizing various GWAS data sources. -- Simulating complex traits (continuous and binary) using elaborate genetic architectures. - - Multi-cohort simulation scenarios (beta) - - Simulations incorporating functional annotations in the genetic architecture (beta) -- Interfaces for performing association testing on simulated and real phenotypes. -- Preliminary support for processing and integrating genomic annotations with other data sources. -**NOTE**: The codebase is still in active development and some of interfaces or data structures will be -replaced or modified in future releases. Check the [CHANGELOG](CHANGELOG.md) for the latest updates -and modifications. +[![Downloads](https://static.pepy.tech/badge/magenpy)](https://pepy.tech/project/magenpy) +[![Downloads](https://static.pepy.tech/badge/magenpy/month)](https://pepy.tech/project/magenpy) -## Table of Contents +`magenpy` is a Python package for modeling and analyzing statistical genetics data. +The package provides tools for: -- [Installation](#Installation) -- [Getting started](#getting-started) -- [Features and Configurations](#features-and-configurations) - - [(1) Complex trait simulation](#1-complex-trait-simulation) - - [(2) Genome-wide Association Testing](#2-genome-wide-association-testing) - - [(3) Calculating LD matrices](#3-calculating-ld-matrices) - - [LD estimators and their properties](#ld-estimators-and-their-properties) - - [(4) Data harmonization](#4-data-harmonization) - - [(5) Using `plink` as a backend](#5-using-plink-as-backend) - - [(6) Commandline scripts](#6-commandline-scripts) -- [Citations](#citations) +* Reading and processing genotype data in `plink` BED format. 
+* Efficient LD matrix construction and storage in [Zarr](https://zarr.readthedocs.io/en/stable/index.html) array format. +* Data structures for harmonizing various GWAS data sources. + * Includes parsers for commonly used GWAS summary statistics formats. +* Simulating polygenic traits (continuous and binary) using complex genetic architectures. + * Multi-cohort simulation scenarios (beta) + * Simulations incorporating functional annotations in the genetic architecture (beta) +* Interfaces for performing association testing on simulated and real phenotypes. +* Preliminary support for processing and integrating genomic annotations with other data sources. +### Helpful links -## Installation - -`magenpy` is now available on the python package index `pypi` and -can be minimally installed using the package installer `pip`: - -```shell -pip install magenpy -``` - -To access the full functionalities of `magenpy`, however, it is recommended that -you install the full list of dependencies: - -```shell -pip install magenpy[full] -``` - -To use `magenpy` on a shared computing cluster, we recommend installing it in a -`python` virtual environment. For example: - -```shell -module load python/3.8 -python -m venv magenpy_env -source magenpy_env/bin/activate -pip install --upgrade pip -pip install magenpy -``` - -Finally, if you wish to install the package from source, -you can directly clone it from the GitHub repository and install it locally -as follows: - -```shell -git clone https://github.com/shz9/magenpy.git -cd magenpy -make install -``` - -## Getting started - -`magenpy` comes with a sample dataset from the 1000G project that -you can use to experiment and familiarize yourself with its features. -Once the package is installed, you can run a couple of quick tests -to verify that the main features are working properly. - -For example, to simulate a quantitative trait, you can invoke -the following commands in a `python` interpreter: - -```python -import magenpy as mgp -g_sim = mgp.GWASimulator(mgp.tgp_eur_data_path(), - h2=0.1) -g_sim.simulate() -g_sim.to_phenotype_table() -# FID IID phenotype -# 0 HG00096 HG00096 0.795651 -# 1 HG00097 HG00097 0.550914 -# 2 HG00099 HG00099 -0.928486 -# 3 HG00100 HG00100 0.893626 -# 4 HG00101 HG00101 -0.670106 -# .. ... ... ... -# 373 NA20815 NA20815 0.246071 -# 374 NA20818 NA20818 1.821426 -# 375 NA20819 NA20819 -0.457994 -# 376 NA20826 NA20826 0.954208 -# 377 NA20828 NA20828 0.088412 -# -# [378 rows x 3 columns] -``` - -This simulates a quantitative trait with heritability set to 0.1, -using genotype data for a subset of 378 individuals of European ancestry -from the 1000G project and approximately 15,000 SNPs on chromosome 22. -By default, the simulator assumes that only 10% of the SNPs are -causal (this is drawn at random from a Bernoulli distribution with `p=0.1`). -To obtain a list of the causal SNPs in this simulation, you can invoke the -`.get_causal_status()` method, which returns a boolean vector indicating -whether each SNP is causal or not: - -```python -g_sim.get_causal_status() -# {22: array([ True, False, False, ..., False, False, False])} -``` - -In this case, for example, the first SNP is causal for the simulated phenotype. A note -about the design of data structures in `magenpy`. Our main data structure is a class known -as `GWADataLoader`, which is an all-purpose object that brings together different data sources and -harmonizes them together. 
In `GWADataLoader`, SNP-related data sources are stored in dictionaries, where -the key is the chromosome number and the value is the data structure associated with that chromosome. -Thus, in the output above, the data is for chromosome 22 and the feature is a boolean -vector indicating whether a given SNP is causal or not. - -You can also get the full information -about the genetic architecture by invoking the method `.to_true_beta_table()`, -which returns a `pandas` dataframe with the effect size, expected heritability contribution, -and causal status of each variant in the simulation: - -```python -g_sim.to_true_beta_table() -# CHR SNP A1 MixtureComponent Heritability BETA Causal -# 0 22 rs131538 A 1 0.000063 -0.008013 True -# 1 22 rs9605903 C 0 0.000000 0.000000 False -# 2 22 rs5746647 G 0 0.000000 0.000000 False -# 3 22 rs16980739 T 0 0.000000 0.000000 False -# 4 22 rs9605923 A 0 0.000000 0.000000 False -# ... ... ... .. ... ... ... ... -# 15933 22 rs8137951 A 0 0.000000 0.000000 False -# 15934 22 rs2301584 A 0 0.000000 0.000000 False -# 15935 22 rs3810648 G 0 0.000000 0.000000 False -# 15936 22 rs2285395 A 0 0.000000 0.000000 False -# 15937 22 rs28729663 A 0 0.000000 0.000000 False -# -# [15938 rows x 7 columns] -``` - - -We can also simulate a more complex genetic architecture by, e.g. simulating effect sizes from -4 Gaussian mixture components, instead of the standard spike-and-slab density used by default: - -```python -g_sim = mgp.GWASimulator(mgp.tgp_eur_data_path(), - pi=[.9, .03, .03, .04], - d=[0., .01, .1, 1.], - h2=0.1) -g_sim.simulate() -g_sim.to_phenotype_table() -# FID IID phenotype -# 0 HG00096 HG00096 0.435024 -# 1 HG00097 HG00097 1.030874 -# 2 HG00099 HG00099 0.042322 -# 3 HG00100 HG00100 1.392733 -# 4 HG00101 HG00101 0.722763 -# .. ... ... ... -# 373 NA20815 NA20815 -0.402506 -# 374 NA20818 NA20818 -0.321429 -# 375 NA20819 NA20819 -0.845630 -# 376 NA20826 NA20826 -0.690078 -# 377 NA20828 NA20828 0.256937 -# -# [378 rows x 3 columns] -``` - -The parameter `pi` specifies the mixing proportions for the Gaussian mixture -distribution and the `d` is a multiplier on the variance (see references below). In this case, 90% of the variants -are not causal, and the remaining 10% are divided between 3 mixture components that contribute -differentially to the heritability. The last component, which constitutes 4% of all SNPs, contributes 100 -times and 10 times to the heritability than components 2 an 3, respectively. - -## Features and Configurations - -### (1) Complex trait simulation - -`magenpy` may be used for complex trait simulation employing a variety of different -genetic architectures and phenotype likelihoods. For example, to simulate a quantitative -trait with heritability set to 0.25 and where a random subset of 15% of the variants are causal, -you may invoke the following command: - -```python -import magenpy as mgp -g_sim = mgp.GWASimulator(mgp.tgp_eur_data_path(), - pi=[.85, .15], - h2=0.25) -g_sim.simulate() -``` - -Then, you can export the simulated phenotype to a `pandas` dataframe as follows: - -```python -g_sim.to_phenotype_table() -# FID IID phenotype -# 0 HG00096 HG00096 -2.185944 -# 1 HG00097 HG00097 -1.664984 -# 2 HG00099 HG00099 -0.208703 -# 3 HG00100 HG00100 0.257040 -# 4 HG00101 HG00101 -0.068826 -# .. ... ... ... 
-# 373 NA20815 NA20815 -1.770358 -# 374 NA20818 NA20818 1.823890 -# 375 NA20819 NA20819 0.835763 -# 376 NA20826 NA20826 -0.029256 -# 377 NA20828 NA20828 -0.088353 -# -# [378 rows x 3 columns] -``` - -To simulate a binary, or case-control, trait, the interface is very similar. First, -you need to specify that the likelihood for the phenotype is binomial (`phenotype_likelihood='binomial'`), and then -specify the prevalence of the positive cases in the population. For example, -to simulate a case-control trait with heritability of 0.3 and prevalence of 8%, we can invoke the following -command: - -```python -import magenpy as mgp -g_sim = mgp.GWASimulator(mgp.tgp_eur_data_path(), - phenotype_likelihood='binomial', - prevalence=.08, - h2=0.3) -g_sim.simulate() -g_sim.to_phenotype_table() -# FID IID phenotype -# 0 HG00096 HG00096 0 -# 1 HG00097 HG00097 0 -# 2 HG00099 HG00099 0 -# 3 HG00100 HG00100 0 -# 4 HG00101 HG00101 0 -# .. ... ... ... -# 373 NA20815 NA20815 0 -# 374 NA20818 NA20818 0 -# 375 NA20819 NA20819 1 -# 376 NA20826 NA20826 0 -# 377 NA20828 NA20828 0 -# -# [378 rows x 3 columns] -``` - -### (2) Genome-wide Association Testing - -`magenpy` is not a GWAS tool. However, we do support preliminary association -testing functionalities either via closed-form formulas for quantitative traits, or -by providing a `python` interface to third-party association testing tools, such as `plink`. - -If you are conducting simple tests based on simulated data, an easy way to perform -association testing is to tell the simulator that you'd like to perform GWAS on the -simulated trait, with the `perform_gwas=True` flag: - -```python -import magenpy as mgp -g_sim = mgp.GWASimulator(mgp.tgp_eur_data_path(), - pi=[.85, .15], - h2=0.25) -g_sim.simulate(perform_gwas=True) -``` - -Alternatively, you can conduct association testing on real or -simulated phenotypes using the `.perform_gwas()` method and exporting the -summary statistics to a `pandas` dataframe with `.to_summary_statistics_table()`: - -```python -g_sim.perform_gwas() -g_sim.to_summary_statistics_table() -# CHR SNP POS A1 A2 ... N BETA Z SE PVAL -# 0 22 rs131538 16871137 A G ... 378 -0.046662 -0.900937 0.051793 0.367622 -# 1 22 rs9605903 17054720 C T ... 378 0.063977 1.235253 0.051793 0.216736 -# 2 22 rs5746647 17057138 G T ... 378 0.057151 1.103454 0.051793 0.269830 -# 3 22 rs16980739 17058616 T C ... 378 -0.091312 -1.763029 0.051793 0.077896 -# 4 22 rs9605923 17065079 A T ... 378 0.069368 1.339338 0.051793 0.180461 -# ... ... ... ... .. .. ... ... ... ... ... ... -# 15933 22 rs8137951 51165664 A G ... 378 0.078817 1.521782 0.051793 0.128064 -# 15934 22 rs2301584 51171497 A G ... 378 0.076377 1.474658 0.051793 0.140304 -# 15935 22 rs3810648 51175626 G A ... 378 -0.001448 -0.027952 0.051793 0.977701 -# 15936 22 rs2285395 51178090 A G ... 378 -0.019057 -0.367949 0.051793 0.712911 -# 15937 22 rs28729663 51219006 A G ... 378 0.029667 0.572805 0.051793 0.566777 -# -# [15938 rows x 11 columns] -``` - -If you wish to use `plink2` for association testing (highly recommended), ensure that -you tell `GWASimulator` (or any `GWADataLoader`-derived object) to use plink by explicitly -specifying the `backend` software that you wish to use: - -```python -import magenpy as mgp -g_sim = mgp.GWASimulator(mgp.tgp_eur_data_path(), - backend='plink', - pi=[.85, .15], - h2=0.25) -g_sim.simulate(perform_gwas=True) -``` - -When using `plink`, we sometimes create temporary intermediate files to pass to the software. 
To clean up -the temporary directories and files, you can invoke the `.cleanup()` command: - -```python -g_sim.cleanup() -``` - -### (3) Calculating LD matrices - -One of the main features of the `magenpy` package is an efficient interface for computing -and storing Linkage Disequilibrium (LD) matrices. LD matrices record the pairwise SNP-by-SNP -Pearson correlation coefficient. In general, LD matrices are computed for each chromosome separately -or may also be computed within LD blocks from, e.g. LDetect. For large autosomal chromosomes, -LD matrices can be huge and may require extra care from the user. - -In `magenpy`, LD matrices can be computed using either `xarray` or `plink`, depending on the -backend that the user specifies (see Section 5 below). In general, at this moment, we do not recommend using -`xarray` as a backend for large genotype matrices, as it is less efficient than `plink`. When using the default -`xarray` as a backend, we compute the full `X'X` (X-transpose-X) matrix first, store it on-disk in chunked -`Zarr` arrays and then perform all sparsification procedures afterwards. When using `plink` as a -backend, on the other hand, we only compute LD between variants that are generally in close proximity -along the chromosome, so it is generally more efficient. In the end, both will be transformed such that -the LD matrix is stored in sparse `Zarr` arrays. - -**A note on dependencies**: If you wish to use `xarray` as a backend to compute LD matrices, -you may need to install some of the optional dependencies for `magenpy`, including e.g. `rechunker`. In this case, -it is recommended that you install all the dependencies listed in `requirements-optional.txt`. If you wish -to use `plink` as a backend, you may need to configure the paths for `plink` as explained in Section 5 below. - -In either case, to compute an LD matrix using `magenpy`, you can invoke the `.compute_ld()` method -of all `GWADataLoader`-derived objects, as follows: - -```python -# Using xarray: -import magenpy as mgp -gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path()) -gdl.compute_ld(estimator='windowed', - output_dir='output/ldl/', - window_size=100) -``` - -This creates a windowed LD matrix where we only measure the correlation between the focal SNP and the nearest -100 from either side. As stated above, the LD matrix will be stored on-disk and that is why we must -specify the output directory when we call `.compute_ld()`. To use `plink` to compute the LD matrix, -we can invoke a similar command: - -```python -# Using plink: -import magenpy as mgp -gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(), - backend='plink') -gdl.compute_ld(estimator='windowed', - output_dir='output/ld/', - cm_window_size=3.) -``` - -In this case, we are computing a windowed LD matrix where we only measure the correlation between -SNPs that are at most 3 centi Morgan (cM) apart along the chromosome. For this small 1000G dataset, computing -the LD matrix takes about a minute. The LD matrices in Zarr format will be written to the path -specified in `output_dir`, so ensure that this argument is set to the desired directory. - -To facilitate working with LD matrices stored in `Zarr` format, we created a data structure in cython called `LDMatrix`, -which acts as an intermediary and provides various features. 
For example, to compute LD scores -using this LD matrix, you can invoke the command `.compute_ld_scores()` on it: - -```python -gdl.ld[22] -# -gdl.ld[22].compute_ld_scores() -# array([1.60969673, 1.84471792, 1.59205322, ..., 3.3126724 , 3.42234106, -# 2.97252452]) -``` - -You can also get a table that lists the properties of the SNPs included in the LD matrix: - -```python -gdl.ld[22].to_snp_table() -# CHR SNP POS A1 MAF -# 0 22 rs9605903 17054720 C 0.260736 -# 1 22 rs5746647 17057138 G 0.060327 -# 2 22 rs16980739 17058616 T 0.131902 -# 3 22 rs9605927 17067005 C 0.033742 -# 4 22 rs5746664 17074622 A 0.066462 -# ... ... ... ... .. ... -# 14880 22 rs8137951 51165664 A 0.284254 -# 14881 22 rs2301584 51171497 A 0.183027 -# 14882 22 rs3810648 51175626 G 0.065440 -# 14883 22 rs2285395 51178090 A 0.061350 -# 14884 22 rs28729663 51219006 A 0.159509 -# -# [14885 rows x 5 columns] -``` - -Finally, note that the `LDMatrix` object supports an iterator interface, so in principle -you can iterate over rows of the LD matrix without loading the entire thing into memory. -The following example shows the first 10 entries of the first row of the matrix: - -```python -np.array(next(gdl.ld[22]))[:10] -# array([ 1.00000262, -0.14938791, -0.27089083, 0.33311111, 0.35015815, -# -0.08077946, -0.08077946, 0.0797345 , -0.16252513, -0.23680465]) -``` - -Finally, as of `magenpy>=0.0.2`, now you can export the Zarr array into a `scipy` sparse `csr` -matrix as follows: - -```python -gdl.ld[22].to_csr_matrix() -# <15938x15938 sparse matrix of type '' -# with 24525854 stored elements in Compressed Sparse Row format> -``` - -#### LD estimators and their properties - -`magenpy` supports computing LD matrices using 4 different estimators that are commonly used -in statistical genetics applications. -For a more thorough description of the estimators and their properties, consult our manuscript -and the citations therein. The LD estimators are: - -1) `windowed` (recommended): The windowed estimator computes the pairwise correlation coefficient between SNPs that are - within a pre-defined distance along the chromosome from each other. In many statistical genetics applications, the - recommended distance is between 1 and 3 centi Morgan (cM). As of `magenpy==0.0.2`, now you can customize - the distance based on three criteria: **(1)** A window size based on the number neighboring variants, **(2)** - distance threshold in kilobases (kb), and **(3)** distance threshold in centi Morgan (cM). When defining the - boundaries for each SNP, `magenpy` takes the intersection of the boundaries defined by each window. - -```python -import magenpy as mgp -gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(), - backend='plink') -gdl.compute_ld('windowed', output_dir='output/ld/', - window_size=100, kb_window_size=1000, cm_window_size=2.) -gdl.cleanup() -``` - -2) `block`: The block estimator estimates the pairwise correlation coefficient between -variants that are in the same LD block, as defined by, e.g. LDetect. Given an LD block file, - we can compute a block-based LD matrix as follows: - -```python -import magenpy as mgp -ld_block_url = "https://bitbucket.org/nygcresearch/ldetect-data/raw/ac125e47bf7ff3e90be31f278a7b6a61daaba0dc/EUR/fourier_ls-all.bed" -gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(), - backend='plink') -gdl.compute_ld('block', output_dir='output/ld/', - ld_blocks_file=ld_block_url) -gdl.cleanup() -``` - -If you have the LD blocks file on your system, you can also pass the path to the file instead. 
- -3) `shrinkage`: For the shrinkage estimator, we shrink the entries of the LD matrix by a - quantity related to the distance between SNPs along the chromosome + some additional information - related to the sample from which the genetic map was estimated. In particular, - we need to specify the effective population size and the sample size used to - estimate the genetic map. Also, to make the matrix sparse, we often specify a threshold value - below which we consider the correlation to be zero. Here's an example for the 1000G sample: - - -```python -import magenpy as mgp -gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(), - backend='plink') -gdl.compute_ld('shrinkage', output_dir='output/ld/', - genetic_map_ne=11400, # effective population size (Ne) - genetic_map_sample_size=183, # Sample size - threshold=1e-3) # The cutoff value -gdl.cleanup() -``` - -4) `sample`: This estimator computes the pairwise correlation coefficient between all SNPs on - the same chromosome and thus results in a dense matrix. Thus, it is rarely used in practice and - we include it here for testing/debugging purposes mostly. To compute the sample LD matrix, you only need - to specify the correct estimator: - -```python -import magenpy as mgp -gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(), - backend='plink') -gdl.compute_ld('sample', output_dir='output/ld/') -gdl.cleanup() -``` - -### (4) Data harmonization - -There are many different statistical genetics data sources and formats out there. One of the goals of -`magenpy` is to create a friendly interface for matching and merging these data sources for -downstream analyses. For example, for summary statistics-based methods, we often need -to merge the LD matrix derived from a reference panel with the GWAS summary statistics estimated -in a different cohort. While this is a simple task, it can be tricky sometimes, e.g. in -cases where the effect allele is flipped between the two cohort. - -The functionalities that we provide for this are minimal at this stage and mainly geared towards -harmonizing `Zarr`-formatted LD matrices with GWAS summary statistics. The following example -shows how to do this in a simple case: - -```python -import magenpy as mgp -# First, generate some summary statistics from a simulation: -g_sim = mgp.GWASimulator(mgp.tgp_eur_data_path()) -g_sim.simulate() -g_sim.to_summary_statistics_table().to_csv( - "chr_22.sumstats", sep="\t", index=False -) -# Then load those summary statistics and match them with previously -# computed windowed LD matrix for chromosome 22: -gdl = mgp.GWADataLoader(ld_store_files='output/windowed_ld/chr_22/', - sumstats_files='chr_22.sumstats', - sumstats_format='magenpy') -``` - -Here, the `GWADataLoader` object takes care of the harmonization step by -automatically invoking the `.harmonize_data()` method. When you read or update -any of the data sources, we recommend that you invoke the `.harmonize_data()` method again -to make sure that all the data sources are aligned properly. In the near future, -we are planning to add many other functionalities in this space. Stay tuned. - -### (5) Using `plink` as backend - -Many of the functionalities that `magenpy` supports require access to and performing linear algebra -operations on the genotype matrix. By default, `magenpy` uses `xarray` and `dask` -to carry out these operations, as these are the tools supported by our main dependency: `pandas-plink`. - -However, `dask` can be quite slow and inefficient when deployed on large-scale genotype matrices. 
To get -around this difficulty, for many operations, such as linear scoring or computing minor allele frequency, -we support (and recommend) using `plink` as a backend. - -To use `plink` as a backend for `magenpy`, first you may need to configure the paths -on your system. By default, `magenpy` assumes that, in the shell, the name `plink2` invokes the `plink2` -executable and `plink` invokes `plink1.9` software. To change this behavior, you can update the -configuration file as follows. First, let's see the default configurations that ship with `magenpy`: - -```python -import magenpy as mgp -mgp.print_options() -# -> Section: DEFAULT -# ---> plink1.9_path: plink -# ---> plink2_path: plink2 -``` - -The above shows the default configurations for the `plink1.9` and `plink2` paths. To change -the path for `plink2`, for example, you can use the `set_option()` function: - -```python -mgp.set_option("plink2_path", "~/software/plink2/plink2") -mgp.print_options() -# -> Section: USER -# ---> plink2_path: ~/software/plink2/plink2 -# ---> plink1.9_path: plink -# -> Section: DEFAULT -# ---> plink1.9_path: plink -# ---> plink2_path: plink2 -``` - -As you can see, this added a new section to the configuration file, named `USER`, that has the -new path for the `plink2` software. Now, every time `magenpy` needs to invoke `plink2`, it calls -the executable stored at `~/software/plink2/`. Note that you only need to do this once on any particular -machine or system, as this preference is now recorded in the configuration file and will be taken into -account for all future operations. - -Note that for most of the operations, we assume that the user has `plink2` installed. We only -use `plink1.9` for some operations that are currently not supported by `plink2`, especially for -e.g. LD computation. This behavior may change in the near future. - -Once the paths are configured, to use `plink` as a backend for the various computations and -tools, make sure that you specify the `backend='plink'` flag in `GWADataLoader` and all of its -derived data structures (including all the `GWASimulator` classes): - -```python -import magenpy as mgp -gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(), - backend='plink') -``` - -### (6) Commandline scripts - -If you are not comfortable programming in `python` and would like to access some of the functionalities -of `magenpy` with minimal interaction with `python` code, we packaged a number of commandline -scripts that can be useful for some downstream applications. - -The binaries that are currently supported are: - -1) `magenpy_ld`: For computing LD matrices and storing them in `Zarr` format. -2) `magenpy_simulate`: For simulating complex traits with various genetic architectures. 
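As a quick illustration, using only flags documented in the help messages shown below (the file paths here are hypothetical), typical invocations of the two scripts might look like the following:

```shell
# Compute a windowed LD matrix, where pairs of variants are at most
# 3 centi Morgan (cM) apart, using plink as the backend:
magenpy_ld --bfile data/chr_22 \
           --estimator windowed \
           --ld-window-cm 3. \
           --backend plink \
           --output-dir output/ld/

# Simulate a quantitative trait with heritability 0.25, where 10% of
# the variants are causal (spike-and-slab architecture):
magenpy_simulate --bed-files "data/chr_22.bed" \
                 --h2 0.25 \
                 --mix-prop 0.9,0.1 \
                 --var-mult 0,1 \
                 --output-file output/simulated_phenotype
```
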
- -Once you install `magenpy` via `pip`, these two scripts will be added to the system `PATH` -and you can invoke them directly from the commandline, as follows: - -```shell -$ magenpy_ld -h - -********************************************** - _ __ ___ __ _ __ _ ___ _ __ _ __ _ _ -| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | | -| | | | | | (_| | (_| | __/ | | | |_) | |_| | -|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, | - |___/ |_| |___/ -Modeling and Analysis of Genetics data in python -Version: 0.0.10 | Release date: May 2022 -Author: Shadi Zabad, McGill University -********************************************** -< Compute LD matrix and output in Zarr format > - -usage: magenpy_ld [-h] [--estimator {block,shrinkage,sample,windowed}] --bfile BED_FILE [--keep KEEP_FILE] [--extract EXTRACT_FILE] [--backend {plink,xarray}] [--temp-dir TEMP_DIR] - --output-dir OUTPUT_DIR [--min-maf MIN_MAF] [--min-mac MIN_MAC] [--ld-window LD_WINDOW] [--ld-window-kb LD_WINDOW_KB] [--ld-window-cm LD_WINDOW_CM] [--ld-blocks LD_BLOCKS] - [--genmap-Ne GENMAP_NE] [--genmap-sample-size GENMAP_SS] [--shrinkage-cutoff SHRINK_CUTOFF] - -Commandline arguments for LD matrix computation - -optional arguments: - -h, --help show this help message and exit - --estimator {block,shrinkage,sample,windowed} - The LD estimator (windowed, shrinkage, block, sample) - --bfile BED_FILE The path to a plink BED file - --keep KEEP_FILE A plink-style keep file to select a subset of individuals to compute the LD matrices. - --extract EXTRACT_FILE - A plink-style extract file to select a subset of SNPs to compute the LD matrix for. - --backend {plink,xarray} - The backend software used to compute the Linkage-Disequilibrium between variants. - --temp-dir TEMP_DIR The temporary directory where we store intermediate files. - --output-dir OUTPUT_DIR - The output directory where the Zarr formatted LD matrices will be stored. - --min-maf MIN_MAF The minimum minor allele frequency for variants included in the LD matrix. - --min-mac MIN_MAC The minimum minor allele count for variants included in the LD matrix. - --ld-window LD_WINDOW - Maximum number of neighboring SNPs to consider when computing LD. - --ld-window-kb LD_WINDOW_KB - Maximum distance (in kilobases) between pairs of variants when computing LD. - --ld-window-cm LD_WINDOW_CM - Maximum distance (in centi Morgan) between pairs of variants when computing LD. - --ld-blocks LD_BLOCKS - Path to the file with the LD block boundaries, in LDetect format (e.g. chr start stop, tab-separated) - --genmap-Ne GENMAP_NE - The effective population size for the population from which the genetic map was derived. - --genmap-sample-size GENMAP_SS - The sample size for the dataset used to infer the genetic map. - --shrinkage-cutoff SHRINK_CUTOFF - The cutoff value below which we assume that the correlation between variants is zero. 
-``` - -And: - -```shell -$ magenpy_simulate -h - -********************************************** - _ __ ___ __ _ __ _ ___ _ __ _ __ _ _ -| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | | -| | | | | | (_| | (_| | __/ | | | |_) | |_| | -|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, | - |___/ |_| |___/ -Modeling and Analysis of Genetics data in python -Version: 0.0.10 | Release date: May 2022 -Author: Shadi Zabad, McGill University -********************************************** -< Simulate complex quantitative or case-control traits > - -usage: magenpy_simulate [-h] --bed-files BED_FILES [--keep KEEP_FILE] [--extract EXTRACT_FILE] [--backend {plink,xarray}] [--temp-dir TEMP_DIR] --output-file OUTPUT_FILE - [--output-simulated-effects] [--min-maf MIN_MAF] [--min-mac MIN_MAC] --h2 H2 [--mix-prop MIX_PROP] [--var-mult VAR_MULT] [--likelihood {binomial,gaussian}] - [--prevalence PREVALENCE] - -Commandline arguments for the complex trait simulator - -optional arguments: - -h, --help show this help message and exit - --bed-files BED_FILES - The BED files containing the genotype data. You may use a wildcard here (e.g. "data/chr_*.bed") - --keep KEEP_FILE A plink-style keep file to select a subset of individuals for simulation. - --extract EXTRACT_FILE - A plink-style extract file to select a subset of SNPs for simulation. - --backend {plink,xarray} - The backend software used for the computation. - --temp-dir TEMP_DIR The temporary directory where we store intermediate files. - --output-file OUTPUT_FILE - The path where the simulated phenotype will be stored (no extension needed). - --output-simulated-effects - Output a table with the true simulated effect size for each variant. - --min-maf MIN_MAF The minimum minor allele frequency for variants included in the simulation. - --min-mac MIN_MAC The minimum minor allele count for variants included in the simulation. - --h2 H2 Trait heritability. Ranges between 0. and 1., inclusive. - --mix-prop MIX_PROP, -p MIX_PROP - Mixing proportions for the mixture density (comma separated). For example, for the spike-and-slab mixture density, with the proportion of causal variants set to 0.1, - you can specify: "--mix-prop 0.9,0.1 --var-mult 0,1". - --var-mult VAR_MULT, -d VAR_MULT - Multipliers on the variance for each mixture component. - --likelihood {binomial,gaussian} - The likelihood for the simulated trait. Gaussian (e.g. quantitative) or binomial (e.g. case-control). - --prevalence PREVALENCE - The prevalence of cases (or proportion of positives) for binary traits. Ranges between 0. and 1. -``` - -You can find examples of how to run the commandline scripts in the `examples` directory on GitHub. -To request other functionalities to be packaged with `magenpy`, please contact the developers or -open an Issue on [GitHub](https://github.com/shz9/magenpy). - -## Citations - -Shadi Zabad, Simon Gravel, Yue Li. **Fast and Accurate Bayesian Polygenic Risk Modeling with Variational Inference**. 
(2022) -```bibtex -@article { - Zabad2022.05.10.491396, - author = {Zabad, Shadi and Gravel, Simon and Li, Yue}, - title = {Fast and Accurate Bayesian Polygenic Risk Modeling with Variational Inference}, - elocation-id = {2022.05.10.491396}, - year = {2022}, - doi = {10.1101/2022.05.10.491396}, - publisher = {Cold Spring Harbor Laboratory}, - URL = {https://www.biorxiv.org/content/early/2022/05/11/2022.05.10.491396}, - journal = {bioRxiv} -} -``` +- [Documentation](https://magenpy.github.io/magenpy/) +- [Citation / BibTeX records](./CITATION.md) +- [Report issues/bugs](https://github.com/shz9/magenpy/issues) diff --git a/bin/magenpy_ld b/bin/magenpy_ld index 92e7e32..534241e 100644 --- a/bin/magenpy_ld +++ b/bin/magenpy_ld @@ -1,20 +1,36 @@ #!/usr/bin/env python3 """ -Author: Shadi Zabad -Date: May 2022 +Compute Linkage-Disequilibrium (LD) matrices and store in Zarr array format +---------------------------- + +This is a commandline script that facilitates the computation of LD matrices +from genotype data stored in plink BED format. The script supports various +estimators for computing the LD matrix, including windowed, shrinkage, block, +and sample-based estimators. The script outputs the computed LD matrices in +Zarr array format, which is a compressed, chunked, and scalable format +for storing large numerical arrays. + +Usage: + + python -m magenpy_ld --bfile <bed_file> --estimator <estimator> --output-dir <output_dir> + +For larger genotype matrices, we recommend using `plink1.9` as a backend to compute the LD matrices. +You can do that by specifying the `--backend` parameter: + + python -m magenpy_ld --bfile <bed_file> --estimator <estimator> --output-dir <output_dir> --backend plink -This is a commandline script that enables users to generate -LD matrices in Zarr format from plink's `.bed` files. """ import os.path as osp import argparse import magenpy as mgp +import time +from datetime import timedelta from magenpy.utils.system_utils import valid_url from magenpy.GenotypeMatrix import xarrayGenotypeMatrix, plinkBEDGenotypeMatrix -print(f""" +print(fr""" ********************************************** _ __ ___ __ _ __ _ ___ _ __ _ __ _ _ | '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | | | | | | | | (_| | (_| | __/ | | | |_) | |_| | |_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, | |___/ |_| |___/ Modeling and Analysis of Genetics data in python -Version: {mgp.__version__} | Release date: May 2022 +Version: {mgp.__version__} | Release date: {mgp.__release_date__} Author: Shadi Zabad, McGill University ********************************************** < Compute LD matrix and output in Zarr format > """) parser = argparse.ArgumentParser(description=""" -Commandline arguments for LD matrix computation + Commandline arguments for LD matrix computation """) # General options: parser.add_argument('--estimator', dest='estimator', type=str, default='windowed', choices={'windowed', 'shrinkage', 'block', 'sample'}, help='The LD estimator (windowed, shrinkage, block, sample)') parser.add_argument('--bfile', dest='bed_file', type=str, required=True, - help='The path to a plink BED file') + help='The path to a plink BED file.') parser.add_argument('--keep', dest='keep_file', type=str, help='A plink-style keep file to select a subset of individuals to compute the LD matrices.') parser.add_argument('--extract', dest='extract_file', type=str, @@ -54,6 +70,29 @@ parser.add_argument('--min-maf', dest='min_maf', type=float, parser.add_argument('--min-mac', dest='min_mac', type=float, help='The minimum minor allele count for variants included in the LD matrix.') +# Metadata / 
reproducibility options: +parser.add_argument('--genome-build', dest='genome_build', type=str, + help='The genome build for the genotype data (recommend storing as metadata).') +parser.add_argument('--metadata', dest='metadata', type=str, + help='A comma-separated string with metadata keys and values. This is used to store ' + 'information about the genotype data from which the LD matrix was computed, such as ' + 'the biobank/samples, cohort characteristics (e.g. ancestry), etc. Keys and values ' + 'should be separated by =, such that inputs are in the form of:' + '--metadata Biobank=UKB,Ancestry=EUR,Date=April2024') + +# Argument for the float precision: +parser.add_argument('--storage-dtype', dest='storage_dtype', type=str, + default='int16', help='The data type for the entries of the LD matrix.', + choices={'float32', 'float64', 'int16', 'int8'}) + +# Add arguments for the compressor: +parser.add_argument('--compressor', dest='compressor', type=str, + default='lz4', help='The compressor name or compression algorithm to use for the LD matrix.', + choices={'lz4', 'zstd', 'gzip', 'zlib'}) + +parser.add_argument('--compression-level', dest='compression_level', type=int, + default=5, help='The compression level to use for the entries of the LD matrix (1-9).') + # Options for the various LD estimators: # For the windowed estimator: @@ -85,7 +124,7 @@ args = parser.parse_args() if args.estimator == 'windowed': if args.ld_window is None and args.ld_window_kb is None and args.ld_window_cm is None: raise Exception("For the windowed estimator, the user must provide the window size using --ld-window or " - "the maximum distance in kb (--ld-window-kb) or cM (--ld-window-cm).") + "the maximum distance in kilobases (--ld-window-kb) or centi Morgan (--ld-window-cm).") elif args.estimator == 'block': if args.ld_blocks is None: @@ -142,6 +181,10 @@ if args.min_maf is not None: if args.min_mac is not None: print(">>> Minimum allele count:", args.min_mac) +print(">>> Storage data type:", args.storage_dtype) +print(">>> Compressor:", args.compressor) +print(">>> Compression level:", args.compression_level) + print("\n\n> Output:") print(">>> Temporary directory:", args.temp_dir) print(">>> Output directory:", args.output_dir) @@ -152,9 +195,13 @@ print(">>> Output directory:", args.output_dir) print("\n\n> Processing the genotype data...") if args.backend == 'xarray': - g = xarrayGenotypeMatrix.from_file(args.bed_file, temp_dir=args.temp_dir) + g = xarrayGenotypeMatrix.from_file(args.bed_file, + temp_dir=args.temp_dir, + genome_build=args.genome_build) else: - g = plinkBEDGenotypeMatrix.from_file(args.bed_file, temp_dir=args.temp_dir) + g = plinkBEDGenotypeMatrix.from_file(args.bed_file, + temp_dir=args.temp_dir, + genome_build=args.genome_build) if args.keep_file is not None: print("> Filtering samples...") @@ -169,9 +216,35 @@ if args.min_mac is not None or args.min_maf is not None: g.filter_by_allele_frequency(min_maf=args.min_maf, min_mac=args.min_mac) +# Record start time: +start_time = time.time() + +# Compute LD matrix: print("> Computing the LD matrix...") -g.compute_ld(args.estimator, args.output_dir, **ld_kwargs) +ld_mat = g.compute_ld(args.estimator, + args.output_dir, + dtype=args.storage_dtype, + compressor_name=args.compressor, + compression_level=args.compression_level, + **ld_kwargs) + +# Store metadata (if provided): +if args.metadata is not None: + parsed_metadata = { + k: v for entry in args.metadata.split(',') for k, v in [entry.strip().split('=')] + if len(entry.strip()) > 0 + } + + if 
+        for k, v in parsed_metadata.items():
+            ld_mat.set_store_attr(k, v)
+
 # Clean up all intermediate files and directories:
 g.cleanup()
+
 print("Done!")
+print("> Output directory:\n\t", args.output_dir)
+# Record the end time:
+end_time = time.time()
+print('Total runtime:', timedelta(seconds=end_time - start_time))
diff --git a/bin/magenpy_simulate b/bin/magenpy_simulate
index 82a8636..dde44b4 100644
--- a/bin/magenpy_simulate
+++ b/bin/magenpy_simulate
@@ -1,23 +1,38 @@
 #!/usr/bin/env python3

 """
-Author: Shadi Zabad
-Date: May 2022
+Simulate Polygenic Traits using Complex Genetic Architectures
+-------------------------------------------------------------
-This script provides functionalities to simulate complex traits on top of
-existing genotype data in the form of plink's `.bed` files.
+This is a commandline script that facilitates the simulation of complex traits
+using a linear additive model with heterogeneous genetic architectures. It supports
+simulating phenotypes with different heritabilities, levels of polygenicity, and
+genetic architectures, and it outputs the simulated phenotypes in a tabular format
+that can be used for downstream analyses.
+
+The script can simulate both quantitative and case-control traits. For case-control
+traits, the user must specify the prevalence of cases in the population.
+
+The script requires access to genotype data in PLINK BED format.
+
+Usage:
+
+    magenpy_simulate --bfile <bed_file> --h2 <h2> --prop-causal <prop_causal>
--output-file """ import os.path as osp +import numpy as np import magenpy as mgp +import time +from datetime import timedelta import warnings -from magenpy.simulation.GWASimulator import GWASimulator +from magenpy.simulation.PhenotypeSimulator import PhenotypeSimulator from magenpy.utils.system_utils import makedir, get_filenames import argparse -print(f""" +print(fr""" ********************************************** _ __ ___ __ _ __ _ ___ _ __ _ __ _ _ | '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | | @@ -25,7 +40,7 @@ print(f""" |_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, | |___/ |_| |___/ Modeling and Analysis of Genetics data in python -Version: {mgp.__version__} | Release date: May 2022 +Version: {mgp.__version__} | Release date: {mgp.__release_date__} Author: Shadi Zabad, McGill University ********************************************** < Simulate complex quantitative or case-control traits > @@ -34,10 +49,10 @@ Author: Shadi Zabad, McGill University # --------- Options --------- parser = argparse.ArgumentParser(description=""" -Commandline arguments for the complex trait simulator + Commandline arguments for the complex trait simulator """) -parser.add_argument('--bed-files', dest='bed_files', type=str, required=True, +parser.add_argument('--bfile', dest='bed_file', type=str, required=True, help='The BED files containing the genotype data. ' 'You may use a wildcard here (e.g. "data/chr_*.bed")') parser.add_argument('--keep', dest='keep_file', type=str, @@ -52,40 +67,51 @@ parser.add_argument('--temp-dir', dest='temp_dir', type=str, default='temp', parser.add_argument('--output-file', dest='output_file', type=str, required=True, help='The path where the simulated phenotype will be stored ' '(no extension needed).') -parser.add_argument('--output-simulated-effects', dest='output_true', action='store_true', default=False, +parser.add_argument('--output-simulated-beta', dest='output_sim_beta', + action='store_true', default=False, help='Output a table with the true simulated effect size for each variant.') parser.add_argument('--min-maf', dest='min_maf', type=float, help='The minimum minor allele frequency for variants included in the simulation.') -parser.add_argument('--min-mac', dest='min_mac', type=float, +parser.add_argument('--min-mac', dest='min_mac', type=int, help='The minimum minor allele count for variants included in the simulation.') # Simulation parameters: parser.add_argument('--h2', dest='h2', type=float, required=True, help='Trait heritability. Ranges between 0. and 1., inclusive.') -parser.add_argument('--mix-prop', '-p', dest='mix_prop', type=str, +parser.add_argument('--mix-prop', dest='mix_prop', type=str, help='Mixing proportions for the mixture density (comma separated). For example, ' 'for the spike-and-slab mixture density, with the proportion of causal variants ' 'set to 0.1, you can specify: "--mix-prop 0.9,0.1 --var-mult 0,1".') +parser.add_argument('--prop-causal', '-p', dest='prop_causal', type=float, + help='The proportion of causal variants in the simulation. See --mix-prop for ' + 'more complex architectures specification.') parser.add_argument('--var-mult', '-d', dest='var_mult', type=str, help='Multipliers on the variance for each mixture component.') -parser.add_argument('--likelihood', dest='likelihood', type=str, default='gaussian', +parser.add_argument('--phenotype-likelihood', dest='likelihood', type=str, default='gaussian', choices={'gaussian', 'binomial'}, - help='The likelihood for the simulated trait. ' - 'Gaussian (e.g. 
quantitative) or binomial (e.g. case-control).') + help='The likelihood for the simulated trait: ' + 'gaussian (e.g. quantitative) or binomial (e.g. case-control).') parser.add_argument('--prevalence', dest='prevalence', type=float, help='The prevalence of cases (or proportion of positives) for binary traits. ' 'Ranges between 0. and 1.') +parser.add_argument('--seed', dest='seed', type=int, + help='The random seed to use for the random number generator.') + args = parser.parse_args() # ------------------------------------------------------ # Sanity checks on the inputs: -bed_files = get_filenames(args.bed_files, extension=".bed") -if len(bed_files) < 1: - raise FileNotFoundError(f"No BED files were identified at the specified location: {args.bed_files}") +bed_file = get_filenames(args.bed_file, extension=".bed") +if len(bed_file) < 1: + raise FileNotFoundError(f"No BED files were identified at the specified location: {args.bed_file}") -if args.mix_prop is not None: + +if args.prop_causal is not None: + pi = [1. - args.prop_causal, args.prop_causal] + d = [0., 1.] +elif args.mix_prop is not None: pi = list(map(float, args.mix_prop.split(","))) if args.var_mult: d = list(map(float, args.var_mult.split(","))) @@ -106,9 +132,12 @@ print(f"> Simulating complex trait with {args.likelihood} likelihood...") print(f">>> Heritability:", args.h2) print(f">>> Mixing proportions:", pi) print(f">>> Variance multipliers:", d) +if args.likelihood == 'binomial' and args.prevalence is not None: + print(f">>> Prevalence:", args.prevalence) print("\n\n> Source data:") -print(">>> BED files:", args.bed_files) +print(">>> BED files:", args.bed_file) + if args.keep_file is not None: print(">>> Keep samples:", args.keep_file) if args.extract_file is not None: @@ -124,17 +153,25 @@ print(">>> Output file:", args.output_file) # ------------------------------------------------------ -gs = GWASimulator(bed_files, - keep_file=args.keep_file, - extract_file=args.extract_file, - phenotype_likelihood=args.likelihood, - h2=args.h2, - pi=pi, - d=d, - min_maf=args.min_maf, - min_mac=args.min_mac, - backend=args.backend, - temp_dir=args.temp_dir) +# Record start time: +start_time = time.time() + +# Set the random seed: +if args.seed is not None: + np.random.seed(args.seed) + +# Construct the PhenotypeSimulator object: +gs = PhenotypeSimulator(bed_file, + keep_file=args.keep_file, + extract_file=args.extract_file, + phenotype_likelihood=args.likelihood, + h2=args.h2, + pi=pi, + d=d, + min_maf=args.min_maf, + min_mac=args.min_mac, + backend=args.backend, + temp_dir=args.temp_dir) print("> Simulating phenotype...") gs.simulate(reset_beta=True, perform_gwas=False) @@ -145,10 +182,18 @@ makedir(osp.dirname(args.output_file)) pheno_table.to_csv(args.output_file + '.SimPheno', sep="\t", index=False, header=False) -if args.output_true is not None: - +if args.output_sim_beta: + # Output the simulated effect sizes: sim_effects = gs.to_true_beta_table() sim_effects.to_csv(args.output_file + ".SimEffect", sep="\t", index=False) gs.cleanup() + +print("Done!") +print("> Output file(s):\n\t", args.output_file + '.SimPheno') +if args.output_sim_beta: + print("\t", args.output_file + ".SimEffect") +# Record the end time: +end_time = time.time() +print('Total runtime:', timedelta(seconds=end_time - start_time)) diff --git a/docs/api/AnnotationMatrix.md b/docs/api/AnnotationMatrix.md new file mode 100644 index 0000000..bc19c77 --- /dev/null +++ b/docs/api/AnnotationMatrix.md @@ -0,0 +1 @@ +::: magenpy.AnnotationMatrix.AnnotationMatrix \ No 
newline at end of file diff --git a/docs/api/GWADataLoader.md b/docs/api/GWADataLoader.md new file mode 100644 index 0000000..d43570a --- /dev/null +++ b/docs/api/GWADataLoader.md @@ -0,0 +1 @@ +::: magenpy.GWADataLoader.GWADataLoader \ No newline at end of file diff --git a/docs/api/GenotypeMatrix.md b/docs/api/GenotypeMatrix.md new file mode 100644 index 0000000..a5fdfc6 --- /dev/null +++ b/docs/api/GenotypeMatrix.md @@ -0,0 +1 @@ +::: magenpy.GenotypeMatrix \ No newline at end of file diff --git a/docs/api/LDMatrix.md b/docs/api/LDMatrix.md new file mode 100644 index 0000000..1b335ba --- /dev/null +++ b/docs/api/LDMatrix.md @@ -0,0 +1 @@ +::: magenpy.LDMatrix.LDMatrix \ No newline at end of file diff --git a/docs/api/SampleTable.md b/docs/api/SampleTable.md new file mode 100644 index 0000000..fa3246b --- /dev/null +++ b/docs/api/SampleTable.md @@ -0,0 +1 @@ +::: magenpy.SampleTable.SampleTable \ No newline at end of file diff --git a/docs/api/SumstatsTable.md b/docs/api/SumstatsTable.md new file mode 100644 index 0000000..4436737 --- /dev/null +++ b/docs/api/SumstatsTable.md @@ -0,0 +1 @@ +::: magenpy.SumstatsTable.SumstatsTable \ No newline at end of file diff --git a/docs/api/overview.md b/docs/api/overview.md new file mode 100644 index 0000000..1c5cd66 --- /dev/null +++ b/docs/api/overview.md @@ -0,0 +1,45 @@ + + +## Data Structures + +* [GWADataLoader](GWADataLoader.md): A general class for loading multiple statistical genetics data sources and +harmonizing them for downstream analyses. +* [GenotypeMatrix](GenotypeMatrix.md): A class for representing on-disk genotype matrices. It provides +interfaces for querying / manipulating / and performing computations on genotype data. +* [LDMatrix](LDMatrix.md): A class for representing on-disk Linkage-Disequilibrium (LD) matrices. It provides +interfaces for querying / manipulating / and performing computations on LD data. +* [SampleTable](SampleTable.md): A class for representing data about samples (individuals), including covariates, +phenotypes, and other sample-specific metadata. +* [SumstatsTable](SumstatsTable.md): A class for representing summary statistics data from a GWAS study. It provides +interfaces for querying / manipulating / and performing computations on summary statistics data. +* [AnnotationMatrix](AnnotationMatrix.md): A class for representing variant annotations (e.g. functional annotations, +pathogenicity scores, etc.) for a set of variants. It provides interfaces for querying / manipulating / and +performing computations on annotation data. + +## Simulation + +* [PhenotypeSimulator](simulation/PhenotypeSimulator.md): A general class for simulating phenotypes based on genetic data. + +## Parsers + +* [Sumstats Parsers](parsers/sumstats_parsers.md): A collection of parsers for reading GWAS summary statistics files in various formats. +* [Annotation Parsers](parsers/annotation_parsers.md): A collection of parsers for reading variant annotation files in various formats. +* [Plink Parsers](parsers/plink_parsers.md): A collection of parsers for reading PLINK files (BED/BIM/FAM) and other PLINK-related formats. + +## Statistics + +## Plotting + +* [GWAS plots](plot/gwa.md): Functions for plotting various quantities / results from GWAS studies. +* [LD plots](plot/ld.md): Functions for plotting various quantities from LD matrices. + +## Utilities + +* [Compute utilities](utils/compute_utils.md): Utilities for computing various statistics / quantities over python data structures. 
+* [Data utilities](utils/data_utils.md): Utilities for downloading and processing relevant data. +* [Executors](utils/executors.md): A collection of classes for interfacing with third party software, such as `plink`. +* [Model utilities](utils/model_utils.md): Utilities for merging / aligning / filtering GWAS data sources. +* [System utilities](utils/system_utils.md): Utilities for interfacing with the system environment (e.g. file I/O, environment variables, etc.). + +## Data + diff --git a/docs/api/parsers/annotation_parsers.md b/docs/api/parsers/annotation_parsers.md new file mode 100644 index 0000000..606c5bf --- /dev/null +++ b/docs/api/parsers/annotation_parsers.md @@ -0,0 +1 @@ +::: magenpy.parsers.annotation_parsers diff --git a/docs/api/parsers/misc_parsers.md b/docs/api/parsers/misc_parsers.md new file mode 100644 index 0000000..4059db1 --- /dev/null +++ b/docs/api/parsers/misc_parsers.md @@ -0,0 +1 @@ +::: magenpy.parsers.misc_parsers \ No newline at end of file diff --git a/docs/api/parsers/plink_parsers.md b/docs/api/parsers/plink_parsers.md new file mode 100644 index 0000000..d4e7a22 --- /dev/null +++ b/docs/api/parsers/plink_parsers.md @@ -0,0 +1 @@ +::: magenpy.parsers.plink_parsers diff --git a/docs/api/parsers/sumstats_parsers.md b/docs/api/parsers/sumstats_parsers.md new file mode 100644 index 0000000..0f8806a --- /dev/null +++ b/docs/api/parsers/sumstats_parsers.md @@ -0,0 +1 @@ +::: magenpy.parsers.sumstats_parsers \ No newline at end of file diff --git a/docs/api/plot/gwa.md b/docs/api/plot/gwa.md new file mode 100644 index 0000000..e5a4d49 --- /dev/null +++ b/docs/api/plot/gwa.md @@ -0,0 +1 @@ +::: magenpy.plot.gwa \ No newline at end of file diff --git a/docs/api/plot/ld.md b/docs/api/plot/ld.md new file mode 100644 index 0000000..2b7f7d6 --- /dev/null +++ b/docs/api/plot/ld.md @@ -0,0 +1 @@ +::: magenpy.plot.ld \ No newline at end of file diff --git a/docs/api/simulation/AnnotatedPhenotypeSimulator.md b/docs/api/simulation/AnnotatedPhenotypeSimulator.md new file mode 100644 index 0000000..2e22ffb --- /dev/null +++ b/docs/api/simulation/AnnotatedPhenotypeSimulator.md @@ -0,0 +1 @@ +::: magenpy.simulation.AnnotatedPhenotypeSimulator \ No newline at end of file diff --git a/docs/api/simulation/MultiCohortPhenotypeSimulator.md b/docs/api/simulation/MultiCohortPhenotypeSimulator.md new file mode 100644 index 0000000..6d3312c --- /dev/null +++ b/docs/api/simulation/MultiCohortPhenotypeSimulator.md @@ -0,0 +1 @@ +::: magenpy.simulation.MultiCohortPhenotypeSimulator \ No newline at end of file diff --git a/docs/api/simulation/PhenotypeSimulator.md b/docs/api/simulation/PhenotypeSimulator.md new file mode 100644 index 0000000..e0a0917 --- /dev/null +++ b/docs/api/simulation/PhenotypeSimulator.md @@ -0,0 +1 @@ +::: magenpy.simulation.PhenotypeSimulator \ No newline at end of file diff --git a/docs/api/stats/gwa/utils.md b/docs/api/stats/gwa/utils.md new file mode 100644 index 0000000..eb4c420 --- /dev/null +++ b/docs/api/stats/gwa/utils.md @@ -0,0 +1 @@ +::: magenpy.stats.gwa.utils \ No newline at end of file diff --git a/docs/api/stats/h2/ldsc.md b/docs/api/stats/h2/ldsc.md new file mode 100644 index 0000000..e42ea8f --- /dev/null +++ b/docs/api/stats/h2/ldsc.md @@ -0,0 +1 @@ +::: magenpy.stats.h2.ldsc \ No newline at end of file diff --git a/docs/api/stats/ld/estimator.md b/docs/api/stats/ld/estimator.md new file mode 100644 index 0000000..7a110fd --- /dev/null +++ b/docs/api/stats/ld/estimator.md @@ -0,0 +1 @@ +::: magenpy.stats.ld.estimator \ No newline at end of file 
diff --git a/docs/api/stats/ld/utils.md b/docs/api/stats/ld/utils.md new file mode 100644 index 0000000..a93416e --- /dev/null +++ b/docs/api/stats/ld/utils.md @@ -0,0 +1 @@ +::: magenpy.stats.ld.utils \ No newline at end of file diff --git a/docs/api/stats/score/utils.md b/docs/api/stats/score/utils.md new file mode 100644 index 0000000..7a6de23 --- /dev/null +++ b/docs/api/stats/score/utils.md @@ -0,0 +1 @@ +::: magenpy.stats.score.utils \ No newline at end of file diff --git a/docs/api/stats/transforms/genotype.md b/docs/api/stats/transforms/genotype.md new file mode 100644 index 0000000..c04b703 --- /dev/null +++ b/docs/api/stats/transforms/genotype.md @@ -0,0 +1 @@ +::: magenpy.stats.transforms.genotype \ No newline at end of file diff --git a/docs/api/stats/transforms/phenotype.md b/docs/api/stats/transforms/phenotype.md new file mode 100644 index 0000000..b6a7060 --- /dev/null +++ b/docs/api/stats/transforms/phenotype.md @@ -0,0 +1 @@ +::: magenpy.stats.transforms.phenotype \ No newline at end of file diff --git a/docs/api/stats/variant/utils.md b/docs/api/stats/variant/utils.md new file mode 100644 index 0000000..beebf21 --- /dev/null +++ b/docs/api/stats/variant/utils.md @@ -0,0 +1 @@ +::: magenpy.stats.variant.utils \ No newline at end of file diff --git a/docs/api/utils/compute_utils.md b/docs/api/utils/compute_utils.md new file mode 100644 index 0000000..0aa9039 --- /dev/null +++ b/docs/api/utils/compute_utils.md @@ -0,0 +1 @@ +::: magenpy.utils.compute_utils diff --git a/docs/api/utils/data_utils.md b/docs/api/utils/data_utils.md new file mode 100644 index 0000000..97cf5b5 --- /dev/null +++ b/docs/api/utils/data_utils.md @@ -0,0 +1 @@ +::: magenpy.utils.data_utils \ No newline at end of file diff --git a/docs/api/utils/executors.md b/docs/api/utils/executors.md new file mode 100644 index 0000000..407e7e0 --- /dev/null +++ b/docs/api/utils/executors.md @@ -0,0 +1 @@ +::: magenpy.utils.executors \ No newline at end of file diff --git a/docs/api/utils/model_utils.md b/docs/api/utils/model_utils.md new file mode 100644 index 0000000..65e3cd9 --- /dev/null +++ b/docs/api/utils/model_utils.md @@ -0,0 +1 @@ +::: magenpy.utils.model_utils \ No newline at end of file diff --git a/docs/api/utils/system_utils.md b/docs/api/utils/system_utils.md new file mode 100644 index 0000000..b57b4a2 --- /dev/null +++ b/docs/api/utils/system_utils.md @@ -0,0 +1 @@ +::: magenpy.utils.system_utils \ No newline at end of file diff --git a/docs/citation.md b/docs/citation.md new file mode 100644 index 0000000..4b82e4c --- /dev/null +++ b/docs/citation.md @@ -0,0 +1,21 @@ +If you use `magenpy` in your research, please cite the following paper(s): + +> Zabad, S., Gravel, S., & Li, Y. (2023). **Fast and accurate Bayesian polygenic risk modeling with variational inference.** +The American Journal of Human Genetics, 110(5), 741–761. 
https://doi.org/10.1016/j.ajhg.2023.03.009 + +## BibTeX records + +```bibtex +@article{ZABAD2023741, + title = {Fast and accurate Bayesian polygenic risk modeling with variational inference}, + journal = {The American Journal of Human Genetics}, + volume = {110}, + number = {5}, + pages = {741-761}, + year = {2023}, + issn = {0002-9297}, + doi = {https://doi.org/10.1016/j.ajhg.2023.03.009}, + url = {https://www.sciencedirect.com/science/article/pii/S0002929723000939}, + author = {Shadi Zabad and Simon Gravel and Yue Li} +} +``` diff --git a/docs/commandline/magenpy_ld.md b/docs/commandline/magenpy_ld.md new file mode 100644 index 0000000..31e3ff2 --- /dev/null +++ b/docs/commandline/magenpy_ld.md @@ -0,0 +1,82 @@ +Compute Linkage-Disequilibrium (LD) matrices (`magenpy_ld`) +--- + +The `magenpy_ld` script is used to compute Linkage-Disequilibrium (LD) matrices, which record the +pairwise SNP-by-SNP correlations from a sample of genotype data stored in `plink`'s BED format. The script +offers an interface to compute LD matrices by simply specifying the path to the genotype files, the type of LD +estimator to use, the subset of variants or samples to keep, and the output directory. + +A full listing of the options available for the `magenpy_ld` script can be found by running the +following command in your terminal: + +```bash +magenpy_ld -h +``` + +Which outputs the following help message: + +```bash + +********************************************** + _ __ ___ __ _ __ _ ___ _ __ _ __ _ _ +| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | | +| | | | | | (_| | (_| | __/ | | | |_) | |_| | +|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, | + |___/ |_| |___/ +Modeling and Analysis of Genetics data in python +Version: 0.1.0 | Release date: April 2024 +Author: Shadi Zabad, McGill University +********************************************** +< Compute LD matrix and output in Zarr format > + +usage: magenpy_ld [-h] [--estimator {shrinkage,windowed,block,sample}] --bfile BED_FILE [--keep KEEP_FILE] [--extract EXTRACT_FILE] + [--backend {plink,xarray}] [--temp-dir TEMP_DIR] --output-dir OUTPUT_DIR [--min-maf MIN_MAF] [--min-mac MIN_MAC] + [--genome-build GENOME_BUILD] [--metadata METADATA] [--storage-dtype {int8,float32,int16,float64}] + [--compressor {zstd,lz4,zlib,gzip}] [--compression-level COMPRESSION_LEVEL] [--ld-window LD_WINDOW] [--ld-window-kb LD_WINDOW_KB] + [--ld-window-cm LD_WINDOW_CM] [--ld-blocks LD_BLOCKS] [--genmap-Ne GENMAP_NE] [--genmap-sample-size GENMAP_SS] + [--shrinkage-cutoff SHRINK_CUTOFF] + +Commandline arguments for LD matrix computation + +options: + -h, --help show this help message and exit + --estimator {shrinkage,windowed,block,sample} + The LD estimator (windowed, shrinkage, block, sample) + --bfile BED_FILE The path to a plink BED file. + --keep KEEP_FILE A plink-style keep file to select a subset of individuals to compute the LD matrices. + --extract EXTRACT_FILE + A plink-style extract file to select a subset of SNPs to compute the LD matrix for. + --backend {plink,xarray} + The backend software used to compute the Linkage-Disequilibrium between variants. + --temp-dir TEMP_DIR The temporary directory where we store intermediate files. + --output-dir OUTPUT_DIR + The output directory where the Zarr formatted LD matrices will be stored. + --min-maf MIN_MAF The minimum minor allele frequency for variants included in the LD matrix. + --min-mac MIN_MAC The minimum minor allele count for variants included in the LD matrix. 
+  --genome-build GENOME_BUILD
+                        The genome build for the genotype data (recommended to store as metadata).
+  --metadata METADATA   A comma-separated string with metadata keys and values. This is used to store information about the genotype data from which
+                        the LD matrix was computed, such as the biobank/samples, cohort characteristics (e.g. ancestry), etc. Keys and values should
+                        be separated by =, such that inputs are in the form of: --metadata Biobank=UKB,Ancestry=EUR,Date=April2024
+  --storage-dtype {int8,float32,int16,float64}
+                        The data type for the entries of the LD matrix.
+  --compressor {zstd,lz4,zlib,gzip}
+                        The compressor name or compression algorithm to use for the LD matrix.
+  --compression-level COMPRESSION_LEVEL
+                        The compression level to use for the entries of the LD matrix (1-9).
+  --ld-window LD_WINDOW
+                        Maximum number of neighboring SNPs to consider when computing LD.
+  --ld-window-kb LD_WINDOW_KB
+                        Maximum distance (in kilobases) between pairs of variants when computing LD.
+  --ld-window-cm LD_WINDOW_CM
+                        Maximum distance (in centimorgans) between pairs of variants when computing LD.
+  --ld-blocks LD_BLOCKS
+                        Path to the file with the LD block boundaries, in LDetect format (e.g. chr start stop, tab-separated)
+  --genmap-Ne GENMAP_NE
+                        The effective population size for the population from which the genetic map was derived.
+  --genmap-sample-size GENMAP_SS
+                        The sample size for the dataset used to infer the genetic map.
+  --shrinkage-cutoff SHRINK_CUTOFF
+                        The cutoff value below which we assume that the correlation between variants is zero.
+
+```
\ No newline at end of file
diff --git a/docs/commandline/magenpy_simulate.md b/docs/commandline/magenpy_simulate.md
new file mode 100644
index 0000000..9ef0440
--- /dev/null
+++ b/docs/commandline/magenpy_simulate.md
@@ -0,0 +1,68 @@
+Simulate complex traits with varying genetic architectures (`magenpy_simulate`)
+---
+
+The `magenpy_simulate` script facilitates simulating complex traits with a variety of
+genetic architectures, given a set of genotypes stored in `plink`'s BED file format. The script
+takes as input the path to the genotype data, the type of trait to simulate, the parameters of
+the genetic architecture (e.g. polygenicity, heritability, effect sizes), and the output file path
+where the simulated phenotypes will be stored.
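+
+For example, the following is a minimal sketch of an invocation that simulates a quantitative
+trait with heritability 0.2 where roughly 10% of the variants are causal (the genotype path
+`data/chr_22` is a placeholder for the prefix of your own PLINK files):
+
+```bash
+magenpy_simulate --bfile "data/chr_22" \
+                 --h2 0.2 \
+                 --prop-causal 0.1 \
+                 --output-file "output/simulations/sim_1"
+```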
+ +A full listing of the options available for the `magenpy_simulate` script can be found by running the +following command in your terminal: + +```bash +magenpy_simulate -h +``` + +Which outputs the following help message: + +```bash + +********************************************** + _ __ ___ __ _ __ _ ___ _ __ _ __ _ _ +| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | | +| | | | | | (_| | (_| | __/ | | | |_) | |_| | +|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, | + |___/ |_| |___/ +Modeling and Analysis of Genetics data in python +Version: 0.1.0 | Release date: April 2024 +Author: Shadi Zabad, McGill University +********************************************** +< Simulate complex quantitative or case-control traits > + +usage: magenpy_simulate [-h] --bfile BED_FILE [--keep KEEP_FILE] [--extract EXTRACT_FILE] [--backend {plink,xarray}] [--temp-dir TEMP_DIR] + --output-file OUTPUT_FILE [--output-simulated-beta] [--min-maf MIN_MAF] [--min-mac MIN_MAC] --h2 H2 [--mix-prop MIX_PROP] + [--prop-causal PROP_CAUSAL] [--var-mult VAR_MULT] [--phenotype-likelihood {binomial,gaussian}] [--prevalence PREVALENCE] + [--seed SEED] + +Commandline arguments for the complex trait simulator + +options: + -h, --help show this help message and exit + --bfile BED_FILE The BED files containing the genotype data. You may use a wildcard here (e.g. "data/chr_*.bed") + --keep KEEP_FILE A plink-style keep file to select a subset of individuals for simulation. + --extract EXTRACT_FILE + A plink-style extract file to select a subset of SNPs for simulation. + --backend {plink,xarray} + The backend software used for the computation. + --temp-dir TEMP_DIR The temporary directory where we store intermediate files. + --output-file OUTPUT_FILE + The path where the simulated phenotype will be stored (no extension needed). + --output-simulated-beta + Output a table with the true simulated effect size for each variant. + --min-maf MIN_MAF The minimum minor allele frequency for variants included in the simulation. + --min-mac MIN_MAC The minimum minor allele count for variants included in the simulation. + --h2 H2 Trait heritability. Ranges between 0. and 1., inclusive. + --mix-prop MIX_PROP Mixing proportions for the mixture density (comma separated). For example, for the spike-and-slab mixture density, with the + proportion of causal variants set to 0.1, you can specify: "--mix-prop 0.9,0.1 --var-mult 0,1". + --prop-causal PROP_CAUSAL, -p PROP_CAUSAL + The proportion of causal variants in the simulation. See --mix-prop for more complex architectures specification. + --var-mult VAR_MULT, -d VAR_MULT + Multipliers on the variance for each mixture component. + --phenotype-likelihood {binomial,gaussian} + The likelihood for the simulated trait: gaussian (e.g. quantitative) or binomial (e.g. case-control). + --prevalence PREVALENCE + The prevalence of cases (or proportion of positives) for binary traits. Ranges between 0. and 1. + --seed SEED The random seed to use for the random number generator. + +``` \ No newline at end of file diff --git a/docs/commandline/overview.md b/docs/commandline/overview.md new file mode 100644 index 0000000..a1161ed --- /dev/null +++ b/docs/commandline/overview.md @@ -0,0 +1,14 @@ +In addition to the python package interface, users may also opt to use some of `magenpy`'s functionalities +via commandline scripts. The commandline interface is limited at this point to mainly simulating complex traits +and computing LD matrices. 
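+
+Both scripts (described below) print a detailed help message listing all of their options,
+which also serves as a quick check that your installation works:
+
+```bash
+magenpy_ld -h
+magenpy_simulate -h
+```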
+ +When you install `magenpy` using `pip`, the commandline scripts are automatically installed on your system and +are available for use. The available scripts are: + +* [`magenpy_ld`](magenpy_ld.md): This script is used to compute LD matrices from genotype data in `plink` BED format. + The script provides a variety of options for the user to customize the LD computation process, including the + choice of LD estimator, storage and compression options, etc. + +* [`magenpy_simulate`](magenpy_simulate.md): This script is used to simulate complex traits with a variety of genetic + architectures. The script provides a variety of options for the user to customize the simulation process, + including the choice of genetic architecture, the proportion of causal variants, the effect sizes, etc. \ No newline at end of file diff --git a/docs/faq.md b/docs/faq.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/features.md b/docs/features.md new file mode 100644 index 0000000..6926f68 --- /dev/null +++ b/docs/features.md @@ -0,0 +1,385 @@ + +## (1) Complex trait simulation + +`magenpy` may be used for complex trait simulation employing a variety of different +genetic architectures and phenotype likelihoods. For example, to simulate a quantitative +trait with heritability set to 0.25 and where a random subset of 15% of the variants are causal, +you may invoke the following command: + +```python linenums="1" +import magenpy as mgp +g_sim = mgp.PhenotypeSimulator(mgp.tgp_eur_data_path(), # Path to 1000G genotype data + pi=[.85, .15], # Proportion of non-causal and causal variants + h2=0.25) # Heritability +# Export simulated phenotype to pandas dataframe: +g_sim.to_phenotype_table() +``` + +``` + FID IID phenotype + 0 HG00096 HG00096 -2.185944 + 1 HG00097 HG00097 -1.664984 + 2 HG00099 HG00099 -0.208703 + 3 HG00100 HG00100 0.257040 + 4 HG00101 HG00101 -0.068826 + .. ... ... ... + 373 NA20815 NA20815 -1.770358 + 374 NA20818 NA20818 1.823890 + 375 NA20819 NA20819 0.835763 + 376 NA20826 NA20826 -0.029256 + 377 NA20828 NA20828 -0.088353 + + [378 rows x 3 columns] +``` + +To simulate a binary, or case-control, trait, the interface is very similar. First, +you need to specify that the likelihood for the phenotype is binomial (`phenotype_likelihood='binomial'`), and then +specify the prevalence of the positive cases in the population. For example, +to simulate a case-control trait with heritability of 0.3 and prevalence of 8%, we can invoke the following +command: + +```python linenums="1" +import magenpy as mgp +g_sim = mgp.PhenotypeSimulator(mgp.tgp_eur_data_path(), + phenotype_likelihood='binomial', + prevalence=.08, + h2=0.3) +g_sim.simulate() +g_sim.to_phenotype_table() +``` + +``` + FID IID phenotype + 0 HG00096 HG00096 0 + 1 HG00097 HG00097 0 + 2 HG00099 HG00099 0 + 3 HG00100 HG00100 0 + 4 HG00101 HG00101 0 + .. ... ... ... + 373 NA20815 NA20815 0 + 374 NA20818 NA20818 0 + 375 NA20819 NA20819 1 + 376 NA20826 NA20826 0 + 377 NA20828 NA20828 0 + + [378 rows x 3 columns] +``` + +## (2) Genome-wide Association Testing (GWAS) + +`magenpy` is **not** a GWAS tool. However, we do support preliminary association +testing functionalities either via closed-form formulas for quantitative traits, or +by providing a `python` interface to third-party association testing tools, such as `plink`. 
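+
+For intuition, the closed-form route computes standard per-variant least-squares estimates.
+Below is a rough `numpy` sketch of the marginal effect sizes and their standard errors under
+standardized genotypes and phenotypes (an illustration of the underlying math, not of
+`magenpy`'s internal implementation; assumes no monomorphic variants):
+
+```python linenums="1"
+import numpy as np
+
+def marginal_gwas(X, y):
+    """Per-SNP least-squares slopes and standard errors for an (N, M) genotype matrix."""
+    N = X.shape[0]
+    Xs = (X - X.mean(axis=0)) / X.std(axis=0)  # standardize genotypes (requires std > 0)
+    ys = (y - y.mean()) / y.std()              # standardize the phenotype
+    beta = Xs.T @ ys / N                       # marginal slopes
+    se = np.sqrt((1. - beta**2) / (N - 2))     # standard errors from the residual variance
+    return beta, se
+```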
+ +If you are conducting simple tests based on simulated data, an easy way to perform +association testing is to tell the simulator that you'd like to perform GWAS on the +simulated trait, with the `perform_gwas=True` flag: + +```python linenums="1" +import magenpy as mgp +g_sim = mgp.PhenotypeSimulator(mgp.tgp_eur_data_path(), + pi=[.85, .15], + h2=0.25) +g_sim.simulate(perform_gwas=True) +``` + +Alternatively, you can conduct association testing on real or +simulated phenotypes using the `.perform_gwas()` method and exporting the +summary statistics to a `pandas` dataframe with `.to_summary_statistics_table()`: + +```python linenums="1" +g_sim.perform_gwas() +g_sim.to_summary_statistics_table() +``` + +``` + CHR SNP POS A1 A2 ... N BETA Z SE PVAL + 0 22 rs131538 16871137 A G ... 378 -0.046662 -0.900937 0.051793 0.367622 + 1 22 rs9605903 17054720 C T ... 378 0.063977 1.235253 0.051793 0.216736 + 2 22 rs5746647 17057138 G T ... 378 0.057151 1.103454 0.051793 0.269830 + 3 22 rs16980739 17058616 T C ... 378 -0.091312 -1.763029 0.051793 0.077896 + 4 22 rs9605923 17065079 A T ... 378 0.069368 1.339338 0.051793 0.180461 + ... ... ... ... .. .. ... ... ... ... ... ... + 15933 22 rs8137951 51165664 A G ... 378 0.078817 1.521782 0.051793 0.128064 + 15934 22 rs2301584 51171497 A G ... 378 0.076377 1.474658 0.051793 0.140304 + 15935 22 rs3810648 51175626 G A ... 378 -0.001448 -0.027952 0.051793 0.977701 + 15936 22 rs2285395 51178090 A G ... 378 -0.019057 -0.367949 0.051793 0.712911 + 15937 22 rs28729663 51219006 A G ... 378 0.029667 0.572805 0.051793 0.566777 + + [15938 rows x 11 columns] +``` + +If you wish to use `plink2` for association testing (highly recommended), ensure that +you tell `PhenotypeSimulator` (or any `GWADataLoader`-derived object) to use plink by explicitly +specifying the `backend` software that you wish to use: + +```python linenums="1" +import magenpy as mgp +g_sim = mgp.PhenotypeSimulator(mgp.tgp_eur_data_path(), + backend='plink', # Set the backend + pi=[.85, .15], + h2=0.25) +g_sim.simulate(perform_gwas=True) +g_sim.cleanup() # Clean up temporary files +``` + +When using `plink`, we sometimes create temporary intermediate files to pass to the software. To clean up +the temporary directories and files, you can invoke the `.cleanup()` command. + +## (3) Calculating LD matrices + +One of the main features of the `magenpy` package is an efficient interface for computing +and storing Linkage Disequilibrium (LD) matrices. LD matrices record the pairwise SNP-by-SNP +Pearson correlation coefficient. In general, LD matrices are computed for each chromosome separately +or may also be computed within LD blocks from, e.g. LDetect. For large autosomal chromosomes, +LD matrices can be huge and may require extra care from the user. + +In `magenpy`, LD matrices can be computed using either `xarray` or `plink`, depending on the +backend that the user specifies (see Section 5 below). In general, at this moment, we do not recommend using +`xarray` as a backend for large genotype matrices, as it is less efficient than `plink`. When using the default +`xarray` as a backend, we compute the full `X'X` (X-transpose-X) matrix first, store it on-disk in chunked +`Zarr` arrays and then perform all sparsification procedures afterwards. When using `plink` as a +backend, on the other hand, we only compute LD between variants that are generally in close proximity +along the chromosome, so it is generally more efficient. 
In the end, both will be transformed such that
+the LD matrix is stored in sparse `Zarr` arrays.
+
+In either case, to compute an LD matrix using `magenpy`, you can invoke the `.compute_ld()` method
+of all `GWADataLoader`-derived objects, as follows:
+
+```python linenums="1"
+# Using xarray:
+import magenpy as mgp
+gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path())
+gdl.compute_ld(estimator='windowed',
+               output_dir='output/ld/',
+               window_size=100)
+gdl.cleanup()
+```
+
+This creates a windowed LD matrix where we only measure the correlation between the focal SNP and the nearest
+100 variants from either side. As stated above, the LD matrix will be stored on-disk and that is why we must
+specify the output directory when we call `.compute_ld()`. To use `plink` to compute the LD matrix,
+we can invoke a similar command:
+
+```python linenums="1"
+# Using plink:
+import magenpy as mgp
+gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(),
+                        backend='plink')
+gdl.compute_ld(estimator='windowed',
+               output_dir='output/ld/',
+               cm_window_size=3.)
+gdl.cleanup()
+```
+
+In this case, we are computing a windowed LD matrix where we only measure the correlation between
+SNPs that are at most 3 centimorgans (cM) apart along the chromosome. For this small 1000G dataset, computing
+the LD matrix takes about a minute. The LD matrices in Zarr format will be written to the path
+specified in `output_dir`, so ensure that this argument is set to the desired directory.
+
+To facilitate working with LD matrices stored in `Zarr` format, we created a data structure in python called `LDMatrix`,
+which acts as an intermediary and provides various features. For example, to compute LD scores
+using this LD matrix, you can invoke the command `.compute_ld_scores()` on it:
+
+```python linenums="1"
+gdl.ld[22].compute_ld_scores()
+```
+
+```
+array([1.60969673, 1.84471792, 1.59205322, ..., 3.3126724 , 3.42234106,
+       2.97252452])
+```
+
+You can also get a table that lists the properties of the SNPs included in the LD matrix:
+
+```python linenums="1"
+gdl.ld[22].to_snp_table()
+```
+
+```
+       CHR         SNP       POS A1       MAF
+0       22   rs9605903  17054720  C  0.260736
+1       22   rs5746647  17057138  G  0.060327
+2       22  rs16980739  17058616  T  0.131902
+3       22   rs9605927  17067005  C  0.033742
+4       22   rs5746664  17074622  A  0.066462
+...    ...         ...       ... ..       ...
+14880   22   rs8137951  51165664  A  0.284254
+14881   22   rs2301584  51171497  A  0.183027
+14882   22   rs3810648  51175626  G  0.065440
+14883   22   rs2285395  51178090  A  0.061350
+14884   22  rs28729663  51219006  A  0.159509
+
+[14885 rows x 5 columns]
+```
+
+### LD estimators and their properties
+
+`magenpy` supports computing LD matrices using 4 different estimators that are commonly used
+in statistical genetics applications.
+For a more thorough description of the estimators and their properties, consult our manuscript
+and the citations therein. The LD estimators are:
+
+1) `windowed` (recommended): The windowed estimator computes the pairwise correlation coefficient between SNPs that are
+   within a pre-defined distance along the chromosome from each other. In many statistical genetics applications, the
+   recommended distance is between 1 and 3 centimorgans (cM). As of `magenpy>=0.0.2`, you can customize
+   the distance based on three criteria: **(1)** a window size based on the number of neighboring variants, **(2)** a
+   distance threshold in kilobases (kb), and **(3)** a distance threshold in centimorgans (cM).
When defining the
+   boundaries for each SNP, `magenpy` takes the intersection of the boundaries defined by each criterion.
+
+```python linenums="1"
+import magenpy as mgp
+gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(),
+                        backend='plink')
+gdl.compute_ld('windowed',
+               output_dir='output/ld/',
+               window_size=100, kb_window_size=1000, cm_window_size=2.)
+gdl.cleanup()
+```
+
+2) `block`: The block estimator estimates the pairwise correlation coefficient between
+   variants that are in the same LD block, as defined by, e.g. LDetect. Given an LD block file,
+   we can compute a block-based LD matrix as follows:
+
+```python linenums="1"
+import magenpy as mgp
+ld_block_url = "https://bitbucket.org/nygcresearch/ldetect-data/raw/ac125e47bf7ff3e90be31f278a7b6a61daaba0dc/EUR/fourier_ls-all.bed"
+gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(),
+                        backend='plink')
+gdl.compute_ld('block',
+               output_dir='output/ld/',
+               ld_blocks_file=ld_block_url)
+gdl.cleanup()
+```
+
+If you have the LD blocks file on your system, you can also pass the path to the file instead.
+
+3) `shrinkage`: For the shrinkage estimator, we shrink the entries of the LD matrix by a
+   quantity related to the distance between SNPs along the chromosome, plus some additional information
+   related to the sample from which the genetic map was estimated. In particular,
+   we need to specify the effective population size and the sample size used to
+   estimate the genetic map. Also, to make the matrix sparse, we often specify a threshold value
+   below which we consider the correlation to be zero. Here's an example for the 1000G sample:
+
+```python linenums="1"
+import magenpy as mgp
+gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(),
+                        backend='plink')
+gdl.compute_ld('shrinkage',
+               output_dir='output/ld/',
+               genetic_map_ne=11400,  # effective population size (Ne)
+               genetic_map_sample_size=183,  # sample size
+               threshold=1e-3)  # the cutoff value
+gdl.cleanup()
+```
+
+4) `sample`: This estimator computes the pairwise correlation coefficient between all SNPs on
+   the same chromosome, resulting in a dense matrix. Thus, it is rarely used in practice, and
+   we include it here mostly for testing/debugging purposes. To compute the sample LD matrix, you only need
+   to specify the correct estimator:
+
+```python linenums="1"
+import magenpy as mgp
+gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(),
+                        backend='plink')
+gdl.compute_ld('sample', output_dir='output/ld/')
+gdl.cleanup()
+```
+
+## (4) Data harmonization
+
+There are many different statistical genetics data sources and formats out there. One of the goals of
+`magenpy` is to create a friendly interface for matching and merging these data sources for
+downstream analyses. For example, for summary statistics-based methods, we often need
+to merge the LD matrix derived from a reference panel with the GWAS summary statistics estimated
+in a different cohort. While this is a simple task, it can be tricky sometimes, e.g. in
+cases where the effect allele is flipped between the two cohorts.
+
+The functionalities that we provide for this are minimal at this stage and mainly geared towards
+harmonizing `Zarr`-formatted LD matrices with GWAS summary statistics.
The following example +shows how to do this in a simple case: + +```python linenums="1" +import magenpy as mgp +# First, generate some summary statistics from a simulation: +g_sim = mgp.PhenotypeSimulator(mgp.tgp_eur_data_path()) +g_sim.simulate() +g_sim.to_summary_statistics_table().to_csv( + "chr_22.sumstats", sep="\t", index=False +) +# Then load those summary statistics and match them with previously +# computed windowed LD matrix for chromosome 22: +gdl = mgp.GWADataLoader(ld_store_files='output/windowed_ld/chr_22/', + sumstats_files='chr_22.sumstats', + sumstats_format='magenpy') +``` + +Here, the `GWADataLoader` object takes care of the harmonization step by +automatically invoking the `.harmonize_data()` method. When you read or update +any of the data sources, we recommend that you invoke the `.harmonize_data()` method again +to make sure that all the data sources are aligned properly. In the near future, +we are planning to add many other functionalities in this space. Stay tuned. + +## (5) Using `plink` as backend + +Many of the functionalities that `magenpy` supports require access to and performing linear algebra +operations on the genotype matrix. By default, `magenpy` uses `xarray` and `dask` +to carry out these operations, as these are the tools supported by our main dependency: `pandas-plink`. + +However, `dask` can be quite slow and inefficient when deployed on large-scale genotype matrices. To get +around this difficulty, for many operations, such as linear scoring or computing minor allele frequency, +we support (and recommend) using `plink` as a backend. + +To use `plink` as a backend for `magenpy`, first you may need to configure the paths +on your system. By default, `magenpy` assumes that, in the shell, the name `plink2` invokes the `plink2` +executable and `plink` invokes `plink1.9` software. To change this behavior, you can update the +configuration file as follows. First, let's see the default configurations that ship with `magenpy`: + +```python linenums="1" +import magenpy as mgp +mgp.print_options() +``` + +``` +-> Section: DEFAULT +---> plink1.9_path: plink +---> plink2_path: plink2 +``` + +The above shows the default configurations for the `plink1.9` and `plink2` paths. To change +the path for `plink2`, for example, you can use the `set_option()` function: + +```python linenums="1" +mgp.set_option("plink2_path", "~/software/plink2/plink2") +mgp.print_options() +``` + +``` +-> Section: USER +---> plink2_path: ~/software/plink2/plink2 +---> plink1.9_path: plink +-> Section: DEFAULT +---> plink1.9_path: plink +---> plink2_path: plink2 +``` + +As you can see, this added a new section to the configuration file, named `USER`, that has the +new path for the `plink2` software. Now, every time `magenpy` needs to invoke `plink2`, it calls +the executable stored at `~/software/plink2/`. Note that you only need to do this once on any particular +machine or system, as this preference is now recorded in the configuration file and will be taken into +account for all future operations. + +Note that for most of the operations, we assume that the user has `plink2` installed. We only +use `plink1.9` for some operations that are currently not supported by `plink2`, especially for +e.g. LD computation. This behavior may change in the near future. 
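+
+The same mechanism works for the `plink1.9` executable; a minimal sketch (the path below is a
+placeholder for wherever the binary lives on your system):
+
+```python linenums="1"
+import magenpy as mgp
+# Update the path for the plink1.9 executable (placeholder path):
+mgp.set_option("plink1.9_path", "~/software/plink/plink")
+```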
+ +Once the paths are configured, to use `plink` as a backend for the various computations and +tools, make sure that you specify the `backend='plink'` flag in `GWADataLoader` and all of its +derived data structures (including all the `PhenotypeSimulator` classes): + +```python linenums="1" +import magenpy as mgp +gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(), + backend='plink') +``` \ No newline at end of file diff --git a/docs/getting_started.md b/docs/getting_started.md new file mode 100644 index 0000000..cbe213c --- /dev/null +++ b/docs/getting_started.md @@ -0,0 +1,121 @@ +`magenpy` is a `python` package that aims to streamline working with statistical genetics data +in order to facilitate downstream analyses. The package comes with a sample dataset from the 1000G project that +you can use to experiment and familiarize yourself with its features. +Once the package is installed, you can run a couple of quick tests +to verify that the main features are working properly. + +For example, to simulate a quantitative trait, you can invoke +the following commands in a `python` interpreter: + +```python linenums="1" +import magenpy as mgp +g_sim = mgp.PhenotypeSimulator(mgp.tgp_eur_data_path(), # Provide path to 1000G data + h2=0.1) # Heritability set to 0.1 +g_sim.simulate() # Simulate the phenotype +g_sim.to_phenotype_table() + +``` + +``` + FID IID phenotype + 0 HG00096 HG00096 0.795651 + 1 HG00097 HG00097 0.550914 + 2 HG00099 HG00099 -0.928486 + 3 HG00100 HG00100 0.893626 + 4 HG00101 HG00101 -0.670106 + .. ... ... ... + 373 NA20815 NA20815 0.246071 + 374 NA20818 NA20818 1.821426 + 375 NA20819 NA20819 -0.457994 + 376 NA20826 NA20826 0.954208 + 377 NA20828 NA20828 0.088412 + + [378 rows x 3 columns] +``` + +This simulates a quantitative trait with heritability set to 0.1, +using genotype data for a subset of 378 individuals of European ancestry +from the 1000G project and approximately 15,000 SNPs on chromosome 22. +By default, the simulator assumes that only 10% of the SNPs are +causal (this is drawn at random from a Bernoulli distribution with `p=0.1`). +To obtain a list of the causal SNPs in this simulation, you can invoke the +`.get_causal_status()` method, which returns a boolean vector indicating +whether each SNP is causal or not: + +```python linenums="1" +g_sim.get_causal_status() +``` + +``` +{22: array([ True, False, False, ..., False, False, False])} +``` + +In this case, for example, the first SNP is causal for the simulated phenotype. A note +about the design of data structures in `magenpy`. Our main data structure is a class known +as `GWADataLoader`, which is an all-purpose object that brings together different data sources and +harmonizes them together. In `GWADataLoader`, SNP-related data sources are stored in dictionaries, where +the key is the chromosome number and the value is the data structure associated with that chromosome. +Thus, in the output above, the data is for chromosome 22 and the feature is a boolean +vector indicating whether a given SNP is causal or not. 
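+
+Since these dictionaries are keyed by chromosome number, you can index the output of
+`.get_causal_status()` with `22` to retrieve the boolean vector for the sample dataset
+(a minimal sketch, assuming the simulation above has been run):
+
+```python linenums="1"
+# Index by chromosome number (22 for the 1000G sample data):
+causal_22 = g_sim.get_causal_status()[22]
+# Count the causal variants on chromosome 22:
+int(causal_22.sum())
+```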
+
+You can also get the full information
+about the genetic architecture by invoking the method `.to_true_beta_table()`,
+which returns a `pandas` dataframe with the effect size, expected heritability contribution,
+and causal status of each variant in the simulation:
+
+```python linenums="1"
+g_sim.to_true_beta_table()
+```
+
+```
+       CHR         SNP A1  MixtureComponent  Heritability      BETA  Causal
+0       22    rs131538  A                 1      0.000063 -0.008013    True
+1       22   rs9605903  C                 0      0.000000  0.000000   False
+2       22   rs5746647  G                 0      0.000000  0.000000   False
+3       22  rs16980739  T                 0      0.000000  0.000000   False
+4       22   rs9605923  A                 0      0.000000  0.000000   False
+...    ...         ... ..               ...           ...       ...     ...
+15933   22   rs8137951  A                 0      0.000000  0.000000   False
+15934   22   rs2301584  A                 0      0.000000  0.000000   False
+15935   22   rs3810648  G                 0      0.000000  0.000000   False
+15936   22   rs2285395  A                 0      0.000000  0.000000   False
+15937   22  rs28729663  A                 0      0.000000  0.000000   False
+
+[15938 rows x 7 columns]
+```
+
+We can also simulate a more complex genetic architecture by, e.g. simulating effect sizes from a
+4-component sparse Gaussian mixture density, instead of the standard spike-and-slab density used by default:
+
+```python linenums="1"
+g_sim = mgp.PhenotypeSimulator(mgp.tgp_eur_data_path(),
+                               pi=[.9, .03, .03, .04],  # Mixing proportions
+                               d=[0., .01, .1, 1.],  # Variance multipliers
+                               h2=0.1)
+g_sim.simulate()
+g_sim.to_phenotype_table()
+```
+
+```
+         FID      IID  phenotype
+0    HG00096  HG00096   0.435024
+1    HG00097  HG00097   1.030874
+2    HG00099  HG00099   0.042322
+3    HG00100  HG00100   1.392733
+4    HG00101  HG00101   0.722763
+..       ...      ...        ...
+373  NA20815  NA20815  -0.402506
+374  NA20818  NA20818  -0.321429
+375  NA20819  NA20819  -0.845630
+376  NA20826  NA20826  -0.690078
+377  NA20828  NA20828   0.256937
+
+[378 rows x 3 columns]
+```
+
+The parameter `pi` specifies the mixing proportions for the Gaussian mixture
+distribution and `d` specifies the multipliers on the variance of each component (see references below).
+In this case, 90% of the variants are not causal, and the remaining 10% are divided among 3 mixture
+components that contribute differentially to the heritability. Each causal variant in the last component,
+which constitutes 4% of all SNPs, contributes 100 times and 10 times more to the heritability than
+causal variants in components 2 and 3, respectively.
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..899f69e
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,42 @@
+# Modeling and Analysis of Statistical Genetics data in Python (`magenpy`)
+
+This site contains documentation, tutorials, and examples for using the `magenpy` package for the purposes of
+handling, harmonizing, and computing over genotype data to prepare them for downstream genetics analyses.
+The `magenpy` package provides tools for:
+
+* Reading and processing genotype data in `plink` BED format.
+* Efficient LD matrix construction and storage in [Zarr](https://zarr.readthedocs.io/en/stable/index.html) array format.
+* Data structures for harmonizing various GWAS data sources.
+    * Includes parsers for commonly used GWAS summary statistics formats.
+* Simulating polygenic traits (continuous and binary) using complex genetic architectures.
+    * Multi-cohort simulation scenarios (beta)
+    * Simulations incorporating functional annotations in the genetic architecture (beta)
+* Interfaces for performing association testing on simulated and real phenotypes.
+* Preliminary support for processing and integrating genomic annotations with other data sources.
+
+If you use `magenpy` in your research, please cite the following paper:
+
+> Zabad, S., Gravel, S., & Li, Y.
(2023). **Fast and accurate Bayesian polygenic risk modeling with variational inference.**
+The American Journal of Human Genetics, 110(5), 741–761. https://doi.org/10.1016/j.ajhg.2023.03.009
+
+
+## Helpful links
+
+* [API Reference](api/overview.md)
+* [Installation](installation.md)
+* [Getting Started](getting_started.md)
+* [Features and Configurations](features.md)
+* [Command Line Scripts](commandline/overview.md)
+* [Project homepage on `GitHub`](https://github.com/shz9/magenpy)
+* [Sister package `viprs`](https://github.com/shz9/viprs)
+
+
+## Contact
+
+If you have any questions or issues, please feel free to open an [issue](https://github.com/shz9/magenpy/issues)
+on the `GitHub` repository or contact us directly at:
+
+* [Shadi Zabad](mailto:shadi.zabad@mail.mcgill.ca)
+* [Yue Li](mailto:yueli@cs.mcgill.ca)
+* [Simon Gravel](mailto:simon.gravel@mcgill.ca)
+
diff --git a/docs/installation.md b/docs/installation.md
new file mode 100644
index 0000000..1823492
--- /dev/null
+++ b/docs/installation.md
@@ -0,0 +1,60 @@
+The `magenpy` software is written in `Cython/Python3`.
+The software is designed to be used in a variety of computing environments, including local workstations,
+shared computing environments, and cloud-based computing environments. Because of the dependencies on `Cython`, you need
+to ensure that a `C/C++` compiler (with appropriate flags) is present on your system.
+
+## Requirements
+
+Building the `magenpy` package requires the following dependencies:
+
+* `Python` (>=3.8)
+* `C/C++` Compiler
+* `Cython` (>=0.29.21)
+* `NumPy` (>=1.19.5)
+
+### Setting up the environment with `conda`
+
+If you use `Anaconda` or `miniconda` to manage your Python environment, we recommend using them to create
+a new environment with the required dependencies as follows:
+
+```bash
+python_version=3.11 # Change python version here if needed
+conda create --name "magenpy_env" -c anaconda -c conda-forge python=$python_version compilers openblas -y
+conda activate magenpy_env
+```
+
+Using `conda` to set up and manage your environment is especially *recommended* if you have trouble compiling
+the `C/C++` extensions on your system.
+
+## Installation
+
+### Using `pip`
+
+The package is available via the Python Package Index (`pypi`) and can be installed using `pip`
+(note that the version specifier should be quoted so that the shell does not interpret `>` as a redirection):
+
+```bash
+python -m pip install "magenpy>=0.1"
+```
+
+### Building from source
+
+You may also build the package from source by cloning the repository and running the `make install` command:
+
+```bash
+git clone https://github.com/shz9/magenpy.git
+cd magenpy
+make install
+```
+
+### Using a virtual environment
+
+If you wish to use `magenpy` on a shared computing environment or cluster, it is recommended that you install
+the package in a virtual environment.
Here's a quick example of how to install `magenpy` on a SLURM-based cluster:
+
+```bash
+module load python/3.8
+python -m venv magenpy_env
+source magenpy_env/bin/activate
+python -m pip install --upgrade pip
+python -m pip install "magenpy>=0.1"
+```
diff --git a/docs/tutorials/overview.md b/docs/tutorials/overview.md
new file mode 100644
index 0000000..e69de29
diff --git a/examples/simulate_phenotype.sh b/examples/simulate_phenotype.sh
index d48ee86..f79aa1d 100644
--- a/examples/simulate_phenotype.sh
+++ b/examples/simulate_phenotype.sh
@@ -17,13 +17,13 @@ TGP_PATH=$(python -c "import magenpy as mgp; print(mgp.tgp_eur_data_path())")

 # Example 1: Simulate a heritable quantitative trait (h2 = 0.2) for the 1000G individuals

-magenpy_simulate --bed-files "$TGP_PATH" \
+magenpy_simulate --bfile "$TGP_PATH" \
                  --output-file "output/simulations/example_1" \
                  --h2 0.2

 # Example 2: Simulate a heritable case-control trait (h2 = 0.3, prevalence=.2) for the 1000G individuals:

-magenpy_simulate --bed-files "$TGP_PATH" \
+magenpy_simulate --bfile "$TGP_PATH" \
                  --output-file "output/simulations/example_2" \
-                 --likelihood "binomial" \
+                 --phenotype-likelihood "binomial" \
                  --h2 0.3 \
@@ -31,14 +31,14 @@

 # Example 3: Use plink as a backend for operations on the genotype matrix (recommended):

-magenpy_simulate --bed-files "$TGP_PATH" \
+magenpy_simulate --bfile "$TGP_PATH" \
                  --backend "plink" \
                  --output-file "output/simulations/example_3" \
                  --h2 0.2

 # Example 4: Use a mixture of 4 Gaussians for the effect sizes:

-magenpy_simulate --bed-files "$TGP_PATH" \
+magenpy_simulate --bfile "$TGP_PATH" \
                  --backend "plink" \
                  --output-file "output/simulations/example_4" \
                  --h2 0.2 \
@@ -47,9 +47,9 @@

 # Example 5: Output the simulated phenotype + simulated effect sizes per variant:

-magenpy_simulate --bed-files "$TGP_PATH" \
+magenpy_simulate --bfile "$TGP_PATH" \
                  --output-file "output/simulations/example_5" \
                  --h2 0.2 \
-                 -p 0.9,0.1 \
+                 --mix-prop 0.9,0.1 \
                  -d 0.,1. \
-                 --output-simulated-effects
+                 --output-simulated-beta
diff --git a/magenpy/AnnotationMatrix.py b/magenpy/AnnotationMatrix.py
index 4219637..1912569 100644
--- a/magenpy/AnnotationMatrix.py
+++ b/magenpy/AnnotationMatrix.py
@@ -2,6 +2,22 @@

 class AnnotationMatrix(object):
+    """
+    A wrapper class for handling annotation matrices, which are essentially tables of
+    features for each variant in the genome. These features include information such as
+    whether the variant is in coding regions, enhancers, etc. It can also include continuous
+    features derived from experimental assays or other sources.
+
+    The purpose of this class is to present a unified and consistent interface for handling
+    annotations across different tools and applications. It should be able to read and write
+    annotation matrices in different formats, filter annotations, and perform basic operations
+    on the annotation matrix. It should also allow users to define new custom annotations
+    that can be used for downstream statistical genetics applications.
+
+    :ivar table: A pandas dataframe containing the annotation information.
+    :ivar _annotations: A list or array of column names to consider as annotations. If not provided,
+    will be inferred heuristically, though we recommend that the user specify this information.
+ """ def __init__(self, annotation_table=None, annotations=None): """ @@ -25,7 +41,7 @@ def __init__(self, annotation_table=None, annotations=None): def from_file(cls, annot_file, annot_format='magenpy', annot_parser=None, **parse_kwargs): """ - Takes an annotation file and initializes an annotation matrix object from it. + Initialize an AnnotationMatrix object from a file. :param annot_file: The path to the annotation file. :param annot_format: The format of the annotation file. For now, we mainly support @@ -33,6 +49,8 @@ def from_file(cls, annot_file, annot_format='magenpy', annot_parser=None, :param annot_parser: An `AnnotationMatrixParser` derived object, which can be tailored to specific annotation formats that the user has. :param parse_kwargs: arguments for the pandas `read_csv` function, such as the delimiter. + + :return: An instance of the `AnnotationMatrix` class. """ from .parsers.annotation_parsers import AnnotationMatrixParser, LDSCAnnotationMatrixParser @@ -53,14 +71,25 @@ def from_file(cls, annot_file, annot_format='magenpy', annot_parser=None, @property def shape(self): + """ + :return: The dimensions of the annotation matrix (number of variants x number of annotations). + """ return self.n_snps, self.n_annotations @property def n_snps(self): + """ + :return: The number of variants in the annotation matrix. + """ return len(self.table) @property def chromosome(self): + """ + A convenience method to get the chromosome if there is only one chromosome in the annotation matrix. + + :return: The chromosome number if there is only one chromosome in the annotation matrix. Otherwise, None. + """ chrom = self.chromosomes if chrom is not None: if len(chrom) == 1: @@ -68,15 +97,24 @@ def chromosome(self): @property def chromosomes(self): + """ + :return: The list of unique chromosomes in the annotation matrix. + """ if 'CHR' in self.table.columns: return self.table['CHR'].unique() @property def snps(self): + """ + :return: The list of SNP rsIDs in the annotation matrix. + """ return self.table['SNP'].values @property def n_annotations(self): + """ + :return: The number of annotations in the annotation matrix. + """ if self.annotations is None: return 0 else: @@ -84,19 +122,28 @@ def n_annotations(self): @property def binary_annotations(self): + """ + :return: A list of binary (0/1) annotations in the annotation matrix. + """ assert self.annotations is not None return np.array([c for c in self.annotations if len(self.table[c].unique()) == 2]) @property def annotations(self): + """ + :return: The list of annotation names or IDs in the annotation matrix. + """ return self._annotations def values(self, add_intercept=False): """ - Returns the annotation matrix. :param add_intercept: Adds a base annotation corresponding to the intercept. + + :return: The annotation matrix as a numpy matrix. + :raises KeyError: If no annotations are defined in the table. """ + if self.annotations is None: raise KeyError("No annotations are defined in this table!") annot_mat = self.table[self.annotations].values @@ -111,7 +158,7 @@ def filter_snps(self, extract_snps=None, extract_file=None): either a list of variants to extract or the path to a file with the list of variants to extract. - :param extract_snps: A list (or array) of SNP IDs to keep in the annotation matrix. + :param extract_snps: A list or array of SNP IDs to keep in the annotation matrix. :param extract_file: The path to a file with the list of variants to extract. 
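+
+        A minimal usage sketch, assuming `annot` is an `AnnotationMatrix` instance
+        (the rsIDs below are hypothetical):
+
+        ```python
+        # Keep only the listed variants in the annotation matrix:
+        annot.filter_snps(extract_snps=['rs1234', 'rs5678'])
+        ```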
""" @@ -121,7 +168,7 @@ def filter_snps(self, extract_snps=None, extract_file=None): from .parsers.misc_parsers import read_snp_filter_file extract_snps = read_snp_filter_file(extract_file) - from magenpy.utils.compute_utils import intersect_arrays + from .utils.compute_utils import intersect_arrays arr_idx = intersect_arrays(self.snps, extract_snps, return_index=True) @@ -130,7 +177,7 @@ def filter_snps(self, extract_snps=None, extract_file=None): def filter_annotations(self, keep_annotations): """ Filter the list of annotations in the matrix. - :param keep_annotations: A list or vector of annotations to keep. + :param keep_annotations: A list or array of annotations to keep. """ if self.annotations is None: @@ -142,6 +189,7 @@ def filter_annotations(self, keep_annotations): def add_annotation(self, annot_vec, annotation_name): """ Add an annotation vector or list to the AnnotationMatrix object. + :param annot_vec: A vector/list/Series containing the annotation information for each SNP in the AnnotationMatrix. For now, it's the responsibility of the user to make sure that the annotation list or vector are sorted properly. @@ -170,12 +218,15 @@ def add_annotation_from_bed(self, bed_file, annotation_name): CHR StartCoordinate EndCoordinate ... - NOTE: This implementation is quite slow at the moment. May need to find more efficient - ways to do the merge over list of ranges. + !!! note + This implementation is quite slow at the moment. May need to find more efficient + ways to do the merge over list of ranges. :param bed_file: The path to the BED file containing the annotation coordinates. :param annotation_name: The name of the annotation to create. Make sure the name is not already in the matrix! + + :raises AssertionError: If the annotation name is already in the matrix. """ from .parsers.annotation_parsers import parse_annotation_bed_file @@ -199,7 +250,7 @@ def annotation_overlap(row): return False check = (chr_range.Start <= row['POS']) & (chr_range.End >= row['POS']) - return int(check.any()) + return int(np.any(check)) self.table[annotation_name] = self.table.apply(annotation_overlap, axis=1) @@ -210,15 +261,17 @@ def annotation_overlap(row): def get_binary_annotation_index(self, bin_annot): """ - Get the indices of all SNPs that belong to binary annotation `bin_annot` + :param bin_annot: The name of the binary annotation for which to fetch the relevant variants. + :return: The indices of all variants that belong to binary annotation `bin_annot` """ assert bin_annot in self.binary_annotations return np.where(self.table[bin_annot] == 1)[0] def split_by_chromosome(self): """ - Split the annotation matrix by chromosome, so that we would - have a separate `AnnotationMatrix` object for each chromosome. + Split the annotation matrix by chromosome. + + :return: A dictionary of `AnnotationMatrix` objects, where the keys are the chromosome numbers. """ if 'CHR' in self.table.columns: @@ -233,7 +286,7 @@ def split_by_chromosome(self): def to_file(self, output_path, col_subset=None, compress=True, **to_csv_kwargs): """ - Write the annotation matrix to file. + A convenience method to write the annotation matrix to a file. :param output_path: The path and prefix to the file where to write the annotation matrix. :param col_subset: A subset of the columns to write to file. 
diff --git a/magenpy/GWADataLoader.py b/magenpy/GWADataLoader.py index 3aecb71..3e3d894 100644 --- a/magenpy/GWADataLoader.py +++ b/magenpy/GWADataLoader.py @@ -1,7 +1,3 @@ -""" -Author: Shadi Zabad -Date: December 2020 -""" from typing import Union, Dict @@ -19,9 +15,33 @@ from .utils.compute_utils import iterable from .utils.system_utils import makedir, get_filenames +from .utils.model_utils import match_chromosomes class GWADataLoader(object): + """ + A class to load and manage multiple data sources for genetic association studies. + This class is designed to handle genotype matrices, summary statistics, LD matrices, + and annotation matrices. It also provides functionalities to filter samples and/or SNPs, + harmonize data sources, and compute LD matrices. This is all done in order to facilitate + downstream statistical genetics analyses that require multiple data sources to be aligned + and harmonized. The use cases include: + + * Summary statistics-based PRS computation + * Summary statistics-based heritability estimation. + * Complex trait simulation. + * Performing Genome-wide association tests. + + :ivar genotype: A dictionary of `GenotypeMatrix` objects, where the key is the chromosome number. + :ivar sample_table: A `SampleTable` object containing the sample information. + :ivar phenotype_likelihood: The likelihood of the phenotype (e.g. `gaussian`, `binomial`). + :ivar ld: A dictionary of `LDMatrix` objects, where the key is the chromosome number. + :ivar sumstats_table: A dictionary of `SumstatsTable` objects, where the key is the chromosome number. + :ivar annotation: A dictionary of `AnnotationMatrix` objects, where the key is the chromosome number. + :ivar backend: The backend software used for the computation. Currently, supports `xarray` and `plink`. + :ivar temp_dir: The temporary directory where we store intermediate files (if necessary). + :ivar output_dir: The output directory where we store the results of the computation. + """ def __init__(self, bed_files=None, @@ -44,7 +64,41 @@ def __init__(self, temp_dir='temp', output_dir='output', verbose=True, - n_threads=1): + threads=1): + """ + Initialize the `GWADataLoader` object with the data sources required for + downstream statistical genetics analyses. + + :param bed_files: The path to the BED file(s). You may use a wildcard here to read files for multiple + chromosomes. + :param phenotype_file: The path to the phenotype file. + (Default: tab-separated file with `FID IID phenotype` columns). + :param covariates_file: The path to the covariates file. + (Default: tab-separated file starting with the `FID IID ...` columns and followed by the covariate columns). + :param keep_samples: A vector or list of sample IDs to keep when filtering the genotype matrix. + :param keep_file: A path to a plink-style keep file to select a subset of individuals. + :param extract_snps: A vector or list of SNP IDs to keep when filtering the genotype matrix. + :param extract_file: A path to a plink-style extract file to select a subset of SNPs. + :param min_maf: The minimum minor allele frequency cutoff. + :param min_mac: The minimum minor allele count cutoff. + :param drop_duplicated: If True, drop SNPs with duplicated rsID. + :param phenotype_likelihood: The likelihood of the phenotype (e.g. `gaussian`, `binomial`). + :param sumstats_files: The path to the summary statistics file(s). The path may be a wildcard. + :param sumstats_format: The format for the summary statistics. 
Currently supports the following
+        formats: `plink1.9`, `plink2`, `magenpy`, `fastGWA`, `COJO`, `SAIGE`, or `GWASCatalog` for the standard
+        summary statistics format (also known as `ssf` or `gwas-ssf`).
+        :param ld_store_files: The path to the LD matrices. This may be a wildcard to accommodate reading data
+        for multiple chromosomes.
+        :param annotation_files: The path to the annotation file(s). The path may contain a wildcard.
+        :param annotation_format: The format for the annotation files. Currently supports the following
+        formats: `magenpy`, `ldsc`.
+        :param backend: The backend software used for computations with the genotype matrix. Currently supports
+        `xarray` and `plink`.
+        :param temp_dir: The temporary directory where to store intermediate files.
+        :param output_dir: The output directory where to store the results of the computation.
+        :param verbose: Verbosity of the information printed to standard output.
+        :param threads: The number of threads to use for computations.
+        """
 
         # ------- Sanity checks -------
 
@@ -62,7 +116,7 @@ def __init__(self,
         makedir([temp_dir, output_dir])
 
         self.verbose = verbose
-        self.n_threads = n_threads
+        self.threads = threads
 
         # ------- General parameters -------
 
@@ -103,25 +157,36 @@ def __init__(self,
 
     @property
     def samples(self):
+        """
+        :return: The list of samples retained in the sample table.
+        """
         if self.sample_table is not None:
             return self.sample_table.iid
 
     @property
     def sample_size(self):
         """
-        The number of samples.
+
+        !!! seealso "See Also"
+            * [n][magenpy.GWADataLoader.GWADataLoader.n]
+
+        :return: The number of samples in the genotype matrix.
+
         """
         if self.sample_table is not None:
             return self.sample_table.n
         elif self.sumstats_table is not None:
             return np.max([np.max(ss.n_per_snp) for ss in self.sumstats_table.values()])
         else:
-            raise Exception("Information about the sample size is not available!")
+            raise ValueError("Information about the sample size is not available!")
 
     @property
    def n(self):
         """
-        The number of samples. See also `.sample_size()`.
+        !!! seealso "See Also"
+            * [sample_size][magenpy.GWADataLoader.GWADataLoader.sample_size]
+
+        :return: The number of samples in the genotype matrix.
         """
         return self.sample_size
 
@@ -129,7 +194,8 @@ def n(self):
     @property
     def snps(self):
         """
-        Return the list of SNPs retained in each chromosome.
+        :return: The list of SNP rsIDs retained in each chromosome.
+        :rtype: dict
         """
         if self.genotype is not None:
             return {c: g.snps for c, g in self.genotype.items()}
@@ -140,26 +206,32 @@ def snps(self):
         elif self.annotation is not None:
             return {c: a.snps for c, a in self.annotation.items()}
         else:
-            raise Exception("GWADataLoader is not properly initialized!")
+            raise ValueError("GWADataLoader instance is not properly initialized!")
 
     @property
     def m(self):
         """
-        The number of variants. See also `.n_snps`
+        !!! seealso "See Also"
+            * [n_snps][magenpy.GWADataLoader.GWADataLoader.n_snps]
+
+        :return: The number of variants in the harmonized data sources.
         """
         return sum(self.shapes.values())
 
     @property
     def n_snps(self):
         """
-        The number of variants. See also `.m`
+        !!! seealso "See Also"
+            * [m][magenpy.GWADataLoader.GWADataLoader.m]
+
+        :return: The number of variants in the harmonized data sources.
         """
         return self.m
 
     @property
     def shapes(self):
         """
-        Return a dictionary where the key is the chromosome number and the value is
+        :return: A dictionary where the key is the chromosome number and the value is
         the number of variants on that chromosome.
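+
+        A sketch of the returned dictionary, assuming `gdl` is an initialized
+        `GWADataLoader` (the chromosome numbers and variant counts are hypothetical):
+
+        ```python
+        gdl.shapes  # e.g. {21: 9958, 22: 15935}
+        ```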
""" if self.genotype is not None: @@ -171,19 +243,19 @@ def shapes(self): elif self.annotation is not None: return {c: a.shape[0] for c, a in self.annotation.items()} else: - raise Exception("GWADataLoader is not properly initialized!") + raise ValueError("GWADataLoader instance is not properly initialized!") @property def chromosomes(self): """ - Return the list of chromosomes that were loaded to `GWADataLoader`. + :return: The list of chromosomes that were loaded to `GWADataLoader`. """ return sorted(list(self.shapes.keys())) @property def n_annotations(self): """ - Return the number of annotations included in the annotation matrices. + :return: The number of annotations included in the annotation matrices. """ if self.annotation is not None: return self.annotation[self.chromosomes[0]].n_annotations @@ -191,9 +263,9 @@ def n_annotations(self): def filter_snps(self, extract_snps=None, extract_file=None, chromosome=None): """ Filter the SNP set from all the GWADataLoader objects. - :param extract_snps: A list or vector of SNP IDs to keep. - :param extract_file: A path to a plink-style file with SNP IDs to keep. - :param chromosome: Chromosome number. If specified, applies the filter on that chromosome only. + :param extract_snps: A list or array of SNP rsIDs to keep. + :param extract_file: A path to a plink-style file with SNP rsIDs to keep. + :param chromosome: Chromosome number. If specified, applies the filter to that chromosome only. """ if extract_snps is None and extract_file is None: @@ -246,7 +318,7 @@ def filter_samples(self, keep_samples=None, keep_file=None): either a list of samples to keep or the path to a file with the list of samples to keep. - :param keep_samples: A list (or array) of sample IDs to keep. + :param keep_samples: A list or array of sample IDs to keep. :param keep_file: The path to a file with the list of samples to keep. """ @@ -352,7 +424,9 @@ def read_genotypes(self, desc="Reading BED files", disable=not self.verbose or len(bed_files) < 2): # Read BED file and update the genotypes dictionary: - self.genotype.update(gmat_class.from_file(bfile, temp_dir=self.temp_dir).split_by_chromosome()) + self.genotype.update(gmat_class.from_file(bfile, + temp_dir=self.temp_dir, + threads=self.threads).split_by_chromosome()) # After reading the genotype matrices, apply some standard filters: for i, (c, g) in enumerate(self.genotype.items()): @@ -380,7 +454,8 @@ def read_phenotype(self, phenotype_file, drop_na=True, **read_csv_kwargs): Read the phenotype file and integrate it with the sample tables and genotype matrices. :param phenotype_file: The path to the phenotype file - (Default: tab-separated file with `FID IID phenotype` columns). + (Default: tab-separated file with `FID IID phenotype` columns). If different, supply + details as additional arguments to this function. :param drop_na: Drop samples with missing phenotype information. :param read_csv_kwargs: keyword arguments for the `read_csv` function of `pandas`. """ @@ -438,7 +513,8 @@ def read_summary_statistics(self, :param sumstats_path: The path to the summary statistics file(s). The path may be a wildcard. :param sumstats_format: The format for the summary statistics. Currently supports the following - formats: `plink`, `magenpy`, `fastGWA`, `COJO`. + formats: `plink1.9`, `plink2`, `magenpy`, `fastGWA`, `COJO`, `SAIGE`, or `GWASCatalog` for the standard + summary statistics format (also known as `ssf` or `gwas-ssf`). 
:param parser: If the summary statistics file does not follow any of the formats above, you can create your own parser by inheriting from the base `SumstatsParser` class and passing it here as an argument. :param drop_duplicated: Drop SNPs with duplicated rsIDs. @@ -485,7 +561,7 @@ def read_summary_statistics(self, elif self.ld is not None: ref_table = {c: ld.snps for c, ld in self.ld.items()} else: - raise Exception("Cannot index summary statistics tables without chromosome information!") + raise ValueError("Cannot index summary statistics tables without chromosome information!") self.sumstats_table.update(ss_tab.split_by_chromosome(snps_per_chrom=ref_table)) @@ -500,7 +576,7 @@ def read_ld(self, ld_store_paths): return if not iterable(ld_store_paths): - ld_store_files = get_filenames(ld_store_paths, extension='.zarray') + ld_store_files = get_filenames(ld_store_paths, extension='.zgroup') else: ld_store_files = ld_store_paths @@ -522,7 +598,7 @@ def read_ld(self, ld_store_paths): def load_ld(self): """ - A utility function to load the LD matrices to memory from on-disk storage. + A utility method to load the LD matrices to memory from on-disk storage. """ if self.ld is not None: for ld in self.ld.values(): @@ -530,13 +606,19 @@ def load_ld(self): def release_ld(self): """ - A utility function to release LD matrices from memory. + A utility function to release the LD matrices from memory. """ if self.ld is not None: for ld in self.ld.values(): ld.release() - def compute_ld(self, estimator, output_dir, **ld_kwargs): + def compute_ld(self, + estimator, + output_dir, + dtype='int16', + compressor_name='lz4', + compression_level=5, + **ld_kwargs): """ Compute the Linkage-Disequilibrium (LD) matrix or SNP-by-SNP Pearson correlation matrix between genetic variants. This function only considers correlations @@ -548,6 +630,10 @@ def compute_ld(self, estimator, output_dir, **ld_kwargs): 4 different estimators: `sample`, `windowed`, `shrinkage`, and `block`. :param output_dir: The output directory where the Zarr array containing the entries of the LD matrix will be stored. + :param dtype: The data type for the entries of the LD matrix (supported data types are float32, float64 + and integer quantized data types int8 and int16). + :param compressor_name: The name of the compression algorithm to use for the LD matrix. + :param compression_level: The compression level to use for the entries of the LD matrix (1-9). :param ld_kwargs: keyword arguments for the various LD estimators. Consult the implementations of `WindowedLD`, `ShrinkageLD`, and `BlockLD` for details. """ @@ -556,7 +642,12 @@ def compute_ld(self, estimator, output_dir, **ld_kwargs): print("> Computing LD matrix...") self.ld = { - c: g.compute_ld(estimator, output_dir, **ld_kwargs) + c: g.compute_ld(estimator, + output_dir, + dtype=dtype, + compressor_name=compressor_name, + compression_level=compression_level, + **ld_kwargs) for c, g in tqdm(sorted(self.genotype.items(), key=lambda x: x[0]), total=len(self.genotype), desc='Computing LD matrices', @@ -564,21 +655,23 @@ def compute_ld(self, estimator, output_dir, **ld_kwargs): } def get_ld_matrices(self): + """ + :return: The LD matrices computed for each chromosome. 
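+
+        A minimal usage sketch, assuming `gdl` is a `GWADataLoader` with genotype data
+        loaded (the estimator name and output directory are illustrative):
+
+        ```python
+        gdl.compute_ld('windowed', output_dir='output/ld')
+        ld_mats = gdl.get_ld_matrices()  # A dictionary: {chromosome: LDMatrix}
+        ```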
+ """ return self.ld - def get_ld_boundaries(self): - - if self.ld is None: - return None - - return {c: ld.get_masked_boundaries() for c, ld in self.ld.items()} - def harmonize_data(self): """ This method ensures that the data sources (reference genotype, LD matrices, summary statistics, annotations) are all aligned in terms of the set of variants that they operate on as well as the designation of the effect allele for each variant. + + !!! note + This method is called automatically during the initialization of the `GWADataLoader` object. + However, if you read or manipulate the data sources after initialization, + you may need to call this method again to ensure that the data sources remain aligned. + """ data_sources = (self.genotype, self.sumstats_table, self.ld, self.annotation) @@ -656,42 +749,48 @@ def perform_gwas(self, **gwa_kwargs): disable=not self.verbose or len(self.genotype) < 2) } - def score(self, beta=None): + def score(self, beta=None, standardize_genotype=False): """ Perform linear scoring, i.e. multiply the genotype matrix by the vector of effect sizes, `beta`. :param beta: A dictionary where the keys are the chromosome numbers and the values are a vector of effect sizes for each variant on that chromosome. If the betas are not provided, we use the marginal betas by default (if those are available). + :param standardize_genotype: If True, standardize the genotype matrix before scoring. """ if beta is None: try: beta = {c: s.marginal_beta or s.get_snp_pseudo_corr() for c, s in self.sumstats_table.items()} except Exception: - raise Exception("To perform linear scoring, you must a provide effect size estimates (BETA)!") + raise ValueError("To perform linear scoring, you must provide effect size estimates (BETA)!") - common_chroms = sorted(list(set(self.genotype.keys()).intersection(set(beta.keys())))) + # Here, we have a very ugly way of accounting for + # the fact that the chromosomes may be coded differently between the genotype + # and the beta dictionary. Maybe we can find a better solution in the future. + common_chr_g, common_chr_b = match_chromosomes(self.genotype.keys(), beta.keys(), return_both=True) - if self.verbose and len(common_chroms) < 2: + if len(common_chr_g) < 1: + raise ValueError("No common chromosomes found between the genotype and the effect size estimates!") + + if self.verbose and len(common_chr_g) < 2: print("> Generating polygenic scores...") pgs = None - for c in tqdm(common_chroms, - total=len(common_chroms), - desc='Generating polygenic scores', - disable=not self.verbose or len(common_chroms) < 2): + for c_g, c_b in tqdm(zip(common_chr_g, common_chr_b), + total=len(common_chr_g), + desc='Generating polygenic scores', + disable=not self.verbose or len(common_chr_g) < 2): if pgs is None: - pgs = self.genotype[c].score(beta[c]) + pgs = self.genotype[c_g].score(beta[c_b], standardize_genotype=standardize_genotype) else: - pgs += self.genotype[c].score(beta[c]) + pgs += self.genotype[c_g].score(beta[c_b], standardize_genotype=standardize_genotype) # If we only have a single set of betas, flatten the PGS vector: - if len(pgs.shape) > 1: - if pgs.shape[1] == 1: - pgs = pgs.flatten() + if len(pgs.shape) > 1 and pgs.shape[1] == 1: + pgs = pgs.flatten() return pgs @@ -718,7 +817,7 @@ def predict(self, beta=None): def to_individual_table(self): """ - Get a plink-style dataframe of individual IDs, in the form of + :return: A plink-style dataframe of individual IDs, in the form of Family ID (FID) and Individual ID (IID). 
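+
+        A sketch of the expected output, assuming `gdl` is an initialized `GWADataLoader`
+        (the sample IDs shown are hypothetical):
+
+        ```python
+        gdl.to_individual_table()
+        #        FID      IID
+        # 0  HG00096  HG00096
+        # 1  HG00097  HG00097
+        ```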
""" @@ -726,7 +825,7 @@ def to_individual_table(self): def to_phenotype_table(self): """ - Get a plink-style dataframe with each individual's Family ID (FID), + :return: A plink-style dataframe with each individual's Family ID (FID), Individual ID (IID), and phenotype value. """ @@ -734,26 +833,28 @@ def to_phenotype_table(self): def to_snp_table(self, col_subset=None, per_chromosome=False): """ - Return a dataframe of SNP information for all variants + Get a dataframe of SNP data for all variants across different chromosomes. :param col_subset: The subset of columns to obtain. :param per_chromosome: If True, returns a dictionary where the key is the chromosome number and the value is the SNP table per chromosome. + + :return: A dataframe (or dictionary of dataframes) of SNP data. """ snp_tables = {} for c in self.chromosomes: if self.sumstats_table is not None: - snp_tables[c] = self.sumstats_table[c].get_table(col_subset=col_subset) + snp_tables[c] = self.sumstats_table[c].to_table(col_subset=col_subset) elif self.genotype is not None: snp_tables[c] = self.genotype[c].get_snp_table(col_subset=col_subset) elif self.ld is not None: snp_tables[c] = self.ld[c].to_snp_table(col_subset=col_subset) else: - raise Exception("GWADataLoader is not properly initialized!") + raise ValueError("GWADataLoader instance is not properly initialized!") if per_chromosome: return snp_tables @@ -762,13 +863,15 @@ def to_snp_table(self, col_subset=None, per_chromosome=False): def to_summary_statistics_table(self, col_subset=None, per_chromosome=False): """ - Return a dataframe of the GWAS summary statistics for all variants + Get a dataframe of the GWAS summary statistics for all variants across different chromosomes. :param col_subset: The subset of columns (or summary statistics) to obtain. :param per_chromosome: If True, returns a dictionary where the key is the chromosome number and the value is the summary statistics table per chromosome. + + :return: A dataframe (or dictionary of dataframes) of summary statistics. """ assert self.sumstats_table is not None @@ -776,7 +879,7 @@ def to_summary_statistics_table(self, col_subset=None, per_chromosome=False): snp_tables = {} for c in self.chromosomes: - snp_tables[c] = self.sumstats_table[c].get_table(col_subset=col_subset) + snp_tables[c] = self.sumstats_table[c].to_table(col_subset=col_subset) if per_chromosome: return snp_tables @@ -827,8 +930,8 @@ def split_by_samples(self, proportions=None, groups=None, keep_original=True): """ Split the `GWADataLoader` object by samples, if genotype or sample data is available. The user must provide a list or proportion of samples in each split, - and the method will return a list of `GWADataLoader` objects with only samples - within each split. This may be a useful utility for training/testing split or some + and the method will return a list of `GWADataLoader` objects with only the samples + designated for each split. This may be a useful utility for training/testing split or some other downstream tasks. :param proportions: A list with the proportion of samples in each split. Must add to 1. 
@@ -838,12 +941,12 @@ def split_by_samples(self, proportions=None, groups=None, keep_original=True):
         """
 
         if self.sample_table is None:
-            raise Exception("The sample table is not set!")
+            raise ValueError("The sample table is not set!")
 
         if groups is None:
             if proportions is None:
-                raise Exception("To split a `GWADataloader` object by samples, the user must provide either the list "
-                                "or proportion of individuals in each split.")
+                raise ValueError("To split a `GWADataLoader` object by samples, the user must provide either the list "
+                                 "or proportion of individuals in each split.")
             else:
                 # Assign each sample to a different split randomly by drawing from a multinomial:
@@ -855,7 +958,7 @@ def split_by_samples(self, proportions=None, groups=None, keep_original=True):
 
         for i, g in enumerate(groups):
             if len(g) < 1:
-                raise Exception(f"Group {i} is empty! Please ensure that all splits have at least one sample.")
+                raise ValueError(f"Group {i} is empty! Please ensure that all splits have at least one sample.")
 
             if (i + 1) == len(groups) and not keep_original:
                 new_gdl = self
@@ -874,13 +977,15 @@ def align_with(self, other_gdls, axis='SNP', how='inner'):
         set of SNPs or samples. This utility method is meant to enable the user to align
         multiple data sources for downstream analyses.
 
-        NOTE: Experimental for now, would like to add more features here in the near future.
-
         :param other_gdls: A `GWADataLoader` or list of `GWADataLoader` objects.
         :param axis: The axis on which to perform the alignment (can be `sample` for aligning individuals
         or `SNP` for aligning variants across the datasets).
         :param how: The type of join to perform across the datasets. For now, we support an inner join
         sort of operation.
+
+        !!! warning
+            Experimental for now, would like to add more features here in the near future.
+
         """
 
         if isinstance(other_gdls, GWADataLoader):
diff --git a/magenpy/GenotypeMatrix.py b/magenpy/GenotypeMatrix.py
index 8e432c1..7eb07cc 100644
--- a/magenpy/GenotypeMatrix.py
+++ b/magenpy/GenotypeMatrix.py
@@ -1,4 +1,3 @@
-
 from typing import Union
 import tempfile
 import pandas as pd
@@ -8,12 +7,53 @@
 
 class GenotypeMatrix(object):
+    """
+    A class to represent a genotype matrix. The genotype matrix is a matrix
+    where the rows represent samples and the columns represent genetic variants.
+    In general, genotype matrices are assumed to reside on disk and this class
+    provides a convenient interface to interact with and perform computations
+    on the genotype matrix.
+
+    Currently, we assume that the genotype matrix is stored using plink's BED
+    file format, with associated tables for the samples (i.e. FAM file) and genetic
+    variants (i.e. BIM file). Classes that inherit from this generic class support
+    various backends for accessing and performing computations on this genotype data.
+
+    !!! seealso "See Also"
+        * [xarrayGenotypeMatrix][magenpy.GenotypeMatrix.xarrayGenotypeMatrix]
+        * [plinkBEDGenotypeMatrix][magenpy.GenotypeMatrix.plinkBEDGenotypeMatrix]
+
+    :ivar sample_table: A table containing information about the samples in the genotype matrix
+        (initially read from the FAM file).
+    :ivar snp_table: A table containing information about the genetic variants in the genotype matrix
+        (initially read from the BIM file).
+    :ivar bed_file: The path to the plink BED file containing the genotype matrix.
+    :ivar _genome_build: The genome build or assembly under which the SNP coordinates are defined.
+    :ivar temp_dir: The directory where temporary files will be stored (if needed).
+ :ivar cleanup_dir_list: A list of directories to clean up after execution. + :ivar threads: The number of threads to use for parallel computations. + + """ def __init__(self, sample_table: Union[pd.DataFrame, SampleTable, None] = None, snp_table: Union[pd.DataFrame, None] = None, temp_dir: str = 'temp', + bed_file: str = None, + genome_build=None, + threads=1, **kwargs): + """ + Initialize a GenotypeMatrix object. + + :param sample_table: A table containing information about the samples in the genotype matrix. + :param snp_table: A table containing information about the genetic variants in the genotype matrix. + :param temp_dir: The directory where temporary files will be stored (if needed). + :param bed_file: The path to the plink BED file containing the genotype matrix. + :param genome_build: The genome build or assembly under which the SNP coordinates are defined. + :param threads: The number of threads to use for parallel computations. + :param kwargs: Additional keyword arguments. + """ self.sample_table: Union[pd.DataFrame, SampleTable, None] = None self.snp_table: Union[pd.DataFrame, None] = snp_table @@ -21,22 +61,36 @@ def __init__(self, if sample_table is not None: self.set_sample_table(sample_table) + if snp_table is not None: + self.snp_table['original_index'] = np.arange(len(self.snp_table)) + from .utils.system_utils import makedir + makedir(temp_dir) + + self.bed_file = bed_file + self._genome_build = genome_build self.temp_dir = temp_dir self.cleanup_dir_list = [] # Directories to clean up after execution. + self.threads = threads + @classmethod - def from_file(cls, file_path, temp_dir='temp'): + def from_file(cls, file_path, temp_dir='temp', **kwargs): """ - Read and parse the genotype matrix information from file. + Initialize a genotype matrix object by passing a file path + other keyword arguments. + :param file_path: The path to the plink BED file. + :type file_path: str + :param temp_dir: The directory where temporary files will be stored. + :type temp_dir: str + :param kwargs: Additional keyword arguments. """ raise NotImplementedError @property def shape(self): """ - The shape of the genotype matrix. Rows correspond to the + :return: The shape of the genotype matrix. Rows correspond to the number of samples and columns to the number of SNPs. """ return self.n, self.m @@ -44,50 +98,77 @@ def shape(self): @property def n(self): """ - The sample size, see also `.sample_size()` + !!! seealso "See Also" + * [sample_size][magenpy.GenotypeMatrix.GenotypeMatrix.sample_size] + + :return: The sample size or number of individuals in the genotype matrix. """ return self.sample_table.n @property def sample_size(self): """ - The sample size of the genotype matrix. See also `.n()`. + !!! seealso "See Also" + * [n][magenpy.GenotypeMatrix.GenotypeMatrix.n] + + :return: The sample size or number of individuals in the genotype matrix. """ return self.n @property def samples(self): """ - Obtain a vector of sample IDs. + :return: An array of sample IDs in the genotype matrix. """ return self.sample_table.iid @property def m(self): """ - The number of SNPs, see also `n_snps` + + !!! seealso "See Also" + * [n_snps][magenpy.GenotypeMatrix.GenotypeMatrix.n_snps] + + :return: The number of variants in the genotype matrix. """ if self.snp_table is not None: return len(self.snp_table) @property def n_snps(self): + """ + !!! seealso "See Also" + * [m][magenpy.GenotypeMatrix.GenotypeMatrix.m] + + :return: The number of variants in the genotype matrix. 
+        """
         return self.m
 
+    @property
+    def genome_build(self):
+        """
+        :return: The genome build or assembly under which the SNP coordinates are defined.
+        """
+        return self._genome_build
+
     @property
     def chromosome(self):
         """
-        If the genotype matrix is comprised of a single chromosome, return the chromosome number.
+        :return: The chromosome associated with the variants in the genotype matrix.
+
+        !!! note
+            This is a convenience method that assumes that the genotype matrix contains variants
+            from a single chromosome. If there are multiple chromosomes, the method will return `None`.
+
         """
         chrom = self.chromosomes
-        if chrom is not None:
-            if len(chrom) == 1:
-                return chrom[0]
+        if chrom is not None and len(chrom) == 1:
+            return chrom[0]
 
     @property
     def chromosomes(self):
         """
-        Return the unique set of chromosomes comprising the genotype matrix.
+        :return: The unique set of chromosomes comprising the genotype matrix.
         """
         chrom = self.get_snp_attribute('CHR')
         if chrom is not None:
@@ -96,21 +177,22 @@ def chromosomes(self):
     @property
     def snps(self):
         """
-        Return the SNP IDs.
+        :return: The SNP rsIDs for variants in the genotype matrix.
         """
         return self.get_snp_attribute('SNP')
 
     @property
     def bp_pos(self):
         """
-        The position for the genetic variants in base pairs.
+        :return: The basepair position for the genetic variants in the genotype matrix.
         """
         return self.get_snp_attribute('POS')
 
     @property
     def cm_pos(self):
         """
-        The position for the genetic variants in Centi Morgan.
+        :return: The position of genetic variants in the genotype matrix in units of centi Morgan.
+        :raises KeyError: If the genetic distance is not set in the genotype file.
         """
         cm = self.get_snp_attribute('cM')
         if len(set(cm)) == 1:
@@ -121,42 +203,67 @@ def cm_pos(self):
     @property
     def a1(self):
         """
-        Return the effect allele `A1`. See also `.alt_allele()`, `.effect_allele()`.
+        !!! seealso "See Also"
+            * [alt_allele][magenpy.GenotypeMatrix.GenotypeMatrix.alt_allele]
+            * [effect_allele][magenpy.GenotypeMatrix.GenotypeMatrix.effect_allele]
+
+        :return: The effect allele `A1` for each genetic variant.
+
         """
         return self.get_snp_attribute('A1')
 
     @property
     def a2(self):
         """
-        Return the reference allele `A2`. See also `.ref_allele()`.
+
+        !!! seealso "See Also"
+            * [ref_allele][magenpy.GenotypeMatrix.GenotypeMatrix.ref_allele]
+
+        :return: The reference allele `A2` for each genetic variant.
+
         """
         return self.get_snp_attribute('A2')
 
     @property
     def ref_allele(self):
         """
-        Return the reference allele `A2`. See also `.a2()`.
+
+        !!! seealso "See Also"
+            * [a2][magenpy.GenotypeMatrix.GenotypeMatrix.a2]
+
+        :return: The reference allele `A2` for each genetic variant.
         """
         return self.a2
 
     @property
     def alt_allele(self):
         """
-        Return the alternative (i.e. effect) allele `A1`. See also `.a1()`, `.effect_allele()`.
+        !!! seealso "See Also"
+            * [effect_allele][magenpy.GenotypeMatrix.GenotypeMatrix.effect_allele]
+            * [a1][magenpy.GenotypeMatrix.GenotypeMatrix.a1]
+
+        :return: The effect allele `A1` for each genetic variant.
+
         """
         return self.a1
 
     @property
     def effect_allele(self):
         """
-        Return the effect allele `A1`. See also `.a1()`, `.alt_allele()`.
+
+        !!! seealso "See Also"
+            * [alt_allele][magenpy.GenotypeMatrix.GenotypeMatrix.alt_allele]
+            * [a1][magenpy.GenotypeMatrix.GenotypeMatrix.a1]
+
+        :return: The effect allele `A1` for each genetic variant.
+
        """
         return self.a1
 
     @property
     def n_per_snp(self):
         """
-        Sample size per genetic variant (this accounts for missing values).
+        :return: Sample size per genetic variant (accounting for potential missing values).
         """
         n = self.get_snp_attribute('N')
         if n is not None:
@@ -168,7 +275,7 @@ def n_per_snp(self):
     @property
     def maf(self):
         """
-        Minor allele frequency
+        :return: The minor allele frequency (MAF) of each variant in the genotype matrix.
         """
         maf = self.get_snp_attribute('MAF')
         if maf is not None:
@@ -180,19 +287,22 @@ def maf(self):
     @property
     def maf_var(self):
         """
-        The variance in minor allele frequency.
+        :return: The variance in minor allele frequency (MAF) of each variant in the genotype matrix.
         """
         return 2. * self.maf * (1. - self.maf)
 
     def estimate_memory_allocation(self, dtype=np.float32):
         """
-        Estimate the size of the genotype matrix in MB
+        :param dtype: The data type for the entries of the genotype matrix.
+        :return: An estimate of the memory allocation for the genotype matrix in megabytes.
         """
         return self.n * self.m * np.dtype(dtype).itemsize / 1024 ** 2
 
     def get_snp_table(self, col_subset=None):
         """
-        Return the SNP table or a subset of its columns.
+        A convenience method to extract SNP-related information from the genotype matrix.
+
+        :param col_subset: A list of columns to extract from the SNP table.
+
+        :return: A `pandas` DataFrame with the requested columns.
         """
 
         if col_subset is None:
@@ -221,21 +331,33 @@ def get_snp_table(self, col_subset=None):
 
     def get_snp_attribute(self, attr):
         """
-        A utility function to extract a given column from the SNP table.
+
+        :param attr: The name of the attribute to extract from the SNP table.
+        :return: The values of a specific attribute for each variant in the genotype matrix.
         """
-        if self.snp_table is not None:
-            if attr in self.snp_table.columns:
-                return self.snp_table[attr].values
+        if self.snp_table is not None and attr in self.snp_table.columns:
+            return self.snp_table[attr].values
 
-    def compute_ld(self, estimator, output_dir, **ld_kwargs):
+    def compute_ld(self,
+                   estimator,
+                   output_dir,
+                   dtype='int16',
+                   compressor_name='lz4',
+                   compression_level=5,
+                   **ld_kwargs):
         """
+
         Compute the Linkage-Disequilibrium (LD) or SNP-by-SNP correlation matrix
-        for the genotype matrix.
+        for the variants defined in the genotype matrix.
 
         :param estimator: The estimator for the LD matrix. We currently support
         4 different estimators: `sample`, `windowed`, `shrinkage`, and `block`.
         :param output_dir: The output directory where the Zarr array containing the
         entries of the LD matrix will be stored.
+        :param dtype: The data type for the entries of the LD matrix (supported data types are float32, float64
+        and integer quantized data types int8 and int16).
+        :param compressor_name: The name of the compressor to use for the Zarr array.
+        :param compression_level: The compression level for the Zarr array (1-9).
         :param ld_kwargs: keyword arguments for the various LD estimators. Consult
         the implementations of `WindowedLD`, `ShrinkageLD`, and `BlockLD` for details.
         """
@@ -257,13 +379,21 @@ def compute_ld(self, estimator, output_dir, **ld_kwargs):
         tmp_ld_dir = tempfile.TemporaryDirectory(dir=self.temp_dir, prefix='ld_')
         self.cleanup_dir_list.append(tmp_ld_dir)
 
-        return ld_est.compute(output_dir, temp_dir=tmp_ld_dir.name)
+        return ld_est.compute(output_dir,
+                              temp_dir=tmp_ld_dir.name,
+                              dtype=dtype,
+                              compressor_name=compressor_name,
+                              compression_level=compression_level)
 
     def set_sample_table(self, sample_table):
         """
-        A convenience method set the sample table for genotype matrix.
+        A convenience method to set the sample table for the genotype matrix.
This may be useful for syncing sample tables across different Genotype matrices corresponding to different chromosomes or genomic regions. + + :param sample_table: An instance of SampleTable or a pandas dataframe containing + information about the samples in the genotype matrix. + """ if isinstance(sample_table, SampleTable): @@ -271,12 +401,14 @@ def set_sample_table(self, sample_table): elif isinstance(sample_table, pd.DataFrame): self.sample_table = SampleTable(sample_table) else: - raise Exception("The sample table is invalid!") + raise ValueError("The sample table is invalid! " + "Has to be either an instance of " + "SampleTable or pandas DataFrame.") def filter_snps(self, extract_snps=None, extract_file=None): """ Filter variants from the genotype matrix. User must specify - either a list of variants to extract or the path to a file + either a list of variants to extract or the path to a plink-style file with the list of variants to extract. :param extract_snps: A list (or array) of SNP IDs to keep in the genotype matrix. @@ -293,35 +425,37 @@ def filter_snps(self, extract_snps=None, extract_file=None): def filter_by_allele_frequency(self, min_maf=None, min_mac=1): """ - Filter variants by minimum minor allele frequency or allele count + Filter variants by minimum minor allele frequency or allele count cutoffs. + :param min_maf: Minimum minor allele frequency :param min_mac: Minimum minor allele count (1 by default) """ if min_mac or min_maf: + maf = self.maf n = self.n_per_snp - keep_flag = None + keep_flag = None - if min_mac: - mac = (2*maf*n).astype(np.int64) - keep_flag = (mac >= min_mac) & ((2*n - mac) >= min_mac) + if min_mac: + mac = (2*maf*n).astype(np.int64) + keep_flag = (mac >= min_mac) & ((2*n - mac) >= min_mac) - if min_maf: + if min_maf: - maf_cond = (maf >= min_maf) & (1. - maf >= min_maf) - if keep_flag is not None: - keep_flag = keep_flag & maf_cond - else: - keep_flag = maf_cond + maf_cond = (maf >= min_maf) & (1. - maf >= min_maf) + if keep_flag is not None: + keep_flag = keep_flag & maf_cond + else: + keep_flag = maf_cond - if keep_flag is not None: - self.filter_snps(extract_snps=self.snps[keep_flag]) + if keep_flag is not None: + self.filter_snps(extract_snps=self.snps[keep_flag]) def drop_duplicated_snps(self): """ - Drop variants with duplicated SNP IDs. + A convenience method to drop variants with duplicated SNP rsIDs. """ u_snps, counts = np.unique(self.snps, return_counts=True) @@ -332,7 +466,7 @@ def drop_duplicated_snps(self): def filter_samples(self, keep_samples=None, keep_file=None): """ Filter samples from the genotype matrix. User must specify - either a list of samples to keep or the path to a file + either a list of samples to keep or the path to a plink-style file with the list of samples to keep. :param keep_samples: A list (or array) of sample IDs to keep in the genotype matrix. @@ -354,14 +488,18 @@ def perform_gwas(self, **gwa_kwargs): """ Perform genome-wide association testing of all variants against the phenotype. - :param gwa_kwargs: Keyword arguments to pass to the GWA functions. Consult stats.gwa.utils + :param gwa_kwargs: Keyword arguments to pass to the GWA functions. Consult `stats.gwa.utils` for relevant keyword arguments for each backend. + + :raises NotImplementedError: If the method is not implemented in the subclass. """ raise NotImplementedError def compute_allele_frequency(self): """ Compute the allele frequency of each variant or SNP in the genotype matrix. 
+ + :raises NotImplementedError: If the method is not implemented in the subclass. """ raise NotImplementedError @@ -369,6 +507,8 @@ def compute_sample_size_per_snp(self): """ Compute the sample size for each variant in the genotype matrix, accounting for potential missing values. + + :raises NotImplementedError: If the method is not implemented in the subclass. """ raise NotImplementedError @@ -378,6 +518,8 @@ def split_by_chromosome(self): have a separate `GenotypeMatrix` objects for each chromosome. This method returns a dictionary where the key is the chromosome number and the value is an object of `GenotypeMatrix` for that chromosome. + + :return: A dictionary of `GenotypeMatrix` objects, one for each chromosome. """ chromosome = self.chromosome @@ -398,23 +540,69 @@ def cleanup(self): Clean up all temporary files and directories """ - for tmpdir in self.cleanup_dir_list: + for tmp in self.cleanup_dir_list: try: - tmpdir.cleanup() + tmp.cleanup() except FileNotFoundError: continue class xarrayGenotypeMatrix(GenotypeMatrix): + """ + A class that defines methods and interfaces for interacting with genotype matrices + using the `xarray` library. In particular, the class leverages functionality provided by + the `pandas-plink` package to represent on-disk genotype matrices as chunked multidimensional + arrays that can be queried and manipulated efficiently and in parallel. - def __init__(self, sample_table=None, snp_table=None, temp_dir='temp', xr_mat=None): - super().__init__(sample_table=sample_table, snp_table=snp_table, temp_dir=temp_dir) + This class inherits all the attributes of the `GenotypeMatrix` class. + + :ivar xr_mat: The `xarray` object representing the genotype matrix. + + """ + + def __init__(self, + sample_table=None, + snp_table=None, + bed_file=None, + temp_dir='temp', + xr_mat=None, + genome_build=None, + threads=1): + """ + Initialize an xarrayGenotypeMatrix object. + + :param sample_table: A table containing information about the samples in the genotype matrix. + :param snp_table: A table containing information about the genetic variants in the genotype matrix. + :param bed_file: The path to the plink BED file containing the genotype matrix. + :param temp_dir: The directory where temporary files will be stored (if needed). + :param xr_mat: The xarray object representing the genotype matrix. + :param genome_build: The genome build or assembly under which the SNP coordinates are defined. + :param threads: The number of threads to use for parallel computations. + """ + + super().__init__(sample_table=sample_table, + snp_table=snp_table, + temp_dir=temp_dir, + bed_file=bed_file, + genome_build=genome_build, + threads=threads) # xarray matrix object, as defined by pandas-plink: self.xr_mat = xr_mat @classmethod - def from_file(cls, file_path, temp_dir='temp'): + def from_file(cls, file_path, temp_dir='temp', **kwargs): + """ + Create a GenotypeMatrix object using a PLINK BED file with the help + of the data structures defined in `pandas_plink`. The genotype matrix + will be represented implicitly in an `xarray` object, and we will use it + to perform various computations. This method is a utility function to + construct the genotype matrix object from a plink BED file. + + :param file_path: Path to the plink BED file. + :param temp_dir: The directory where the temporary files will be stored. + :param kwargs: Additional keyword arguments. 
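+
+        A minimal usage sketch (the BED file prefix below is hypothetical):
+
+        ```python
+        from magenpy.GenotypeMatrix import xarrayGenotypeMatrix
+
+        # Read a plink BED file into an xarray-backed genotype matrix:
+        g = xarrayGenotypeMatrix.from_file("data/1000G_eur_chr22", temp_dir="temp")
+        print(g.shape)  # (number of samples, number of variants)
+        ```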
+        """
 
         from pandas_plink import read_plink1_bin
 
@@ -422,8 +610,6 @@ def from_file(cls, file_path, temp_dir='temp'):
             xr_gt = read_plink1_bin(file_path + ".bed", ref="a0", verbose=False)
         except ValueError:
             xr_gt = read_plink1_bin(file_path, ref="a0", verbose=False)
-        except Exception as e:
-            raise e
 
         # Set the sample table:
         sample_table = xr_gt.sample.coords.to_dataset().to_dataframe()
@@ -448,24 +634,30 @@ def from_file(cls, file_path, temp_dir='temp'):
             'CHR': int,
             'SNP': str,
             'cM': float,
-            'POS': np.int,
+            'POS': int,
             'A1': str,
             'A2': str
         })
 
-        # Set the index to be the SNP ID:
-        xr_gt = xr_gt.set_index(variant='snp')
-
         g_mat = cls(sample_table=SampleTable(sample_table),
                     snp_table=snp_table,
                     temp_dir=temp_dir,
+                    bed_file=file_path,
+                    xr_mat=xr_gt,
+                    **kwargs)
 
         return g_mat
 
     def set_sample_table(self, sample_table):
+        """
+        A convenience method to set the sample table for the genotype matrix.
+        This is useful for cases when we need to sync the sample table across chromosomes.
+
+        :param sample_table: An instance of SampleTable or a pandas dataframe containing
+            information about the samples in the genotype matrix.
+        """
 
-        super(xarrayGenotypeMatrix, self).set_sample_table(sample_table)
+        super().set_sample_table(sample_table)
 
         try:
             if self.n != self.xr_mat.shape[0]:
@@ -474,21 +666,62 @@ def set_sample_table(self, sample_table):
             pass
 
     def filter_snps(self, extract_snps=None, extract_file=None):
+        """
+        Filter variants from the genotype matrix. User must specify either a list of variants to
+        extract or the path to a file with the list of variants to extract.
+
+        :param extract_snps: A list or array of SNP rsIDs to keep in the genotype matrix.
+        :param extract_file: The path to a file with the list of variants to extract.
+        """
 
-        super(xarrayGenotypeMatrix, self).filter_snps(extract_snps=extract_snps, extract_file=extract_file)
-        self.xr_mat = self.xr_mat.sel(variant=self.snps)
+        super().filter_snps(extract_snps=extract_snps, extract_file=extract_file)
+        self.xr_mat = self.xr_mat.sel(variant=np.isin(self.xr_mat.variant.coords['snp'], self.snps))
 
     def filter_samples(self, keep_samples=None, keep_file=None):
+        """
+        Filter samples from the genotype matrix.
+        User must specify either a list of samples to keep or the path to a file with the list of samples to keep.
+
+        :param keep_samples: A list (or array) of sample IDs to keep in the genotype matrix.
+        :param keep_file: The path to a file with the list of samples to keep.
+        """
 
-        super(xarrayGenotypeMatrix, self).filter_samples(keep_samples=keep_samples, keep_file=keep_file)
+        super().filter_samples(keep_samples=keep_samples, keep_file=keep_file)
         self.xr_mat = self.xr_mat.sel(sample=self.samples)
 
+    def to_numpy(self, dtype=np.int8):
+        """
+        Convert the genotype matrix to a numpy array.
+
+        :param dtype: The data type of the numpy array. Default: int8.
+
+        :return: A numpy array representation of the genotype matrix.
+        """
+
+        return self.xr_mat.data.astype(dtype).compute()
+
+    def to_csr(self, dtype=np.int8):
+        """
+        Convert the genotype matrix to a scipy sparse CSR matrix.
+
+        :param dtype: The data type of the scipy array. Default: int8.
+
+        :return: A `scipy` sparse CSR matrix representation of the genotype matrix.
+        """
+
+        mat = self.to_numpy(dtype=dtype)
+
+        from scipy.sparse import csr_matrix
+
+        return csr_matrix(mat)
+
     def score(self, beta, standardize_genotype=False, skip_na=True):
         """
         Perform linear scoring on the genotype matrix.
 
         :param beta: A vector or matrix of effect sizes for each variant in the genotype matrix.
         :param standardize_genotype: If True, standardize the genotype when computing the polygenic score.
         :param skip_na: If True, skip missing values when computing the polygenic score.
+
+        :return: The polygenic score (PGS) for each sample in the genotype matrix.
+
         """
 
         import dask.array as da
@@ -507,34 +740,80 @@ def score(self, beta, standardize_genotype=False, skip_na=True):
         return pgs
 
     def perform_gwas(self, **gwa_kwargs):
+        """
+        A convenience method that calls specialized utility functions that perform
+        genome-wide association testing of all variants against the phenotype.
 
-        from magenpy.stats.gwa.utils import perform_gwa_xarray
+        :return: A summary statistics table containing the results of the association testing.
+        """
+
+        from .stats.gwa.utils import perform_gwa_xarray
 
         return perform_gwa_xarray(self, **gwa_kwargs)
 
     def compute_allele_frequency(self):
+        """
+        A convenience method that computes the allele frequency of each variant
+        or SNP in the genotype matrix.
+        """
         self.snp_table['MAF'] = (self.xr_mat.sum(axis=0) / (2. * self.n_per_snp)).compute().values
 
     def compute_sample_size_per_snp(self):
+        """
+        A convenience method that computes the sample size for each variant in the
+        genotype matrix, accounting for potential missing values.
+        """
         self.snp_table['N'] = self.xr_mat.shape[0] - self.xr_mat.isnull().sum(axis=0).compute().values
 
     def split_by_chromosome(self):
-        split = super(xarrayGenotypeMatrix, self).split_by_chromosome()
+        """
+        Split the genotype matrix by chromosome.
+
+        :return: A dictionary of `xarrayGenotypeMatrix` objects, one for each chromosome.
+        """
+        split = super().split_by_chromosome()
 
         for c, gt in split.items():
+            gt.xr_mat = self.xr_mat
             if len(split) > 1:
-                gt.xr_mat = self.xr_mat.sel(variant=gt.snps)
-            else:
-                gt.xr_mat = self.xr_mat
+                gt.filter_snps(extract_snps=gt.snps)
 
         return split
 
 
 class plinkBEDGenotypeMatrix(GenotypeMatrix):
+    """
+    A class that defines methods and interfaces for interacting with genotype matrices
+    using the `plink2` software. This class provides a convenient interface to perform various
+    computations on genotype matrices stored in the plink BED format.
+
+    This class inherits all the attributes of the `GenotypeMatrix` class.
+    """
 
-    def __init__(self, sample_table=None, snp_table=None, temp_dir='temp', bed_file=None):
-        super().__init__(sample_table=sample_table, snp_table=snp_table, temp_dir=temp_dir)
+    def __init__(self,
+                 sample_table=None,
+                 snp_table=None,
+                 temp_dir='temp',
+                 bed_file=None,
+                 genome_build=None,
+                 threads=1):
+        """
+        Initialize a `plinkBEDGenotypeMatrix` object.
+
+        :param sample_table: A table containing information about the samples in the genotype matrix.
+        :param snp_table: A table containing information about the genetic variants in the genotype matrix.
+        :param temp_dir: The directory where temporary files will be stored (if needed).
+        :param bed_file: The path to the plink BED file containing the genotype matrix.
+        :param genome_build: The genome build or assembly under which the SNP coordinates are defined.
+        :param threads: The number of threads to use for parallel computations.
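+
+        A minimal construction sketch (the BED file prefix below is hypothetical). When
+        `bed_file` is provided, the associated FAM and BIM files are parsed automatically:
+
+        ```python
+        from magenpy.GenotypeMatrix import plinkBEDGenotypeMatrix
+
+        g = plinkBEDGenotypeMatrix(bed_file="data/1000G_eur_chr22", threads=4)
+        ```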
+        """
+
+        super().__init__(sample_table=sample_table,
+                         snp_table=snp_table,
+                         temp_dir=temp_dir,
+                         bed_file=bed_file,
+                         genome_build=genome_build,
+                         threads=threads)
 
-        self.bed_file = bed_file
         if self.bed_file is not None:
             self.bed_file = self.bed_file.replace('.bed', '')
 
@@ -545,13 +824,33 @@ def __init__(self, sample_table=None, snp_table=None, temp_dir='temp', bed_file=None):
             self.snp_table = parse_bim_file(self.bed_file)
 
     @classmethod
-    def from_file(cls, file_path, temp_dir='temp'):
+    def from_file(cls, file_path, temp_dir='temp', **kwargs):
+        """
+        A convenience method to create a `plinkBEDGenotypeMatrix` object by
+        providing a path to a PLINK BED file.
+
+        :param file_path: The path to the plink BED file.
+        :param temp_dir: The directory where temporary files will be stored.
+        :param kwargs: Additional keyword arguments.
+        """
 
-        p_gt = cls(bed_file=file_path, temp_dir=temp_dir)
+        p_gt = cls(bed_file=file_path, temp_dir=temp_dir, **kwargs)
 
         return p_gt
 
     def score(self, beta, standardize_genotype=False):
+        """
+        Perform linear scoring on the genotype matrix. This function takes a vector (or matrix) of
+        effect sizes and returns the matrix-vector or matrix-matrix product of the genotype matrix
+        multiplied by the effect sizes.
+
+        This can be used for polygenic score calculation or projecting the genotype matrix.
+
+        :param beta: A vector or matrix of effect sizes for each variant in the genotype matrix.
+        :param standardize_genotype: If True, standardize the genotype when computing the polygenic score.
+
+        :return: The polygenic score (PGS) for each sample in the genotype matrix.
+        """
 
         from .stats.score.utils import score_plink2
 
@@ -562,8 +861,15 @@ def score(self, beta, standardize_genotype=False):
         return score_plink2(self, beta, standardize_genotype=standardize_genotype, temp_dir=tmp_score_dir.name)
 
     def perform_gwas(self, **gwa_kwargs):
+        """
+        Perform genome-wide association testing of all variants against the phenotype.
+        This method calls specialized functions that, in turn, call `plink2` to perform
+        the association testing.
+
+        :return: A summary statistics table containing the results of the association testing.
+        """
 
-        from magenpy.stats.gwa.utils import perform_gwa_plink2
+        from .stats.gwa.utils import perform_gwa_plink2
 
         # Create a temporary directory where we store intermediate results:
         tmp_gwas_dir = tempfile.TemporaryDirectory(dir=self.temp_dir, prefix='gwas_')
@@ -572,8 +878,13 @@ def perform_gwas(self, **gwa_kwargs):
         return perform_gwa_plink2(self, temp_dir=tmp_gwas_dir.name, **gwa_kwargs)
 
     def compute_allele_frequency(self):
+        """
+        Compute the allele frequency of each variant or SNP in the genotype matrix.
+        This method calls specialized functions that, in turn, call `plink2` to compute
+        allele frequency.
+        """
 
-        from magenpy.stats.variant.utils import compute_allele_frequency_plink2
+        from .stats.variant.utils import compute_allele_frequency_plink2
 
         # Create a temporary directory where we store intermediate results:
         tmp_freq_dir = tempfile.TemporaryDirectory(dir=self.temp_dir, prefix='freq_')
@@ -582,7 +893,15 @@ def compute_allele_frequency(self):
         self.snp_table['MAF'] = compute_allele_frequency_plink2(self, temp_dir=tmp_freq_dir.name)
 
     def compute_sample_size_per_snp(self):
-        from magenpy.stats.variant.utils import compute_sample_size_per_snp_plink2
+        """
+        Compute the sample size for each variant in the genotype matrix, accounting for
+        potential missing values.
+
+        This method calls specialized functions that, in turn, call `plink2` to compute sample
+        size per variant.
+ """ + + from .stats.variant.utils import compute_sample_size_per_snp_plink2 # Create a temporary directory where we store intermediate results: tmp_miss_dir = tempfile.TemporaryDirectory(dir=self.temp_dir, prefix='miss_') @@ -591,8 +910,12 @@ def compute_sample_size_per_snp(self): self.snp_table['N'] = compute_sample_size_per_snp_plink2(self, temp_dir=tmp_miss_dir.name) def split_by_chromosome(self): + """ + Split the genotype matrix by chromosome. + :return: A dictionary of `plinkBEDGenotypeMatrix` objects, one for each chromosome. + """ - split = super(plinkBEDGenotypeMatrix, self).split_by_chromosome() + split = super().split_by_chromosome() for c, gt in split.items(): gt.bed_file = self.bed_file diff --git a/magenpy/LDMatrix.py b/magenpy/LDMatrix.py new file mode 100644 index 0000000..a001146 --- /dev/null +++ b/magenpy/LDMatrix.py @@ -0,0 +1,1454 @@ +import zarr +import os.path as osp +import numpy as np +import pandas as pd +from scipy.sparse import csr_matrix, identity, triu, diags +from .utils.model_utils import quantize, dequantize + + +class LDMatrix(object): + """ + A class that represents Linkage-Disequilibrium (LD) matrices, which record + the SNP-by-SNP pairwise correlations in a sample of genetic data. The class + provides various functionalities for initializing, storing, loading, and + performing computations with LD matrices. The LD matrices are stored in a + hierarchical format using the `Zarr` library, which allows for efficient + storage and retrieval of the data. + + The class provides the following functionalities: + + * Initialize an `LDMatrix` object from plink's LD table files. + * Initialize an `LDMatrix` object from a sparse CSR matrix. + * Initialize an `LDMatrix` object from a Zarr array store. + * Compute LD scores for each SNP in the LD matrix. + * Filter the LD matrix based on SNP indices or ranges. + + The Zarr hierarchy is structured as follows: + + * `chr_22.zarr`: The Zarr group. + * `matrix`: The subgroup containing the data of the LD matrix in Scipy Sparse CSR matrix format. + * `data`: The array containing the non-zero entries of the LD matrix. + * `indptr`: The array containing the index pointers for the CSR matrix. + * `metadata`: The subgroup containing the metadata for variants included in the LD matrix. + * `snps`: The array containing the SNP rsIDs. + * `a1`: The array containing the alternative alleles. + * `a2`: The array containing the reference alleles. + * `maf`: The array containing the minor allele frequencies. + * `bp`: The array containing the base pair positions. + * `cm`: The array containing the centi Morgan positions. + * `ldscore`: The array containing the LD scores. + * `attrs`: A JSON-style metadata object containing general information about how the LD matrix + was calculated, including the chromosome number, sample size, genome build, LD estimator, + and estimator properties. + + :ivar _zg: The Zarr group object that stores the LD matrix and its metadata. + :ivar _mat: The in-memory CSR matrix object. + :ivar in_memory: A boolean flag indicating whether the LD matrix is in memory. + :ivar is_symmetric: A boolean flag indicating whether the LD matrix is symmetric. + :ivar index: An integer index for the current SNP in the LD matrix (useful for iterators). + :ivar _mask: A boolean mask for filtering the LD matrix. + + """ + + def __init__(self, zarr_group, symmetric=False): + """ + Initialize an `LDMatrix` object from a Zarr group store. + + :param zarr_group: The Zarr group object that stores the LD matrix. 
+ :param symmetric: A boolean flag indicating whether to represent the LD matrix as symmetric. + """ + + # Checking the input for correct formatting: + # First, it has to be a Zarr group: + assert isinstance(zarr_group, zarr.hierarchy.Group) + # Second, it has to have a group called `matrix`: + assert 'matrix' in list(zarr_group.group_keys()) + + # Third, all the sparse array keys must be present: + arr_keys = list(zarr_group['matrix'].array_keys()) + assert all([arr in arr_keys + for arr in ('data', 'indptr')]) + + self._zg = zarr_group + + self._mat = None + self.in_memory = False + self.is_symmetric = symmetric + self.index = 0 + + self._mask = None + + @classmethod + def from_path(cls, ld_store_path): + """ + Initialize an `LDMatrix` object from a pre-computed Zarr group store. + :param ld_store_path: The path to the Zarr array store on the filesystem. + + !!! seealso "See Also" + * [from_dir][magenpy.LDMatrix.LDMatrix.from_dir] + + """ + + for level in range(2): + try: + ld_group = zarr.open_group(ld_store_path, mode='r') + return cls(ld_group) + except zarr.hierarchy.GroupNotFoundError as e: + if level < 1: + ld_store_path = osp.dirname(ld_store_path) + else: + raise e + + @classmethod + def from_dir(cls, ld_store_path): + """ + Initialize an `LDMatrix` object from a Zarr array store. + :param ld_store_path: The path to the Zarr array store on the filesystem. + + !!! seealso "See Also" + * [from_path][magenpy.LDMatrix.LDMatrix.from_path] + """ + return cls.from_path(ld_store_path) + + @classmethod + def from_csr(cls, + csr_mat, + store_path, + overwrite=False, + dtype='int16', + compressor_name='lz4', + compression_level=5): + """ + Initialize an LDMatrix object from a sparse CSR matrix. + + :param csr_mat: The sparse CSR matrix. + :param store_path: The path to the Zarr LD store where the data will be stored. + :param overwrite: If True, it overwrites the LD store at `store_path`. + :param dtype: The data type for the entries of the LD matrix (supported data types are float32, float64 + and integer quantized data types int8 and int16). + :param compressor_name: The name of the compressor or compression algorithm to use with Zarr. + :param compression_level: The compression level to use with the compressor (1-9). + """ + + dtype = np.dtype(dtype) + + # Get the upper triangular part of the matrix: + triu_mat = triu(csr_mat, k=1, format='csr') + + # Check that the non-zeros are contiguous around the diagonal with no gaps. + # If there are gaps, eliminate them or raise an error. + if np.diff(triu_mat.indices).max() > 1: + # TODO: Figure out a way to fix this automatically for the user? 
+            raise ValueError("The non-zero entries of the LD matrix are not contiguous around the diagonal.")
+
+        # Create hierarchical storage with zarr groups:
+        store = zarr.DirectoryStore(store_path)
+        z = zarr.group(store=store, overwrite=overwrite)
+
+        # Create a compressor object:
+        compressor = zarr.Blosc(cname=compressor_name, clevel=compression_level)
+
+        # First sub-hierarchy stores the information for the sparse LD matrix:
+        mat = z.create_group('matrix')
+        if np.issubdtype(dtype, np.integer):
+            mat.array('data', quantize(triu_mat.data, int_dtype=dtype), dtype=dtype, compressor=compressor)
+        else:
+            # Pass the compressor object (not its name) to Zarr:
+            mat.array('data', triu_mat.data.astype(dtype), dtype=dtype, compressor=compressor)
+
+        # Store the index pointer:
+        mat.array('indptr', triu_mat.indptr,
+                  dtype=np.int32, compressor=compressor)
+
+        return cls(z)
+
+    @classmethod
+    def from_plink_table(cls,
+                         plink_ld_file,
+                         snps,
+                         store_path,
+                         pandas_chunksize=None,
+                         overwrite=False,
+                         dtype='int16',
+                         compressor_name='lz4',
+                         compression_level=5):
+        """
+        Construct a Zarr LD matrix using output tables from plink1.9.
+        This class method takes the following inputs:
+
+        :param plink_ld_file: The path to the plink LD table file.
+        :param snps: An iterable containing the list of SNPs in the LD matrix.
+        :param store_path: The path to the Zarr LD store.
+        :param pandas_chunksize: If the LD table is large, provide a chunk size
+        (i.e. the number of rows to process at each step) to keep the memory footprint manageable.
+        :param overwrite: If True, it overwrites the LD store at `store_path`.
+        :param dtype: The data type for the entries of the LD matrix (supported data types are float32, float64
+        and integer quantized data types int8 and int16).
+        :param compressor_name: The name of the compressor or compression algorithm to use with Zarr.
+        :param compression_level: The compression level to use with the compressor (1-9).
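+
+        !!! example
+            A minimal sketch, assuming `chr_22.ld` is an LD table generated with
+            plink1.9 (e.g. via its `--r` flag) and `snp_list` holds the variant rsIDs
+            in the same order as in the genotype file (both are hypothetical here):
+
+            ```python
+            from magenpy.LDMatrix import LDMatrix
+
+            ld_mat = LDMatrix.from_plink_table('chr_22.ld',
+                                               snps=snp_list,
+                                               store_path='ld/chr_22.zarr')
+            ```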
+ """ + + dtype = np.dtype(dtype) + + # Create hierarchical storage with zarr groups: + store = zarr.DirectoryStore(store_path) + z = zarr.group(store=store, overwrite=overwrite) + + # Create a compressor object: + compressor = zarr.Blosc(cname=compressor_name, clevel=compression_level) + + # First sub-hierarchy stores the information for the sparse LD matrix: + mat = z.create_group('matrix') + mat.empty('data', shape=len(snps)**2, dtype=dtype, compressor=compressor) + + # Create a chunked iterator with pandas: + # Chunk size will correspond to the average chunk size for the Zarr array: + ld_chunks = pd.read_csv(plink_ld_file, + sep=r'\s+', + usecols=['SNP_A', 'R'], + engine='c', + chunksize=pandas_chunksize, + dtype={'SNP_A': str, 'R': np.float32}) + + if pandas_chunksize is None: + ld_chunks = [ld_chunks] + + # Create a dictionary mapping SNPs to their indices: + snp_dict = dict(zip(snps, np.arange(len(snps)))) + + indptr_counts = np.zeros(len(snps), dtype=np.int32) + + total_len = 0 + + # For each chunk in the LD file: + for ld_chunk in ld_chunks: + + # Create an indexed LD chunk: + ld_chunk['row_index'] = ld_chunk['SNP_A'].map(snp_dict) + + # Add LD data to the zarr array: + if np.issubdtype(dtype, np.integer): + mat['data'][total_len:total_len + len(ld_chunk)] = quantize(ld_chunk['R'].values, int_dtype=dtype) + else: + mat['data'][total_len:total_len + len(ld_chunk)] = ld_chunk['R'].values.astype(dtype) + + total_len += len(ld_chunk) + + # Group by the row index: + grouped_ridx = ld_chunk.groupby('row_index').size() + + # Add the number of entries to indptr_counts: + indptr_counts[grouped_ridx.index] += grouped_ridx.values + + # Get the final indptr by computing cumulative sum: + indptr = np.insert(np.cumsum(indptr_counts), 0, 0) + # Store indptr in the zarr group: + mat.array('indptr', indptr, dtype=np.int32, compressor=compressor) + + # Resize the data array: + mat['data'].resize(total_len) + + return cls(z) + + @classmethod + def from_dense_zarr_matrix(cls, + dense_zarr, + ld_boundaries, + store_path, + overwrite=False, + delete_original=False, + dtype='int16', + compressor_name='lz4', + compression_level=5): + """ + Initialize a new LD matrix object using a Zarr array object. This method is + useful for converting a dense LD matrix computed using Dask (or other distributed computing + software) to a sparse or banded one. + + :param dense_zarr: The path to the dense Zarr array object. + :param ld_boundaries: The LD boundaries for each SNP in the LD matrix (delineates the indices of + the leftmost and rightmost neighbors of each SNP). + :param store_path: The path where to store the new LD matrix. + :param overwrite: If True, it overwrites the LD store at `store_path`. + :param delete_original: If True, it deletes the original dense LD matrix. + :param dtype: The data type for the entries of the LD matrix (supported data types are float32, float64 + and integer quantized data types int8 and int16). + :param compressor_name: The name of the compressor or compression algorithm to use with Zarr. + :param compression_level: The compression level to use with the compressor (1-9). 
+ """ + + dtype = np.dtype(dtype) + + # If dense_zarr is a path, rather than a Zarr Array object, then + # open it as a Zarr array object before proceeding: + if isinstance(dense_zarr, str): + if osp.isfile(osp.join(dense_zarr, '.zarray')): + dense_zarr = zarr.open(dense_zarr) + else: + raise FileNotFoundError + + # Create hierarchical storage with zarr groups: + store = zarr.DirectoryStore(store_path) + z = zarr.group(store=store, overwrite=overwrite) + + # Create a compressor object: + compressor = zarr.Blosc(cname=compressor_name, clevel=compression_level) + + # First sub-hierarchy stores the information for the sparse LD matrix: + mat = z.create_group('matrix') + mat.empty('data', shape=dense_zarr.shape[0]**2, dtype=dtype, compressor=compressor) + + num_rows = dense_zarr.shape[0] + chunk_size = dense_zarr.chunks[0] + + indptr_counts = np.zeros(num_rows, dtype=int) + + total_len = 0 + + for chunk_idx in range(int(np.ceil(num_rows / chunk_size))): + + chunk_start = chunk_idx * chunk_size + chunk_end = min((chunk_idx + 1) * chunk_size, num_rows) + + z_chunk = dense_zarr[chunk_start: chunk_end] + + data = [] + + chunk_len = 0 + + for j in range(chunk_start, chunk_end): + + data.append( + z_chunk[j - chunk_start][j + 1:ld_boundaries[1, j]] + ) + indptr_counts[j] = len(data[-1]) + chunk_len += int(ld_boundaries[1, j] - (j+1)) + + # Add data + columns indices to zarr array: + concat_data = np.concatenate(data) + + if np.issubdtype(dtype, np.integer): + mat['data'][total_len:total_len + chunk_len] = quantize(concat_data, int_dtype=dtype) + else: + mat['data'][total_len:total_len + chunk_len] = concat_data.astype(dtype) + + total_len += chunk_len + + # Get the final indptr by computing cumulative sum: + indptr = np.insert(np.cumsum(indptr_counts), 0, 0) + # Store indptr in the zarr array: + mat.array('indptr', indptr, compressor=compressor) + + # Resize the data and indices arrays: + mat['data'].resize(total_len) + + if delete_original: + from .stats.ld.utils import delete_ld_store + delete_ld_store(dense_zarr) + + return cls(z) + + @classmethod + def from_ragged_zarr_matrix(cls, + ragged_zarr, + store_path, + overwrite=False, + delete_original=False, + dtype='int16', + compressor_name='lz4', + compression_level=5): + """ + Initialize a new LD matrix object using a Zarr array object + conforming to the old LD Matrix format from magenpy v<=0.0.12. + + This utility function will also copy some of the stored attributes + associated with the matrix in the old format. + + :param ragged_zarr: The path to the ragged Zarr array object. + :param store_path: The path where to store the new LD matrix. + :param overwrite: If True, it overwrites the LD store at `store_path`. + :param delete_original: If True, it deletes the original ragged LD matrix. + :param dtype: The data type for the entries of the LD matrix (supported data types are float32, float64 + and integer quantized data types int8 and int16). + :param compressor_name: The name of the compressor or compression algorithm to use with Zarr. + :param compression_level: The compression level to use with the compressor (1-9). 
+ """ + + dtype = np.dtype(dtype) + + # If ragged_zarr is a path, rather than a Zarr Array object, then + # open it as a Zarr array object before proceeding: + if isinstance(ragged_zarr, str): + if osp.isfile(osp.join(ragged_zarr, '.zarray')): + ragged_zarr = zarr.open(ragged_zarr) + else: + raise FileNotFoundError + + num_rows = ragged_zarr.shape[0] + chunk_size = ragged_zarr.chunks[0] + + # Create hierarchical storage with zarr groups: + store = zarr.DirectoryStore(store_path) + z = zarr.group(store=store, overwrite=overwrite) + + # Create a compressor object: + compressor = zarr.Blosc(cname=compressor_name, clevel=compression_level) + + # First sub-hierarchy stores the information for the sparse LD matrix: + mat = z.create_group('matrix') + mat.empty('data', shape=num_rows ** 2, dtype=dtype, compressor=compressor) + + indptr_counts = np.zeros(num_rows, dtype=int) + + # Get the LD boundaries from the Zarr array attributes: + ld_boundaries = np.array(ragged_zarr.attrs['LD boundaries']) + + total_len = 0 + + for chunk_idx in range(int(np.ceil(num_rows / chunk_size))): + + chunk_start = chunk_idx * chunk_size + chunk_end = min((chunk_idx + 1) * chunk_size, num_rows) + + z_chunk = ragged_zarr[chunk_start: chunk_end] + + data = [] + chunk_len = 0 + + for j in range(chunk_start, chunk_end): + + start, end = ld_boundaries[:, j] + new_start = (j - start) + 1 + + data.append( + z_chunk[j - chunk_start][new_start:] + ) + indptr_counts[j] = end - (j + 1) + chunk_len += int(end - (j + 1)) + + # Add data + columns indices to zarr array: + concat_data = np.concatenate(data) + + if np.issubdtype(dtype, np.integer): + mat['data'][total_len:total_len + chunk_len] = quantize(concat_data, int_dtype=dtype) + else: + mat['data'][total_len:total_len + chunk_len] = concat_data.astype(dtype) + + total_len += chunk_len + + # Get the final indptr by computing cumulative sum: + indptr = np.insert(np.cumsum(indptr_counts), 0, 0) + # Store indptr in the zarr array: + mat.array('indptr', indptr, compressor=compressor) + + # Resize the data and indices arrays: + mat['data'].resize(total_len) + + # ============================================================ + # Transfer the attributes/metadata from the old matrix format: + + ld_mat = cls(z) + + ld_mat.set_metadata('snps', np.array(ragged_zarr.attrs['SNP'])) + ld_mat.set_metadata('a1', np.array(ragged_zarr.attrs['A1'])) + ld_mat.set_metadata('a2', np.array(ragged_zarr.attrs['A2'])) + ld_mat.set_metadata('maf', np.array(ragged_zarr.attrs['MAF'])) + ld_mat.set_metadata('bp', np.array(ragged_zarr.attrs['BP'])) + ld_mat.set_metadata('cm', np.array(ragged_zarr.attrs['cM'])) + + try: + ld_mat.set_metadata('ldscore', np.array(ragged_zarr.attrs['LDScore'])) + except KeyError: + print("Did not find LD scores in old LD matrix format! Skipping...") + + # Set matrix attributes: + ld_mat.set_store_attr('Chromosome', ragged_zarr.attrs['Chromosome']) + ld_mat.set_store_attr('LD estimator', ragged_zarr.attrs['LD estimator']) + ld_mat.set_store_attr('Estimator properties', ragged_zarr.attrs['Estimator properties']) + ld_mat.set_store_attr('Sample size', ragged_zarr.attrs['Sample size']) + + if delete_original: + from .stats.ld.utils import delete_ld_store + delete_ld_store(ragged_zarr) + + return ld_mat + + @property + def n_snps(self): + """ + :return: The number of variants in the LD matrix. If the matrix is loaded and filtered, + we return the number of variants remaining after applying the filter. 
+ """ + if self._mat is not None: + return self._mat.shape[0] + else: + return self.stored_n_snps + + @property + def shape(self): + """ + + !!! seealso "See Also" + * [n_snps][magenpy.LDMatrix.LDMatrix.n_snps] + + :return: The shape of the square LD matrix. + """ + return self.n_snps, self.n_snps + + @property + def store(self): + """ + :return: The Zarr group store object. + """ + return self._zg.store + + @property + def compressor(self): + """ + :return: The `numcodecs` compressor object for the LD data. + """ + return self._zg['matrix/data'].compressor + + @property + def zarr_group(self): + """ + :return: The Zarr group object that stores the LD matrix and its metadata. + """ + return self._zg + + @property + def chunks(self): + """ + :return: The chunks for the data array of the LD matrix. + """ + return self._zg['matrix/data'].chunks + + @property + def chunk_size(self): + """ + :return: The chunk size for the data array of the LD matrix. + """ + return self.chunks[0] + + @property + def stored_n_snps(self): + """ + :return: The number of variants stored in the LD matrix (irrespective of any masks / filters). + """ + return self._zg['matrix/indptr'].shape[0] - 1 + + @property + def stored_dtype(self): + """ + :return: The data type for the stored entries of `data` array of the LD matrix. + """ + return self._zg['matrix/data'].dtype + + @property + def stored_shape(self): + """ + :return: The shape of the stored LD matrix (irrespective of any masks / filters). + """ + n_snps = self.stored_n_snps + return n_snps, n_snps + + @property + def dtype(self): + """ + :return: The data type for the entries of the `data` array of the LD matrix. If the matrix is + in memory, return the dtype of the CSR matrix. Otherwise, return the + dtype of the entries in the Zarr array. + """ + if self.in_memory: + return self.csr_matrix.dtype + else: + return self.stored_dtype + + @property + def chromosome(self): + """ + :return: The chromosome for which this LD matrix was calculated. + """ + return self.get_store_attr('Chromosome') + + @property + def ld_estimator(self): + """ + :return: The LD estimator used to compute the LD matrix. Examples include: `block`, `windowed`, `shrinkage`. + """ + return self.get_store_attr('LD estimator') + + @property + def estimator_properties(self): + """ + :return: The properties of the LD estimator used to compute the LD matrix. + """ + return self.get_store_attr('Estimator properties') + + @property + def sample_size(self): + """ + :return: The sample size used to compute the LD matrix. + """ + return self.get_store_attr('Sample size') + + @property + def genome_build(self): + """ + :return: The genome build based on which the base pair coordinates are defined. + """ + return self.get_store_attr('Genome build') + + @property + def snps(self): + """ + :return: rsIDs of the variants included in the LD matrix. + """ + return self.get_metadata('snps') + + @property + def a1(self): + """ + :return: The alternative alleles of the variants included in the LD matrix. + """ + return self.get_metadata('a1') + + @property + def a2(self): + """ + :return: The reference alleles of the variants included in the LD matrix. + """ + return self.get_metadata('a2') + + @property + def maf(self): + """ + :return: The minor allele frequency (MAF) of the alternative allele (A1) in the LD matrix. + """ + try: + return self.get_metadata('maf') + except KeyError: + return None + + @property + def bp_position(self): + """ + !!! 
seealso "See Also"
+            * [genome_build][magenpy.LDMatrix.LDMatrix.genome_build]
+
+        :return: The base pair position of each SNP in the LD matrix.
+        """
+        return self.get_metadata('bp')
+
+    @property
+    def cm_position(self):
+        """
+        :return: The centi Morgan (cM) position of each variant in the LD matrix.
+        """
+        try:
+            return self.get_metadata('cm')
+        except KeyError:
+            return None
+
+    @property
+    def ld_score(self):
+        """
+        :return: The LD score of each variant in the LD matrix.
+        """
+        try:
+            return self.get_metadata('ldscore')
+        except KeyError:
+
+            ld_score = self.compute_ld_scores()
+
+            if self._mask is None:
+                self.set_metadata('ldscore', ld_score, overwrite=True)
+
+            return ld_score
+
+    @property
+    def ld_boundaries(self):
+        """
+        The LD boundaries associated with each variant.
+        The LD boundaries are defined as the index of the leftmost neighbor
+        (lower boundary) and the rightmost neighbor (upper boundary) for each variant.
+        If the LD matrix is upper triangular, then the boundaries for variant `i` go from `i + 1` to `i + k_i`,
+        where `k_i` is the number of neighbors that SNP `i` is in LD with.
+
+        :return: A matrix of shape `(2, n_snps)` where the first row contains the lower boundaries and the second row
+        contains the upper boundaries.
+
+        """
+
+        indptr = self.indptr
+
+        if self.in_memory and self.is_symmetric:
+
+            # Check that the matrix has canonical format (indices are sorted / no duplicates):
+            assert self.csr_matrix.has_canonical_format
+
+            return np.vstack([self.indices[indptr[:-1]], self.indices[indptr[1:] - 1] + 1]).astype(np.int32)
+
+        else:
+
+            # If the matrix is not in memory, then the format is upper triangular.
+            # Therefore, it goes from diagonal + 1 to the end of the row.
+            left_bound = np.arange(1, len(indptr) - 1)  # The leftmost neighbor of each SNP (diagonal + 1)
+            return np.vstack([left_bound, left_bound + np.diff(indptr[:-1])]).astype(np.int32)
+
+    @property
+    def window_size(self):
+        """
+        !!! seealso "See Also"
+            * [n_neighbors][magenpy.LDMatrix.LDMatrix.n_neighbors]
+
+        !!! note
+            This includes the variant itself if the matrix is in memory and is symmetric.
+
+        :return: The number of variants in the LD window for each SNP.
+
+        """
+        return np.diff(self.indptr)
+
+    @property
+    def n_neighbors(self):
+        """
+        The number of variants in the LD window for each SNP.
+
+        !!! seealso "See Also"
+            * [window_size][magenpy.LDMatrix.LDMatrix.window_size]
+
+        !!! note
+            This includes the variant itself if the matrix is in memory and is symmetric.
+
+        """
+        # NOTE: `window_size` is a property, so it must not be called:
+        return self.window_size
+
+    @property
+    def csr_matrix(self):
+        """
+        :return: The in-memory CSR matrix object.
+
+        !!! note
+            If the LD matrix is not in memory, then it will be loaded using default settings.
+
+        """
+        if self._mat is None:
+            self.load()
+        return self._mat
+
+    @property
+    def data(self):
+        """
+        :return: The `data` array of the sparse `CSR` matrix, containing the entries of the LD matrix.
+        """
+        if self.in_memory:
+            return self.csr_matrix.data
+        else:
+            return self._zg['matrix/data']
+
+    @property
+    def indices(self):
+        """
+        :return: The column indices of the non-zero elements of the sparse, CSR representation of the LD matrix.
+ """ + if self.in_memory: + return self.csr_matrix.indices + else: + ld_bounds = self.ld_boundaries + + from .stats.ld.c_utils import expand_ranges + + return expand_ranges(ld_bounds[0], ld_bounds[1], self.data.shape[0]) + + @property + def row_indices(self): + """ + :return: The row indices of the non-zero elements of the sparse, CSR representation of the LD matrix + """ + if self.in_memory: + # TODO: Check that this behaves correctly if some entries are zero but not eliminated. + return self.csr_matrix.nonzero()[0] + else: + indptr = self.indptr + return np.repeat(np.arange(len(indptr) - 1), np.diff(indptr)) + + @property + def indptr(self): + """ + :return: The index pointers `indptr` delineating where the data for each row of the flattened, + sparse CSR representation of the lD matrix. + """ + if self.in_memory: + return self.csr_matrix.indptr + else: + return self._zg['matrix/indptr'] + + def filter_snps(self, extract_snps=None, extract_file=None): + """ + Filter the LDMatrix to keep a subset of variants. This mainly sets + the mask for the LD matrix, which is used to hide/remove some SNPs from the LD matrix, + without altering the stored objects on-disk. + + :param extract_snps: A list or array of SNP rsIDs to keep. + :param extract_file: A plink-style file containing the SNP rsIDs to keep. + """ + + assert extract_snps is not None or extract_file is not None + + if extract_snps is None: + from .parsers.misc_parsers import read_snp_filter_file + extract_snps = read_snp_filter_file(extract_file) + + from .utils.compute_utils import intersect_arrays + + new_mask = intersect_arrays(self.get_metadata('snps', apply_mask=False), + extract_snps, + return_index=True) + + self.set_mask(new_mask) + + def get_mask(self): + """ + :return: The mask (a boolean flag array) used to hide/remove some SNPs from the LD matrix. + """ + return self._mask + + def set_mask(self, mask): + """ + Set the mask (a boolean array) to hide/remove some SNPs from the LD matrix. + :param mask: An array of indices or boolean mask for SNPs to retain. + """ + + # If the mask is equivalent to the current mask, return: + if np.array_equal(mask, self._mask): + return + + # If the mask is boolean, convert to indices (should we?): + if mask.dtype == bool: + self._mask = np.where(mask)[0] + else: + self._mask = mask + + # If the data is already in memory, reload: + if self.in_memory: + self.load(force_reload=True, + return_symmetric=self.is_symmetric, + fill_diag=self.is_symmetric) + + def to_snp_table(self, col_subset=None): + """ + :param col_subset: The subset of columns to add to the table. If None, it returns + all available columns. + + :return: A `pandas` dataframe of the SNP attributes and metadata for variants + included in the LD matrix. + """ + + col_subset = col_subset or ['CHR', 'SNP', 'POS', 'A1', 'A2', 'MAF', 'LDScore'] + + table = pd.DataFrame({'SNP': self.snps}) + + for col in col_subset: + if col == 'CHR': + table['CHR'] = self.chromosome + if col == 'POS': + table['POS'] = self.bp_position + if col == 'cM': + table['cM'] = self.cm_position + if col == 'A1': + table['A1'] = self.a1 + if col == 'A2': + table['A2'] = self.a2 + if col == 'MAF': + table['MAF'] = self.maf + if col == 'LDScore': + table['LDScore'] = self.ld_score + if col == 'WindowSize': + table['WindowSize'] = self.window_size + + return table[list(col_subset)] + + def compute_ld_scores(self, + annotation_matrix=None, + corrected=True, + chunk_size=10_000): + """ + + Computes the LD scores for variants in the LD matrix. 
LD scores are defined
+        as the sum of the squared pairwise Pearson correlation coefficients between the focal SNP and
+        all of its neighboring SNPs. See Bulik-Sullivan et al. (2015) for details.
+
+        :param annotation_matrix: A matrix of annotations for each variant for which to aggregate the LD scores.
+        :param corrected: Use the sample-size corrected estimator for the squared Pearson correlation coefficient.
+        See Bulik-Sullivan et al. (2015).
+        :param chunk_size: Specify the number of rows (i.e. SNPs) to compute the LD scores for simultaneously.
+        Smaller chunk sizes should require less memory. If set to None, we compute the LD scores
+        for all SNPs in the LD matrix in one go.
+
+        :return: An array of LD scores for each variant in the LD matrix.
+        """
+
+        if chunk_size is None:
+            chunk_size = self.stored_n_snps
+
+        if annotation_matrix is None:
+            annotation_matrix = np.ones((self.n_snps, 1), dtype=np.float32)
+
+        ld_scores = np.zeros((self.n_snps, annotation_matrix.shape[1]))
+
+        for chunk_idx in range(int(np.ceil(self.stored_n_snps / chunk_size))):
+
+            start_row = chunk_idx*chunk_size
+            end_row = (chunk_idx + 1)*chunk_size
+
+            csr_mat = self.load_rows(start_row=start_row,
+                                     end_row=end_row,
+                                     return_symmetric=False,
+                                     fill_diag=False,
+                                     dtype=np.float32)
+
+            # If a mask is set, apply it to the matrix:
+            if self._mask is not None:
+                csr_mat = csr_mat[self._mask, :][:, self._mask]
+
+            mat_sq = csr_mat.power(2)
+
+            if corrected:
+                mat_sq.data -= (1. - mat_sq.data) / (self.sample_size - 2)
+
+            ld_scores += mat_sq.dot(annotation_matrix)
+            ld_scores += mat_sq.T.dot(annotation_matrix)
+
+        # Add the contribution of the diagonal:
+        ld_scores += identity(self.n_snps, dtype=np.float32).dot(annotation_matrix)
+
+        # Set floating type to float32:
+        ld_scores = ld_scores.astype(np.float32)
+
+        if ld_scores.shape[1] == 1:
+            return ld_scores.flatten()
+        else:
+            return ld_scores
+
+    def multiply(self, vec):
+        """
+        Multiply the LD matrix with an input vector `vec`.
+
+        !!! seealso "See Also"
+            * [dot][magenpy.LDMatrix.LDMatrix.dot]
+
+        :param vec: The input vector to multiply with the LD matrix.
+
+        :return: The product of the LD matrix with the input vector.
+        """
+        return self.csr_matrix.dot(vec)
+
+    def dot(self, vec):
+        """
+        Multiply the LD matrix with an input vector `vec`.
+
+        !!! seealso "See Also"
+            * [multiply][magenpy.LDMatrix.LDMatrix.multiply]
+
+        :param vec: The input vector to multiply with the LD matrix.
+
+        :return: The product of the LD matrix with the input vector.
+
+        """
+        return self.multiply(vec)
+
+    def estimate_uncompressed_size(self, dtype=None):
+        """
+        Provide an estimate of the size of the uncompressed LD matrix in megabytes (MB).
+        This is only a rough estimate. Depending on how the LD matrix is loaded, the actual size
+        may be much larger than this estimate.
+
+        :param dtype: The data type for the entries of the LD matrix. If None, use the stored data type.
+
+        :return: The estimated size of the uncompressed LD matrix in MB.
+
+        """
+
+        if dtype is None:
+            dtype = self.stored_dtype
+
+        return 2.*self._zg['matrix/data'].shape[0]*np.dtype(dtype).itemsize / 1024 ** 2
+
+    def get_metadata(self, key, apply_mask=True):
+        """
+        Get the metadata associated with each variant in the LD matrix.
+        :param key: The key for the metadata item.
+        :param apply_mask: If True, apply the mask (e.g. filter) to the metadata.
+
+        :return: The metadata item for each variant in the LD matrix.
+        :raises KeyError: if the metadata item is not set.
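+
+        !!! example
+            A usage sketch, assuming `ld_mat` is an initialized `LDMatrix` object:
+
+            ```python
+            # Fetch the base pair positions stored with the matrix:
+            bp_pos = ld_mat.get_metadata('bp')
+            ```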
+ """ + try: + if self._mask is not None and apply_mask: + return self._zg[f'metadata/{key}'][self._mask] + else: + return self._zg[f'metadata/{key}'][:] + except KeyError: + raise KeyError(f"LD matrix metadata item {key} is not set!") + + def get_store_attr(self, attr): + """ + Get the attribute or metadata `attr` associated with the LD matrix. + :param attr: The attribute name. + + :return: The value for the attribute. + :raises KeyError: if the attribute is not set. + """ + try: + return self._zg.attrs[attr] + except KeyError: + print(f"Warning: Attribute '{attr}' is not set!") + return None + + def set_store_attr(self, attr, value): + """ + Set the attribute `attr` associated with the LD matrix. This is used + to set high-level information, such as information about the sample from which + the matrix was computed, the LD estimator used, its properties, etc. + + :param attr: The attribute name. + :param value: The value for the attribute. + """ + + self._zg.attrs[attr] = value + + def set_metadata(self, key, value, overwrite=False): + """ + Set the metadata field associated with variants the LD matrix. + :param key: The key for the metadata item. + :param value: The value for the metadata item (an array with the same length as the number of variants). + :param overwrite: If True, overwrite the metadata item if it already exists. + """ + + if 'metadata' not in list(self._zg.group_keys()): + meta = self._zg.create_group('metadata') + else: + meta = self._zg['metadata'] + + value = np.array(value) + + if np.issubdtype(value.dtype, np.floating): + dtype = np.float32 + elif np.issubdtype(value.dtype, np.integer): + dtype = np.int32 + else: + dtype = str + + meta.array(key, value, overwrite=overwrite, dtype=dtype, compressor=self.compressor) + + def update_rows_inplace(self, new_csr, start_row=None, end_row=None): + """ + A utility function to perform partial updates to a subset of rows in the + LD matrix. The function takes a new CSR matrix and, optionally, a start + and end row delimiting the chunk of the LD matrix to update with the `new_csr`. + + !!! note + Current implementation assumes that the update does not change the sparsity + structure of the original matrix. Updating the matrix with new sparsity structure + is a harder problem that we will try to tackle later on. + + !!! note + Current implementation assumes `new_csr` is upper triangular. + + :param new_csr: A sparse CSR matrix (`scipy.sparse.csr_matrix`) where the column dimension + matches the column dimension of the LD matrix. + :param start_row: The start row for the chunk to update. + :param end_row: The end row for the chunk to update. 
+
+        :raises AssertionError: if the column dimension of `new_csr` does not match the column
+        dimension of the LD matrix.
+        """
+
+        assert new_csr.shape[1] == self.stored_n_snps
+
+        start_row = start_row or 0
+        end_row = end_row or self.stored_n_snps
+
+        # Sanity checking:
+        assert start_row >= 0
+        assert end_row <= self.stored_n_snps
+
+        indptr = self._zg['matrix/indptr'][:]
+
+        data_start = indptr[start_row]
+        data_end = indptr[end_row]
+
+        # TODO: Check that this covers most cases and would not result in unexpected behavior
+        if np.issubdtype(self.stored_dtype, np.integer) and np.issubdtype(new_csr.dtype, np.floating):
+            self._zg['matrix/data'][data_start:data_end] = quantize(new_csr.data, int_dtype=self.stored_dtype)
+        else:
+            self._zg['matrix/data'][data_start:data_end] = new_csr.data.astype(self.stored_dtype)
+
+    def low_memory_load(self, dtype=None):
+        """
+        A utility method to load the LD matrix in low-memory mode.
+        The method will load the entries of the upper triangular portion of the matrix,
+        perform filtering based on the mask (if set), and return the filtered data
+        and index pointer (`indptr`) arrays.
+
+        This is useful for some applications, such as the `low_memory` version of
+        the `viprs` method, because it avoids reconstructing the `indices` array for the CSR matrix,
+        which can potentially be a very long array of large integers.
+
+        !!! note
+            The method, by construction, does not support loading the full symmetric matrix. If
+            that's the goal, use the `.load()` or `.load_rows()` methods.
+
+        !!! seealso "See Also"
+            * [load_rows][magenpy.LDMatrix.LDMatrix.load_rows]
+            * [load][magenpy.LDMatrix.LDMatrix.load]
+
+        :param dtype: The data type for the entries of the LD matrix.
+
+        :return: A tuple of the data and index pointer arrays for the LD matrix.
+
+        """
+
+        # Determine the final data type for the LD matrix entries
+        # and whether we need to perform dequantization or not, depending on
+        # the stored data type and the requested data type.
+
+        if dtype is None:
+            dtype = self.stored_dtype
+            dequantize_data = False
+        else:
+            dtype = np.dtype(dtype)
+            if np.issubdtype(self.stored_dtype, np.integer) and np.issubdtype(dtype, np.floating):
+                dequantize_data = True
+            else:
+                dequantize_data = False
+
+        # Get the index pointer array:
+        indptr = self._zg['matrix/indptr'][:]
+
+        # Filter the index pointer array based on the mask:
+        if self._mask is not None:
+
+            if np.issubdtype(self._mask.dtype, np.integer):
+                mask = np.zeros(self.stored_n_snps, dtype=np.int8)
+                mask[self._mask] = 1
+            else:
+                mask = self._mask
+
+            from .stats.ld.c_utils import filter_ut_csr_matrix_low_memory
+
+            data_mask, indptr = filter_ut_csr_matrix_low_memory(indptr, mask)
+            # Unfortunately, .vindex is very slow in Zarr right now (roughly an order of magnitude slower),
+            # so for now, we load the entire data array before performing the mask selection:
+            data = self._zg['matrix/data'][:][data_mask]
+        else:
+            data = self._zg['matrix/data'][:]
+
+        if dequantize_data:
+            return dequantize(data, float_dtype=dtype), indptr
+        else:
+            return data.astype(dtype), indptr
+
+    def load_rows(self,
+                  start_row=None,
+                  end_row=None,
+                  return_symmetric=False,
+                  fill_diag=False,
+                  keep_shape=True,
+                  dtype=None):
+        """
+        A utility function to allow for loading a subset of the LD matrix.
+        By specifying `start_row` and `end_row`, the user can process or inspect small
+        blocks of the LD matrix without loading the whole thing into memory.
+
+        TODO: Consider using `low_memory_load` internally to avoid reconstructing the `indices` array.
+
+        !!!
note + This method does not perform any filtering on the stored data. + To access the LD matrix with filtering, use `.load()` or `low_memory_load`. + + !!! seealso "See Also" + * [low_memory_load][magenpy.LDMatrix.LDMatrix.low_memory_load] + * [load][magenpy.LDMatrix.LDMatrix.load] + + :param start_row: The start row to load to memory + :param end_row: The end row (not inclusive) to load to memory + :param return_symmetric: If True, return a full symmetric representation of the LD matrix. + :param fill_diag: If True, fill the diagonal of the LD matrix with ones. + :param keep_shape: If True, return the LD matrix with the same shape as the original. Here, + entries that are outside the requested start_row:end_row region will be zeroed out. + :param dtype: The data type for the entries of the LD matrix. + + :return: The requested sub-matrix of the LD matrix. + """ + + # Determine the final data type for the LD matrix entries + # and whether we need to perform dequantization or not depending on + # the stored data type and the requested data type. + if dtype is None: + dtype = self.stored_dtype + dequantize_data = False + else: + dtype = np.dtype(dtype) + if np.issubdtype(self.stored_dtype, np.integer) and np.issubdtype(dtype, np.floating): + dequantize_data = True + else: + dequantize_data = False + + # Sanity checking + forming the dimensions of the + # requested sub-matrix: + n_snps = self.stored_n_snps + + start_row = start_row or 0 + end_row = end_row or n_snps + + # Sanity checking: + assert start_row >= 0 + end_row = min(end_row, n_snps) + + # Load the index pointer from disk: + indptr = self._zg['matrix/indptr'][:] + + # Determine the start and end positions in the data matrix + # based on the requested start and end rows: + data_start = indptr[start_row] + data_end = indptr[end_row] + + # If the user is requesting a subset of the matrix, then we need to adjust + # the index pointer accordingly: + if start_row > 0 or end_row < n_snps: + # Zero out all index pointers before `start_row`: + indptr = np.clip(indptr - data_start, a_min=0, a_max=None) + # Adjust all index pointers after `end_row`: + indptr[end_row+1:] = (data_end - data_start) + + # Extract the data for the requested rows: + csr_data = self._zg['matrix/data'][data_start:data_end] + + # If we need to de-quantize the data, do it now: + if dequantize_data: + csr_data = dequantize(csr_data, float_dtype=dtype) + + # Construct a CSR matrix from the loaded data, updated indptr, and indices: + + # Get the indices array: + if self.in_memory: + # If the matrix (or a version of it) is already loaded, + # then set the `in_memory` flag to False before fetching the indices. + self.in_memory = False + indices = self.indices + self.in_memory = True + else: + indices = self.indices + + mat = csr_matrix( + ( + csr_data, + indices[data_start:data_end], + indptr + ), + shape=(n_snps, n_snps), + dtype=dtype + ) + + # Determine the "invalid" value for the purposes of reconstructing + # the symmetric matrix: + if np.issubdtype(dtype, np.integer): + # For integers, we don't use the minimum value during quantization + # because we would like to have the zero point at exactly zero. So, + # we can use this value as our alternative to `nan`. 
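+            # (The dtype's maximum value is used below for the diagonal entries;
+            # it is assumed to map back to ~1.0 upon dequantization.)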
+ invalid_value = np.iinfo(dtype).min + identity_val = np.iinfo(dtype).max + else: + invalid_value = np.nan + identity_val = 1 + + if return_symmetric: + + # First, replace explicit zeros with invalid value (this is a hack to prevent scipy + # from eliminating those zeros when making the matrix symmetric): + mat.data[mat.data == 0] = invalid_value + + # Add the matrix transpose to make it symmetric: + mat = (mat + mat.T).astype(dtype) + + # If the user requested filling the diagonals, do it here: + if fill_diag: + diag_vals = np.concatenate([np.zeros(start_row, dtype=dtype), + identity_val*np.ones(end_row - start_row, dtype=dtype), + np.zeros(n_snps - end_row, dtype=dtype)]) + mat += diags(diag_vals, dtype=dtype, shape=mat.shape) + + # Replace the invalid values with zeros again: + if np.isnan(invalid_value): + mat.data[np.isnan(mat.data)] = 0 + else: + mat.data[mat.data == invalid_value] = 0 + + return mat + elif fill_diag: + diag_vals = np.concatenate([np.zeros(start_row, dtype=dtype), + identity_val*np.ones(end_row - start_row, dtype=dtype), + np.zeros(n_snps - end_row, dtype=dtype)]) + mat += diags(diag_vals, dtype=dtype, shape=mat.shape) + + # If the shape remains the same, return the matrix as is. + # Otherwise, return the requested sub-matrix: + if keep_shape: + return mat + else: + return mat[start_row:end_row, :] + + def load(self, + force_reload=False, + return_symmetric=True, + fill_diag=True, + dtype=None): + + """ + Load the LD matrix from on-disk storage in the form of Zarr arrays to memory, + in the form of sparse CSR matrices. + + !!! seealso "See Also" + * [low_memory_load][magenpy.LDMatrix.LDMatrix.low_memory_load] + * [load_rows][magenpy.LDMatrix.LDMatrix.load_rows] + + :param force_reload: If True, it will reload the data even if it is already in memory. + :param return_symmetric: If True, return a full symmetric representation of the LD matrix. + :param fill_diag: If True, fill the diagonal elements of the LD matrix with ones. + :param dtype: The data type for the entries of the LD matrix. + + :return: The LD matrix as a sparse CSR matrix. + """ + + if dtype is not None: + dtype = np.dtype(dtype) + + if self.in_memory: + # If the LD matrix is already in memory: + + if (return_symmetric == self.is_symmetric) and not force_reload: + # If the requested symmetry is the same as the one already loaded, + # and the user asked not to force a reload, then do nothing. + + # If the currently loaded LD matrix has float entries and the user wants + # the return type to be another floating point, then just cast and return. + # Otherwise, we have to reload the matrix: + if np.issubdtype(self._mat.data.dtype, np.floating) and np.issubdtype(dtype, np.floating): + self._mat.data = self._mat.data.astype(dtype) + return + elif self._mat.data.dtype == dtype: + return + + # If we are re-loading the matrix, make sure to release the current one: + self.release() + + self._mat = self.load_rows(return_symmetric=return_symmetric, + fill_diag=fill_diag, + dtype=dtype) + + # If a mask is set, apply it: + if self._mask is not None: + self._mat = self._mat[self._mask, :][:, self._mask] + + # Update the flags: + self.in_memory = True + self.is_symmetric = return_symmetric + + def release(self): + """ + Release the LD data from memory. + """ + self._mat = None + self.in_memory = False + self.is_symmetric = False + self.index = 0 + + def get_row(self, index, return_indices=False): + """ + Extract a single row from the LD matrix. + + :param index: The index of the row to extract. 
+ :param return_indices: If True, return the indices of the non-zero elements of that row. + + :return: The requested row of the LD matrix. + """ + + if self.in_memory: + row = self.csr_matrix.getrow(index) + if return_indices: + return row.data, row.indices + else: + return row.data + else: + indptr = self.indptr[:] + start_idx, end_idx = indptr[index], indptr[index + 1] + if return_indices: + return self.data[start_idx:end_idx], np.arange(index + 1, + index + 1 + (indptr[index + 1] - indptr[index])) + else: + return self.data[start_idx:end_idx] + + def validate_ld_matrix(self): + """ + Checks that the `LDMatrix` object has correct structure and + checks its contents for validity. + + Specifically, we check that: + * The dimensions of the matrix and its associated attributes are matching. + * The masking is working properly. + + :return: True if the matrix has the correct structure. + :raises ValueError: if the matrix is not valid. + """ + + class_attrs = ['snps', 'a1', 'a2', 'maf', 'bp_position', 'cm_position', 'ld_score'] + + for attr in class_attrs: + attribute = getattr(self, attr) + if attribute is None: + continue + if len(attribute) != len(self): + raise ValueError(f"Invalid LD Matrix: Dimensions for attribute {attr} are not aligned!") + + # TODO: Add other sanity checks here? + + return True + + def __getstate__(self): + return self.store.path, self.in_memory, self.is_symmetric, self._mask + + def __setstate__(self, state): + + path, in_mem, is_symmetric, mask = state + + self._zg = zarr.open_group(path, mode='r') + self.in_memory = in_mem + self.is_symmetric = is_symmetric + self._mat = None + self.index = 0 + self._mask = None + + if mask is not None: + self.set_mask(mask) + + if in_mem: + self.load(return_symmetric=is_symmetric, fill_diag=is_symmetric) + + def __len__(self): + return self.n_snps + + def __getitem__(self, index): + return self.get_row(index) + + def __iter__(self): + """ + TODO: Add a flag to allow for chunked iterator, with limited memory footprint. + """ + self.index = 0 + self.load(return_symmetric=self.is_symmetric) + return self + + def __next__(self): + + if self.index == len(self): + self.index = 0 + raise StopIteration + + next_item = self.get_row(self.index) + self.index += 1 + + return next_item diff --git a/magenpy/LDMatrix.pyx b/magenpy/LDMatrix.pyx deleted file mode 100644 index 62e62dd..0000000 --- a/magenpy/LDMatrix.pyx +++ /dev/null @@ -1,658 +0,0 @@ -# cython: linetrace=False -# cython: profile=False -# cython: binding=False -# cython: boundscheck=False -# cython: wraparound=False -# cython: initializedcheck=False -# cython: nonecheck=False -# cython: language_level=3 -# cython: infer_types=True - - -import zarr -import os.path as osp -import numpy as np -cimport numpy as np -import pandas as pd -from magenpy.stats.ld.c_utils import zarr_islice - - -cdef class LDMatrix: - - cdef public: - object _zarr - bint in_memory - list _data - unsigned int index, _n_elements - np.ndarray _ld_boundaries # For caching - np.ndarray _mask - - def __init__(self, zarr_arr): - - assert isinstance(zarr_arr, zarr.Array) - - self._zarr = zarr_arr - - self._data = None - self.in_memory = False - self.index = 0 - - self._ld_boundaries = None - self._mask = None - self._n_elements = self.shape[0] - - @classmethod - def from_path(cls, ld_store_path): - """ - Initialize an `LDMatrix` object from a Zarr array store. - :param ld_store_path: The path to the Zarr array store on the filesystem. 
- """ - - if '.zarray' in ld_store_path: - ld_store_path = osp.dirname(ld_store_path) - - if osp.isfile(osp.join(ld_store_path, '.zarray')): - ldm = zarr.open(ld_store_path) - return cls(ldm) - else: - raise FileNotFoundError - - @classmethod - def from_dir(cls, ld_store_path): - """ - Initialize an `LDMatrix` object from a Zarr array store. See also `.from_path` - :param ld_store_path: The path to the Zarr array store on the filesystem. - """ - return cls.from_path(ld_store_path) - - @property - def n_elements(self): - """ - The number of non-masked elements in the LD matrix. For the full number - see `.shape`. - """ - return self._n_elements - - @property - def n_snps(self): - """ - The number of SNPs in the LD matrix. See also `.n_elements` - """ - return self.n_elements - - @property - def shape(self): - """ - The shape (dimensions) of the LD matrix. - """ - return self._zarr.shape - - @property - def store(self): - """ - The Zarr array store object. - """ - return self._zarr.store - - @property - def z_array(self): - """ - The Zarr array - """ - return self._zarr - - @property - def chunks(self): - """ - The chunks of the Zarr array - """ - return self._zarr.chunks - - @property - def chunk_size(self): - """ - The chunk size of the Zarr array. - """ - return self.chunks[0] - - @property - def chromosome(self): - """ - The chromosome for which this LD matrix was calculated. - """ - return self.get_store_attr('Chromosome') - - @property - def ld_estimator(self): - """ - The LD estimator - """ - return self.get_store_attr('LD estimator') - - @property - def estimator_properties(self): - """ - Properties of the LD estimator - """ - return self.get_store_attr('Estimator properties') - - @property - def snps(self): - """ - The SNPs included in the LD matrix. - """ - - z_snps = np.array(self.get_store_attr('SNP')) - - if self._mask is not None: - return z_snps[self._mask] - else: - return z_snps - - @property - def ld_boundaries(self): - """ - The LD boundaries associated with each SNP. - """ - if self._ld_boundaries is None: - self._ld_boundaries = np.array(self.get_store_attr('LD boundaries')) - return np.array(self._ld_boundaries) - - @property - def window_size(self): - """ - Return the window size for each SNP (i.e. how many neighboring SNPs are included in its LD window) - """ - ld_bounds = self.get_masked_boundaries() - return ld_bounds[1, :] - ld_bounds[0, :] - - @property - def sample_size(self): - """ - The sample size used to compute the LD matrix. - """ - return self.get_store_attr('Sample size') - - @property - def a1(self): - """ - The alternative allele for which we count mutations at each SNP - """ - - a1 = np.array(self.get_store_attr('A1')) - - if self._mask is not None: - return a1[self._mask] - else: - return a1 - - @property - def a2(self): - """ - The reference allele - """ - - a2 = self.get_store_attr('A2') - - if a2 is None: - return None - else: - a2 = np.array(a2) - - if self._mask is not None: - return a2[self._mask] - else: - return a2 - - @property - def maf(self): - """ - The minor allele frequency (MAF) of each SNP in the LD matrix. - """ - - maf = self.get_store_attr('MAF') - - if maf is None: - return None - else: - maf = np.array(maf) - - if self._mask is not None: - return maf[self._mask] - else: - return maf - - @property - def bp_position(self): - """ - The base pair position of each SNP in the LD matrix. 
- """ - - bp = np.array(self.get_store_attr('BP')) - - if self._mask is not None: - return bp[self._mask] - else: - return bp - - @property - def cm_position(self): - """ - The centi Morgan position of each SNP in the LD matrix. - """ - - cm = self.get_store_attr('cM') - - if cm is None: - return None - else: - cm = np.array(cm) - - if self._mask is not None: - return cm[self._mask] - else: - return cm - - @property - def ld_score(self): - """ - The LD score of each SNP in the LD matrix. - """ - - ld_score = self.get_store_attr('LDScore') - - if ld_score is None: - ld_score = self.compute_ld_scores() - if self._mask is None: - self.set_store_attr('LDScore', ld_score.tolist()) - else: - ld_score = np.array(ld_score) - - if self._mask is not None: - return ld_score[self._mask] - else: - return ld_score - - def filter_snps(self, extract_snps=None, extract_file=None): - """ - Filter the LDMatrix to a subset of SNPs. - :param extract_snps: A list or array of SNP IDs to keep. - :param extract_file: A file containing the SNP IDs to keep. - """ - - assert extract_snps is not None or extract_file is not None - - if extract_snps is None: - from .parsers.misc_parsers import read_snp_filter_file - extract_snps = read_snp_filter_file(extract_file) - - from .utils.compute_utils import intersect_arrays - - extract_index = intersect_arrays(np.array(self.get_store_attr('SNP')), - extract_snps, - return_index=True) - - new_mask = np.zeros(self.shape[0], dtype=bool) - new_mask[extract_index] = True - - self.set_mask(new_mask) - - def get_mask(self): - """ - Get the mask used to hide/remove some SNPs from the LD matrix. - """ - if self._mask is not None: - return np.array(self._mask) - - def set_mask(self, mask): - """ - Set the mask to hide/remove some SNPs from the LD matrix. - :param mask: A boolean numpy array indicating whether to keep each SNP - or not. - """ - - self._mask = mask - - if mask is None: - # Update the number of elements: - self._n_elements = self.shape[0] - else: - # Update the number of elements: - self._n_elements = mask.sum() - - # Load the LD boundaries: - ld_bounds = self.ld_boundaries - - # If the data is already in memory, reload: - if self.in_memory: - self.load(force_reload=True) - - def get_masked_boundaries(self): - """ - Return the LD boundaries after applying the mask - If the mask is not set, return the original boundaries - """ - - curr_ld_bounds = self.ld_boundaries - - if self._mask is None: - return curr_ld_bounds - else: - # Number of excluded elements up to (and including) position i - n_excluded = np.cumsum(~self._mask) - # Number of excluded elements up to (not including) position i - n_excluded_before = n_excluded - (~self._mask).astype(int) - - # New start position: - start_pos = curr_ld_bounds[0, :] - n_excluded_before[curr_ld_bounds[0, :]] - # New end position: - end_pos = curr_ld_bounds[1, :] - n_excluded[curr_ld_bounds[1, :] - 1] - - # Return masked boundaries array: - return np.array([start_pos[self._mask], end_pos[self._mask]]) - - def to_snp_table(self, col_subset=None): - """ - Return a table of the SNP attributes for SNPs in the LD matrix. 
- """ - - col_subset = col_subset or ['CHR', 'SNP', 'POS', 'A1', 'A2', 'MAF', 'LDScore'] - - table = pd.DataFrame({'SNP': self.snps}) - - for col in col_subset: - if col == 'CHR': - table['CHR'] = self.chromosome - if col == 'POS': - table['POS'] = self.bp_position - if col == 'cM': - table['cM'] = self.cm_position - if col == 'A1': - table['A1'] = self.a1 - if col == 'A2': - table['A2'] = self.a2 - if col == 'MAF': - table['MAF'] = self.maf - if col == 'LDScore': - table['LDScore'] = self.ld_score - if col == 'WindowSize': - table['WindowSize'] = self.window_size - - return table[list(col_subset)] - - def flatten(self): - """ - Flatten the LD matrix into one long vector (excluding zero elements). - See also `.flattened_boundaries()` - NOTE: The implementation below will not work with chunked storage and loading of the - data from disk on a chunk-by-chunk basis. This is left for future work. - """ - - return np.concatenate([np.array(x) for x in self]) - - def get_flattened_boundaries(self): - """ - Return the boundaries separating the LD blocks of each SNP in a flattened LD matrix. - """ - return np.cumsum(self.window_size) - - def to_csr_matrix(self): - """ - Convert the Zarr-formatted LD matrix into scipy sparse CSR matrix. - """ - - # Concatenate the data (entries of the LD matrix): - data = self.flatten() - - # Stitch together the rows and columns for each data point: - bounds = self.get_masked_boundaries() - window_sizes = bounds[1, :] - bounds[0, :] - - rows = np.concatenate([np.repeat(i, ws) for i, ws in enumerate(window_sizes)]) - cols = np.concatenate([np.arange(bounds[0, i], bounds[1, i]) for i in range(bounds.shape[1])]) - - from scipy.sparse import csr_matrix - - return csr_matrix((data, (rows, cols)), shape=(self.n_elements, self.n_elements)) - - def compute_ld_scores(self, annotation_matrix=None, corrected=True): - """ - Computes the LD scores for all SNPs in the LD matrix. - :param annotation_matrix: A matrix of annotations for each variant for which to aggregate the LD scores. - :param corrected: Use the sample-size corrected estimator for the squared Pearson correlation. - See Bulik-Sullivan et al. 2015. - """ - - ld_scores = [] - cdef int n = self.sample_size - - for snp_ld, (start, end) in zip(self, self.get_masked_boundaries().T): - - ldsc = np.array(snp_ld) ** 2 - - if corrected: - ldsc = ldsc - (1. - ldsc) / (n - 2) - - if annotation_matrix is None: - ld_scores.append(ldsc.sum()) - else: - ld_scores.append((ldsc.reshape(-1, 1) * annotation_matrix[start: end, :]).sum(axis=0)) - - return np.array(ld_scores) - - def multiply(self, double[::1] vec): - """ - Multiply the LD matrix with an input vector `vec`. - """ - - cdef: - double[::1] D_j, res = np.empty_like(vec) - int j, start, end - - for j, (D_j, (start, end)) in enumerate(zip(self, self.get_masked_boundaries().T)): - res[j] = np.dot(D_j, vec[start:end]) - - return np.array(res) - - def store_size(self): - """ - Returns the size of the compressed LD store in MB - """ - return self.store.getsize() / 1024 ** 2 - - def estimate_uncompressed_size(self): - """ - Returns an estimate of size of the uncompressed LD matrix in MB - If the array is masked, it returns a (rough) estimate of the size of the - elements that will be loaded into memory. - """ - ld_bounds = self.get_masked_boundaries() - - return (ld_bounds[1, :] - ld_bounds[0, :]).sum() * np.dtype(np.float64).itemsize / 1024 ** 2 - - def get_store_attr(self, attr): - """ - Get the attribute or metadata `attr` associated with the LD matrix. 
- """ - try: - return self._zarr.attrs[attr] - except KeyError: - print(f"Warning: Attribute '{attr}' is not set!") - return None - - def set_store_attr(self, attr, value): - """ - Set the attribute or metadata `attr` associated with the LD matrix. - """ - try: - self._zarr.attrs[attr] = value - except Exception as e: - raise e - - def load(self, start=0, end=None, force_reload=False): - """ - Load the LD matrix from disk to memory. - :param start: The start row position. - :param end: The end row position. - :param force_reload: If True, it will reload the data even if it was already loaded. - """ - - if self.in_memory and not force_reload: - return - - if end is None: - end = self.shape[0] - if start == 0: - self.in_memory = True - - cdef: - unsigned int i - long[:, ::1] ld_bounds = self.ld_boundaries - - self._data = [] - - if self._mask is None: - - for d in zarr_islice(self._zarr, start, end): - # TODO: Figure out a way to get around copying - self._data.append(d.copy()) - - else: - - for i, d in enumerate(zarr_islice(self._zarr, start, end), start): - if self._mask[i]: - bound_start, bound_end = ld_bounds[:, i] - self._data.append(d[self._mask[bound_start: bound_end]].copy()) - else: - self._data.append(np.array([np.nan])) - - def release(self): - """ - Release the LD data from memory. - """ - self._data = None - self.in_memory = False - self.index = 0 - - def iterate_blockwise(self, block_size=None): - """ - Iterate over the LD matrix in a block-wise fashion. - This function returns blocks of rows from the LD matrix, where the block size - is either specified by the user (`block_size`) or via the `block` estimator - properties, or by using the chunk size from the Zarr matrix. This function takes into - account the masked entries when selecting the blocks to return to the user. - - This utility function may be useful for parallel processing across different - blocks of the LD matrix. - - :param block_size: The number of SNPs or items in each block. If not specified, - we will automatically use the block boundaries from the block LD estimator or the - chunk size from the Zarr matrix. - - """ - - if block_size is None: - # If the block size is not specified, - if self.ld_estimator == 'block': - # Use pre-defined block boundaries from the LD estimator: - block_delim = np.unique(self.get_masked_boundaries()[1, :]) - else: - # Use the Zarr chunk size as a proxy for the block size: - block_delim = np.clip(np.arange(self.chunk_size, len(self) + self.chunk_size, self.chunk_size), - a_min=self.chunk_size, a_max=len(self)) - else: - block_delim = np.clip(np.arange(block_size, len(self) + block_size, block_size), - a_min=block_size, a_max=len(self)) - - block_idx = 0 - block_data = [] - - for j, Dj in enumerate(self): - - if j < block_delim[block_idx]: - block_data.append(Dj) - else: - yield block_data - block_idx += 1 - block_data = [Dj] - - if len(block_data) > 0: - yield block_data - - def iterate_chunks(self): - """ - Iterate over chunks of the LD matrix. Calls `iterate_blockwise` - where the size of the block is the `chunk_size` of the Zarr matrix. 
- """ - self.iterate_blockwise(block_size=self.chunk_size) - - def __getstate__(self): - return self.store.path, self.in_memory, self._mask - - def __setstate__(self, state): - - path, in_mem, mask = state - - self._zarr = zarr.open(path) - - if mask is None: - if in_mem: - self.load() - else: - self.in_memory = in_mem - self.set_mask(mask) - - def __len__(self): - return self._n_elements - - def __getitem__(self, item): - if self.in_memory: - return self._data[item] - else: - return self._zarr[item] - - def __iter__(self): - self.index = 0 - return self - - def __next__(self): - - cdef int i, curr_chunk, index_chunk - - if self.index == 0: - curr_chunk = -1 - else: - curr_chunk = (self.index - 1) // self.chunk_size - - if self._mask is not None: - - try: - if not self._mask[self.index]: - for i in range(0, self.shape[0] - self.index + 1): - if self._mask[self.index + i]: - break - - self.index += i - except IndexError: - # Reached the end of the array - self.index = self.shape[0] - pass - - if self.index == self.shape[0]: - self.index = 0 - raise StopIteration - - cdef double[::1] next_item - - if self.in_memory: - next_item = self._data[self.index] - else: - index_chunk = self.index // self.chunk_size - if index_chunk > curr_chunk: - self.load(start=index_chunk * self.chunk_size, end=(index_chunk + 1) * self.chunk_size) - - next_item = self._data[self.index % self.chunk_size] - - self.index += 1 - - return next_item diff --git a/magenpy/SampleTable.py b/magenpy/SampleTable.py index 705d376..64c8e80 100644 --- a/magenpy/SampleTable.py +++ b/magenpy/SampleTable.py @@ -2,73 +2,159 @@ from typing import Union import numpy as np import pandas as pd -from .parsers.plink_parsers import parse_fam_file class SampleTable(object): - - def __init__(self, table: Union[pd.DataFrame, None] = None, phenotype_likelihood: Union[str, None] = None): + """ + A class to represent sample (individual) information and attributes in + the context of a genotype matrix. The sample table is a wrapper around + a `pandas.DataFrame` object that contains the sample information. The + table provides methods to read and write sample information from/to + disk, filter samples, perofm checks/validation, and extract specific columns + from the table. + + :ivar table: The sample table as a pandas `DataFrame`. + :ivar _phenotype_likelihood: The likelihood of the phenotype values (if present). + :ivar _covariate_cols: The names or IDs of covariates that are present in the sample table. + + """ + + def __init__(self, + table: Union[pd.DataFrame, None] = None, + phenotype_likelihood: Union[str, None] = None): + """ + Initialize the sample table object. + :param table: A pandas DataFrame with the sample information. + :param phenotype_likelihood: The likelihood of the phenotype values. + """ self.table: Union[pd.DataFrame, None] = table - assert phenotype_likelihood in (None, 'binomial', 'gaussian') + if self.table is not None and 'original_index' not in self.table.columns: + self.table['original_index'] = np.arange(len(self.table)) + + assert phenotype_likelihood in (None, 'binomial', 'gaussian', 'infer') self._phenotype_likelihood: Union[str, None] = phenotype_likelihood self._covariate_cols = None - self.post_check_phenotype() + if self.table is not None: + self.post_check_phenotype() @property def shape(self): + """ + :return: The shape of the sample table (mainly sample size) as a tuple (n,). + """ return (self.n,) @property def n(self): + """ + !!! 
seealso "See Also" + * [sample_size][magenpy.SampleTable.SampleTable.sample_size] + + :return: The sample size (number of individuals) in the sample table. + """ return len(self.table) @property def sample_size(self): + """ + !!! seealso "See Also" + * [n][magenpy.SampleTable.SampleTable.n] + + :return: he sample size (number of individuals) in the sample table. + """ return self.n @property def iid(self): + """ + :return: The individual ID of each individual in the sample table. + """ if self.table is not None: return self.table['IID'].values @property def fid(self): + """ + :return: The family ID of each individual in the sample table. + """ if self.table is not None: return self.table['FID'].values @property def phenotype(self): + """ + :return: The phenotype column from the sample table. + :raises KeyError: If the phenotype is not set. + """ if self.table is not None: try: return self.table['phenotype'].values except KeyError: raise KeyError("The phenotype is not set!") + @property + def original_index(self): + """ + :return: The original index of each individual in the sample table (before applying any filters). + """ + if self.table is not None: + return self.table['original_index'].values + @property def covariates(self): + """ + :return: The column names for the covariates stored in the sample table. + """ return self._covariate_cols @property def phenotype_likelihood(self): + """ + :return: The phenotype likelihood family. + """ return self._phenotype_likelihood @classmethod def from_fam_file(cls, fam_file): + """ + Initialize a sample table object from a path to PLINK FAM file. + :param fam_file: The path to the FAM file. + + :return: A `SampleTable` object. + """ + + from .parsers.plink_parsers import parse_fam_file + s_tab = parse_fam_file(fam_file) return cls(table=s_tab) @classmethod def from_phenotype_file(cls, phenotype_file, filter_na=True, **read_csv_kwargs): + """ + Initialize a sample table from a phenotype file. + :param phenotype_file: The path to the phenotype file. + :param filter_na: Filter samples with missing phenotype values (Default: True). + :param read_csv_kwargs: keyword arguments to pass to the `read_csv` function of `pandas`. + + :return: A `SampleTable` object. + """ s_tab = cls() s_tab.read_phenotype_file(phenotype_file, filter_na, **read_csv_kwargs) return s_tab @classmethod def from_covariate_file(cls, covar_file, **read_csv_kwargs): + """ + Initialize a sample table from a file of covariates. + :param covar_file: The path to the covariates file. + :param read_csv_kwargs: keyword arguments to pass to the `read_csv` function of `pandas`. + + :return: A `SampleTable` object. + """ s_tab = cls() s_tab.read_covariates_file(covar_file, **read_csv_kwargs) return s_tab @@ -85,7 +171,7 @@ def read_phenotype_file(self, phenotype_file, drop_na=True, **read_csv_kwargs): """ if 'sep' not in read_csv_kwargs and 'delimiter' not in read_csv_kwargs: - read_csv_kwargs['delim_whitespace'] = True + read_csv_kwargs['sep'] = r'\s+' if 'na_values' not in read_csv_kwargs: read_csv_kwargs['na_values'] = {'phenotype': [-9.]} @@ -117,7 +203,6 @@ def read_phenotype_file(self, phenotype_file, drop_na=True, **read_csv_kwargs): self.post_check_phenotype() def read_covariates_file(self, covar_file, **read_csv_kwargs): - """ Read the covariates file from the provided path. The expected format is Family ID (`FID`), Individual ID (`IID`) and the remaining columns are assumed to be covariates. 
You may adjust @@ -128,7 +213,7 @@ def read_covariates_file(self, covar_file, **read_csv_kwargs): """ if 'sep' not in read_csv_kwargs and 'delimiter' not in read_csv_kwargs: - read_csv_kwargs['delim_whitespace'] = True + read_csv_kwargs['sep'] = r'\s+' covar_table = pd.read_csv(covar_file, **read_csv_kwargs) self._covariate_cols = covar_table.columns[2:] @@ -145,7 +230,10 @@ def read_covariates_file(self, covar_file, **read_csv_kwargs): def post_check_phenotype(self): """ Apply some simple heuristics to check the phenotype values - provided by the user and infer the phenotype likelihood (if needed). + provided by the user and infer the phenotype likelihood (if feasible). + + :raises ValueError: If the phenotype values could not be matched with the + inferred phenotype likelihood. """ if 'phenotype' in self.table.columns: @@ -154,13 +242,14 @@ def post_check_phenotype(self): if self.table['phenotype'].isnull().all(): self.table.drop('phenotype', axis=1, inplace=True) - elif self.phenotype_likelihood in ('binomial', None): + elif self._phenotype_likelihood != 'gaussian': if len(unique_vals) > 2: self._phenotype_likelihood = 'gaussian' return unique_vals = sorted(unique_vals) + if unique_vals == [1, 2]: # Plink coding for case/control self.table['phenotype'] -= 1 @@ -190,22 +279,41 @@ def filter_samples(self, keep_samples=None, keep_file=None): self.table = self.table.merge(pd.DataFrame({'IID': keep_samples}, dtype=type(self.iid[0]))) - def get_table(self, col_subset=None): + def to_table(self, col_subset=None): + """ + Get the sample table as a pandas DataFrame. + + :param col_subset: A subset of the columns to include in the table. + :return: A pandas DataFrame with the sample information. + """ if col_subset is not None: return self.table[list(col_subset)] else: return self.table def get_individual_table(self): - return self.get_table(col_subset=['FID', 'IID']) + """ + :return: A table of individual IDs (FID, IID) present in the sample table. + """ + return self.to_table(col_subset=['FID', 'IID']) def get_phenotype_table(self): + """ + :return: A table of individual IDs and phenotype values (FID IID phenotype) in the sample table. + """ try: - return self.get_table(col_subset=['FID', 'IID', 'phenotype']) + return self.to_table(col_subset=['FID', 'IID', 'phenotype']) except KeyError: raise KeyError("The phenotype is not set!") def get_covariates_table(self, covar_subset=None): + """ + Get a table of covariates associated with each individual in the + sample table. The table will be formatted as (FID, IID, covar1, covar2, ...). + + :param covar_subset: A subset of the covariate names or IDs to include in the table. + :return: A pandas DataFrame with the covariate information. + """ assert self._covariate_cols is not None if covar_subset is None: @@ -215,25 +323,41 @@ def get_covariates_table(self, covar_subset=None): assert len(covar) >= 1 - return self.get_table(col_subset=['FID', 'IID'] + covar) + return self.to_table(col_subset=['FID', 'IID'] + covar) def get_covariates(self, covar_subset=None): + """ + Get the covariates associated with each individual in the sample table as a matrix. + :param covar_subset: A subset of the covariate names or IDs to include in the matrix. + + :return: A numpy array with the covariate values. + """ return self.get_covariates_table(covar_subset=covar_subset).iloc[:, 2:].values def set_phenotype(self, phenotype, phenotype_likelihood=None): + """ + Update the phenotype in the sample table using the provided values. 
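# Illustrative aside (not part of the patch): a minimal sketch of the
# phenotype-likelihood heuristic that `post_check_phenotype` above applies.
# More than two distinct values implies a Gaussian likelihood, while
# plink-style 1/2 case-control coding is recoded to 0/1 and treated as
# binomial. Variable names here are hypothetical.
import pandas as pd

pheno = pd.Series([1, 2, 2, 1, 2])  # plink coding for case/control

unique_vals = sorted(pheno.dropna().unique())
if len(unique_vals) > 2:
    likelihood = 'gaussian'     # continuous-looking phenotype
else:
    if unique_vals == [1, 2]:
        pheno = pheno - 1       # recode to 0/1, mirroring the class behavior
    likelihood = 'binomial'

print(likelihood, pheno.tolist())   # binomial [0, 1, 1, 0, 1]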
+ :param phenotype: The new phenotype values, represented by a numpy array or Iterable. + :param phenotype_likelihood: The likelihood of the phenotype values. + """ self.table['phenotype'] = phenotype if phenotype_likelihood: self._phenotype_likelihood = phenotype_likelihood + else: + self.post_check_phenotype() def to_file(self, output_file, col_subset=None, **to_csv_kwargs): """ - Write the sample table to file. + Write the contents of the sample table to file. :param output_file: The path to the file where to write the sample table. :param col_subset: A subset of the columns to write to file. + :param to_csv_kwargs: keyword arguments to pass to the `to_csv` function of `pandas`. """ + assert self.table is not None + if 'sep' not in to_csv_kwargs and 'delimiter' not in to_csv_kwargs: to_csv_kwargs['sep'] = '\t' diff --git a/magenpy/SumstatsTable.py b/magenpy/SumstatsTable.py index 1297d37..a57c73a 100644 --- a/magenpy/SumstatsTable.py +++ b/magenpy/SumstatsTable.py @@ -2,86 +2,202 @@ import warnings import pandas as pd import numpy as np -from magenpy.utils.compute_utils import intersect_arrays +from .utils.compute_utils import intersect_arrays class SumstatsTable(object): + """ + A wrapper class for representing the summary statistics obtained from + Genome-wide Association Studies (GWAS). GWAS software tools publish their + results in the form of summary statistics, which include the SNP rsIDs, + the effect/reference alleles tested, the marginal effect sizes (BETA), + the standard errors (SE), the Z-scores, the p-values, etc. + + This class provides a convenient way to access/manipulate/harmonize these summary statistics + across various formats. Particularly, given the heterogeneity in summary statistics + formats, this class provides a common interface to access these statistics + in a consistent manner. The class also supports computing some derived statistics + from the summary statistics, such as the pseudo-correlation between the SNP and the + phenotype, the Chi-squared statistics, etc. + + :ivar table: A pandas DataFrame containing the summary statistics. + """ def __init__(self, ss_table: pd.DataFrame): + """ + Initialize the summary statistics table. + + :param ss_table: A pandas DataFrame containing the summary statistics. + + !!! seealso "See Also" + * [from_file][magenpy.SumstatsTable.SumstatsTable.from_file] + """ self.table: pd.DataFrame = ss_table assert all([col in self.table.columns for col in ('SNP', 'A1')]) @property def shape(self): + """ + :return: The shape of the summary statistics table. + """ return self.table.shape + def __len__(self): + return len(self.table) + @property def chromosome(self): + """ + A convenience method to return the chromosome number if there is only one chromosome in the summary statistics. + If multiple chromosomes are present, it returns None. + + :return: The chromosome number if there is only one chromosome in the summary statistics. + """ chrom = self.chromosomes - if chrom is not None: - if len(chrom) == 1: - return chrom[0] + if chrom is not None and len(chrom) == 1: + return chrom[0] @property def chromosomes(self): + """ + :return: The unique chromosomes in the summary statistics table. + """ if 'CHR' in self.table.columns: return sorted(self.table['CHR'].unique()) @property def m(self): + """ + !!! seealso "See Also" + * [n_snps][magenpy.SumstatsTable.SumstatsTable.n_snps] + + :return: The number of variants in the summary statistics table. + """ return self.n_snps @property def n_snps(self): + """ + !!!
seealso "See Also" + * [m][magenpy.SumstatsTable.SumstatsTable.m] + + :return: The number of variants in the summary statistics table. + """ return len(self.table) @property def snps(self): + """ + :return: The rsIDs associated with each variant in the summary statistics table. + """ return self.table['SNP'].values @property def a1(self): + """ + !!! seealso "See Also" + * [effect_allele][magenpy.SumstatsTable.SumstatsTable.effect_allele] + * [alt_allele][magenpy.SumstatsTable.SumstatsTable.alt_allele] + + :return: The alternative or effect allele for each variant in the summary statistics table. + + """ return self.table['A1'].values @property def a2(self): + """ + !!! seealso "See Also" + * [ref_allele][magenpy.SumstatsTable.SumstatsTable.ref_allele] + + :return: The reference allele for each variant in the summary statistics table. + """ return self.get_col('A2') @property def ref_allele(self): + """ + !!! seealso "See Also" + * [a2][magenpy.SumstatsTable.SumstatsTable.a2] + + :return: The reference allele for each variant in the summary statistics table. + """ return self.a2 @property def alt_allele(self): + """ + !!! seealso "See Also" + * [effect_allele][magenpy.SumstatsTable.SumstatsTable.effect_allele] + * [a1][magenpy.SumstatsTable.SumstatsTable.a1] + + :return: The alternative or effect allele for each variant in the summary statistics table. + """ return self.a1 @property def effect_allele(self): + """ + !!! seealso "See Also" + * [alt_allele][magenpy.SumstatsTable.SumstatsTable.alt_allele] + * [a1][magenpy.SumstatsTable.SumstatsTable.a1] + + :return: The alternative or effect allele for each variant in the summary statistics table. + """ return self.a1 @property def bp_pos(self): + """ + :return: The base pair position for each variant in the summary statistics table. + """ return self.get_col('POS') @property def maf(self): + """ + :return: The minor allele frequency for each variant in the summary statistics table. + """ return self.get_col('MAF') @property def maf_var(self): + """ + :return: The variance of the minor allele frequency for each variant in the summary statistics table. + """ return 2.*self.maf*(1. - self.maf) @property def n(self): + """ + !!! seealso "See Also" + * [n_per_snp][magenpy.SumstatsTable.SumstatsTable.n_per_snp] + + :return: The sample size for the association test of each variant in the summary statistics table. + """ return self.get_col('N') @property def n_per_snp(self): + """ + # TODO: Add a way to infer N from other sumstats if missing. + + !!! seealso "See Also" + * [n][magenpy.SumstatsTable.SumstatsTable.n] + + :return: The sample size for the association test of each variant in the summary statistics table. + """ return self.get_col('N') @property def beta_hat(self): + """ + !!! seealso "See Also" + * [marginal_beta][magenpy.SumstatsTable.SumstatsTable.marginal_beta] + + :return: The marginal beta from the association test of each variant on the phenotype. + """ beta = self.get_col('BETA') @@ -95,24 +211,42 @@ def beta_hat(self): @property def marginal_beta(self): + """ + !!! seealso "See Also" + * [beta_hat][magenpy.SumstatsTable.SumstatsTable.beta_hat] + + :return: The marginal beta from the association test of each variant on the phenotype. + """ return self.beta_hat @property def odds_ratio(self): + """ + :return: The odds ratio from the association test of each variant on case-control phenotypes. 
+ """ return self.get_col('OR') @property def standardized_marginal_beta(self): """ - Return the marginal BETAs assuming that both the genotype matrix - and the phenotype vector are standardized column-wise. + Get the marginal BETAs assuming that both the genotype matrix + and the phenotype vector are standardized column-wise to have mean zero and variance 1. In some contexts, this is also known as the per-SNP correlation or pseudo-correlation with the phenotype. + + !!! seealso "See Also" + * [get_snp_pseudo_corr][magenpy.SumstatsTable.SumstatsTable.get_snp_pseudo_corr] + + :return: The standardized marginal beta from the association test of each variant on the phenotype. """ return self.get_snp_pseudo_corr() @property def z_score(self): + """ + :return: The Z-score from the association test of each SNP on the phenotype. + :raises KeyError: If the Z-score statistic is not available and could not be inferred from available data. + """ z = self.get_col('Z') if z is not None: @@ -129,14 +263,33 @@ def z_score(self): @property def standard_error(self): + """ + !!! seealso "See Also" + * [se][magenpy.SumstatsTable.SumstatsTable.se] + + :return: The standard error from the association test of each variant on the phenotype. + + """ return self.get_col('SE') @property def se(self): + """ + !!! seealso "See Also" + * [standard_error][magenpy.SumstatsTable.SumstatsTable.standard_error] + + :return: The standard error from the association test of each variant on the phenotype. + """ return self.standard_error @property def pval(self): + """ + !!! seealso "See Also" + * [p_value][magenpy.SumstatsTable.SumstatsTable.p_value] + + :return: The p-value from the association test of each variant on the phenotype. + """ p = self.get_col('PVAL') if p is not None: @@ -148,21 +301,29 @@ def pval(self): @property def p_value(self): + """ + !!! seealso "See Also" + * [pval][magenpy.SumstatsTable.SumstatsTable.pval] + + :return: The p-value from the association test of each variant on the phenotype. + """ return self.pval @property def log10_p_value(self): """ - Computes -log10(p_value). - May be useful for Manhattan plots. + :return: The negative log10 of the p-value (-log10(p_value)) of association + test of each variant on the phenotype. """ return -np.log10(self.pval) @property def effect_sign(self): """ - Return the sign for the effect size (1 for positive effect, -1 for negative effect) - of each genetic marker. + :return: The sign for the effect size (1 for positive effect, -1 for negative effect) + of each genetic variant ib the phenotype. + + :raises KeyError: If the sign could not be inferred from available data. """ signed_statistics = ['BETA', 'Z', 'OR'] @@ -177,10 +338,50 @@ def effect_sign(self): raise KeyError("No signed statistic to extract the sign from!") + def infer_a2(self, reference_table, allow_na=False): + """ + Infer the reference allele A2 (if not present in the SumstatsTable) + from a reference table. Make sure that the reference table contains the SNP ID, + the reference allele A2 and the alternative (i.e. effect) allele A1. It is the + user's responsibility to make sure that the reference table matches the summary + statistics in terms of the specification of reference vs. alternative. They are + allowed to be flipped, but they have to be consistent across the two tables. + + :param reference_table: A pandas table containing the following columns at least: + `SNP`, `A1`, `A2`. + :param allow_na: If True, allow the reference allele to be missing from the final result. 
+ """ + + # Merge the summary statistics table with the reference table on `SNP` ID: + merged_table = self.table[['SNP', 'A1']].merge(reference_table[['SNP', 'A1', 'A2']], + how='left', + on='SNP') + # If `A1_x` agrees with `A1_y`, then `A2` is indeed the reference allele. + # Otherwise, they are flipped and `A1_y` should be the reference allele: + merged_table['A2'] = np.where(merged_table['A1_x'] == merged_table['A1_y'], + merged_table['A2'], + merged_table['A1_y']) + + # Check that the reference allele could be inferred for all SNPs: + if not allow_na and merged_table['A2'].isna().any(): + raise ValueError("The reference allele could not be inferred for some SNPs!") + else: + self.table['A2'] = merged_table['A2'] + + def set_sample_size(self, n): + """ + Set the sample size for each variant in the summary table. + This can be useful when the overall sample size from the GWAS analysis is available, + but not on a per-SNP basis. + + :param n: A scalar or array of sample sizes for each variant. + """ + self.table['N'] = n + def match(self, reference_table, correct_flips=True): """ Match the summary statistics table with a reference table, - correcting for potential flips in the effect allele. + correcting for potential flips in the effect alleles. :param reference_table: The SNP table to use as a reference. Must be a pandas table with at least three columns: SNP, A1, A2. @@ -188,7 +389,7 @@ def match(self, reference_table, correct_flips=True): estimates if the effect allele is reversed. """ - from magenpy.utils.model_utils import merge_snp_tables + from .utils.model_utils import merge_snp_tables self.table = merge_snp_tables(ref_table=reference_table[['SNP', 'A1', 'A2']], alt_table=self.table, @@ -197,7 +398,7 @@ def match(self, reference_table, correct_flips=True): def filter_by_allele_frequency(self, min_maf=None, min_mac=None): """ - Filter variants by minimum minor allele frequency or allele count + Filter variants in the summary statistics table by minimum minor allele frequency or allele count :param min_maf: Minimum minor allele frequency :param min_mac: Minimum minor allele count """ @@ -227,9 +428,9 @@ def filter_by_allele_frequency(self, min_maf=None, min_mac=None): def filter_snps(self, extract_snps=None, extract_file=None, extract_index=None): """ - Filter the summary statistics table to a subset of SNPs. + Filter the summary statistics table to keep a subset of SNPs. :param extract_snps: A list or array of SNP IDs to keep. - :param extract_file: A file containing the SNP IDs to keep. + :param extract_file: A plink-style file containing the SNP IDs to keep. :param extract_index: A list or array of the indices of SNPs to retain. """ @@ -250,23 +451,24 @@ def filter_snps(self, extract_snps=None, extract_file=None, extract_index=None): def drop_duplicates(self): """ - Drop duplicated SNP rsIDs. + Drop variants with duplicated rsIDs from the summary statistics table. """ self.table = self.table.drop_duplicates(subset='SNP', keep=False) def get_col(self, col_name): """ - Returns a particular summary statistic or column from the summary statistics - table. - :param col_name: The name of the column + :param col_name: The name of the column to extract. + + :return: The column associated with `col_name` from summary statistics table. """ if col_name in self.table.columns: return self.table[col_name].values def get_chisq_statistic(self): """ - Obtain the Chi-Squared statistic + :return: The Chi-Squared statistic from the association test of each variant on the phenotype. 
+ :raises KeyError: If the Chi-Squared statistic is not available and could not be inferred from available data. """ chisq = self.get_col('CHISQ') @@ -289,11 +491,24 @@ def get_chisq_statistic(self): def get_snp_pseudo_corr(self): """ + Computes the pseudo-correlation coefficient (standardized beta) between the SNP and the phenotype (X_j^T y / N) from GWAS summary statistics. - Uses Equation 15 in Mak et al. 2017 - beta = z_j / sqrt(n - 1 + z_j^2) - Where z_j is the marginal GWAS Z-score + + This method uses Equation 15 in Mak et al. 2017 + + $$ + \beta_j = \frac{z_j}{\sqrt{n - 1 + z_j^2}} + $$ + + Where `z_j` is the marginal GWAS Z-score. + + !!! seealso "See Also" + * [standardized_marginal_beta][magenpy.SumstatsTable.SumstatsTable.standardized_marginal_beta] + + :return: The pseudo-correlation coefficient between the SNP and the phenotype. + :raises KeyError: If the Z-scores are not available or the sample size is not available. + """ zsc = self.z_score @@ -303,22 +518,30 @@ if n is not None: return zsc / (np.sqrt(n - 1 + zsc**2)) else: - raise Exception("Sample size is not available!") + raise KeyError("Sample size is not available!") else: - raise Exception("Z-scores are not available!") + raise KeyError("Z-scores are not available!") def get_yy_per_snp(self): """ Computes the quantity (y'y)_j/n_j following SBayesR (Lloyd-Jones 2019) and Yang et al. (2012). + + (y'y)_j/n_j is defined as the empirical variance for continuous phenotypes and may be estimated from GWAS summary statistics by re-arranging the equation for the squared standard error: - SE(b_j)^2 = (Var(y) - Var(x_j)*b_j^2) / (Var(x)*n) + $$ + SE(b_j)^2 = \frac{Var(y) - Var(x_j) b_j^2}{Var(x_j) \, n} + $$ Which gives the following estimate: - (y'y)_j / n_j = (n_j - 2)*SE(b_j)^2 + b_j^2 + $$ + (y'y)_j / n_j = (n_j - 2) \, SE(b_j)^2 + b_j^2 + $$ + + :return: The quantity (y'y)_j/n_j for each SNP in the summary statistics table. + :raises KeyError: If the marginal betas, standard errors or sample sizes are not available. """ @@ -331,18 +554,20 @@ if se is not None: return (n - 2)*se**2 + b**2 else: - raise Exception("Standard errors are not available!") + raise KeyError("Standard errors are not available!") else: - raise Exception("Marginal betas are not available!") + raise KeyError("Marginal betas are not available!") else: - raise Exception("Sample size per SNP is not available!") + raise KeyError("Sample size per SNP is not available!") def split_by_chromosome(self, snps_per_chrom=None): """ Split the summary statistics table by chromosome, so that we would - have a separate `SumstatsTable` table for each chromosome. + have a separate `SumstatsTable` object for each chromosome. :param snps_per_chrom: A dictionary where the keys are the chromosome number and the value is an array or list of SNPs on that chromosome. + + :return: A dictionary where the keys are the chromosome number and the value is a `SumstatsTable` object. """ if 'CHR' in self.table.columns: @@ -366,10 +591,13 @@ "you must provide a dictionary mapping chromosome number " "to an array of SNPs `snps_per_chrom`.") - def get_table(self, col_subset=None): + def to_table(self, col_subset=None): """ - Get the summary statistics table or a subset of it. + A convenience method to extract the summary statistics table or subsets of it.
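# Illustrative aside (not part of the patch): a small numeric check of
# Equation 15 from Mak et al. 2017, as implemented in `get_snp_pseudo_corr`
# above. z_j / sqrt(n - 1 + z_j^2) behaves like a correlation and stays within
# (-1, 1). The Z-scores and sample size below are made up.
import numpy as np

z = np.array([0.5, -2.0, 10.0])
n = 10_000

beta_std = z / np.sqrt(n - 1 + z**2)
print(beta_std)                       # approx. [ 0.005 -0.02   0.0995]
print(np.all(np.abs(beta_std) < 1))   # True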
+ :param col_subset: A list corresponding to a subset of columns to return. + + :return: A pandas DataFrame containing the summary statistics with the requested column subset. """ col_subset = col_subset or ['CHR', 'SNP', 'POS', 'A1', 'A2', 'MAF', @@ -409,13 +637,15 @@ def get_table(self, col_subset=None): def to_file(self, output_file, col_subset=None, **to_csv_kwargs): """ - Write the summary statistics table to file. + A convenience method to write the summary statistics table to file. TODO: Add a format argument to this method and allow the user to output summary statistics according to supported formats (e.g. COJO, plink, fastGWA, etc.). :param output_file: The path to the file where to write the summary statistics. :param col_subset: A subset of the columns to write to file. + :param to_csv_kwargs: Keyword arguments to pass to pandas' `to_csv` method. + """ if 'sep' not in to_csv_kwargs and 'delimiter' not in to_csv_kwargs: @@ -424,7 +654,7 @@ def to_file(self, output_file, col_subset=None, **to_csv_kwargs): if 'index' not in to_csv_kwargs: to_csv_kwargs['index'] = False - table = self.get_table(col_subset) + table = self.to_table(col_subset) table.to_csv(output_file, **to_csv_kwargs) @classmethod @@ -432,29 +662,42 @@ def from_file(cls, sumstats_file, sumstats_format=None, parser=None, **parse_kwa """ Initialize a summary statistics table from file. The user must provide either the format for the summary statistics file or the parser object - (see parsers.sumstats_parsers). + (see `parsers.sumstats_parsers`). + :param sumstats_file: The path to the summary statistics file. :param sumstats_format: The format for the summary statistics file. Currently, - we explicitly support the following three formats formats: magenpy, plink, COJO. + we support the following summary statistics formats: `magenpy`, `plink1.9`, `plink` or `plink2`, + `COJO`, `fastGWA`, `SAIGE`, `GWASCatalog` (also denoted as `GWAS-SSF` and `SSF`). :param parser: An instance of SumstatsParser parser, implements basic parsing/conversion functionalities. :param parse_kwargs: arguments for the pandas `read_csv` function, such as the delimiter. + + :return: A `SumstatsTable` object initialized from the summary statistics file. 
""" assert sumstats_format is not None or parser is not None from .parsers.sumstats_parsers import ( - SumstatsParser, plinkSumstatsParser, COJOSumstatsParser, fastGWASumstatsParser + SumstatsParser, Plink1SSParser, Plink2SSParser, COJOSSParser, + FastGWASSParser, SSFParser, SaigeSSParser ) + sumstats_format_l = sumstats_format.lower() + if parser is None: - if sumstats_format == 'magenpy': + if sumstats_format_l == 'magenpy': parser = SumstatsParser(None, **parse_kwargs) - elif sumstats_format == 'plink': - parser = plinkSumstatsParser(None, **parse_kwargs) - elif sumstats_format == 'COJO': - parser = COJOSumstatsParser(None, **parse_kwargs) - elif sumstats_format == 'fastGWA': - parser = fastGWASumstatsParser(None, **parse_kwargs) + elif sumstats_format_l in ('plink', 'plink2'): + parser = Plink2SSParser(None, **parse_kwargs) + elif sumstats_format_l == 'plink1.9': + parser = Plink1SSParser(None, **parse_kwargs) + elif sumstats_format_l == 'cojo': + parser = COJOSSParser(None, **parse_kwargs) + elif sumstats_format_l == 'fastgwa': + parser = FastGWASSParser(None, **parse_kwargs) + elif sumstats_format_l in ('ssf', 'gwas-ssf', 'gwascatalog'): + parser = SSFParser(None, **parse_kwargs) + elif sumstats_format_l == 'saige': + parser = SaigeSSParser(None, **parse_kwargs) else: raise KeyError(f"Parsers for summary statistics format {sumstats_format} are not implemented!") diff --git a/magenpy/__init__.py b/magenpy/__init__.py index 764421a..66984f6 100644 --- a/magenpy/__init__.py +++ b/magenpy/__init__.py @@ -2,21 +2,22 @@ import configparser # Data structures: -from magenpy.AnnotationMatrix import AnnotationMatrix -from magenpy.LDMatrix import LDMatrix -from magenpy.GWADataLoader import GWADataLoader -from magenpy.SumstatsTable import SumstatsTable -from magenpy.SampleTable import SampleTable +from .AnnotationMatrix import AnnotationMatrix +from .LDMatrix import LDMatrix +from .GWADataLoader import GWADataLoader +from .SumstatsTable import SumstatsTable +from .SampleTable import SampleTable # Simulation: -from magenpy.simulation.GWASimulator import GWASimulator +from .simulation.PhenotypeSimulator import PhenotypeSimulator # Data utilities: -from magenpy.utils.data_utils import * +from .utils.data_utils import * -__version__ = '0.0.12' +__version__ = '0.1.0' +__release_date__ = 'April 2024' config = configparser.ConfigParser() diff --git a/magenpy/parsers/annotation_parsers.py b/magenpy/parsers/annotation_parsers.py index 055ddbe..c55555a 100644 --- a/magenpy/parsers/annotation_parsers.py +++ b/magenpy/parsers/annotation_parsers.py @@ -7,13 +7,19 @@ class AnnotationMatrixParser(object): """ def __init__(self, col_name_converter=None, **read_csv_kwargs): + """ + :param col_name_converter: A dictionary mapping column names + in the original table to magenpy's column names for the various + SNP features in the annotation matrix. + :param read_csv_kwargs: Keyword arguments to pass to pandas' `read_csv`. 
+ """ self.col_name_converter = col_name_converter self.read_csv_kwargs = read_csv_kwargs # If the delimiter is not specified, assume whitespace by default: if 'sep' not in self.read_csv_kwargs and 'delimiter' not in self.read_csv_kwargs: - self.read_csv_kwargs['delim_whitespace'] = True + self.read_csv_kwargs['sep'] = r'\s+' def parse(self, annotation_file, drop_na=True): """ @@ -59,6 +65,11 @@ def __init__(self, col_name_converter=None, **read_csv_kwargs): ) def parse(self, annotation_file, drop_na=True): + """ + Parse the annotation matrix file + :param annotation_file: The path to the annotation file. + :param drop_na: Drop any entries with missing values. + """ df, annotations = super().parse(annotation_file, drop_na=drop_na) @@ -77,11 +88,13 @@ def parse_annotation_bed_file(annot_bed_file): After reading the raw file, we let pandas infer whether the file has a header or not and we standardize the names of the first 3 columns and convert the chromosome column into an integer. - :param annot_bed_file: The path to the annotation BED file + :param annot_bed_file: The path to the annotation BED file. + :type annot_bed_file: str """ try: - annot_bed = pd.read_csv(annot_bed_file, usecols=[0, 1, 2], delim_whitespace=True, + annot_bed = pd.read_csv(annot_bed_file, usecols=[0, 1, 2], + sep=r'\s+', names=['CHR', 'Start', 'End']) except Exception as e: raise e diff --git a/magenpy/parsers/misc_parsers.py b/magenpy/parsers/misc_parsers.py index 96cb180..f81df05 100644 --- a/magenpy/parsers/misc_parsers.py +++ b/magenpy/parsers/misc_parsers.py @@ -6,6 +6,11 @@ def read_snp_filter_file(filename, snp_id_col=0): """ Read plink-style file listing variant IDs. The file should not have a header and only has a single column. + + :param filename: The path to the file containing the SNP IDs + :type filename: str + :param snp_id_col: The column index containing the SNP IDs + :type snp_id_col: int """ try: @@ -22,6 +27,9 @@ def read_sample_filter_file(filename): The file should not have a header, be tab-separated, and has two columns corresponding to Family ID (FID) and Individual ID (IID). You may also pass a file with a single-column of Individual IDs instead. + + :param filename: The path to the file containing the sample IDs + :type filename: str """ keep_list = pd.read_csv(filename, sep="\t", header=None).values @@ -40,13 +48,17 @@ def parse_ld_block_data(ldb_file_path): The parser assumes that the LD block files have the ldetect format: https://bitbucket.org/nygcresearch/ldetect-data/src/master/ - :param ldb_file_path: The path to the LD blocks file + :param ldb_file_path: The path (or URL) to the LD blocks file + :type ldb_file_path: str """ ld_blocks = {} - df = pd.read_csv(ldb_file_path, delim_whitespace=True, - dtype={'chr': str, 'start': np.int64, 'end': np.int64}) + df = pd.read_csv(ldb_file_path, sep=r'\s+') + + df = df.loc[(df.start != 'None') & (df.stop != 'None')] + df = df.astype({'chr': str, 'start': np.int64, 'stop': np.int64}) + df = df.sort_values('start') if df.isnull().values.any(): raise ValueError("The LD block data contains missing information. This may result in invalid " @@ -65,9 +77,10 @@ def parse_cluster_assignment_file(cluster_assignment_file): and contain three columns: FID, IID, and Cluster :param cluster_assignment_file: The path to the cluster assignment file. 
+ :type cluster_assignment_file: str """ try: - clusters = pd.read_csv(cluster_assignment_file, delim_whitespace=True) + clusters = pd.read_csv(cluster_assignment_file, sep=r'\s+') clusters.columns = ['FID', 'IID', 'Cluster'] except Exception as e: raise e diff --git a/magenpy/parsers/plink_parsers.py b/magenpy/parsers/plink_parsers.py index d846da9..107b702 100644 --- a/magenpy/parsers/plink_parsers.py +++ b/magenpy/parsers/plink_parsers.py @@ -15,7 +15,8 @@ def parse_bim_file(plink_bfile): - Allele 1 (corresponding to clear bits in .bed; usually minor) - Allele 2 (corresponding to set bits in .bed; usually major) - :param plink_bfile: + :param plink_bfile: The path to the plink bfile (with or without the extension). + :type plink_bfile: str """ if '.bim' not in plink_bfile: @@ -24,7 +25,8 @@ def parse_bim_file(plink_bfile): else: plink_bfile = plink_bfile + '.bim' - bim_df = pd.read_csv(plink_bfile, delim_whitespace=True, + bim_df = pd.read_csv(plink_bfile, + sep=r'\s+', names=['CHR', 'SNP', 'cM', 'POS', 'A1', 'A2'], dtype={ 'CHR': int, @@ -52,7 +54,8 @@ def parse_fam_file(plink_bfile): - Sex code ('1' = male, '2' = female, '0' = unknown) - Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control) - :param plink_bfile: + :param plink_bfile: The path to the plink bfile (with or without the extension). + :type plink_bfile: str """ if '.fam' not in plink_bfile: @@ -62,8 +65,8 @@ def parse_fam_file(plink_bfile): plink_bfile = plink_bfile + '.fam' fam_df = pd.read_csv(plink_bfile, - delim_whitespace=True, - usecols=range(6), + sep=r'\s+', + usecols=list(range(6)), names=['FID', 'IID', 'fatherID', 'motherID', 'sex', 'phenotype'], dtype={'FID': str, 'IID': str, @@ -77,9 +80,11 @@ def parse_fam_file(plink_bfile): 'sex': [0] }) + # If the phenotype is all null or unknown, drop the column: if fam_df['phenotype'].isnull().all(): fam_df.drop('phenotype', axis=1, inplace=True) + # If the sex column is all null or unknown, drop the column: if fam_df['sex'].isnull().all(): fam_df.drop('sex', axis=1, inplace=True) diff --git a/magenpy/parsers/sumstats_parsers.py b/magenpy/parsers/sumstats_parsers.py index 81ed461..6cb1135 100644 --- a/magenpy/parsers/sumstats_parsers.py +++ b/magenpy/parsers/sumstats_parsers.py @@ -1,31 +1,63 @@ - import pandas as pd +import numpy as np +import warnings class SumstatsParser(object): """ - A generic GWAS summary statistics parser class. + A wrapper class for parsing summary statistics files that are written by statistical genetics software + for Genome-wide Association testing. A common challenge is the fact that different software tools + output summary statistics in different formats and with different column names. Thus, this class + provides a common interface for parsing summary statistics files from different software tools + and aims to make this process as seamless as possible. + + The class is designed to be extensible, so that users can easily add new parsers for different software tools. + + !!! 
seealso "See Also" + * [Plink2SSParser][magenpy.parsers.sumstats_parsers.Plink2SSParser] + * [Plink1SSParser][magenpy.parsers.sumstats_parsers.Plink1SSParser] + * [COJOSSParser][magenpy.parsers.sumstats_parsers.COJOSSParser] + * [FastGWASSParser][magenpy.parsers.sumstats_parsers.FastGWASSParser] + * [SSFParser][magenpy.parsers.sumstats_parsers.SSFParser] + * [SaigeSSParser][magenpy.parsers.sumstats_parsers.SaigeSSParser] + + :ivar col_name_converter: A dictionary mapping column names in the original table to magenpy's column names. + :ivar read_csv_kwargs: Keyword arguments to pass to pandas' `read_csv`. + """ def __init__(self, col_name_converter=None, **read_csv_kwargs): """ - :param col_name_converter: A dictionary mapping column names + Initialize the summary statistics parser. + + :param col_name_converter: A dictionary/string mapping column names in the original table to magenpy's column names for the various - summary statistics. + summary statistics. If a string, it should be a comma-separated list of + key-value pairs (e.g. 'rsid=SNP,pos=POS'). :param read_csv_kwargs: Keyword arguments to pass to pandas' read_csv """ - self.col_name_converter = col_name_converter + + if isinstance(col_name_converter, str): + self.col_name_converter = { + k: v for entry in col_name_converter.split(',') for k, v in [entry.strip().split('=')] + if len(entry.strip()) > 0 + } + else: + self.col_name_converter = col_name_converter + self.read_csv_kwargs = read_csv_kwargs # If the delimiter is not specified, assume whitespace by default: if 'sep' not in self.read_csv_kwargs and 'delimiter' not in self.read_csv_kwargs: - self.read_csv_kwargs['delim_whitespace'] = True + self.read_csv_kwargs['sep'] = r'\s+' def parse(self, file_name, drop_na=True): """ Parse a summary statistics file. :param file_name: The path to the summary statistics file. - :param drop_na: Drop any entries with missing values. + :param drop_na: If True, drop any entries with missing values. + + :return: A pandas DataFrame containing the parsed summary statistics. """ df = pd.read_csv(file_name, **self.read_csv_kwargs) @@ -39,16 +71,31 @@ def parse(self, file_name, drop_na=True): return df -class plinkSumstatsParser(SumstatsParser): +class Plink2SSParser(SumstatsParser): """ - A parser for plink GWAS summary statistics files. + A specialized class for parsing GWAS summary statistics files generated by `plink2`. + + !!! seealso "See Also" + * [Plink1SSParser][magenpy.parsers.sumstats_parsers.Plink1SSParser] + * [COJOSSParser][magenpy.parsers.sumstats_parsers.COJOSSParser] + * [FastGWASSParser][magenpy.parsers.sumstats_parsers.FastGWASSParser] + * [SSFParser][magenpy.parsers.sumstats_parsers.SSFParser] + * [SaigeSSParser][magenpy.parsers.sumstats_parsers.SaigeSSParser] + + :ivar col_name_converter: A dictionary mapping column names in the original table to magenpy's column names. + :ivar read_csv_kwargs: Keyword arguments to pass to pandas' `read_csv`. + """ def __init__(self, col_name_converter=None, **read_csv_kwargs): """ - :param col_name_converter: A dictionary mapping column names + + Initialize the `plink2` summary statistics parser. + + :param col_name_converter: A dictionary/string mapping column names in the original table to magenpy's column names for the various - summary statistics. + summary statistics. If a string, it should be a comma-separated list of + key-value pairs (e.g. 'rsid=SNP,pos=POS'). 
:param read_csv_kwargs: Keyword arguments to pass to pandas' read_csv """ @@ -73,29 +120,94 @@ def parse(self, file_name, drop_na=True): Parse a summary statistics file. :param file_name: The path to the summary statistics file. :param drop_na: Drop any entries with missing values. + + :return: A pandas DataFrame containing the parsed summary statistics. """ df = super().parse(file_name, drop_na=drop_na) - try: - df['A2'] = df.apply(lambda x: [x['ALT1'], x['REF']][x['A1'] == x['ALT1']], axis=1) - except KeyError: - print("Warning: the reference allele A2 could not be inferred " - "from the summary statistics file!") + if 'A2' not in df.columns: + try: + if 'ALT1' in df.columns: + df['A2'] = np.where(df['A1'] == df['ALT1'], df['REF'], df['ALT1']) + elif 'ALT' in df.columns: + df['A2'] = np.where(df['A1'] == df['ALT'], df['REF'], df['ALT']) + else: + warnings.warn("The reference allele A2 could not be inferred " + "from the summary statistics file!") + except KeyError: + warnings.warn("The reference allele A2 could not be inferred " + "from the summary statistics file! Some of the columns needed to infer " + "the A2 allele are missing or coded differently than what we expect.") return df -class COJOSumstatsParser(SumstatsParser): +class Plink1SSParser(SumstatsParser): """ - A parser for COJO GWAS summary statistics files. + A specialized class for parsing GWAS summary statistics files generated by `plink1.9`. + + !!! seealso "See Also" + * [Plink2SSParser][magenpy.parsers.sumstats_parsers.Plink2SSParser] + * [COJOSSParser][magenpy.parsers.sumstats_parsers.COJOSSParser] + * [FastGWASSParser][magenpy.parsers.sumstats_parsers.FastGWASSParser] + * [SSFParser][magenpy.parsers.sumstats_parsers.SSFParser] + * [SaigeSSParser][magenpy.parsers.sumstats_parsers.SaigeSSParser] + + :ivar col_name_converter: A dictionary mapping column names in the original table to magenpy's column names. + :ivar read_csv_kwargs: Keyword arguments to pass to pandas' `read_csv`. + """ def __init__(self, col_name_converter=None, **read_csv_kwargs): """ - :param col_name_converter: A dictionary mapping column names + Initialize the `plink1.9` summary statistics parser. + + :param col_name_converter: A dictionary/string mapping column names in the original table to magenpy's column names for the various - summary statistics. + summary statistics. If a string, it should be a comma-separated list of + key-value pairs (e.g. 'rsid=SNP,pos=POS'). + :param read_csv_kwargs: Keyword arguments to pass to pandas' read_csv + """ + + super().__init__(col_name_converter, **read_csv_kwargs) + + self.col_name_converter = self.col_name_converter or {} + + self.col_name_converter.update( + { + 'P': 'PVAL', + 'NMISS': 'N', + 'STAT': 'Z', + 'BP': 'POS' + } + ) + + +class COJOSSParser(SumstatsParser): + """ + A specialized class for parsing GWAS summary statistics files generated by the `COJO` software. + + !!! seealso "See Also" + * [Plink2SSParser][magenpy.parsers.sumstats_parsers.Plink2SSParser] + * [Plink1SSParser][magenpy.parsers.sumstats_parsers.Plink1SSParser] + * [FastGWASSParser][magenpy.parsers.sumstats_parsers.FastGWASSParser] + * [SSFParser][magenpy.parsers.sumstats_parsers.SSFParser] + * [SaigeSSParser][magenpy.parsers.sumstats_parsers.SaigeSSParser] + + :ivar col_name_converter: A dictionary mapping column names in the original table to magenpy's column names. + :ivar read_csv_kwargs: Keyword arguments to pass to pandas' `read_csv`. 
+ """ + + def __init__(self, col_name_converter=None, **read_csv_kwargs): + """ + + Initialize the COJO summary statistics parser. + + :param col_name_converter: A dictionary/string mapping column names + in the original table to magenpy's column names for the various + summary statistics. If a string, it should be a comma-separated list of + key-value pairs (e.g. 'rsid=SNP,pos=POS'). :param read_csv_kwargs: Keyword arguments to pass to pandas' read_csv """ super().__init__(col_name_converter, **read_csv_kwargs) @@ -112,16 +224,29 @@ def __init__(self, col_name_converter=None, **read_csv_kwargs): ) -class fastGWASumstatsParser(SumstatsParser): +class FastGWASSParser(SumstatsParser): """ - A parser for fastGWA summary statistics files + A specialized class for parsing GWAS summary statistics files generated by the `FastGWA` software. + + !!! seealso "See Also" + * [Plink2SSParser][magenpy.parsers.sumstats_parsers.Plink2SSParser] + * [Plink1SSParser][magenpy.parsers.sumstats_parsers.Plink1SSParser] + * [COJOSSParser][magenpy.parsers.sumstats_parsers.COJOSSParser] + * [SSFParser][magenpy.parsers.sumstats_parsers.SSFParser] + * [SaigeSSParser][magenpy.parsers.sumstats_parsers.SaigeSSParser] + + :ivar col_name_converter: A dictionary mapping column names in the original table to magenpy's column names. + :ivar read_csv_kwargs: Keyword arguments to pass to pandas' `read_csv`. + + """ def __init__(self, col_name_converter=None, **read_csv_kwargs): """ - :param col_name_converter: A dictionary mapping column names + :param col_name_converter: A dictionary/string mapping column names in the original table to magenpy's column names for the various - summary statistics. + summary statistics. If a string, it should be a comma-separated list of + key-value pairs (e.g. 'rsid=SNP,pos=POS'). :param read_csv_kwargs: Keyword arguments to pass to pandas' read_csv """ super().__init__(col_name_converter, **read_csv_kwargs) @@ -134,3 +259,122 @@ def __init__(self, col_name_converter=None, **read_csv_kwargs): 'P': 'PVAL' } ) + + +class SSFParser(SumstatsParser): + """ + A specialized class for parsing GWAS summary statistics that are formatted according + to the standardized summary statistics format adopted by the GWAS Catalog. This format is + sometimes denoted as `GWAS-SSF`. + + Reference and details: + https://github.com/EBISPOT/gwas-summary-statistics-standard + + !!! seealso "See Also" + * [Plink2SSParser][magenpy.parsers.sumstats_parsers.Plink2SSParser] + * [Plink1SSParser][magenpy.parsers.sumstats_parsers.Plink1SSParser] + * [COJOSSParser][magenpy.parsers.sumstats_parsers.COJOSSParser] + * [FastGWASSParser][magenpy.parsers.sumstats_parsers.FastGWASSParser] + * [SaigeSSParser][magenpy.parsers.sumstats_parsers.SaigeSSParser] + + :ivar col_name_converter: A dictionary mapping column names in the original table to magenpy's column names. + :ivar read_csv_kwargs: Keyword arguments to pass to pandas' `read_csv`. + + """ + + def __init__(self, col_name_converter=None, **read_csv_kwargs): + """ + + Initialize the standardized summary statistics parser. + + :param col_name_converter: A dictionary/string mapping column names + in the original table to magenpy's column names for the various + summary statistics. If a string, it should be a comma-separated list of + key-value pairs (e.g. 'rsid=SNP,pos=POS'). 
+ :param read_csv_kwargs: Keyword arguments to pass to pandas' read_csv + """ + + super().__init__(col_name_converter, **read_csv_kwargs) + + self.col_name_converter = self.col_name_converter or {} + + self.col_name_converter.update( + { + 'chromosome': 'CHR', + 'base_pair_location': 'POS', + 'rsid': 'SNP', + 'effect_allele': 'A1', + 'other_allele': 'A2', + 'beta': 'BETA', + 'standard_error': 'SE', + 'effect_allele_frequency': 'MAF', + 'p_value': 'PVAL', + 'n': 'N' + } + ) + + +class SaigeSSParser(SumstatsParser): + """ + A specialized class for parsing GWAS summary statistics files generated by the `SAIGE` software. + Reference and details: + https://saigegit.github.io/SAIGE-doc/docs/single_step2.html + + TODO: Ensure that the column names are correct across different trait types + and the inference of the sample size is correct. + + !!! seealso "See Also" + * [Plink2SSParser][magenpy.parsers.sumstats_parsers.Plink2SSParser] + * [Plink1SSParser][magenpy.parsers.sumstats_parsers.Plink1SSParser] + * [COJOSSParser][magenpy.parsers.sumstats_parsers.COJOSSParser] + * [FastGWASSParser][magenpy.parsers.sumstats_parsers.FastGWASSParser] + * [SSFParser][magenpy.parsers.sumstats_parsers.SSFParser] + + :ivar col_name_converter: A dictionary mapping column names in the original table to magenpy's column names. + :ivar read_csv_kwargs: Keyword arguments to pass to pandas' `read_csv`. + + """ + + def __init__(self, col_name_converter=None, **read_csv_kwargs): + """ + Initialize the `SAIGE` summary statistics parser. + + :param col_name_converter: A dictionary/string mapping column names + in the original table to magenpy's column names for the various + summary statistics. If a string, it should be a comma-separated list of + key-value pairs (e.g. 'rsid=SNP,pos=POS'). + :param read_csv_kwargs: Keyword arguments to pass to pandas' read_csv + """ + super().__init__(col_name_converter, **read_csv_kwargs) + + self.col_name_converter = self.col_name_converter or {} + + # NOTE: SAIGE considers Allele2 to be the effect allele, so + # we switch their designation here: + self.col_name_converter.update( + { + 'MarkerID': 'SNP', + 'Allele1': 'A2', + 'Allele2': 'A1', + 'AF_Allele2': 'MAF', + 'AC_Allele2': 'MAC', + 'Tstat': 'Z', + 'p.value': 'PVAL', + } + ) + + def parse(self, file_name, drop_na=True): + """ + Parse the summary statistics file. + :param file_name: The path to the summary statistics file. + :param drop_na: Drop any entries with missing values. + + :return: A pandas DataFrame containing the parsed summary statistics. 
+ """ + + df = super().parse(file_name, drop_na=drop_na) + + # Infer the sample size N + df['N'] = df['MAC'] / (2.*df['MAF']) + + return df diff --git a/magenpy/plot/__init__.py b/magenpy/plot/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/magenpy/plot.py b/magenpy/plot/gwa.py similarity index 59% rename from magenpy/plot.py rename to magenpy/plot/gwa.py index af4239d..c1ff93c 100644 --- a/magenpy/plot.py +++ b/magenpy/plot/gwa.py @@ -1,13 +1,11 @@ from typing import Union -import magenpy as mgp -from magenpy.GWADataLoader import GWADataLoader -from magenpy.SumstatsTable import SumstatsTable +from ..GWADataLoader import GWADataLoader +from ..SumstatsTable import SumstatsTable import matplotlib.pylab as plt import numpy as np -def manhattan(gdl: Union[GWADataLoader, None] = None, - sumstats: Union[SumstatsTable, None] = None, +def manhattan(input_data: Union[GWADataLoader, SumstatsTable], y=None, y_label=None, chrom_sep_color='#f0f0f0', @@ -23,10 +21,11 @@ def manhattan(gdl: Union[GWADataLoader, None] = None, TODO: Add functionality to highlight certain SNPs or markers on the plot. - :param gdl: An instance of `GWADataLoader`. - :param sumstats: An instance of `SumstatsTable`. - :param y: A vector of values to plot on the y-axis. - :param y_label: A label for the quantity or statistic that will be plotted. + :param input_data: An instance of `SumstatsTable` or `GWADataLoader` from which data about the + positions of the SNPs will be extracted. + :param y: An optional vector of values to plot on the y-axis. If not provided, the -log10(p-value) + will be plotted by default. + :param y_label: A label for the quantity or statistic that will be plotted on the y-axis. :param chrom_sep_color: The color for the chromosome separator block. :param snp_color: The color of the dots on the Manhattan plot. :param snp_marker: The shape of the marker on the Manhattan plot. 
@@ -36,13 +35,12 @@ def manhattan(gdl: Union[GWADataLoader, None] = None,
 
     """
 
-    if y is not None:
-        assert y_label is not None
-
-    if gdl is None:
-        pos = {c: ss.bp_pos for c, ss in sumstats.split_by_chromosome().items()}
+    if isinstance(input_data, SumstatsTable):
+        pos = {c: ss.bp_pos for c, ss in input_data.split_by_chromosome().items()}
+    elif isinstance(input_data, GWADataLoader):
+        pos = {c: ss.bp_pos for c, ss in input_data.sumstats_table.items()}
     else:
-        pos = {c: ss.bp_pos for c, ss in gdl.sumstats_table.items()}
+        raise ValueError("The input data must be an instance of `SumstatsTable` or `GWADataLoader`.")
 
     starting_pos = 0
     ticks = []
@@ -57,17 +55,14 @@ def manhattan(gdl: Union[GWADataLoader, None] = None,
         # Add bonferroni significance threshold line:
         plt.axhline(-np.log10(0.05 / 1e6), ls='--', zorder=1, color=bonf_line_color)
 
-    if gdl is None:
-        y = {c: ss.log10_p_value for c, ss in sumstats.split_by_chromosome().items()}
+    if isinstance(input_data, SumstatsTable):
+        y = {c: ss.log10_p_value for c, ss in input_data.split_by_chromosome().items()}
     else:
-        y = {c: ss.log10_p_value for c, ss in gdl.sumstats_table.items()}
+        y = {c: ss.log10_p_value for c, ss in input_data.sumstats_table.items()}
 
         y_label = "$-log_{10}$(p-value)"
 
-    if gdl is None:
-        unique_chr = sumstats.chromosomes
-    else:
-        unique_chr = gdl.chromosomes
+    unique_chr = sorted(list(pos.keys()))
 
     for i, c in enumerate(unique_chr):
 
@@ -100,16 +95,15 @@ def manhattan(gdl: Union[GWADataLoader, None] = None,
 
     plt.tight_layout()
 
 
-def qq_plot(gdl: Union[mgp.GWADataLoader, None] = None,
-            sumstats: Union[mgp.SumstatsTable, None] = None,
+def qq_plot(input_data: Union[GWADataLoader, SumstatsTable],
             statistic='p_value'):
     """
     Generate a quantile-quantile (QQ) plot for the GWAS summary statistics.
     The function supports plotting QQ plots for the -log10(p-values) as well
     as the z-score (if available).
 
-    :param gdl: An instance of `GWADataLoader`.
-    :param sumstats: An instance of `SumstatsTable`.
+    :param input_data: An instance of `SumstatsTable` or `GWADataLoader` from which the
+    p-values or z-scores will be extracted.
     :param statistic: The statistic to generate the QQ plot for. We currently support
     `p_value` and `z_score`.
""" @@ -117,12 +111,14 @@ def qq_plot(gdl: Union[mgp.GWADataLoader, None] = None, if statistic == 'p_value': - if gdl is None: - p_val = sumstats.log10_p_value - m = sumstats.m + if isinstance(input_data, SumstatsTable): + p_val = input_data.log10_p_value + m = input_data.m + elif isinstance(input_data, GWADataLoader): + p_val = np.concatenate([ss.log10_p_value for ss in input_data.sumstats_table.values()]) + m = input_data.m else: - p_val = np.concatenate([ss.log10_p_value for ss in gdl.sumstats_table.values()]) - m = gdl.m + raise ValueError("The input data must be an instance of `SumstatsTable` or `GWADataLoader`.") plt.scatter(-np.log10(np.arange(1, m + 1) / m), np.sort(p_val)[::-1]) @@ -132,32 +128,15 @@ def qq_plot(gdl: Union[mgp.GWADataLoader, None] = None, plt.ylabel("Observed $-log_{10}$(p-value)") elif statistic == 'z_score': - if gdl is None: - z_scs = sumstats.z_score + if isinstance(input_data, SumstatsTable): + z_scs = input_data.z_score + elif isinstance(input_data, GWADataLoader): + z_scs = np.concatenate([ss.z_score for ss in input_data.sumstats_table.values()]) else: - z_scs = np.concatenate([ss.z_score for ss in gdl.sumstats_table.values()]) + raise ValueError("The input data must be an instance of `SumstatsTable` or `GWADataLoader`.") stats.probplot(z_scs, dist="norm", plot=plt) plt.show() else: - raise ValueError(f"No QQ plot can be generated for the statistic: {statistic}") - - -def plot_ld_matrix(ldm: mgp.LDMatrix, row_slice=None, col_slice=None, cmap='OrRd'): - """ - Plot a heatmap representing the LD matrix or portions of it. - - :param ldm: An instance of `LDMatrix`. - :param row_slice: A `slice` object indicating which rows to extract from the LD matrix. - :param col_slice: A `slice` object indicating which columns to extract from the LD matrix. - :param cmap: The color map for the LD matrix plot. - """ - - if row_slice is None: - row_slice = slice(ldm.shape[0]) - - if col_slice is None: - col_slice = slice(ldm.shape[0]) + raise NotImplementedError(f"No QQ plot can be generated for the statistic: {statistic}") - plt.imshow(ldm.to_csr_matrix()[row_slice, col_slice].toarray(), cmap=cmap) - plt.colorbar() diff --git a/magenpy/plot/ld.py b/magenpy/plot/ld.py new file mode 100644 index 0000000..c002568 --- /dev/null +++ b/magenpy/plot/ld.py @@ -0,0 +1,41 @@ +from ..LDMatrix import LDMatrix +import matplotlib.pylab as plt +import numpy as np + + +def plot_ld_matrix(ldm: LDMatrix, + row_subset=None, + display='full', + cmap='OrRd', + include_colorbar=True): + """ + Plot a heatmap representing the LD matrix or portions of it. + + :param ldm: An instance of `LDMatrix`. + :param row_subset: A boolean or integer index array for the subset of rows/columns to extract from the LD matrix. + :param display: A string indicating what part of the matrix to display. Can be 'full', 'upper', 'lower'. + If upper, only the upper triangle of the matrix will be displayed. + If lower, only the lower triangle will be displayed. + :param cmap: The color map for the LD matrix plot. + :param include_colorbar: If True, include a colorbar in the plot. + """ + + if row_subset is None: + row_subset = np.arange(ldm.shape[0]) + + # TODO: Figure out a way to do this without loading the entire matrix: + ldm.load(return_symmetric=True, fill_diag=True, dtype='float32') + + mat = ldm.csr_matrix[row_subset, :][:, row_subset].toarray() + + if display == 'upper': + mat = np.triu(mat, k=1) + elif display == 'lower': + mat = np.tril(mat, k=1) + + plt.imshow(mat, cmap=cmap, vmin=-1., vmax=1.) 
+ + if include_colorbar: + plt.colorbar() + + plt.axis('off') diff --git a/magenpy/simulation/AnnotatedGWASimulator.py b/magenpy/simulation/AnnotatedPhenotypeSimulator.py similarity index 82% rename from magenpy/simulation/AnnotatedGWASimulator.py rename to magenpy/simulation/AnnotatedPhenotypeSimulator.py index 90703e0..d25b99c 100644 --- a/magenpy/simulation/AnnotatedGWASimulator.py +++ b/magenpy/simulation/AnnotatedPhenotypeSimulator.py @@ -1,22 +1,31 @@ import numpy as np -from magenpy.simulation.GWASimulator import GWASimulator +from .PhenotypeSimulator import PhenotypeSimulator -class AnnotatedGWASimulator(GWASimulator): +class AnnotatedPhenotypeSimulator(PhenotypeSimulator): """ Simulate complex traits by incorporating genomic functional annotations into the mixture densities that govern the effect size of each variant on the trait. - NOTE: This code is experimental and needs much further validation. + !!! warning + This code is experimental and needs much further validation. + """ def __init__(self, bed_files, **kwargs): + """ + Create an instance of the AnnotatedPhenotypeSimulator class. + + :param bed_files: A list of BED files that contain the genotype data. + :param kwargs: Additional keyword arguments for the PhenotypeSimulator class. + """ + super().__init__(bed_files, **kwargs) # For now, we will restrict to 2 mixture components. - assert self.n_mixtures == 2 + assert self.n_components == 2 self.w_h2 = None # The annotation weights for the per-SNP heritability self.w_pi = None # The annotation weights for the per-SNP causal probability @@ -68,6 +77,9 @@ def simulate_w_pi(self, enrichment=None): self.w_pi = np.log(np.array(enr)) def set_per_snp_heritability(self): + """ + Set the per-SNP heritability values using the annotation weights. + """ if self.w_h2 is None: return super().set_per_snp_heritability() @@ -79,6 +91,9 @@ def set_per_snp_heritability(self): a_min=0., a_max=np.inf) def set_per_snp_mixture_probability(self): + """ + Set the per-SNP mixture probabilities using the annotation weights. + """ if self.w_pi is None: return super().set_per_snp_mixture_probability() @@ -91,6 +106,9 @@ def set_per_snp_mixture_probability(self): self.per_snp_pi[c] = np.array([1. - prob, prob]).T def get_heritability_enrichment(self): + """ + Estimate the enrichment of heritability per annotation. + """ tabs = self.to_true_beta_table(per_chromosome=True) total_heritability = sum([tab['Heritability'].sum() for c, tab in tabs.items()]) diff --git a/magenpy/simulation/MulticohortGWASimulator.py b/magenpy/simulation/MultiCohortPhenotypeSimulator.py similarity index 88% rename from magenpy/simulation/MulticohortGWASimulator.py rename to magenpy/simulation/MultiCohortPhenotypeSimulator.py index e2198db..05156af 100644 --- a/magenpy/simulation/MulticohortGWASimulator.py +++ b/magenpy/simulation/MultiCohortPhenotypeSimulator.py @@ -1,21 +1,18 @@ -""" -Author: Shadi Zabad -Date: March 2021 -""" import pandas as pd import numpy as np -from magenpy import GWADataLoader -from magenpy.simulation.GWASimulator import GWASimulator +from ..GWADataLoader import GWADataLoader +from .PhenotypeSimulator import PhenotypeSimulator -class MulticohortGWASimulator(GWADataLoader): +class MultiCohortPhenotypeSimulator(GWADataLoader): """ - A module for simulating GWAS data for separate cohorts. + A module for simulating GWAS data for separate cohorts or clusters of the data. This includes scenarios such as multi-population or multi-ethnic datasets, or datasets that can be stratified by a discrete variable. 
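A small, self-contained illustration (not magenpy internals) of how the cross-cluster correlation matrix `rho` induces correlated per-variant effect sizes, mirroring the `simulate_beta` logic further down in this diff; all names here are illustrative:

```python
# Draw one effect size per cluster for each variant, with cross-cluster
# covariance given by `rho`.
import numpy as np

rng = np.random.default_rng(7)
n_clusters, n_variants = 3, 10_000
rho = 0.9 * np.ones((n_clusters, n_clusters))
np.fill_diagonal(rho, 1.)

betas = rng.multivariate_normal(np.zeros(n_clusters), cov=rho, size=n_variants)
print(np.corrcoef(betas.T).round(2))  # off-diagonal entries close to 0.9
```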
- NOTE: Incomplete code.. requires more testing. + !!! warning + This code is experimental and needs much further validation. """ @@ -47,7 +44,7 @@ def __init__(self, # Rho can be either a scalar or a matrix that determines the patterns of # correlations between effect sizes in different clusters. - if type(rho) == np.float: + if np.issubdtype(type(rho), np.floating): self.rho = rho*np.ones(shape=(len(self.clusters), len(self.clusters))) np.fill_diagonal(self.rho, 1.) else: @@ -63,9 +60,9 @@ def __init__(self, if self.ref_cluster is None: self.ref_cluster = c - self.cluster_simulators[c] = GWASimulator(bed_files, - keep_samples=self.get_samples_in_cluster(c), - **kwargs) + self.cluster_simulators[c] = PhenotypeSimulator(bed_files, + keep_samples=self.get_samples_in_cluster(c), + **kwargs) @property def clusters(self): @@ -133,7 +130,7 @@ def simulate_beta(self): betas = np.random.multivariate_normal(np.zeros(self.rho.shape[0]), cov=self.rho, size=c_size) for i, c in enumerate(self.clusters): self.cluster_simulators[c].beta[ch] = ( - self.cluster_simulators[c].get_causal_status()[ch].astype(np.int)*betas[:, i] + self.cluster_simulators[c].get_causal_status()[ch].astype(np.int32)*betas[:, i] ) def simulate(self, perform_gwas=False): diff --git a/magenpy/simulation/GWASimulator.py b/magenpy/simulation/PhenotypeSimulator.py similarity index 67% rename from magenpy/simulation/GWASimulator.py rename to magenpy/simulation/PhenotypeSimulator.py index e305f97..fc24d3e 100644 --- a/magenpy/simulation/GWASimulator.py +++ b/magenpy/simulation/PhenotypeSimulator.py @@ -1,35 +1,64 @@ -""" -Author: Shadi Zabad -Date: March 2021 -""" import warnings import numpy as np import pandas as pd -from magenpy.GWADataLoader import GWADataLoader +from ..GWADataLoader import GWADataLoader -class GWASimulator(GWADataLoader): +class PhenotypeSimulator(GWADataLoader): + """ + A wrapper class that supports simulating complex traits with a variety of + genetic architectures and heritability values, using the standard linear model. The + basic implementation supports simulating effect sizes from a sparse Gaussian mixture density, + allowing some variants to have larger effects than others. The class also supports simulating + binary phenotypes (case-control) by thresholding the continuous phenotype at a specified threshold. + + To be concrete, the generative model for the simulation is as follows: + + 1) Simulate the mixture assignment for each variant based on the mixing proportions `pi`. + 2) Simulate the effect sizes for each variant from the corresponding Gaussian density that they were assigned. + 3) Compute the polygenic score for each individual based on the simulated effect sizes. + 4) Simulate the residual component of the phenotype, in such a way that the total heritability is preserved. + + !!! seealso "See Also" + * [GWADataLoader][magenpy.GWADataLoader.GWADataLoader] + + :ivar pi: The mixing proportions for the Gaussian mixture density. + :ivar h2: The trait SNP heritability, or proportion of variance explained by SNPs. + :ivar d: The variance multipliers for each component of the Gaussian mixture density. + :ivar prevalence: The (disease) prevalence for binary (case-control) phenotypes. + :ivar per_snp_h2: The per-SNP heritability for each variant in the dataset. + :ivar per_snp_pi: The per-SNP mixing proportions for each variant in the dataset. + :ivar beta: The effect sizes for each variant in the dataset. + :ivar mixture_assignment: The assignment of each variant to a mixture component. 
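A hedged usage sketch for the simulator; the BED file path is a placeholder and the phenotype accessor is an assumption based on the `GWADataLoader` interface:

```python
# Simulate a phenotype with h2=0.2 where ~1% of variants are causal.
# The path and the .sample_table.phenotype accessor are assumptions.
from magenpy.simulation.PhenotypeSimulator import PhenotypeSimulator

sim = PhenotypeSimulator("data/chr22.bed", h2=0.2, pi=0.01)
sim.simulate()  # draws mixture assignments, effect sizes, and phenotypes
phenotype = sim.sample_table.phenotype
```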
+
+    """
 
     def __init__(self,
                  bed_files,
                  h2=0.2,
-                 pi=(0.9, 0.1),
+                 pi=0.1,
                  d=(0., 1.),
                  prevalence=0.15,
                  **kwargs):
         """
-        Simulate phenotypes using the linear additive model.
+        Initialize the PhenotypeSimulator object with the necessary parameters.
 
-        :param bed_files: A path (or list of paths) to PLINK BED files.
+        :param bed_files: A path (or list of paths) to PLINK BED files containing the genotype information.
         :param h2: The trait SNP heritability, or proportion of variance explained by SNPs.
-        :param pi: The mixture proportions for Gaussian mixture density.
-        :param d: The variance multipliers for each component of the Gaussian mixture density.
+        :param pi: The mixing proportions for the mixture of Gaussians (our model for the distribution of effect sizes).
+        If a float is provided, it is converted to a tuple (1-pi, pi), where pi is the proportion of causal variants.
+        :param d: The variance multipliers for each component of the Gaussian mixture density. By default,
+        the first (null) component has a zero multiplier and the second (causal) component has a unit multiplier.
         :param prevalence: The (disease) prevalence for binary (case-control) phenotypes.
         """
 
         super().__init__(bed_files, **kwargs)
 
+        # If pi is float, convert it to a tuple:
+        if isinstance(pi, float):
+            pi = (1. - pi, pi)
+
         self.pi = pi
         self.h2 = h2
         self.prevalence = prevalence
@@ -47,7 +76,10 @@ def __init__(self,
         self.mixture_assignment = None
 
     @property
-    def n_mixtures(self):
+    def n_components(self):
+        """
+        :return: The number of Gaussian mixture components for the effect size distribution.
+        """
         return len(self.pi)
 
     def set_pi(self, new_pi):
@@ -67,7 +99,8 @@ def set_h2(self, new_h2):
 
     def set_per_snp_mixture_probability(self):
         """
-        Set the per-SNP mixture probability for each variant in the dataset.
+        Set the per-SNP mixing proportions for each variant in the dataset.
+        This is a convenience method that may come in handy for more flexible generative models.
         """
 
         self.per_snp_pi = {}
@@ -77,7 +110,8 @@ def __init__(self,
 
     def set_per_snp_heritability(self):
         """
-        Set the per-SNP heritability for each variant in the dataset.
+        Set the per-SNP heritability (effect size variance) for each variant in the dataset.
+        This is a convenience method that may come in handy for more flexible generative models.
         """
 
         assert self.mixture_assignment is not None
@@ -97,8 +131,11 @@ def set_per_snp_heritability(self):
 
     def get_causal_status(self):
         """
-        This method returns a dictionary where the keys are the chromosome numbers
-        and the values are of binary vectors indicating which SNPs are causal for the phenotype.
+        :return: A dictionary where the keys are the chromosome numbers
+        and the values are binary vectors indicating which SNPs are
+        causal for the simulated phenotype.
+
+        :raises AssertionError: If the mixture assignment is not set.
         """
 
         assert self.mixture_assignment is not None
@@ -118,12 +155,14 @@ def get_causal_status(self):
 
     def set_causal_snps(self, causal_snps):
         """
-        A utility method to set the causal SNPs in the simulation based on an array or
-        list of SNPs specified by the user. The method takes an iterable of `causal_snps`
+        A utility method to set the causal variants in the simulation based on an array or
+        list of SNPs specified by the user. The method takes an iterable (e.g. list or array) of `causal_snps`
         and then creates a new mixture assignment object where only the `causal_snps`
         contribute to the phenotype.
 
-        :param causal_snps: A list or array of SNP IDs.
+        :param causal_snps: A list or array of SNP rsIDs.
+
+        :raises ValueError: If all mixture components are causal.
+
         """
 
         # Get the index of the mixture component whose multiplier is zero (i.e. the null component):
@@ -141,9 +180,9 @@ def set_causal_snps(self, causal_snps):
             pis /= pis.sum()
 
         # Initialize new mixture assignment object:
-        new_assignment = {c: np.zeros((s, self.n_mixtures)) for c, s in self.shapes.items()}
+        new_assignment = {c: np.zeros((s, self.n_components)) for c, s in self.shapes.items()}
 
-        from magenpy.utils.compute_utils import intersect_arrays
+        from ..utils.compute_utils import intersect_arrays
 
         n_causal_set = 0
 
@@ -169,21 +208,23 @@ def set_causal_snps(self, causal_snps):
 
     def set_mixture_assignment(self, new_assignment):
         """
-        Set the mixture assignments according to user-provided dictionary.
+        Set the mixture assignments according to user-provided dictionary. The mixture
+        assignment indicates which mixture component the effect size of a particular
+        variant comes from.
 
         :param new_assignment: A dictionary where the keys are the chromosomes and the values
         are the mixture assignment for each SNP on that chromosome.
         """
         # Check that the shapes match pre-specified information:
         for c, c_size in self.shapes.items():
-            assert new_assignment[c].shape == (c_size, self.n_mixtures)
+            assert new_assignment[c].shape == (c_size, self.n_components)
 
         self.mixture_assignment = new_assignment
 
     def simulate_mixture_assignment(self):
         """
         Simulate assigning SNPs to the various mixture components
-        with probabilities given by `pi`.
+        with probabilities given by mixing proportions `pi`.
         """
 
         if self.per_snp_pi is None or len(self.per_snp_pi) < 1:
@@ -191,7 +232,7 @@ def simulate_mixture_assignment(self):
 
         self.mixture_assignment = {}
 
-        from magenpy.utils.model_utils import multinomial_rvs
+        from ..utils.model_utils import multinomial_rvs
 
         for c, c_size in self.shapes.items():
 
@@ -201,9 +242,10 @@ def simulate_mixture_assignment(self):
 
     def set_beta(self, new_beta):
         """
-        Set the beta according to user-provided dictionary.
+        Set the variant effect sizes (beta) according to user-provided dictionary.
+
         :param new_beta: A dictionary where the keys are the chromosomes and
-        the values are the beta for each SNP on that chromosome.
+        the values are the beta (effect size) for each SNP on that chromosome.
         """
 
         # Check that the shapes match pre-specified information:
@@ -214,8 +256,10 @@ def set_beta(self, new_beta):
 
     def simulate_beta(self):
         """
-        Simulate the causal effect size for the variants included
-        in the dataset.
+        Simulate the causal effect size for variants included
+        in the dataset. Here, the variant effect size is drawn from
+        a Gaussian density with mean zero and scale given by
+        the square root of the per-SNP heritability.
         """
 
         if self.per_snp_h2 is None or len(self.per_snp_h2) < 1:
@@ -231,10 +275,17 @@ def simulate_beta(self):
 
         return self.beta
 
-    def simulate_phenotypes(self):
+    def simulate_phenotype(self):
         """
-        Simulate complex phenotypes for the samples, given their genotype information and
-        fixed effect sizes `beta` that were simulated previously.
+        Simulate complex phenotypes for the samples present in the genotype matrix, given their
+        genotype information and fixed effect sizes `beta` that were simulated in previous steps.
+
+        Given the simulated effect sizes, the phenotype is generated as follows:
+
+        `Y = XB + e`
+
+        Where `Y` is the vector of phenotypes, `X` is the genotype matrix, `B` is the vector of effect sizes,
+        and `e` represents the residual effects.
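A self-contained numpy illustration of this generative model; a simplified sketch under a standardized-genotype assumption, not the class's internal code:

```python
# Steps mirror the model above: mixture assignment, effect sizes,
# polygenic score, and a residual scaled to preserve total h2.
import numpy as np

rng = np.random.default_rng(0)
n, m, h2, pi = 1000, 500, 0.2, 0.1  # samples, variants, heritability, causal fraction

x = rng.binomial(2, 0.3, size=(n, m)).astype(float)
x = (x - x.mean(axis=0)) / x.std(axis=0)  # column-standardized genotypes

causal = rng.random(m) < pi                          # step 1: mixture assignment
beta = np.zeros(m)
beta[causal] = rng.normal(0., np.sqrt(h2 / causal.sum()), size=causal.sum())  # step 2

g = x @ beta                                         # step 3: polygenic score
e = rng.normal(0., np.sqrt(1. - h2), size=n)         # step 4: residual component
y = g + e                                            # Y = XB + e
```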
""" assert self.beta is not None @@ -270,7 +321,10 @@ def simulate_phenotypes(self): return new_y - def simulate(self, reset_beta=True, reset_mixture_assignment=True, perform_gwas=False): + def simulate(self, + reset_beta=True, + reset_mixture_assignment=True, + perform_gwas=False): """ A convenience method to simulate all the components of the generative model. Specifically, the simulation follows the standard linear model, where the phenotype is @@ -303,7 +357,7 @@ def simulate(self, reset_beta=True, reset_mixture_assignment=True, perform_gwas= self.simulate_beta() # Simulate the phenotype - self.simulate_phenotypes() + self.simulate_phenotype() if perform_gwas: # Perform genome-wide association testing... @@ -311,9 +365,10 @@ def simulate(self, reset_beta=True, reset_mixture_assignment=True, perform_gwas= def to_true_beta_table(self, per_chromosome=False): """ - Export the simulated true effect sizes and causal status - into a pandas table. - :param per_chromosome: If True, return a dictionary of tables for each chromosome. + Export the simulated true effect sizes and causal status into a pandas dataframe. + :param per_chromosome: If True, return a dictionary of tables for each chromosome separately. + + :return: A pandas DataFrame with the true effect sizes and causal status for each variant. """ assert self.beta is not None diff --git a/magenpy/stats/gwa/utils.py b/magenpy/stats/gwa/utils.py index 6131028..bc63606 100644 --- a/magenpy/stats/gwa/utils.py +++ b/magenpy/stats/gwa/utils.py @@ -3,30 +3,30 @@ import pandas as pd import numpy as np import warnings -import magenpy as mgp +from ...GWADataLoader import GWADataLoader +from ...SumstatsTable import SumstatsTable +from ..transforms.phenotype import chained_transform -def inflation_factor(gdl: Union[mgp.GWADataLoader, None] = None, - sumstats: Union[mgp.SumstatsTable, None] = None, - chisq=None): +def inflation_factor(sumstats_input: Union[GWADataLoader, SumstatsTable, np.array]): """ Compute the genomic control (GC) inflation factor (also known as lambda) from GWAS summary statistics. The inflation factor can be used to detect and correct inflation in the test statistics. - :param chisq: An array of chi-squared statistics to compute the inflation factor from. - :param gdl: A `GWADataLoader` object, with summary statistics initilized into the `sumstats_table` property. - :param sumstats: A `SumstatsTable` object, with the GWAS summary statistics load and initialized. - """ + :param sumstats_input: The input can be one of three classes of objects: A GWADataLoader object, + a SumstatsTable object, or a numpy array of chi-squared statistics to compute the inflation factor. - assert chisq is not None or gdl is not None or sumstats is not None + :return: The inflation factor (lambda) computed from the chi-squared statistics. 
+    """
 
-    if chisq is None:
-        if gdl is not None:
-            chisq = np.concatenate([ss.get_chisq_statistic() for ss in gdl.sumstats_table.values()])
-        else:
-            chisq = sumstats.get_chisq_statistic()
+    if isinstance(sumstats_input, GWADataLoader):
+        chisq = np.concatenate([ss.get_chisq_statistic() for ss in sumstats_input.sumstats_table.values()])
+    elif isinstance(sumstats_input, SumstatsTable):
+        chisq = sumstats_input.get_chisq_statistic()
+    else:
+        chisq = sumstats_input
 
     from scipy.stats import chi2
 
@@ -35,11 +35,26 @@
 def perform_gwa_plink2(genotype_matrix,
                        temp_dir='temp',
-                       standardize_phenotype=True,
-                       include_covariates=True):
+                       **phenotype_transform_kwargs):
+    """
+
+    Perform genome-wide association testing using plink 2.0.
+    This function takes a GenotypeMatrix object and GWAS-related flags and
+    calls plink to perform GWA on the genotype and phenotype data referenced
+    by the GenotypeMatrix object.
 
-    from magenpy.GenotypeMatrix import plinkBEDGenotypeMatrix
-    from magenpy.utils.executors import plink2Executor
+    :param genotype_matrix: A plinkBEDGenotypeMatrix object.
+    :param temp_dir: Path to a directory where we keep intermediate temporary files from plink.
+    :param phenotype_transform_kwargs: Keyword arguments to pass to the `chained_transform` function. These arguments
+    include the following options to transform the phenotype before performing GWAS:
+    `adjust_covariates`, `standardize_phenotype`, `rint_phenotype`, and `outlier_sigma_threshold`. NOTE: These
+    transformations are only applied to continuous phenotypes (`likelihood='gaussian'`).
+
+    :return: A SumstatsTable object containing the summary statistics from the association tests.
+    """
+
+    from ...GenotypeMatrix import plinkBEDGenotypeMatrix
+    from ...utils.executors import plink2Executor
 
     assert isinstance(genotype_matrix, plinkBEDGenotypeMatrix)
 
@@ -51,17 +66,30 @@ def perform_gwa_plink2(genotype_matrix,
         warnings.warn("The phenotype likelihood is not specified! "
                       "Assuming that the phenotype is continuous...")
 
-    # Output phenotype table:
-    phe_fname = osp.join(temp_dir, "pheno.txt")
+    # Transform the phenotype:
+    phenotype, mask = chained_transform(s_table, **phenotype_transform_kwargs)
+
+    # Prepare the phenotype table to pass to plink:
     phe_table = s_table.get_phenotype_table()
+
+    # If the likelihood is binomial, transform the phenotype into
+    # plink's coding for case/control (1/2) rather than (0/1).
    if s_table.phenotype_likelihood == 'binomial':
         phe_table['phenotype'] += 1
+    else:
+        phe_table = phe_table.loc[mask, :]
+        phe_table['phenotype'] = phenotype
+
+    # Output phenotype table:
+    phe_fname = osp.join(temp_dir, "pheno.txt")
     phe_table.to_csv(phe_fname, sep="\t", index=False, header=False)
 
     # Process covariates:
-    if include_covariates and s_table.covariates is not None:
+    if s_table.phenotype_likelihood == 'binomial' and 'adjust_covariates' in phenotype_transform_kwargs and \
+            phenotype_transform_kwargs['adjust_covariates']:
+
         covar_fname = osp.join(temp_dir, "covar.txt")
-        covar = s_table.get_covariates_table()
+        covar = s_table.get_covariates_table().loc[mask, :]
         covar.to_csv(covar_fname, sep="\t", index=False, header=False)
         covar_modifier = ''
     else:
@@ -72,13 +100,13 @@ def perform_gwa_plink2(genotype_matrix,
     plink_reg_type = ['linear', 'logistic'][s_table.phenotype_likelihood == 'binomial']
 
     # Output subset of SNPs to perform association tests on:
-    snp_keepfile = osp.join(temp_dir, f"variants.keep")
+    snp_keepfile = osp.join(temp_dir, "variants.keep")
     pd.DataFrame({'SNP': genotype_matrix.snps}).to_csv(
         snp_keepfile, index=False, header=False
     )
 
     # Output file:
-    plink_output = osp.join(temp_dir, f"output")
+    plink_output = osp.join(temp_dir, "output")
 
     cmd = [
         f"--bfile {genotype_matrix.bed_file}",
@@ -88,9 +116,6 @@ def perform_gwa_plink2(genotype_matrix,
         f"--out {plink_output}"
     ]
 
-    if standardize_phenotype and plink_reg_type == 'linear':
-        cmd.append('--variance-standardize')
-
     if covar_fname is not None:
         cmd.append(f'--covar {covar_fname}')
 
@@ -105,64 +130,194 @@ def perform_gwa_plink2(genotype_matrix,
         raise FileNotFoundError
 
     # Read the summary statistics file from plink:
-    ss_table = mgp.SumstatsTable.from_file(output_fname, sumstats_format='plink')
+    ss_table = SumstatsTable.from_file(output_fname, sumstats_format='plink2')
 
     # Make sure that the effect allele is encoded properly:
     ss_table.match(genotype_matrix.snp_table, correct_flips=True)
 
     return ss_table
 
 
-def perform_gwa_plink1p9(genotype_matrix, standardize_phenotype=False):
+def perform_gwa_plink1p9(genotype_matrix,
+                         temp_dir='temp',
+                         **phenotype_transform_kwargs):
     """
-    TODO: Add support for performing association testing with plink1.9
+    Perform genome-wide association testing using plink 1.9.
+    This function takes a GenotypeMatrix object and GWAS-related flags and
+    calls plink to perform GWA on the genotype and phenotype data referenced
+    by the GenotypeMatrix object.
+
+    :param genotype_matrix: A plinkBEDGenotypeMatrix object.
+    :param temp_dir: Path to a directory where we keep intermediate temporary files from plink.
+    :param phenotype_transform_kwargs: Keyword arguments to pass to the `chained_transform` function. These arguments
+    include the following options to transform the phenotype before performing GWAS:
+    `adjust_covariates`, `standardize_phenotype`, `rint_phenotype`, and `outlier_sigma_threshold`. NOTE: These
+    transformations are only applied to continuous phenotypes (`likelihood='gaussian'`).
+
+    :return: A SumstatsTable object containing the summary statistics from the association tests.
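A hedged end-to-end sketch for the plink-backed functions above; the file paths and the `GWADataLoader` constructor arguments shown are illustrative assumptions:

```python
# Run a plink 1.9 association test on one chromosome. The
# standardize_phenotype flag is forwarded to chained_transform.
import magenpy as mgp
from magenpy.stats.gwa.utils import perform_gwa_plink1p9

gdl = mgp.GWADataLoader("data/chr22.bed", phenotype_file="data/pheno.txt", backend="plink")
g_mat = gdl.genotype[22]  # a plinkBEDGenotypeMatrix for chromosome 22
ss = perform_gwa_plink1p9(g_mat, temp_dir="temp", standardize_phenotype=True)
# `ss` is a SumstatsTable with alleles matched to the genotype matrix.
```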
""" - #assert isinstance(genotype_matrix, plinkBEDGenotypeMatrix) - raise NotImplementedError + from ...GenotypeMatrix import plinkBEDGenotypeMatrix + from ...utils.executors import plink1Executor + + assert isinstance(genotype_matrix, plinkBEDGenotypeMatrix) + + plink1 = plink1Executor() + + s_table = genotype_matrix.sample_table + + if s_table.phenotype_likelihood is None: + warnings.warn("The phenotype likelihood is not specified! " + "Assuming that the phenotype is continuous...") + + # Transform the phenotype: + phenotype, mask = chained_transform(s_table, **phenotype_transform_kwargs) + + # Prepare the phenotype table to pass to plink: + phe_table = s_table.get_phenotype_table() + + # If the likelihood is binomial, transform the phenotype into + # plink's coding for case/control (1/2) rather than (0/1). + if s_table.phenotype_likelihood == 'binomial': + phe_table['phenotype'] += 1 + else: + phe_table = phe_table.loc[mask, :] + phe_table['phenotype'] = phenotype + + # Output phenotype table: + phe_fname = osp.join(temp_dir, "pheno.txt") + phe_table.to_csv(phe_fname, sep="\t", index=False, header=False) + + # Process covariates: + if s_table.phenotype_likelihood == 'binomial' and 'adjust_covariates' in phenotype_transform_kwargs and \ + phenotype_transform_kwargs['adjust_covariates']: + + covar_fname = osp.join(temp_dir, "covar.txt") + covar = s_table.get_covariates_table().loc[mask, :] + covar.to_csv(covar_fname, sep="\t", index=False, header=False) + else: + covar_fname = None + + # Determine regression type based on phenotype likelihood: + plink_reg_type = ['linear', 'logistic'][s_table.phenotype_likelihood == 'binomial'] + + # Output subset of SNPs to perform association tests on: + snp_keepfile = osp.join(temp_dir, "variants.keep") + pd.DataFrame({'SNP': genotype_matrix.snps}).to_csv( + snp_keepfile, index=False, header=False + ) + + # Output file: + plink_output = osp.join(temp_dir, "output") + + cmd = [ + f"--bfile {genotype_matrix.bed_file}", + f"--extract {snp_keepfile}", + f"--{plink_reg_type} hide-covar", + f"--pheno {phe_fname}", + f"--out {plink_output}" + ] + + if covar_fname is not None: + cmd.append(f'--covar {covar_fname}') + + plink1.execute(cmd) + + output_fname = plink_output + f".PHENO1.assoc.{plink_reg_type}" + + if not osp.isfile(output_fname): + if plink_reg_type == 'logistic' and osp.isfile(output_fname + ".hybrid"): + output_fname += ".hybrid" + else: + raise FileNotFoundError + + # Read the summary statistics file from plink: + ss_table = SumstatsTable.from_file(output_fname, sumstats_format='plink1.9') + # Infer the reference allele: + ss_table.infer_a2(genotype_matrix.snp_table) + + # Make sure that the effect allele is encoded properly: + ss_table.match(genotype_matrix.snp_table, correct_flips=True) + + return ss_table def perform_gwa_xarray(genotype_matrix, - standardize_genotype=True, - standardize_phenotype=True): + standardize_genotype=False, + **phenotype_transform_kwargs): + """ + Perform genome-wide association testing using xarray and the PyData ecosystem. + This function takes a GenotypeMatrix object and gwas-related flags and + calls performs (simple) GWA on the genotype and phenotype data referenced + by the GenotypeMatrix object. This function only implements GWA testing for + continuous phenotypes. For other functionality (e.g. case-control GWAS), + please use `plink` as a backend or consult other GWAS software (e.g. GCTA or REGENIE). + + :param genotype_matrix: A GenotypeMatrix object. 
+ :param standardize_genotype: If True, the genotype matrix will be standardized such that the columns (i.e. SNPs) + have zero mean and unit variance. + :param phenotype_transform_kwargs: Keyword arguments to pass to the `chained_transform` function. These arguments + include the following options to transform the phenotype before performing GWAS: + `adjust_covariates`, `standardize_phenotype`, `rint_phenotype`, and `outlier_sigma_threshold`. NOTE: These + transformations are only applied to continuous phenotypes (`likelihood='gaussian'`). + + :return: A SumstatsTable object containing the summary statistics from the association tests. + """ - from magenpy.GenotypeMatrix import xarrayGenotypeMatrix + # Sanity checks: + + # Check that the genotype matrix is an xarrayGenotypeMatrix object. + from ...GenotypeMatrix import xarrayGenotypeMatrix assert isinstance(genotype_matrix, xarrayGenotypeMatrix) + # Check that the phenotype likelihood is set correctly and that the phenotype is continuous. if genotype_matrix.sample_table.phenotype_likelihood is None: warnings.warn("The phenotype likelihood is not specified! " "Assuming that the phenotype is continuous...") elif genotype_matrix.sample_table.phenotype_likelihood == 'binomial': - raise Exception("The xarray backend does not support performing association " - "testing on binary (case-control) phenotypes!") + raise ValueError("The xarray backend currently does not support performing association " + "testing on binary (case-control) phenotypes! Try setting the backend to `plink` or " + "use external software (e.g. GCTA or REGENIE) for performing GWAS.") + + # ----------------------------------------------------------- + # Get the SNP table from the genotype_matrix object: sumstats_table = genotype_matrix.get_snp_table( ['CHR', 'SNP', 'POS', 'A1', 'A2', 'N', 'MAF'] ) - phenotype = genotype_matrix.sample_table.phenotype + # ----------------------------------------------------------- + + # Transform the phenotype: + phenotype, mask = chained_transform(genotype_matrix.sample_table, **phenotype_transform_kwargs) - if standardize_phenotype: - from ..transforms.phenotype import standardize - phenotype = standardize(phenotype) + # TODO: Figure out how to adjust the per-variant sample size based on the mask! 
+ # Estimate the phenotypic variance: sigma_sq_y = np.var(phenotype) + # ----------------------------------------------------------- + # Perform association testing using closed-form solutions: + + # Apply the mask to the genotype matrix: + xr_mat = genotype_matrix.xr_mat[mask, :] + if standardize_genotype: from ..transforms.genotype import standardize - sumstats_table['BETA'] = np.dot(standardize(genotype_matrix.xr_mat).T, phenotype) / sumstats_table['N'].values + sumstats_table['BETA'] = np.dot(standardize(xr_mat).T, phenotype) / sumstats_table['N'].values sumstats_table['SE'] = np.sqrt(sigma_sq_y / sumstats_table['N'].values) else: sumstats_table['BETA'] = ( - np.dot(genotype_matrix.xr_mat.fillna(sumstats_table['MAF'].values).T, phenotype) / + np.dot(xr_mat.fillna(sumstats_table['MAF'].values).T, phenotype) / sumstats_table['N'].values * genotype_matrix.maf_var ) sumstats_table['SE'] = np.sqrt(sigma_sq_y / (sumstats_table['N'].values * genotype_matrix.maf_var)) - ss_table = mgp.SumstatsTable(sumstats_table) + ss_table = SumstatsTable(sumstats_table) + # Trigger computing z-score and p-values from the BETA and SE columns: _, _ = ss_table.z_score, ss_table.pval return ss_table diff --git a/magenpy/stats/h2/ldsc.py b/magenpy/stats/h2/ldsc.py index b632f42..150d36c 100644 --- a/magenpy/stats/h2/ldsc.py +++ b/magenpy/stats/h2/ldsc.py @@ -1,8 +1,8 @@ import numpy as np -import magenpy as mgp +from ...GWADataLoader import GWADataLoader -def simple_ldsc(gdl: mgp.GWADataLoader): +def simple_ldsc(gdl: GWADataLoader): """ Provides an estimate of SNP heritability from summary statistics using a simplified version of the LD Score Regression framework. @@ -10,10 +10,10 @@ def simple_ldsc(gdl: mgp.GWADataLoader): Where the response is the Chi-Squared statistic for SNP j and the variable is its LD score. - NOTE: For now, we constrain the slope to 1. - :param gdl: An instance of `GWADataLoader` with the LD information and summary statistics initialized properly. + + :return: The estimated SNP heritability. """ # Check data types: @@ -36,44 +36,28 @@ def simple_ldsc(gdl: mgp.GWADataLoader): class LDSCRegression(object): + """ + Perform LD Score Regression using the jackknife method. + """ - def __init__(self, gdl: mgp.GWADataLoader, n_blocks=200, max_chisq=None): + def __init__(self, gdl: GWADataLoader, n_blocks=200, max_chisq=None): """ - Incomplete... + :param gdl: An instance of GWADataLoader + :param n_blocks: The number of blocks to use for the jackknife method. + :param max_chisq: The maximum Chi-Squared statistic to consider. """ self.gdl = gdl self.n_blocks = n_blocks - # Extract the data from the GDL object: - - chroms = self.gdl.chromosomes - - if self.gdl.annotation is not None: - self.ld_scores = np.concatenate([ - self.gdl.ld[c].compute_ld_scores( - annotations=self.gdl.annotation[c].values(add_intercept=True) - ) - for c in chroms - ]) - else: - self.ld_scores = np.concatenate([self.gdl.ld[c].ld_score.reshape(-1, 1) for c in chroms]) - - self.chisq = np.concatenate([self.gdl.sumstats_table[c].get_chisq_statistic() for c in chroms]) - self.n = np.concatenate([self.gdl.sumstats_table[c].n_per_snp for c in chroms]) - - if max_chisq is None: - max_chisq = max(0.001*self.n.max(), 80) - - chisq_cond = self.chisq < max_chisq - - self.ld_scores = self.ld_scores[chisq_cond, :] - self.chisq = self.chisq[chisq_cond] - self.n = self.n[chisq_cond] + # ... def fit(self): """ - TODO: Implement the jackknife estimator here... + Perform LD Score Regression estimation using the jackknife method. 
+ + :raises NotImplementedError: If method is not implemented. """ - pass + + raise NotImplementedError diff --git a/magenpy/stats/ld/c_utils.pyx b/magenpy/stats/ld/c_utils.pyx index 081a11d..b61b44d 100644 --- a/magenpy/stats/ld/c_utils.pyx +++ b/magenpy/stats/ld/c_utils.pyx @@ -9,106 +9,192 @@ # cython: infer_types=True from libc.math cimport exp +from cython cimport integral +cimport cython import numpy as np -cimport numpy as np -def zarr_islice(arr, start=None, end=None): - +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +@cython.exceptval(check=False) +cpdef filter_ut_csr_matrix_low_memory(integral[::1] indptr, char[::1] bool_mask): """ - This is copied from the official, but not yet released implementation of - i_slice in Zarr codebase: - https://github.com/zarr-developers/zarr-python/blob/e79e75ca8f07c95a5deede51f7074f699aa41149/zarr/core.py#L463 - :param arr: A Zarr array - :param start: Start index - :param end: End index + This is a utility function to generate a mask with the purpose of filtering + the data array of upper-triangular CSR matrices. The function also generates a new + indptr array that reflects the filter requested by the user. + + The reason we have this implementation is to avoid row/column filtering with + scipy's native functionality for CSR matrices, which involves using the `indices` + array, which can take substantial amounts of memory that is not needed for + matrices that have special structure, such as Linkage-Disequilibrium matrices. + + :param indptr: The index pointer array for the CSR matrix to be filtered. + :param bool_mask: A boolean mask of 0s and 1s represented as int8. """ - if len(arr.shape) == 0: - # Same error as numpy - raise TypeError("iteration over a 0-d array") - if start is None: - start = 0 - if end is None or end > arr.shape[0]: - end = arr.shape[0] - cdef unsigned int j, chunk_size = arr.chunks[0] - chunk = None + cdef: + long i, curr_row, row_bound, new_indptr_idx = 1, curr_shape=indptr.shape[0] - 1 + long[::1] new_indptr = np.zeros(np.count_nonzero(bool_mask) + 1, dtype=np.int64) + char[::1] data_mask = np.zeros(indptr[curr_shape], dtype=np.int8) + + with nogil: + # For each row in the current matrix: + for curr_row in range(curr_shape): + + # If the row is to be included in the new matrix: + if bool_mask[curr_row]: + + # Useful quantity to convert the data array index `i` to the + # equivalent row index in the `bool` mask: + row_bound = curr_row - indptr[curr_row] + 1 + + # For the new indptr array, copy the value from the previous row: + new_indptr[new_indptr_idx] = new_indptr[new_indptr_idx - 1] + + # For each entry for this row in the data array + for i in range(indptr[curr_row], indptr[curr_row + 1]): + + # If the entry isn't filtered, make sure it's included in the new matrix + # And increase the `indptr` by one unit: + if bool_mask[row_bound + i]: + data_mask[i] = 1 + new_indptr[new_indptr_idx] += 1 + + new_indptr_idx += 1 - for j in range(start, end): - if j % chunk_size == 0: - chunk = arr[j: j + chunk_size] - elif chunk is None: - chunk_start = j - j % chunk_size - chunk_end = chunk_start + chunk_size - chunk = arr[chunk_start:chunk_end] - yield chunk[j % chunk_size] + return np.asarray(data_mask).astype(bool), np.asarray(new_indptr) +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +@cython.exceptval(check=False) +cpdef expand_ranges(integral[::1] start, integral[::1] end, long output_size): + """ + Given a set of start and end indices, expand them into one 
long vector that contains + the indices between all start and end positions. + + :param start: A vector with the start indices. + :param end: A vector with the end indices. + :param output_size: The size of the output vector (equivalent to the sum of the lengths + of all ranges). + """ + cdef: + integral i, j, size=start.shape[0] + long out_idx = 0 + integral[::1] output + + if integral is int: + output = np.empty(output_size, dtype=np.int32) + else: + output = np.empty(output_size, dtype=np.int64) + + with nogil: + for i in range(size): + for j in range(start[i], end[i]): + output[out_idx] = j + out_idx += 1 + + return np.asarray(output) + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +@cython.cdivision(True) +@cython.exceptval(check=False) cpdef find_ld_block_boundaries(long[:] pos, long[:, :] block_boundaries): + """ + Find the LD boundaries for the blockwise estimator of LD, i.e., the + indices of the leftmost and rightmost neighbors for each SNP. + + :param pos: A vector with the position of each genetic variant. + :param block_boundaries: A matrix with the boundaries of each LD block. + """ - cdef unsigned int i, j, ldb_idx, block_start, block_end, B = len(block_boundaries), M = len(pos) - cdef long[:] v_min = np.zeros_like(pos, dtype=np.int) - cdef long[:] v_max = M*np.ones_like(pos, dtype=np.int) + cdef: + int i, j, ldb_idx, block_start, block_end, B = block_boundaries.shape[0], M = pos.shape[0] + long[:] v_min = np.zeros_like(pos, dtype=np.int64) + long[:] v_max = M*np.ones_like(pos, dtype=np.int64) - for i in range(M): + with nogil: + for i in range(M): - # Find the positional boundaries for SNP i: - for ldb_idx in range(B): - if block_boundaries[ldb_idx, 0] <= pos[i] < block_boundaries[ldb_idx, 1]: - block_start, block_end = block_boundaries[ldb_idx, 0], block_boundaries[ldb_idx, 1] - break + # Find the positional boundaries for SNP i: + for ldb_idx in range(B): + if block_boundaries[ldb_idx, 0] <= pos[i] < block_boundaries[ldb_idx, 1]: + block_start, block_end = block_boundaries[ldb_idx, 0], block_boundaries[ldb_idx, 1] + break - for j in range(i, M): - if pos[j] >= block_end: - v_max[i] = j - break + for j in range(i, M): + if pos[j] >= block_end: + v_max[i] = j + break - for j in range(i, -1, -1): - if pos[j] < block_start: - v_min[i] = j + 1 - break + for j in range(i, -1, -1): + if pos[j] < block_start: + v_min[i] = j + 1 + break return np.array((v_min, v_max)) +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +@cython.cdivision(True) +@cython.exceptval(check=False) cpdef find_windowed_ld_boundaries(double[:] pos, double max_dist): + """ + Find the LD boundaries for the windowed estimator of LD, i.e., the + indices of the leftmost and rightmost neighbors for each SNP. + + :param pos: A vector with the position of each genetic variant. + :param max_dist: The maximum distance between SNPs to consider them neighbors. 
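For intuition, the same windowed boundary search can be expressed with numpy's `searchsorted`; a sketch for illustration only (the Cython loops above avoid the extra allocations and run without the GIL):

```python
# v_min[i] is the first index with pos >= pos[i] - max_dist;
# v_max[i] is the first index with pos > pos[i] + max_dist.
import numpy as np

pos = np.array([0.1, 0.5, 0.9, 2.0, 2.1])  # sorted positions (e.g. in cM)
max_dist = 1.0
v_min = np.searchsorted(pos, pos - max_dist, side='left')
v_max = np.searchsorted(pos, pos + max_dist, side='right')
print(np.array((v_min, v_max)))
```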
+ """ - cdef unsigned int i, j, M = len(pos) - cdef long[:] v_min = np.zeros_like(pos, dtype=np.int) - cdef long[:] v_max = M*np.ones_like(pos, dtype=np.int) + cdef: + int i, j, M = pos.shape[0] + long[:] v_min = np.zeros_like(pos, dtype=np.int64) + long[:] v_max = M*np.ones_like(pos, dtype=np.int64) - for i in range(M): + with nogil: + for i in range(M): - for j in range(i, M): - if pos[j] - pos[i] > max_dist: - v_max[i] = j - break + for j in range(i, M): + if pos[j] - pos[i] > max_dist: + v_max[i] = j + break - for j in range(i, -1, -1): - if pos[i] - pos[j] > max_dist: - v_min[i] = j + 1 - break + for j in range(i, -1, -1): + if pos[i] - pos[j] > max_dist: + v_min[i] = j + 1 + break return np.array((v_min, v_max)) - +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +@cython.cdivision(True) +@cython.exceptval(check=False) cpdef find_shrinkage_ld_boundaries(double[:] cm_pos, - double genmap_Ne, + double genmap_ne, int genmap_sample_size, double cutoff): """ - Find the LD boundaries for the shrinkage estimator of Wen and Stephens (2010) + Find the LD boundaries for the shrinkage estimator of Wen and Stephens (2010). :param cm_pos: A vector with the position of each genetic variant in centi Morgan. - :param genmap_Ne: The effective population size for the genetic map sample. + :param genmap_ne: The effective population size for the genetic map sample. :param genmap_sample_size: The sample size used to estimate the genetic map. :param cutoff: The threshold below which we set the shrinkage factor to zero. """ - cdef unsigned int i, j, M = len(cm_pos) - cdef long[:] v_min = np.zeros_like(cm_pos, dtype=np.int) - cdef long[:] v_max = M*np.ones_like(cm_pos, dtype=np.int) + cdef unsigned int i, j, M = cm_pos.shape[0] + cdef long[:] v_min = np.zeros_like(cm_pos, dtype=int) + cdef long[:] v_max = M*np.ones_like(cm_pos, dtype=int) # The multiplicative term for the shrinkage factor # The shrinkage factor is 4 * Ne * (rho_ij/100) / (2*m) @@ -119,18 +205,19 @@ cpdef find_shrinkage_ld_boundaries(double[:] cm_pos, # to the distance between SNPs is: 4*Ne/(200*m), which is equivalent to 0.02*Ne/m # See also: https://github.com/stephenslab/rss/blob/master/misc/get_corr.R # and Wen and Stephens (2010) - cdef double mult_term = 0.02 * genmap_Ne / genmap_sample_size + cdef double mult_term = 0.02 * genmap_ne / genmap_sample_size - for i in range(M): + with nogil: + for i in range(M): - for j in range(i, M): - if exp(-mult_term*(cm_pos[j] - cm_pos[i])) < cutoff: - v_max[i] = j - break + for j in range(i, M): + if exp(-mult_term*(cm_pos[j] - cm_pos[i])) < cutoff: + v_max[i] = j + break - for j in range(i, -1, -1): - if exp(-mult_term*(cm_pos[i] - cm_pos[j])) < cutoff: - v_min[i] = j + 1 - break + for j in range(i, -1, -1): + if exp(-mult_term*(cm_pos[i] - cm_pos[j])) < cutoff: + v_min[i] = j + 1 + break return np.array((v_min, v_max)) diff --git a/magenpy/stats/ld/estimator.py b/magenpy/stats/ld/estimator.py index 9ae409e..f1ae261 100644 --- a/magenpy/stats/ld/estimator.py +++ b/magenpy/stats/ld/estimator.py @@ -1,15 +1,31 @@ import numpy as np -from magenpy import LDMatrix class SampleLD(object): """ - Compute the sample correlation (LD) matrix between - pairs of variants along a given chromosome. + A basic wrapper class to facilitate computing Linkage-Disequilibrium (LD) matrices. + + Linkage-Disequilibrium (LD) is a measure of the SNP-by-SNP pairwise correlation between + genetic variants in a population. 
LD tends to decay with genomic distance, and the rate
+    of decay is influenced by many factors. Therefore, the entries of LD matrices tend to be
+    concentrated around the diagonal.
+
+    This class `SampleLD` provides a basic interface to compute the sample correlation coefficients between
+    all variants defined in a genotype matrix. The resulting LD matrix is a square and dense matrix.
+
+    For sparse LD matrices, consider using the `WindowedLD`, `ShrinkageLD` or `BlockLD` estimators instead.
+
+    !!! seealso "See Also"
+        * [WindowedLD][magenpy.stats.ld.estimator.WindowedLD]
+        * [ShrinkageLD][magenpy.stats.ld.estimator.ShrinkageLD]
+        * [BlockLD][magenpy.stats.ld.estimator.BlockLD]
+
+    :ivar genotype_matrix: The genotype matrix, an instance of `GenotypeMatrix` or its children.
+
     """
 
     def __init__(self, genotype_matrix):
         """
+        Initialize the LD estimator with a genotype matrix.
 
         :param genotype_matrix: The genotype matrix, an instance of `GenotypeMatrix`.
         """
@@ -34,11 +50,20 @@ def compute_ld_boundaries(self):
 
         For the sample LD matrix, we simply take the entire square matrix
         as our window, so the start position is 0 and end position is M for all SNPs.
+
+        :return: A 2xM matrix of LD boundaries.
         """
         m = self.genotype_matrix.n_snps
         return np.array((np.zeros(m), np.ones(m)*m)).astype(np.int64)
 
-    def compute(self, output_dir, temp_dir='temp'):
+    def compute(self,
+                output_dir,
+                temp_dir='temp',
+                overwrite=True,
+                delete_original=True,
+                dtype='int16',
+                compressor_name='lz4',
+                compression_level=5):
         """
         A utility method to compute the LD matrix and store in Zarr array format.
         The computes the LD matrix and stores it in Zarr array format, set its attributes,
@@ -46,55 +71,97 @@
 
         :param output_dir: The path where to store the resulting LD matrix.
         :param temp_dir: A temporary directory to store intermediate files and results.
+        :param overwrite: If True, overwrite any existing LD matrices in `temp_dir` and `output_dir`.
+        :param delete_original: If True, deletes dense or intermediate LD matrices generated along the way.
+        :param dtype: The data type for the entries of the LD matrix (supported data types are float32, float64
+        and integer quantized data types int8 and int16).
+        :param compressor_name: The name of the compressor to use for the LD matrix.
+        :param compression_level: The compression level to use for the LD matrix (1-9).
+
+        :return: An instance of `LDMatrix` containing the computed LD matrix.
+ """ - from .utils import compute_ld_xarray, compute_ld_plink1p9, _validate_ld_matrix - from magenpy.GenotypeMatrix import xarrayGenotypeMatrix, plinkBEDGenotypeMatrix + from .utils import compute_ld_xarray, compute_ld_plink1p9 + from ...GenotypeMatrix import xarrayGenotypeMatrix, plinkBEDGenotypeMatrix + + assert str(dtype) in ('float32', 'float64', 'int8', 'int16') ld_boundaries = self.compute_ld_boundaries() if isinstance(self.genotype_matrix, xarrayGenotypeMatrix): - zarr_ld_mat = compute_ld_xarray(self.genotype_matrix, - ld_boundaries, - output_dir, - temp_dir=temp_dir) + ld_mat = compute_ld_xarray(self.genotype_matrix, + ld_boundaries, + output_dir, + temp_dir=temp_dir, + overwrite=overwrite, + delete_original=delete_original, + dtype=dtype, + compressor_name=compressor_name, + compression_level=compression_level) elif isinstance(self.genotype_matrix, plinkBEDGenotypeMatrix): - zarr_ld_mat = compute_ld_plink1p9(self.genotype_matrix, - ld_boundaries, - output_dir, - temp_dir=temp_dir) + ld_mat = compute_ld_plink1p9(self.genotype_matrix, + ld_boundaries, + output_dir, + temp_dir=temp_dir, + overwrite=overwrite, + dtype=dtype, + compressor_name=compressor_name, + compression_level=compression_level) else: raise NotImplementedError - zarr_ld_mat.attrs['Chromosome'] = int(self.genotype_matrix.chromosome) - zarr_ld_mat.attrs['Sample size'] = int(self.genotype_matrix.sample_size) + # Add attributes to the LDMatrix object: + ld_mat.set_store_attr('Chromosome', int(self.genotype_matrix.chromosome)) + ld_mat.set_store_attr('Sample size', int(self.genotype_matrix.sample_size)) + ld_mat.set_store_attr('LD estimator', 'sample') + + if self.genotype_matrix.genome_build is not None: + ld_mat.set_store_attr('Genome build', self.genotype_matrix.genome_build) - zarr_ld_mat.attrs['SNP'] = list(self.genotype_matrix.snps) - zarr_ld_mat.attrs['BP'] = list(map(int, self.genotype_matrix.bp_pos)) + ld_mat.set_metadata('snps', self.genotype_matrix.snps, overwrite=overwrite) + ld_mat.set_metadata('bp', self.genotype_matrix.bp_pos, overwrite=overwrite) + ld_mat.set_metadata('maf', self.genotype_matrix.maf, overwrite=overwrite) + ld_mat.set_metadata('a1', self.genotype_matrix.a1, overwrite=overwrite) + ld_mat.set_metadata('a2', self.genotype_matrix.a2, overwrite=overwrite) try: - zarr_ld_mat.attrs['cM'] = list(map(float, self.genotype_matrix.cm_pos)) + ld_mat.set_metadata('cm', self.genotype_matrix.cm_pos, overwrite=overwrite) except KeyError: pass - zarr_ld_mat.attrs['MAF'] = list(map(float, self.genotype_matrix.maf)) - zarr_ld_mat.attrs['A1'] = list(self.genotype_matrix.a1) - zarr_ld_mat.attrs['A2'] = list(self.genotype_matrix.a2) + ld_mat.set_metadata('ldscore', ld_mat.compute_ld_scores(), overwrite=overwrite) - zarr_ld_mat.attrs['LD estimator'] = 'sample' - zarr_ld_mat.attrs['LD boundaries'] = ld_boundaries.tolist() - - ld_mat = LDMatrix(zarr_ld_mat) - ld_mat.set_store_attr('LDScore', ld_mat.compute_ld_scores().tolist()) - - if _validate_ld_matrix(ld_mat): + if ld_mat.validate_ld_matrix(): return ld_mat class WindowedLD(SampleLD): """ - Compute the sample correlation matrix, but only in - pre-specified windows for each variant. + A wrapper class to facilitate computing windowed Linkage-Disequilibrium (LD) matrices. + Windowed LD matrices only record pairwise correlations between variants that are within a certain + distance of each other along the chromosome. This is useful for reducing the memory requirements + and noise in the LD matrix. 
+
+    The `WindowedLD` estimator supports a variety of ways for defining the window size:
+
+    * `window_size`: The number of neighboring SNPs to consider on each side when computing LD.
+    * `kb_window_size`: The maximum distance in kilobases to consider when computing LD.
+    * `cm_window_size`: The maximum distance in centi Morgan to consider when computing LD.
+
+    The LD boundaries computed here are the intersection of the windows defined by the window size around
+    each SNP (`window_size`), the window size in kilobases (`kb_window_size`), and the window size in centi Morgan
+    (`cm_window_size`).
+
+    !!! seealso "See Also"
+        * [ShrinkageLD][magenpy.stats.ld.estimator.ShrinkageLD]
+        * [BlockLD][magenpy.stats.ld.estimator.BlockLD]
+
+    :ivar genotype_matrix: The genotype matrix, an instance of `GenotypeMatrix`.
+    :ivar window_size: The number of neighboring SNPs to consider on each side when computing LD.
+    :ivar kb_window_size: The maximum distance in kilobases to consider when computing LD.
+    :ivar cm_window_size: The maximum distance in centi Morgan to consider when computing LD.
+
     """
 
     def __init__(self,
@@ -103,6 +170,9 @@ def __init__(self,
                  kb_window_size=None,
                  cm_window_size=None):
         """
+
+        Initialize the windowed LD estimator with a genotype matrix and window size parameters.
+
         :param genotype_matrix: The genotype matrix, an instance of `GenotypeMatrix`.
         :param window_size: The number of neighboring SNPs to consider on each side when computing LD.
         :param kb_window_size: The maximum distance in kilobases to consider when computing LD.
@@ -123,6 +193,8 @@ def compute_ld_boundaries(self):
         The LD boundaries computed here are the intersection of the windows defined by the window size around
         each SNP (`window_size`), the window size in kilobases (`kb_window_size`), and the window size in centi Morgan
         (`cm_window_size`).
+
+        :return: A 2xM matrix of LD boundaries.
         """
 
         bounds = []
@@ -161,14 +233,37 @@ def compute_ld_boundaries(self):
             np.minimum.reduce([b[1, :] for b in bounds])
         ])
 
-    def compute(self, output_dir, temp_dir='temp'):
+    def compute(self,
+                output_dir,
+                temp_dir='temp',
+                overwrite=True,
+                delete_original=True,
+                dtype='int16',
+                compressor_name='lz4',
+                compression_level=5):
         """
+
+        Compute the windowed LD matrix and store in Zarr array format.
+
         :param output_dir: The path where to store the resulting LD matrix.
         :param temp_dir: A temporary directory to store intermediate files and results.
+        :param overwrite: If True, overwrite any existing LD matrices in `temp_dir` and `output_dir`.
+        :param delete_original: If True, deletes dense or intermediate LD matrices generated along the way.
+        :param dtype: The data type for the entries of the LD matrix.
+        :param compressor_name: The name of the compressor to use for the LD matrix.
+        :param compression_level: The compression level to use for the LD matrix (1-9).
+
+        :return: An instance of `LDMatrix` containing the computed LD matrix.
         """
 
-        ld_mat = super(WindowedLD, self).compute(output_dir,
-                                                 temp_dir)
+        ld_mat = super().compute(output_dir,
+                                 temp_dir,
+                                 overwrite=overwrite,
+                                 delete_original=delete_original,
+                                 dtype=dtype,
+                                 compressor_name=compressor_name,
+                                 compression_level=compression_level)
+
         ld_mat.set_store_attr('LD estimator', 'windowed')
 
         w_properties = {}
@@ -187,6 +282,30 @@
 
 class ShrinkageLD(SampleLD):
+    """
+    A wrapper class to facilitate computing shrinkage-based Linkage-Disequilibrium (LD) matrices.
+ Shrinkage LD matrices are a way to reduce noise in the LD matrix by shrinking the off-diagonal pairwise + correlation coefficients towards zero. This is useful for reducing the noise in the LD matrix and + improving the quality of downstream analyses. + + The shrinkage estimator implemented uses the shrinking procedure derived in: + + Wen X, Stephens M. USING LINEAR PREDICTORS TO IMPUTE ALLELE FREQUENCIES FROM SUMMARY OR POOLED GENOTYPE DATA. + Ann Appl Stat. 2010 Sep;4(3):1158-1182. doi: 10.1214/10-aoas338. PMID: 21479081; PMCID: PMC3072818. + + Computing the shrinkage intensity requires specifying the effective population size (Ne) and the sample size + used to infer the genetic map. In addition, it requires specifying a threshold below which the LD is set to zero. + + !!! seealso "See Also" + * [WindowedLD][magenpy.stats.ld.estimator.WindowedLD] + * [BlockLD][magenpy.stats.ld.estimator.BlockLD] + + :ivar genotype_matrix: The genotype matrix, an instance of `GenotypeMatrix`. + :ivar genetic_map_ne: The effective population size (Ne) from which the genetic map is derived. + :ivar genetic_map_sample_size: The sample size used to infer the genetic map. + :ivar threshold: The shrinkage cutoff below which the LD is set to zero. + + """ def __init__(self, genotype_matrix, @@ -194,6 +313,9 @@ def __init__(self, genetic_map_sample_size, threshold=1e-3): """ + + Initialize the shrinkage LD estimator with a genotype matrix and shrinkage parameters. + :param genotype_matrix: The genotype matrix, an instance of `GenotypeMatrix`. :param genetic_map_ne: The effective population size (Ne) from which the genetic map is derived. :param genetic_map_sample_size: The sample size used to infer the genetic map. @@ -208,7 +330,9 @@ def __init__(self, def compute_ld_boundaries(self): """ - Find the LD boundaries based on the shrinkage operator. + Compute the shrinkage-based Linkage-Disequilibrium (LD) boundaries. + + :return: A 2xM matrix of LD boundaries. """ from .c_utils import find_shrinkage_ld_boundaries @@ -217,24 +341,57 @@ def compute_ld_boundaries(self): self.genetic_map_sample_size, self.threshold) - def compute(self, output_dir, temp_dir='temp'): + def compute(self, + output_dir, + temp_dir='temp', + overwrite=True, + delete_original=True, + dtype='int16', + compressor_name='lz4', + compression_level=5, + chunk_size=1000): """ + + TODO: Add a mechanism to either automatically adjust the shrinkage threshold depending on the + float precision (dtype) or purge trailing zero entries that got quantized to zero. For example, + if we select a shrinkage threshold of 1e-3 with (int8), then we will have a lot of + trailing zeros stored in the resulting LD matrix. It's better if we got rid of those zeros to + minimize storage requirements and computation time. + + !!! note + LD Scores are computed before applying shrinkage. + :param output_dir: The path where to store the resulting LD matrix. :param temp_dir: A temporary directory to store intermediate files and results. + :param overwrite: If True, overwrite any existing LD matrices in `temp_dir` and `output_dir`. + :param delete_original: If True, deletes dense or intermediate LD matrices generated along the way. + :param dtype: The data type for the entries of the LD matrix. + :param compressor_name: The name of the compressor to use for the LD matrix. + :param compression_level: The compression level to use for the LD matrix (1-9). + :param chunk_size: An optional parameter that sets the maximum number of rows processed simultaneously. 
+ The smaller the `chunk_size`, the less memory requirements needed for the shrinkage step. + + :return: An instance of `LDMatrix` containing the computed LD matrix. + """ - ld_mat = super(ShrinkageLD, self).compute(output_dir, - temp_dir) + ld_mat = super().compute(output_dir, + temp_dir, + overwrite=overwrite, + delete_original=delete_original, + dtype=dtype, + compressor_name=compressor_name, + compression_level=compression_level) from .utils import shrink_ld_matrix - ld_mat = LDMatrix(shrink_ld_matrix(ld_mat.z_array, - self.genotype_matrix.cm_pos, - self.genotype_matrix.maf_var, - self.genetic_map_ne, - self.genetic_map_sample_size, - self.threshold, - ld_boundaries=ld_mat.ld_boundaries)) + ld_mat = shrink_ld_matrix(ld_mat, + self.genotype_matrix.cm_pos, + self.genotype_matrix.maf_var, + self.genetic_map_ne, + self.genetic_map_sample_size, + self.threshold, + chunk_size=chunk_size) ld_mat.set_store_attr('LD estimator', 'shrinkage') @@ -248,12 +405,37 @@ def compute(self, output_dir, temp_dir='temp'): class BlockLD(SampleLD): + """ + A wrapper class to facilitate computing block-based Linkage-Disequilibrium (LD) matrices. + Block-based LD matrices are a way to reduce the memory requirements of the LD matrix by + computing the pairwise correlation coefficients only between SNPs that are within the same LD block. + + LD blocks can be inferred by external software tools, such as `LDetect` of Berisa and Pickrell (2016): + + Berisa T, Pickrell JK. Approximately independent linkage disequilibrium blocks in human populations. + Bioinformatics. 2016 Jan 15;32(2):283-5. doi: 10.1093/bioinformatics/btv546. + Epub 2015 Sep 22. PMID: 26395773; PMCID: PMC4731402. + + The `BlockLD` estimator requires the LD blocks to be provided as input. The LD blocks are a Bx2 matrix + where B is the number of blocks and the columns are the start and end of each block, respectively. + + !!! seealso "See Also" + * [WindowedLD][magenpy.stats.ld.estimator.WindowedLD] + * [ShrinkageLD][magenpy.stats.ld.estimator.ShrinkageLD] + + :ivar genotype_matrix: The genotype matrix, an instance of `GenotypeMatrix`. + :ivar ld_blocks: The LD blocks, a Bx2 matrix where B is the number of blocks and the columns are + the start and end of each block, respectively. + + """ def __init__(self, genotype_matrix, ld_blocks=None, ld_blocks_file=None): """ + Initialize the block-based LD estimator with a genotype matrix and LD blocks. + :param genotype_matrix: The genotype matrix, an instance of `GenotypeMatrix`. :param ld_blocks: The LD blocks, a Bx2 matrix where B is the number of blocks and the columns are the start and end of each block, respectively. @@ -265,25 +447,49 @@ def __init__(self, super().__init__(genotype_matrix=genotype_matrix) if ld_blocks is None: - from magenpy.parsers.misc_parsers import parse_ld_block_data + from ...parsers.misc_parsers import parse_ld_block_data self.ld_blocks = parse_ld_block_data(ld_blocks_file)[self.genotype_matrix.chromosome] def compute_ld_boundaries(self): """ - Find the per-SNP ld boundaries, given the provided LD blocks. + Compute the per-SNP Linkage-Disequilibrium (LD) boundaries for the block-based estimator. + + :return: A 2xM matrix of LD boundaries. 
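+
+        !!! example "Boundary format"
+            A small illustration with hypothetical values: for two LD blocks spanning base-pair
+            ranges [0, 1e6) and [1e6, 2e6), and five variants at positions 100, 500000,
+            1200000, 1500000 and 1900000, the returned boundaries would be akin to:
+
+            ```python
+            import numpy as np
+
+            # First row: start index of each variant's window.
+            # Second row: (exclusive) end index of each variant's window.
+            np.array([[0, 0, 2, 2, 2],
+                      [2, 2, 5, 5, 5]])
+            ```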
""" from .c_utils import find_ld_block_boundaries return find_ld_block_boundaries(self.genotype_matrix.bp_pos, self.ld_blocks) - def compute(self, output_dir, temp_dir='temp'): + def compute(self, + output_dir, + temp_dir='temp', + overwrite=True, + delete_original=True, + dtype='int16', + compressor_name='lz4', + compression_level=5): """ + + Compute the block-based LD matrix and store in Zarr array format. + :param output_dir: The path where to store the resulting LD matrix. :param temp_dir: A temporary directory to store intermediate files and results. + :param overwrite: If True, overwrite any existing LD matrices in `temp_dir` and `output_dir`. + :param delete_original: If True, deletes dense or intermediate LD matrices generated along the way. + :param dtype: The data type for the entries of the LD matrix. + :param compressor_name: The name of the compressor to use for the LD matrix. + :param compression_level: The compression level to use for the LD matrix (1-9). + + :return: An instance of `LDMatrix` containing the computed LD matrix. """ - ld_mat = super(BlockLD, self).compute(output_dir, - temp_dir) + ld_mat = super().compute(output_dir, + temp_dir, + overwrite=overwrite, + delete_original=delete_original, + dtype=dtype, + compressor_name=compressor_name, + compression_level=compression_level) ld_mat.set_store_attr('LD estimator', 'block') diff --git a/magenpy/stats/ld/utils.py b/magenpy/stats/ld/utils.py index f51a973..4bdb28a 100644 --- a/magenpy/stats/ld/utils.py +++ b/magenpy/stats/ld/utils.py @@ -3,55 +3,19 @@ import pandas as pd import numpy as np -from numcodecs import VLenArray import zarr -from magenpy.utils.compute_utils import generate_slice_dictionary +from ...LDMatrix import LDMatrix -def _validate_ld_matrix(ld_mat): +def move_ld_store(z_arr, target_path, overwrite=True): """ - Takes an `LDMatrix` object and checks its contents for validity. - Specifically, we check that: - - The dimensions of the matrix and its associated attributes are matching. - - The LD boundaries are correct. - - The masking is working properly. - :param ld_mat: An instance of `LDMatrix` - :return: True if `ld_mat` has the correct structure, False otherwise. + Move an LD store from its current path to the `target_path` + :param z_arr: An LDMatrix object + :param target_path: The target path where to move the LD store + :param overwrite: If True, overwrites the target path if it exists. 
""" - attributes = ['snps', 'a1', 'a2', 'maf', 'bp_position', 'cm_position', 'ld_score'] - - for attr in attributes: - attribute = getattr(ld_mat, attr) - if attribute is None: - continue - if len(attribute) != ld_mat.n_elements: - raise ValueError("Invalid LD Matrix: Attribute dimensions are not aligned!") - - # Check LD bounds: - ld_bounds = ld_mat.get_masked_boundaries() - - if ld_bounds.shape != (2, ld_mat.n_elements): - raise ValueError("Invalid LD Matrix: LD boundaries have the wrong dimensions!") - - ld_block_lengths = ld_bounds[1, :] - ld_bounds[0, :] - - # Iterate over the stored LD data to check its dimensions: - i = 0 - - for i, d in enumerate(ld_mat): - if len(d) != ld_block_lengths[i]: - raise ValueError(f"Invalid LD Matrix: Element {i} does not have matching LD boundaries!") - - if i != (ld_mat.n_elements - 1): - raise ValueError(f"Invalid LD Matrix: Conflicting total number of elements!") - - return True - - -def move_ld_store(z_arr, target_path, overwrite=True): - source_path = z_arr.store.dir_path() if overwrite or not any(os.scandir(target_path)): @@ -62,312 +26,113 @@ def move_ld_store(z_arr, target_path, overwrite=True): return zarr.open(target_path) -def delete_ld_store(z_arr): +def delete_ld_store(ld_mat): + """ + Delete the LD store from disk. + :param ld_mat: An LDMatrix object + """ try: - z_arr.store.rmdir() + ld_mat.store.rmdir() except Exception as e: print(e) -def from_plink_ld_bin_to_zarr(bin_file, dir_store, ld_boundaries): +def clump_snps(ldm, + statistic=None, + rsq_threshold=.9, + extract=True, + sort_key=None): """ - This method takes an LD binary file from PLINK and converts it to - a chunked Zarr matrix - :param bin_file: The path to the LD binary file - :param dir_store: The directory store where the Zarr array will be stored - :param ld_boundaries: The boundaries for the desired LD matrix. - """ - - n_rows = ld_boundaries.shape[1] - avg_ncol = int((ld_boundaries[1, :] - ld_boundaries[0, :]).mean()) - - n_chunks = estimate_row_chunk_size(n_rows, avg_ncol) - - if avg_ncol == n_rows: - z_rag = zarr.open(dir_store, - mode='w', - shape=(n_rows, n_rows), - chunks=n_chunks, - dtype=float) - else: - z_rag = zarr.open(dir_store, - mode='w', - shape=n_rows, - chunks=n_chunks[:1], - dtype=object, - object_codec=VLenArray(float)) - - chunk_size = z_rag.chunks[0] - - for i in range(int(np.ceil(z_rag.shape[0] / chunk_size))): - - n_chunk_rows = min(chunk_size, n_rows - i*chunk_size) - ld_chunk = np.fromfile(bin_file, - offset=i*chunk_size*n_rows, - count=n_chunk_rows*n_rows).reshape(n_chunk_rows, n_rows) - - for j, ld in enumerate(ld_chunk): - idx = i*chunk_size + j - start, end = ld_boundaries[:, idx] - ld_chunk[j] = ld_chunk[j][start: end] - - z_rag[i*chunk_size:(i+1)*chunk_size] = ld_chunk - - return z_rag + This function takes an LDMatrix object and clumps SNPs based + on the `stat` vector (usually p-value) and the provided r-squared threshold. + If two SNPs have an r-squared greater than the threshold, + the SNP with the higher `stat` value is excluded. + :param ldm: An LDMatrix object + :param statistic: A vector of statistics (e.g. p-values) for each SNP that will determine which SNPs to discard. + :param rsq_threshold: The r^2 threshold to use for filtering variants. + :param extract: If True, return remaining SNPs. If False, return removed SNPs. + :param sort_key: The key function for the sorting algorithm that will decide how to sort the `statistic`. + By default, we select the SNP with the minimum value for the `statistic` (e.g. smaller p-value). 
-def write_csr_to_zarr(csr_mat, z_arr, start_row=None, end_row=None, ld_boundaries=None, purge_data=False): - """ - Write from Scipy's csr matrix to Zarr array - :param csr_mat: A scipy compressed sparse row matrix `csr_matrix` - :param z_arr: A ragged zarr array with the same row dimension as the csr matrix - :param start_row: The start row - :param end_row: The end row - :param purge_data: If `True`, delete the data that was written to Zarr from `csr_mat` - :param ld_boundaries: If provided, we'd only write the elements within the provided boundaries. + :return: A list of SNP rsIDs that are left after clumping (or discarded if `extract=False`). """ - if start_row is None: - start_row = 0 + snps = ldm.snps - if end_row is None: - end_row = csr_mat.shape[0] + if statistic is None: + # if a statistic is not provided, then clump SNPs based on their base pair order, + # meaning that if two SNPs are highly correlated, we keep the one with smaller base pair position. + statistic = ldm.bp_position else: - end_row = min(end_row, csr_mat.shape[0]) - - ld_rows = [] - for i in range(start_row, end_row): - if ld_boundaries is None: - ld_rows.append(np.nan_to_num(csr_mat[i, :].data)) - else: - ld_rows.append(np.nan_to_num(csr_mat[i, ld_boundaries[0, i]:ld_boundaries[1, i]].data)) - - z_arr.oindex[np.arange(start_row, end_row)] = np.array(ld_rows + [None], dtype=object)[:-1] - - if purge_data: - # Delete data from csr matrix: - csr_mat.data[csr_mat.indptr[start_row]:csr_mat.indptr[end_row - 1]] = 0. - csr_mat.eliminate_zeros() - - -def from_plink_ld_table_to_zarr_chunked(ld_file, dir_store, ld_boundaries, snps): - """ - Transform a PLINK LD table to Zarr ragged array. - PLINK LD tables are of the format: - CHR_A BP_A SNP_A CHR_B BP_B SNP_B R - This function deploys a chunked implementation so it only requires - modest memory. - - :param dir_store: A path to the new Zarr store - :param ld_file: A path to the plink LD table - :param ld_boundaries: LD boundaries matrix - :param snps: A list of SNPs - """ - - # Preliminaries: - - # Estimate row chunk-size for the Zarr array: - rows, avg_ncols = len(snps), int((ld_boundaries[1, :] - ld_boundaries[0, :]).mean()) - chunks = estimate_row_chunk_size(rows, avg_ncols) - - # Create a ragged Zarr array: - z_arr = zarr.open(dir_store, - mode='w', - shape=rows, - chunks=chunks[:1], - dtype=object, - object_codec=VLenArray(float)) - - row_chunk_size = z_arr.chunks[0] - - # Create a chunked iterator with pandas: - # Chunk size will correspond to the average chunk size for the Zarr array: - ld_chunks = pd.read_csv(ld_file, - delim_whitespace=True, - usecols=['SNP_A', 'SNP_B', 'R'], - engine='c', - chunksize=row_chunk_size*avg_ncols // 2) - - # Create a ragged Zarr array: - z_arr = zarr.open(dir_store, - mode='w', - shape=rows, - chunks=(row_chunk_size,), - dtype=object, - object_codec=VLenArray(float)) - - # Create a dictionary mapping SNPs to their indices: - snp_dict = dict(zip(snps, np.arange(len(snps)))) - - # The sparse matrix will help us convert from triangular - # sparse matrix to square sparse matrix: - from scipy.sparse import csr_matrix - - sp_mat = None - - curr_chunk = 0 - - # For each chunk in the LD file: - for ld_chunk in ld_chunks: - - # Create an indexed LD chunk: - ld_chunk['index_A'] = ld_chunk['SNP_A'].map(snp_dict) - ld_chunk['index_B'] = ld_chunk['SNP_B'].map(snp_dict) - - ld_chunk['R'].values[ld_chunk['R'].values == 0.] 
= np.nan
-
-        # Create a compressed sparse row matrix:
-        chunk_mat = csr_matrix((ld_chunk['R'].values,
-                                (ld_chunk['index_A'].values, ld_chunk['index_B'].values)),
-                               shape=(rows, rows))
-
-        if sp_mat is None:
-            sp_mat = chunk_mat + chunk_mat.T
-            sp_mat.setdiag(1.)
-        else:
-            sp_mat = sp_mat + (chunk_mat + chunk_mat.T)
-
-        # The chunk of the snp of largest index:
-        max_index_chunk = ld_chunk['index_A'].max() // row_chunk_size
-
-        if max_index_chunk > curr_chunk:
-            write_csr_to_zarr(sp_mat, z_arr,
-                              start_row=curr_chunk*row_chunk_size,
-                              end_row=max_index_chunk*row_chunk_size,
-                              ld_boundaries=ld_boundaries,
-                              purge_data=True)
-            curr_chunk = max_index_chunk
-
-    write_csr_to_zarr(sp_mat, z_arr,
-                      start_row=curr_chunk * row_chunk_size,
-                      ld_boundaries=ld_boundaries,
-                      purge_data=True)
+        assert len(statistic) == len(snps)

-    return z_arr
+    if sort_key is None:
+        # By default, sort variants by the value of the statistic itself:
+        sort_key = lambda x: statistic[x]
+    else:
+        # Wrap the user-provided key function, capturing it under a different name
+        # so that the lambda does not recursively call itself:
+        user_key = sort_key
+        sort_key = lambda x: user_key(statistic[x])
+
+    sorted_idx = sorted(range(len(ldm)), key=sort_key)

-def from_plink_ld_table_to_zarr(ld_file, dir_store, ld_boundaries=None, snps=None):
-    """
-    Transform a PLINK LD table to Zarr ragged array.
-    PLINK LD tables are of the format:
-    CHR_A BP_A SNP_A CHR_B BP_B SNP_B R
-    :param dir_store: A path to the new Zarr store
-    :param ld_file: A path to the plink LD table
-    :param ld_boundaries: LD boundaries matrix
-    :param snps: A list of SNPs
-    """
+    keep_snps_dict = dict(zip(snps, np.ones(len(snps), dtype=bool)))

-    ld_df = pd.read_csv(ld_file,
-                        delim_whitespace=True,
-                        usecols=['SNP_A', 'SNP_B', 'R'],
-                        engine='c')
+    for idx in sorted_idx:

-    # Assume that PLINK's table is already sorted by BP_A, BP_B:
-    a_snps_slice = generate_slice_dictionary(ld_df.SNP_A.values)
-    r_a = ld_df.R.values
+        if not keep_snps_dict[snps[idx]]:
+            continue

-    ld_sort_b = ld_df.sort_values(['BP_B', 'BP_A'])
-    b_snps_slice = generate_slice_dictionary(ld_sort_b.SNP_B.values)
-    r_b = ld_sort_b.R.values
+        r, indices = ldm.get_row(idx, return_indices=True)
+        # Find the SNPs that we need to remove:
+        # We remove SNPs whose squared correlation coefficient with the index SNP is
+        # greater than the specified rsq_threshold:
+        snps_to_remove = snps[indices[np.where(r**2 > rsq_threshold)[0]]]

-    if snps is None:
-        snp_df = pd.DataFrame(np.concatenate([ld_df[['SNP_A', 'BP_A']].drop_duplicates().values,
-                                              ld_df[['SNP_B', 'BP_B']].drop_duplicates().values]),
-                              columns=['SNP', 'BP'])
-        snps = snp_df.drop_duplicates().sort_values('BP')['SNP'].values
+        # Update the `keep_snps_dict` dictionary:
+        keep_snps_dict.update(dict(zip(snps_to_remove, np.zeros(len(snps_to_remove), dtype=bool))))

-    if ld_boundaries is None:
-        before_bound = np.repeat([None], len(snps))
-        after_bound = np.repeat([None], len(snps))
+    if extract:
+        return [snp for snp, cond in keep_snps_dict.items() if cond]
     else:
-        before_bound = ld_boundaries[0, :] - np.arange(ld_boundaries.shape[1])
-        after_bound = ld_boundaries[1, :] - np.arange(ld_boundaries.shape[1]) - 1
-
-    ld_array = []
-    avg_ncol = 0
-
-    for i, snp in enumerate(snps):
-
-        try:
-            if before_bound[i] < 0:
-                before = r_b[b_snps_slice[snp]][before_bound[i]:]
-            else:
-                before = []
-        except KeyError:
-            before = []
-
-        try:
-            after = r_a[a_snps_slice[snp]][:after_bound[i]]
-        except KeyError:
-            after = []
-
-        ld_array.append(np.concatenate([before, [1.], after]))
+        return [snp for snp, cond in keep_snps_dict.items() if not cond]

-        avg_ncol += (len(ld_array[-1]) - avg_ncol) / (i + 1)

-    n_chunks = estimate_row_chunk_size(len(ld_array), int(avg_ncol))

-    z_arr = zarr.open(dir_store,
-                      mode='w',
-                      shape=len(ld_array),
-                      chunks=n_chunks[:1],
-                      dtype=object,
-                      object_codec=VLenArray(float))
-
-    z_arr[:] = np.array(ld_array, dtype=object)
-
-    return z_arr
-
-
-def clump_snps(ldm, stat, rsq_threshold=.9, extract=True):
-    """
-    This function takes an LDMatrix object and clumps the SNPs based
-    on the `stat` vector (usually p-value) and the provided r-squared threshold.
-    If two SNPs have an r-squared greater than the threshold,
-    the SNP with the higher `stat` value is excluded.
-    :param ldm: LDMatrix object
-    :param stat: A vector of statistics (e.g. p-values) for the SNPs
-    :param rsq_threshold: The r^2 threshold to use for filtering
-    :param extract: If True, return remaining SNPs. If False, return removed SNPs.
-    :return: A list of SNPs passing the specified filter
+def expand_snps(seed_snps, ldm, rsq_threshold=0.9):
     """
+    Given an initial set of SNPs, expand the set by adding
+    "neighbors" whose squared correlation with the seed SNPs is higher than
+    a user-specified threshold.

-    snps = ldm.snps
-    ld_bounds = ldm.ld_boundaries
-    remove_snps = set()
+    :param seed_snps: An iterable containing the initial set of SNP rsIDs.
+    :param ldm: An `LDMatrix` object containing SNP-by-SNP correlations.
+    :param rsq_threshold: The r^2 threshold to use for including variants.

-    for idx, ld in enumerate(ldm):
+    """

-        if snps[idx] in remove_snps:
-            continue
+    ldm_snps = ldm.snps
+    # Indices (in the LD matrix) of the seed SNPs that are present in it:
+    snp_seed_idx = np.where(np.isin(ldm_snps, seed_snps))[0]

-        rsq = np.array(ld)**2
+    if len(snp_seed_idx) < 1:
+        print("Warning: None of the seed SNPs are present in the LD matrix object!")
+        return seed_snps

-        for s_idx in np.where(rsq > rsq_threshold)[0]:
-            real_idx = s_idx + ld_bounds[0, idx]
-            if idx == real_idx or snps[real_idx] in remove_snps:
-                continue
+    final_set = set(seed_snps)

-            if stat[idx] < stat[real_idx]:
-                remove_snps.add(snps[real_idx])
-            else:
-                remove_snps.add(snps[idx])
+    for idx in snp_seed_idx:
+        r, indices = ldm.get_row(idx, return_indices=True)
+        final_set = final_set.union(set(ldm_snps[indices[np.where(r**2 > rsq_threshold)[0]]]))

-    if extract:
-        return list(set(snps) - remove_snps)
-    else:
-        return list(remove_snps)
+    return list(final_set)


-def shrink_ld_matrix(arr,
+def shrink_ld_matrix(ld_mat_obj,
                      cm_pos,
                      maf_var,
-                     genmap_Ne,
+                     genmap_ne,
                      genmap_sample_size,
                      shrinkage_cutoff=1e-3,
                      phased_haplotype=False,
-                     ld_boundaries=None):
+                     chunk_size=1000):
+
     """
     Shrink the entries of the LD matrix using the shrinkage estimator
     described in Lloyd-Jones (2019) and Wen and Stephens (2010). The estimator

     https://github.com/stephenslab/rss/blob/master/misc/get_corr.R

-    :param arr: The Zarr array containing the original LD matrix.
+    :param ld_mat_obj: An `LDMatrix` object encapsulating the LD matrix whose entries we wish to shrink.
     :param cm_pos: The position of each variant in the LD matrix in centi Morgan.
     :param maf_var: A vector of the variance in minor allele frequency (MAF) for each SNP in the LD matrix.
     Should be equivalent to 2*pj*(1. - pj), where pj is the MAF of SNP j.
-    :param genmap_Ne: The effective population size for the genetic map.
+    :param genmap_ne: The effective population size for the genetic map.
     :param genmap_sample_size: The sample size used to estimate the genetic map.
     :param shrinkage_cutoff: The cutoff value below which we assume that the shrinkage factor is zero.
     :param phased_haplotype: A flag indicating whether the LD was calculated from phased haplotypes.
-    :param ld_boundaries: The LD boundaries to use when shrinking the LD matrix.
+ :param chunk_size: An optional parameter that sets the maximum number of rows processed simultaneously. The smaller + the `chunk_size`, the less memory requirements needed for this step. """ - if ld_boundaries is None: - ld_boundaries = np.array([np.repeat(None, arr.shape[0]), np.repeat(None, arr.shape[0])]) - # The multiplicative term for the shrinkage factor # The shrinkage factor is 4 * Ne * (rho_ij/100) / (2*m) # where Ne is the effective population size and m is the sample size @@ -399,7 +162,7 @@ def shrink_ld_matrix(arr, # See also: https://github.com/stephenslab/rss/blob/master/misc/get_corr.R # and Wen and Stephens (2010) - mult_term = .02*genmap_Ne / genmap_sample_size + mult_term = .02*genmap_ne / genmap_sample_size def harmonic_series_sum(n): """ @@ -408,14 +171,12 @@ def harmonic_series_sum(n): Acknowledgement: https://stackoverflow.com/a/27683292 """ from scipy.special import digamma - from numpy import euler_gamma - - return digamma(n + 1) + euler_gamma + return digamma(n + 1) + np.euler_gamma # Compute theta according to Eq. 2.8 in Wen and Stephens (2010) h_sum = harmonic_series_sum(2*genmap_sample_size - 1) # The sum of the harmonic series in Eq. 2.8 - theta = (1. / h_sum) / (2. * genmap_sample_size + 1. / h_sum) # The theta parameter (related to mutation) + theta = (1. / h_sum) / (2. * genmap_sample_size + 1. / h_sum) # The theta parameter (related to mutation rate) theta_factor = (1. - theta)**2 # The theta factor that we'll multiply all elements of the covariance matrix with theta_diag_factor = .5 * theta * (1. - .5 * theta) # The theta factor for the diagonal elements @@ -426,371 +187,94 @@ def harmonic_series_sum(n): # We need to turn the correlation matrix into a covariance matrix to # apply the shrinkage factor. For this, we have to multiply each row # by the product of standard deviations: - maf_sd = np.sqrt(maf_var) + maf_sd = np.sqrt(phased_mult*maf_var) # According to Eqs. 2.6 and 2.7 in Wen and Stephens (2010), the shrunk standard deviation should be: shrunk_sd = np.sqrt(theta_factor*maf_var*phased_mult + theta_diag_factor) - def update_prev_chunk(j): - """ - A utility function to update the LD matrix chunk by chunk for optimal speed and efficiency. - """ - chunk_start = (j - 1) - (j - 1) % chunk_size - chunk_end = min(chunk_start + chunk_size, arr.shape[0]) - arr[chunk_start:chunk_end] = chunk + global_indptr = ld_mat_obj.indptr - chunk_size = arr.chunks[0] - chunk = None + for chunk_idx in range(int(np.ceil(len(ld_mat_obj) / chunk_size))): - for j in range(arr.shape[0]): + start_row = chunk_idx*chunk_size + end_row = min((chunk_idx+1)*chunk_size, len(ld_mat_obj)) - if j % chunk_size == 0: - if j > 0: - update_prev_chunk(j) + # Load the subset of the LD matrix specified by chunk_size. 
+ csr_mat = ld_mat_obj.load_rows(start_row=start_row, end_row=end_row, dtype=np.float32) - chunk = arr[j: j + chunk_size] + # Get the relevant portion of indices and pointers from the CSR matrix: + indptr = global_indptr[start_row:end_row+1] - start, end = ld_boundaries[:, j] + row_indices = np.concatenate([ + (start_row + r_idx)*np.ones(indptr[r_idx+1] - indptr[r_idx], dtype=int) + for r_idx in range(len(indptr) - 1) + ]) - # Compute the shrinkage factor the entries in row j - shrink_factor = np.exp(-mult_term * np.abs(cm_pos - cm_pos[j])[start: end]) - # Set any shrinkage factor below the cutoff value to zero: + # Compute the shrinkage factor for entries in the current block: + shrink_factor = np.exp(-mult_term*np.abs(cm_pos[csr_mat.indices] - cm_pos[row_indices])) + # Set shrinkage factors below the cutoff value to 0.: shrink_factor[shrink_factor < shrinkage_cutoff] = 0. - - # The factor to convert the entries of the correlation matrix into corresponding covariances: - to_cov_factor = phased_mult*maf_sd[j]*maf_sd[start: end] - # Compute the theta multiplicative factor following Eq. 2.6 in Wen and Stephens (2010) shrink_factor *= theta_factor + # The factor to convert the entries of the correlation matrix into corresponding covariances: + to_cov_factor = maf_sd[row_indices]*maf_sd[csr_mat.indices] + # Compute the new denominator for the Pearson correlation: # The shrunk standard deviation of SNP j multiplied by the shrunk standard deviations of each neighbor: - shrunk_sd_prod = shrunk_sd[j]*shrunk_sd[start: end] - - # Shrink the entries of the LD matrix: - try: - shrunk_corr = chunk[j % chunk_size]*to_cov_factor*shrink_factor / shrunk_sd_prod - shrunk_corr[j - start] = 1. - chunk[j % chunk_size] = shrunk_corr - except ValueError: - raise ValueError(f'Failed to apply shrinkage to row number: {j}') - - update_prev_chunk(j + 1) - - return arr - - -def sparsify_ld_matrix(arr, bounds): - """ - A utility function to sparsify chunked LD matrices - :param arr: the LD matrix - :param bounds: an 2xM array of start and end position for each row - :return: A sparsified array of the same format - """ - - def update_prev_chunk(j): - chunk_start = (j - 1) - (j - 1) % chunk_size - chunk_end = chunk_start + chunk_size - arr[chunk_start:chunk_end] = chunk - - chunk_size = arr.chunks[0] - chunk = None - - for j in range(bounds.shape[1]): - if j % chunk_size == 0: - if j > 0: - update_prev_chunk(j) - - chunk = arr[j: j + chunk_size] - - chunk[j % chunk_size, :bounds[0, j]] = 0 - chunk[j % chunk_size, bounds[1, j]:] = 0 - - update_prev_chunk(j + 1) - - return arr - - -def rechunk_zarr(arr, target_chunks, target_store, intermediate_store, **kwargs): - """ - Rechunk a Zarr matrix using utilities from `rechunker`. - """ - - if osp.isdir(target_store): - try: - z = zarr.open(target_store) - z.store.rmdir() - except Exception as e: - raise e - - from rechunker import rechunk - - rechunked = rechunk(arr, - target_chunks=target_chunks, - target_store=target_store, - temp_store=intermediate_store, - max_mem="128MiB", - **kwargs) - - try: - rechunked.execute() - except Exception as e: - raise e - - # Delete the older/intermediate stores: - delete_ld_store(zarr.open(intermediate_store)) - delete_ld_store(arr) - - return zarr.open(target_store) - - -def optimize_chunks_for_memory(chunked_array, cpus=None, max_mem=None): - """ - Determine optimal chunks that fit in max_mem. 
Max_mem should be numerical in GiB - Modified from: Sergio Hleap - """ - - import psutil - import dask.array as da - - if cpus is None: - cpus = psutil.cpu_count() + shrunk_sd_prod = shrunk_sd[row_indices]*shrunk_sd[csr_mat.indices] - if max_mem is None: - max_mem = psutil.virtual_memory().available / (1024.0 ** 3) + # Finally, compute the shrunk LD matrix entries: + csr_mat.data *= to_cov_factor*shrink_factor / shrunk_sd_prod - chunk_mem = max_mem / cpus - chunks = da.core.normalize_chunks(f"{chunk_mem}GiB", shape=chunked_array.shape, dtype=chunked_array.dtype) + # Update the LD matrix object inplace: + ld_mat_obj.update_rows_inplace(csr_mat, start_row=start_row, end_row=end_row) - return chunked_array.chunk(chunks) + return ld_mat_obj -def estimate_row_chunk_size(rows, cols, dtype=np.float64, chunk_size=128): +def estimate_rows_per_chunk(rows, cols, dtype='int16', mem_size=128): """ - Estimate the chunk size for ragged arrays, given the number of rows, columns, and data type. + Estimate the number of rows per chunk for matrices conditional on the desired size of the chunk in MB. + The estimator takes as input the number of rows, columns, data type, and projected size of the chunk in memory. - :param rows: Number of rows. - :param cols: Number of columns. If a ragged array, provide average size of arrays - :param dtype: Data type - :param chunk_size: chunk size in MB + :param rows: Total number of rows in the matrix. + :param cols: Total number of columns. If sparse matrix with uneven columns, provide average column size. + :param dtype: The data type for the matrix entries. + :param mem_size: Size of the chunk in memory (MB) """ matrix_size = rows * cols * np.dtype(dtype).itemsize / 1024 ** 2 - n_chunks = matrix_size // chunk_size - - if n_chunks < 1: - return None, None - else: - return int(rows / n_chunks), None - - -def dense_zarr_array_to_ragged(z, - dir_store, - ld_boundaries, - rechunk=True, - delete_original=True): - """ - This function takes a dense chunked Zarr matrix - and, given an array of window sizes for each row, returns a sparse ragged array matrix. - This is a utility function that works with `dask` or `xarray` generated Linkage-Disequilibrium (LD) matrices - and aims to create compact LD matrices that are easier to manipulate and work with. - - :param z: The original LD matrix in Zarr format. - :param dir_store: The path to the new store where the sparse LD matrix will be stored. - :param ld_boundaries: The LD boundaries or window around each SNP. This is a 2xM array where - the first row contains the start and the second row contains the end of each window. - :param rechunk: If True, re-chunk the ragged array for optimized read/write performance. - :param delete_original: Delete the original store after creating the ragged array. 
- - """ - - avg_ncol = int((ld_boundaries[1, :] - ld_boundaries[0, :]).mean()) - - if rechunk: - n_chunks = estimate_row_chunk_size(z.shape[0], avg_ncol) - else: - n_chunks = z.chunks - - if avg_ncol == z.shape[0]: - z_rag = zarr.open(dir_store, - mode='w', - shape=z.shape, - chunks=n_chunks, - dtype=float) - else: - z_rag = zarr.open(dir_store, - mode='w', - shape=z.shape[0], - chunks=n_chunks[:1], - dtype=object, - object_codec=VLenArray(float)) - - chunk_size = z.chunks[0] - - for i in range(int(np.ceil(z.shape[0] / chunk_size))): - - start = i * chunk_size - end = min((i + 1) * chunk_size, z.shape[0]) - - z_chunk = z[start: end] - - z_rag_rows = [] - - for j in range(start, end): - z_rag_rows.append( - z_chunk[j - start][ld_boundaries[0, j]:ld_boundaries[1, j]] - ) - - if avg_ncol == z.shape[0]: - z_rag.oindex[np.arange(start, end)] = np.array(z_rag_rows) - else: - z_rag.oindex[np.arange(start, end)] = np.array(z_rag_rows + [None], dtype=object)[:-1] - - if delete_original: - delete_ld_store(z) - - return z_rag - - -def filter_zarr_array(z, - dir_store, - extract_snps, - ld_boundaries, - rechunk=True, - delete_original=False): - """ - This function takes a chunked Zarr matrix (dense or sparse LD matrix) - and, given a list of SNPs to extract, returns a filtered ragged array matrix. - - TODO: Optimize this for large chromosomes/LD matrices! - - :param z: the original Zarr matrix (implementation assumes 2D matrix) - :param dir_store: The path to the new Zarr matrix store - :param extract_snps: A list or vector of SNP IDs to keep. - :param ld_boundaries: The LD boundaries or window around each SNP. This is a 2xM array where - the first row contains the start and the second row contains the end of each window. - :param rechunk: If True, re-chunk the filtered array for optimized read/write performance. - :param delete_original: If True, delete the original store after transformation. - """ - - idx_map = pd.DataFrame({'SNP': extract_snps}).reset_index().merge( - pd.DataFrame({'SNP': z.attrs['SNP']}).reset_index(), - on='SNP', - suffixes=('_y', '_x') - ) - idx_map['chunk_x'] = (idx_map['index_x'] // z.chunks[0]).astype(int) - n_rows = len(extract_snps) - - idx_map['chunk_x'] = (idx_map['index_x'] // z.chunks[0]).astype(int) - - orig_bounds = np.array(z.attrs['LD boundaries']) - - avg_ncol = int((ld_boundaries[1, :] - ld_boundaries[0, :]).mean()) - - if rechunk: - n_chunks = estimate_row_chunk_size(n_rows, avg_ncol) - else: - n_chunks = z.chunks - - if avg_ncol == n_rows: - z_rag = zarr.open(dir_store, - mode='w', - shape=(n_rows, n_rows), - chunks=n_chunks, - dtype=float) - else: - z_rag = zarr.open(dir_store, - mode='w', - shape=n_rows, - chunks=n_chunks[:1], - dtype=object, - object_codec=VLenArray(float)) - - idx_x = idx_map['index_x'].values - chunk_size = z.chunks[0] - - for i in range(int(np.ceil(z.shape[0] / chunk_size))): - - start = i * chunk_size - end = min((i + 1) * chunk_size, z.shape[0]) - - z_chunk = z[start: end] - - z_rag_index = [] - z_rag_rows = [] - - for _, (k, _, j, _) in idx_map.loc[idx_map['chunk_x'] == i].iterrows(): + n_chunks = max(1, matrix_size // mem_size) - z_rag_index.append(k) - - # Find the index of SNPs in the original LD matrix that - # remain after matching with the `keep_snps` variable. 
- orig_idx = idx_x[(orig_bounds[0, j] <= idx_x) & (idx_x < orig_bounds[1, j])] - orig_bounds[0, j] - row_val = z_chunk[j - start][orig_idx] - - z_rag_rows.append(row_val) - - if len(z_rag_index) == 0: - continue - - if avg_ncol == n_rows: - z_rag.oindex[z_rag_index] = np.array(z_rag_rows) - else: - z_rag.oindex[z_rag_index] = np.array(z_rag_rows + [None], dtype=object)[:-1] - - z_rag.attrs.update(z.attrs.asdict()) - - # Update the attributes associated with the new matrix: - z_rag.attrs['LD boundaries'] = ld_boundaries.tolist() - - try: - z_rag.attrs['SNP'] = list(extract_snps) - except Exception: - pass - - try: - z_rag.attrs['BP'] = list(map(int, np.array(z.attrs['BP'])[idx_x])) - except Exception: - pass - - try: - z_rag.attrs['cM'] = list(map(float, np.array(z.attrs['cM'])[idx_x])) - except Exception: - pass - - try: - z_rag.attrs['MAF'] = list(map(float, np.array(z.attrs['MAF'])[idx_x])) - except Exception: - pass - - try: - z_rag.attrs['A1'] = list(np.array(z.attrs['A1'])[idx_x]) - except Exception: - pass - - try: - z_rag.attrs['A2'] = list(np.array(z.attrs['A2'])[idx_x]) - except Exception: - pass - - try: - z_rag.attrs['LDScore'] = list(map(float, np.array(z.attrs['LDScore'])[idx_x])) - except Exception: - pass - - if delete_original: - delete_ld_store(z) - - return z_rag + return rows // n_chunks def compute_ld_plink1p9(genotype_matrix, ld_boundaries, output_dir, - temp_dir='temp'): + temp_dir='temp', + overwrite=True, + dtype='int16', + compressor_name='lz4', + compression_level=5): - from magenpy.utils.executors import plink1Executor - from magenpy.GenotypeMatrix import plinkBEDGenotypeMatrix + """ + Compute LD matrices using plink 1.9. + + :param genotype_matrix: A plinkBEDGenotypeMatrix object + :param ld_boundaries: An array of LD boundaries for every SNP + :param output_dir: The output directory for the final LD matrix file (after processing). + :param temp_dir: A temporary directory to store intermediate files (e.g. files created for and by plink). + :param overwrite: If True, it overwrites any LD matrices in `output_dir`. + :param dtype: The data type for the entries of the LD matrix (supported data types are float32, float64 + and integer quantized data types int8 and int16). + :param compressor_name: The name of the compressor to use for the Zarr arrays. + :param compression_level: The compression level to use for the Zarr arrays (1-9). + """ + + from ...utils.executors import plink1Executor + from ...GenotypeMatrix import plinkBEDGenotypeMatrix assert isinstance(genotype_matrix, plinkBEDGenotypeMatrix) @@ -808,11 +292,11 @@ def compute_ld_plink1p9(genotype_matrix, plink_output = osp.join(temp_dir, f'chr_{str(genotype_matrix.chromosome)}') # Set the window sizes in various units: + # (1) Number of neighboring SNPs: window_size = (ld_boundaries - np.arange(genotype_matrix.m)).max() + 10 # (2) Kilobases: - positional_bounds = np.clip(np.array([ld_boundaries[0, :] - 1, ld_boundaries[1, :]]), a_min=0, a_max=ld_boundaries.shape[1] - 1) @@ -842,47 +326,99 @@ def compute_ld_plink1p9(genotype_matrix, if cm_window_size is not None: cmd.append(f"--ld-window-cm {cm_window_size}") + # --------------------------------------------------------- + # Test if plink1.9 version is compatible with setting the --ld-window-r2 flag: + # This is important to account for due to differences in the behavior of plink1.9 + # across different versions. 
+ # See here for discussion of this behavior: https://github.com/shz9/viprs/issues/3 + + plink1.verbose = False + + r2_flag_compatible = True + + from subprocess import CalledProcessError + + try: + plink1.execute(["--r gz", "--ld-window-r2 0"]) + except CalledProcessError as e: + if "--ld-window-r2 flag cannot be used with --r" in e.stderr.decode(): + r2_flag_compatible = False + + if r2_flag_compatible: + cmd += ["--ld-window-r2 0"] + + plink1.verbose = True + + # --------------------------------------------------------- + plink1.execute(cmd) # Convert from PLINK LD files to Zarr: - fin_ld_store = osp.join(output_dir, 'ld', 'chr_' + - str(genotype_matrix.chromosome)) + fin_ld_store = osp.join(output_dir, 'ld', 'chr_' + str(genotype_matrix.chromosome)) - z_ld_mat = from_plink_ld_table_to_zarr_chunked(f"{plink_output}.ld.gz", - fin_ld_store, - ld_boundaries, - genotype_matrix.snps) + # Compute the pandas chunk_size + # The goal of this is to process chunks of the LD table without overwhelming memory resources: + avg_ncols = int((ld_boundaries[1, :] - ld_boundaries[0, :]).mean()) + rows_per_chunk = estimate_rows_per_chunk(ld_boundaries.shape[1], avg_ncols, dtype=dtype) - return z_ld_mat + if rows_per_chunk > 0.1*ld_boundaries.shape[1]: + pandas_chunksize = None + else: + pandas_chunksize = rows_per_chunk*avg_ncols // 2 + + return LDMatrix.from_plink_table(f"{plink_output}.ld.gz", + genotype_matrix.snps, + fin_ld_store, + pandas_chunksize=pandas_chunksize, + overwrite=overwrite, + dtype=dtype, + compressor_name=compressor_name, + compression_level=compression_level) def compute_ld_xarray(genotype_matrix, ld_boundaries, output_dir, - temp_dir='temp'): + temp_dir='temp', + overwrite=True, + delete_original=True, + dtype='int16', + compressor_name='lz4', + compression_level=5): + """ Compute the Linkage Disequilibrium matrix or snp-by-snp correlation matrix assuming that the genotypes are represented - by `xarray` or `dask`-like matrix. This function computes the - entire X'X/N and stores the result in Zarr arrays. - To create sparse matrices out of this, consult the - LD estimators and their implementations. + by `xarray` or `dask`-like matrix objects. This function computes the + entire X'X/N and stores the result on-disk in Zarr arrays. Then, we call the utilities + from the `LDMatrix` class to sparsify the dense matrix according to the parameters + specified by the `ld_boundaries` matrix. NOTE: We don't recommend using this for large-scale genotype matrices. - Use `compute_ld_plink` instead if you have plink installed on your system. + Use `compute_ld_plink1p9` instead if you have plink installed on your system. :param genotype_matrix: An `xarrayGenotypeMatrix` object + :param ld_boundaries: An array of LD boundaries for every SNP + :param output_dir: The output directory for the final LD matrix file. :param temp_dir: A temporary directory where to store intermediate results. - + :param overwrite: If True, overwrites LD matrices in `temp_dir` and `output_dir`, if they exist. + :param delete_original: If True, it deletes the original dense matrix after generating the sparse alternative. + :param dtype: The data type for the entries of the LD matrix (supported data types are float32, float64 + and integer quantized data types int8 and int16). + :param compressor_name: The name of the compressor to use for the Zarr arrays. + :param compression_level: The compression level to use for the Zarr arrays (1-9). 
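+
+    A sketch of the core computation performed here (illustration only, with a plain
+    dask array standing in for the standardized genotype matrix and a placeholder store path):
+
+    ```python
+    import dask.array as da
+    import numpy as np
+
+    X = da.random.random((500, 1000), chunks=(500, 256))  # N x M standardized genotypes
+    R = (da.dot(X.T, X) / X.shape[0]).astype(np.float32)  # M x M SNP-by-SNP correlations
+    R.to_zarr("temp_dense_ld", overwrite=True)            # dense matrix stored as Zarr
+    ```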
""" - from magenpy.GenotypeMatrix import xarrayGenotypeMatrix + from ...GenotypeMatrix import xarrayGenotypeMatrix + assert isinstance(genotype_matrix, xarrayGenotypeMatrix) g_data = genotype_matrix.xr_mat - # Re-chunk the array - g_data = g_data.chunk((min(1024, g_data.shape[0]), min(1024, g_data.shape[1]))) + # Re-chunk the array to optimize computational speed and efficiency: + # New chunksizes: + new_chunksizes = (min(1024, g_data.shape[0]), min(1024, g_data.shape[1])) + g_data = g_data.chunk(dict(zip(g_data.dims, new_chunksizes))) from ..transforms.genotype import standardize import dask.array as da @@ -890,27 +426,32 @@ def compute_ld_xarray(genotype_matrix, # Standardize the genotype matrix and fill missing data with zeros: g_mat = standardize(g_data).data - # Compute the LD matrix: - ld_mat = (da.dot(g_mat.T, g_mat) / genotype_matrix.sample_size).astype(np.float64) - ld_mat.to_zarr(temp_dir, overwrite=True) - - z_ld_mat = zarr.open(temp_dir) - z_ld_mat = rechunk_zarr(z_ld_mat, - ld_mat.rechunk({0: 'auto', 1: None}).chunksize, - temp_dir + '_rechunked', - temp_dir + '_intermediate') - - fin_ld_store = osp.join(output_dir, 'ld', 'chr_' + - str(genotype_matrix.chromosome)) - - # If the matrix is sparse/thresholded, then convert to a ragged zarr array: - if (ld_boundaries[1, :] - ld_boundaries[0, :]).min() < genotype_matrix.n_snps: + # Compute the full LD matrix and store to a temporary directory in the form of Zarr arrays: + import warnings - z_ld_mat = dense_zarr_array_to_ragged(z_ld_mat, - fin_ld_store, - ld_boundaries) + # Ignore performance-related warnings from Dask: + with warnings.catch_warnings(): - else: - z_ld_mat = move_ld_store(z_ld_mat, fin_ld_store) - - return z_ld_mat + if np.issubdtype(np.dtype(dtype), np.integer): + # If the requested data type is integer, we need to convert + # the data to `float32` to avoid overflow errors when computing the dot product: + dot_dtype = np.float32 + else: + dot_dtype = dtype + + warnings.simplefilter("ignore") + ld_mat = (da.dot(g_mat.T, g_mat) / genotype_matrix.sample_size).astype(dot_dtype) + ld_mat.to_zarr(temp_dir, overwrite=overwrite) + + fin_ld_store = osp.join(output_dir, 'ld', 'chr_' + str(genotype_matrix.chromosome)) + + # Load the dense matrix and transform it to a sparse matrix using utilities implemented in the + # `LDMatrix` class: + return LDMatrix.from_dense_zarr_matrix(temp_dir, + ld_boundaries, + fin_ld_store, + overwrite=overwrite, + delete_original=delete_original, + dtype=dtype, + compressor_name=compressor_name, + compression_level=compression_level) diff --git a/magenpy/stats/score/score.hpp b/magenpy/stats/score/score.hpp new file mode 100644 index 0000000..db06a30 --- /dev/null +++ b/magenpy/stats/score/score.hpp @@ -0,0 +1,267 @@ +#ifndef SCORE_H +#define SCORE_H + +#include +#include +#include +#include + +// Check for and include `cblas`: +#ifdef HAVE_CBLAS + #include +#endif + +// Check for and include `omp`: +#ifdef _OPENMP + #include +#endif + +/* ----------------------------- */ +bool omp_supported() { + #ifdef _OPENMP + return true; + #else + return false; + #endif +} + +bool blas_supported() { + #ifdef HAVE_CBLAS + return true; + #else + return false; + #endif +} + +template +void axpy(T* x, T* y, T a, int size) { + for (int i = 0; i < size; ++i) { + x[i] += y[i] * a; + } +} + +template +void blas_axpy(T *y, T *x, T alpha, int size) { + + #ifdef HAVE_CBLAS + int incx = 1; + int incy = 1; + + if constexpr (std::is_same::value) { + cblas_saxpy(size, alpha, x, incx, y, incy); + } + else { + 
cblas_daxpy(size, alpha, x, incx, y, incy); + } + #else + axpy(y, x, alpha, size); + #endif +} + + +template +void calculate_scores(std::string bed_filename, + T* effect_sizes, + int* snp_indices, + int* sample_indices, + int num_samples, + int num_snps, + int num_scores, + T* scores, + int threads) { + + // ---------------------------------------------------- + // Check if file is a valid PLINK BED file + std::ifstream initial_file(bed_filename, std::ios::binary); + if (!initial_file.is_open()) { + throw std::runtime_error("Error opening BED file."); + } + char magic_number[3]; + initial_file.read(magic_number, 3); + if (magic_number[0] != '\x6C' || magic_number[1] != '\x1B' || magic_number[2] != '\x01') { + throw std::runtime_error("Invalid PLINK BED file."); + } + initial_file.close(); + // ---------------------------------------------------- + + T* local_scores = scores; + bool use_local_scores = false; + + #ifdef _OPENMP + #pragma omp parallel num_threads(threads) + #endif + { + #ifdef _OPENMP + if (omp_get_num_threads() > 1) { + local_scores = new T[num_samples * num_scores]; + use_local_scores = true; + } + #endif + + //Open a separate file stream for each thread + std::ifstream bed_file(bed_filename, std::ios::binary); + + #ifdef _OPENMP + #pragma omp for schedule(runtime) + #endif + for (size_t i = 0; i < num_snps; ++i) { + + int snp_index = snp_indices[i]; + + bed_file.seekg(3 + snp_index * ((num_samples + 3) / 4), std::ios::beg); + size_t j = 0; + size_t sample_counter = 0; + + while (sample_counter < num_samples) { + unsigned char buffer; + bed_file.read(reinterpret_cast(&buffer), 1); + + for (int b = 0; b < 4 && sample_counter < num_samples; ++b, ++j) { + + int sample_index = sample_indices[sample_counter]; + + if (j == sample_index) { + int genotype = (buffer >> (b * 2)) & 0x3; + if (genotype != 1) { // Ignore missing genotypes + + T decoded_genotype = static_cast(genotype); + + if (genotype > 0) { + decoded_genotype = abs(genotype - 3); + } + else { + decoded_genotype += 2; + } + + blas_axpy(local_scores + sample_index * num_scores, + effect_sizes + snp_index * num_scores, + decoded_genotype, + num_scores); + } + sample_counter++; + } + } + } + } + + // Close the file stream for each thread + bed_file.close(); + + /* If multiple threads are used, add the local scores to the global scores + in a critical section. */ + #ifdef _OPENMP + if (use_local_scores) { + #pragma omp critical + { + for (size_t i = 0; i < num_samples; ++i) { + for (size_t j = 0; j < num_scores; ++j) { + scores[i * num_scores + j] += local_scores[i * num_scores + j]; + } + } + } + delete [] local_scores; + } + #endif + } + +} + +/* + +Explore producer-consumer style implementation for reading the bed file. 
+
+#include <queue>
+#include <mutex>
+#include <condition_variable>
+
+template <typename T>
+class ThreadSafeQueue {
+private:
+    std::queue<T*> queue_;
+    std::queue<int> index_queue_;
+    std::mutex mutex_;
+    std::condition_variable cond_;
+
+public:
+    void push(T* value, int index) {
+        std::lock_guard<std::mutex> lock(mutex_);
+        queue_.push(std::move(value));
+        index_queue_.push(index);
+        cond_.notify_one();
+    }
+
+    T* pop() {
+        std::unique_lock<std::mutex> lock(mutex_);
+        cond_.wait(lock, [this]{ return !queue_.empty(); });
+        T* value = std::move(queue_.front());
+        queue_.pop();
+        return value;
+    }
+
+    int pop_index() {
+        std::unique_lock<std::mutex> lock(mutex_);
+        cond_.wait(lock, [this]{ return !index_queue_.empty(); });
+        int value = index_queue_.front();
+        index_queue_.pop();
+        return value;
+    }
+
+};
+
+void reader(ThreadSafeQueue<T>& queue, const std::string& bed_filename, int num_samples, int num_snps) {
+    std::ifstream bed_file(bed_filename, std::ios::binary);
+    for (size_t i = 0; i < num_snps; ++i) {
+        T* snp_entries = new T[num_samples];
+        bed_file.seekg(3 + i * ((num_samples + 3) / 4), std::ios::beg);
+        size_t j = 0;
+        size_t sample_counter = 0;
+        while (sample_counter < num_samples) {
+            unsigned char buffer;
+            bed_file.read(reinterpret_cast<char*>(&buffer), 1);
+            for (int b = 0; b < 4 && sample_counter < num_samples; ++b, ++j) {
+                int genotype = (buffer >> (b * 2)) & 0x3;
+                if (genotype != 1) { // Ignore missing genotypes
+                    snp_entries[sample_counter] = static_cast<T>(genotype);
+                    sample_counter++;
+                }
+            }
+        }
+        queue.push(snp_entries, i);
+    }
+    bed_file.close();
+}
+
+void reader(ThreadSafeQueue<std::vector<T>>& queue, const std::vector<int>& snp_indices) {
+    for (int snp_index : snp_indices) {
+        std::vector<T> snp_entries = // read SNP entries for snp_index
+        queue.push(std::move(snp_entries));
+    }
+}
+
+void worker(ThreadSafeQueue<std::vector<T>>& queue, int num_samples, int num_snps, int num_scores, T* effect_sizes, T* scores) {
+    while (num_snps > 0) {
+
+        std::vector<T> snp_entries = queue.pop();
+        int snp_index = queue.pop_index();
+
+        for (size_t i = 0; i < num_samples; ++i) {
+            for (size_t j = 0; j < num_scores; ++j) {
+                scores[i * num_scores + j] += snp_entries[i] * effect_sizes[snp_index * num_scores + j];
+            }
+        }
+
+        num_snps--;
+    }
+}
+
+ThreadSafeQueue<std::vector<T>> queue;
+
+std::thread reader_thread(reader, std::ref(queue), snp_indices);
+std::thread worker_thread(worker, std::ref(queue));
+
+reader_thread.join();
+worker_thread.join();
+
+*/
+
+#endif // SCORE_H
diff --git a/magenpy/stats/score/score_cpp.pyx b/magenpy/stats/score/score_cpp.pyx
new file mode 100644
index 0000000..ef63196
--- /dev/null
+++ b/magenpy/stats/score/score_cpp.pyx
@@ -0,0 +1,63 @@
+# distutils: language = c++
+# sources: stats/score/score.hpp
+
+from libcpp.string cimport string
+from cython cimport floating
+import numpy as np
+
+#string bed_filename,
+
+cdef extern from "score.hpp" nogil:
+    bint blas_supported() noexcept nogil
+    bint omp_supported() noexcept nogil
+
+    void calculate_scores[T](string bed_filename,
+                             T* effect_sizes,
+                             int* snp_indices,
+                             int* sample_indices,
+                             int num_samples,
+                             int num_snps,
+                             int num_scores,
+                             T* scores,
+                             int threads) noexcept nogil
+
+
+cpdef calculate_pgs(bed_filename,
+                    double[:, ::1] effect_sizes,  #floating[:, ::1] effect_sizes,
+                    int[:] snp_indices,
+                    int[:] sample_indices,  #int[:] sample_indices,
+                    int threads):
+
+    """
+    Calculate polygenic scores for a set of SNPs and samples using a custom C++
+    routine written for speed and efficiency.
+
+    NOTE: Assumes SNP and sample indices are sorted!
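+
+    A hypothetical invocation (the BED file path is a placeholder; array dtypes must
+    match the typed memoryviews in the signature above):
+
+    ```python
+    import numpy as np
+
+    betas = np.random.normal(size=(100, 2))      # float64, C-contiguous: 100 SNPs x 2 score sets
+    snp_idx = np.arange(100, dtype=np.int32)     # sorted SNP indices
+    sample_idx = np.arange(378, dtype=np.int32)  # sorted sample indices
+
+    pgs = calculate_pgs("genotypes.bed", betas, snp_idx, sample_idx, threads=4)
+    ```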
+ + :param bed_filename: Path to PLINK BED file + :param effect_sizes: Numpy array or matrix of effect sizes for each SNP + :param snp_indices: Numpy array of SNP indices to use for PGS calculation + :param sample_indices: Numpy array of sample indices to use for PGS calculation + :param threads: Number of threads to use for PGS calculation + """ + + cdef: + string c_bed_filename = bed_filename.encode() + double[:, ::1] pgs = np.zeros((sample_indices.shape[0], effect_sizes.shape[1])) + + #if floating is float: + # pgs = np.zeros((sample_indices.shape[0], effect_sizes.shape[1]), dtype=np.float32) + #else: + # pgs = np.zeros((sample_indices.shape[0], effect_sizes.shape[1]), dtype=np.float64) + + calculate_scores(c_bed_filename, + &effect_sizes[0, 0], + &snp_indices[0], + &sample_indices[0], + sample_indices.shape[0], + snp_indices.shape[0], + effect_sizes.shape[1], + &pgs[0, 0], + threads) + + return np.array(pgs) diff --git a/magenpy/stats/score/utils.py b/magenpy/stats/score/utils.py index d7609e6..cd073c4 100644 --- a/magenpy/stats/score/utils.py +++ b/magenpy/stats/score/utils.py @@ -7,9 +7,32 @@ def score_plink2(genotype_matrix, betas, standardize_genotype=False, temp_dir='temp'): + """ + Perform linear scoring using PLINK2. + This function takes a genotype matrix object encapsulating and referencing + plink BED files as well as a matrix of effect sizes (betas) and performs + linear scoring of the form: - from magenpy.GenotypeMatrix import plinkBEDGenotypeMatrix - from magenpy.utils.executors import plink2Executor + y = X * betas + + This is useful for computing polygenic scores (PGS). The function supports + a matrix of `beta` values, in which case the function returns a matrix of + PGS values, one for each column of `beta`. For example, if there are 10 sets + of betas, the function will compute 10 polygenic scores for each individual represented + in the genotype matrix `X`. + + :param genotype_matrix: An instance of `plinkBEDGenotypeMatrix`. + :param betas: A matrix of effect sizes (betas). + :param standardize_genotype: If True, standardize the genotype to have mean zero and unit variance + before scoring. + :param temp_dir: The directory where the temporary files will be stored. + + :return: A numpy array of polygenic scores. + + """ + + from ...GenotypeMatrix import plinkBEDGenotypeMatrix + from ...utils.executors import plink2Executor assert isinstance(genotype_matrix, plinkBEDGenotypeMatrix) @@ -23,7 +46,7 @@ def score_plink2(genotype_matrix, except IndexError: betas_shape = 1 betas = betas.reshape(-1, 1) - score_col_nums = f"--score-col-nums 3" + score_col_nums = "--score-col-nums 3" # Create the samples file: @@ -70,7 +93,7 @@ def score_plink2(genotype_matrix, dtypes.update({'PRS' + str(i): np.float64}) chr_pgs = pd.read_csv(output_file + '.sscore', - delim_whitespace=True, + sep=r'\s+', names=['FID', 'IID'] + ['PRS' + str(i) for i in range(betas_shape)], skiprows=1, usecols=[0, 1] + [4 + betas_shape + i for i in range(betas_shape)], diff --git a/magenpy/stats/transforms/genotype.py b/magenpy/stats/transforms/genotype.py index 58b8875..33f60ad 100644 --- a/magenpy/stats/transforms/genotype.py +++ b/magenpy/stats/transforms/genotype.py @@ -3,8 +3,12 @@ def standardize(g_mat, fill_na=True): """ Standardize the genotype matrix, such that the columns (i.e. snps) have zero mean and unit variance. - :param g_mat: A two dimensional matrix (numpy, dask, xarray, etc.) + :param g_mat: A two-dimensional matrix (numpy, dask, xarray, etc.) 
where the rows are samples (individuals) + and the columns are genetic variants. :param fill_na: If true, fill the missing values with zero after standardizing. + + :return: The standardized genotype matrix. + """ sg_mat = (g_mat - g_mat.mean(axis=0)) / g_mat.std(axis=0) diff --git a/magenpy/stats/transforms/phenotype.py b/magenpy/stats/transforms/phenotype.py index b6c896a..038a6eb 100644 --- a/magenpy/stats/transforms/phenotype.py +++ b/magenpy/stats/transforms/phenotype.py @@ -11,6 +11,8 @@ def adjust_for_covariates(phenotype, covariates): :param phenotype: A vector of continuous or quantitative phenotypes. :param covariates: A matrix where each row corresponds to an individual and each column corresponds to a covariate (e.g. age, sex, PCs, etc.) + + :return: The residuals of the linear model fit. """ import statsmodels.api as sm @@ -23,6 +25,8 @@ def rint(phenotype, offset=3./8): Apply Rank-based inverse normal transform on the phenotype. :param phenotype: A vector of continuous or quantitative phenotypes. :param offset: The offset to use in the INT transformation (Blom's offset by default). + + :return: The RINT-transformed phenotype. """ from scipy.stats import rankdata, norm @@ -31,15 +35,18 @@ def rint(phenotype, offset=3./8): return norm.ppf((ranked_pheno - offset) / (len(ranked_pheno) - 2 * offset + 1)) -def find_outliers(phenotype, sigma_threshold=5): +def detect_outliers(phenotype, sigma_threshold=5): """ Detect samples with outlier phenotype values. This function takes a vector of quantitative phenotypes, computes the z-score for every individual, and returns a boolean vector indicating whether individual i has phenotype value within the specified standard deviations `sigma_threshold`. + :param phenotype: A numpy vector of continuous or quantitative phenotypes. :param sigma_threshold: The multiple of standard deviations or sigmas after which we consider the phenotypic value an outlier. + + :return: A boolean array indicating whether the phenotype value is an outlier. """ from scipy.stats import zscore return np.abs(zscore(phenotype)) < sigma_threshold @@ -49,5 +56,61 @@ def standardize(phenotype): """ Standardize the phenotype vector to have mean zero and unit variance :param phenotype: A numpy vector of continuous or quantitative phenotypes. + + :return: The standardized phenotype array. """ return (phenotype - phenotype.mean()) / phenotype.std() + + +def chained_transform(sample_table, + adjust_covariates=False, + standardize_phenotype=False, + rint_phenotype=False, + outlier_sigma_threshold=None, + transform_order=('standardize', 'covariate_adjust', 'rint', 'outlier_removal')): + """ + Apply a chain of transformations to the phenotype vector. + :param sample_table: An instance of SampleTable that contains phenotype information and other + covariates about the samples in the dataset. + :param adjust_covariates: If true, regress out the covariates from the phenotype. By default, we regress out all + the covariates present in the SampleTable. + :param standardize_phenotype: If true, standardize the phenotype. + :param rint_phenotype: If true, apply Rank-based inverse normal transform. + :param outlier_sigma_threshold: The multiple of standard deviations or sigmas after + which we consider the phenotypic value an outlier. + :param transform_order: A tuple specifying the order in which to apply the transformations. By default, + the order is standardize, covariate_adjust, rint, and outlier_removal. 
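+
+    A hypothetical usage sketch (assumes `s_table` is a `SampleTable` with a quantitative phenotype):
+
+    ```python
+    phenotype, mask = chained_transform(s_table,
+                                        standardize_phenotype=True,
+                                        rint_phenotype=True,
+                                        outlier_sigma_threshold=5)
+    ```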
+
+    :return: The transformed phenotype vector and a boolean mask indicating which samples
+    were retained (samples removed as outliers are marked False).
+    """
+
+    phenotype = sample_table.phenotype
+    mask = np.ones_like(phenotype, dtype=bool)
+
+    if sample_table.phenotype_likelihood != 'binomial':
+        for transform in transform_order:
+
+            if transform == 'standardize':
+                # Standardize the phenotype:
+                if standardize_phenotype:
+                    phenotype = standardize(phenotype)
+
+            elif transform == 'covariate_adjust':
+                # Adjust the phenotype for a set of covariates:
+                if adjust_covariates:
+                    phenotype = adjust_for_covariates(phenotype, sample_table.get_covariates()[mask, :])
+
+            elif transform == 'rint':
+                # Apply Rank-based inverse normal transform (RINT) to the phenotype:
+                if rint_phenotype:
+                    phenotype = rint(phenotype)
+
+            elif transform == 'outlier_removal':
+                # Remove outlier samples whose phenotypes are more than
+                # `outlier_sigma_threshold` standard deviations from the mean:
+                if outlier_sigma_threshold is not None:
+                    # Compute a mask of samples whose phenotypes fall within the threshold:
+                    mask = detect_outliers(phenotype, outlier_sigma_threshold)
+                    # Filter the phenotype vector accordingly:
+                    phenotype = phenotype[mask]
+
+    return phenotype, mask
diff --git a/magenpy/stats/variant/utils.py b/magenpy/stats/variant/utils.py
index 454cc51..35014bc 100644
--- a/magenpy/stats/variant/utils.py
+++ b/magenpy/stats/variant/utils.py
@@ -1,11 +1,19 @@
 import os.path as osp
 import pandas as pd
-from magenpy.utils.executors import plink2Executor
-from magenpy.GenotypeMatrix import plinkBEDGenotypeMatrix
-from magenpy.utils.model_utils import merge_snp_tables
+from ...utils.executors import plink2Executor
+from ...GenotypeMatrix import plinkBEDGenotypeMatrix
+from ...utils.model_utils import merge_snp_tables
 
 
 def compute_allele_frequency_plink2(genotype_matrix, temp_dir='temp'):
+    """
+    Compute the allele frequency for each SNP in the genotype matrix using PLINK2.
+    :param genotype_matrix: A GenotypeMatrix object.
+    :param temp_dir: The temporary directory in which to store intermediate files.
+
+    :return: A numpy array of allele frequencies.
+
+    """
 
     assert isinstance(genotype_matrix, plinkBEDGenotypeMatrix)
 
@@ -34,7 +42,7 @@ def compute_allele_frequency_plink2(genotype_matrix, temp_dir='temp'):
 
     plink2.execute(cmd)
 
-    freq_df = pd.read_csv(plink_output + ".afreq", delim_whitespace=True)
+    freq_df = pd.read_csv(plink_output + ".afreq", sep=r'\s+')
     freq_df.rename(columns={'ID': 'SNP',
                             'REF': 'A2',
                             'ALT': 'A1',
                             'ALT1': 'A1',
@@ -48,6 +56,13 @@ def compute_allele_frequency_plink2(genotype_matrix, temp_dir='temp'):
 
 
 def compute_sample_size_per_snp_plink2(genotype_matrix, temp_dir='temp'):
+    """
+    Compute the sample size per SNP in the genotype matrix using PLINK2.
+    :param genotype_matrix: A GenotypeMatrix object.
+    :param temp_dir: The temporary directory in which to store intermediate files.
+
+    :return: A numpy array of sample sizes per SNP.
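+
+    A minimal usage sketch (illustrative; assumes `gdl` is a GWADataLoader backed
+    by PLINK BED files, with genotype matrices keyed by chromosome):
+
+    >>> n_per_snp = compute_sample_size_per_snp_plink2(gdl.genotype[22])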
+ """ assert isinstance(genotype_matrix, plinkBEDGenotypeMatrix) @@ -76,7 +91,7 @@ def compute_sample_size_per_snp_plink2(genotype_matrix, temp_dir='temp'): plink2.execute(cmd) - miss_df = pd.read_csv(plink_output + ".vmiss", delim_whitespace=True) + miss_df = pd.read_csv(plink_output + ".vmiss", sep=r'\s+') miss_df = pd.DataFrame({'ID': genotype_matrix.snps}).merge(miss_df) if len(miss_df) != genotype_matrix.n_snps: diff --git a/magenpy/utils/compute_utils.py b/magenpy/utils/compute_utils.py index e712359..2fa6884 100644 --- a/magenpy/utils/compute_utils.py +++ b/magenpy/utils/compute_utils.py @@ -1,7 +1,5 @@ import numpy as np import pandas as pd -import collections -import six def generate_slice_dictionary(vec): @@ -39,7 +37,6 @@ def intersect_arrays(arr1, arr2, return_index=False): :param arr1: The first array :param arr2: The second array :param return_index: Return the index of shared elements in the first array - :return: """ # NOTE: For best and consistent results, we cast all data types to `str` @@ -55,7 +52,15 @@ def intersect_arrays(arr1, arr2, return_index=False): def iterable(arg): + """ + Check if an object is iterable (but not a string). + :param arg: A python object. + :return: True if the object is iterable, False otherwise. + """ + + import collections.abc + return ( - isinstance(arg, collections.Iterable) - and not isinstance(arg, six.string_types) + isinstance(arg, collections.abc.Iterable) + and not isinstance(arg, str) ) diff --git a/magenpy/utils/data_utils.py b/magenpy/utils/data_utils.py index 20719f0..2f0b4dc 100644 --- a/magenpy/utils/data_utils.py +++ b/magenpy/utils/data_utils.py @@ -3,13 +3,13 @@ def tgp_eur_data_path(): """ - Return the path of the attached 1000G sample data for - European individuals (N=378) and chromosome 22 (p=15938) + Return the path of the attached 1000G genotype data for + European samples (N=378) and a subset of chromosome 22 (p=15938) """ return osp.join(osp.dirname(osp.dirname(__file__)), 'data/1000G_eur_chr22') -def ukb_height_fastGWA_path(): +def ukb_height_sumstats_path(): """ Return the path of the attached GWAS summary statistics file for standing height. The file contains summary statistics for diff --git a/magenpy/utils/executors.py b/magenpy/utils/executors.py index d11aebb..1fd7f81 100644 --- a/magenpy/utils/executors.py +++ b/magenpy/utils/executors.py @@ -3,8 +3,19 @@ class plink2Executor(object): - - def __init__(self, threads='auto'): + """ + A wrapper class for interfacing with the `plink2` command line tool. + """ + + def __init__(self, threads='auto', verbose=True): + """ + Initialize the plink2 executor + :param threads: The number of threads to use for computations. If set to 'auto', the number of + available CPUs will be used. + :type threads: int or str + :param verbose: Whether to print the output of the command + :type verbose: bool + """ if threads == 'auto': self.threads = available_cpu() @@ -16,7 +27,14 @@ def __init__(self, threads='auto'): if not is_cmd_tool(self.plink2_path): raise Exception(f"Did not find the executable for plink2 at: {self.plink2_path}") + self.verbose = verbose + def execute(self, cmd): + """ + Execute a `plink2` command + :param cmd: The flags to pass to plink2. 
For example, ['--bfile', 'file', '--out', 'output'] + :type cmd: list of strings + """ cmd = [self.plink2_path] + cmd + [f'--threads {self.threads}'] @@ -25,13 +43,28 @@ def execute(self, cmd): try: run_shell_script(" ".join(cmd)) except CalledProcessError as e: - print("Invocation of plink2 returned the following error message:") - print(e.stderr.decode()) + if self.verbose: + print("Invocation of plink2 returned the following error message:") + print(e.stderr.decode()) -class plink1Executor(object): + raise e - def __init__(self, threads='auto'): + +class plink1Executor(object): + """ + A wrapper class for interfacing with the `plink1.9` command line tool. + """ + + def __init__(self, threads='auto', verbose=True): + """ + Initialize the plink1.9 executor + :param threads: The number of threads to use for computations. If set to 'auto', the number of + available CPUs will be used. + :type threads: int or str + :param verbose: Whether to print the output of the command + :type verbose: bool + """ if threads == 'auto': self.threads = available_cpu() @@ -43,7 +76,14 @@ def __init__(self, threads='auto'): if not is_cmd_tool(self.plink1_path): raise Exception(f"Did not find the executable for plink at: {self.plink1_path}") + self.verbose = verbose + def execute(self, cmd): + """ + Execute a plink command + :param cmd: The flags to pass to plink. For example, ['--bfile', 'file', '--out', 'output'] + :type cmd: list of strings + """ cmd = [self.plink1_path] + cmd + [f'--threads {self.threads}'] @@ -52,5 +92,7 @@ def execute(self, cmd): try: run_shell_script(" ".join(cmd)) except CalledProcessError as e: - print("Invocation of plink returned the following error message:") - print(e.stderr.decode()) + if self.verbose: + print("Invocation of plink returned the following error message:") + print(e.stderr.decode()) + raise e diff --git a/magenpy/utils/model_utils.py b/magenpy/utils/model_utils.py index a8d7a02..360c0ca 100644 --- a/magenpy/utils/model_utils.py +++ b/magenpy/utils/model_utils.py @@ -1,15 +1,63 @@ from tqdm import tqdm import numpy as np +import pandas as pd from scipy import stats +def match_chromosomes(chrom_1, chrom_2, check_patterns=('chr_', 'chr:', 'chr'), return_both=False): + """ + Given two lists of chromosome IDs, this function returns the + chromosomes that are common to both lists. By default, the returned chromosomes + follow the data type and order of the first list. If `return_both` is set to True, + the function returns the common chromosomes in both lists. + + The function also accounts for common ways to encode chromosomes, such as + chr18, chr_18, 18, etc. 
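+
+    For example (an illustrative sketch of the default behavior):
+
+    >>> match_chromosomes(['chr1', 'chr2', 'chrX'], [1, 2, 22])  # -> ['chr1', 'chr2']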
+
+    :param chrom_1: A list or numpy array of chromosome IDs
+    :param chrom_2: A list or numpy array of chromosome IDs
+    :param check_patterns: A list of patterns to check for and strip from the chromosome IDs
+    :param return_both: If True, return the common chromosomes in both lists
+    """
+
+    chrom_1 = np.array(list(chrom_1))
+    chrom_2 = np.array(list(chrom_2))
+
+    # First, convert the data types to strings:
+    chr1_str = chrom_1.astype(str)
+    chr2_str = chrom_2.astype(str)
+
+    _, chr1_idx, chr2_idx = np.intersect1d(chr1_str, chr2_str, return_indices=True)
+
+    if len(chr1_idx) < 1:
+        # Strip the patterns from the chromosome IDs and try again:
+        for pattern in check_patterns:
+            chr1_str = np.char.replace(chr1_str, pattern, '')
+            chr2_str = np.char.replace(chr2_str, pattern, '')
+
+        _, chr1_idx, chr2_idx = np.intersect1d(chr1_str, chr2_str, return_indices=True)
+
+    if len(chr1_idx) < 1:
+        if return_both:
+            return [], []
+        else:
+            return []
+    else:
+        if return_both:
+            return chrom_1[chr1_idx], chrom_2[chr2_idx]
+        else:
+            return chrom_1[chr1_idx]
+
+
 def merge_snp_tables(ref_table,
                      alt_table,
                      how='inner',
                      on='auto',
                      signed_statistics=('BETA', 'STD_BETA', 'Z'),
                      drop_duplicates=True,
-                     correct_flips=True):
+                     correct_flips=True,
+                     return_ref_indices=False,
+                     return_alt_indices=False):
     """
     This function takes a reference SNP table with at least 3 columns ('SNP', 'A1', `A2`)
     and matches it with an alternative table that also has these 3 columns defined. In the most recent
@@ -20,7 +68,7 @@ def merge_snp_tables(ref_table,
     The manner in which the join operation takes place depends on the `how` argument.
     Currently, the function supports `inner` and `left` joins.
 
-    The function removes duplicates if `drop_dupicates` parameter is set to True
+    The function removes duplicates if the `drop_duplicates` parameter is set to True.
 
     If `correct_flips` is set to True, the function will correct summary statistics in
     the alternative table `alt_table` (e.g. BETA, MAF) based whether the A1 alleles agree between the two tables.
@@ -34,9 +82,16 @@
     :param signed_statistics: The columns with signed statistics to flip if `correct_flips` is set to True.
     :param drop_duplicates: Drop duplicate SNPs
     :param correct_flips: Correct SNP summary statistics that depend on status of alternative allele
+    :param return_ref_indices: If True, return the indices (in the original reference table) of the
+    entries retained after merging.
+    :param return_alt_indices: If True, return the indices (in the original alternative table) of the
+    entries retained after merging.
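+
+    A minimal usage sketch (illustrative; `ref` and `alt` are hypothetical pandas
+    DataFrames, each with at least the `SNP`, `A1` and `A2` columns):
+
+    >>> merged = merge_snp_tables(ref, alt, how='inner', correct_flips=True)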
""" + # Sanity checking steps: assert how in ('left', 'inner') + for tab in (ref_table, alt_table): + assert isinstance(tab, pd.DataFrame) + if not all([col in tab.columns for col in ('A1', 'A2')]): + raise ValueError("To merge SNP tables, we require that the columns `A1` and `A2` are present.") if on == 'auto': # Check that the `SNP` column is present in both tables: @@ -50,7 +105,15 @@ def merge_snp_tables(ref_table, elif isinstance(on, str): on = [on] - merged_table = ref_table[on + ['A1', 'A2']].merge(alt_table, how=how, on=on) + ref_include = on + ['A1', 'A2'] + + if return_ref_indices: + ref_table.reset_index(inplace=True, names='REF_IDX') + ref_include += ['REF_IDX'] + if return_alt_indices: + alt_table.reset_index(inplace=True, names='ALT_IDX') + + merged_table = ref_table[ref_include].merge(alt_table, how=how, on=on) if drop_duplicates: merged_table.drop_duplicates(inplace=True, subset=on) @@ -70,7 +133,10 @@ def merge_snp_tables(ref_table, flip = np.all(merged_table[['A2_x', 'A1_x']].values == merged_table[['A1_y', 'A2_y']].values, axis=1) # Variants to keep: - keep_snps = matching_allele | flip + if correct_flips: + keep_snps = matching_allele | flip + else: + keep_snps = matching_allele # Keep only SNPs with matching alleles or SNPs with flipped alleles: merged_table = merged_table.loc[keep_snps, ] @@ -292,6 +358,45 @@ def tree_to_rho(tree, min_corr): return tree.root.branch_length + get_shared_distance_matrix(tree) +def quantize(floats, int_dtype=np.int8): + """ + Quantize floating point numbers to the specified integer type. + NOTE: Assumes that the floats are in the range [-1, 1]. + :param floats: A numpy array of floats + :param int_dtype: The integer type to quantize to. + """ + + # Infer the boundaries from the integer type + info = np.iinfo(int_dtype) + + # Compute the scale and zero point + # NOTE: We add 1 to the info.min here to force the zero point to be exactly at 0. + # See discussions on Scale Quantization Mapping. + scale = 2. / (info.max - (info.min + 1)) + + # Quantize the floats to int + return np.clip((floats / scale).round(), info.min, info.max).astype(int_dtype) + + +def dequantize(ints, float_dtype=np.float32): + """ + Dequantize integers to the specified floating point type. + NOTE: Assumes original floats are in the range [-1, 1]. + :param ints: A numpy array of integers + :param float_dtype: The floating point type to dequantize to. + """ + + # Infer the boundaries from the integer type + info = np.iinfo(ints.dtype) + + # Compute the scale and zero point + # NOTE: We add 1 to the info.min here to force the zero point to be exactly at 0. + # See discussions on Scale Quantization Mapping. + scale = 2. / (info.max - (info.min + 1)) + + return ints.astype(float_dtype) * scale + + def multinomial_rvs(n, p): """ Copied from Warren Weckesser: diff --git a/magenpy/utils/system_utils.py b/magenpy/utils/system_utils.py index ac20bef..0a1ec4f 100644 --- a/magenpy/utils/system_utils.py +++ b/magenpy/utils/system_utils.py @@ -7,16 +7,32 @@ def available_cpu(): + """ + Get the number of available CPUs on the system. + """ return psutil.cpu_count() - 1 +def get_memory_usage(): + """ + Get the memory usage of the current process in Mega Bytes (MB) + """ + process = psutil.Process(os.getpid()) + mem_info = process.memory_info() + return mem_info.rss / (1024 ** 2) + + def valid_url(path): + """ + Check whether the provided `path` is a valid URL. + :param path: A string with the URL to check. 
+ """ - import requests + import urllib.request try: - r = requests.head(path) - return r.status_code == requests.codes.ok + with urllib.request.urlopen(path) as response: + return response.getcode() == 200 # Check if the response status is OK (HTTP 200) except Exception: return False @@ -25,23 +41,47 @@ def is_cmd_tool(name): """ Check whether `name` is on PATH and marked as executable. From: https://stackoverflow.com/a/34177358 + :param name: A string with the name of the command-line tool. """ from shutil import which return which(name) is not None +def is_path_writable(path): + """ + Check whether the user has write-access to the provided `path`. + This function supports checking for nested directories (i.e., + we iterate upwards until finding a parent directory that currently + exists, and we check the write-access of that directory). + :param path: A string with the path to check. + """ + + # Get the absolute path first: + path = osp.abspath(path) + + while True: + + if osp.exists(path): + return os.access(path, os.W_OK) + else: + path = osp.dirname(path) + if path == '/' or len(path) == 0: + return False + + def makedir(dirs): """ Create directories on the filesystem, recursively. + :param dirs: A string or list of strings with the paths to create. """ if isinstance(dirs, str): dirs = [dirs] - for dir in dirs: + for dir_l in dirs: try: - os.makedirs(dir) + os.makedirs(dir_l) except OSError as e: if e.errno != errno.EEXIST: raise @@ -100,6 +140,7 @@ def run_shell_script(cmd): def delete_temp_files(prefix): """ Delete temporary files with the given `prefix`. + :param prefix: A string with the prefix of the temporary files to delete. """ for f in glob.glob(f"{prefix}*"): try: diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..8aaa5a5 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,102 @@ +site_name: magenpy + +repo_name: magenpy +repo_url: https://github.com/shz9/magenpy + +theme: + name: "material" + icon: + repo: fontawesome/brands/github + features: + - announce.dismiss + - content.action.edit + - content.action.view + - content.code.annotate + - content.code.copy + # - content.code.select + # - content.footnote.tooltips + # - content.tabs.link + - content.tooltips + # - header.autohide + - navigation.expand + - navigation.footer + - navigation.indexes + # - navigation.instant + # - navigation.instant.prefetch + # - navigation.instant.progress + # - navigation.prune + #- navigation.sections + #- navigation.tabs + # - navigation.tabs.sticky + #- navigation.top + - navigation.tracking + - search.highlight + - search.share + - search.suggest + - toc.follow + # - toc.integrate + palette: + - media: "(prefers-color-scheme)" + toggle: + icon: material/link + name: Switch to light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: indigo + accent: indigo + toggle: + icon: material/toggle-switch + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + accent: indigo + toggle: + icon: material/toggle-switch-off + name: Switch to system preference + font: + text: Roboto + code: Roboto Mono + +plugins: + - mkdocstrings: + handlers: + python: + paths: [magenpy] # search packages in the src folder + options: + docstring_style: sphinx + - search + - autorefs + + +markdown_extensions: + - admonition + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - toc: + permalink: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences + - 
pymdownx.magiclink: + normalize_issue_symbols: true + repo_url_shorthand: true + user: shz9 + repo: magenpy + +nav: + - "Home": index.md + - "Installation": installation.md + - "Getting Started": getting_started.md + - "Features and Configurations": features.md + - "Tutorials": tutorials/overview.md + - "Command Line Scripts": + - "Overview": commandline/overview.md + - "magenpy_ld": commandline/magenpy_ld.md + - "magenpy_simulate": commandline/magenpy_simulate.md + - "Report issues/bugs": "https://github.com/shz9/magenpy/issues" + - "FAQ": faq.md + - "Citation": citation.md + - "API Reference": api/overview.md \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 4e03326..a15f1e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,3 +7,9 @@ requires = [ "oldest-supported-numpy" ] build-backend = "setuptools.build_meta" + +[tool.cibuildwheel] +test-extras = "test" +test-command = "pytest {project}/tests" +# Optional +build-verbosity = 1 diff --git a/requirements-docs.txt b/requirements-docs.txt new file mode 100644 index 0000000..1bcefd0 --- /dev/null +++ b/requirements-docs.txt @@ -0,0 +1,4 @@ +mkdocs +mkdocstrings-python +mkdocs-material +mkdocs-material-extensions diff --git a/requirements-optional.txt b/requirements-optional.txt index 443f69c..b62db07 100644 --- a/requirements-optional.txt +++ b/requirements-optional.txt @@ -1,3 +1,3 @@ -Cython matplotlib seaborn +statsmodels diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..e079f8a --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1 @@ +pytest diff --git a/requirements.txt b/requirements.txt index 5ffc7d2..9de93a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,10 +2,7 @@ dask scipy numpy pandas -pandas-plink==2.2.4 +pandas-plink psutil tqdm zarr -requests -rechunker - diff --git a/setup.py b/setup.py index 2ec9455..fc40d4a 100644 --- a/setup.py +++ b/setup.py @@ -10,9 +10,11 @@ # ------------------------------------------------------ # Cython dependencies: - -# https://cython.readthedocs.io/en/latest/src/userguide/source_files_and_compilation.html#distributing-cython-modules def no_cythonize(extensions, **_ignore): + """ + Copied from: + https://cython.readthedocs.io/en/latest/src/userguide/source_files_and_compilation.html#distributing-cython-modules + """ for extension in extensions: sources = [] for sfile in extension.sources: @@ -32,11 +34,14 @@ def no_cythonize(extensions, **_ignore): Extension("magenpy.stats.ld.c_utils", sources=["magenpy/stats/ld/c_utils.pyx"], include_dirs=[np.get_include()], + define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], ), - Extension("magenpy.LDMatrix", - sources=["magenpy/LDMatrix.pyx"], - include_dirs=[np.get_include()], - ) + # Not ready yet: + # Extension("magenpy.stats.score.score_cpp", + # sources=["magenpy/stats/score/score_cpp.pyx"], + # include_dirs=[np.get_include()], + # language='c++' + # ) ] if cythonize is not None: @@ -64,11 +69,14 @@ def no_cythonize(extensions, **_ignore): with open("requirements-optional.txt") as fp: opt_requires = fp.read().strip().split("\n") +with open("requirements-test.txt") as fp: + test_requires = fp.read().strip().split("\n") + # ------------------------------------------------------ setup( name="magenpy", - version="0.0.12", + version="0.1.0", author="Shadi Zabad", author_email="shadi.zabad@mail.mcgill.ca", description="Modeling and Analysis of Statistical Genetics data in python", @@ -83,17 +91,21 @@ def no_cythonize(extensions, **_ignore): 'Topic :: 
Scientific/Engineering', 'Operating System :: OS Independent', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12' ], package_dir={'': '.'}, packages=find_packages(), + python_requires=">=3.8", package_data={'magenpy': ['data/*.bed', 'data/*.bim', 'data/*.fam', 'data/ukb_height_chr22.fastGWA.gz', 'config/*.ini']}, scripts=['bin/magenpy_ld', 'bin/magenpy_simulate'], install_requires=install_requires, - extras_require={'full': opt_requires}, + extras_require={'opt': opt_requires, 'test': test_requires}, ext_modules=extensions, zip_safe=False ) diff --git a/tests/conda_manual_testing.sh b/tests/conda_manual_testing.sh new file mode 100644 index 0000000..c3f00b4 --- /dev/null +++ b/tests/conda_manual_testing.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# A script to test the package with different Python versions manually using conda +# May be useful for sanity checks before pushing changes to the repository. + +# Usage: +# $ source tests/conda_manual_testing.sh + +# ============================================================================== +# Define Python versions (add more here if needed) +python_versions=("3.8" "3.9" "3.10" "3.11" "3.12") + +# ============================================================================== + +# Loop over Python versions +for version in "${python_versions[@]}" +do + # Create a new conda environment for the Python version + conda create --name "magenpy$version" python="$version" -y + + # Activate the conda environment + conda activate "magenpy$version" + + # Add some of the required dependencies: + conda install -c conda-forge -c anaconda pip wheel compilers -y + + # Check python version: + python --version + + # Install magenpy + make clean + python -m pip install -v -e .[test] + + # List the installed packages: + python -m pip list + + # Run pytest + python -m pytest -v + + magenpy_ld -h + magenpy_simulate -h + + # Deactivate the conda environment + conda deactivate + + # Remove the conda environment + conda env remove --name "magenpy$version" -y +done diff --git a/tests/test_gdl.py b/tests/test_gdl.py new file mode 100644 index 0000000..f018a27 --- /dev/null +++ b/tests/test_gdl.py @@ -0,0 +1,77 @@ +import numpy as np +import magenpy as mgp +import shutil +import pytest + + +@pytest.fixture(scope='module') +def gdl_object(): + """ + Initialize a GWADataLoader using data pre-packaged with magenpy. + Make this data loader available to all tests. + """ + gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(), + sumstats_files=mgp.ukb_height_sumstats_path(), + sumstats_format='fastgwa', + backend='xarray') + + yield gdl + + # Clean up after tests are done: + gdl.cleanup() + shutil.rmtree(gdl.temp_dir) + shutil.rmtree(gdl.output_dir) + + +def test_basic_properties(gdl_object): + """ + Test the basic properties of the GWADataLoader object. 
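+    The expected values below are specific to the 1000G chromosome 22 subset that
+    ships with magenpy (N=378 European samples), harmonized with the attached height
+    summary statistics (15,935 overlapping variants).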
+ """ + + # Check basic shapes parameters: + assert gdl_object is not None + assert len(gdl_object.sample_table) == gdl_object.n == 378 # Sample size + assert gdl_object.m == len(gdl_object.sumstats_table[22]) == 15935 # Number of variants + assert gdl_object.shapes == {22: 15935} # Number of variants per chromosome + assert gdl_object.chromosomes == [22] # List of chromosomes + assert gdl_object.n_annotations is None # Number of annotations + + # Check that the individual data sources have been properly harmonized: + + assert gdl_object.genotype is not None + assert gdl_object.sumstats_table is not None + + assert gdl_object.genotype[22].n_snps == gdl_object.n_snps == gdl_object.sumstats_table[22].n_snps + assert gdl_object.genotype[22].n == gdl_object.n + # Check that the variant IDs are harmonized: + assert np.array_equal(gdl_object.genotype[22].snps, gdl_object.sumstats_table[22].snps) + # Check that the variant positions are harmonized: + assert np.array_equal(gdl_object.genotype[22].bp_pos, gdl_object.sumstats_table[22].bp_pos) + # Check that the alternative alleles are harmonized: + assert np.array_equal(gdl_object.genotype[22].a1, gdl_object.sumstats_table[22].a1) + + +def test_filtering_methods(gdl_object): + """ + Test the filtering methods of the GWADataLoader object. Primarily, + we test the `filter_samples` and `filter_snps` methods to make sure + they are behaving as expected. + """ + + # Filter the samples: + + # First draw a random subset of samples to keep: + np.random.seed(0) + keep_samples = np.random.choice(gdl_object.samples, size=100, replace=False) + # Then apply the filtering method: + gdl_object.filter_samples(keep_samples=keep_samples) + + assert gdl_object.n == gdl_object.genotype[22].n == 100 + + # Filter the SNPs: + + # First draw a random subset of SNPs to keep: + keep_snps = np.random.choice(gdl_object.snps[22], size=3000, replace=False) + gdl_object.filter_snps(extract_snps=keep_snps, chromosome=22) + + assert gdl_object.n_snps == gdl_object.genotype[22].n_snps == 3000 diff --git a/tests/test_ld.py b/tests/test_ld.py new file mode 100644 index 0000000..ea8da98 --- /dev/null +++ b/tests/test_ld.py @@ -0,0 +1,115 @@ +import numpy as np +import magenpy as mgp +import shutil +import pytest + + +@pytest.fixture(scope='module') +def gdl_object(): + """ + Initialize a GWADataLoader using data pre-packaged with magenpy. + Make this data loader available to all tests. + """ + gdl = mgp.GWADataLoader(mgp.tgp_eur_data_path(), + backend='xarray') + + # Extract a smaller subset of variants for testing: + np.random.seed(0) + keep_snps = gdl.snps[22][np.random.choice(2000, 1000, replace=False)] + gdl.filter_snps(extract_snps=keep_snps, chromosome=22) + + yield gdl + + # Clean up after tests are done: + gdl.cleanup() + shutil.rmtree(gdl.temp_dir) + shutil.rmtree(gdl.output_dir) + + +def test_sample_ld_computation(gdl_object): + """ + Test the LD computation functionality according to the Sample estimator + """ + + gdl_object.compute_ld('sample', gdl_object.output_dir) + gdl_object.harmonize_data() + + # Check that the LD matrix has been computed: + assert gdl_object.ld is not None + assert gdl_object.ld[22] is not None + assert gdl_object.ld[22].validate_ld_matrix() + assert gdl_object.ld[22].stored_n_snps == gdl_object.n_snps + assert gdl_object.ld[22].sample_size == gdl_object.sample_size + + assert np.array_equal(gdl_object.ld[22].snps, gdl_object.snps[22]) + + # Add other checks? 
+ + +def test_windowed_ld_computation(gdl_object): + """ + Test the LD computation functionality according to the Windowed estimator + """ + + gdl_object.compute_ld('windowed', + gdl_object.output_dir, + window_size=500, + kb_window_size=100, + cm_window_size=3.) + gdl_object.harmonize_data() + + # Check that the LD matrix has been computed: + assert gdl_object.ld is not None + assert gdl_object.ld[22] is not None + assert gdl_object.ld[22].validate_ld_matrix() + assert gdl_object.ld[22].stored_n_snps == gdl_object.n_snps + assert gdl_object.ld[22].sample_size == gdl_object.sample_size + + assert np.array_equal(gdl_object.ld[22].snps, gdl_object.snps[22]) + + # Add other checks? + + +def test_shrinkage_ld_computation(gdl_object): + """ + Test the LD computation functionality according to the Shrinkage estimator + """ + + gdl_object.compute_ld('shrinkage', + gdl_object.output_dir, + genetic_map_ne=11400, + genetic_map_sample_size=183) + gdl_object.harmonize_data() + + # Check that the LD matrix has been computed: + assert gdl_object.ld is not None + assert gdl_object.ld[22] is not None + assert gdl_object.ld[22].validate_ld_matrix() + assert gdl_object.ld[22].stored_n_snps == gdl_object.n_snps + assert gdl_object.ld[22].sample_size == gdl_object.sample_size + + assert np.array_equal(gdl_object.ld[22].snps, gdl_object.snps[22]) + + # Add other checks? + + +def test_block_ld_computation(gdl_object): + """ + Test the LD computation functionality according to the Block estimator + """ + + ld_block_url = "https://bitbucket.org/nygcresearch/ldetect-data/raw/ac125e47bf7ff3e90be31f278a7b6a61daaba0dc/EUR/fourier_ls-all.bed" + gdl_object.compute_ld('block', gdl_object.output_dir, ld_blocks_file=ld_block_url) + gdl_object.harmonize_data() + + # Check that the LD matrix has been computed: + assert gdl_object.ld is not None + assert gdl_object.ld[22] is not None + assert gdl_object.ld[22].validate_ld_matrix() + assert gdl_object.ld[22].stored_n_snps == gdl_object.n_snps + assert gdl_object.ld[22].sample_size == gdl_object.sample_size + + assert np.array_equal(gdl_object.ld[22].snps, gdl_object.snps[22]) + + # Add other checks? + diff --git a/tests/test_simulation.py b/tests/test_simulation.py new file mode 100644 index 0000000..5a566c0 --- /dev/null +++ b/tests/test_simulation.py @@ -0,0 +1,44 @@ +import magenpy as mgp +import numpy as np +import shutil +import pytest + + +@pytest.fixture(scope='module') +def gsim_object(): + """ + Initialize a GWADataLoader using data pre-packaged with magenpy. + Make this data loader available to all tests. + """ + gsim = mgp.PhenotypeSimulator(mgp.tgp_eur_data_path(), + backend='xarray') + + yield gsim + + # Clean up after tests are done: + gsim.cleanup() + shutil.rmtree(gsim.temp_dir) + shutil.rmtree(gsim.output_dir) + + +def test_simulator(gsim_object): + """ + Test the basic functionality of the phenotype simulator + """ + + gsim_object.simulate() + + assert gsim_object.sample_table is not None + assert gsim_object.sample_table.phenotype is not None + assert gsim_object.sample_table.phenotype_likelihood == 'gaussian' + assert len(gsim_object.sample_table.phenotype) == gsim_object.sample_size + + gsim_object.phenotype_likelihood = 'binomial' + + gsim_object.simulate() + + assert gsim_object.sample_table is not None + assert gsim_object.sample_table.phenotype is not None + assert gsim_object.sample_table.phenotype_likelihood == 'binomial' + assert sorted(np.unique(gsim_object.sample_table.phenotype)) == [0, 1] +
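+
+# To run this module on its own (standard pytest usage, mirroring the
+# invocation in tests/conda_manual_testing.sh):
+#   python -m pytest -v tests/test_simulation.py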