From 44871971ebfc73625c51bfc3a0eaed2e7489ca93 Mon Sep 17 00:00:00 2001 From: Shadi Date: Wed, 24 Apr 2024 19:04:08 -0400 Subject: [PATCH] Release `0.1.1` Minor updates / bug fixes + Adding containers --- CHANGELOG.md | 11 +++ benchmarks/benchmark_e_step.py | 6 +- bin/viprs_evaluate | 2 +- bin/viprs_fit | 1 + bin/viprs_score | 1 + containers/cli.Dockerfile | 43 ++++++++ containers/jupyter.Dockerfile | 53 ++++++++++ docs/getting_started.md | 2 +- docs/installation.md | 20 +++- pyproject.toml | 1 + setup.py | 175 ++++++++++++++++++++++----------- viprs/__init__.py | 2 +- viprs/model/BayesPRSModel.py | 2 +- viprs/model/vi/e_step_cpp.pyx | 2 +- 14 files changed, 256 insertions(+), 65 deletions(-) create mode 100644 containers/cli.Dockerfile create mode 100644 containers/jupyter.Dockerfile diff --git a/CHANGELOG.md b/CHANGELOG.md index 9475878..b62fc3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.1] - 2024-04-24 + +### Changed + +- Fixed bugs in the E-Step benchmarking script. +- Re-wrote the logic for finding BLAS libraries in the `setup.py` script. :crossed_fingers: + +### Added + +- `Dockerfile`s for both `cli` and `jupyter` modes. + ## [0.1.0] - 2024-04-05 A large scale restructuring of the code base to improve efficiency and usability. diff --git a/benchmarks/benchmark_e_step.py b/benchmarks/benchmark_e_step.py index bdf8ebc..67b08dc 100644 --- a/benchmarks/benchmark_e_step.py +++ b/benchmarks/benchmark_e_step.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + """ Benchmark the speed of the E-Step in VIPRS ---------------------------- @@ -214,9 +216,9 @@ def exec_func(): # ------------------------------------------------------------------------ # Create a grid: - grid = HyperparameterGrid() + grid = HyperparameterGrid(n_snps=gdl.n_snps) # Generate a grid for pi using 5 equidistant grid points: - grid.generate_pi_grid(steps=args.grid_size, n_snps=gdl.n_snps) + grid.generate_pi_grid(steps=args.grid_size) # Generate a grid for sigma epsilon using 5 equidistant grid points: grid.generate_sigma_epsilon_grid(steps=args.grid_size) diff --git a/bin/viprs_evaluate b/bin/viprs_evaluate index c5307bc..8245199 100644 --- a/bin/viprs_evaluate +++ b/bin/viprs_evaluate @@ -103,7 +103,7 @@ if args.covariates_file is not None: # Make sure that samples remain after reading both: assert sample_table.n > 0, "No samples found after merging the covariates and phenotype files." -prs_df = pd.read_csv(args.prs_file, sep='\t') +prs_df = pd.read_csv(args.prs_file, sep=r'\s+') # Merge the PRS data with the phenotype data: prs_df = prs_df.merge(sample_table.get_individual_table(), on=['FID', 'IID']) diff --git a/bin/viprs_fit b/bin/viprs_fit index f1ddb1a..bab758f 100644 --- a/bin/viprs_fit +++ b/bin/viprs_fit @@ -30,6 +30,7 @@ Usage: import os.path as osp + def check_args(args): """ Check the validity, consistency, and completeness of the commandline arguments. 
diff --git a/bin/viprs_score b/bin/viprs_score
index e29ecfd..2bc6594 100644
--- a/bin/viprs_score
+++ b/bin/viprs_score
@@ -25,6 +25,7 @@ from magenpy.utils.system_utils import makedir, get_filenames
 from magenpy.GWADataLoader import GWADataLoader
 from viprs.model.BayesPRSModel import BayesPRSModel
 
+
 print(fr"""
         **********************************************
                     _____
diff --git a/containers/cli.Dockerfile b/containers/cli.Dockerfile
new file mode 100644
index 0000000..822ef84
--- /dev/null
+++ b/containers/cli.Dockerfile
@@ -0,0 +1,43 @@
+# Usage:
+# ** Step 1 ** Build the docker image:
+# docker build -f cli.Dockerfile -t viprs-cli .
+# ** Step 2 ** Run the docker container in interactive shell mode:
+# docker run -it viprs-cli /bin/bash
+# ** Step 3 ** Test viprs_fit:
+# viprs_fit -h
+
+FROM python:3.11-slim-buster
+
+LABEL authors="Shadi Zabad"
+LABEL version="0.1"
+LABEL description="Docker image containing all requirements to run the commandline scripts in the VIPRS package"
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    unzip \
+    wget \
+    pkg-config \
+    g++ gcc \
+    libopenblas-dev \
+    libomp-dev
+
+# Download and setup plink2:
+RUN mkdir -p /software && \
+    wget https://s3.amazonaws.com/plink2-assets/alpha5/plink2_linux_avx2_20240105.zip -O /software/plink2.zip && \
+    unzip /software/plink2.zip -d /software && \
+    rm /software/plink2.zip
+
+# Download and setup plink1.9:
+RUN mkdir -p /software && \
+    wget https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20231211.zip -O /software/plink.zip && \
+    unzip /software/plink.zip -d /software && \
+    rm /software/plink.zip
+
+# Add plink1.9 and plink2 to PATH (ENV applies to non-interactive shells too):
+ENV PATH="/software:${PATH}"
+
+# Install viprs package from PyPI
+RUN pip install --upgrade pip viprs
+
+# Test the installation
+RUN viprs_fit -h
diff --git a/containers/jupyter.Dockerfile b/containers/jupyter.Dockerfile
new file mode 100644
index 0000000..cfdabaa
--- /dev/null
+++ b/containers/jupyter.Dockerfile
@@ -0,0 +1,53 @@
+# Usage:
+# ** Step 1 ** Build the docker image:
+# docker build -f jupyter.Dockerfile -t viprs-jupyter .
+# ** Step 2 ** Run the docker container (pass the appropriate port):
+# docker run -p 8888:8888 viprs-jupyter
+# ** Step 3 ** Open the link in your browser:
+# http://localhost:8888
+
+
+FROM python:3.11-slim-buster
+
+LABEL authors="Shadi Zabad"
+LABEL version="0.1"
+LABEL description="Docker image containing all requirements to run the VIPRS package in a Jupyter Notebook"
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    unzip \
+    wget \
+    pkg-config \
+    g++ gcc \
+    libopenblas-dev \
+    libomp-dev
+
+# Download and setup plink2:
+RUN mkdir -p /software && \
+    wget https://s3.amazonaws.com/plink2-assets/alpha5/plink2_linux_avx2_20240105.zip -O /software/plink2.zip && \
+    unzip /software/plink2.zip -d /software && \
+    rm /software/plink2.zip
+
+# Download and setup plink1.9:
+RUN mkdir -p /software && \
+    wget https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20231211.zip -O /software/plink.zip && \
+    unzip /software/plink.zip -d /software && \
+    rm /software/plink.zip
+
+# Add plink1.9 and plink2 to PATH (ENV applies to the Jupyter process too):
+ENV PATH="/software:${PATH}"
+
+# Install viprs package from PyPI
+RUN pip install --upgrade pip viprs jupyterlab
+
+# Expose the port Jupyter Lab will be served on
+EXPOSE 8888
+
+# Set the working directory
+WORKDIR /viprs_dir
+
+# Copy the current directory contents into the container at /viprs_dir
+COPY . /viprs_dir
+
+# Run Jupyter Lab
+CMD ["jupyter", "lab", "--ip=0.0.0.0", "--allow-root", "--NotebookApp.token=''"]
diff --git a/docs/getting_started.md b/docs/getting_started.md
index 0964424..d2de201 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -30,7 +30,7 @@ summary statistics from `fastGWA`:
 
 ```python linenums="1"
 # Load genotype and GWAS summary statistics data (chromosome 22):
 gdl = mgp.GWADataLoader(bed_files=mgp.tgp_eur_data_path(), # Path of the genotype data
-                        sumstats_files=mgp.ukb_height_fastGWA_path(), # Path of the summary statistics
+                        sumstats_files=mgp.ukb_height_sumstats_path(), # Path of the summary statistics
                         sumstats_format="fastGWA") # Specify the format of the summary statistics
 ```
diff --git a/docs/installation.md b/docs/installation.md
index 8176d6d..3487bf0 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -26,7 +26,7 @@ a new environment with the required dependencies as follows:
 
 ```bash
 python_version=3.11 # Change python version here if needed
-conda create --name "viprs_env" -c anaconda -c conda-forge python=$python_version compilers openblas -y
+conda create --name "viprs_env" -c anaconda -c conda-forge python=$python_version compilers pkg-config openblas -y
 conda activate viprs_env
 ```
@@ -68,3 +68,21 @@
 source viprs_env/bin/activate
 python -m pip install --upgrade pip
 python -m pip install viprs>=0.1
+
+### Using `Docker` containers
+
+If you prefer to use `Docker`, you can build an image with the `viprs` package
+and all its dependencies by downloading the relevant `Dockerfile` from the
+[repository](https://github.com/shz9/viprs/tree/master/containers) and building it
+as follows:
+
+```bash
+# Build the docker image:
+docker build -f cli.Dockerfile -t viprs-cli .
+# Run the container in interactive mode:
+docker run -it viprs-cli /bin/bash
+# Test that the package installed successfully:
+viprs_fit -h
+```
+
+We plan to publish pre-built `Docker` images on `Docker Hub` in the future.
diff --git a/pyproject.toml b/pyproject.toml
index da33b0a..a2d1b4c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ requires = [
     "extension-helpers",
     "scipy",
     "oldest-supported-numpy",
+    "pkgconfig"
 ]
 build-backend = "setuptools.build_meta"
diff --git a/setup.py b/setup.py
index 2afa1ec..bcc4ecc 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,7 @@
 from setuptools import setup, Extension, find_packages
 from extension_helpers import add_openmp_flags_if_available
 from extension_helpers._openmp_helpers import check_openmp_support
+import pkgconfig
 import numpy as np
 import warnings
 import os
@@ -16,69 +17,127 @@
 
 # Find and set BLAS-related flags and paths:
-def get_blas_include_dirs():
+def find_blas_libraries():
     """
-    Get the include directories for the BLAS library from numpy build configuration.
-
-    NOTE: np.distutils will be deprecated in future versions of numpy. Find alternative solutions
-    to linking to BLAS libraries. Alternative solutions:
-
-    * Use the `blas_opt` key in the `numpy.__config__.show()` output to get the include directories.
-    * meson builder
-    * ...
+    Find BLAS libraries on the system using pkg-config.
+    This function returns the include directories (compiler flags)
+    and the linker flags needed to build the C/C++/Cython extensions
+    that require BLAS (or whose performance would be enhanced by BLAS).
+
+    We use pkg-config (as encapsulated in the `pkgconfig` Python package)
+    to perform this search. Note that we augment the pkg-config
+    search path with the conda library path (if available) to
+    enable linking against BLAS libraries installed via Conda.
+
+    :return: A dictionary with the following keys:
+        * 'include_dirs': A list of include directories (compiler flags).
+        * 'library_dirs': A list of library directories (linker flags).
+        * 'libraries': A list of libraries to link against.
+        * 'define_macros': A list of macros to define.
+        If no BLAS library is found, the lists are empty and a warning is issued.
     """
-    cblas_lib_path = None
-
-    # Attempt (1): Getting the information from numpy distutils:
-    try:
-        from numpy.distutils.system_info import get_info
-        cblas_lib_path = get_info('blas_opt')['include_dirs']
-    except (AttributeError, KeyError, ImportError, ModuleNotFoundError):
-        pass
-
-    # Attempt (2): For newer versions of numpy, obtain information from np.show_config:
-    if cblas_lib_path is None:
+    # STEP 0: Get the current pkg-config search path:
+    current_pkg_config_path = os.getenv("PKG_CONFIG_PATH", "")
+
+    # STEP 1: Augment the pkg-config search path with
+    # the path of the current Conda environment (if one exists).
+    # This can leverage BLAS libraries installed via Conda.
+
+    conda_path = os.getenv("CONDA_PREFIX")
+
+    if conda_path is not None:
+        conda_pkgconfig_path = os.path.join(conda_path, 'lib/pkgconfig')
+        if os.path.isdir(conda_pkgconfig_path):
+            current_pkg_config_path += ":" + conda_pkgconfig_path
+
+    # STEP 2: Add the updated path to the environment variable:
+    os.environ["PKG_CONFIG_PATH"] = current_pkg_config_path
+
+    # STEP 3: Get all pkg-config packages and filter to
+    # those that have "blas" in the name.
+    blas_packages = [pkg for pkg in pkgconfig.list_all()
+                     if "blas" in pkg]
+
+    # First check: Make sure that compiler flags are defined and a
+    # valid cblas.h header file exists in the include directory:
+    if len(blas_packages) >= 1:
+
+        blas_packages = [pkg for pkg in blas_packages
+                         if pkgconfig.cflags(pkg) and
+                         os.path.isfile(os.path.join(pkgconfig.variables(pkg)['includedir'], 'cblas.h'))]
+
+    # If more than one library remains after the previous
+    # search and filtering steps, apply some heuristics
+    # to select the most relevant one:
+    if len(blas_packages) > 1:
+        # Check if the information about the most relevant library
+        # can be inferred from numpy. Note that this interface from
+        # numpy changes quite often between versions, so it's not
+        # a reliable check. But in case it works on some systems,
+        # we use it to link to the same library as numpy:
         try:
-            cblas_lib_path = [np.show_config(mode='dicts')['Build Dependencies']['blas']['include directory']]
-        except Exception:
+            for pkg in blas_packages:
+                if pkg in np.__config__.get_info('blas_opt')['libraries']:
+                    blas_packages = [pkg]
+                    break
+        except (KeyError, AttributeError):
             pass
 
-    # Attempt (3): Obtain information from conda environment:
-    if cblas_lib_path is None:
-        # If not found, check if the library is present in the
-        # conda environment:
-        conda_path = os.getenv("CONDA_PREFIX")
-        if conda_path is not None:
-            # If the header file exists in the conda environment, use it:
-            if os.path.isfile(os.path.join(conda_path, 'include', 'cblas.h')):
-                cblas_lib_path = [os.path.join(conda_path, 'include')]
-
-    # Attempt (4): Obtain information from environment variable:
-    if cblas_lib_path is None:
-        cblas_lib_path = os.getenv('BLAS_INCLUDE_DIR')
-
-    # If the header file is not found, issue a warning:
-    if (cblas_lib_path is None) or (not os.path.isfile(os.path.join(cblas_lib_path[0], 'cblas.h'))):
-        # Ok, we give up...
+    # If there are still multiple libraries, apply some additional
+    # heuristics (based on name matching) to select the most relevant one:
+    if len(blas_packages) > 1:
+        # Some libraries (e.g. flexiblas) are published with support for 64bit
+        # and they expose packages for their non-BLAS API (with the _api suffix).
+        # Ignore those variants here:
+
+        idx_to_remove = set()
+
+        for pkg1 in blas_packages:
+            if pkg1 != 'blas':
+                for i, pkg2 in enumerate(blas_packages):
+                    if pkg1 != pkg2 and pkg1 in pkg2:
+                        idx_to_remove.add(i)
+
+        blas_packages = [pkg for i, pkg in enumerate(blas_packages) if i not in idx_to_remove]
+
+    # After applying all the heuristics, select the first of the
+    # remaining libraries. This is not ideal; down the line, we may
+    # want to follow the same BLAS preference order as numpy.
+    if len(blas_packages) >= 1:
+        final_blas_pkg = blas_packages[0]
+    else:
+        final_blas_pkg = None
+
+    # STEP 4: If a relevant BLAS package was found, extract the flags
+    # needed for building the Cython/C/C++ extensions:
+
+    if final_blas_pkg is not None:
+        blas_info = pkgconfig.parse(final_blas_pkg)
+        blas_info['define_macros'] = [('HAVE_CBLAS', None)]
+    else:
+        blas_info = {
+            'include_dirs': [],
+            'library_dirs': [],
+            'libraries': [],
+            'define_macros': [],
+        }
         warnings.warn("""
-        ******************** WARNING ********************
+        ********************* WARNING *********************
         BLAS library header files not found on your system.
-        This may slow down some computations. If the
-        library is present on your system, please link to
-        it explicitly by setting the BLAS_INCLUDE_DIR
-        environment variable prior to installation.
-        """)
+        This may slow down some computations. If you are
+        using conda, we recommend installing BLAS libraries
+        beforehand.
+ ********************* WARNING ********************* + """, stacklevel=2) - cblas_lib_path = [] + return blas_info - # Define macros based on whether CBLAS header exists - macros = [('HAVE_CBLAS', None)] if len(cblas_lib_path) > 0 else [] - return len(cblas_lib_path) > 0, cblas_lib_path, macros - - -blas_found, blas_include, blas_macros = get_blas_include_dirs() +blas_flags = find_blas_libraries() # ------------------------------------------------------ # Build cython extensions: @@ -120,10 +179,11 @@ def no_cythonize(cy_extensions, **_ignore): Extension("viprs.model.vi.e_step_cpp", ["viprs/model/vi/e_step_cpp.pyx"], language="c++", - libraries=[[], ["cblas"]][blas_found], - include_dirs=[np.get_include()] + blas_include, - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")] + blas_macros, - extra_compile_args=["-O3"]) + libraries=blas_flags['libraries'], + include_dirs=[np.get_include()] + blas_flags['include_dirs'], + library_dirs=blas_flags['library_dirs'], + define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")] + blas_flags['define_macros'], + extra_compile_args=["-O3", "-std=c++17"]) ] if check_openmp_support(): @@ -140,6 +200,7 @@ def no_cythonize(cy_extensions, **_ignore): means that some computations may be slower than expected. It will preclude using multithreading in the coordinate ascent optimization algorithm. + ******************** WARNING ******************** """) @@ -178,7 +239,7 @@ def no_cythonize(cy_extensions, **_ignore): setup( name="viprs", - version="0.1.0", + version="0.1.1", author="Shadi Zabad", author_email="shadi.zabad@mail.mcgill.ca", description="Variational Inference of Polygenic Risk Scores (VIPRS)", diff --git a/viprs/__init__.py b/viprs/__init__.py index 2bbf187..103030a 100644 --- a/viprs/__init__.py +++ b/viprs/__init__.py @@ -5,5 +5,5 @@ from .model.gridsearch.HyperparameterGrid import HyperparameterGrid from .utils.data_utils import * -__version__ = '0.1.0' +__version__ = '0.1.1' __release_date__ = 'April 2024' diff --git a/viprs/model/BayesPRSModel.py b/viprs/model/BayesPRSModel.py index 8d4c29c..32777bd 100644 --- a/viprs/model/BayesPRSModel.py +++ b/viprs/model/BayesPRSModel.py @@ -305,7 +305,7 @@ def set_model_parameters(self, parameter_table): self.pip, self.post_mean_beta, self.post_var_beta = self.harmonize_data(parameter_table=parameter_table) - def read_inferred_parameters(self, f_names, sep="\t"): + def read_inferred_parameters(self, f_names, sep=r"\s+"): """ Read a file with the inferred parameters. :param f_names: A path (or list of paths) to the file with the effect sizes. diff --git a/viprs/model/vi/e_step_cpp.pyx b/viprs/model/vi/e_step_cpp.pyx index f9268d1..0557a48 100644 --- a/viprs/model/vi/e_step_cpp.pyx +++ b/viprs/model/vi/e_step_cpp.pyx @@ -65,7 +65,7 @@ cdef extern from "e_step.hpp" nogil: T* u_logs, T* half_var_tau, T* mu_mult, - T dq_scale, + T dq_scale, int threads, bint use_blas, bint low_memory) noexcept nogil
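
Note: as a quick sanity check of the new pkg-config-based discovery in
`setup.py`, the same lookup can be exercised outside of the build. Below is a
minimal, standalone sketch (not part of the diff above), assuming the
`pkgconfig` Python package and the `pkg-config` binary are installed; the
package names it reports will vary from system to system:

    # probe_blas.py: a standalone sketch mirroring the pkg-config-based
    # BLAS discovery added to setup.py in this release.
    import pkgconfig

    # List every package known to pkg-config and keep BLAS-like candidates:
    blas_pkgs = [pkg for pkg in pkgconfig.list_all() if "blas" in pkg]
    print("BLAS candidates:", blas_pkgs)

    if blas_pkgs:
        # pkgconfig.parse() returns a dict with 'include_dirs',
        # 'library_dirs', 'libraries', and 'define_macros' entries that
        # can be passed directly to setuptools.Extension:
        flags = pkgconfig.parse(blas_pkgs[0])
        print("Include dirs:", flags["include_dirs"])
        print("Link libraries:", flags["libraries"])

On a machine with OpenBLAS installed through conda (as recommended in
docs/installation.md), this would typically report an `openblas` candidate and
point the extension at the environment's include and lib directories.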