Merge branch 'main' into pytorch-shuffle-multiple-chunks

chanzuckerberg · Jun 3, 2024 · 5a22e2f · 5a22e2f
2 parents b07de68 + c18c1a9
commit 5a22e2f
Show file tree

Hide file tree

Showing 44 changed files with 1,117 additions and 796 deletions.
diff --git a/.github/workflows/docsite-build-deploy.yml b/.github/workflows/docsite-build-deploy.yml
@@ -35,6 +35,7 @@ jobs:
 
       - name: Install misc deps
         run: |
+          sudo apt-get update
           sudo apt-get install -y libcairo2-dev rsync
 
       - name: Build Sphinx website

diff --git a/.github/workflows/lts-compat-check.yml b/.github/workflows/lts-compat-check.yml
@@ -17,16 +17,14 @@ jobs:
         census-build-version:  # Add additional LTS releases as they occur
           - "latest"
           - "stable"
+          - "2023-12-15"
           - "2023-07-25"
           - "2023-05-15"
         py-pkg-version:
-          - "~=1.0.0"
-          - "~=1.1.0"
-          - "~=1.2.0"
-          - "~=1.3.0"
-          - "~=1.4.0"
-          - "~=1.5.0"
-          - "~=1.6.0"
+          - "~=1.10.0"
+          - "~=1.11.0"
+          - "~=1.12.0"
+          - "~=1.13.0"
           - "head-of-main"
 
     runs-on: ${{matrix.os}}
@@ -39,17 +37,17 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
-      - name: Install dependencies (including experimental)
+      - name: Install dependencies
         run: |
           python -m pip install -U pip setuptools wheel
-          pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
+          GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
 
-          if [ {{matrix.cellxgene-census-version}} != "head-of-main" ]; then
-            pip install -e ./api/python/cellxgene_census/[experimental]
+          # The matrix value must use the Actions expression syntax (dollar sign before the
+          # double braces); a bare double-brace form reaches the shell as literal text, so the
+          # comparison is never true and head-of-main would fall through to the PyPI branch.
+          if [ "${{ matrix.py-pkg-version }}" = "head-of-main" ]; then
+            pip install -e ./api/python/cellxgene_census/
           else
-            pip install -U cellxgene_census[experimental]==${{ matrix.py-pkg-version }}
+            pip install -U cellxgene_census${{ matrix.py-pkg-version }}
           fi
 
       - name: Test with pytest (API, main tests)
         run: |
-          PYTHONPATH=. pytest -v -rP -m lts_compat_check ./api/python/cellxgene_census/tests/ --census_version ${{ matrix.census-build-version }}
+          PYTHONPATH=. pytest -v -rP -m lts_compat_check ./api/python/cellxgene_census/tests/test_lts_compat.py --census_version ${{ matrix.census-build-version }}
diff --git a/.github/workflows/py-dependency-check.yml b/.github/workflows/py-dependency-check.yml
@@ -23,6 +23,9 @@ jobs:
       matrix:
         os: [single-cell-8c64g-runner, macos-latest]
         python-version: ["3.8", "3.9", "3.10", "3.11"]
+        exclude:
+          - os: macos-latest
+            python-version: "3.8"
 
     runs-on: ${{matrix.os}}
 
@@ -60,7 +63,7 @@ jobs:
         run: |
           python -m pip install -U pip setuptools wheel
           pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
-          pip install -U -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
+          GIT_CLONE_PROTECTION_ACTIVE=false pip install -U -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
           pip install -U cellxgene-census[experimental]
 
           # dump pip config for logs

diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml
@@ -7,12 +7,21 @@ on:
   push:
     branches: [main]
 
+# If a new commit is pushed, cancel the jobs from previous commits.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   unit_tests_python_api:
     strategy:
+      fail-fast: false  # Don't stop the workflow if one of the jobs fails
       matrix:
         os: [single-cell-8c64g-runner, macos-latest]
         python-version: ["3.8", "3.9", "3.10", "3.11"]
+        exclude:
+          - os: macos-latest
+            python-version: "3.8"
 
     runs-on: ${{matrix.os}}
 
@@ -30,8 +39,10 @@ jobs:
         run: |
           python -m pip install -U pip setuptools wheel
           pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
-          pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
+          GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
           pip install -e './api/python/cellxgene_census/[experimental]'
+      - name: Report Dependency Versions
+        run: pip list
       - name: Test with pytest (API, main tests)
         run: |
           PYTHONPATH=. coverage run --parallel-mode -m pytest -v -rP --durations=20 ./api/python/cellxgene_census/tests/

diff --git a/.github/workflows/r-dependency-check.yml b/.github/workflows/r-dependency-check.yml
@@ -27,10 +27,10 @@ jobs:
       - name: install packages (macOS)
         if: matrix.os == 'macos-latest'
         run: Rscript -e 'install.packages(c("igraph"), type="binary")'
-      - name: install cellxgene.census and dependencies (Linux)
+      - name: install cellxgene.census and dependencies
         # This should follow our user-facing instructions to install cellxgene.census.
         run: |
-            Rscript -e 'install.packages(c("cellxgene.census", "Seurat", "BiocManager", "testthat"), repos=c("https://chanzuckerberg.r-universe.dev", "https://cloud.r-project.org"), type="source")'
+            Rscript -e 'install.packages(c("cellxgene.census", "Seurat", "BiocManager", "testthat"), repos=c("https://chanzuckerberg.r-universe.dev", "https://cloud.r-project.org"))'
             Rscript -e 'BiocManager::install("SingleCellExperiment")'
       - name: run unit tests
         # [re-]fetch the cellxgene.census source package which includes the unit test code to run

diff --git a/.gitignore b/.gitignore
@@ -73,6 +73,7 @@ instance/
 
 # Sphinx documentation
 docs/_build/
+_autosummary
 
 # PyBuilder
 target/

diff --git a/api/python/cellxgene_census/README.md b/api/python/cellxgene_census/README.md
@@ -7,3 +7,58 @@ The `cellxgene_census` package provides an API to facilitate the use of the CZ C
 For more help, please file an issue on the repo, or contact us at <[email protected]>.
 
 If you believe you have found a security issue, we would appreciate notification. Please send email to <[email protected]>.
+
+## Development Environment Setup
+
+- Create a virtual environment using `venv` or `conda`
+- `cd` to the root of this repository
+- `pip install -e api/python/cellxgene_census`
+- To install dependencies needed to work on the [experimental](./src/cellxgene_census/experimental/) portion of the API:
+  `pip install -e 'api/python/cellxgene_census[experimental]'`.
+- `pip install jupyterlab`
+- **Test it!** Either open up a new `jupyter` notebook or the `python` interpreter and run this code:
+
+```python
+import cellxgene_census
+
+with cellxgene_census.open_soma() as census:
+
+    # Reads SOMADataFrame as a slice
+    cell_metadata = census["census_data"]["homo_sapiens"].obs.read(
+        value_filter = "sex == 'female' and cell_type in ['microglial cell', 'neuron']",
+        column_names = ["assay", "cell_type", "tissue", "tissue_general", "suspension_type", "disease"]
+    )
+
+    # Concatenates results to pyarrow.Table
+    cell_metadata = cell_metadata.concat()
+
+    # Converts to pandas.DataFrame
+    cell_metadata = cell_metadata.to_pandas()
+
+    print(cell_metadata)
+```
+
+The output is a `pandas.DataFrame` with over 600K cells meeting our query criteria and the selected columns:
+
+```python
+
+The "stable" release is currently 2023-12-15. Specify 'census_version="2023-12-15"' in future calls to open_soma() to ensure data consistency.
+
+                assay        cell_type                 tissue tissue_general suspension_type disease     sex
+0        Smart-seq v4  microglial cell  middle temporal gyrus          brain         nucleus  normal  female
+1        Smart-seq v4  microglial cell  middle temporal gyrus          brain         nucleus  normal  female
+2        Smart-seq v4  microglial cell  middle temporal gyrus          brain         nucleus  normal  female
+3        Smart-seq v4  microglial cell  middle temporal gyrus          brain         nucleus  normal  female
+4        Smart-seq v4  microglial cell  middle temporal gyrus          brain         nucleus  normal  female
+...               ...              ...                    ...            ...             ...     ...     ...
+607636  microwell-seq           neuron          adrenal gland  adrenal gland            cell  normal  female
+607637  microwell-seq           neuron          adrenal gland  adrenal gland            cell  normal  female
+607638  microwell-seq           neuron          adrenal gland  adrenal gland            cell  normal  female
+607639  microwell-seq           neuron          adrenal gland  adrenal gland            cell  normal  female
+607640  microwell-seq           neuron          adrenal gland  adrenal gland            cell  normal  female
+
+[607641 rows x 7 columns]
+
+```
+
+- Learn more about the Census API by going through the tutorials in the [notebooks](../notebooks/)
diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml
@@ -31,17 +31,17 @@ dependencies= [
     # NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to
     # ensure that the assets are readable (tiledbsoma supports backward compatible reading).
     # Make sure this version does not fall behind the builder's tiledbsoma version.
-    "tiledbsoma~=1.9.1",
+    "tiledbsoma==1.11.3",
     "anndata",
-    "numpy>=1.21",
+    "numpy>=1.21,<2.0",
     "requests",
     "typing_extensions",
     "s3fs>=2021.06.1",
 ]
 
 [project.optional-dependencies]
 experimental = [
-    "torch~=2.0",
+    "torch~=2.2.0",
     "torchdata~=0.7",
     "scikit-learn~=1.0",
     "scikit-misc>=0.2",  # scikit-misc 0.3 dropped Python 3.8 support

diff --git a/api/python/cellxgene_census/src/cellxgene_census/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/__init__.py
@@ -21,7 +21,7 @@
 
 from importlib import metadata
 
-from ._get_anndata import get_anndata
+from ._get_anndata import get_anndata, get_obs, get_var
 from ._open import (
     download_source_h5ad,
     get_default_soma_context,
@@ -44,6 +44,8 @@
 __all__ = [
     "download_source_h5ad",
     "get_anndata",
+    "get_obs",
+    "get_var",
     "get_census_version_description",
     "get_census_version_directory",
     "get_census_mirror_directory",

diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
@@ -7,9 +7,10 @@
 Methods to retrieve slices of the census as AnnData objects.
 """
 
-from typing import Optional, Sequence
+from typing import Literal, Optional, Sequence
 
 import anndata
+import pandas as pd
 import tiledbsoma as soma
 from somacore.options import SparseDFCoord
 
@@ -146,3 +147,90 @@ def get_anndata(
                     adata.varm[emb] = embedding
 
         return adata
+
+
+def _get_axis_metadata(
+    census: soma.Collection,
+    axis: Literal["obs", "var"],
+    organism: str,
+    *,
+    value_filter: Optional[str] = None,
+    coords: Optional[SparseDFCoord] = slice(None),
+    column_names: Optional[Sequence[str]] = None,
+) -> pd.DataFrame:
+    """Read ``obs`` or ``var`` metadata for one organism into a :class:`pandas.DataFrame`.
+
+    Shared implementation behind :func:`get_obs` and :func:`get_var`.
+
+    Args:
+        census:
+            The census collection to read from.
+        axis:
+            Which dataframe to read: ``"obs"`` reads the experiment's ``obs``,
+            ``"var"`` reads the ``var`` of the ``"RNA"`` measurement.
+        organism:
+            Passed to ``_get_experiment`` to select the experiment.
+        value_filter:
+            Forwarded unchanged to the dataframe ``read`` call.
+        coords:
+            Coordinates for the selected axis. ``None`` is treated as ``slice(None)``.
+        column_names:
+            Forwarded unchanged to the dataframe ``read`` call.
+
+    Returns:
+        The concatenated read result converted to a :class:`pandas.DataFrame`.
+
+    Raises:
+        ValueError: If ``axis`` is neither ``"obs"`` nor ``"var"``.
+    """
+    exp = _get_experiment(census, organism)
+    # ``read`` is given a tuple of per-dimension coordinates; wrap the single axis selection.
+    coords = (slice(None),) if coords is None else (coords,)
+    if axis == "obs":
+        df = exp.obs
+    elif axis == "var":
+        df = exp.ms["RNA"].var
+    else:
+        raise ValueError(f"axis should be either 'obs' or 'var', but '{axis}' was passed")
+    # Materialize the whole read: concatenate the results, then convert to pandas.
+    result: pd.DataFrame = (
+        df.read(coords=coords, column_names=column_names, value_filter=value_filter).concat().to_pandas()
+    )
+    return result
+
+
+def get_obs(
+    census: soma.Collection,
+    organism: str,
+    *,
+    value_filter: Optional[str] = None,
+    coords: Optional[SparseDFCoord] = slice(None),
+    column_names: Optional[Sequence[str]] = None,
+) -> pd.DataFrame:
+    """Get the observation metadata for a query on the census.
+
+    Args:
+        census:
+            The census object, usually returned by :func:`open_soma`.
+        organism:
+            The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``.
+        value_filter:
+            Value filter for the ``obs`` metadata. Value is a filter query written in the
+            SOMA ``value_filter`` syntax.
+        coords:
+            Coordinates for the ``obs`` axis, which is indexed by the ``soma_joinid`` value.
+            May be an ``int``, a list of ``int``, or a slice. The default, ``slice(None)``,
+            selects all; ``None`` is treated the same way.
+        column_names:
+            Columns to fetch.
+
+    Returns:
+        A :class:`pandas.DataFrame` object containing metadata for the queried slice.
+    """
+    return _get_axis_metadata(
+        census, "obs", organism, value_filter=value_filter, coords=coords, column_names=column_names
+    )
+
+
+def get_var(
+    census: soma.Collection,
+    organism: str,
+    *,
+    value_filter: Optional[str] = None,
+    coords: Optional[SparseDFCoord] = slice(None),
+    column_names: Optional[Sequence[str]] = None,
+) -> pd.DataFrame:
+    """Get the variable metadata for a query on the census.
+
+    Args:
+        census:
+            The census object, usually returned by :func:`open_soma`.
+        organism:
+            The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``.
+        value_filter:
+            Value filter for the ``var`` metadata. Value is a filter query written in the
+            SOMA ``value_filter`` syntax.
+        coords:
+            Coordinates for the ``var`` axis, which is indexed by the ``soma_joinid`` value.
+            May be an ``int``, a list of ``int``, or a slice. The default, ``slice(None)``,
+            selects all; ``None`` is treated the same way.
+        column_names:
+            Columns to fetch.
+
+    Returns:
+        A :class:`pandas.DataFrame` object containing metadata for the queried slice.
+    """
+    return _get_axis_metadata(
+        census, "var", organism, value_filter=value_filter, coords=coords, column_names=column_names
+    )
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_open.py b/api/python/cellxgene_census/src/cellxgene_census/_open.py
@@ -14,6 +14,7 @@
 
 import s3fs
 import tiledbsoma as soma
+from fsspec.callbacks import NoOpCallback, TqdmCallback
 
 from ._release_directory import (
     CensusLocator,
@@ -297,7 +298,9 @@ def get_source_h5ad_uri(dataset_id: str, *, census_version: str = DEFAULT_CENSUS
     return locator
 
 
-def download_source_h5ad(dataset_id: str, to_path: str, *, census_version: str = DEFAULT_CENSUS_VERSION) -> None:
+def download_source_h5ad(
+    dataset_id: str, to_path: str, *, census_version: str = DEFAULT_CENSUS_VERSION, progress_bar: bool = True
+) -> None:
     """Download the source H5AD dataset, for the given `dataset_id`, to the user-specified
     file name.
 
@@ -308,6 +311,8 @@ def download_source_h5ad(dataset_id: str, to_path: str, *, census_version: str =
             The file name where the downloaded H5AD will be written. Must not already exist.
         census_version:
             The census version name. Defaults to ``"stable"``.
+        progress_bar:
+            Whether to display a progress bar. Defaults to ``True``.
 
     Raises:
         ValueError: if the path already exists (i.e., will not overwrite an existing file), or is not a file.
@@ -326,6 +331,13 @@ def download_source_h5ad(dataset_id: str, to_path: str, *, census_version: str =
     if to_path.endswith("/"):
         raise ValueError("Specify to_path as a file name, not a directory name.")
 
+    if progress_bar:
+        callback = TqdmCallback(
+            tqdm_kwargs={"unit": "B", "unit_scale": True, "unit_divisor": 1024, "desc": "Downloading"}
+        )
+    else:
+        callback = NoOpCallback()
+
     locator = get_source_h5ad_uri(dataset_id, census_version=census_version)
     protocol = urllib.parse.urlparse(locator["uri"]).scheme
     assert protocol == "s3"
@@ -334,4 +346,8 @@ def download_source_h5ad(dataset_id: str, to_path: str, *, census_version: str =
         anon=True,
         cache_regions=True,
     )
-    fs.get_file(locator["uri"], to_path)
+    fs.get_file(
+        locator["uri"],
+        to_path,
+        callback=callback,
+    )
diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py
@@ -89,7 +89,7 @@ def _highly_variable_genes_seurat_v3(
         n_batches = len(batch_index.cat.categories)
         n_samples = batch_index.value_counts().loc[batch_index.cat.categories.to_numpy()].to_numpy()
         if n_batches > 1:
-            batch_indexer = soma.IntIndexer(batch_index.index.to_numpy(), context=query.experiment.context).get_indexer
+            batch_indexer = batch_index.index.get_indexer
             batch_codes = batch_index.cat.codes.to_numpy().astype(np.int64)
     else:
         n_batches = 1