Skip to content

Commit

Permalink
Merge branch 'main' into pytorch-shuffle-multiple-chunks
Browse files Browse the repository at this point in the history
  • Loading branch information
ebezzi committed Jun 3, 2024
2 parents b07de68 + c18c1a9 commit 5a22e2f
Show file tree
Hide file tree
Showing 44 changed files with 1,117 additions and 796 deletions.
1 change: 1 addition & 0 deletions .github/workflows/docsite-build-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ jobs:
- name: Install misc deps
run: |
sudo apt-get update
sudo apt-get install -y libcairo2-dev rsync
- name: Build Sphinx website
Expand Down
24 changes: 11 additions & 13 deletions .github/workflows/lts-compat-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,14 @@ jobs:
census-build-version: # Add additional LTS releases as they occur
- "latest"
- "stable"
- "2023-12-15"
- "2023-07-25"
- "2023-05-15"
py-pkg-version:
- "~=1.0.0"
- "~=1.1.0"
- "~=1.2.0"
- "~=1.3.0"
- "~=1.4.0"
- "~=1.5.0"
- "~=1.6.0"
- "~=1.10.0"
- "~=1.11.0"
- "~=1.12.0"
- "~=1.13.0"
- "head-of-main"

runs-on: ${{matrix.os}}
Expand All @@ -39,17 +37,17 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies (including experimental)
- name: Install dependencies
run: |
python -m pip install -U pip setuptools wheel
pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
if [ {{matrix.cellxgene-census-version}} != "head-of-main" ]; then
pip install -e ./api/python/cellxgene_census/[experimental]
# GitHub Actions only substitutes `${{ ... }}`; a bare `{{ ... }}` reaches the shell as literal text.
if [ "${{ matrix.py-pkg-version }}" = "head-of-main" ]; then
pip install -e ./api/python/cellxgene_census/
else
pip install -U cellxgene_census[experimental]==${{ matrix.py-pkg-version }}
pip install -U cellxgene_census${{ matrix.py-pkg-version }}
fi
- name: Test with pytest (API, main tests)
run: |
PYTHONPATH=. pytest -v -rP -m lts_compat_check ./api/python/cellxgene_census/tests/ --census_version ${{ matrix.census-build-version }}
PYTHONPATH=. pytest -v -rP -m lts_compat_check ./api/python/cellxgene_census/tests/test_lts_compat.py --census_version ${{ matrix.census-build-version }}
5 changes: 4 additions & 1 deletion .github/workflows/py-dependency-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ jobs:
matrix:
os: [single-cell-8c64g-runner, macos-latest]
python-version: ["3.8", "3.9", "3.10", "3.11"]
exclude:
- os: macos-latest
python-version: "3.8"

runs-on: ${{matrix.os}}

Expand Down Expand Up @@ -60,7 +63,7 @@ jobs:
run: |
python -m pip install -U pip setuptools wheel
pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
pip install -U -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
GIT_CLONE_PROTECTION_ACTIVE=false pip install -U -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
pip install -U cellxgene-census[experimental]
# dump pip config for logs
Expand Down
13 changes: 12 additions & 1 deletion .github/workflows/py-unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,21 @@ on:
push:
branches: [main]

# If a new commit is pushed, cancel the jobs from previous commits.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
unit_tests_python_api:
strategy:
fail-fast: false # Don't stop the workflow if one of the jobs fails
matrix:
os: [single-cell-8c64g-runner, macos-latest]
python-version: ["3.8", "3.9", "3.10", "3.11"]
exclude:
- os: macos-latest
python-version: "3.8"

runs-on: ${{matrix.os}}

Expand All @@ -30,8 +39,10 @@ jobs:
run: |
python -m pip install -U pip setuptools wheel
pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
pip install -e './api/python/cellxgene_census/[experimental]'
- name: Report Dependency Versions
run: pip list
- name: Test with pytest (API, main tests)
run: |
PYTHONPATH=. coverage run --parallel-mode -m pytest -v -rP --durations=20 ./api/python/cellxgene_census/tests/
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/r-dependency-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ jobs:
- name: install packages (macOS)
if: matrix.os == 'macos-latest'
run: Rscript -e 'install.packages(c("igraph"), type="binary")'
- name: install cellxgene.census and dependencies (Linux)
- name: install cellxgene.census and dependencies
# This should follow our user-facing instructions to install cellxgene.census.
run: |
Rscript -e 'install.packages(c("cellxgene.census", "Seurat", "BiocManager", "testthat"), repos=c("https://chanzuckerberg.r-universe.dev", "https://cloud.r-project.org"), type="source")'
Rscript -e 'install.packages(c("cellxgene.census", "Seurat", "BiocManager", "testthat"), repos=c("https://chanzuckerberg.r-universe.dev", "https://cloud.r-project.org"))'
Rscript -e 'BiocManager::install("SingleCellExperiment")'
- name: run unit tests
# [re-]fetch the cellxgene.census source package which includes the unit test code to run
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ instance/

# Sphinx documentation
docs/_build/
_autosummary

# PyBuilder
target/
Expand Down
55 changes: 55 additions & 0 deletions api/python/cellxgene_census/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,58 @@ The `cellxgene_census` package provides an API to facilitate the use of the CZ C
For more help, please file an issue on the repo, or contact us at <[email protected]>.

If you believe you have found a security issue, we would appreciate notification. Please send email to <[email protected]>.

## Development Environment Setup

- Create a virtual environment using `venv` or `conda`
- `cd` to the root of this repository
- `pip install -e api/python/cellxgene_census`
- To install dependencies needed to work on the [experimental](./src/cellxgene_census/experimental/) portion of the API:
`pip install -e 'api/python/cellxgene_census[experimental]'`.
- `pip install jupyterlab`
- **Test it!** Either open up a new `jupyter` notebook or the `python` interpreter and run this code:

```python
import cellxgene_census

with cellxgene_census.open_soma() as census:

# Reads SOMADataFrame as a slice
cell_metadata = census["census_data"]["homo_sapiens"].obs.read(
value_filter = "sex == 'female' and cell_type in ['microglial cell', 'neuron']",
column_names = ["assay", "cell_type", "tissue", "tissue_general", "suspension_type", "disease"]
)

# Concatenates results to pyarrow.Table
cell_metadata = cell_metadata.concat()

# Converts to pandas.DataFrame
cell_metadata = cell_metadata.to_pandas()

print(cell_metadata)
```

The output is a `pandas.DataFrame` with over 600K cells meeting our query criteria and the selected columns:

```python

The "stable" release is currently 2023-12-15. Specify 'census_version="2023-12-15"' in future calls to open_soma() to ensure data consistency.

assay cell_type tissue tissue_general suspension_type disease sex
0 Smart-seq v4 microglial cell middle temporal gyrus brain nucleus normal female
1 Smart-seq v4 microglial cell middle temporal gyrus brain nucleus normal female
2 Smart-seq v4 microglial cell middle temporal gyrus brain nucleus normal female
3 Smart-seq v4 microglial cell middle temporal gyrus brain nucleus normal female
4 Smart-seq v4 microglial cell middle temporal gyrus brain nucleus normal female
... ... ... ... ... ... ... ...
607636 microwell-seq neuron adrenal gland adrenal gland cell normal female
607637 microwell-seq neuron adrenal gland adrenal gland cell normal female
607638 microwell-seq neuron adrenal gland adrenal gland cell normal female
607639 microwell-seq neuron adrenal gland adrenal gland cell normal female
607640 microwell-seq neuron adrenal gland adrenal gland cell normal female

[607641 rows x 7 columns]

```

- Learn more about the Census API by going through the tutorials in the [notebooks](../notebooks/)
6 changes: 3 additions & 3 deletions api/python/cellxgene_census/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,17 @@ dependencies= [
# NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to
# ensure that the assets are readable (tiledbsoma supports backward compatible reading).
# Make sure this version does not fall behind the builder's tiledbsoma version.
"tiledbsoma~=1.9.1",
"tiledbsoma==1.11.3",
"anndata",
"numpy>=1.21",
"numpy>=1.21,<2.0",
"requests",
"typing_extensions",
"s3fs>=2021.06.1",
]

[project.optional-dependencies]
experimental = [
"torch~=2.0",
"torch~=2.2.0",
"torchdata~=0.7",
"scikit-learn~=1.0",
"scikit-misc>=0.2", # scikit-misc 0.3 dropped Python 3.8 support
Expand Down
4 changes: 3 additions & 1 deletion api/python/cellxgene_census/src/cellxgene_census/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from importlib import metadata

from ._get_anndata import get_anndata
from ._get_anndata import get_anndata, get_obs, get_var
from ._open import (
download_source_h5ad,
get_default_soma_context,
Expand All @@ -44,6 +44,8 @@
__all__ = [
"download_source_h5ad",
"get_anndata",
"get_obs",
"get_var",
"get_census_version_description",
"get_census_version_directory",
"get_census_mirror_directory",
Expand Down
90 changes: 89 additions & 1 deletion api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
Methods to retrieve slices of the census as AnnData objects.
"""

from typing import Optional, Sequence
from typing import Literal, Optional, Sequence

import anndata
import pandas as pd
import tiledbsoma as soma
from somacore.options import SparseDFCoord

Expand Down Expand Up @@ -146,3 +147,90 @@ def get_anndata(
adata.varm[emb] = embedding

return adata


def _get_axis_metadata(
    census: soma.Collection,
    axis: Literal["obs", "var"],
    organism: str,
    *,
    value_filter: Optional[str] = None,
    coords: Optional[SparseDFCoord] = slice(None),
    column_names: Optional[Sequence[str]] = None,
) -> pd.DataFrame:
    """Read one axis dataframe of an organism's experiment into pandas.

    Shared implementation behind :func:`get_obs` and :func:`get_var`.

    Args:
        census:
            The census collection handed to ``_get_experiment``.
        axis:
            ``"obs"`` reads ``experiment.obs``; ``"var"`` reads ``experiment.ms["RNA"].var``.
        organism:
            The organism name handed to ``_get_experiment``.
        value_filter:
            Forwarded unchanged to the dataframe's ``read`` call.
        coords:
            Selection for the axis. ``None`` is treated the same as ``slice(None)``.
        column_names:
            Forwarded unchanged to the dataframe's ``read`` call.

    Returns:
        The read result, concatenated and converted with ``to_pandas()``.

    Raises:
        ValueError: if ``axis`` is neither ``"obs"`` nor ``"var"``.
    """
    experiment = _get_experiment(census, organism)

    # Reject unknown axes up front so the selection below is a plain two-way choice.
    if axis not in ("obs", "var"):
        raise ValueError(f"axis should be either 'obs' or 'var', but '{axis}' was passed")
    axis_df = experiment.obs if axis == "obs" else experiment.ms["RNA"].var

    # ``read`` is always given a 1-tuple of coordinates; ``None`` stands for "everything".
    selection = slice(None) if coords is None else coords
    table = axis_df.read(
        coords=(selection,),
        column_names=column_names,
        value_filter=value_filter,
    ).concat()

    frame: pd.DataFrame = table.to_pandas()
    return frame


def get_obs(
    census: soma.Collection,
    organism: str,
    *,
    value_filter: Optional[str] = None,
    coords: Optional[SparseDFCoord] = slice(None),
    column_names: Optional[Sequence[str]] = None,
) -> pd.DataFrame:
    """Fetch ``obs`` (observation) metadata for a census query as a dataframe.

    Args:
        census:
            The census object, usually returned by :func:`open_soma`.
        organism:
            The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``.
        value_filter:
            Value filter for the ``obs`` metadata. Value is a filter query written in the
            SOMA ``value_filter`` syntax.
        coords:
            Coordinates for the ``obs`` axis, which is indexed by the ``soma_joinid`` value.
            May be an ``int``, a list of ``int``, or a slice. The default, ``slice(None)``,
            selects all; passing ``None`` is equivalent.
        column_names:
            Columns to fetch.

    Returns:
        A :class:`pandas.DataFrame` object containing metadata for the queried slice.
    """
    obs_metadata = _get_axis_metadata(
        census,
        "obs",
        organism,
        value_filter=value_filter,
        coords=coords,
        column_names=column_names,
    )
    return obs_metadata


def get_var(
    census: soma.Collection,
    organism: str,
    *,
    value_filter: Optional[str] = None,
    coords: Optional[SparseDFCoord] = slice(None),
    column_names: Optional[Sequence[str]] = None,
) -> pd.DataFrame:
    """Fetch ``var`` (variable) metadata for a census query as a dataframe.

    Args:
        census:
            The census object, usually returned by :func:`open_soma`.
        organism:
            The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``.
        value_filter:
            Value filter for the ``var`` metadata. Value is a filter query written in the
            SOMA ``value_filter`` syntax.
        coords:
            Coordinates for the ``var`` axis, which is indexed by the ``soma_joinid`` value.
            May be an ``int``, a list of ``int``, or a slice. The default, ``slice(None)``,
            selects all; passing ``None`` is equivalent.
        column_names:
            Columns to fetch.

    Returns:
        A :class:`pandas.DataFrame` object containing metadata for the queried slice.
    """
    var_metadata = _get_axis_metadata(
        census,
        "var",
        organism,
        value_filter=value_filter,
        coords=coords,
        column_names=column_names,
    )
    return var_metadata
20 changes: 18 additions & 2 deletions api/python/cellxgene_census/src/cellxgene_census/_open.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import s3fs
import tiledbsoma as soma
from fsspec.callbacks import NoOpCallback, TqdmCallback

from ._release_directory import (
CensusLocator,
Expand Down Expand Up @@ -297,7 +298,9 @@ def get_source_h5ad_uri(dataset_id: str, *, census_version: str = DEFAULT_CENSUS
return locator


def download_source_h5ad(dataset_id: str, to_path: str, *, census_version: str = DEFAULT_CENSUS_VERSION) -> None:
def download_source_h5ad(
dataset_id: str, to_path: str, *, census_version: str = DEFAULT_CENSUS_VERSION, progress_bar: bool = True
) -> None:
"""Download the source H5AD dataset, for the given `dataset_id`, to the user-specified
file name.
Expand All @@ -308,6 +311,8 @@ def download_source_h5ad(dataset_id: str, to_path: str, *, census_version: str =
The file name where the downloaded H5AD will be written. Must not already exist.
census_version:
The census version name. Defaults to ``"stable"``.
progress_bar:
Whether to display a progress bar. Defaults to ``True``.
Raises:
ValueError: if the path already exists (i.e., will not overwrite an existing file), or is not a file.
Expand All @@ -326,6 +331,13 @@ def download_source_h5ad(dataset_id: str, to_path: str, *, census_version: str =
if to_path.endswith("/"):
raise ValueError("Specify to_path as a file name, not a directory name.")

if progress_bar:
callback = TqdmCallback(
tqdm_kwargs={"unit": "B", "unit_scale": True, "unit_divisor": 1024, "desc": "Downloading"}
)
else:
callback = NoOpCallback()

locator = get_source_h5ad_uri(dataset_id, census_version=census_version)
protocol = urllib.parse.urlparse(locator["uri"]).scheme
assert protocol == "s3"
Expand All @@ -334,4 +346,8 @@ def download_source_h5ad(dataset_id: str, to_path: str, *, census_version: str =
anon=True,
cache_regions=True,
)
fs.get_file(locator["uri"], to_path)
fs.get_file(
locator["uri"],
to_path,
callback=callback,
)
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def _highly_variable_genes_seurat_v3(
n_batches = len(batch_index.cat.categories)
n_samples = batch_index.value_counts().loc[batch_index.cat.categories.to_numpy()].to_numpy()
if n_batches > 1:
batch_indexer = soma.IntIndexer(batch_index.index.to_numpy(), context=query.experiment.context).get_indexer
batch_indexer = batch_index.index.get_indexer
batch_codes = batch_index.cat.codes.to_numpy().astype(np.int64)
else:
n_batches = 1
Expand Down
Loading

0 comments on commit 5a22e2f

Please sign in to comment.