Merge branch 'main' into mlin/similarity-search-api-optimize-predict

chanzuckerberg · Sep 5, 2024 · 011242d · 011242d
2 parents cd16bd1 + fc7aefe
commit 011242d
Show file tree

Hide file tree

Showing 28 changed files with 228 additions and 230 deletions.
diff --git a/.github/workflows/py-dependency-check.yml b/.github/workflows/py-dependency-check.yml
@@ -25,10 +25,10 @@ jobs:
       fail-fast: false  # don't fail-fast, as errors are often specific to a single cell in the matrix
       matrix:
         os: [sc-dev-64g-runner, macos-latest]
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11", "3.12"]
         exclude:
           - os: macos-latest
-            python-version: "3.8"
+            python-version: "3.12"
 
     runs-on: ${{matrix.os}}
 

diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml
@@ -21,10 +21,10 @@ jobs:
       fail-fast: false  # Don't stop the workflow if one of the jobs fails
       matrix:
         os: [sc-dev-64g-runner, macos-latest]
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11", "3.12"]
         exclude:
           - os: macos-latest
-            python-version: "3.8"
+            python-version: "3.12"
 
     runs-on: ${{matrix.os}}
 

diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml
@@ -11,7 +11,7 @@ authors = [
 ]
 license = { text = "MIT" }
 readme = "README.md"
-requires-python = ">= 3.8, < 3.12"
+requires-python = ">= 3.10, < 3.13"
 classifiers = [
     "Development Status :: 4 - Beta",
     "Intended Audience :: Developers",
@@ -22,18 +22,17 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Bio-Informatics",
     "Operating System :: POSIX :: Linux",
     "Operating System :: MacOS :: MacOS X",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 dependencies= [
     # NOTE: the tiledbsoma version must be >= the version used in the Census builder, to
     # ensure that the assets are readable (tiledbsoma supports backward-compatible reading).
     # Make sure this version does not fall behind the builder's tiledbsoma version.
-    "tiledbsoma~=1.12.3",
+    "tiledbsoma>=1.12.3",
     "anndata",
-    "numpy>=1.21,<2.0",
+    "numpy>=1.23,<2.0",
     "requests",
     "typing_extensions",
     "s3fs>=2021.06.1",
@@ -43,9 +42,8 @@ dependencies= [
 experimental = [
     "torch",
     "torchdata~=0.7",
-    "scikit-learn~=1.0",
+    "scikit-learn>=1.2",
     "scikit-misc>=0.2,<0.4",  # scikit-misc 0.3 dropped Python 3.8 support, and 0.4 doesn't have MacOS/ARM wheels
-    "psutil~=5.0",
     "datasets~=2.0",
     "tdigest~=0.5",
     # choose newest version of tiledb-vector-search that doesn't need a newer version of tiledb
@@ -81,7 +79,7 @@ root = "../../.."
 [tool.ruff]
 line-length = 120
 src = ["api/python/cellxgene_census/src"]
-target-version = "py38"
+target-version = "py310"
 
 [tool.ruff.lint]
 select = [
@@ -129,6 +127,8 @@ ignore = [
     "D205",
     # Prefer absolute imports over relative imports from parent modules TODO: enable
     "TID252",
+    # It's okay to use zip without the strict kwarg. In fact, numba doesn't like it when you pass strict
+    "B905",
 ]
 
 [tool.ruff.lint.pydocstyle]

diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
@@ -7,7 +7,8 @@
 Methods to retrieve slices of the census as AnnData objects.
 """
 
-from typing import Literal, Optional, Sequence
+from collections.abc import Sequence
+from typing import Literal
 from warnings import warn
 
 import anndata
@@ -27,20 +28,20 @@ def get_anndata(
     organism: str,
     measurement_name: str = "RNA",
     X_name: str = "raw",
-    X_layers: Optional[Sequence[str]] = (),
-    obsm_layers: Optional[Sequence[str]] = (),
-    obsp_layers: Optional[Sequence[str]] = (),
-    varm_layers: Optional[Sequence[str]] = (),
-    varp_layers: Optional[Sequence[str]] = (),
-    obs_value_filter: Optional[str] = None,
-    obs_coords: Optional[SparseDFCoord] = None,
-    var_value_filter: Optional[str] = None,
-    var_coords: Optional[SparseDFCoord] = None,
-    column_names: Optional[soma.AxisColumnNames] = None,
-    obs_embeddings: Optional[Sequence[str]] = (),
-    var_embeddings: Optional[Sequence[str]] = (),
-    obs_column_names: Optional[Sequence[str]] = None,
-    var_column_names: Optional[Sequence[str]] = None,
+    X_layers: Sequence[str] | None = (),
+    obsm_layers: Sequence[str] | None = (),
+    obsp_layers: Sequence[str] | None = (),
+    varm_layers: Sequence[str] | None = (),
+    varp_layers: Sequence[str] | None = (),
+    obs_value_filter: str | None = None,
+    obs_coords: SparseDFCoord | None = None,
+    var_value_filter: str | None = None,
+    var_coords: SparseDFCoord | None = None,
+    column_names: soma.AxisColumnNames | None = None,
+    obs_embeddings: Sequence[str] | None = (),
+    var_embeddings: Sequence[str] | None = (),
+    obs_column_names: Sequence[str] | None = None,
+    var_column_names: Sequence[str] | None = None,
 ) -> anndata.AnnData:
     """Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query,
     and return it as an :class:`anndata.AnnData` object.
@@ -176,9 +177,9 @@ def _get_axis_metadata(
     axis: Literal["obs", "var"],
     organism: str,
     *,
-    value_filter: Optional[str] = None,
-    coords: Optional[SparseDFCoord] = slice(None),
-    column_names: Optional[Sequence[str]] = None,
+    value_filter: str | None = None,
+    coords: SparseDFCoord | None = slice(None),
+    column_names: Sequence[str] | None = None,
 ) -> pd.DataFrame:
     exp = _get_experiment(census, organism)
     coords = (slice(None),) if coords is None else (coords,)
@@ -198,9 +199,9 @@ def get_obs(
     census: soma.Collection,
     organism: str,
     *,
-    value_filter: Optional[str] = None,
-    coords: Optional[SparseDFCoord] = slice(None),
-    column_names: Optional[Sequence[str]] = None,
+    value_filter: str | None = None,
+    coords: SparseDFCoord | None = slice(None),
+    column_names: Sequence[str] | None = None,
 ) -> pd.DataFrame:
     """Get the observation metadata for a query on the census.
 
@@ -230,9 +231,9 @@ def get_var(
     census: soma.Collection,
     organism: str,
     *,
-    value_filter: Optional[str] = None,
-    coords: Optional[SparseDFCoord] = slice(None),
-    column_names: Optional[Sequence[str]] = None,
+    value_filter: str | None = None,
+    coords: SparseDFCoord | None = slice(None),
+    column_names: Sequence[str] | None = None,
 ) -> pd.DataFrame:
     """Get the variable metadata for a query on the census.
 

diff --git a/api/python/cellxgene_census/src/cellxgene_census/_open.py b/api/python/cellxgene_census/src/cellxgene_census/_open.py
@@ -10,7 +10,7 @@
 import logging
 import os.path
 import urllib.parse
-from typing import Any, Dict, Optional, get_args
+from typing import Any, get_args
 
 import s3fs
 import tiledbsoma as soma
@@ -32,7 +32,7 @@
     "anon": True,
     "cache_regions": True,
 }
-DEFAULT_TILEDB_CONFIGURATION: Dict[str, Any] = {
+DEFAULT_TILEDB_CONFIGURATION: dict[str, Any] = {
     # https://docs.tiledb.com/main/how-to/configuration#configuration-parameters
     "py.init_buffer_bytes": 1 * 1024**3,
     "soma.init_buffer_bytes": 1 * 1024**3,
@@ -71,7 +71,7 @@ def _resolve_census_locator(locator: CensusLocator, mirror: CensusMirror) -> Res
 
 def _open_soma(
     locator: ResolvedCensusLocator,
-    context: Optional[soma.options.SOMATileDBContext] = None,
+    context: soma.options.SOMATileDBContext | None = None,
 ) -> soma.Collection:
     """Private. Merge config defaults and return open census as a soma Collection/context."""
     # if no user-defined context, cellxgene_census defaults take precedence over SOMA defaults
@@ -85,7 +85,7 @@ def _open_soma(
     return soma.open(locator["uri"], mode="r", soma_type=soma.Collection, context=context)
 
 
-def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) -> soma.options.SOMATileDBContext:
+def get_default_soma_context(tiledb_config: dict[str, Any] | None = None) -> soma.options.SOMATileDBContext:
     """Return a :class:`tiledbsoma.SOMATileDBContext` with sensible defaults that can be further customized by the
     user. The customized context can then be passed to :func:`cellxgene_census.open_soma` with the ``context``
     argument or to :meth:`somacore.SOMAObject.open` with the ``context`` argument, such as
@@ -132,11 +132,11 @@ def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) ->
 
 def open_soma(
     *,
-    census_version: Optional[str] = DEFAULT_CENSUS_VERSION,
-    mirror: Optional[str] = None,
-    uri: Optional[str] = None,
-    tiledb_config: Optional[Dict[str, Any]] = None,
-    context: Optional[soma.options.SOMATileDBContext] = None,
+    census_version: str | None = DEFAULT_CENSUS_VERSION,
+    mirror: str | None = None,
+    uri: str | None = None,
+    tiledb_config: dict[str, Any] | None = None,
+    context: soma.options.SOMATileDBContext | None = None,
 ) -> soma.Collection:
     """Open the Census by version or URI.
 

diff --git a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py
@@ -7,9 +7,8 @@
 Methods to retrieve information about versions of the publicly hosted Census object.
 """
 
-import typing
 from collections import OrderedDict
-from typing import Any, Dict, Literal, Optional, Union, cast
+from typing import Any, Literal, cast
 
 import requests
 from typing_extensions import NotRequired, TypedDict
@@ -37,7 +36,7 @@ class CensusLocator(TypedDict):
 
     uri: str
     relative_uri: str
-    s3_region: Optional[str]
+    s3_region: str | None
 
 
 class CensusVersionRetraction(TypedDict):
@@ -55,13 +54,13 @@ class CensusVersionRetraction(TypedDict):
     """
 
     date: str
-    reason: Optional[str]
-    info_url: Optional[str]
-    replaced_by: Optional[str]
+    reason: str | None
+    info_url: str | None
+    replaced_by: str | None
 
 
 ReleaseFlag = Literal["lts", "retracted"]
-ReleaseFlags = Dict[ReleaseFlag, bool]
+ReleaseFlags = dict[ReleaseFlag, bool]
 
 
 class CensusVersionDescription(TypedDict):
@@ -82,15 +81,15 @@ class CensusVersionDescription(TypedDict):
             If retracted, details of the retraction.
     """
 
-    release_date: Optional[str]
+    release_date: str | None
     release_build: str
     soma: CensusLocator
     h5ads: CensusLocator
     flags: NotRequired[ReleaseFlags]
     retraction: NotRequired[CensusVersionRetraction]
 
 
-CensusDirectory = Dict[CensusVersionName, Union[CensusVersionName, CensusVersionDescription]]
+CensusDirectory = dict[CensusVersionName, CensusVersionName | CensusVersionDescription]
 
 """
 A provider identifies a storage medium for the Census, which can either be a cloud provider or a local file.
@@ -132,11 +131,11 @@ class CensusMirror(TypedDict):
 
     provider: Provider
     base_uri: str
-    region: Optional[str]
+    region: str | None
     embeddings_base_uri: str
 
 
-CensusMirrors = Dict[CensusMirrorName, Union[CensusMirrorName, CensusMirror]]
+CensusMirrors = dict[CensusMirrorName, CensusMirrorName | CensusMirror]
 
 
 class ResolvedCensusLocator(TypedDict):
@@ -155,7 +154,7 @@ class ResolvedCensusLocator(TypedDict):
     """
 
     uri: str
-    region: Optional[str]
+    region: str | None
     provider: str
 
 
@@ -200,8 +199,8 @@ def get_census_version_description(census_version: str) -> CensusVersionDescript
 
 
 def get_census_version_directory(
-    *, lts: Optional[bool] = None, retracted: Optional[bool] = False
-) -> Dict[CensusVersionName, CensusVersionDescription]:
+    *, lts: bool | None = None, retracted: bool | None = False
+) -> dict[CensusVersionName, CensusVersionDescription]:
     """Get the directory of Census versions currently available, optionally filtering by specified
     flags. If a filtering flag is not specified, Census versions will not be filtered by that flag.
     Defaults to including both "long-term stable" (LTS) and weekly Census versions, and excluding
@@ -358,7 +357,7 @@ def get_census_version_directory(
 
     directory: dict[str, str | dict[str, Any]] = response.json()
     directory_out: CensusDirectory = {}
-    aliases: typing.Set[CensusVersionName] = set()
+    aliases: set[CensusVersionName] = set()
 
     # Resolve all aliases for easier use
     for census_version_name in list(directory.keys()):
@@ -401,7 +400,7 @@ def get_census_version_directory(
         directory_out[census_version_name] = census_version_description.copy()
 
     # Cast is safe, as we have removed all aliases
-    unordered_directory = cast(Dict[CensusVersionName, CensusVersionDescription], directory_out)
+    unordered_directory = cast(dict[CensusVersionName, CensusVersionDescription], directory_out)
 
     # Sort by aliases and release date, descending
     aliased_releases = [(k, v) for k, v in unordered_directory.items() if k in aliases]
@@ -417,7 +416,7 @@ def get_census_version_directory(
     return ordered_directory
 
 
-def get_census_mirror_directory() -> Dict[CensusMirrorName, CensusMirror]:
+def get_census_mirror_directory() -> dict[CensusMirrorName, CensusMirror]:
     """Get the directory of Census mirrors currently available.
 
     Returns:
@@ -429,7 +428,7 @@ def get_census_mirror_directory() -> Dict[CensusMirrorName, CensusMirror]:
     """
     mirrors = _get_census_mirrors()
     del mirrors["default"]
-    return cast(Dict[CensusMirrorName, CensusMirror], mirrors)
+    return cast(dict[CensusMirrorName, CensusMirror], mirrors)
 
 
 def _get_census_mirrors() -> CensusMirrors:

diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py
@@ -8,7 +8,7 @@
 
 import json
 import warnings
-from typing import Any, Dict, cast
+from typing import Any, cast
 
 import numpy as np
 import numpy.typing as npt
@@ -55,7 +55,7 @@ def get_embedding_metadata(embedding_uri: str, context: soma.options.SOMATileDBC
         embedding_metadata = json.loads(E.metadata["CxG_embedding_info"])
         assert isinstance(embedding_metadata, dict)
 
-    return cast(Dict[str, Any], embedding_metadata)
+    return cast(dict[str, Any], embedding_metadata)
 
 
 def _get_embedding(
@@ -67,7 +67,7 @@ def _get_embedding(
     context: soma.options.SOMATileDBContext | None = None,
 ) -> npt.NDArray[np.float32]:
     """Private. Like get_embedding, but accepts a Census object and a Census directory."""
-    if isinstance(obs_soma_joinids, (pa.Array, pa.ChunkedArray, pd.Series)):
+    if isinstance(obs_soma_joinids, pa.Array | pa.ChunkedArray | pd.Series):
         obs_soma_joinids = obs_soma_joinids.to_numpy()
     assert isinstance(obs_soma_joinids, np.ndarray)
     if obs_soma_joinids.dtype != np.int64:
@@ -194,7 +194,7 @@ def get_embedding_metadata_by_name(
     response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, headers={"User-Agent": _user_agent()})
     response.raise_for_status()
 
-    manifest = cast(Dict[str, Dict[str, Any]], response.json())
+    manifest = cast(dict[str, dict[str, Any]], response.json())
     embeddings = []
     for _, obj in manifest.items():
         if (