Skip to content

Commit

Permalink
[builder] improve unit tests (#1067)
Browse files Browse the repository at this point in the history
* fix several small bugs found by unit tests

* additional unit tests

* add utils tests

* lint

* improve flexibility of create_dask_client API

* improve ability to capture coverage during tests

* lint

* improve coverage measurement on builder tests

* expand conditions build is tested with

* debug CI

* more test debugging

* more debugging

* cleanup

* increase coverage with additional params

* turn off memory limit in test

* further constrain resource usage in tests

* further restrict resources

* more unit test resource fixes

* sigh, more resource reductions for unit tests

* bump tiledbsoma version to latest (#1073)

* [builder] add unit test for float precision reduction (#1076)

* add test for bfloat conversion

* [python] fix HVG unit test (#1077)

* fix HVG crash

* lint

* fix typo
  • Loading branch information
Bruce Martin authored Apr 2, 2024
1 parent 153f6a7 commit 8d736c4
Show file tree
Hide file tree
Showing 14 changed files with 412 additions and 49 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/py-unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ jobs:
pip install -r ./tools/scripts/requirements-dev.txt
- name: Test with pytest (builder)
run: |
PYTHONPATH=. coverage run --parallel-mode -m pytest -v -rP ./tools/cellxgene_census_builder/tests/
# Run with and without Numba JIT. This gives a more accurate representation of code coverage
PYTHONPATH=. coverage run --parallel-mode -m pytest -v -s -rP ./tools/cellxgene_census_builder/tests/
PYTHONPATH=. NUMBA_DISABLE_JIT=1 coverage run --parallel-mode -m pytest -v -s -rP ./tools/cellxgene_census_builder/tests/
- uses: actions/upload-artifact@v4
with:
name: coverage-builder-${{ matrix.os }}-${{ matrix.python-version }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _get_batch_index(
batch_series = obs[cast(str, batch_key[0])]

batch_series = batch_series.astype("category")
return batch_series
return batch_series.cat.remove_unused_categories()


def _highly_variable_genes_seurat_v3(
Expand Down
2 changes: 1 addition & 1 deletion tools/cellxgene_census_builder/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ dependencies= [
# recent cellxgene-census _readers_ are able to read the results of a Census build (writer).
# The compatibility matrix is defined here:
# https://github.com/TileDB-Inc/TileDB/blob/dev/format_spec/FORMAT_SPEC.md
"tiledbsoma==1.9.2",
"tiledbsoma==1.9.3",
"cellxgene-census==1.12.0",
"scipy==1.12.0",
"fsspec[http]==2024.3.1",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,15 +178,19 @@ def get_estimated_density(self) -> float:
Arbitrarily picks density of 1.0 if the file is empty on either axis
"""
if self.n_obs * self.n_vars == 0:
# Use whole-file n_obs/n_vars, not the slice length
n_obs = len(self._obs)
n_vars = len(self._var)
if n_obs * n_vars == 0:
return 1.0

nnz: int
if isinstance(self._X, CSRDataset | CSCDataset):
nnz = self._X.group["data"].size
else:
nnz = self._X.size
return nnz / (self.n_obs * self.n_vars)

return nnz / (n_obs * n_vars)

def _load_dataframe(self, elem: h5py.Group, column_names: tuple[str, ...] | None) -> pd.DataFrame:
# if reading all, just use the built-in
Expand Down Expand Up @@ -236,7 +240,10 @@ def _load_h5ad(
), "Unsupported AnnData encoding-type or encoding-version - likely indicates file was created with an unsupported AnnData version"

# Verify we are reading the expected CxG schema version.
schema_version = read_elem(self._file["uns/schema_version"])
if "schema_version" in self._file["uns"]:
schema_version = read_elem(self._file["uns/schema_version"])
else:
schema_version = "UNKNOWN"
if schema_version != CXG_SCHEMA_VERSION:
raise ValueError(
f"{self.filename} -- incorrect CxG schema version (got {schema_version}, expected {CXG_SCHEMA_VERSION})"
Expand Down Expand Up @@ -316,8 +323,10 @@ def make_anndata_cell_filter(filter_spec: AnnDataFilterSpec) -> AnnDataFilterFun
* genes only (var.feature_biotype == 'gene')
* Single organism
"""
organism_ontology_term_id = filter_spec.get("organism_ontology_term_id", None)
assay_ontology_term_ids = filter_spec.get("assay_ontology_term_ids", None)
organism_ontology_term_id = filter_spec.get("organism_ontology_term_id")
assert isinstance(organism_ontology_term_id, str)
assay_ontology_term_ids = filter_spec.get("assay_ontology_term_ids")
assert isinstance(assay_ontology_term_ids, list)

def _filter(ad: AnnDataProxy) -> AnnDataProxy:
"""Filter observations and features per Census schema."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ def _consolidate_array(
modes = consolidation_modes or ["fragment_meta", "array_meta", "commits", "fragments"]
uri = obj.uri

# use ~1/8 of RAM, clamped to [1, 32].
total_buffer_size = clamp(int(psutil.virtual_memory().total / 8 // 1024**3), 1, 32) * 1024**3
# use ~1/32th of RAM, clamped to [1, 32].
total_buffer_size = clamp(int(psutil.virtual_memory().total / 32 // 1024**3), 1, 32) * 1024**3

for mode in modes:
tiledb.consolidate(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import multiprocessing
import time
from typing import Any

import dask
import dask.distributed
Expand Down Expand Up @@ -33,27 +34,25 @@ def setup(self, worker: dask.distributed.Worker) -> None:

def create_dask_client(
args: CensusBuildArgs,
*,
n_workers: int | None = None,
threads_per_worker: int | None = None,
memory_limit: str | float | int | None = "auto",
**kwargs: Any,
) -> dask.distributed.Client:
"""Create and return a Dask client."""
# create a new client
assert _mp_config_checks()

n_workers = max(1, n_workers or cpu_count())
kwargs.update(
{
"n_workers": max(1, kwargs.get("n_workers", cpu_count())),
}
)
dask.config.set(
{
"distributed.scheduler.worker-ttl": "24 hours", # some tasks are very long-lived, e.g., consolidation
}
)

client = dask.distributed.Client(
n_workers=n_workers,
threads_per_worker=threads_per_worker,
memory_limit=memory_limit,
dashboard_address=":8787" if args.config.dashboard else None,
**kwargs,
)
client.register_plugin(SetupDaskWorker(args))
logger.info(f"Dask client created: {client}")
Expand Down
12 changes: 9 additions & 3 deletions tools/cellxgene_census_builder/tests/anndata/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from cellxgene_census_builder.build_soma.datasets import Dataset
from cellxgene_census_builder.build_state import CensusBuildArgs

from ..conftest import ORGANISMS, get_anndata
from ..conftest import ORGANISMS, X_FORMAT, get_anndata


@pytest.fixture
Expand Down Expand Up @@ -91,8 +91,14 @@ def datasets_with_incorrect_schema_version(census_build_args: CensusBuildArgs) -


@pytest.fixture
def h5ad_simple(tmp_path: pathlib.Path) -> str:
adata = get_anndata(ORGANISMS[0])
def h5ad_simple(request: pytest.FixtureRequest, tmp_path: pathlib.Path) -> str:
# parameterization is optional
try:
X_format: X_FORMAT = request.param
except AttributeError:
X_format = "csr" # default

adata = get_anndata(ORGANISMS[0], X_format=X_format)

path = "simple.h5ad"
adata.write_h5ad(tmp_path / path)
Expand Down
Loading

0 comments on commit 8d736c4

Please sign in to comment.