Skip to content

Commit

Permalink
[builder] improve unit tests (#1067)
Browse files Browse the repository at this point in the history
* fix several small bugs found by unit tests

* additional unit tests

* add utils tests

* lint

* improve flexibility of create_dask_client API

* improve ability to capture coverage during tests

* lint

* improve coverage measurement on builder tests

* expand conditions build is tested with

* debug CI

* more test debugging

* more debugging

* cleanup

* increase coverage with additional params

* turn off memory limit in test

* further constrain resource usage in tests

* further restrict resources

* more unit test resource fixes

* sigh, more resource reductions for unit tests

* bump tiledbsoma version to latest (#1073)

* [builder] add unit test for float precision reduction (#1076)

* add test for bfloat conversion

* [python] fix HVG unit test (#1077)

* fix HVG crash

* lint

* fix typo
  • Loading branch information
Bruce Martin authored Apr 2, 2024
1 parent 153f6a7 commit 8d736c4
Show file tree
Hide file tree
Showing 14 changed files with 412 additions and 49 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/py-unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ jobs:
pip install -r ./tools/scripts/requirements-dev.txt
- name: Test with pytest (builder)
run: |
PYTHONPATH=. coverage run --parallel-mode -m pytest -v -rP ./tools/cellxgene_census_builder/tests/
# Run with and without Numba JIT. This gives a more accurate representation of code coverage
PYTHONPATH=. coverage run --parallel-mode -m pytest -v -s -rP ./tools/cellxgene_census_builder/tests/
PYTHONPATH=. NUMBA_DISABLE_JIT=1 coverage run --parallel-mode -m pytest -v -s -rP ./tools/cellxgene_census_builder/tests/
- uses: actions/upload-artifact@v4
with:
name: coverage-builder-${{ matrix.os }}-${{ matrix.python-version }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _get_batch_index(
batch_series = obs[cast(str, batch_key[0])]

batch_series = batch_series.astype("category")
return batch_series
return batch_series.cat.remove_unused_categories()


def _highly_variable_genes_seurat_v3(
Expand Down
2 changes: 1 addition & 1 deletion tools/cellxgene_census_builder/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ dependencies= [
# recent cellxgene-census _readers_ are able to read the results of a Census build (writer).
# The compatibility matrix is defined here:
# https://github.com/TileDB-Inc/TileDB/blob/dev/format_spec/FORMAT_SPEC.md
"tiledbsoma==1.9.2",
"tiledbsoma==1.9.3",
"cellxgene-census==1.12.0",
"scipy==1.12.0",
"fsspec[http]==2024.3.1",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,15 +178,19 @@ def get_estimated_density(self) -> float:
Arbitrarily picks density of 1.0 if the file is empty on either axis
"""
if self.n_obs * self.n_vars == 0:
# Use whole-file n_obs/n_vars, not the slice length
n_obs = len(self._obs)
n_vars = len(self._var)
if n_obs * n_vars == 0:
return 1.0

nnz: int
if isinstance(self._X, CSRDataset | CSCDataset):
nnz = self._X.group["data"].size
else:
nnz = self._X.size
return nnz / (self.n_obs * self.n_vars)

return nnz / (n_obs * n_vars)

def _load_dataframe(self, elem: h5py.Group, column_names: tuple[str, ...] | None) -> pd.DataFrame:
# if reading all, just use the built-in
Expand Down Expand Up @@ -236,7 +240,10 @@ def _load_h5ad(
), "Unsupported AnnData encoding-type or encoding-version - likely indicates file was created with an unsupported AnnData version"

# Verify we are reading the expected CxG schema version.
schema_version = read_elem(self._file["uns/schema_version"])
if "schema_version" in self._file["uns"]:
schema_version = read_elem(self._file["uns/schema_version"])
else:
schema_version = "UNKNOWN"
if schema_version != CXG_SCHEMA_VERSION:
raise ValueError(
f"{self.filename} -- incorrect CxG schema version (got {schema_version}, expected {CXG_SCHEMA_VERSION})"
Expand Down Expand Up @@ -316,8 +323,10 @@ def make_anndata_cell_filter(filter_spec: AnnDataFilterSpec) -> AnnDataFilterFun
* genes only (var.feature_biotype == 'gene')
* Single organism
"""
organism_ontology_term_id = filter_spec.get("organism_ontology_term_id", None)
assay_ontology_term_ids = filter_spec.get("assay_ontology_term_ids", None)
organism_ontology_term_id = filter_spec.get("organism_ontology_term_id")
assert isinstance(organism_ontology_term_id, str)
assay_ontology_term_ids = filter_spec.get("assay_ontology_term_ids")
assert isinstance(assay_ontology_term_ids, list)

def _filter(ad: AnnDataProxy) -> AnnDataProxy:
"""Filter observations and features per Census schema."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ def _consolidate_array(
modes = consolidation_modes or ["fragment_meta", "array_meta", "commits", "fragments"]
uri = obj.uri

# use ~1/8 of RAM, clamped to [1, 32].
total_buffer_size = clamp(int(psutil.virtual_memory().total / 8 // 1024**3), 1, 32) * 1024**3
# use ~1/32th of RAM, clamped to [1, 32].
total_buffer_size = clamp(int(psutil.virtual_memory().total / 32 // 1024**3), 1, 32) * 1024**3

for mode in modes:
tiledb.consolidate(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import multiprocessing
import time
from typing import Any

import dask
import dask.distributed
Expand Down Expand Up @@ -33,27 +34,25 @@ def setup(self, worker: dask.distributed.Worker) -> None:

def create_dask_client(
args: CensusBuildArgs,
*,
n_workers: int | None = None,
threads_per_worker: int | None = None,
memory_limit: str | float | int | None = "auto",
**kwargs: Any,
) -> dask.distributed.Client:
"""Create and return a Dask client."""
# create a new client
assert _mp_config_checks()

n_workers = max(1, n_workers or cpu_count())
kwargs.update(
{
"n_workers": max(1, kwargs.get("n_workers", cpu_count())),
}
)
dask.config.set(
{
"distributed.scheduler.worker-ttl": "24 hours", # some tasks are very long-lived, e.g., consolidation
}
)

client = dask.distributed.Client(
n_workers=n_workers,
threads_per_worker=threads_per_worker,
memory_limit=memory_limit,
dashboard_address=":8787" if args.config.dashboard else None,
**kwargs,
)
client.register_plugin(SetupDaskWorker(args))
logger.info(f"Dask client created: {client}")
Expand Down
12 changes: 9 additions & 3 deletions tools/cellxgene_census_builder/tests/anndata/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from cellxgene_census_builder.build_soma.datasets import Dataset
from cellxgene_census_builder.build_state import CensusBuildArgs

from ..conftest import ORGANISMS, get_anndata
from ..conftest import ORGANISMS, X_FORMAT, get_anndata


@pytest.fixture
Expand Down Expand Up @@ -91,8 +91,14 @@ def datasets_with_incorrect_schema_version(census_build_args: CensusBuildArgs) -


@pytest.fixture
def h5ad_simple(tmp_path: pathlib.Path) -> str:
adata = get_anndata(ORGANISMS[0])
def h5ad_simple(request: pytest.FixtureRequest, tmp_path: pathlib.Path) -> str:
# parameterization is optional
try:
X_format: X_FORMAT = request.param
except AttributeError:
X_format = "csr" # default

adata = get_anndata(ORGANISMS[0], X_format=X_format)

path = "simple.h5ad"
adata.write_h5ad(tmp_path / path)
Expand Down
Loading

0 comments on commit 8d736c4

Please sign in to comment.