Chunked writing of h5py.Dataset and zarr.Array #1624

Open · wants to merge 15 commits into base: main
Changes from 9 commits
60 changes: 53 additions & 7 deletions src/anndata/_io/specs/methods.py
@@ -375,13 +375,12 @@
# It's in the `AnnData.concatenate` docstring, but should we keep it?
@_REGISTRY.register_write(H5Group, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
def write_basic(
    f: GroupStorageType,
    k: str,
@@ -391,7 +390,50 @@
    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
    """Write methods which the underlying library handles natively."""
    f.create_dataset(k, data=elem, **dataset_kwargs)
    dataset_kwargs = dict(dataset_kwargs)
    dtype = dataset_kwargs.pop("dtype", elem.dtype)  # avoid passing dtype twice below
    f.create_dataset(k, data=elem, dtype=dtype, **dataset_kwargs)


def _iter_chunks_for_copy(elem: ArrayStorageType, dest: ArrayStorageType):
    """
    Returns an iterator of slices (or tuples of slices) for copying chunks from `elem` to `dest`.

    * If `dest` has chunks, it will return the chunks of `dest`.
    * If `dest` is not chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger.
    """
    if dest.chunks and hasattr(dest, "iter_chunks"):
        return dest.iter_chunks()
    else:
        itemsize = elem.dtype.itemsize
        shape = elem.shape
        entry_chunk_size = 100 * 1024 * 1024 // itemsize  # number of elements per ~100MB chunk
        # Convert the element budget to a row count by dividing by the row width
        n_rows = max(entry_chunk_size // np.prod(shape[1:], dtype=int), 1000)
        return (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows))


@_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0"))
def write_chunked_dense_array_to_group(
    f: GroupStorageType,
    k: str,
    elem: ArrayStorageType,
    *,
    _writer: Writer,
    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
    """Write to an h5py.Dataset in chunks.

    `h5py.Group.create_dataset(..., data: h5py.Dataset)` will load all of `data` into memory
    before writing. Instead, we write in chunks to avoid this. We don't need to do this for
    zarr since zarr handles it automatically.
    """
    dataset_kwargs = dict(dataset_kwargs)
    dtype = dataset_kwargs.pop("dtype", elem.dtype)  # avoid passing dtype twice below
    dest = f.create_dataset(k, shape=elem.shape, dtype=dtype, **dataset_kwargs)

    for chunk in _iter_chunks_for_copy(elem, dest):
        dest[chunk] = elem[chunk]


_REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
@@ -602,10 +644,14 @@
    # Allow resizing for hdf5
    if isinstance(f, H5Group) and "maxshape" not in dataset_kwargs:
        dataset_kwargs = dict(maxshape=(None,), **dataset_kwargs)

    g.create_dataset("data", data=value.data, **dataset_kwargs)
    g.create_dataset("indices", data=value.indices, **dataset_kwargs)
    g.create_dataset("indptr", data=value.indptr, dtype=indptr_dtype, **dataset_kwargs)
    _writer.write_elem(g, "data", value.data, dataset_kwargs=dataset_kwargs)
    _writer.write_elem(g, "indices", value.indices, dataset_kwargs=dataset_kwargs)
    _writer.write_elem(
        g,
        "indptr",
        value.indptr,
        dataset_kwargs={"dtype": indptr_dtype, **dataset_kwargs},
    )


write_csr = partial(write_sparse_compressed, fmt="csr")
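For reviewers who want to see the slicing arithmetic in isolation, here is a minimal standalone sketch of the same row-chunked copy; the helper name `iter_row_chunks` and the in-memory files are illustrative, not part of this PR. For float64 data, a 100 MiB budget is `100 * 1024**2 // 8 = 13_107_200` elements, so a `(1_000_000, 50)` array would be copied `13_107_200 // 50 = 262_144` rows at a time.

```python
import h5py
import numpy as np


def iter_row_chunks(shape, itemsize, target_bytes=100 * 1024 * 1024, min_rows=1000):
    """Yield row slices covering shape[0], each holding roughly target_bytes of data."""
    elems_per_row = int(np.prod(shape[1:], dtype=int)) or 1
    n_rows = max(target_bytes // itemsize // elems_per_row, min_rows)
    for i in range(0, shape[0], n_rows):
        yield slice(i, min(i + n_rows, shape[0]))


# Two throwaway in-memory HDF5 files stand in for real source/destination stores.
with (
    h5py.File("src.h5", "w", driver="core", backing_store=False) as src,
    h5py.File("dst.h5", "w", driver="core", backing_store=False) as dst,
):
    elem = src.create_dataset("X", data=np.random.rand(10_000, 50))
    dest = dst.create_dataset("X", shape=elem.shape, dtype=elem.dtype)
    for chunk in iter_row_chunks(elem.shape, elem.dtype.itemsize):
        dest[chunk] = elem[chunk]  # copy one slab at a time, never the whole array
    assert (dest[...] == elem[...]).all()
```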
6 changes: 2 additions & 4 deletions tests/test_io_dispatched.py
@@ -175,7 +175,5 @@ def zarr_reader(func, elem_name: str, elem, iospec):
    write_dispatched(f, "/", adata, callback=zarr_writer)
    _ = read_dispatched(f, zarr_reader)

    assert h5ad_write_keys == zarr_write_keys
    assert h5ad_read_keys == zarr_read_keys

    assert sorted(h5ad_write_keys) == sorted(h5ad_read_keys)
    assert sorted(h5ad_write_keys) == sorted(zarr_write_keys)
    assert sorted(h5ad_read_keys) == sorted(zarr_read_keys)
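The switch to sorted comparisons reflects that the h5py and zarr backends need not visit elements in the same order. A sketch of a key-collecting reader callback in the style of this test follows; the name `key_logging_reader` is a placeholder, and the `anndata.experimental` import path may differ in newer releases.

```python
import numpy as np
import zarr
from anndata import AnnData
from anndata.experimental import read_dispatched, write_elem

read_keys: list[str] = []


def key_logging_reader(func, elem_name: str, elem, iospec):
    """Record each element name visited, then defer to the default reader."""
    read_keys.append(elem_name)
    return func(elem)


f = zarr.group()  # in-memory zarr store
write_elem(f, "/", AnnData(X=np.ones((3, 2))))
adata = read_dispatched(f, key_logging_reader)
# Traversal order is backend-dependent, so cross-backend comparisons
# (as in the assertions above) should use sorted(read_keys).
```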
12 changes: 12 additions & 0 deletions tests/test_io_elementwise.py
@@ -185,6 +185,18 @@ def create_sparse_store(
    pytest.param(
        pd.array([True, False, True, True]), "nullable-boolean", id="pd_arr_bool"
    ),
    pytest.param(
        zarr.ones((100, 100), chunks=(10, 10)),
        "array",
        id="zarr_dense_array",
    ),
    pytest.param(
        create_dense_store(
            h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
        )["X"],
        "array",
        id="h5_dense_array",
    ),
    # pytest.param(bytes, b"some bytes", "bytes", id="py_bytes"), # Does not work for zarr
    # TODO consider how specific encodings should be. Should we be fully describing the written type?
    # Currently the info we add is: "what you wouldn't be able to figure out yourself"
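The `h5_dense_array` case relies on an HDF5 file that lives purely in memory: `driver="core"` with `backing_store=False` never touches disk despite the filename. A hedged sketch of the round trip these parameters exercise; the dataset names and the `read_elem`/`write_elem` import path are assumptions (newer anndata exposes them from `anndata.io`).

```python
import h5py
import numpy as np
from anndata.experimental import read_elem, write_elem

# An in-memory HDF5 file keeps the test hermetic.
src = h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
src["X"] = np.random.rand(100, 100)

# Writing the backed h5py.Dataset into another HDF5 group goes through the
# new chunked path instead of loading src["X"] fully into memory.
dst = h5py.File("test2.h5", mode="w", driver="core", backing_store=False)
write_elem(dst, "X", src["X"])
np.testing.assert_array_equal(read_elem(dst["X"]), src["X"][...])
```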