Chunked writing of h5py.Dataset and zarr.Array #1624

Open · wants to merge 15 commits into base: main
Changes from 9 commits
60 changes: 53 additions & 7 deletions src/anndata/_io/specs/methods.py
@@ -375,13 +375,12 @@
# It's in the `AnnData.concatenate` docstring, but should we keep it?
@_REGISTRY.register_write(H5Group, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
def write_basic(
    f: GroupStorageType,
    k: str,
@@ -391,7 +390,50 @@
    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
    """Write methods which the underlying library handles natively."""
    f.create_dataset(k, data=elem, **dataset_kwargs)
    dataset_kwargs = dict(dataset_kwargs)
    dtype = dataset_kwargs.pop("dtype", elem.dtype)  # avoid passing dtype twice below
    f.create_dataset(k, data=elem, dtype=dtype, **dataset_kwargs)


def _iter_chunks_for_copy(elem: ArrayStorageType, dest: ArrayStorageType):
    """
    Returns an iterator of slices (or tuples of slices) for copying chunks from `elem` to `dest`.

    * If `dest` has chunks, it will return the chunks of `dest`.
    * If `dest` is not chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger.
    """
    if dest.chunks and hasattr(dest, "iter_chunks"):
        return dest.iter_chunks()
    else:
        itemsize = elem.dtype.itemsize
        shape = elem.shape
        entry_chunk_size = 100 * 1024 * 1024 // itemsize  # number of elements per ~100MB chunk
        # Convert the element budget to a row count by dividing by the row width
        n_rows = max(entry_chunk_size // np.prod(shape[1:], dtype=int), 1000)
        return (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows))


@_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0"))
def write_chunked_dense_array_to_group(
    f: GroupStorageType,
    k: str,
    elem: ArrayStorageType,
    *,
    _writer: Writer,
    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
    """Write to an h5py.Dataset in chunks.

    `h5py.Group.create_dataset(..., data: h5py.Dataset)` will load all of `data` into memory
    before writing. Instead, we write in chunks to avoid this. We don't need to do this for
    zarr since zarr handles it automatically.
    """
    dataset_kwargs = dict(dataset_kwargs)
    dtype = dataset_kwargs.pop("dtype", elem.dtype)  # avoid passing dtype twice below
    dest = f.create_dataset(k, shape=elem.shape, dtype=dtype, **dataset_kwargs)

    for chunk in _iter_chunks_for_copy(elem, dest):
        dest[chunk] = elem[chunk]


_REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
@@ -602,10 +644,14 @@
    # Allow resizing for hdf5
    if isinstance(f, H5Group) and "maxshape" not in dataset_kwargs:
        dataset_kwargs = dict(maxshape=(None,), **dataset_kwargs)

    g.create_dataset("data", data=value.data, **dataset_kwargs)
    g.create_dataset("indices", data=value.indices, **dataset_kwargs)
    g.create_dataset("indptr", data=value.indptr, dtype=indptr_dtype, **dataset_kwargs)
    _writer.write_elem(g, "data", value.data, dataset_kwargs=dataset_kwargs)
    _writer.write_elem(g, "indices", value.indices, dataset_kwargs=dataset_kwargs)
    _writer.write_elem(
        g,
        "indptr",
        value.indptr,
        dataset_kwargs={"dtype": indptr_dtype, **dataset_kwargs},
    )


write_csr = partial(write_sparse_compressed, fmt="csr")
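For reviewers who want to see the slicing arithmetic in isolation, here is a minimal standalone sketch of the same row-chunked copy; the helper name `iter_row_chunks` and the in-memory files are illustrative, not part of this PR. For float64 data, a 100 MiB budget is `100 * 1024**2 // 8 = 13_107_200` elements, so a `(1_000_000, 50)` array would be copied `13_107_200 // 50 = 262_144` rows at a time.

```python
import h5py
import numpy as np


def iter_row_chunks(shape, itemsize, target_bytes=100 * 1024 * 1024, min_rows=1000):
    """Yield row slices covering shape[0], each holding roughly target_bytes of data."""
    elems_per_row = int(np.prod(shape[1:], dtype=int)) or 1
    n_rows = max(target_bytes // itemsize // elems_per_row, min_rows)
    for i in range(0, shape[0], n_rows):
        yield slice(i, min(i + n_rows, shape[0]))


# Two throwaway in-memory HDF5 files stand in for real source/destination stores.
with (
    h5py.File("src.h5", "w", driver="core", backing_store=False) as src,
    h5py.File("dst.h5", "w", driver="core", backing_store=False) as dst,
):
    elem = src.create_dataset("X", data=np.random.rand(10_000, 50))
    dest = dst.create_dataset("X", shape=elem.shape, dtype=elem.dtype)
    for chunk in iter_row_chunks(elem.shape, elem.dtype.itemsize):
        dest[chunk] = elem[chunk]  # copy one slab at a time, never the whole array
    assert (dest[...] == elem[...]).all()
```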
6 changes: 2 additions & 4 deletions tests/test_io_dispatched.py
@@ -175,7 +175,5 @@ def zarr_reader(func, elem_name: str, elem, iospec):
    write_dispatched(f, "/", adata, callback=zarr_writer)
    _ = read_dispatched(f, zarr_reader)

    assert h5ad_write_keys == zarr_write_keys
    assert h5ad_read_keys == zarr_read_keys

    assert sorted(h5ad_write_keys) == sorted(h5ad_read_keys)
    assert sorted(h5ad_write_keys) == sorted(zarr_write_keys)
    assert sorted(h5ad_read_keys) == sorted(zarr_read_keys)
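The switch to sorted comparisons reflects that the h5py and zarr backends need not visit elements in the same order. A sketch of a key-collecting reader callback in the style of this test follows; the name `key_logging_reader` is a placeholder, and the `anndata.experimental` import path may differ in newer releases.

```python
import numpy as np
import zarr
from anndata import AnnData
from anndata.experimental import read_dispatched, write_elem

read_keys: list[str] = []


def key_logging_reader(func, elem_name: str, elem, iospec):
    """Record each element name visited, then defer to the default reader."""
    read_keys.append(elem_name)
    return func(elem)


f = zarr.group()  # in-memory zarr store
write_elem(f, "/", AnnData(X=np.ones((3, 2))))
adata = read_dispatched(f, key_logging_reader)
# Traversal order is backend-dependent, so cross-backend comparisons
# (as in the assertions above) should use sorted(read_keys).
```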
12 changes: 12 additions & 0 deletions tests/test_io_elementwise.py
@@ -185,6 +185,18 @@ def create_sparse_store(
    pytest.param(
        pd.array([True, False, True, True]), "nullable-boolean", id="pd_arr_bool"
    ),
    pytest.param(
        zarr.ones((100, 100), chunks=(10, 10)),
        "array",
        id="zarr_dense_array",
    ),
    pytest.param(
        create_dense_store(
            h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
        )["X"],
        "array",
        id="h5_dense_array",
    ),
    # pytest.param(bytes, b"some bytes", "bytes", id="py_bytes"), # Does not work for zarr
    # TODO consider how specific encodings should be. Should we be fully describing the written type?
    # Currently the info we add is: "what you wouldn't be able to figure out yourself"
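The `h5_dense_array` case relies on an HDF5 file that lives purely in memory: `driver="core"` with `backing_store=False` never touches disk despite the filename. A hedged sketch of the round trip these parameters exercise; the dataset names and the `read_elem`/`write_elem` import path are assumptions (newer anndata exposes them from `anndata.io`).

```python
import h5py
import numpy as np
from anndata.experimental import read_elem, write_elem

# An in-memory HDF5 file keeps the test hermetic.
src = h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
src["X"] = np.random.rand(100, 100)

# Writing the backed h5py.Dataset into another HDF5 group goes through the
# new chunked path instead of loading src["X"] fully into memory.
dst = h5py.File("test2.h5", mode="w", driver="core", backing_store=False)
write_elem(dst, "X", src["X"])
np.testing.assert_array_equal(read_elem(dst["X"]), src["X"][...])
```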