Skip to content

Commit

Permalink
Merge branch 'v3' into tom/fix/dtype-str-special-case
Browse files Browse the repository at this point in the history
  • Loading branch information
jhamman authored Oct 10, 2024
2 parents 7e76e9e + 395604d commit df92bad
Show file tree
Hide file tree
Showing 21 changed files with 209 additions and 70 deletions.
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,7 @@ repos:
hooks:
- id: rst-directive-colons
- id: rst-inline-touching-normal
- repo: https://github.com/numpy/numpydoc
rev: v1.8.0
hooks:
- id: numpydoc-validation
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -319,3 +319,7 @@ ignore = [
"PC111", # fix Python code in documentation - enable later
"PC180", # for JavaScript - not interested
]

[tool.numpydoc_validation]
# See https://numpydoc.readthedocs.io/en/latest/validation.html#built-in-validation-checks for the full list of checks
checks = ["GL06", "GL07", "GL10", "PR03", "PR05", "PR06"]
2 changes: 1 addition & 1 deletion src/zarr/abc/codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@
from zarr.core.indexing import SelectorTuple

__all__ = [
"BaseCodec",
"ArrayArrayCodec",
"ArrayBytesCodec",
"ArrayBytesCodecPartialDecodeMixin",
"ArrayBytesCodecPartialEncodeMixin",
"BaseCodec",
"BytesBytesCodec",
"CodecInput",
"CodecOutput",
Expand Down
2 changes: 1 addition & 1 deletion src/zarr/abc/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class Store(ABC):
_mode: AccessMode
_is_open: bool

def __init__(self, mode: AccessModeLiteral = "r", *args: Any, **kwargs: Any) -> None:
def __init__(self, *args: Any, mode: AccessModeLiteral = "r", **kwargs: Any) -> None:
self._is_open = False
self._mode = AccessMode.from_literal(mode)

Expand Down
38 changes: 22 additions & 16 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from zarr.core.group import AsyncGroup
from zarr.core.metadata.v2 import ArrayV2Metadata
from zarr.core.metadata.v3 import ArrayV3Metadata
from zarr.errors import NodeTypeValidationError
from zarr.storage import (
StoreLike,
StorePath,
Expand Down Expand Up @@ -159,7 +160,7 @@ async def load(
Parameters
----------
store : Store or string
store : Store or str
Store or path to directory in file system or name of zip file.
path : str or None, optional
The path within the store from which to load.
Expand Down Expand Up @@ -203,7 +204,7 @@ async def open(
Parameters
----------
store : Store or string, optional
store : Store or str, optional
Store or path to directory in file system or name of zip file.
mode : {'r', 'r+', 'a', 'w', 'w-'}, optional
Persistence mode: 'r' means read only (must exist); 'r+' means
Expand Down Expand Up @@ -247,7 +248,10 @@ async def open(

try:
return await open_array(store=store_path, zarr_format=zarr_format, **kwargs)
except KeyError:
except (KeyError, NodeTypeValidationError):
# KeyError for a missing key
# NodeTypeValidationError for failing to parse node metadata as an array when it's
# actually a group
return await open_group(store=store_path, zarr_format=zarr_format, **kwargs)


Expand All @@ -267,7 +271,7 @@ async def save(
Parameters
----------
store : Store or string
store : Store or str
Store or path to directory in file system or name of zip file.
args : ndarray
NumPy arrays with data to save.
Expand Down Expand Up @@ -303,7 +307,7 @@ async def save_array(
Parameters
----------
store : Store or string
store : Store or str
Store or path to directory in file system or name of zip file.
arr : ndarray
NumPy array with data to save.
Expand Down Expand Up @@ -351,7 +355,7 @@ async def save_group(
Parameters
----------
store : Store or string
store : Store or str
Store or path to directory in file system or name of zip file.
args : ndarray
NumPy arrays with data to save.
Expand Down Expand Up @@ -467,7 +471,7 @@ async def group(
Parameters
----------
store : Store or string, optional
store : Store or str, optional
Store or path to directory in file system.
overwrite : bool, optional
If True, delete any pre-existing data in `store` at `path` before
Expand All @@ -481,7 +485,7 @@ async def group(
to all attribute read operations.
synchronizer : object, optional
Array synchronizer.
path : string, optional
path : str, optional
Group path within store.
meta_array : array-like, optional
An array instance to use for determining arrays to create and return
Expand Down Expand Up @@ -547,7 +551,7 @@ async def open_group(
Parameters
----------
store : Store, string, or mapping, optional
store : Store, str, or mapping, optional
Store or path to directory in file system or name of zip file.
Strings are interpreted as paths on the local file system
Expand All @@ -570,16 +574,18 @@ async def open_group(
to all attribute read operations.
synchronizer : object, optional
Array synchronizer.
path : string, optional
path : str, optional
Group path within store.
chunk_store : Store or string, optional
chunk_store : Store or str, optional
Store or path to directory in file system or name of zip file.
storage_options : dict
If using an fsspec URL to create the store, these will be passed to
the backend implementation. Ignored otherwise.
meta_array : array-like, optional
An array instance to use for determining arrays to create and return
to users. Use `numpy.empty(())` by default.
attributes : dict
A dictionary of JSON-serializable values with user-defined attributes.
Returns
-------
Expand Down Expand Up @@ -664,22 +670,22 @@ async def create(
False, will be set to `shape`, i.e., single chunk for the whole array.
If an int, the chunk size in each dimension will be given by the value
of `chunks`. Default is True.
dtype : string or dtype, optional
dtype : str or dtype, optional
NumPy dtype.
compressor : Codec, optional
Primary compressor.
fill_value : object
Default value to use for uninitialized portions of the array.
order : {'C', 'F'}, optional
Memory layout to be used within each chunk.
store : Store or string
store : Store or str
Store or path to directory in file system or name of zip file.
synchronizer : object, optional
Array synchronizer.
overwrite : bool, optional
If True, delete all pre-existing data in `store` at `path` before
creating the array.
path : string, optional
path : str, optional
Path under which array is stored.
chunk_store : MutableMapping, optional
Separate storage for chunks. If not provided, `store` will be used
Expand Down Expand Up @@ -937,11 +943,11 @@ async def open_array(
Parameters
----------
store : Store or string
store : Store or str
Store or path to directory in file system or name of zip file.
zarr_format : {2, 3, None}, optional
The zarr format to use when saving.
path : string, optional
path : str, optional
Path in store to array.
storage_options : dict
If using an fsspec URL to create the store, these will be passed to
Expand Down
2 changes: 2 additions & 0 deletions src/zarr/api/synchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ def open_group(
zarr_version: ZarrFormat | None = None, # deprecated
zarr_format: ZarrFormat | None = None,
meta_array: Any | None = None, # not used in async api
attributes: dict[str, JSON] | None = None,
) -> Group:
return Group(
sync(
Expand All @@ -221,6 +222,7 @@ def open_group(
zarr_version=zarr_version,
zarr_format=zarr_format,
meta_array=meta_array,
attributes=attributes,
)
)
)
Expand Down
2 changes: 1 addition & 1 deletion src/zarr/codecs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
"ShardingCodec",
"ShardingCodecIndexLocation",
"TransposeCodec",
"VLenUTF8Codec",
"VLenBytesCodec",
"VLenUTF8Codec",
"ZstdCodec",
]

Expand Down
18 changes: 16 additions & 2 deletions src/zarr/codecs/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from zarr.core.common import ChunkCoords, concurrent_map
from zarr.core.config import config
from zarr.core.indexing import SelectorTuple, is_scalar, is_total_slice
from zarr.core.metadata.v2 import _default_fill_value
from zarr.registry import register_pipeline

if TYPE_CHECKING:
Expand Down Expand Up @@ -247,7 +248,17 @@ async def read_batch(
if chunk_array is not None:
out[out_selection] = chunk_array
else:
out[out_selection] = chunk_spec.fill_value
fill_value = chunk_spec.fill_value

if fill_value is None:
# Zarr V2 allowed `fill_value` to be null in the metadata.
# Zarr V3 requires it to be set. This has already been
# validated when decoding the metadata, but because we also
# read Zarr V2 data we must still handle a fill_value of
# None here.
fill_value = _default_fill_value(dtype=chunk_spec.dtype)

out[out_selection] = fill_value
else:
chunk_bytes_batch = await concurrent_map(
[
Expand All @@ -274,7 +285,10 @@ async def read_batch(
tmp = tmp.squeeze(axis=drop_axes)
out[out_selection] = tmp
else:
out[out_selection] = chunk_spec.fill_value
fill_value = chunk_spec.fill_value
if fill_value is None:
fill_value = _default_fill_value(dtype=chunk_spec.dtype)
out[out_selection] = fill_value

def _merge_chunk_array(
self,
Expand Down
31 changes: 16 additions & 15 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
from zarr.core.metadata.v2 import ArrayV2Metadata
from zarr.core.metadata.v3 import ArrayV3Metadata
from zarr.core.sync import collect_aiterator, sync
from zarr.errors import MetadataValidationError
from zarr.registry import get_pipeline_class
from zarr.storage import StoreLike, make_store_path
from zarr.storage.common import StorePath, ensure_no_existing_node
Expand Down Expand Up @@ -145,7 +146,7 @@ async def get_array_metadata(
else:
zarr_format = 2
else:
raise ValueError(f"unexpected zarr_format: {zarr_format}")
raise MetadataValidationError("zarr_format", "2, 3, or None", zarr_format)

metadata_dict: dict[str, Any]
if zarr_format == 2:
Expand Down Expand Up @@ -382,7 +383,7 @@ async def _create_v2(
chunks=chunks,
order=order,
dimension_separator=dimension_separator,
fill_value=0 if fill_value is None else fill_value,
fill_value=fill_value,
compressor=compressor,
filters=filters,
attributes=attributes,
Expand Down Expand Up @@ -1290,11 +1291,11 @@ def get_basic_selection(
array. May be any combination of int and/or slice or ellipsis for multidimensional arrays.
out : NDBuffer, optional
If given, load the selected data directly into this buffer.
prototype : BufferPrototype, optional
The prototype of the buffer to use for the output data. If not provided, the default buffer prototype is used.
fields : str or sequence of str, optional
For arrays with a structured dtype, one or more fields can be specified to
extract data for.
prototype : BufferPrototype, optional
The prototype of the buffer to use for the output data. If not provided, the default buffer prototype is used.
Returns
-------
Expand Down Expand Up @@ -2286,6 +2287,17 @@ def resize(self, new_shape: ChunkCoords) -> Array:
This method does not modify the original Array object. Instead, it returns a new Array
with the specified shape.
Notes
-----
When resizing an array, the data are not rearranged in any way.
If one or more dimensions are shrunk, any chunks falling outside the
new array shape will be deleted from the underlying store.
However, note that boundary chunks (chunks partially falling inside the
new array shape) remain intact, and therefore the data falling outside
the new array but inside a boundary chunk can be restored by a
subsequent resize operation that grows the array again.
Examples
--------
>>> import zarr
Expand All @@ -2303,17 +2315,6 @@ def resize(self, new_shape: ChunkCoords) -> Array:
(20000, 1000)
>>> z2.shape
(50, 50)
Notes
-----
When resizing an array, the data are not rearranged in any way.
If one or more dimensions are shrunk, any chunks falling outside the
new array shape will be deleted from the underlying store.
However, it is noteworthy that the chunks partially falling inside the new array
(i.e. boundary chunks) will remain intact, and therefore,
the data falling outside the new array but inside the boundary chunks
would be restored by a subsequent resize operation that grows the array size.
"""
return type(self)(
sync(
Expand Down
Loading

0 comments on commit df92bad

Please sign in to comment.