Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added info for Group and Array #2400

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions src/zarr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
zeros,
zeros_like,
)
from zarr.core._info import GroupInfo
from zarr.core.array import Array, AsyncArray
from zarr.core.config import config
from zarr.core.group import AsyncGroup, Group
Expand All @@ -38,6 +39,7 @@
"AsyncArray",
"AsyncGroup",
"Group",
"GroupInfo",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain why we want to put this in the user API?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think any object that a user can get through a public API (like zarr.group().info) should be part of the API somewhere, if only to enable things like static typing without using private imports.

I don't have a preference for whether it's in the top-level API like I've added here, or some submodule.

"__version__",
"array",
"config",
Expand Down
143 changes: 143 additions & 0 deletions src/zarr/core/_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import dataclasses
import textwrap
from typing import Literal


@dataclasses.dataclass(kw_only=True)
class GroupInfo:
"""
Information about a group.

Attributes
----------
name : str
The path of the group within the Store
type : "Group"
zarr_format : {2, 3}
The zarr format of the Group.
read_only : bool
Whether the Group's access mode is read only.
store_type : str
The name of the Store class containing this group.
count_members : int, optional
The number of child members below this group. This
will be set when the Group has consolidated metadata
or when using :class:`Group.info_complete`.
count_arrays : int, optional
The number of child arrays below this group. This
will be set when the Group has consolidated metadata
or when using :class:`Group.info_complete`.
count_groups : int, optional
The number of child groups below this group. This
will be set when the Group has consolidated metadata
or when using :class:`Group.info_complete`.
"""

name: str
type: Literal["Group"] = "Group"
zarr_format: Literal[2, 3]
read_only: bool
store_type: str
count_members: int | None = None
count_arrays: int | None = None
count_groups: int | None = None

def __repr__(self) -> str:
template = textwrap.dedent("""\
Name : {name}
Type : {type}
Zarr format : {zarr_format}
Read-only : {read_only}
Store type : {store_type}""")

if self.count_members is not None:
template += "\nNo. members : {count_members}"
if self.count_arrays is not None:
template += "\nNo. arrays : {count_arrays}"
if self.count_groups is not None:
template += "\nNo. groups : {count_groups}"
return template.format(**dataclasses.asdict(self))


def human_readable_size(size: int) -> str:
    """
    Format a byte count using binary (power-of-1024) unit suffixes.

    Values below 1024 are returned as the plain number. Larger values are
    scaled to K, M, G, T or P and shown with one decimal place; anything
    at or above 2**50 is expressed in P.
    """
    if size < 2**10:
        return f"{size}"
    # Each entry pairs the exclusive upper bound (as a power of two) with
    # the suffix used below that bound; the divisor is one unit smaller.
    for exponent, suffix in ((20, "K"), (30, "M"), (40, "G"), (50, "T")):
        if size < 2**exponent:
            return f"{size / float(2 ** (exponent - 10)):.1f}{suffix}"
    return f"{size / float(2**50):.1f}P"


def byte_info(size: int) -> str:
    """
    Describe a byte count as text.

    Counts below 1024 are returned as the bare number; larger counts are
    returned as the exact number followed by its ``human_readable_size``
    form in parentheses.
    """
    exact = str(size)
    return exact if size < 2**10 else f"{size} ({human_readable_size(size)})"


@dataclasses.dataclass(kw_only=True)
class ArrayInfo:
type: Literal["Array"] = "Array"
zarr_format: Literal[2, 3]
data_type: str
shape: tuple[int, ...]
chunk_shape: tuple[int, ...] | None = None
order: Literal["C", "F"]
read_only: bool
store_type: str
compressor: str | None = None
filters: list[str] | None = None
codecs: str | None = None
count_bytes: int | None = None
count_bytes_stored: int | None = None
count_chunks_initialized: int | None = None

def __repr__(self) -> str:
template = textwrap.dedent("""\
Type : {type}
Zarr format : {zarr_format}
Data type : {data_type}
Shape : {shape}
Chunk shape : {chunk_shape}
Order : {order}
Read-only : {read_only}
Store type : {store_type}""")

kwargs = dataclasses.asdict(self)
if self.chunk_shape is None:
# for non-regular chunk grids
kwargs["chunk_shape"] = "<variable>"
if self.compressor is not None:
template += "\nCompressor : {compressor}"

if self.filters is not None:
template += "\nFilters : {filters}"

if self.codecs is not None:
template += "\nCodecs : {codecs}"

if self.count_bytes is not None:
template += "\nNo. bytes : {count_bytes}"
kwargs["count_bytes"] = byte_info(self.count_bytes)

if self.count_bytes_stored is not None:
template += "\nNo. bytes stored : {count_bytes_stored}"
kwargs["count_stored"] = byte_info(self.count_bytes_stored)

if (
self.count_bytes is not None
and self.count_bytes_stored is not None
and self.count_bytes_stored > 0
):
template += "\nStorage ratio : {storage_ratio}"
kwargs["storage_ratio"] = f"{self.count_bytes / self.count_bytes_stored:.1f}"

if self.count_chunks_initialized is not None:
template += "\nChunks Initialized : {count_chunks_initialized}"
return template.format(**kwargs)
82 changes: 77 additions & 5 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from zarr.abc.store import Store, set_or_delete
from zarr.codecs import _get_default_array_bytes_codec
from zarr.codecs._v2 import V2Compressor, V2Filters
from zarr.core._info import ArrayInfo
from zarr.core.attributes import Attributes
from zarr.core.buffer import (
BufferPrototype,
Expand Down Expand Up @@ -1199,9 +1200,65 @@ async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self:
def __repr__(self) -> str:
return f"<AsyncArray {self.store_path} shape={self.shape} dtype={self.dtype}>"

@property
def info(self) -> ArrayInfo:
    """
    Return the statically known information for an array.

    Returns
    -------
    ArrayInfo

    See Also
    --------
    AsyncArray.info_complete
        All information about an array, including dynamic information
        like the number of bytes and chunks written.
    """
    return self._info()

async def info_complete(self) -> ArrayInfo:
    """
    Return the array information plus the number of initialized chunks.

    Returns
    -------
    ArrayInfo
        Built by ``self._info`` with ``count_chunks_initialized`` passed
        through ``extra``.

    See Also
    --------
    AsyncArray.info
        The statically known subset of this information.
    """
    # NOTE(review): the author suggested leaving this method out of the
    # public API until it is fully implemented, since it is new.
    # TODO: get the size of the object from the store.
    extra = {
        # NOTE(review): the original author asked whether this should be
        # async; confirm whether ``nchunks_initialized`` is a plain value
        # or needs to be awaited.
        "count_chunks_initialized": self.nchunks_initialized,
        # count_bytes_stored isn't yet implemented.
    }
    return self._info(extra=extra)

def _info(self, extra: dict[str, int] | None = None) -> ArrayInfo:
    """
    Build the ``ArrayInfo`` describing this array.

    Parameters
    ----------
    extra : dict[str, int], optional
        Additional ``ArrayInfo`` keyword arguments (for example
        ``count_chunks_initialized``) that are forwarded unchanged.

    Returns
    -------
    ArrayInfo

    Raises
    ------
    NotImplementedError
        If the array is not zarr format 2 and its chunk grid is not a
        ``RegularChunkGrid``.
    """
    kwargs: dict[str, Any] = {}
    if self.metadata.zarr_format == 2:
        assert isinstance(self.metadata, ArrayV2Metadata)
        if self.metadata.compressor is not None:
            kwargs["compressor"] = str(self.metadata.compressor)
        if self.metadata.filters is not None:
            kwargs["filters"] = str(self.metadata.filters)
        kwargs["data_type"] = str(self.metadata.dtype)
        kwargs["chunk_shape"] = self.metadata.chunks
    else:
        kwargs["codecs"] = str(self.metadata.codecs)
        kwargs["data_type"] = str(self.metadata.data_type)
        # NOTE(review): other chunking types still need a decision. Until
        # other chunk grid types exist, the reviewer's preference was to
        # raise here rather than leave the field null.
        chunk_grid = self.metadata.chunk_grid
        if isinstance(chunk_grid, RegularChunkGrid):
            kwargs["chunk_shape"] = chunk_grid.chunk_shape
        else:
            # f-string so the offending chunk grid type is actually shown.
            raise NotImplementedError(
                f"'info' is not yet implemented for chunk grids of type {type(chunk_grid)}"
            )

    if extra is not None:
        # Forward the caller's dynamic values; without this they would be
        # accepted and then silently discarded.
        kwargs.update(extra)

    return ArrayInfo(
        zarr_format=self.metadata.zarr_format,
        shape=self.shape,
        order=self.order,
        read_only=self.store_path.store.mode.readonly,
        store_type=type(self.store_path.store).__name__,
        count_bytes=self.dtype.itemsize * self.size,
        **kwargs,
    )


# TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed
@dataclass(frozen=False)
Expand Down Expand Up @@ -2900,10 +2957,25 @@ def update_attributes(self, new_attributes: dict[str, JSON]) -> Array:
def __repr__(self) -> str:
return f"<Array {self.store_path} shape={self.shape} dtype={self.dtype}>"

@property
def info(self) -> ArrayInfo:
    """
    Return the statically known information for an array.

    Returns
    -------
    ArrayInfo

    See Also
    --------
    Array.info_complete
        All information about an array, including dynamic information
        like the number of bytes and chunks written.
    """
    # ``AsyncArray.info`` is a property, so it is read rather than called.
    return self._async_array.info
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please add docstrings here and in info_complete


def info_complete(self) -> ArrayInfo:
    """
    Return the result of ``info_complete`` on the wrapped async array.

    The object produced by ``self._async_array.info_complete()`` is
    handed to ``sync`` and whatever ``sync`` returns is returned here.

    Returns
    -------
    ArrayInfo

    See Also
    --------
    Array.info
    """
    pending = self._async_array.info_complete()
    return sync(pending)


def nchunks_initialized(
Expand Down
100 changes: 96 additions & 4 deletions src/zarr/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from zarr._compat import _deprecate_positional_args
from zarr.abc.metadata import Metadata
from zarr.abc.store import Store, set_or_delete
from zarr.core._info import GroupInfo
from zarr.core.array import Array, AsyncArray, _build_parents
from zarr.core.attributes import Attributes
from zarr.core.buffer import default_buffer_prototype
Expand Down Expand Up @@ -793,8 +794,69 @@ def attrs(self) -> dict[str, Any]:
return self.metadata.attributes

@property
def info(self) -> GroupInfo:
    """
    Return the statically known information for a group.

    When the group's metadata carries consolidated metadata, the member
    list is taken from it, so no Store access is made here to collect
    members. Otherwise no member list is passed on.

    Returns
    -------
    GroupInfo

    See Also
    --------
    AsyncGroup.info_complete
        All information about a group, including dynamic information
        like the children members.
    """
    consolidated = self.metadata.consolidated_metadata
    members = list(consolidated.flattened_metadata.values()) if consolidated else None
    return self._info(members=members)

async def info_complete(self) -> GroupInfo:
    """
    Return information for a group, including its member counts.

    Every entry yielded by ``self.members(max_depth=None)`` is visited
    and its metadata collected. If this group doesn't contain
    consolidated metadata, this will need to read from the backing Store.

    Returns
    -------
    GroupInfo

    See Also
    --------
    AsyncGroup.info
    """
    collected = []
    async for entry in self.members(max_depth=None):
        # The second item of each entry is the node whose metadata is counted.
        collected.append(entry[1].metadata)
    return self._info(members=collected)

def _info(
    self, members: list[ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] | None = None
) -> GroupInfo:
    """
    Build the ``GroupInfo`` for this group.

    Parameters
    ----------
    members : list of metadata objects, optional
        Metadata of the members below this group. When given, the total,
        array and group counts are filled in; when ``None`` they are
        left unset.

    Returns
    -------
    GroupInfo
    """
    counts = {}
    if members is not None:
        n_groups = sum(1 for node in members if isinstance(node, GroupMetadata))
        counts = {
            "count_members": len(members),
            # Anything that is not group metadata is tallied as an array.
            "count_arrays": len(members) - n_groups,
            "count_groups": n_groups,
        }

    store = self.store_path.store
    return GroupInfo(
        name=self.store_path.path,
        read_only=store.mode.readonly,
        store_type=type(store).__name__,
        zarr_format=self.metadata.zarr_format,
        # maybe do a typeddict
        **counts,  # type: ignore[arg-type]
    )

@property
def store(self) -> Store:
Expand Down Expand Up @@ -1439,8 +1501,38 @@ def attrs(self) -> Attributes:
return Attributes(self)

@property
def info(self) -> GroupInfo:
    """
    Return the statically known information for a group.

    Returns
    -------
    GroupInfo

    See Also
    --------
    Group.info_complete
        All information about a group, including dynamic information
        like the children members.
    """
    # ``AsyncGroup.info`` is a property, so it is read rather than called.
    return self._async_group.info

def info_complete(self) -> GroupInfo:
    """
    Return information for a group, including its member counts.

    The work is delegated to ``info_complete`` on the wrapped async
    group and passed through ``self._sync``. If this group doesn't
    contain consolidated metadata, this will need to read from the
    backing Store.

    Returns
    -------
    GroupInfo

    See Also
    --------
    Group.info
    """
    pending = self._async_group.info_complete()
    return self._sync(pending)

@property
def store(self) -> Store:
Expand Down
Loading