From 5e0ffe80d039d9261517d96ce87220ce8d48e4f2 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Mon, 21 Oct 2024 07:33:22 -0500
Subject: [PATCH] Added Store.getsize

Closes https://github.com/zarr-developers/zarr-python/issues/2420
---
 src/zarr/abc/store.py      | 32 ++++++++++++++++++++++++++++++++
 src/zarr/storage/local.py  |  5 +++++
 src/zarr/storage/remote.py | 15 ++++++++++++++-
 src/zarr/testing/store.py  | 12 ++++++++++++
 4 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py
index a995a6bf3..e6d0570c7 100644
--- a/src/zarr/abc/store.py
+++ b/src/zarr/abc/store.py
@@ -386,6 +386,38 @@ async def _get_many(
         for req in requests:
             yield (req[0], await self.get(*req))
 
+    async def getsize(self, key: str) -> int:
+        """
+        Return the size, in bytes, of a value in a Store.
+
+        Parameters
+        ----------
+        key : str
+            The key of the value whose size is requested.
+
+        Returns
+        -------
+        nbytes : int
+            The size of the value in bytes.
+
+        Raises
+        ------
+        FileNotFoundError
+            When the given key does not exist in the store.
+        """
+        # Note to implementers: this default implementation is very inefficient since
+        # it requires reading the entire object. Many systems will have ways to get the
+        # size of an object without reading it.
+
+        # Imported lazily so that this abstract module does not depend on zarr.core
+        # at import time (a module-level import here risks a circular import).
+        from zarr.core.buffer.core import default_buffer_prototype
+
+        value = await self.get(key, prototype=default_buffer_prototype())
+        if value is None:
+            raise FileNotFoundError(key)
+        return len(value)
+
 
 @runtime_checkable
 class ByteGetter(Protocol):
diff --git a/src/zarr/storage/local.py b/src/zarr/storage/local.py
index 5c03009a9..fde825b68 100644
--- a/src/zarr/storage/local.py
+++ b/src/zarr/storage/local.py
@@ -242,3 +242,8 @@ async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
                 yield str(key).replace(to_strip, "")
         except (FileNotFoundError, NotADirectoryError):
             pass
+
+    async def getsize(self, key: str) -> int:
+        # Stat the file rather than reading it, unlike the default Store.getsize.
+        # NOTE(review): this stat call is synchronous inside a coroutine.
+        return os.path.getsize(self.root / key)
diff --git a/src/zarr/storage/remote.py b/src/zarr/storage/remote.py
index 0a0ec7f7c..12a8664da 100644
--- a/src/zarr/storage/remote.py
+++ b/src/zarr/storage/remote.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Self
+from typing import TYPE_CHECKING, Any, Self, cast
 
 import fsspec
 
@@ -301,3 +301,16 @@ async def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]:
         find_str = f"{self.path}/{prefix}"
         for onefile in await self.fs._find(find_str, detail=False, maxdepth=None, withdirs=False):
             yield onefile.removeprefix(find_str)
+
+    async def getsize(self, key: str) -> int:
+        path = _dereference_path(self.path, key)
+        info = await self.fs._info(path)
+
+        size = info.get("size")
+
+        if size is None:
+            # Not all filesystems support size. Fall back to reading the entire object
+            return await super().getsize(key)
+        else:
+            # fsspec doesn't have typing. We'll need to assume this is correct.
+            return cast(int, size)
diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py
index b4da75b06..af8b7332e 100644
--- a/src/zarr/testing/store.py
+++ b/src/zarr/testing/store.py
@@ -338,3 +338,15 @@ async def test_set_if_not_exists(self, store: S) -> None:
 
         result = await store.get("k2", default_buffer_prototype())
         assert result == new
+
+    async def test_getsize(self, store: S) -> None:
+        key = "k"
+        data = self.buffer_cls.from_bytes(b"0" * 10)
+        await self.set(store, key, data)
+
+        result = await store.getsize(key)
+        assert result == 10
+
+    async def test_getsize_raises(self, store: S) -> None:
+        with pytest.raises(FileNotFoundError):
+            await store.getsize("not-a-real-key")