Skip to content

Commit

Permalink
Modify getsize to return total size, not just the top level
Browse files Browse the repository at this point in the history
  • Loading branch information
benjeffery committed Sep 13, 2024
1 parent e1d98cd commit 86d737f
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 49 deletions.
2 changes: 2 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ Enhancements

Maintenance
~~~~~~~~~~~
* ``getsize`` now returns the total size of all nested arrays.
By :user:`Ben Jeffery <benjeffery>` :issue:`253`.

Deprecations
~~~~~~~~~~~~
Expand Down
70 changes: 37 additions & 33 deletions zarr/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from collections import OrderedDict
from collections.abc import MutableMapping
from functools import lru_cache
from os import scandir
from pickle import PicklingError
from threading import Lock, RLock
from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any
Expand Down Expand Up @@ -270,9 +269,15 @@ def _getsize(store: BaseStore, path: Path = None) -> int:
# also include zarr.json?
# members += ['zarr.json']
else:
members = listdir(store, path)
prefix = _path_to_prefix(path)
members = [prefix + k for k in members]
to_visit = [path]
members = []
while to_visit:
print(to_visit)
current_path = to_visit.pop()
current_members = listdir(store, current_path)
prefix = _path_to_prefix(current_path)
members.extend([prefix + k for k in current_members])
to_visit.extend([prefix + k for k in current_members])
for k in members:
try:
v = store[k]
Expand Down Expand Up @@ -976,8 +981,12 @@ def getsize(self, path: Path = None):
elif isinstance(value, self.cls):
# total size for directory
size = 0
for v in value.values():
if not isinstance(v, self.cls):
to_visit = list(value.values())
while to_visit:
v = to_visit.pop()
if isinstance(v, self.cls):
to_visit.extend(v.values())
else:
size += buffer_size(v)
return size

Expand Down Expand Up @@ -1274,9 +1283,13 @@ def getsize(self, path=None):
return os.path.getsize(fs_path)
elif os.path.isdir(fs_path):
size = 0
for child in scandir(fs_path):
if child.is_file():
size += child.stat().st_size
for root, _, files in os.walk(fs_path):
# Include the size of the directory itself, as this can be substantial
# for directories with many files.
size += os.path.getsize(root)
for file in files:
file_path = os.path.join(root, file)
size += os.path.getsize(file_path)
return size
else:
return 0
Expand Down Expand Up @@ -1921,29 +1934,19 @@ def listdir(self, path=None):
def getsize(self, path=None):
path = normalize_storage_path(path)
with self.mutex:
children = self.listdir(path)
if children:
size = 0
for child in children:
if path:
name = path + "/" + child
else:
name = child
try:
info = self.zf.getinfo(name)
except KeyError:
pass
else:
size += info.compress_size
return size
elif path:
to_visit = [path] if path else self.listdir(path)
total_size = 0
while to_visit:
current_path = to_visit.pop()
try:
info = self.zf.getinfo(path)
return info.compress_size
info = self.zf.getinfo(current_path)
total_size += info.compress_size
except KeyError:
return 0
else:
return 0
children = self.listdir(current_path)
for child in children:
full_path = current_path + "/" + child if current_path else child
to_visit.append(full_path)
return total_size

def clear(self):
if self.mode == "r":
Expand Down Expand Up @@ -2527,6 +2530,8 @@ def listdir(self, path: Path = None):
return listing

def getsize(self, path=None) -> int:
print("WYF")
print(self._store, path)
return getsize(self._store, path=path)

def _pop_value(self):
Expand Down Expand Up @@ -2795,10 +2800,9 @@ def getsize(self, path=None):
size = self.cursor.execute(
"""
SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr
WHERE k LIKE (? || "%") AND
0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/")
WHERE k LIKE (? || "%")
""",
(path, path),
(path,),
)
for (s,) in size:
return s
Expand Down
13 changes: 9 additions & 4 deletions zarr/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
import pickle
import shutil
import tempfile
from typing import Any, Literal, Optional, Tuple, Union, Sequence
import unittest
from itertools import zip_longest
Expand Down Expand Up @@ -100,6 +101,7 @@ class TestArray:
write_empty_chunks = True
read_only = False
storage_transformers: Tuple[Any, ...] = ()
group_size = 0

def create_store(self) -> BaseStore:
return KVStore(dict())
Expand Down Expand Up @@ -229,15 +231,15 @@ def test_nbytes_stored(self):
buffer_size(v) for k, v in z.store.items() if k != "zarr.json"
)
else:
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
assert expect_nbytes_stored == z.nbytes_stored
z[:] = 42
if self.version == 3:
expect_nbytes_stored = sum(
buffer_size(v) for k, v in z.store.items() if k != "zarr.json"
)
else:
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
assert expect_nbytes_stored == z.nbytes_stored

# mess with store
Expand Down Expand Up @@ -1677,6 +1679,8 @@ def test_nbytes_stored(self):


class TestArrayWithDirectoryStore(TestArray):
group_size = 4096

def create_store(self):
path = mkdtemp()
atexit.register(shutil.rmtree, path)
Expand All @@ -1686,10 +1690,10 @@ def create_store(self):
def test_nbytes_stored(self):
# dict as store
z = self.create_array(shape=1000, chunks=100)
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
assert expect_nbytes_stored == z.nbytes_stored
z[:] = 42
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
assert expect_nbytes_stored == z.nbytes_stored


Expand Down Expand Up @@ -2028,6 +2032,7 @@ def expected(self):

@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec")
class TestArrayWithN5FSStore(TestArrayWithN5Store):
group_size = 0
def create_store(self):
path = mkdtemp()
atexit.register(shutil.rmtree, path)
Expand Down
15 changes: 3 additions & 12 deletions zarr/tests/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,19 +366,10 @@ def test_hierarchy(self):

# test getsize (optional)
if hasattr(store, "getsize"):
# TODO: proper behavior of getsize?
# v3 returns size of all nested arrays, not just the
# size of the arrays in the current folder.
if self.version == 2:
assert 6 == store.getsize()
else:
assert 15 == store.getsize()
assert 15 == store.getsize()
assert 3 == store.getsize("a")
assert 3 == store.getsize("b")
if self.version == 2:
assert 3 == store.getsize("c")
else:
assert 9 == store.getsize("c")
assert 3 == store.getsize("c")
assert 3 == store.getsize("c/d")
assert 6 == store.getsize("c/e")
assert 3 == store.getsize("c/e/f")
Expand Down Expand Up @@ -2256,7 +2247,7 @@ def test_getsize():
store["foo"] = b"aaa"
store["bar"] = b"bbbb"
store["baz/quux"] = b"ccccc"
assert 7 == getsize(store)
assert 12 == getsize(store)
assert 5 == getsize(store, "baz")

store = KVStore(dict())
Expand Down

0 comments on commit 86d737f

Please sign in to comment.