Skip to content

Commit

Permalink
Fix Unable to get file size when dataset has no protocol attribute (#…
Browse files Browse the repository at this point in the history
…2174)

* add protocol check

Signed-off-by: Sajid Alam <[email protected]>

* add test

Signed-off-by: Sajid Alam <[email protected]>

* lint

Signed-off-by: Sajid Alam <[email protected]>

* Update hooks.py

Signed-off-by: Sajid Alam <[email protected]>

* add fallback to private attritbute for known datasets

Signed-off-by: Sajid Alam <[email protected]>

* coverage

Signed-off-by: Sajid Alam <[email protected]>

* Update RELEASE.md

Signed-off-by: Sajid Alam <[email protected]>

* Update hooks.py

Signed-off-by: Sajid Alam <[email protected]>

* coverage

Signed-off-by: Sajid Alam <[email protected]>

* Update RELEASE.md

Signed-off-by: Sajid Alam <[email protected]>

---------

Signed-off-by: Sajid Alam <[email protected]>
  • Loading branch information
SajidAlamQB authored Nov 8, 2024
1 parent 02331cc commit 2f46dc1
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 10 deletions.
1 change: 1 addition & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Please follow the established format:
- Display full dataset type with library prefix in metadata panel (#2136)
- Enable SQLite WAL mode for Azure ML to fix database locking issues (#2131)
- Replace `flake8`, `isort`, `pylint` and `black` by `ruff` (#2149)
- Refactor `DatasetStatsHook` to avoid showing error when dataset doesn't have file size info (#2174)


# Release 10.0.0
Expand Down
28 changes: 18 additions & 10 deletions package/kedro_viz/integrations/kedro/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pathlib import Path, PurePosixPath
from typing import Any, Union

import fsspec
from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.io.core import get_filepath_str
Expand Down Expand Up @@ -141,19 +142,26 @@ def get_file_size(self, dataset: Any) -> Union[int, None]:
Args:
dataset: A dataset instance for which we need the file size
Returns: file size for the dataset if file_path is valid, if not returns None
Returns:
File size for the dataset if available, otherwise None.
"""

if not (hasattr(dataset, "_filepath") and dataset._filepath):
return None

try:
file_path = get_filepath_str(
PurePosixPath(dataset._filepath), dataset._protocol
)
return dataset._fs.size(file_path)
if hasattr(dataset, "filepath") and dataset.filepath:
filepath = dataset.filepath
# Fallback to private '_filepath' for known datasets
elif hasattr(dataset, "_filepath") and dataset._filepath:
filepath = dataset._filepath
else:
return None

fs, path_in_fs = fsspec.core.url_to_fs(filepath)
if fs.exists(path_in_fs):
file_size = fs.size(path_in_fs)
return file_size
else:
return None

except Exception as exc:
except Exception as exc: # pragma: no cover
logger.warning(
"Unable to get file size for the dataset %s: %s", dataset, exc
)
Expand Down
41 changes: 41 additions & 0 deletions package/tests/test_integrations/test_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,44 @@ def test_get_file_size(dataset, example_dataset_stats_hook_obj, example_csv_data
assert example_dataset_stats_hook_obj.get_file_size(
example_csv_dataset
) == example_csv_dataset._fs.size(file_path)


def test_get_file_size_file_does_not_exist(example_dataset_stats_hook_obj, mocker):
class MockDataset:
def __init__(self):
self._filepath = "/non/existent/path.csv"

mock_dataset = MockDataset()
mock_fs = mocker.Mock()
mock_fs.exists.return_value = False

mocker.patch(
"fsspec.core.url_to_fs",
return_value=(mock_fs, "/non/existent/path.csv"),
)

# Call get_file_size and expect it to return None
file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset)
assert file_size is None


def test_get_file_size_public_filepath(example_dataset_stats_hook_obj, mocker):
class MockDataset:
def __init__(self):
self.filepath = "/path/to/existing/file.csv"

mock_dataset = MockDataset()

# Mock fs.exists to return True
mock_fs = mocker.Mock()
mock_fs.exists.return_value = True
mock_fs.size.return_value = 456

mocker.patch(
"fsspec.core.url_to_fs",
return_value=(mock_fs, "/path/to/existing/file.csv"),
)

# Call get_file_size and expect it to return the mocked file size
file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset)
assert file_size == 456

0 comments on commit 2f46dc1

Please sign in to comment.