diff --git a/RELEASE.md b/RELEASE.md
index 3261588f5..5be798e33 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -21,6 +21,7 @@ Please follow the established format:
 - Display full dataset type with library prefix in metadata panel (#2136)
 - Enable SQLite WAL mode for Azure ML to fix database locking issues (#2131)
 - Replace `flake8`, `isort`, `pylint` and `black` by `ruff` (#2149)
+- Refactor `DatasetStatsHook` to avoid showing error when dataset doesn't have file size info (#2174)
 
 # Release 10.0.0
 
diff --git a/package/kedro_viz/integrations/kedro/hooks.py b/package/kedro_viz/integrations/kedro/hooks.py
index 3089e61f5..88bc5be17 100644
--- a/package/kedro_viz/integrations/kedro/hooks.py
+++ b/package/kedro_viz/integrations/kedro/hooks.py
@@ -7,6 +7,7 @@ from pathlib import Path, PurePosixPath
 from typing import Any, Union
 
+import fsspec
 from kedro.framework.hooks import hook_impl
 from kedro.io import DataCatalog
 from kedro.io.core import get_filepath_str
 
@@ -141,19 +142,26 @@ def get_file_size(self, dataset: Any) -> Union[int, None]:
         Args:
             dataset: A dataset instance for which we need the file size
 
-        Returns: file size for the dataset if file_path is valid, if not returns None
+        Returns:
+            File size for the dataset if available, otherwise None.
         """
-
-        if not (hasattr(dataset, "_filepath") and dataset._filepath):
-            return None
-
         try:
-            file_path = get_filepath_str(
-                PurePosixPath(dataset._filepath), dataset._protocol
-            )
-            return dataset._fs.size(file_path)
+            if hasattr(dataset, "filepath") and dataset.filepath:
+                filepath = dataset.filepath
+            # Fallback to private '_filepath' for known datasets
+            elif hasattr(dataset, "_filepath") and dataset._filepath:
+                filepath = dataset._filepath
+            else:
+                return None
+
+            fs, path_in_fs = fsspec.core.url_to_fs(filepath)
+            if fs.exists(path_in_fs):
+                file_size = fs.size(path_in_fs)
+                return file_size
+            else:
+                return None
-        except Exception as exc:
+        except Exception as exc:  # pragma: no cover
             logger.warning(
                 "Unable to get file size for the dataset %s: %s", dataset, exc
             )
diff --git a/package/tests/test_integrations/test_hooks.py b/package/tests/test_integrations/test_hooks.py
index 2f6d7dd13..600c594d1 100644
--- a/package/tests/test_integrations/test_hooks.py
+++ b/package/tests/test_integrations/test_hooks.py
@@ -137,3 +137,44 @@ def test_get_file_size(dataset, example_dataset_stats_hook_obj, example_csv_data
     assert example_dataset_stats_hook_obj.get_file_size(
         example_csv_dataset
     ) == example_csv_dataset._fs.size(file_path)
+
+
+def test_get_file_size_file_does_not_exist(example_dataset_stats_hook_obj, mocker):
+    class MockDataset:
+        def __init__(self):
+            self._filepath = "/non/existent/path.csv"
+
+    mock_dataset = MockDataset()
+    mock_fs = mocker.Mock()
+    mock_fs.exists.return_value = False
+
+    mocker.patch(
+        "fsspec.core.url_to_fs",
+        return_value=(mock_fs, "/non/existent/path.csv"),
+    )
+
+    # Call get_file_size and expect it to return None
+    file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset)
+    assert file_size is None
+
+
+def test_get_file_size_public_filepath(example_dataset_stats_hook_obj, mocker):
+    class MockDataset:
+        def __init__(self):
+            self.filepath = "/path/to/existing/file.csv"
+
+    mock_dataset = MockDataset()
+
+    # Mock fs.exists to return True
+    mock_fs = mocker.Mock()
+    mock_fs.exists.return_value = True
+    mock_fs.size.return_value = 456
+
+    mocker.patch(
+        "fsspec.core.url_to_fs",
+        return_value=(mock_fs, "/path/to/existing/file.csv"),
+    )
+
+    # Call get_file_size and expect it to return the mocked file size
+    file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset)
+    assert file_size == 456