diff --git a/RELEASE.md b/RELEASE.md index df97109faf..3bcf6773d8 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -4,6 +4,7 @@ ## Bug fixes and other changes * Addressed arbitrary file write via archive extraction security vulnerability in micropackaging. +* Added the `_EPHEMERAL` attribute to `AbstractDataset` and other Dataset classes that inherit from it. ## Breaking changes to the API diff --git a/docs/source/data/how_to_create_a_custom_dataset.md b/docs/source/data/how_to_create_a_custom_dataset.md index a596de7ae7..73a074e49e 100644 --- a/docs/source/data/how_to_create_a_custom_dataset.md +++ b/docs/source/data/how_to_create_a_custom_dataset.md @@ -38,6 +38,8 @@ At the minimum, a valid Kedro dataset needs to subclass the base [AbstractDatase `AbstractDataset` is generically typed with an input data type for saving data, and an output data type for loading data. This typing is optional however, and defaults to `Any` type. +The `_EPHEMERAL` boolean attribute in `AbstractDataset` indicates whether a dataset is ephemeral, that is, not persistent. For example, [MemoryDataset](/api/kedro.io.MemoryDataset), which is not persistent, sets it to `True`. By default, `_EPHEMERAL` is set to `False`, meaning the dataset is treated as persistent. + Here is an example skeleton for `ImageDataset`:
diff --git a/kedro/io/cached_dataset.py b/kedro/io/cached_dataset.py index afec2e1134..a284f2aed0 100644 --- a/kedro/io/cached_dataset.py +++ b/kedro/io/cached_dataset.py @@ -60,6 +60,8 @@ def __init__( ValueError: If the provided dataset is not a valid dict/YAML representation of a dataset or an actual dataset. """ + self._EPHEMERAL = True + if isinstance(dataset, dict): self._dataset = self._from_config(dataset, version) elif isinstance(dataset, AbstractDataset): diff --git a/kedro/io/core.py b/kedro/io/core.py index 10ee60aeb4..0a3dbc93a2 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -113,6 +113,13 @@ class AbstractDataset(abc.ABC, Generic[_DI, _DO]): # param2 will be True by default """ + """ + Datasets are persistent by default. User-defined datasets that + are not made to be persistent, such as instances of `MemoryDataset`, + need to change the `_EPHEMERAL` attribute to 'True'. + """ + _EPHEMERAL = False + @classmethod def from_config( cls: type, diff --git a/kedro/io/lambda_dataset.py b/kedro/io/lambda_dataset.py index 6b901d60b8..4ed3a7a248 100644 --- a/kedro/io/lambda_dataset.py +++ b/kedro/io/lambda_dataset.py @@ -99,7 +99,6 @@ def __init__( # noqa: PLR0913 DatasetError: If a method is specified, but is not a Callable. """ - for name, value in [ ("load", load), ("save", save), diff --git a/kedro/io/memory_dataset.py b/kedro/io/memory_dataset.py index bce8315966..b3b08e1fae 100644 --- a/kedro/io/memory_dataset.py +++ b/kedro/io/memory_dataset.py @@ -12,7 +12,8 @@ class MemoryDataset(AbstractDataset): """``MemoryDataset`` loads and saves data from/to an in-memory - Python object. + Python object. The `_EPHEMERAL` attribute is set to True to + indicate MemoryDataset's non-persistence. 
Example: :: @@ -54,6 +55,7 @@ def __init__( self._data = _EMPTY self._copy_mode = copy_mode self.metadata = metadata + self._EPHEMERAL = True if data is not _EMPTY: self._save(data) diff --git a/kedro/io/shared_memory_dataset.py b/kedro/io/shared_memory_dataset.py index 8aff822db9..db10722826 100644 --- a/kedro/io/shared_memory_dataset.py +++ b/kedro/io/shared_memory_dataset.py @@ -18,6 +18,8 @@ def __init__(self, manager: SyncManager | None = None): manager: An instance of multiprocessing manager for shared objects. """ + self._EPHEMERAL = True + if manager: self.shared_memory_dataset = manager.MemoryDataset() # type: ignore[attr-defined] else: diff --git a/tests/io/test_cached_dataset.py b/tests/io/test_cached_dataset.py index fa93dfaa2f..f49820f11d 100644 --- a/tests/io/test_cached_dataset.py +++ b/tests/io/test_cached_dataset.py @@ -45,6 +45,9 @@ def test_load_empty(self, cached_ds): with pytest.raises(DatasetError, match=r"has not been saved yet"): _ = cached_ds.load() + def test_ephemeral_attribute(self, cached_ds): + assert cached_ds._EPHEMERAL is True + def test_save_load(self, cached_ds): cached_ds.save(42) assert cached_ds.load() == 42 diff --git a/tests/io/test_lambda_dataset.py b/tests/io/test_lambda_dataset.py index 22392355cb..cc2bd07cc6 100644 --- a/tests/io/test_lambda_dataset.py +++ b/tests/io/test_lambda_dataset.py @@ -52,6 +52,10 @@ def _dummy_release(): assert actual == expected +def test_ephemeral_attribute(mocked_dataset): + assert mocked_dataset._EPHEMERAL is False + + class TestLambdaDatasetLoad: def test_load_invocation(self, mocker): """Test the basic `load` method invocation""" diff --git a/tests/io/test_memory_dataset.py b/tests/io/test_memory_dataset.py index 606d7eee30..e2b0fd2d83 100644 --- a/tests/io/test_memory_dataset.py +++ b/tests/io/test_memory_dataset.py @@ -54,6 +54,9 @@ def test_load_none(self): loaded_data = MemoryDataset(None).load() assert loaded_data is None + def test_ephemeral_attribute(self, memory_dataset): + assert 
memory_dataset._EPHEMERAL is True + def test_load_infer_mode( self, memory_dataset, input_data, mocked_infer_mode, mocked_copy_with_mode ): diff --git a/tests/io/test_shared_memory_dataset.py b/tests/io/test_shared_memory_dataset.py index 8ea44d4dc6..d135b3aadd 100644 --- a/tests/io/test_shared_memory_dataset.py +++ b/tests/io/test_shared_memory_dataset.py @@ -13,6 +13,9 @@ def shared_memory_dataset(): class TestSharedMemoryDataset: + def test_ephemeral_attribute(self, shared_memory_dataset): + assert shared_memory_dataset._EPHEMERAL is True + def test_save_and_load(self, shared_memory_dataset, input_data): """Test basic load""" shared_memory_dataset.save(input_data)