Skip to content

Commit

Permalink
Add attribute to flag persistence in Dataset classes (#3520)
Browse files Browse the repository at this point in the history
* Add _EPHEMERAL attribute to AbstractDataset

Signed-off-by: lrcouto <[email protected]>

* Add test verification for _EPHEMERAL

Signed-off-by: lrcouto <[email protected]>

* Remove unnecessary test

Signed-off-by: lrcouto <[email protected]>

* Separate test on LambdaDataset

Signed-off-by: lrcouto <[email protected]>

* Separate test on LambdaDataset

Signed-off-by: lrcouto <[email protected]>

* Update kedro/io/core.py

Co-authored-by: Merel Theisen <[email protected]>
Signed-off-by: L. R. Couto <[email protected]>

* Minor corrections

Signed-off-by: lrcouto <[email protected]>

* Add info about _EPHEMERAL to docs and release notes

Signed-off-by: lrcouto <[email protected]>

* Update docs/source/data/how_to_create_a_custom_dataset.md

Co-authored-by: Jo Stichbury <[email protected]>
Signed-off-by: L. R. Couto <[email protected]>

* Minor clarification on doc page

Signed-off-by: lrcouto <[email protected]>

* Move release note to 0.19.3

Signed-off-by: lrcouto <[email protected]>

---------

Signed-off-by: lrcouto <[email protected]>
Signed-off-by: L. R. Couto <[email protected]>
Co-authored-by: Merel Theisen <[email protected]>
Co-authored-by: Jo Stichbury <[email protected]>
  • Loading branch information
3 people authored Jan 24, 2024
1 parent 7384abd commit 7b0f701
Show file tree
Hide file tree
Showing 11 changed files with 30 additions and 2 deletions.
1 change: 1 addition & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## Major features and improvements

## Bug fixes and other changes
* Added the `_EPHEMERAL` attribute to `AbstractDataset` and other Dataset classes that inherit from it.

## Breaking changes to the API

Expand Down
2 changes: 2 additions & 0 deletions docs/source/data/how_to_create_a_custom_dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ At the minimum, a valid Kedro dataset needs to subclass the base [AbstractDatase
`AbstractDataset` is generically typed with an input data type for saving data, and an output data type for loading data.
This typing is optional however, and defaults to `Any` type.

The `_EPHEMERAL` boolean attribute in `AbstractDataset` indicates whether a dataset is ephemeral, that is, not persistent. For example, [MemoryDataset](/kedro.io.MemoryDataset) is not persistent, so it sets `_EPHEMERAL` to `True`. By default, `_EPHEMERAL` is set to `False`, which marks a dataset as persistent.

Here is an example skeleton for `ImageDataset`:

<details>
Expand Down
2 changes: 2 additions & 0 deletions kedro/io/cached_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def __init__(
ValueError: If the provided dataset is not a valid dict/YAML
representation of a dataset or an actual dataset.
"""
self._EPHEMERAL = True

if isinstance(dataset, dict):
self._dataset = self._from_config(dataset, version)
elif isinstance(dataset, AbstractDataset):
Expand Down
7 changes: 7 additions & 0 deletions kedro/io/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ class AbstractDataset(abc.ABC, Generic[_DI, _DO]):
# param2 will be True by default
"""

"""
Datasets are persistent by default. User-defined datasets that
are not made to be persistent, such as instances of `MemoryDataset`,
need to change the `_EPHEMERAL` attribute to 'True'.
"""
_EPHEMERAL = False

@classmethod
def from_config(
cls: type,
Expand Down
1 change: 0 additions & 1 deletion kedro/io/lambda_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ def __init__( # noqa: PLR0913
DatasetError: If a method is specified, but is not a Callable.
"""

for name, value in [
("load", load),
("save", save),
Expand Down
4 changes: 3 additions & 1 deletion kedro/io/memory_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@

class MemoryDataset(AbstractDataset):
"""``MemoryDataset`` loads and saves data from/to an in-memory
Python object.
Python object. The `_EPHEMERAL` attribute is set to True to
indicate MemoryDataset's non-persistence.
Example:
::
Expand Down Expand Up @@ -54,6 +55,7 @@ def __init__(
self._data = _EMPTY
self._copy_mode = copy_mode
self.metadata = metadata
self._EPHEMERAL = True
if data is not _EMPTY:
self._save(data)

Expand Down
2 changes: 2 additions & 0 deletions kedro/io/shared_memory_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def __init__(self, manager: SyncManager | None = None):
manager: An instance of multiprocessing manager for shared objects.
"""
self._EPHEMERAL = True

if manager:
self.shared_memory_dataset = manager.MemoryDataset() # type: ignore[attr-defined]
else:
Expand Down
3 changes: 3 additions & 0 deletions tests/io/test_cached_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ def test_load_empty(self, cached_ds):
with pytest.raises(DatasetError, match=r"has not been saved yet"):
_ = cached_ds.load()

def test_ephemeral_attribute(self, cached_ds):
assert cached_ds._EPHEMERAL is True

def test_save_load(self, cached_ds):
cached_ds.save(42)
assert cached_ds.load() == 42
Expand Down
4 changes: 4 additions & 0 deletions tests/io/test_lambda_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ def _dummy_release():
assert actual == expected


def test_ephemeral_attribute(mocked_dataset):
assert mocked_dataset._EPHEMERAL is False


class TestLambdaDatasetLoad:
def test_load_invocation(self, mocker):
"""Test the basic `load` method invocation"""
Expand Down
3 changes: 3 additions & 0 deletions tests/io/test_memory_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ def test_load_none(self):
loaded_data = MemoryDataset(None).load()
assert loaded_data is None

def test_ephemeral_attribute(self, memory_dataset):
assert memory_dataset._EPHEMERAL is True

def test_load_infer_mode(
self, memory_dataset, input_data, mocked_infer_mode, mocked_copy_with_mode
):
Expand Down
3 changes: 3 additions & 0 deletions tests/io/test_shared_memory_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ def shared_memory_dataset():


class TestSharedMemoryDataset:
def test_ephemeral_attribute(self, shared_memory_dataset):
assert shared_memory_dataset._EPHEMERAL is True

def test_save_and_load(self, shared_memory_dataset, input_data):
"""Test basic load"""
shared_memory_dataset.save(input_data)
Expand Down

0 comments on commit 7b0f701

Please sign in to comment.