Skip to content

Commit

Permalink
Add/1034 (#1352) dataset lazy loading default
Browse files Browse the repository at this point in the history
* Towards lazy-by-default for dataset loading

* Isolate lazy behavior to pytest function outside of class

* Solve concurrency issue where tests would use the same cache

* Ensure metadata is downloaded to verify dataset is processed

* Clean up to reflect new defaults and tests

* Fix oversight from 1335

* Download data, as was the 0.14 behavior

* Restore test

* Formatting

* Test obsolete, replaced by test_get_dataset_lazy_behavior
  • Loading branch information
PGijsbers authored Sep 22, 2024
1 parent 1d707e6 commit 07e9b9c
Show file tree
Hide file tree
Showing 5 changed files with 282 additions and 201 deletions.
40 changes: 9 additions & 31 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,8 @@ def _name_to_id(

def get_datasets(
dataset_ids: list[str | int],
download_data: bool = True, # noqa: FBT001, FBT002
download_qualities: bool = True, # noqa: FBT001, FBT002
download_data: bool = False, # noqa: FBT001, FBT002
download_qualities: bool = False, # noqa: FBT001, FBT002
) -> list[OpenMLDataset]:
"""Download datasets.
Expand Down Expand Up @@ -450,14 +450,14 @@ def get_datasets(


@openml.utils.thread_safe_if_oslo_installed
def get_dataset( # noqa: C901, PLR0912, PLR0915
def get_dataset( # noqa: C901, PLR0912
dataset_id: int | str,
download_data: bool | None = None, # Optional for deprecation warning; later again only bool
download_data: bool = False, # noqa: FBT002, FBT001
version: int | None = None,
error_if_multiple: bool = False, # noqa: FBT002, FBT001
cache_format: Literal["pickle", "feather"] = "pickle",
download_qualities: bool | None = None, # Same as above
download_features_meta_data: bool | None = None, # Same as above
download_qualities: bool = False, # noqa: FBT002, FBT001
download_features_meta_data: bool = False, # noqa: FBT002, FBT001
download_all_files: bool = False, # noqa: FBT002, FBT001
force_refresh_cache: bool = False, # noqa: FBT001, FBT002
) -> OpenMLDataset:
Expand Down Expand Up @@ -485,7 +485,7 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915
----------
dataset_id : int or str
Dataset ID of the dataset to download
download_data : bool (default=True)
download_data : bool (default=False)
If True, also download the data file. Beware that some datasets are large and it might
make the operation noticeably slower. Metadata is also still retrieved.
If False, create the OpenMLDataset and only populate it with the metadata.
Expand All @@ -499,12 +499,12 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915
Format for caching the dataset - may be feather or pickle
Note that the default 'pickle' option may load more slowly than feather when
the number of rows is very high.
download_qualities : bool (default=True)
download_qualities : bool (default=False)
Option to download 'qualities' meta-data in addition to the minimal dataset description.
If True, download and cache the qualities file.
If False, create the OpenMLDataset without qualities metadata. The data may later be added
to the OpenMLDataset through the `OpenMLDataset.load_metadata(qualities=True)` method.
download_features_meta_data : bool (default=True)
download_features_meta_data : bool (default=False)
Option to download 'features' meta-data in addition to the minimal dataset description.
If True, download and cache the features file.
If False, create the OpenMLDataset without features metadata. The data may later be added
Expand All @@ -523,28 +523,6 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915
dataset : :class:`openml.OpenMLDataset`
The downloaded dataset.
"""
# TODO(0.15): Remove the deprecation warning and make the default False; adjust types above
# and documentation. Also remove None-to-True-cases below
if any(
download_flag is None
for download_flag in [download_data, download_qualities, download_features_meta_data]
):
warnings.warn(
"Starting from Version 0.15 `download_data`, `download_qualities`, and `download_featu"
"res_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy "
"loading. To disable this message until version 0.15 explicitly set `download_data`, "
"`download_qualities`, and `download_features_meta_data` to a bool while calling "
"`get_dataset`.",
FutureWarning,
stacklevel=2,
)

download_data = True if download_data is None else download_data
download_qualities = True if download_qualities is None else download_qualities
download_features_meta_data = (
True if download_features_meta_data is None else download_features_meta_data
)

if download_all_files:
warnings.warn(
"``download_all_files`` is experimental and is likely to break with new releases.",
Expand Down
4 changes: 2 additions & 2 deletions openml/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class TestBase(unittest.TestCase):
logger = logging.getLogger("unit_tests_published_entities")
logger.setLevel(logging.DEBUG)

def setUp(self, n_levels: int = 1) -> None:
def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
"""Setup variables and temporary directories.
In particular, this method:
Expand Down Expand Up @@ -92,7 +92,7 @@ def setUp(self, n_levels: int = 1) -> None:
self.static_cache_dir = static_cache_dir
self.cwd = Path.cwd()
workdir = Path(__file__).parent.absolute()
tmp_dir_name = self.id()
tmp_dir_name = self.id() + tmpdir_suffix
self.workdir = workdir / tmp_dir_name
shutil.rmtree(self.workdir, ignore_errors=True)

Expand Down
Loading

0 comments on commit 07e9b9c

Please sign in to comment.