From 07e9b9c85d50346c98b3e6a2190adc707ed07814 Mon Sep 17 00:00:00 2001 From: Pieter Gijsbers Date: Sun, 22 Sep 2024 21:14:07 +0200 Subject: [PATCH] Add/1034 (#1352) dataset lazy loading default * Towards lazy-by-default for dataset loading * Isolate lazy behavior to pytest function outside of class * Solve concurrency issue where test would use same cache * Ensure metadata is downloaded to verify dataset is processed * Clean up to reflect new defaults and tests * Fix oversight from 1335 * Download data as was 0.14 behavior * Restore test * Formatting * Test obsolete, replaced by test_get_dataset_lazy_behavior --- openml/datasets/functions.py | 40 +- openml/testing.py | 4 +- tests/test_datasets/test_dataset_functions.py | 433 +++++++++++------- tests/test_openml/test_config.py | 4 +- tests/test_tasks/test_task_functions.py | 2 +- 5 files changed, 282 insertions(+), 201 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 6a9f57abb..410867b01 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -416,8 +416,8 @@ def _name_to_id( def get_datasets( dataset_ids: list[str | int], - download_data: bool = True, # noqa: FBT001, FBT002 - download_qualities: bool = True, # noqa: FBT001, FBT002 + download_data: bool = False, # noqa: FBT001, FBT002 + download_qualities: bool = False, # noqa: FBT001, FBT002 ) -> list[OpenMLDataset]: """Download datasets. 
@@ -450,14 +450,14 @@ def get_datasets( @openml.utils.thread_safe_if_oslo_installed -def get_dataset( # noqa: C901, PLR0912, PLR0915 +def get_dataset( # noqa: C901, PLR0912 dataset_id: int | str, - download_data: bool | None = None, # Optional for deprecation warning; later again only bool + download_data: bool = False, # noqa: FBT002, FBT001 version: int | None = None, error_if_multiple: bool = False, # noqa: FBT002, FBT001 cache_format: Literal["pickle", "feather"] = "pickle", - download_qualities: bool | None = None, # Same as above - download_features_meta_data: bool | None = None, # Same as above + download_qualities: bool = False, # noqa: FBT002, FBT001 + download_features_meta_data: bool = False, # noqa: FBT002, FBT001 download_all_files: bool = False, # noqa: FBT002, FBT001 force_refresh_cache: bool = False, # noqa: FBT001, FBT002 ) -> OpenMLDataset: @@ -485,7 +485,7 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915 ---------- dataset_id : int or str Dataset ID of the dataset to download - download_data : bool (default=True) + download_data : bool (default=False) If True, also download the data file. Beware that some datasets are large and it might make the operation noticeably slower. Metadata is also still retrieved. If False, create the OpenMLDataset and only populate it with the metadata. @@ -499,12 +499,12 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915 Format for caching the dataset - may be feather or pickle Note that the default 'pickle' option may load slower than feather when no.of.rows is very high. - download_qualities : bool (default=True) + download_qualities : bool (default=False) Option to download 'qualities' meta-data in addition to the minimal dataset description. If True, download and cache the qualities file. If False, create the OpenMLDataset without qualities metadata. The data may later be added to the OpenMLDataset through the `OpenMLDataset.load_metadata(qualities=True)` method. 
- download_features_meta_data : bool (default=True) + download_features_meta_data : bool (default=False) Option to download 'features' meta-data in addition to the minimal dataset description. If True, download and cache the features file. If False, create the OpenMLDataset without features metadata. The data may later be added @@ -523,28 +523,6 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915 dataset : :class:`openml.OpenMLDataset` The downloaded dataset. """ - # TODO(0.15): Remove the deprecation warning and make the default False; adjust types above - # and documentation. Also remove None-to-True-cases below - if any( - download_flag is None - for download_flag in [download_data, download_qualities, download_features_meta_data] - ): - warnings.warn( - "Starting from Version 0.15 `download_data`, `download_qualities`, and `download_featu" - "res_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy " - "loading. To disable this message until version 0.15 explicitly set `download_data`, " - "`download_qualities`, and `download_features_meta_data` to a bool while calling " - "`get_dataset`.", - FutureWarning, - stacklevel=2, - ) - - download_data = True if download_data is None else download_data - download_qualities = True if download_qualities is None else download_qualities - download_features_meta_data = ( - True if download_features_meta_data is None else download_features_meta_data - ) - if download_all_files: warnings.warn( "``download_all_files`` is experimental and is likely to break with new releases.", diff --git a/openml/testing.py b/openml/testing.py index 4af361507..529a304d4 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -56,7 +56,7 @@ class TestBase(unittest.TestCase): logger = logging.getLogger("unit_tests_published_entities") logger.setLevel(logging.DEBUG) - def setUp(self, n_levels: int = 1) -> None: + def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None: """Setup variables and temporary 
directories. In particular, this methods: @@ -92,7 +92,7 @@ def setUp(self, n_levels: int = 1) -> None: self.static_cache_dir = static_cache_dir self.cwd = Path.cwd() workdir = Path(__file__).parent.absolute() - tmp_dir_name = self.id() + tmp_dir_name = self.id() + tmpdir_suffix self.workdir = workdir / tmp_dir_name shutil.rmtree(self.workdir, ignore_errors=True) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 0740bd1b1..47e97496d 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1,12 +1,15 @@ # License: BSD 3-Clause from __future__ import annotations +import itertools import os -from pathlib import Path import random import shutil import time +import uuid from itertools import product +from pathlib import Path +from typing import Iterable from unittest import mock import arff @@ -49,9 +52,6 @@ class TestOpenMLDataset(TestBase): _multiprocess_can_split_ = True - def setUp(self): - super().setUp() - def tearDown(self): self._remove_pickle_files() super().tearDown() @@ -169,44 +169,6 @@ def test_illegal_length_tag(self): except openml.exceptions.OpenMLServerException as e: assert e.code == 477 - def _datasets_retrieved_successfully(self, dids, metadata_only=True): - """Checks that all files for the given dids have been downloaded. - - This includes: - - description - - qualities - - features - - absence of data arff if metadata_only, else it must be present too. 
- """ - for did in dids: - assert os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", str(did), "description.xml" - ) - ) - assert os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", str(did), "qualities.xml" - ) - ) - assert os.path.exists( - os.path.join( - openml.config.get_cache_directory(), "datasets", str(did), "features.xml" - ) - ) - - data_assert = self.assertFalse if metadata_only else self.assertTrue - data_assert( - os.path.exists( - os.path.join( - openml.config.get_cache_directory(), - "datasets", - str(did), - "dataset.arff", - ), - ), - ) - @pytest.mark.production() def test__name_to_id_with_deactivated(self): """Check that an activated dataset is returned if an earlier deactivated one exists.""" @@ -261,47 +223,32 @@ def test__name_to_id_version_does_not_exist(self): def test_get_datasets_by_name(self): # did 1 and 2 on the test server: dids = ["anneal", "kr-vs-kp"] - datasets = openml.datasets.get_datasets(dids, download_data=False) + datasets = openml.datasets.get_datasets(dids) assert len(datasets) == 2 - self._datasets_retrieved_successfully([1, 2]) + _assert_datasets_retrieved_successfully([1, 2]) def test_get_datasets_by_mixed(self): # did 1 and 2 on the test server: dids = ["anneal", 2] - datasets = openml.datasets.get_datasets(dids, download_data=False) + datasets = openml.datasets.get_datasets(dids) assert len(datasets) == 2 - self._datasets_retrieved_successfully([1, 2]) + _assert_datasets_retrieved_successfully([1, 2]) def test_get_datasets(self): dids = [1, 2] datasets = openml.datasets.get_datasets(dids) assert len(datasets) == 2 - self._datasets_retrieved_successfully([1, 2], metadata_only=False) - - def test_get_datasets_lazy(self): - dids = [1, 2] - datasets = openml.datasets.get_datasets(dids, download_data=False) - assert len(datasets) == 2 - self._datasets_retrieved_successfully([1, 2], metadata_only=True) - - datasets[0].get_data() - datasets[1].get_data() - 
self._datasets_retrieved_successfully([1, 2], metadata_only=False) + _assert_datasets_retrieved_successfully([1, 2]) - @pytest.mark.production() def test_get_dataset_by_name(self): dataset = openml.datasets.get_dataset("anneal") assert type(dataset) == OpenMLDataset assert dataset.dataset_id == 1 - self._datasets_retrieved_successfully([1], metadata_only=False) + _assert_datasets_retrieved_successfully([1]) assert len(dataset.features) > 1 assert len(dataset.qualities) > 4 - # Issue324 Properly handle private datasets when trying to access them - openml.config.server = self.production_server - self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45) - @pytest.mark.skip("Feature is experimental, can not test against stable server.") def test_get_dataset_download_all_files(self): # openml.datasets.get_dataset(id, download_all_files=True) @@ -319,45 +266,28 @@ def test_get_dataset_uint8_dtype(self): assert df["carbon"].dtype == "uint8" @pytest.mark.production() - def test_get_dataset(self): - # This is the only non-lazy load to ensure default behaviour works. 
- dataset = openml.datasets.get_dataset(1) - assert type(dataset) == OpenMLDataset - assert dataset.name == "anneal" - self._datasets_retrieved_successfully([1], metadata_only=False) - - assert len(dataset.features) > 1 - assert len(dataset.qualities) > 4 - + def test_get_dataset_cannot_access_private_data(self): # Issue324 Properly handle private datasets when trying to access them openml.config.server = self.production_server self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45) - @pytest.mark.production() - def test_get_dataset_lazy(self): - dataset = openml.datasets.get_dataset(1, download_data=False) - assert type(dataset) == OpenMLDataset - assert dataset.name == "anneal" - self._datasets_retrieved_successfully([1], metadata_only=True) - - assert len(dataset.features) > 1 - assert len(dataset.qualities) > 4 - - dataset.get_data() - self._datasets_retrieved_successfully([1], metadata_only=False) - - # Issue324 Properly handle private datasets when trying to access them + @pytest.mark.skip("Need to find dataset name of private dataset") + def test_dataset_by_name_cannot_access_private_data(self): openml.config.server = self.production_server - self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45, False) + self.assertRaises( + OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE" + ) def test_get_dataset_lazy_all_functions(self): """Test that all expected functionality is available without downloading the dataset.""" - dataset = openml.datasets.get_dataset(1, download_data=False) + dataset = openml.datasets.get_dataset(1) # We only tests functions as general integrity is tested by test_get_dataset_lazy def ensure_absence_of_real_data(): assert not os.path.exists( - os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset.arff") + os.path.join( + openml.config.get_cache_directory(), "datasets", "1", "dataset.arff" + ) ) tag = "test_lazy_tag_%d" % random.randint(1, 1000000) @@ 
-380,14 +310,14 @@ def ensure_absence_of_real_data(): ensure_absence_of_real_data() def test_get_dataset_sparse(self): - dataset = openml.datasets.get_dataset(102, download_data=False) + dataset = openml.datasets.get_dataset(102) X, *_ = dataset.get_data(dataset_format="array") assert isinstance(X, scipy.sparse.csr_matrix) def test_download_rowid(self): # Smoke test which checks that the dataset has the row-id set correctly did = 44 - dataset = openml.datasets.get_dataset(did, download_data=False) + dataset = openml.datasets.get_dataset(did) assert dataset.row_id_attribute == "Counter" def test__get_dataset_description(self): @@ -519,19 +449,6 @@ def test__get_dataset_qualities(self): qualities_xml_path = self.workdir / "qualities.xml" assert qualities_xml_path.exists() - def test__get_dataset_skip_download(self): - dataset = openml.datasets.get_dataset( - 2, - download_qualities=False, - download_features_meta_data=False, - ) - # Internal representation without lazy loading - assert dataset._qualities is None - assert dataset._features is None - # External representation with lazy loading - assert dataset.qualities is not None - assert dataset.features is not None - def test_get_dataset_force_refresh_cache(self): did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, @@ -588,18 +505,21 @@ def test_deletion_of_cache_dir(self): ) assert not os.path.exists(did_cache_dir) - # Use _get_dataset_arff to load the description, trigger an exception in the - # test target and have a slightly higher coverage - @mock.patch("openml.datasets.functions._get_dataset_arff") + # get_dataset_description is the only data guaranteed to be downloaded + @mock.patch("openml.datasets.functions._get_dataset_description") def test_deletion_of_cache_dir_faulty_download(self, patch): patch.side_effect = Exception("Boom!") - self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1) - datasets_cache_dir = os.path.join(self.workdir, "org", "openml", 
"test", "datasets") + self.assertRaisesRegex( + Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1 + ) + datasets_cache_dir = os.path.join( + self.workdir, "org", "openml", "test", "datasets" + ) assert len(os.listdir(datasets_cache_dir)) == 0 def test_publish_dataset(self): # lazy loading not possible as we need the arff-file. - openml.datasets.get_dataset(3) + openml.datasets.get_dataset(3, download_data=True) file_path = os.path.join( openml.config.get_cache_directory(), "datasets", @@ -624,18 +544,20 @@ def test_publish_dataset(self): def test__retrieve_class_labels(self): openml.config.set_root_cache_directory(self.static_cache_dir) - labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels() + labels = openml.datasets.get_dataset(2).retrieve_class_labels() assert labels == ["1", "2", "3", "4", "5", "U"] - labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels( + labels = openml.datasets.get_dataset(2).retrieve_class_labels( target_name="product-type", ) assert labels == ["C", "H", "G"] # Test workaround for string-typed class labels - custom_ds = openml.datasets.get_dataset(2, download_data=False) + custom_ds = openml.datasets.get_dataset(2) custom_ds.features[31].data_type = "string" - labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name) + labels = custom_ds.retrieve_class_labels( + target_name=custom_ds.features[31].name + ) assert labels == ["COIL", "SHEET"] def test_upload_dataset_with_url(self): @@ -678,7 +600,9 @@ def test_data_status(self): ) dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) + TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) + ) did = dataset.id # admin key for test server (only adminds can activate datasets. 
@@ -728,7 +652,11 @@ def test_attributes_arff_from_df_numeric_column(self): # Test column names are automatically converted to str if needed (#819) df = pd.DataFrame({0: [1, 2, 3], 0.5: [4, 5, 6], "target": [0, 1, 1]}) attributes = attributes_arff_from_df(df) - assert attributes == [("0", "INTEGER"), ("0.5", "INTEGER"), ("target", "INTEGER")] + assert attributes == [ + ("0", "INTEGER"), + ("0.5", "INTEGER"), + ("target", "INTEGER"), + ] def test_attributes_arff_from_df_mixed_dtype_categories(self): # liac-arff imposed categorical attributes to be of sting dtype. We @@ -750,7 +678,8 @@ def test_attributes_arff_from_df_unknown_dtype(self): for arr, dt in zip(data, dtype): df = pd.DataFrame(arr) err_msg = ( - f"The dtype '{dt}' of the column '0' is not currently " "supported by liac-arff" + f"The dtype '{dt}' of the column '0' is not currently " + "supported by liac-arff" ) with pytest.raises(ValueError, match=err_msg): attributes_arff_from_df(df) @@ -781,12 +710,16 @@ def test_create_dataset_numpy(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) + TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) + ) assert ( _get_online_dataset_arff(dataset.id) == dataset._dataset ), "Uploaded arff does not match original one" - assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" + assert ( + _get_online_dataset_format(dataset.id) == "arff" + ), "Wrong format for dataset" def test_create_dataset_list(self): data = [ @@ -836,16 +769,23 @@ def test_create_dataset_list(self): dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) + TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) + ) assert ( _get_online_dataset_arff(dataset.id) == 
dataset._dataset ), "Uploaded ARFF does not match original one" - assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" + assert ( + _get_online_dataset_format(dataset.id) == "arff" + ), "Wrong format for dataset" def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix sparse_data = scipy.sparse.coo_matrix( - ([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])), + ( + [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]), + ), ) column_names = [ @@ -944,7 +884,7 @@ def test_create_invalid_dataset(self): def test_get_online_dataset_arff(self): dataset_id = 100 # Australian # lazy loading not used as arff file is checked. - dataset = openml.datasets.get_dataset(dataset_id) + dataset = openml.datasets.get_dataset(dataset_id, download_data=True) decoder = arff.ArffDecoder() # check if the arff from the dataset is # the same as the arff from _get_arff function @@ -977,7 +917,7 @@ def test_topic_api_error(self): def test_get_online_dataset_format(self): # Phoneme dataset dataset_id = 77 - dataset = openml.datasets.get_dataset(dataset_id, download_data=False) + dataset = openml.datasets.get_dataset(dataset_id) assert dataset.format.lower() == _get_online_dataset_format( dataset_id @@ -991,7 +931,14 @@ def test_create_dataset_pandas(self): ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], ] - column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] + column_names = [ + "rnd_str", + "outlook", + "temperature", + "humidity", + "windy", + "play", + ] df = pd.DataFrame(data, columns=column_names) # enforce the type of each column df["outlook"] = df["outlook"].astype("category") @@ -1027,19 +974,26 @@ def test_create_dataset_pandas(self): ) dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) + 
TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) + ) assert ( _get_online_dataset_arff(dataset.id) == dataset._dataset ), "Uploaded ARFF does not match original one" # Check that DataFrame with Sparse columns are supported properly sparse_data = scipy.sparse.coo_matrix( - ([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])), + ( + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]), + ), ) column_names = ["input1", "input2", "y"] df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names) # meta-information - description = "Synthetic dataset created from a Pandas DataFrame with Sparse columns" + description = ( + "Synthetic dataset created from a Pandas DataFrame with Sparse columns" + ) dataset = openml.datasets.functions.create_dataset( name=name, description=description, @@ -1060,11 +1014,15 @@ def test_create_dataset_pandas(self): ) dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) + TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) + ) assert ( _get_online_dataset_arff(dataset.id) == dataset._dataset ), "Uploaded ARFF does not match original one" - assert _get_online_dataset_format(dataset.id) == "sparse_arff", "Wrong format for dataset" + assert ( + _get_online_dataset_format(dataset.id) == "sparse_arff" + ), "Wrong format for dataset" # Check that we can overwrite the attributes data = [["a"], ["b"], ["c"], ["d"], ["e"]] @@ -1092,9 +1050,13 @@ def test_create_dataset_pandas(self): ) dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) + TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) + ) downloaded_data = 
_get_online_dataset_arff(dataset.id) - assert downloaded_data == dataset._dataset, "Uploaded ARFF does not match original one" + assert ( + downloaded_data == dataset._dataset + ), "Uploaded ARFF does not match original one" assert "@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data def test_ignore_attributes_dataset(self): @@ -1105,7 +1067,14 @@ def test_ignore_attributes_dataset(self): ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], ] - column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] + column_names = [ + "rnd_str", + "outlook", + "temperature", + "humidity", + "windy", + "play", + ] df = pd.DataFrame(data, columns=column_names) # enforce the type of each column df["outlook"] = df["outlook"].astype("category") @@ -1199,7 +1168,14 @@ def test_publish_fetch_ignore_attribute(self): ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], ] - column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] + column_names = [ + "rnd_str", + "outlook", + "temperature", + "humidity", + "windy", + "play", + ] df = pd.DataFrame(data, columns=column_names) # enforce the type of each column df["outlook"] = df["outlook"].astype("category") @@ -1241,35 +1217,29 @@ def test_publish_fetch_ignore_attribute(self): # publish dataset dataset.publish() TestBase._mark_entity_for_removal("data", dataset.id) - TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id)) + TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], dataset.id) + ) # test if publish was successful assert isinstance(dataset.id, int) downloaded_dataset = self._wait_for_dataset_being_processed(dataset.id) assert downloaded_dataset.ignore_attribute == ignore_attribute - def _wait_for_dataset_being_processed(self, dataset_id): - downloaded_dataset = None - # fetching from server - # loop till timeout or fetch not successful - 
max_waiting_time_seconds = 600 - # time.time() works in seconds + def _wait_for_dataset_being_processed( + self, dataset_id, poll_delay: int = 10, max_waiting_time_seconds: int = 600 + ): start_time = time.time() - while time.time() - start_time < max_waiting_time_seconds: + while (time.time() - start_time) < max_waiting_time_seconds: try: - downloaded_dataset = openml.datasets.get_dataset(dataset_id) - break + # being able to download qualities is a sign that the dataset is processed + return openml.datasets.get_dataset(dataset_id, download_qualities=True) except OpenMLServerException as e: - # returned code 273: Dataset not processed yet - # returned code 362: No qualities found TestBase.logger.error( f"Failed to fetch dataset:{dataset_id} with '{e!s}'.", ) - time.sleep(10) - continue - if downloaded_dataset is None: - raise ValueError(f"TIMEOUT: Failed to fetch uploaded dataset - {dataset_id}") - return downloaded_dataset + time.sleep(poll_delay) + raise ValueError(f"TIMEOUT: Failed to fetch uploaded dataset - {dataset_id}") def test_create_dataset_row_id_attribute_error(self): # meta-information @@ -1433,7 +1403,9 @@ def test_get_dataset_cache_format_feather(self): cache_dir = openml.config.get_cache_directory() cache_dir_for_id = os.path.join(cache_dir, "datasets", "128") feather_file = os.path.join(cache_dir_for_id, "dataset.feather") - pickle_file = os.path.join(cache_dir_for_id, "dataset.feather.attributes.pkl.py3") + pickle_file = os.path.join( + cache_dir_for_id, "dataset.feather.attributes.pkl.py3" + ) data = pd.read_feather(feather_file) assert os.path.isfile(feather_file), "Feather file is missing" assert os.path.isfile(pickle_file), "Attributes pickle file is missing" @@ -1478,7 +1450,9 @@ def test_data_edit_critical_field(self): # for this, we need to first clone a dataset to do changes did = fork_dataset(1) self._wait_for_dataset_being_processed(did) - result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil") + result = 
edit_dataset( + did, default_target_attribute="shape", ignore_attribute="oil" + ) assert did == result n_tries = 10 @@ -1486,7 +1460,9 @@ def test_data_edit_critical_field(self): for i in range(n_tries): edited_dataset = openml.datasets.get_dataset(did) try: - assert edited_dataset.default_target_attribute == "shape", edited_dataset + assert ( + edited_dataset.default_target_attribute == "shape" + ), edited_dataset assert edited_dataset.ignore_attribute == ["oil"], edited_dataset break except AssertionError as e: @@ -1495,10 +1471,12 @@ def test_data_edit_critical_field(self): time.sleep(10) # Delete the cache dir to get the newer version of the dataset shutil.rmtree( - os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)), + os.path.join( + self.workdir, "org", "openml", "test", "datasets", str(did) + ), ) - def test_data_edit_errors(self): + def test_data_edit_requires_field(self): # Check server exception when no field to edit is provided self.assertRaisesRegex( OpenMLServerException, @@ -1509,6 +1487,8 @@ def test_data_edit_errors(self): edit_dataset, data_id=64, # blood-transfusion-service-center ) + + def test_data_edit_requires_valid_dataset(self): # Check server exception when unknown dataset is provided self.assertRaisesRegex( OpenMLServerException, @@ -1518,6 +1498,7 @@ def test_data_edit_errors(self): description="xor operation dataset", ) + def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): # Need to own a dataset to be able to edit meta-data # Will be creating a forked version of an existing dataset to allow the unit test user # to edit meta-data of a dataset @@ -1543,6 +1524,7 @@ def test_data_edit_errors(self): default_target_attribute="y", ) + def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self): # Check server exception when a non-owner or non-admin tries to edit critical fields self.assertRaisesRegex( OpenMLServerException, @@ -1570,7 +1552,7 @@ def 
test_get_dataset_parquet(self): # Parquet functionality is disabled on the test server # There is no parquet-copy of the test server yet. openml.config.server = self.production_server - dataset = openml.datasets.get_dataset(61) + dataset = openml.datasets.get_dataset(61, download_data=True) assert dataset._parquet_url is not None assert dataset.parquet_file is not None assert os.path.isfile(dataset.parquet_file) @@ -1582,7 +1564,9 @@ def test_list_datasets_with_high_size_parameter(self): openml.config.server = self.production_server datasets_a = openml.datasets.list_datasets(output_format="dataframe") - datasets_b = openml.datasets.list_datasets(output_format="dataframe", size=np.inf) + datasets_b = openml.datasets.list_datasets( + output_format="dataframe", size=np.inf + ) # Reverting to test server openml.config.server = self.test_server @@ -1662,7 +1646,9 @@ def test_invalid_attribute_validations( (None, None, ["outlook", "windy"]), ], ) -def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): +def test_valid_attribute_validations( + default_target_attribute, row_id_attribute, ignore_attribute +): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], ["b", "sunny", 80.0, 90.0, "TRUE", "no"], @@ -1713,7 +1699,14 @@ def test_delete_dataset(self): ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], ] - column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] + column_names = [ + "rnd_str", + "outlook", + "temperature", + "humidity", + "windy", + "play", + ] df = pd.DataFrame(data, columns=column_names) # enforce the type of each column df["outlook"] = df["outlook"].astype("category") @@ -1756,7 +1749,10 @@ def test_delete_dataset(self): def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( - test_files_directory / "mock_responses" / "datasets" / 
"data_delete_not_owned.xml" + test_files_directory + / "mock_responses" + / "datasets" + / "data_delete_not_owned.xml" ) mock_delete.return_value = create_request_response( status_code=412, @@ -1778,7 +1774,10 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_ke def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( - test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml" + test_files_directory + / "mock_responses" + / "datasets" + / "data_delete_has_tasks.xml" ) mock_delete.return_value = create_request_response( status_code=412, @@ -1800,7 +1799,10 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( - test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml" + test_files_directory + / "mock_responses" + / "datasets" + / "data_delete_successful.xml" ) mock_delete.return_value = create_request_response( status_code=200, @@ -1819,7 +1821,10 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key) def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( - test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml" + test_files_directory + / "mock_responses" + / "datasets" + / "data_delete_not_exist.xml" ) mock_delete.return_value = create_request_response( status_code=412, @@ -1856,7 +1861,9 @@ def test_list_datasets(all_datasets: pd.DataFrame): def test_list_datasets_by_tag(all_datasets: pd.DataFrame): - tag_datasets = openml.datasets.list_datasets(tag="study_14", output_format="dataframe") + tag_datasets = openml.datasets.list_datasets( + tag="study_14", 
output_format="dataframe" + ) assert 0 < len(tag_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(tag_datasets) @@ -1912,3 +1919,97 @@ def test_list_datasets_combined_filters(all_datasets: pd.DataFrame): ) assert 1 <= len(combined_filter_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(combined_filter_datasets) + + +def _dataset_file_is_downloaded(did: int, file: str): + cache_directory = Path(openml.config.get_cache_directory()) / "datasets" / str(did) + return (cache_directory / file).exists() + + +def _dataset_description_is_downloaded(did: int): + return _dataset_file_is_downloaded(did, "description.xml") + + +def _dataset_qualities_is_downloaded(did: int): + return _dataset_file_is_downloaded(did, "qualities.xml") + + +def _dataset_features_is_downloaded(did: int): + return _dataset_file_is_downloaded(did, "features.xml") + + +def _dataset_data_file_is_downloaded(did: int): + parquet_present = _dataset_file_is_downloaded(did, "dataset.pq") + arff_present = _dataset_file_is_downloaded(did, "dataset.arff") + return parquet_present or arff_present + + +def _assert_datasets_retrieved_successfully( + dids: Iterable[int], + with_qualities: bool = False, + with_features: bool = False, + with_data: bool = False, +): + """Checks that all files for the given dids have been downloaded. + + This includes: + - description + - qualities + - features + - absence of data arff if metadata_only, else it must be present too. 
+ """ + for did in dids: + assert _dataset_description_is_downloaded(did) + + has_qualities = _dataset_qualities_is_downloaded(did) + assert has_qualities if with_qualities else not has_qualities + + has_features = _dataset_features_is_downloaded(did) + assert has_features if with_features else not has_features + + has_data = _dataset_data_file_is_downloaded(did) + assert has_data if with_data else not has_data + + +@pytest.fixture() +def isolate_for_test(): + t = TestOpenMLDataset() + t.setUp(tmpdir_suffix=uuid.uuid4().hex) + yield + t.tearDown() + + +@pytest.mark.parametrize( + ("with_data", "with_qualities", "with_features"), + itertools.product([True, False], repeat=3), +) +def test_get_dataset_lazy_behavior( + isolate_for_test, with_data: bool, with_qualities: bool, with_features: bool +): + dataset = openml.datasets.get_dataset( + 1, + download_data=with_data, + download_qualities=with_qualities, + download_features_meta_data=with_features, + ) + assert type(dataset) == OpenMLDataset + assert dataset.name == "anneal" + + _assert_datasets_retrieved_successfully( + [1], + with_qualities=with_qualities, + with_features=with_features, + with_data=with_data, + ) + assert ( + dataset.features + ), "Features should be downloaded on-demand if not during get_dataset" + assert ( + dataset.qualities + ), "Qualities should be downloaded on-demand if not during get_dataset" + assert ( + dataset.get_data() + ), "Data should be downloaded on-demand if not during get_dataset" + _assert_datasets_retrieved_successfully( + [1], with_qualities=True, with_features=True, with_data=True + ) diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index 58528c5c9..a92cd0cfd 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -49,8 +49,9 @@ def test_get_config_as_dict(self): _config["avoid_duplicate_runs"] = False _config["connection_n_retries"] = 20 _config["retry_policy"] = "robot" + _config["show_progress"] = False assert 
isinstance(config, dict)
-        assert len(config) == 6
+        assert len(config) == 7
         self.assertDictEqual(config, _config)
 
     def test_setup_with_config(self):
@@ -62,6 +63,7 @@ def test_setup_with_config(self):
         _config["avoid_duplicate_runs"] = True
         _config["retry_policy"] = "human"
         _config["connection_n_retries"] = 100
+        _config["show_progress"] = False
         orig_config = openml.config.get_config_as_dict()
         openml.config._setup(_config)
         updated_config = openml.config.get_config_as_dict()
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
index b7eaf7e49..d269fec59 100644
--- a/tests/test_tasks/test_task_functions.py
+++ b/tests/test_tasks/test_task_functions.py
@@ -149,7 +149,7 @@ def test__get_task_live(self):
         openml.tasks.get_task(34536)
 
     def test_get_task(self):
-        task = openml.tasks.get_task(1)  # anneal; crossvalidation
+        task = openml.tasks.get_task(1, download_data=True)  # anneal; crossvalidation
         assert isinstance(task, OpenMLTask)
         assert os.path.exists(
             os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "task.xml")