Merge branch 'develop' into add/cleanup
PGijsbers authored Sep 28, 2024
2 parents 8f78f0c + 7764ddb commit 731b3e1
Showing 19 changed files with 574 additions and 379 deletions.
3 changes: 3 additions & 0 deletions doc/progress.rst
@@ -9,6 +9,9 @@ Changelog
next
~~~~~~

* ADD #1335: Improve MinIO support.
* Add a progress bar for downloading MinIO files. Enable it by setting `show_progress` to true, either on `openml.config` or in the configuration file.
* When using `download_all_files`, files are only downloaded if they do not yet exist in the cache.
* MAINT #1340: Add Numpy 2.0 support. Update tests to work with scikit-learn <= 1.5.
* ADD #1342: Add HTTP header to requests to indicate they are from openml-python.

9 changes: 9 additions & 0 deletions examples/20_basic/simple_datasets_tutorial.py
@@ -50,6 +50,15 @@
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="dataframe", target=dataset.default_target_attribute
)

############################################################################
# Tip: you can get a progress bar for dataset downloads; simply enable it in
# the configuration, either in code or in the configuration file
# (see also the introduction tutorial).

openml.config.show_progress = True


############################################################################
# Visualize the dataset
# =====================
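The tutorial above notes that the progress bar can also be enabled through the configuration file instead of in code. Below is a minimal sketch of such a file, assuming the plain key = value format that openml's `_parse_config` reads; the path is a common default (check `openml.config.determine_config_file_path()` for the exact location on your system) and the API key value is a placeholder.

# Hypothetical contents of the OpenML configuration file (commonly ~/.config/openml/config)
apikey = YOUR_API_KEY_HERE
show_progress = true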
3 changes: 2 additions & 1 deletion openml/_api_calls.py
@@ -28,6 +28,7 @@
OpenMLServerException,
OpenMLServerNoResult,
)
from .utils import ProgressBar

_HEADERS = {"user-agent": f"openml-python/{__version__}"}

@@ -163,12 +164,12 @@ def _download_minio_file(
proxy_client = ProxyManager(proxy) if proxy else None

client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client)

try:
client.fget_object(
bucket_name=bucket,
object_name=object_name,
file_path=str(destination),
progress=ProgressBar() if config.show_progress else None,
request_headers=_HEADERS,
)
if destination.is_file() and destination.suffix == ".zip":
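The `ProgressBar` object passed to `client.fget_object` above comes from `openml.utils`, and its implementation is not part of this diff. As an illustration only, here is a hypothetical sketch of a minio-compatible progress callback built on tqdm, assuming minio's usual progress protocol (an object exposing `set_meta` and `update`, as in the minio-py examples); the real `openml.utils.ProgressBar` may look different.

from __future__ import annotations

from tqdm import tqdm


class ProgressBar:
    """Hypothetical sketch of a minio-compatible progress callback."""

    def __init__(self) -> None:
        self._bar: tqdm | None = None

    def set_meta(self, object_name: str, total_length: int) -> None:
        # minio calls this once, with the object's name and its total size in bytes.
        self._bar = tqdm(total=total_length, unit="B", unit_scale=True, desc=object_name)

    def update(self, size: int) -> None:
        # minio calls this repeatedly with the number of bytes received since the last call.
        if self._bar is not None:
            self._bar.update(size)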
16 changes: 11 additions & 5 deletions openml/config.py
@@ -28,6 +28,7 @@ class _Config(TypedDict):
avoid_duplicate_runs: bool
retry_policy: Literal["human", "robot"]
connection_n_retries: int
show_progress: bool


def _create_log_handlers(create_file_handler: bool = True) -> None: # noqa: FBT001, FBT002
@@ -111,6 +112,7 @@ def set_file_log_level(file_output_level: int) -> None:
"avoid_duplicate_runs": True,
"retry_policy": "human",
"connection_n_retries": 5,
"show_progress": False,
}

# Default values are actually added here in the _setup() function which is
@@ -131,6 +133,7 @@ def get_server_base_url() -> str:


apikey: str = _defaults["apikey"]
show_progress: bool = _defaults["show_progress"]
# The current cache directory (without the server name)
_root_cache_directory = Path(_defaults["cachedir"])
avoid_duplicate_runs = _defaults["avoid_duplicate_runs"]
@@ -238,6 +241,7 @@ def _setup(config: _Config | None = None) -> None:
global server # noqa: PLW0603
global _root_cache_directory # noqa: PLW0603
global avoid_duplicate_runs # noqa: PLW0603
global show_progress # noqa: PLW0603

config_file = determine_config_file_path()
config_dir = config_file.parent
@@ -255,6 +259,7 @@ def _setup(config: _Config | None = None) -> None:
avoid_duplicate_runs = config["avoid_duplicate_runs"]
apikey = config["apikey"]
server = config["server"]
show_progress = config["show_progress"]
short_cache_dir = Path(config["cachedir"])
n_retries = int(config["connection_n_retries"])

@@ -328,11 +333,11 @@ def _parse_config(config_file: str | Path) -> _Config:
logger.info("Error opening file %s: %s", config_file, e.args[0])
config_file_.seek(0)
config.read_file(config_file_)
if isinstance(config["FAKE_SECTION"]["avoid_duplicate_runs"], str):
config["FAKE_SECTION"]["avoid_duplicate_runs"] = config["FAKE_SECTION"].getboolean(
"avoid_duplicate_runs"
) # type: ignore
return dict(config.items("FAKE_SECTION")) # type: ignore
configuration = dict(config.items("FAKE_SECTION"))
for boolean_field in ["avoid_duplicate_runs", "show_progress"]:
if isinstance(config["FAKE_SECTION"][boolean_field], str):
configuration[boolean_field] = config["FAKE_SECTION"].getboolean(boolean_field) # type: ignore
return configuration # type: ignore


def get_config_as_dict() -> _Config:
@@ -343,6 +348,7 @@ def get_config_as_dict() -> _Config:
"avoid_duplicate_runs": avoid_duplicate_runs,
"connection_n_retries": connection_n_retries,
"retry_policy": retry_policy,
"show_progress": show_progress,
}


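For reference, the boolean handling in `_parse_config` relies on configparser's `getboolean`, which accepts values such as true/false, yes/no, on/off and 1/0. A small standalone sketch of that behaviour (the config text below is made up; the fake section header mirrors the one the function parses with):

import configparser

parser = configparser.ConfigParser()
# The openml configuration file has no section header, so a fake one is prepended.
parser.read_string("[FAKE_SECTION]\nshow_progress = true\navoid_duplicate_runs = 0\n")
print(parser["FAKE_SECTION"].getboolean("show_progress"))         # True
print(parser["FAKE_SECTION"].getboolean("avoid_duplicate_runs"))  # False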
40 changes: 25 additions & 15 deletions openml/datasets/dataset.py
@@ -345,9 +345,10 @@ def _download_data(self) -> None:
# import required here to avoid circular import.
from .functions import _get_dataset_arff, _get_dataset_parquet

self.data_file = str(_get_dataset_arff(self))
if self._parquet_url is not None:
self.parquet_file = str(_get_dataset_parquet(self))
if self.parquet_file is None:
self.data_file = str(_get_dataset_arff(self))

def _get_arff(self, format: str) -> dict: # noqa: A002
"""Read ARFF file and return decoded arff.
@@ -535,18 +536,7 @@ def _cache_compressed_file_from_file(
feather_attribute_file,
) = self._compressed_cache_file_paths(data_file)

if data_file.suffix == ".arff":
data, categorical, attribute_names = self._parse_data_from_arff(data_file)
elif data_file.suffix == ".pq":
try:
data = pd.read_parquet(data_file)
except Exception as e: # noqa: BLE001
raise Exception(f"File: {data_file}") from e

categorical = [data[c].dtype.name == "category" for c in data.columns]
attribute_names = list(data.columns)
else:
raise ValueError(f"Unknown file type for file '{data_file}'.")
attribute_names, categorical, data = self._parse_data_from_file(data_file)

# Feather format does not work for sparse datasets, so we use pickle for sparse datasets
if scipy.sparse.issparse(data):
@@ -572,6 +562,24 @@

return data, categorical, attribute_names

def _parse_data_from_file(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]:
if data_file.suffix == ".arff":
data, categorical, attribute_names = self._parse_data_from_arff(data_file)
elif data_file.suffix == ".pq":
attribute_names, categorical, data = self._parse_data_from_pq(data_file)
else:
raise ValueError(f"Unknown file type for file '{data_file}'.")
return attribute_names, categorical, data

def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]:
try:
data = pd.read_parquet(data_file)
except Exception as e: # noqa: BLE001
raise Exception(f"File: {data_file}") from e
categorical = [data[c].dtype.name == "category" for c in data.columns]
attribute_names = list(data.columns)
return attribute_names, categorical, data

def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]: # noqa: PLR0912, C901
"""Load data from compressed format or arff. Download data if not present on disk."""
need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None
@@ -636,8 +644,10 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool]
"Please manually delete the cache file if you want OpenML-Python "
"to attempt to reconstruct it.",
)
assert self.data_file is not None
data, categorical, attribute_names = self._parse_data_from_arff(Path(self.data_file))
file_to_load = self.data_file if self.parquet_file is None else self.parquet_file
assert file_to_load is not None
attr, cat, df = self._parse_data_from_file(Path(file_to_load))
return df, cat, attr

data_up_to_date = isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data)
if self.cache_format == "pickle" and not data_up_to_date:
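The new `_parse_data_from_pq` helper above amounts to a plain pandas read plus a per-column categorical check. An equivalent standalone sketch, with a hypothetical file path:

import pandas as pd

data = pd.read_parquet("dataset.pq")  # hypothetical cached parquet file
categorical = [data[col].dtype.name == "category" for col in data.columns]
attribute_names = list(data.columns)
print(attribute_names, categorical)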
54 changes: 17 additions & 37 deletions openml/datasets/functions.py
@@ -416,8 +416,8 @@ def _name_to_id(

def get_datasets(
dataset_ids: list[str | int],
download_data: bool = True, # noqa: FBT001, FBT002
download_qualities: bool = True, # noqa: FBT001, FBT002
download_data: bool = False, # noqa: FBT001, FBT002
download_qualities: bool = False, # noqa: FBT001, FBT002
) -> list[OpenMLDataset]:
"""Download datasets.
@@ -452,12 +452,12 @@
@openml.utils.thread_safe_if_oslo_installed
def get_dataset( # noqa: C901, PLR0912
dataset_id: int | str,
download_data: bool | None = None, # Optional for deprecation warning; later again only bool
download_data: bool = False, # noqa: FBT002, FBT001
version: int | None = None,
error_if_multiple: bool = False, # noqa: FBT002, FBT001
cache_format: Literal["pickle", "feather"] = "pickle",
download_qualities: bool | None = None, # Same as above
download_features_meta_data: bool | None = None, # Same as above
download_qualities: bool = False, # noqa: FBT002, FBT001
download_features_meta_data: bool = False, # noqa: FBT002, FBT001
download_all_files: bool = False, # noqa: FBT002, FBT001
force_refresh_cache: bool = False, # noqa: FBT001, FBT002
) -> OpenMLDataset:
@@ -485,7 +485,7 @@ def get_dataset( # noqa: C901, PLR0912
----------
dataset_id : int or str
Dataset ID of the dataset to download
download_data : bool (default=True)
download_data : bool (default=False)
If True, also download the data file. Beware that some datasets are large and it might
make the operation noticeably slower. Metadata is also still retrieved.
If False, create the OpenMLDataset and only populate it with the metadata.
@@ -499,12 +499,12 @@
Format for caching the dataset - may be feather or pickle.
Note that the default 'pickle' option may load slower than feather when
the number of rows is very high.
download_qualities : bool (default=True)
download_qualities : bool (default=False)
Option to download 'qualities' meta-data in addition to the minimal dataset description.
If True, download and cache the qualities file.
If False, create the OpenMLDataset without qualities metadata. The data may later be added
to the OpenMLDataset through the `OpenMLDataset.load_metadata(qualities=True)` method.
download_features_meta_data : bool (default=True)
download_features_meta_data : bool (default=False)
Option to download 'features' meta-data in addition to the minimal dataset description.
If True, download and cache the features file.
If False, create the OpenMLDataset without features metadata. The data may later be added
@@ -523,28 +523,6 @@
dataset : :class:`openml.OpenMLDataset`
The downloaded dataset.
"""
# TODO(0.15): Remove the deprecation warning and make the default False; adjust types above
# and documentation. Also remove None-to-True-cases below
if any(
download_flag is None
for download_flag in [download_data, download_qualities, download_features_meta_data]
):
warnings.warn(
"Starting from Version 0.15 `download_data`, `download_qualities`, and `download_featu"
"res_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy "
"loading. To disable this message until version 0.15 explicitly set `download_data`, "
"`download_qualities`, and `download_features_meta_data` to a bool while calling "
"`get_dataset`.",
FutureWarning,
stacklevel=2,
)

download_data = True if download_data is None else download_data
download_qualities = True if download_qualities is None else download_qualities
download_features_meta_data = (
True if download_features_meta_data is None else download_features_meta_data
)

if download_all_files:
warnings.warn(
"``download_all_files`` is experimental and is likely to break with new releases.",
@@ -589,7 +567,6 @@ def get_dataset( # noqa: C901, PLR0912
if download_qualities:
qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)

arff_file = _get_dataset_arff(description) if download_data else None
if "oml:parquet_url" in description and download_data:
try:
parquet_file = _get_dataset_parquet(
@@ -598,10 +575,14 @@ def get_dataset( # noqa: C901, PLR0912
)
except urllib3.exceptions.MaxRetryError:
parquet_file = None
if parquet_file is None and arff_file:
logger.warning("Failed to download parquet, fallback on ARFF.")
else:
parquet_file = None

arff_file = None
if parquet_file is None and download_data:
logger.warning("Failed to download parquet, fallback on ARFF.")
arff_file = _get_dataset_arff(description)

remove_dataset_cache = False
except OpenMLServerException as e:
# if there was an exception
@@ -1259,10 +1240,9 @@ def _get_dataset_parquet(
if old_file_path.is_file():
old_file_path.rename(output_file_path)

# For this release, we want to be able to force a new download even if the
# parquet file is already present when ``download_all_files`` is set.
# For now, it would be the only way for the user to fetch the additional
# files in the bucket (no function exists on an OpenMLDataset to do this).
# The call below skips files that already exist on disk, so it avoids downloading
# the parquet file twice. To force the old behavior of always downloading everything,
# use the `force_refresh_cache` option of `get_dataset`.
if download_all_files:
openml._api_calls._download_minio_bucket(source=url, destination=cache_directory)

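A short usage sketch of the lazy-loading defaults introduced above; dataset id 61 is used purely for illustration:

import openml

# With the new defaults only the dataset description is fetched; the data file,
# qualities and feature meta-data are not downloaded up front.
dataset = openml.datasets.get_dataset(61)

# Opt back in to eager downloading where needed.
dataset = openml.datasets.get_dataset(61, download_data=True, download_qualities=True)

# force_refresh_cache re-downloads even if files are already cached, e.g. in
# combination with the experimental download_all_files option.
dataset = openml.datasets.get_dataset(61, download_data=True, force_refresh_cache=True)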
25 changes: 12 additions & 13 deletions openml/extensions/sklearn/extension.py
@@ -13,7 +13,6 @@
import traceback
import warnings
from collections import OrderedDict
from distutils.version import LooseVersion
from json.decoder import JSONDecodeError
from re import IGNORECASE
from typing import Any, Callable, List, Sized, cast
@@ -25,6 +24,7 @@
import sklearn.base
import sklearn.model_selection
import sklearn.pipeline
from packaging.version import Version

import openml
from openml.exceptions import PyOpenMLError
@@ -48,7 +48,7 @@
r"(?P<version>(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$",
)

sctypes = np.sctypes if LooseVersion(np.__version__) < "2.0" else np.core.sctypes
sctypes = np.sctypes if Version(np.__version__) < Version("2.0") else np.core.sctypes
SIMPLE_NUMPY_TYPES = [
nptype
for type_cat, nptypes in sctypes.items()
@@ -237,14 +237,13 @@ def _min_dependency_str(cls, sklearn_version: str) -> str:
-------
str
"""
openml_major_version = int(LooseVersion(openml.__version__).version[1])
# This explicit check is necessary to support existing entities on the OpenML servers
# that used the fixed dependency string (in the else block)
if openml_major_version > 11:
if Version(openml.__version__) > Version("0.11"):
# OpenML v0.11 onwards supports sklearn>=0.24
# assumption: 0.24 onwards sklearn should contain a _min_dependencies.py file with
# variables declared for extracting minimum dependency for that version
if LooseVersion(sklearn_version) >= "0.24":
if Version(sklearn_version) >= Version("0.24"):
from sklearn import _min_dependencies as _mindep

dependency_list = {
@@ -253,18 +252,18 @@ def _min_dependency_str(cls, sklearn_version: str) -> str:
"joblib": f"{_mindep.JOBLIB_MIN_VERSION}",
"threadpoolctl": f"{_mindep.THREADPOOLCTL_MIN_VERSION}",
}
elif LooseVersion(sklearn_version) >= "0.23":
elif Version(sklearn_version) >= Version("0.23"):
dependency_list = {
"numpy": "1.13.3",
"scipy": "0.19.1",
"joblib": "0.11",
"threadpoolctl": "2.0.0",
}
if LooseVersion(sklearn_version).version[2] == 0:
if Version(sklearn_version).micro == 0:
dependency_list.pop("threadpoolctl")
elif LooseVersion(sklearn_version) >= "0.21":
elif Version(sklearn_version) >= Version("0.21"):
dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"}
elif LooseVersion(sklearn_version) >= "0.19":
elif Version(sklearn_version) >= Version("0.19"):
dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"}
else:
dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
@@ -1226,8 +1225,8 @@ def _check_dependencies(
version = match.group("version")

module = importlib.import_module(dependency_name)
required_version = LooseVersion(version)
installed_version = LooseVersion(module.__version__) # type: ignore
required_version = Version(version)
installed_version = Version(module.__version__) # type: ignore

if operation == "==":
check = required_version == installed_version
@@ -1258,7 +1257,7 @@ def _serialize_type(self, o: Any) -> OrderedDict[str, str]:
np.int32: "np.int32",
np.int64: "np.int64",
}
if LooseVersion(np.__version__) < "1.24":
if Version(np.__version__) < Version("1.24"):
mapping[float] = "np.float"
mapping[int] = "np.int"

@@ -1278,7 +1277,7 @@ def _deserialize_type(self, o: str) -> Any:
}

# TODO(eddiebergman): Might be able to remove this
if LooseVersion(np.__version__) < "1.24":
if Version(np.__version__) < Version("1.24"):
mapping["np.float"] = np.float # type: ignore # noqa: NPY001
mapping["np.int"] = np.int # type: ignore # noqa: NPY001

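The migration from `distutils.version.LooseVersion` to `packaging.version.Version` above follows one pattern throughout: compare against `Version(...)` objects instead of raw strings, and read the `major`/`minor`/`micro` attributes instead of indexing `.version`. A minimal sketch:

from packaging.version import Version

v = Version("1.24.3")
print(v < Version("2.0"))         # True; replaces LooseVersion(np.__version__) < "2.0"
print(v.major, v.minor, v.micro)  # 1 24 3; .micro replaces LooseVersion(...).version[2]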
