Merge branch 'develop' into add/cleanup
PGijsbers authored Sep 28, 2024
2 parents 8f78f0c + 7764ddb commit 731b3e1
Showing 19 changed files with 574 additions and 379 deletions.
3 changes: 3 additions & 0 deletions doc/progress.rst
@@ -9,6 +9,9 @@ Changelog
next
~~~~~~

* ADD #1335: Improve MinIO support.
* Add a progress bar for downloading MinIO files. Enable it by setting `show_progress` to true, either on `openml.config` or in the configuration file.
* When using `download_all_files`, files are only downloaded if they do not yet exist in the cache.
* MAINT #1340: Add Numpy 2.0 support. Update tests to work with scikit-learn <= 1.5.
* ADD #1342: Add HTTP header to requests to indicate they are from openml-python.

9 changes: 9 additions & 0 deletions examples/20_basic/simple_datasets_tutorial.py
@@ -50,6 +50,15 @@
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="dataframe", target=dataset.default_target_attribute
)

############################################################################
# Tip: you can get a progress bar for dataset downloads; simply enable it in
# the configuration, either in code or in the configuration file
# (see also the introduction tutorial).

openml.config.show_progress = True


############################################################################
# Visualize the dataset
# =====================
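The tutorial above notes that the progress bar can also be enabled through the configuration file instead of in code. Below is a minimal sketch of such a file, assuming the plain key = value format that openml's `_parse_config` reads; the path is a common default (check `openml.config.determine_config_file_path()` for the exact location on your system) and the API key value is a placeholder.

# Hypothetical contents of the OpenML configuration file (commonly ~/.config/openml/config)
apikey = YOUR_API_KEY_HERE
show_progress = true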
3 changes: 2 additions & 1 deletion openml/_api_calls.py
@@ -28,6 +28,7 @@
OpenMLServerException,
OpenMLServerNoResult,
)
from .utils import ProgressBar

_HEADERS = {"user-agent": f"openml-python/{__version__}"}

@@ -163,12 +164,12 @@ def _download_minio_file(
proxy_client = ProxyManager(proxy) if proxy else None

client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client)

try:
client.fget_object(
bucket_name=bucket,
object_name=object_name,
file_path=str(destination),
progress=ProgressBar() if config.show_progress else None,
request_headers=_HEADERS,
)
if destination.is_file() and destination.suffix == ".zip":
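The `ProgressBar` object passed to `client.fget_object` above comes from `openml.utils`, and its implementation is not part of this diff. As an illustration only, here is a hypothetical sketch of a minio-compatible progress callback built on tqdm, assuming minio's usual progress protocol (an object exposing `set_meta` and `update`, as in the minio-py examples); the real `openml.utils.ProgressBar` may look different.

from __future__ import annotations

from tqdm import tqdm


class ProgressBar:
    """Hypothetical sketch of a minio-compatible progress callback."""

    def __init__(self) -> None:
        self._bar: tqdm | None = None

    def set_meta(self, object_name: str, total_length: int) -> None:
        # minio calls this once, with the object's name and its total size in bytes.
        self._bar = tqdm(total=total_length, unit="B", unit_scale=True, desc=object_name)

    def update(self, size: int) -> None:
        # minio calls this repeatedly with the number of bytes received since the last call.
        if self._bar is not None:
            self._bar.update(size)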
16 changes: 11 additions & 5 deletions openml/config.py
@@ -28,6 +28,7 @@ class _Config(TypedDict):
avoid_duplicate_runs: bool
retry_policy: Literal["human", "robot"]
connection_n_retries: int
show_progress: bool


def _create_log_handlers(create_file_handler: bool = True) -> None: # noqa: FBT001, FBT002
@@ -111,6 +112,7 @@ def set_file_log_level(file_output_level: int) -> None:
"avoid_duplicate_runs": True,
"retry_policy": "human",
"connection_n_retries": 5,
"show_progress": False,
}

# Default values are actually added here in the _setup() function which is
@@ -131,6 +133,7 @@ def get_server_base_url() -> str:


apikey: str = _defaults["apikey"]
show_progress: bool = _defaults["show_progress"]
# The current cache directory (without the server name)
_root_cache_directory = Path(_defaults["cachedir"])
avoid_duplicate_runs = _defaults["avoid_duplicate_runs"]
@@ -238,6 +241,7 @@ def _setup(config: _Config | None = None) -> None:
global server # noqa: PLW0603
global _root_cache_directory # noqa: PLW0603
global avoid_duplicate_runs # noqa: PLW0603
global show_progress # noqa: PLW0603

config_file = determine_config_file_path()
config_dir = config_file.parent
@@ -255,6 +259,7 @@ def _setup(config: _Config | None = None) -> None:
avoid_duplicate_runs = config["avoid_duplicate_runs"]
apikey = config["apikey"]
server = config["server"]
show_progress = config["show_progress"]
short_cache_dir = Path(config["cachedir"])
n_retries = int(config["connection_n_retries"])

@@ -328,11 +333,11 @@ def _parse_config(config_file: str | Path) -> _Config:
logger.info("Error opening file %s: %s", config_file, e.args[0])
config_file_.seek(0)
config.read_file(config_file_)
if isinstance(config["FAKE_SECTION"]["avoid_duplicate_runs"], str):
config["FAKE_SECTION"]["avoid_duplicate_runs"] = config["FAKE_SECTION"].getboolean(
"avoid_duplicate_runs"
) # type: ignore
return dict(config.items("FAKE_SECTION")) # type: ignore
configuration = dict(config.items("FAKE_SECTION"))
for boolean_field in ["avoid_duplicate_runs", "show_progress"]:
if isinstance(config["FAKE_SECTION"][boolean_field], str):
configuration[boolean_field] = config["FAKE_SECTION"].getboolean(boolean_field) # type: ignore
return configuration # type: ignore


def get_config_as_dict() -> _Config:
@@ -343,6 +348,7 @@ def get_config_as_dict() -> _Config:
"avoid_duplicate_runs": avoid_duplicate_runs,
"connection_n_retries": connection_n_retries,
"retry_policy": retry_policy,
"show_progress": show_progress,
}


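For reference, the boolean handling in `_parse_config` relies on configparser's `getboolean`, which accepts values such as true/false, yes/no, on/off and 1/0. A small standalone sketch of that behaviour (the config text below is made up; the fake section header mirrors the one the function parses with):

import configparser

parser = configparser.ConfigParser()
# The openml configuration file has no section header, so a fake one is prepended.
parser.read_string("[FAKE_SECTION]\nshow_progress = true\navoid_duplicate_runs = 0\n")
print(parser["FAKE_SECTION"].getboolean("show_progress"))         # True
print(parser["FAKE_SECTION"].getboolean("avoid_duplicate_runs"))  # False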
40 changes: 25 additions & 15 deletions openml/datasets/dataset.py
@@ -345,9 +345,10 @@ def _download_data(self) -> None:
# import required here to avoid circular import.
from .functions import _get_dataset_arff, _get_dataset_parquet

self.data_file = str(_get_dataset_arff(self))
if self._parquet_url is not None:
self.parquet_file = str(_get_dataset_parquet(self))
if self.parquet_file is None:
self.data_file = str(_get_dataset_arff(self))

def _get_arff(self, format: str) -> dict: # noqa: A002
"""Read ARFF file and return decoded arff.
@@ -535,18 +536,7 @@ def _cache_compressed_file_from_file(
feather_attribute_file,
) = self._compressed_cache_file_paths(data_file)

if data_file.suffix == ".arff":
data, categorical, attribute_names = self._parse_data_from_arff(data_file)
elif data_file.suffix == ".pq":
try:
data = pd.read_parquet(data_file)
except Exception as e: # noqa: BLE001
raise Exception(f"File: {data_file}") from e

categorical = [data[c].dtype.name == "category" for c in data.columns]
attribute_names = list(data.columns)
else:
raise ValueError(f"Unknown file type for file '{data_file}'.")
attribute_names, categorical, data = self._parse_data_from_file(data_file)

# Feather format does not work for sparse datasets, so we use pickle for sparse datasets
if scipy.sparse.issparse(data):
@@ -572,6 +562,24 @@

return data, categorical, attribute_names

def _parse_data_from_file(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]:
if data_file.suffix == ".arff":
data, categorical, attribute_names = self._parse_data_from_arff(data_file)
elif data_file.suffix == ".pq":
attribute_names, categorical, data = self._parse_data_from_pq(data_file)
else:
raise ValueError(f"Unknown file type for file '{data_file}'.")
return attribute_names, categorical, data

def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]:
try:
data = pd.read_parquet(data_file)
except Exception as e: # noqa: BLE001
raise Exception(f"File: {data_file}") from e
categorical = [data[c].dtype.name == "category" for c in data.columns]
attribute_names = list(data.columns)
return attribute_names, categorical, data

def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]: # noqa: PLR0912, C901
"""Load data from compressed format or arff. Download data if not present on disk."""
need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None
@@ -636,8 +644,10 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool]
"Please manually delete the cache file if you want OpenML-Python "
"to attempt to reconstruct it.",
)
assert self.data_file is not None
data, categorical, attribute_names = self._parse_data_from_arff(Path(self.data_file))
file_to_load = self.data_file if self.parquet_file is None else self.parquet_file
assert file_to_load is not None
attr, cat, df = self._parse_data_from_file(Path(file_to_load))
return df, cat, attr

data_up_to_date = isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data)
if self.cache_format == "pickle" and not data_up_to_date:
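The new `_parse_data_from_pq` helper above amounts to a plain pandas read plus a per-column categorical check. An equivalent standalone sketch, with a hypothetical file path:

import pandas as pd

data = pd.read_parquet("dataset.pq")  # hypothetical cached parquet file
categorical = [data[col].dtype.name == "category" for col in data.columns]
attribute_names = list(data.columns)
print(attribute_names, categorical)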
54 changes: 17 additions & 37 deletions openml/datasets/functions.py
@@ -416,8 +416,8 @@ def _name_to_id(

def get_datasets(
dataset_ids: list[str | int],
download_data: bool = True, # noqa: FBT001, FBT002
download_qualities: bool = True, # noqa: FBT001, FBT002
download_data: bool = False, # noqa: FBT001, FBT002
download_qualities: bool = False, # noqa: FBT001, FBT002
) -> list[OpenMLDataset]:
"""Download datasets.
@@ -452,12 +452,12 @@
@openml.utils.thread_safe_if_oslo_installed
def get_dataset( # noqa: C901, PLR0912
dataset_id: int | str,
download_data: bool | None = None, # Optional for deprecation warning; later again only bool
download_data: bool = False, # noqa: FBT002, FBT001
version: int | None = None,
error_if_multiple: bool = False, # noqa: FBT002, FBT001
cache_format: Literal["pickle", "feather"] = "pickle",
download_qualities: bool | None = None, # Same as above
download_features_meta_data: bool | None = None, # Same as above
download_qualities: bool = False, # noqa: FBT002, FBT001
download_features_meta_data: bool = False, # noqa: FBT002, FBT001
download_all_files: bool = False, # noqa: FBT002, FBT001
force_refresh_cache: bool = False, # noqa: FBT001, FBT002
) -> OpenMLDataset:
@@ -485,7 +485,7 @@ def get_dataset( # noqa: C901, PLR0912
----------
dataset_id : int or str
Dataset ID of the dataset to download
download_data : bool (default=True)
download_data : bool (default=False)
If True, also download the data file. Beware that some datasets are large and it might
make the operation noticeably slower. Metadata is also still retrieved.
If False, create the OpenMLDataset and only populate it with the metadata.
@@ -499,12 +499,12 @@
Format for caching the dataset - may be feather or pickle.
Note that the default 'pickle' option may load slower than feather when
the number of rows is very high.
download_qualities : bool (default=True)
download_qualities : bool (default=False)
Option to download 'qualities' meta-data in addition to the minimal dataset description.
If True, download and cache the qualities file.
If False, create the OpenMLDataset without qualities metadata. The data may later be added
to the OpenMLDataset through the `OpenMLDataset.load_metadata(qualities=True)` method.
download_features_meta_data : bool (default=True)
download_features_meta_data : bool (default=False)
Option to download 'features' meta-data in addition to the minimal dataset description.
If True, download and cache the features file.
If False, create the OpenMLDataset without features metadata. The data may later be added
@@ -523,28 +523,6 @@
dataset : :class:`openml.OpenMLDataset`
The downloaded dataset.
"""
# TODO(0.15): Remove the deprecation warning and make the default False; adjust types above
# and documentation. Also remove None-to-True-cases below
if any(
download_flag is None
for download_flag in [download_data, download_qualities, download_features_meta_data]
):
warnings.warn(
"Starting from Version 0.15 `download_data`, `download_qualities`, and `download_featu"
"res_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy "
"loading. To disable this message until version 0.15 explicitly set `download_data`, "
"`download_qualities`, and `download_features_meta_data` to a bool while calling "
"`get_dataset`.",
FutureWarning,
stacklevel=2,
)

download_data = True if download_data is None else download_data
download_qualities = True if download_qualities is None else download_qualities
download_features_meta_data = (
True if download_features_meta_data is None else download_features_meta_data
)

if download_all_files:
warnings.warn(
"``download_all_files`` is experimental and is likely to break with new releases.",
@@ -589,7 +567,6 @@ def get_dataset( # noqa: C901, PLR0912
if download_qualities:
qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)

arff_file = _get_dataset_arff(description) if download_data else None
if "oml:parquet_url" in description and download_data:
try:
parquet_file = _get_dataset_parquet(
@@ -598,10 +575,14 @@ def get_dataset( # noqa: C901, PLR0912
)
except urllib3.exceptions.MaxRetryError:
parquet_file = None
if parquet_file is None and arff_file:
logger.warning("Failed to download parquet, fallback on ARFF.")
else:
parquet_file = None

arff_file = None
if parquet_file is None and download_data:
logger.warning("Failed to download parquet, fallback on ARFF.")
arff_file = _get_dataset_arff(description)

remove_dataset_cache = False
except OpenMLServerException as e:
# if there was an exception
@@ -1259,10 +1240,9 @@ def _get_dataset_parquet(
if old_file_path.is_file():
old_file_path.rename(output_file_path)

# For this release, we want to be able to force a new download even if the
# parquet file is already present when ``download_all_files`` is set.
# For now, it would be the only way for the user to fetch the additional
# files in the bucket (no function exists on an OpenMLDataset to do this).
# The call below skips files that already exist on disk, so it avoids downloading
# the parquet file twice. To force the old behavior of always downloading everything,
# use the `force_refresh_cache` option of `get_dataset`.
if download_all_files:
openml._api_calls._download_minio_bucket(source=url, destination=cache_directory)

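A short usage sketch of the lazy-loading defaults introduced above; dataset id 61 is used purely for illustration:

import openml

# With the new defaults only the dataset description is fetched; the data file,
# qualities and feature meta-data are not downloaded up front.
dataset = openml.datasets.get_dataset(61)

# Opt back in to eager downloading where needed.
dataset = openml.datasets.get_dataset(61, download_data=True, download_qualities=True)

# force_refresh_cache re-downloads even if files are already cached, e.g. in
# combination with the experimental download_all_files option.
dataset = openml.datasets.get_dataset(61, download_data=True, force_refresh_cache=True)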
25 changes: 12 additions & 13 deletions openml/extensions/sklearn/extension.py
@@ -13,7 +13,6 @@
import traceback
import warnings
from collections import OrderedDict
from distutils.version import LooseVersion
from json.decoder import JSONDecodeError
from re import IGNORECASE
from typing import Any, Callable, List, Sized, cast
@@ -25,6 +24,7 @@
import sklearn.base
import sklearn.model_selection
import sklearn.pipeline
from packaging.version import Version

import openml
from openml.exceptions import PyOpenMLError
@@ -48,7 +48,7 @@
r"(?P<version>(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$",
)

sctypes = np.sctypes if LooseVersion(np.__version__) < "2.0" else np.core.sctypes
sctypes = np.sctypes if Version(np.__version__) < Version("2.0") else np.core.sctypes
SIMPLE_NUMPY_TYPES = [
nptype
for type_cat, nptypes in sctypes.items()
@@ -237,14 +237,13 @@ def _min_dependency_str(cls, sklearn_version: str) -> str:
-------
str
"""
openml_major_version = int(LooseVersion(openml.__version__).version[1])
# This explicit check is necessary to support existing entities on the OpenML servers
# that used the fixed dependency string (in the else block)
if openml_major_version > 11:
if Version(openml.__version__) > Version("0.11"):
# OpenML v0.11 onwards supports sklearn>=0.24
# assumption: 0.24 onwards sklearn should contain a _min_dependencies.py file with
# variables declared for extracting minimum dependency for that version
if LooseVersion(sklearn_version) >= "0.24":
if Version(sklearn_version) >= Version("0.24"):
from sklearn import _min_dependencies as _mindep

dependency_list = {
@@ -253,18 +252,18 @@ def _min_dependency_str(cls, sklearn_version: str) -> str:
"joblib": f"{_mindep.JOBLIB_MIN_VERSION}",
"threadpoolctl": f"{_mindep.THREADPOOLCTL_MIN_VERSION}",
}
elif LooseVersion(sklearn_version) >= "0.23":
elif Version(sklearn_version) >= Version("0.23"):
dependency_list = {
"numpy": "1.13.3",
"scipy": "0.19.1",
"joblib": "0.11",
"threadpoolctl": "2.0.0",
}
if LooseVersion(sklearn_version).version[2] == 0:
if Version(sklearn_version).micro == 0:
dependency_list.pop("threadpoolctl")
elif LooseVersion(sklearn_version) >= "0.21":
elif Version(sklearn_version) >= Version("0.21"):
dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"}
elif LooseVersion(sklearn_version) >= "0.19":
elif Version(sklearn_version) >= Version("0.19"):
dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"}
else:
dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
@@ -1226,8 +1225,8 @@ def _check_dependencies(
version = match.group("version")

module = importlib.import_module(dependency_name)
required_version = LooseVersion(version)
installed_version = LooseVersion(module.__version__) # type: ignore
required_version = Version(version)
installed_version = Version(module.__version__) # type: ignore

if operation == "==":
check = required_version == installed_version
@@ -1258,7 +1257,7 @@ def _serialize_type(self, o: Any) -> OrderedDict[str, str]:
np.int32: "np.int32",
np.int64: "np.int64",
}
if LooseVersion(np.__version__) < "1.24":
if Version(np.__version__) < Version("1.24"):
mapping[float] = "np.float"
mapping[int] = "np.int"

@@ -1278,7 +1277,7 @@ def _deserialize_type(self, o: str) -> Any:
}

# TODO(eddiebergman): Might be able to remove this
if LooseVersion(np.__version__) < "1.24":
if Version(np.__version__) < Version("1.24"):
mapping["np.float"] = np.float # type: ignore # noqa: NPY001
mapping["np.int"] = np.int # type: ignore # noqa: NPY001

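The migration from `distutils.version.LooseVersion` to `packaging.version.Version` above follows one pattern throughout: compare against `Version(...)` objects instead of raw strings, and read the `major`/`minor`/`micro` attributes instead of indexing `.version`. A minimal sketch:

from packaging.version import Version

v = Version("1.24.3")
print(v < Version("2.0"))         # True; replaces LooseVersion(np.__version__) < "2.0"
print(v.major, v.minor, v.micro)  # 1 24 3; .micro replaces LooseVersion(...).version[2]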
