From 7ec7692effa9b80b76dadfb2b57b3b3a513d81a4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 21:49:45 +0000 Subject: [PATCH 1/3] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.1.14 → v0.6.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.1.14...v0.6.9) - [github.com/pre-commit/mirrors-mypy: v1.8.0 → v1.11.2](https://github.com/pre-commit/mirrors-mypy/compare/v1.8.0...v1.11.2) - [github.com/python-jsonschema/check-jsonschema: 0.27.3 → 0.29.3](https://github.com/python-jsonschema/check-jsonschema/compare/0.27.3...0.29.3) - [github.com/pre-commit/pre-commit-hooks: v4.5.0 → v5.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.5.0...v5.0.0) --- .pre-commit-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5f13625a0..6598ae9e5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,20 +7,20 @@ files: | )/.*\.py$ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.14 + rev: v0.6.9 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix, --no-cache] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.8.0 + rev: v1.11.2 hooks: - id: mypy additional_dependencies: - types-requests - types-python-dateutil - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.27.3 + rev: 0.29.3 hooks: - id: check-github-workflows files: '^github/workflows/.*\.ya?ml$' @@ -28,7 +28,7 @@ repos: - id: check-dependabot files: '^\.github/dependabot\.ya?ml$' - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v5.0.0 hooks: - id: check-added-large-files files: ".*" From 232d9d1defee2ec7fa408edf83c297ba73a8c700 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 14 Oct 2024 
10:47:53 +0200 Subject: [PATCH 2/3] fix(pre-commit): Minor fixes --- .pre-commit-config.yaml | 2 +- openml/_api_calls.py | 19 ++-- openml/cli.py | 3 +- openml/config.py | 8 +- openml/datasets/dataset.py | 8 +- openml/datasets/functions.py | 31 ++---- openml/evaluations/functions.py | 22 ++-- openml/extensions/sklearn/extension.py | 78 ++++++++------ openml/flows/flow.py | 10 +- openml/flows/functions.py | 38 +++---- openml/runs/functions.py | 28 +++-- openml/runs/run.py | 2 +- openml/runs/trace.py | 22 ++-- openml/setups/functions.py | 9 +- openml/study/functions.py | 32 ++---- openml/tasks/functions.py | 9 +- openml/tasks/split.py | 6 +- openml/tasks/task.py | 14 +-- openml/testing.py | 2 +- openml/utils.py | 21 ++-- pyproject.toml | 140 +++++++++++++------------ 21 files changed, 236 insertions(+), 268 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6598ae9e5..e46a59318 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,7 +20,7 @@ repos: - types-requests - types-python-dateutil - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.29.3 + rev: 0.29.4 hooks: - id: check-github-workflows files: '^github/workflows/.*\.ya?ml$' diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 4f673186e..b74b50cb4 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -351,7 +351,7 @@ def __is_checksum_equal(downloaded_file_binary: bytes, md5_checksum: str | None return md5_checksum == md5_checksum_download -def _send_request( # noqa: C901 +def _send_request( # noqa: C901, PLR0912 request_method: str, url: str, data: DATA_TYPE, @@ -387,18 +387,15 @@ def _send_request( # noqa: C901 # -- Check if encoding is not UTF-8 perhaps if __is_checksum_equal(response.content, md5_checksum): raise OpenMLHashException( - "Checksum of downloaded file is unequal to the expected checksum {}" - "because the text encoding is not UTF-8 when downloading {}. 
" - "There might be a sever-sided issue with the file, " - "see: https://github.com/openml/openml-python/issues/1180.".format( - md5_checksum, - url, - ), + f"Checksum of downloaded file is unequal to the expected checksum" + f"{md5_checksum} because the text encoding is not UTF-8 when " + f"downloading {url}. There might be a sever-sided issue with the file, " + "see: https://github.com/openml/openml-python/issues/1180.", ) raise OpenMLHashException( - "Checksum of downloaded file is unequal to the expected checksum {} " - "when downloading {}.".format(md5_checksum, url), + f"Checksum of downloaded file is unequal to the expected checksum " + f"{md5_checksum} when downloading {url}.", ) return response @@ -464,7 +461,7 @@ def __parse_server_exception( server_exception = xmltodict.parse(response.text) except xml.parsers.expat.ExpatError as e: raise e - except Exception as e: # noqa: BLE001 + except Exception as e: # OpenML has a sophisticated error system # where information about failures is provided. try to parse this raise OpenMLServerError( diff --git a/openml/cli.py b/openml/cli.py index 5732442d0..d0a46e498 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -1,4 +1,5 @@ -""""Command Line Interface for `openml` to configure its settings.""" +"""Command Line Interface for `openml` to configure its settings.""" + from __future__ import annotations import argparse diff --git a/openml/config.py b/openml/config.py index 6a37537dc..b21c981e2 100644 --- a/openml/config.py +++ b/openml/config.py @@ -278,8 +278,8 @@ def _setup(config: _Config | None = None) -> None: _root_cache_directory.mkdir(exist_ok=True, parents=True) except PermissionError: openml_logger.warning( - "No permission to create openml cache directory at %s! This can result in " - "OpenML-Python not working properly." % _root_cache_directory, + f"No permission to create openml cache directory at {_root_cache_directory}!" 
+ " This can result in OpenML-Python not working properly.", ) if cache_exists: @@ -287,8 +287,8 @@ def _setup(config: _Config | None = None) -> None: else: _create_log_handlers(create_file_handler=False) openml_logger.warning( - "No permission to create OpenML directory at %s! This can result in OpenML-Python " - "not working properly." % config_dir, + f"No permission to create OpenML directory at {config_dir}! This can result in " + " OpenML-Python not working properly.", ) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 30febcba5..c9064ba70 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -156,14 +156,14 @@ def find_invalid_characters(string: str, pattern: str) -> str: ) if dataset_id is None: - pattern = "^[\x00-\x7F]*$" + pattern = "^[\x00-\x7f]*$" if description and not re.match(pattern, description): # not basiclatin (XSD complains) invalid_characters = find_invalid_characters(description, pattern) raise ValueError( f"Invalid symbols {invalid_characters} in description: {description}", ) - pattern = "^[\x00-\x7F]*$" + pattern = "^[\x00-\x7f]*$" if citation and not re.match(pattern, citation): # not basiclatin (XSD complains) invalid_characters = find_invalid_characters(citation, pattern) @@ -574,7 +574,7 @@ def _parse_data_from_file(self, data_file: Path) -> tuple[list[str], list[bool], def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]: try: data = pd.read_parquet(data_file) - except Exception as e: # noqa: BLE001 + except Exception as e: raise Exception(f"File: {data_file}") from e categorical = [data[c].dtype.name == "category" for c in data.columns] attribute_names = list(data.columns) @@ -816,7 +816,7 @@ def get_data( # noqa: C901, PLR0912, PLR0915 to_exclude.extend(self.ignore_attribute) if len(to_exclude) > 0: - logger.info("Going to remove the following attributes: %s" % to_exclude) + logger.info(f"Going to remove the following attributes: {to_exclude}") 
keep = np.array([column not in to_exclude for column in attribute_names]) data = data.loc[:, keep] if isinstance(data, pd.DataFrame) else data[:, keep] diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 410867b01..f7eee98d6 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -6,6 +6,7 @@ import warnings from collections import OrderedDict from pathlib import Path +from pyexpat import ExpatError from typing import TYPE_CHECKING, Any, overload from typing_extensions import Literal @@ -15,7 +16,6 @@ import pandas as pd import urllib3 import xmltodict -from pyexpat import ExpatError from scipy.sparse import coo_matrix import openml._api_calls @@ -85,8 +85,7 @@ def list_datasets( *, output_format: Literal["dataframe"], **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... @overload @@ -98,8 +97,7 @@ def list_datasets( tag: str | None, output_format: Literal["dataframe"], **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... @overload @@ -111,8 +109,7 @@ def list_datasets( tag: str | None = ..., output_format: Literal["dict"] = "dict", **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... def list_datasets( @@ -207,8 +204,7 @@ def _list_datasets( data_id: list | None = ..., output_format: Literal["dict"] = "dict", **kwargs: Any, -) -> dict: - ... +) -> dict: ... @overload @@ -216,8 +212,7 @@ def _list_datasets( data_id: list | None = ..., output_format: Literal["dataframe"] = "dataframe", **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... 
def _list_datasets( @@ -256,18 +251,16 @@ def _list_datasets( for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" if data_id is not None: - api_call += "/data_id/%s" % ",".join([str(int(i)) for i in data_id]) + api_call += "/data_id/{}".format(",".join([str(int(i)) for i in data_id])) return __list_datasets(api_call=api_call, output_format=output_format) @overload -def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict: - ... +def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... @overload -def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: - ... +def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... def __list_datasets( @@ -785,10 +778,8 @@ def create_dataset( # noqa: C901, PLR0912, PLR0915 if not is_row_id_an_attribute: raise ValueError( "'row_id_attribute' should be one of the data attribute. " - " Got '{}' while candidates are {}.".format( - row_id_attribute, - [attr[0] for attr in attributes_], - ), + f" Got '{row_id_attribute}' while candidates are" + f" {[attr[0] for attr in attributes_]}.", ) if isinstance(data, pd.DataFrame): diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index a854686d1..a39096a58 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -32,8 +32,7 @@ def list_evaluations( per_fold: bool | None = ..., sort_order: str | None = ..., output_format: Literal["dict", "object"] = "dict", -) -> dict: - ... +) -> dict: ... @overload @@ -51,8 +50,7 @@ def list_evaluations( per_fold: bool | None = ..., sort_order: str | None = ..., output_format: Literal["dataframe"] = ..., -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... 
def list_evaluations( @@ -204,24 +202,24 @@ def _list_evaluations( ------- dict of objects, or dataframe """ - api_call = "evaluation/list/function/%s" % function + api_call = f"evaluation/list/function/{function}" if kwargs is not None: for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" if tasks is not None: - api_call += "/task/%s" % ",".join([str(int(i)) for i in tasks]) + api_call += "/task/{}".format(",".join([str(int(i)) for i in tasks])) if setups is not None: - api_call += "/setup/%s" % ",".join([str(int(i)) for i in setups]) + api_call += "/setup/{}".format(",".join([str(int(i)) for i in setups])) if flows is not None: - api_call += "/flow/%s" % ",".join([str(int(i)) for i in flows]) + api_call += "/flow/{}".format(",".join([str(int(i)) for i in flows])) if runs is not None: - api_call += "/run/%s" % ",".join([str(int(i)) for i in runs]) + api_call += "/run/{}".format(",".join([str(int(i)) for i in runs])) if uploaders is not None: - api_call += "/uploader/%s" % ",".join([str(int(i)) for i in uploaders]) + api_call += "/uploader/{}".format(",".join([str(int(i)) for i in uploaders])) if study is not None: api_call += "/study/%d" % study if sort_order is not None: - api_call += "/sort_order/%s" % sort_order + api_call += f"/sort_order/{sort_order}" return __list_evaluations(api_call, output_format=output_format) @@ -236,7 +234,7 @@ def __list_evaluations( # Minimalistic check if the XML is useful if "oml:evaluations" not in evals_dict: raise ValueError( - "Error in return XML, does not contain " '"oml:evaluations": %s' % str(evals_dict), + "Error in return XML, does not contain " f'"oml:evaluations": {evals_dict!s}', ) assert isinstance(evals_dict["oml:evaluations"]["oml:evaluation"], list), type( diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 02322196e..2d40d03b8 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -48,12 +48,27 @@ 
r"(?P(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$", ) -sctypes = np.sctypes if Version(np.__version__) < Version("2.0") else np.core.sctypes +# NOTE(eddiebergman): This was imported before but became deprecated, +# as a result I just enumerated them manually by copy-ing and pasting, +# recommended solution in Numpy 2.0 guide was to explicitly list them. SIMPLE_NUMPY_TYPES = [ - nptype - for type_cat, nptypes in sctypes.items() - for nptype in nptypes # type: ignore - if type_cat != "others" + np.int8, + np.int16, + np.int32, + np.int64, + np.longlong, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.ulonglong, + np.float16, + np.float32, + np.float64, + np.longdouble, + np.complex64, + np.complex128, + np.clongdouble, ] SIMPLE_TYPES = (bool, int, float, str, *SIMPLE_NUMPY_TYPES) @@ -312,7 +327,7 @@ def flow_to_model( strict_version=strict_version, ) - def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0913, PLR0912 + def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0912 self, o: Any, components: dict | None = None, @@ -419,7 +434,7 @@ def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0913, PLR0912 strict_version=strict_version, ) else: - raise ValueError("Cannot flow_to_sklearn %s" % serialized_type) + raise ValueError(f"Cannot flow_to_sklearn {serialized_type}") else: rval = OrderedDict( @@ -979,17 +994,17 @@ def flatten_all(list_): # length 2 is for {VotingClassifier.estimators, # Pipeline.steps, FeatureUnion.transformer_list} # length 3 is for ColumnTransformer - msg = "Length of tuple of type {} does not match assumptions".format( - sub_component_type, + raise ValueError( + f"Length of tuple of type {sub_component_type}" + " does not match assumptions" ) - raise ValueError(msg) if isinstance(sub_component, str): if sub_component not in SKLEARN_PIPELINE_STRING_COMPONENTS: msg = ( "Second item of tuple does not match assumptions. 
" "If string, can be only 'drop' or 'passthrough' but" - "got %s" % sub_component + f"got {sub_component}" ) raise ValueError(msg) elif sub_component is None: @@ -1002,15 +1017,15 @@ def flatten_all(list_): elif not isinstance(sub_component, OpenMLFlow): msg = ( "Second item of tuple does not match assumptions. " - "Expected OpenMLFlow, got %s" % type(sub_component) + f"Expected OpenMLFlow, got {type(sub_component)}" ) raise TypeError(msg) if identifier in reserved_keywords: parent_model = f"{model.__module__}.{model.__class__.__name__}" - msg = "Found element shadowing official " "parameter for {}: {}".format( - parent_model, - identifier, + msg = ( + "Found element shadowing official " + f"parameter for {parent_model}: {identifier}" ) raise PyOpenMLError(msg) @@ -1035,9 +1050,9 @@ def flatten_all(list_): model=None, ) component_reference: OrderedDict[str, str | dict] = OrderedDict() - component_reference[ - "oml-python:serialized_object" - ] = COMPOSITION_STEP_CONSTANT + component_reference["oml-python:serialized_object"] = ( + COMPOSITION_STEP_CONSTANT + ) cr_value: dict[str, Any] = OrderedDict() cr_value["key"] = identifier cr_value["step_name"] = identifier @@ -1218,7 +1233,7 @@ def _check_dependencies( for dependency_string in dependencies_list: match = DEPENDENCIES_PATTERN.match(dependency_string) if not match: - raise ValueError("Cannot parse dependency %s" % dependency_string) + raise ValueError(f"Cannot parse dependency {dependency_string}") dependency_name = match.group("name") operation = match.group("operation") @@ -1237,7 +1252,7 @@ def _check_dependencies( installed_version > required_version or installed_version == required_version ) else: - raise NotImplementedError("operation '%s' is not supported" % operation) + raise NotImplementedError(f"operation '{operation}' is not supported") message = ( "Trying to deserialize a model with dependency " f"{dependency_string} not satisfied." 
@@ -1363,7 +1378,7 @@ def _serialize_cross_validator(self, o: Any) -> OrderedDict[str, str | dict]: with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always", DeprecationWarning) value = getattr(o, key, None) - if w is not None and len(w) and w[0].category == DeprecationWarning: + if w is not None and len(w) and w[0].category is DeprecationWarning: # if the parameter is deprecated, don't show it continue @@ -1812,9 +1827,9 @@ def _prediction_to_probabilities( # then we need to add a column full of zeros into the probabilities # for class 3 because the rest of the library expects that the # probabilities are ordered the same way as the classes are ordered). - message = "Estimator only predicted for {}/{} classes!".format( - proba_y.shape[1], - len(task.class_labels), + message = ( + f"Estimator only predicted for {proba_y.shape[1]}/{len(task.class_labels)}" + " classes!" ) warnings.warn(message, stacklevel=2) openml.config.logger.warning(message) @@ -2008,9 +2023,8 @@ def is_subcomponent_specification(values): pass else: raise TypeError( - "Subcomponent flow should be of type flow, but is {}".format( - type(subcomponent_flow), - ), + "Subcomponent flow should be of type flow, but is" + f" {type(subcomponent_flow)}", ) current = { @@ -2129,8 +2143,8 @@ def instantiate_model_from_hpo_class( """ if not self._is_hpo_class(model): raise AssertionError( - "Flow model %s is not an instance of sklearn.model_selection._search.BaseSearchCV" - % model, + f"Flow model {model} is not an instance of" + " sklearn.model_selection._search.BaseSearchCV", ) base_estimator = model.estimator base_estimator.set_params(**trace_iteration.get_parameters()) @@ -2197,8 +2211,8 @@ def _obtain_arff_trace( """ if not self._is_hpo_class(model): raise AssertionError( - "Flow model %s is not an instance of sklearn.model_selection._search.BaseSearchCV" - % model, + f"Flow model {model} is not an instance of " + "sklearn.model_selection._search.BaseSearchCV", ) if not 
hasattr(model, "cv_results_"): raise ValueError("model should contain `cv_results_`") @@ -2235,7 +2249,7 @@ def _obtain_arff_trace( # hyperparameter layer_sizes of MLPClassifier type = "STRING" # noqa: A001 else: - raise TypeError("Unsupported param type in param grid: %s" % key) + raise TypeError(f"Unsupported param type in param grid: {key}") # renamed the attribute param to parameter, as this is a required # OpenML convention - this also guards against name collisions diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 4e437e35c..a3ff50ca1 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -135,15 +135,13 @@ def __init__( # noqa: PLR0913 keys_parameters_meta_info = set(parameters_meta_info.keys()) if len(keys_parameters.difference(keys_parameters_meta_info)) > 0: raise ValueError( - "Parameter %s only in parameters, but not in " - "parameters_meta_info." - % str(keys_parameters.difference(keys_parameters_meta_info)), + f"Parameter {keys_parameters.difference(keys_parameters_meta_info)!s} only in " + "parameters, but not in parameters_meta_info.", ) if len(keys_parameters_meta_info.difference(keys_parameters)) > 0: raise ValueError( - "Parameter %s only in parameters_meta_info, " - "but not in parameters." - % str(keys_parameters_meta_info.difference(keys_parameters)), + f"Parameter {keys_parameters_meta_info.difference(keys_parameters)!s} only in " + " parameters_meta_info, but not in parameters.", ) self.external_version = external_version diff --git a/openml/flows/functions.py b/openml/flows/functions.py index b01e54b44..3d056ac60 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -140,8 +140,7 @@ def list_flows( tag: str | None = ..., output_format: Literal["dict"] = "dict", **kwargs: Any, -) -> dict: - ... +) -> dict: ... @overload @@ -152,8 +151,7 @@ def list_flows( *, output_format: Literal["dataframe"], **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... 
@overload @@ -163,8 +161,7 @@ def list_flows( tag: str | None, output_format: Literal["dataframe"], **kwargs: Any, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... def list_flows( @@ -243,18 +240,15 @@ def list_flows( @overload -def _list_flows(output_format: Literal["dict"] = ..., **kwargs: Any) -> dict: - ... +def _list_flows(output_format: Literal["dict"] = ..., **kwargs: Any) -> dict: ... @overload -def _list_flows(*, output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: - ... +def _list_flows(*, output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... @overload -def _list_flows(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: - ... +def _list_flows(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... def _list_flows( @@ -391,13 +385,11 @@ def get_flow_id( @overload -def __list_flows(api_call: str, output_format: Literal["dict"] = "dict") -> dict: - ... +def __list_flows(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... @overload -def __list_flows(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: - ... +def __list_flows(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... def __list_flows( @@ -453,7 +445,7 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None: while len(stack) > 0: current = stack.pop() if current.flow_id is None: - raise ValueError("Flow %s has no flow_id!" % current.name) + raise ValueError(f"Flow {current.name} has no flow_id!") for component in current.components.values(): stack.append(component) @@ -492,10 +484,10 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 Whether to ignore matching of flow descriptions. 
""" if not isinstance(flow1, OpenMLFlow): - raise TypeError("Argument 1 must be of type OpenMLFlow, but is %s" % type(flow1)) + raise TypeError(f"Argument 1 must be of type OpenMLFlow, but is {type(flow1)}") if not isinstance(flow2, OpenMLFlow): - raise TypeError("Argument 2 must be of type OpenMLFlow, but is %s" % type(flow2)) + raise TypeError(f"Argument 2 must be of type OpenMLFlow, but is {type(flow2)}") # TODO as they are actually now saved during publish, it might be good to # check for the equality of these as well. @@ -522,11 +514,11 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 for name in set(attr1.keys()).union(attr2.keys()): if name not in attr1: raise ValueError( - "Component %s only available in " "argument2, but not in argument1." % name, + f"Component {name} only available in " "argument2, but not in argument1.", ) if name not in attr2: raise ValueError( - "Component %s only available in " "argument2, but not in argument1." % name, + f"Component {name} only available in " "argument2, but not in argument1.", ) assert_flows_equal( attr1[name], @@ -549,9 +541,9 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 symmetric_difference = params_flow_1 ^ params_flow_2 if len(symmetric_difference) > 0: raise ValueError( - "Flow %s: parameter set of flow " + f"Flow {flow1.name}: parameter set of flow " "differs from the parameters stored " - "on the server." 
% flow1.name, + "on the server.", ) if ignore_parameter_values_on_older_children: diff --git a/openml/runs/functions.py b/openml/runs/functions.py index f7963297d..510f767d5 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -679,9 +679,9 @@ def _calculate_local_measure( # type: ignore user_defined_measures_per_fold[measure][rep_no][fold_no] = user_defined_measures_fold[ measure ] - user_defined_measures_per_sample[measure][rep_no][fold_no][ - sample_no - ] = user_defined_measures_fold[measure] + user_defined_measures_per_sample[measure][rep_no][fold_no][sample_no] = ( + user_defined_measures_fold[measure] + ) trace: OpenMLRunTrace | None = None if len(traces) > 0: @@ -783,13 +783,9 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 raise NotImplementedError(task.task_type) config.logger.info( - "Going to run model {} on dataset {} for repeat {} fold {} sample {}".format( - str(model), - openml.datasets.get_dataset(task.dataset_id).name, - rep_no, - fold_no, - sample_no, - ), + f"Going to run model {model!s} on " + f"dataset {openml.datasets.get_dataset(task.dataset_id).name} " + f"for repeat {rep_no} fold {fold_no} sample {sample_no}" ) ( pred_y, @@ -978,7 +974,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore else: raise ValueError( 'Could not find keys "value" or ' - '"array_data" in %s' % str(evaluation_dict.keys()), + f'"array_data" in {evaluation_dict.keys()!s}', ) if ( "@repeat" in evaluation_dict @@ -1211,15 +1207,15 @@ def _list_runs( # noqa: PLR0913 for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" if id is not None: - api_call += "/run/%s" % ",".join([str(int(i)) for i in id]) + api_call += "/run/{}".format(",".join([str(int(i)) for i in id])) if task is not None: - api_call += "/task/%s" % ",".join([str(int(i)) for i in task]) + api_call += "/task/{}".format(",".join([str(int(i)) for i in task])) if setup is not None: - api_call += "/setup/%s" % 
",".join([str(int(i)) for i in setup]) + api_call += "/setup/{}".format(",".join([str(int(i)) for i in setup])) if flow is not None: - api_call += "/flow/%s" % ",".join([str(int(i)) for i in flow]) + api_call += "/flow/{}".format(",".join([str(int(i)) for i in flow])) if uploader is not None: - api_call += "/uploader/%s" % ",".join([str(int(i)) for i in uploader]) + api_call += "/uploader/{}".format(",".join([str(int(i)) for i in uploader])) if study is not None: api_call += "/study/%d" % study if display_errors: diff --git a/openml/runs/run.py b/openml/runs/run.py index 766f8c97f..945264131 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -480,7 +480,7 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]: ] else: - raise NotImplementedError("Task type %s is not yet supported." % str(task.task_type)) + raise NotImplementedError(f"Task type {task.task_type!s} is not yet supported.") return arff_dict diff --git a/openml/runs/trace.py b/openml/runs/trace.py index 3b7d60c2f..bc9e1b5d6 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -80,8 +80,8 @@ def __post_init__(self) -> None: if self.parameters is not None and not isinstance(self.parameters, dict): raise TypeError( - "argument parameters is not an instance of OrderedDict, but %s" - % str(type(self.parameters)), + f"argument parameters is not an instance of OrderedDict, but" + f" {type(self.parameters)!s}", ) def get_parameters(self) -> dict[str, Any]: @@ -351,7 +351,7 @@ def _trace_from_arff_struct( for required_attribute in REQUIRED_ATTRIBUTES: if required_attribute not in attribute_idx: - raise ValueError("arff misses required attribute: %s" % required_attribute) + raise ValueError(f"arff misses required attribute: {required_attribute}") if "setup_string" in attribute_idx: raise ValueError(error_message) @@ -383,7 +383,7 @@ def _trace_from_arff_struct( else: raise ValueError( 'expected {"true", "false"} value for selected field, ' - "received: %s" % selected_value, + f"received: 
{selected_value}", ) parameters = { @@ -448,7 +448,7 @@ def trace_from_xml(cls, xml: str | Path | IO) -> OpenMLRunTrace: else: raise ValueError( 'expected {"true", "false"} value for ' - "selected field, received: %s" % selected_value, + f"selected field, received: {selected_value}", ) current = OpenMLTraceIteration( @@ -504,10 +504,8 @@ def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace: if list(param_keys) != list(trace_itr_keys): raise ValueError( "Cannot merge traces because the parameters are not equal: " - "{} vs {}".format( - list(trace_itr.parameters.keys()), - list(iteration.parameters.keys()), - ), + f"{list(trace_itr.parameters.keys())} vs " + f"{list(iteration.parameters.keys())}", ) if key in merged_trace: @@ -521,9 +519,9 @@ def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace: return cls(None, merged_trace) def __repr__(self) -> str: - return "[Run id: {}, {} trace iterations]".format( - -1 if self.run_id is None else self.run_id, - len(self.trace_iterations), + return ( + f"[Run id: {-1 if self.run_id is None else self.run_id}, " + f"{len(self.trace_iterations)} trace iterations]" ) def __iter__(self) -> Iterator[OpenMLTraceIteration]: diff --git a/openml/setups/functions.py b/openml/setups/functions.py index ee0c6d707..0bcd2b4e2 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -212,7 +212,7 @@ def _list_setups( """ api_call = "setup/list" if setup is not None: - api_call += "/setup/%s" % ",".join([str(int(i)) for i in setup]) + api_call += "/setup/{}".format(",".join([str(int(i)) for i in setup])) if kwargs is not None: for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" @@ -230,13 +230,12 @@ def __list_setups( # Minimalistic check if the XML is useful if "oml:setups" not in setups_dict: raise ValueError( - 'Error in return XML, does not contain "oml:setups":' " %s" % str(setups_dict), + 'Error in return XML, does not contain "oml:setups":' f" {setups_dict!s}", ) 
if "@xmlns:oml" not in setups_dict["oml:setups"]: raise ValueError( - "Error in return XML, does not contain " - '"oml:setups"/@xmlns:oml: %s' % str(setups_dict), + "Error in return XML, does not contain " f'"oml:setups"/@xmlns:oml: {setups_dict!s}', ) if setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri: @@ -364,7 +363,7 @@ def _create_setup_from_xml( else: raise ValueError( "Expected None, list or dict, received " - "something else: %s" % str(type(xml_parameters)), + f"something else: {type(xml_parameters)!s}", ) if _output_format in ["dataframe", "dict"]: diff --git a/openml/study/functions.py b/openml/study/functions.py index 9d726d286..7fdc6f636 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -90,7 +90,7 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy: ) result_dict = xmltodict.parse(xml_string, force_list=force_list_tags)["oml:study"] study_id = int(result_dict["oml:id"]) - alias = result_dict["oml:alias"] if "oml:alias" in result_dict else None + alias = result_dict.get("oml:alias", None) main_entity_type = result_dict["oml:main_entity_type"] if entity_type != main_entity_type: @@ -99,9 +99,7 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy: f", expected '{entity_type}'" ) - benchmark_suite = ( - result_dict["oml:benchmark_suite"] if "oml:benchmark_suite" in result_dict else None - ) + benchmark_suite = result_dict.get("oml:benchmark_suite", None) name = result_dict["oml:name"] description = result_dict["oml:description"] status = result_dict["oml:status"] @@ -300,7 +298,7 @@ def update_study_status(study_id: int, status: str) -> None: """ legal_status = {"active", "deactivated"} if status not in legal_status: - raise ValueError("Illegal status value. " "Legal values: %s" % legal_status) + raise ValueError("Illegal status value. 
" f"Legal values: {legal_status}") data = {"study_id": study_id, "status": status} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call("study/status/update", "post", data=data) result = xmltodict.parse(result_xml) @@ -442,8 +440,7 @@ def list_suites( status: str | None = ..., uploader: list[int] | None = ..., output_format: Literal["dict"] = "dict", -) -> dict: - ... +) -> dict: ... @overload @@ -453,8 +450,7 @@ def list_suites( status: str | None = ..., uploader: list[int] | None = ..., output_format: Literal["dataframe"] = "dataframe", -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... def list_suites( @@ -538,8 +534,7 @@ def list_studies( uploader: list[str] | None = ..., benchmark_suite: int | None = ..., output_format: Literal["dict"] = "dict", -) -> dict: - ... +) -> dict: ... @overload @@ -550,8 +545,7 @@ def list_studies( uploader: list[str] | None = ..., benchmark_suite: int | None = ..., output_format: Literal["dataframe"] = "dataframe", -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... def list_studies( @@ -637,13 +631,11 @@ def list_studies( @overload -def _list_studies(output_format: Literal["dict"] = "dict", **kwargs: Any) -> dict: - ... +def _list_studies(output_format: Literal["dict"] = "dict", **kwargs: Any) -> dict: ... @overload -def _list_studies(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: - ... +def _list_studies(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... def _list_studies( @@ -674,13 +666,11 @@ def _list_studies( @overload -def __list_studies(api_call: str, output_format: Literal["dict"] = "dict") -> dict: - ... +def __list_studies(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... @overload -def __list_studies(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: - ... +def __list_studies(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... 
def __list_studies( diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 9fd2e4be1..54030422d 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -98,8 +98,9 @@ def _get_estimation_procedure_list() -> list[dict[str, Any]]: raise ValueError( "Error in return XML, value of " "oml:estimationprocedures/@xmlns:oml is not " - "http://openml.org/openml, but %s" - % str(procs_dict["oml:estimationprocedures"]["@xmlns:oml"]), + "http://openml.org/openml, but {}".format( + str(procs_dict["oml:estimationprocedures"]["@xmlns:oml"]) + ), ) procs: list[dict[str, Any]] = [] @@ -276,7 +277,7 @@ def __list_tasks( # noqa: PLR0912, C901 raise ValueError( "Error in return XML, value of " '"oml:runs"/@xmlns:oml is not ' - '"http://openml.org/openml": %s' % str(tasks_dict), + f'"http://openml.org/openml": {tasks_dict!s}', ) assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"]) @@ -527,7 +528,7 @@ def _create_task_from_xml(xml: str) -> OpenMLTask: TaskType.LEARNING_CURVE: OpenMLLearningCurveTask, }.get(task_type) if cls is None: - raise NotImplementedError("Task type %s not supported." % common_kwargs["task_type"]) + raise NotImplementedError("Task type {} not supported.".format(common_kwargs["task_type"])) return cls(**common_kwargs) # type: ignore diff --git a/openml/tasks/split.py b/openml/tasks/split.py index 81105f1fd..ac538496e 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -177,9 +177,9 @@ def get(self, repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[np.ndarr If the specified repeat, fold, or sample is not known. 
""" if repeat not in self.split: - raise ValueError("Repeat %s not known" % str(repeat)) + raise ValueError(f"Repeat {repeat!s} not known") if fold not in self.split[repeat]: - raise ValueError("Fold %s not known" % str(fold)) + raise ValueError(f"Fold {fold!s} not known") if sample not in self.split[repeat][fold]: - raise ValueError("Sample %s not known" % str(sample)) + raise ValueError(f"Sample {sample!s} not known") return self.split[repeat][fold][sample] diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 064b834ba..e7d19bdce 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -207,7 +207,7 @@ def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: {"@name": "source_data", "#text": str(self.dataset_id)}, {"@name": "estimation_procedure", "#text": str(self.estimation_procedure_id)}, ] - if self.evaluation_measure is not None: # + if self.evaluation_measure is not None: oml_input.append({"@name": "evaluation_measures", "#text": self.evaluation_measure}) return { @@ -283,8 +283,7 @@ def get_X_and_y( ) -> tuple[ np.ndarray | scipy.sparse.spmatrix, np.ndarray | None, - ]: - ... + ]: ... @overload def get_X_and_y( @@ -292,8 +291,7 @@ def get_X_and_y( ) -> tuple[ pd.DataFrame, pd.Series | pd.DataFrame | None, - ]: - ... + ]: ... # TODO(eddiebergman): Do all OpenMLSupervisedTask have a `y`? def get_X_and_y( @@ -542,12 +540,10 @@ def __init__( # noqa: PLR0913 def get_X( self, dataset_format: Literal["array"] = "array", - ) -> np.ndarray | scipy.sparse.spmatrix: - ... + ) -> np.ndarray | scipy.sparse.spmatrix: ... @overload - def get_X(self, dataset_format: Literal["dataframe"]) -> pd.DataFrame: - ... + def get_X(self, dataset_format: Literal["dataframe"]) -> pd.DataFrame: ... 
def get_X( self, diff --git a/openml/testing.py b/openml/testing.py index 529a304d4..9016ff6a9 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -182,7 +182,7 @@ def _get_sentinel(self, sentinel: str | None = None) -> str: md5.update(str(time.time()).encode("utf-8")) md5.update(str(os.getpid()).encode("utf-8")) sentinel = md5.hexdigest()[:10] - sentinel = "TEST%s" % sentinel + sentinel = f"TEST{sentinel}" return sentinel def _add_sentinel_to_flow_name( diff --git a/openml/utils.py b/openml/utils.py index a03610512..66c4df800 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -35,8 +35,7 @@ def extract_xml_tags( node: Mapping[str, Any], *, allow_none: Literal[True] = ..., -) -> Any | None: - ... +) -> Any | None: ... @overload @@ -45,8 +44,7 @@ def extract_xml_tags( node: Mapping[str, Any], *, allow_none: Literal[False], -) -> Any: - ... +) -> Any: ... def extract_xml_tags( @@ -198,7 +196,7 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool: "user", } if entity_type not in legal_entities: - raise ValueError("Can't delete a %s" % entity_type) + raise ValueError(f"Can't delete a {entity_type}") url_suffix = "%s/%d" % (entity_type, entity_id) try: @@ -245,8 +243,7 @@ def _list_all( list_output_format: Literal["dict"] = ..., *args: P.args, **filters: P.kwargs, -) -> dict: - ... +) -> dict: ... @overload @@ -255,8 +252,7 @@ def _list_all( list_output_format: Literal["object"], *args: P.args, **filters: P.kwargs, -) -> dict: - ... +) -> dict: ... @overload @@ -265,8 +261,7 @@ def _list_all( list_output_format: Literal["dataframe"], *args: P.args, **filters: P.kwargs, -) -> pd.DataFrame: - ... +) -> pd.DataFrame: ... def _list_all( # noqa: C901, PLR0912 @@ -376,7 +371,7 @@ def _create_cache_directory(key: str) -> Path: try: cache_dir.mkdir(exist_ok=True, parents=True) - except Exception as e: # noqa: BLE001 + except Exception as e: raise openml.exceptions.OpenMLCacheException( f"Cannot create cache directory {cache_dir}." 
) from e @@ -412,7 +407,7 @@ def _create_cache_directory_for_id(key: str, id_: int) -> Path: """ cache_dir = _get_cache_dir_for_id(key, id_, create=True) if cache_dir.exists() and not cache_dir.is_dir(): - raise ValueError("%s cache dir exists but is not a directory!" % key) + raise ValueError(f"{key} cache dir exists but is not a directory!") cache_dir.mkdir(exist_ok=True, parents=True) return cache_dir diff --git a/pyproject.toml b/pyproject.toml index ffb1eb001..c5a3dac0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,12 +127,79 @@ markers = [ # https://github.com/charliermarsh/ruff [tool.ruff] -target-version = "py37" +target-version = "py38" line-length = 100 -show-source = true +output-format = "grouped" src = ["openml", "tests", "examples"] unsafe-fixes = true +exclude = [ + # TODO(eddiebergman): Tests should be re-enabled after the refactor + "tests", + # + ".bzr", + ".direnv", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv", + "docs", +] + +# Exclude a variety of commonly ignored directories. +[tool.ruff.lint.per-file-ignores] +"tests/*.py" = [ + "D100", # Undocumented public module + "D101", # Missing docstring in public class + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "S101", # Use of assert + "ANN201", # Missing return type annotation for public function + "FBT001", # Positional boolean argument + "PLR2004",# No use of magic numbers + "PD901", # X is a bad variable name. (pandas) + "TCH", # https://docs.astral.sh/ruff/rules/#flake8-type-checking-tch + "N803", # Argument name {name} should be lowercase +] +"openml/cli.py" = [ + "T201", # print found + "T203", # pprint found +] +"openml/__version__.py" = [ + "D100", # Undocumented public module +] +"__init__.py" = [ + "I002", # Missing required import (i.e. 
from __future__ import annotations) +] +"examples/*.py" = [ + "D101", # Missing docstring in public class + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D415", # First line should end with a . or ? or ! + "INP001", # File is part of an implicit namespace package, add an __init__.py + "I002", # Missing required import (i.e. from __future__ import annotations) + "E741", # Ambiguous variable name + "T201", # print found + "T203", # pprint found + "ERA001", # found commented out code + "E402", # Module level import not at top of cell + "E501", # Line too long +] + +[tool.ruff.lint] # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" @@ -212,74 +279,9 @@ ignore = [ "N802", # Public function name should be lower case (i.e. get_X()) ] -exclude = [ - # TODO(eddiebergman): Tests should be re-enabled after the refactor - "tests", - # - ".bzr", - ".direnv", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".nox", - ".pants.d", - ".ruff_cache", - ".svn", - ".tox", - ".venv", - "__pypackages__", - "_build", - "buck-out", - "build", - "dist", - "node_modules", - "venv", - "docs", -] - -# Exclude a variety of commonly ignored directories. -[tool.ruff.per-file-ignores] -"tests/*.py" = [ - "D100", # Undocumented public module - "D101", # Missing docstring in public class - "D102", # Missing docstring in public method - "D103", # Missing docstring in public function - "S101", # Use of assert - "ANN201", # Missing return type annotation for public function - "FBT001", # Positional boolean argument - "PLR2004",# No use of magic numbers - "PD901", # X is a bad variable name. 
(pandas) - "TCH", # https://docs.astral.sh/ruff/rules/#flake8-type-checking-tch - "N803", # Argument name {name} should be lowercase -] -"openml/cli.py" = [ - "T201", # print found - "T203", # pprint found -] -"openml/__version__.py" = [ - "D100", # Undocumented public module -] -"__init__.py" = [ - "I002", # Missing required import (i.e. from __future__ import annotations) -] -"examples/*.py" = [ - "D101", # Missing docstring in public class - "D102", # Missing docstring in public method - "D103", # Missing docstring in public function - "D415", # First line should end with a . or ? or ! - "INP001", # File is part of an implicit namespace package, add an __init__.py - "I002", # Missing required import (i.e. from __future__ import annotations) - "E741", # Ambigiuous variable name - "T201", # print found - "T203", # pprint found - "ERA001", # found commeneted out code - "E402", # Module level import not at top of cell - "E501", # Line too long -] -[tool.ruff.isort] +[tool.ruff.lint.isort] known-first-party = ["openml"] no-lines-before = ["future"] required-imports = ["from __future__ import annotations"] @@ -287,7 +289,7 @@ combine-as-imports = true extra-standard-library = ["typing_extensions"] force-wrap-aliases = true -[tool.ruff.pydocstyle] +[tool.ruff.lint.pydocstyle] convention = "numpy" [tool.mypy] From b830d7c4a9834341acd9545451757e3f461f8148 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 14 Oct 2024 10:49:15 +0200 Subject: [PATCH 3/3] maint: Update to 3.8 min --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c5a3dac0e..0496bf23d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -293,7 +293,7 @@ force-wrap-aliases = true convention = "numpy" [tool.mypy] -python_version = "3.7" +python_version = "3.8" packages = ["openml", "tests"] show_error_codes = true