diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index c3260e303..02322196e 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -13,7 +13,6 @@ import traceback import warnings from collections import OrderedDict -from distutils.version import LooseVersion from json.decoder import JSONDecodeError from re import IGNORECASE from typing import Any, Callable, List, Sized, cast @@ -25,6 +24,7 @@ import sklearn.base import sklearn.model_selection import sklearn.pipeline +from packaging.version import Version import openml from openml.exceptions import PyOpenMLError @@ -48,7 +48,7 @@ r"(?P(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$", ) -sctypes = np.sctypes if LooseVersion(np.__version__) < "2.0" else np.core.sctypes +sctypes = np.sctypes if Version(np.__version__) < Version("2.0") else np.core.sctypes SIMPLE_NUMPY_TYPES = [ nptype for type_cat, nptypes in sctypes.items() @@ -237,14 +237,13 @@ def _min_dependency_str(cls, sklearn_version: str) -> str: ------- str """ - openml_major_version = int(LooseVersion(openml.__version__).version[1]) # This explicit check is necessary to support existing entities on the OpenML servers # that used the fixed dependency string (in the else block) - if openml_major_version > 11: + if Version(openml.__version__) > Version("0.11"): # OpenML v0.11 onwards supports sklearn>=0.24 # assumption: 0.24 onwards sklearn should contain a _min_dependencies.py file with # variables declared for extracting minimum dependency for that version - if LooseVersion(sklearn_version) >= "0.24": + if Version(sklearn_version) >= Version("0.24"): from sklearn import _min_dependencies as _mindep dependency_list = { @@ -253,18 +252,18 @@ def _min_dependency_str(cls, sklearn_version: str) -> str: "joblib": f"{_mindep.JOBLIB_MIN_VERSION}", "threadpoolctl": f"{_mindep.THREADPOOLCTL_MIN_VERSION}", } - elif LooseVersion(sklearn_version) >= "0.23": + elif Version(sklearn_version) >= Version("0.23"): dependency_list = { "numpy": "1.13.3", "scipy": "0.19.1", "joblib": "0.11", "threadpoolctl": "2.0.0", } - if LooseVersion(sklearn_version).version[2] == 0: + if Version(sklearn_version).micro == 0: dependency_list.pop("threadpoolctl") - elif LooseVersion(sklearn_version) >= "0.21": + elif Version(sklearn_version) >= Version("0.21"): dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"} - elif LooseVersion(sklearn_version) >= "0.19": + elif Version(sklearn_version) >= Version("0.19"): dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"} else: dependency_list = {"numpy": "1.6.1", "scipy": "0.9"} @@ -1226,8 +1225,8 @@ def _check_dependencies( version = match.group("version") module = importlib.import_module(dependency_name) - required_version = LooseVersion(version) - installed_version = LooseVersion(module.__version__) # type: ignore + required_version = Version(version) + installed_version = Version(module.__version__) # type: ignore if operation == "==": check = required_version == installed_version @@ -1258,7 +1257,7 @@ def _serialize_type(self, o: Any) -> OrderedDict[str, str]: np.int32: "np.int32", np.int64: "np.int64", } - if LooseVersion(np.__version__) < "1.24": + if Version(np.__version__) < Version("1.24"): mapping[float] = "np.float" mapping[int] = "np.int" @@ -1278,7 +1277,7 @@ def _deserialize_type(self, o: str) -> Any: } # TODO(eddiebergman): Might be able to remove this - if LooseVersion(np.__version__) < "1.24": + if Version(np.__version__) < Version("1.24"): mapping["np.float"] = np.float # type: ignore # noqa: NPY001 mapping["np.int"] = np.int # type: ignore # noqa: NPY001 diff --git a/pyproject.toml b/pyproject.toml index 99ff2b804..b970a35b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "numpy>=1.6.2", "minio", "pyarrow", + "packaging", ] requires-python = ">=3.8" authors = [ diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index cb4d0bc11..e181aaa15 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -9,7 +9,7 @@ import unittest import warnings from collections import OrderedDict -from distutils.version import LooseVersion +from packaging.version import Version from typing import Any from unittest import mock @@ -179,24 +179,24 @@ def _serialization_test_helper( @pytest.mark.sklearn() def test_serialize_model(self): - max_features = "auto" if LooseVersion(sklearn.__version__) < "1.3" else "sqrt" + max_features = "auto" if Version(sklearn.__version__) < Version("1.3") else "sqrt" model = sklearn.tree.DecisionTreeClassifier( criterion="entropy", max_features=max_features, max_leaf_nodes=2000, ) - tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" + tree_name = "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes" fixture_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier" fixture_short_name = "sklearn.DecisionTreeClassifier" # str obtained from self.extension._get_sklearn_description(model) fixture_description = "A decision tree classifier." version_fixture = self.extension._min_dependency_str(sklearn.__version__) - presort_val = "false" if LooseVersion(sklearn.__version__) < "0.22" else '"deprecated"' + presort_val = "false" if Version(sklearn.__version__) < Version("0.22") else '"deprecated"' # min_impurity_decrease has been introduced in 0.20 # min_impurity_split has been deprecated in 0.20 - if LooseVersion(sklearn.__version__) < "0.19": + if Version(sklearn.__version__) < Version("0.19"): fixture_parameters = OrderedDict( ( ("class_weight", "null"), @@ -213,7 +213,7 @@ def test_serialize_model(self): ("splitter", '"best"'), ), ) - elif LooseVersion(sklearn.__version__) < "1.0": + elif Version(sklearn.__version__) < Version("1.0"): fixture_parameters = OrderedDict( ( ("class_weight", "null"), @@ -231,7 +231,7 @@ def test_serialize_model(self): ("splitter", '"best"'), ), ) - elif LooseVersion(sklearn.__version__) < "1.4": + elif Version(sklearn.__version__) < Version("1.4"): fixture_parameters = OrderedDict( ( ("class_weight", "null"), @@ -267,10 +267,10 @@ def test_serialize_model(self): ), ) - if LooseVersion(sklearn.__version__) >= "0.22": + if Version(sklearn.__version__) >= Version("0.22"): fixture_parameters.update({"ccp_alpha": "0.0"}) fixture_parameters.move_to_end("ccp_alpha", last=False) - if LooseVersion(sklearn.__version__) >= "0.24": + if Version(sklearn.__version__) >= Version("0.24"): del fixture_parameters["presort"] structure_fixture = {f"sklearn.tree.{tree_name}.DecisionTreeClassifier": []} @@ -307,30 +307,30 @@ def test_can_handle_flow(self): def test_serialize_model_clustering(self): model = sklearn.cluster.KMeans() - sklearn_version = LooseVersion(sklearn.__version__) - cluster_name = "k_means_" if sklearn_version < "0.22" else "_kmeans" + sklearn_version = Version(sklearn.__version__) + cluster_name = "k_means_" if sklearn_version < Version("0.22") else "_kmeans" fixture_name = f"sklearn.cluster.{cluster_name}.KMeans" fixture_short_name = "sklearn.KMeans" # str obtained from self.extension._get_sklearn_description(model) fixture_description = "K-Means clustering{}".format( - "" if sklearn_version < "0.22" else ".", + "" if sklearn_version < Version("0.22") else ".", ) version_fixture = self.extension._min_dependency_str(sklearn.__version__) n_jobs_val = "1" - if sklearn_version >= "0.20": + if sklearn_version >= Version("0.20"): n_jobs_val = "null" - if sklearn_version >= "0.23": + if sklearn_version >= Version("0.23"): n_jobs_val = '"deprecated"' - precomp_val = '"auto"' if sklearn_version < "0.23" else '"deprecated"' + precomp_val = '"auto"' if sklearn_version < Version("0.23") else '"deprecated"' n_init = "10" - if sklearn_version >= "1.2": + if sklearn_version >= Version("1.2"): n_init = '"warn"' - if sklearn_version >= "1.4": + if sklearn_version >= Version("1.4"): n_init = '"auto"' - algorithm = '"auto"' if sklearn_version < "1.1" else '"lloyd"' + algorithm = '"auto"' if sklearn_version < Version("1.1") else '"lloyd"' fixture_parameters = OrderedDict([ ("algorithm", algorithm), ("copy_x", "true"), @@ -345,7 +345,7 @@ def test_serialize_model_clustering(self): ("verbose", "0"), ]) - if sklearn_version >= "1.0": + if sklearn_version >= Version("1.0" ): fixture_parameters.pop("n_jobs") fixture_parameters.pop("precompute_distances") @@ -369,7 +369,7 @@ def test_serialize_model_clustering(self): @pytest.mark.sklearn() def test_serialize_model_with_subcomponent(self): - estimator_name = "base_estimator" if LooseVersion(sklearn.__version__) < "1.4" else "estimator" + estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" estimator_param = {estimator_name: sklearn.tree.DecisionTreeClassifier()} model = sklearn.ensemble.AdaBoostClassifier( n_estimators=100, @@ -377,9 +377,9 @@ def test_serialize_model_with_subcomponent(self): ) weight_name = "{}weight_boosting".format( - "" if LooseVersion(sklearn.__version__) < "0.22" else "_", + "" if Version(sklearn.__version__) < Version("0.22") else "_", ) - tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" + tree_name = "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes" fixture_name = ( f"sklearn.ensemble.{weight_name}.AdaBoostClassifier" f"({estimator_name}=sklearn.tree.{tree_name}.DecisionTreeClassifier)" @@ -417,7 +417,7 @@ def test_serialize_model_with_subcomponent(self): assert serialization.name == fixture_name assert serialization.class_name == fixture_class_name assert serialization.custom_name == fixture_short_name - if LooseVersion(sklearn.__version__) < "1.4": + if Version(sklearn.__version__) < Version("1.4"): assert serialization.description == fixture_description assert serialization.parameters["algorithm"] == '"SAMME.R"' assert isinstance(serialization.parameters[estimator_name], str) @@ -439,7 +439,7 @@ def test_serialize_pipeline(self): dummy = sklearn.dummy.DummyClassifier(strategy="prior") model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("dummy", dummy)]) - scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" + scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" fixture_name = ( "sklearn.pipeline.Pipeline(" f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler," @@ -464,7 +464,7 @@ def test_serialize_pipeline(self): assert serialization.name == fixture_name assert serialization.custom_name == fixture_short_name - if LooseVersion(sklearn.__version__) < "1.3": + if Version(sklearn.__version__) < Version("1.3"): # Newer versions of scikit-learn have update docstrings assert serialization.description == fixture_description self.assertDictEqual(structure, fixture_structure) @@ -473,9 +473,9 @@ def test_serialize_pipeline(self): # The parameters only have the name of base objects(not the whole flow) # as value # memory parameter has been added in 0.19, verbose in 0.21 - if LooseVersion(sklearn.__version__) < "0.19": + if Version(sklearn.__version__) < Version("0.19"): assert len(serialization.parameters) == 1 - elif LooseVersion(sklearn.__version__) < "0.21": + elif Version(sklearn.__version__) < Version("0.21"): assert len(serialization.parameters) == 2 else: assert len(serialization.parameters) == 3 @@ -508,8 +508,8 @@ def test_serialize_pipeline_clustering(self): km = sklearn.cluster.KMeans() model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("clusterer", km)]) - scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" - cluster_name = "k_means_" if LooseVersion(sklearn.__version__) < "0.22" else "_kmeans" + scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" + cluster_name = "k_means_" if Version(sklearn.__version__) < Version("0.22") else "_kmeans" fixture_name = ( "sklearn.pipeline.Pipeline(" f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler," @@ -533,7 +533,7 @@ def test_serialize_pipeline_clustering(self): assert serialization.name == fixture_name assert serialization.custom_name == fixture_short_name - if LooseVersion(sklearn.__version__) < "1.3": + if Version(sklearn.__version__) < Version("1.3"): # Newer versions of scikit-learn have update docstrings assert serialization.description == fixture_description self.assertDictEqual(structure, fixture_structure) @@ -542,9 +542,9 @@ def test_serialize_pipeline_clustering(self): # The parameters only have the name of base objects(not the whole flow) # as value # memory parameter has been added in 0.19 - if LooseVersion(sklearn.__version__) < "0.19": + if Version(sklearn.__version__) < Version("0.19"): assert len(serialization.parameters) == 1 - elif LooseVersion(sklearn.__version__) < "0.21": + elif Version(sklearn.__version__) < Version("0.21"): assert len(serialization.parameters) == 2 else: assert len(serialization.parameters) == 3 @@ -572,7 +572,7 @@ def test_serialize_pipeline_clustering(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) def test_serialize_column_transformer(self): @@ -592,7 +592,7 @@ def test_serialize_column_transformer(self): remainder="passthrough", ) - scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" + scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" fixture = ( "sklearn.compose._column_transformer.ColumnTransformer(" f"numeric=sklearn.preprocessing.{scaler_name}.StandardScaler," @@ -631,7 +631,7 @@ def test_serialize_column_transformer(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) def test_serialize_column_transformer_pipeline(self): @@ -652,8 +652,8 @@ def test_serialize_column_transformer_pipeline(self): model = sklearn.pipeline.Pipeline( steps=[("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier())], ) - scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" - tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" + scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" + tree_name = "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes" fixture_name = ( "sklearn.pipeline.Pipeline(" "transformer=sklearn.compose._column_transformer." @@ -692,19 +692,19 @@ def test_serialize_column_transformer_pipeline(self): ) structure = serialization.get_structure("name") assert serialization.name == fixture_name - if LooseVersion(sklearn.__version__) < "1.3": # Not yet up-to-date for later versions + if Version(sklearn.__version__) < Version("1.3"): # Not yet up-to-date for later versions assert serialization.description == fixture_description self.assertDictEqual(structure, fixture_structure) @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="Pipeline processing behaviour updated", ) def test_serialize_feature_union(self): - sparse_parameter = "sparse" if LooseVersion(sklearn.__version__) < "1.4" else "sparse_output" + sparse_parameter = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" ohe_params = {sparse_parameter: False} - if LooseVersion(sklearn.__version__) >= "0.20": + if Version(sklearn.__version__) >= Version("0.20"): ohe_params["categories"] = "auto" ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params) scaler = sklearn.preprocessing.StandardScaler() @@ -719,8 +719,8 @@ def test_serialize_feature_union(self): ) structure = serialization.get_structure("name") # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" - scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" + module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" fixture_name = ( "sklearn.pipeline.FeatureUnion(" f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," @@ -765,7 +765,7 @@ def test_serialize_feature_union(self): @pytest.mark.sklearn() def test_serialize_feature_union_switched_names(self): - ohe_params = {"categories": "auto"} if LooseVersion(sklearn.__version__) >= "0.20" else {} + ohe_params = {"categories": "auto"} if Version(sklearn.__version__) >= Version("0.20") else {} ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params) scaler = sklearn.preprocessing.StandardScaler() fu1 = sklearn.pipeline.FeatureUnion(transformer_list=[("ohe", ohe), ("scaler", scaler)]) @@ -787,8 +787,8 @@ def test_serialize_feature_union_switched_names(self): ) # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" - scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" + module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" assert ( fu1_serialization.name == "sklearn.pipeline.FeatureUnion(" f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," @@ -802,7 +802,7 @@ def test_serialize_feature_union_switched_names(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) >= "1.4", + Version(sklearn.__version__) >= Version("1.4"), "AdaBoost parameter name changed as did the way its forwarded to GridSearchCV", ) def test_serialize_complex_flow(self): @@ -836,15 +836,15 @@ def test_serialize_complex_flow(self): ) structure = serialized.get_structure("name") # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" + module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" ohe_name = "sklearn.preprocessing.%s.OneHotEncoder" % module_name_encoder scaler_name = "sklearn.preprocessing.{}.StandardScaler".format( - "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data", + "data" if Version(sklearn.__version__) < Version("0.22") else "_data", ) tree_name = "sklearn.tree.{}.DecisionTreeClassifier".format( - "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes", + "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes", ) - weight_name = "weight" if LooseVersion(sklearn.__version__) < "0.22" else "_weight" + weight_name = "weight" if Version(sklearn.__version__) < Version("0.22") else "_weight" boosting_name = "sklearn.ensemble.{}_boosting.AdaBoostClassifier(base_estimator={})".format( weight_name, tree_name, @@ -870,7 +870,7 @@ def test_serialize_complex_flow(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.21", + Version(sklearn.__version__) < Version("0.21"), reason="Pipeline till 0.20 doesn't support 'passthrough'", ) def test_serialize_strings_as_pipeline_steps(self): @@ -971,7 +971,7 @@ def test_serialize_strings_as_pipeline_steps(self): @pytest.mark.sklearn() def test_serialize_type(self): supported_types = [float, np.float32, np.float64, int, np.int32, np.int64] - if LooseVersion(np.__version__) < "1.24": + if Version(np.__version__) < Version("1.24"): supported_types.append(float) supported_types.append(int) @@ -1294,7 +1294,7 @@ def test_paralizable_check(self): # using this param distribution should not raise an exception legal_param_dist = {"n_estimators": [2, 3, 4]} - estimator_name = "base_estimator" if LooseVersion(sklearn.__version__) < "1.4" else "estimator" + estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" legal_models = [ sklearn.ensemble.RandomForestClassifier(), sklearn.ensemble.RandomForestClassifier(n_jobs=5), @@ -1320,7 +1320,7 @@ def test_paralizable_check(self): sklearn.model_selection.GridSearchCV(multicore_bagging, illegal_param_dist), ] - if LooseVersion(sklearn.__version__) < "0.20": + if Version(sklearn.__version__) < Version("0.20"): has_refit_time = [False, False, False, False, False, False, False, False, False] else: has_refit_time = [False, False, False, False, False, False, True, True, False] @@ -1336,44 +1336,44 @@ def test_paralizable_check(self): @pytest.mark.sklearn() def test__get_fn_arguments_with_defaults(self): - sklearn_version = LooseVersion(sklearn.__version__) - if sklearn_version < "0.19": + sklearn_version = Version(sklearn.__version__) + if sklearn_version < Version("0.19"): fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 15), (sklearn.tree.DecisionTreeClassifier.__init__, 12), (sklearn.pipeline.Pipeline.__init__, 0), ] - elif sklearn_version < "0.21": + elif sklearn_version < Version("0.21"): fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 16), (sklearn.tree.DecisionTreeClassifier.__init__, 13), (sklearn.pipeline.Pipeline.__init__, 1), ] - elif sklearn_version < "0.22": + elif sklearn_version < Version("0.22"): fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 16), (sklearn.tree.DecisionTreeClassifier.__init__, 13), (sklearn.pipeline.Pipeline.__init__, 2), ] - elif sklearn_version < "0.23": + elif sklearn_version < Version("0.23"): fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 18), (sklearn.tree.DecisionTreeClassifier.__init__, 14), (sklearn.pipeline.Pipeline.__init__, 2), ] - elif sklearn_version < "0.24": + elif sklearn_version < Version("0.24"): fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 18), (sklearn.tree.DecisionTreeClassifier.__init__, 14), (sklearn.pipeline.Pipeline.__init__, 2), ] - elif sklearn_version < "1.0": + elif sklearn_version < Version("1.0"): fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 18), (sklearn.tree.DecisionTreeClassifier.__init__, 13), (sklearn.pipeline.Pipeline.__init__, 2), ] - elif sklearn_version < "1.4": + elif sklearn_version < Version("1.4"): fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 17), (sklearn.tree.DecisionTreeClassifier.__init__, 12), @@ -1410,13 +1410,13 @@ def test_deserialize_with_defaults(self): pipe_orig = sklearn.pipeline.Pipeline(steps=steps) pipe_adjusted = sklearn.clone(pipe_orig) - if LooseVersion(sklearn.__version__) < "0.23": + if Version(sklearn.__version__) < Version("0.23"): params = { "Imputer__strategy": "median", "OneHotEncoder__sparse": False, "Estimator__min_samples_leaf": 42, } - elif LooseVersion(sklearn.__version__) < "1.4": + elif Version(sklearn.__version__) < Version("1.4"): params = { "Imputer__strategy": "mean", "OneHotEncoder__sparse": True, @@ -1455,13 +1455,13 @@ def test_deserialize_adaboost_with_defaults(self): pipe_orig = sklearn.pipeline.Pipeline(steps=steps) pipe_adjusted = sklearn.clone(pipe_orig) - if LooseVersion(sklearn.__version__) < "0.22": + if Version(sklearn.__version__) < Version("0.22"): params = { "Imputer__strategy": "median", "OneHotEncoder__sparse": False, "Estimator__n_estimators": 10, } - elif LooseVersion(sklearn.__version__) < "1.4": + elif Version(sklearn.__version__) < Version("1.4"): params = { "Imputer__strategy": "mean", "OneHotEncoder__sparse": True, @@ -1504,11 +1504,11 @@ def test_deserialize_complex_with_defaults(self): pipe_orig = sklearn.pipeline.Pipeline(steps=steps) pipe_adjusted = sklearn.clone(pipe_orig) - impute_strategy = "median" if LooseVersion(sklearn.__version__) < "0.23" else "mean" - sparse = LooseVersion(sklearn.__version__) >= "0.23" - sparse_parameter = "sparse" if LooseVersion(sklearn.__version__) < "1.4" else "sparse_output" + impute_strategy = "median" if Version(sklearn.__version__) < Version("0.23") else "mean" + sparse = Version(sklearn.__version__) >= Version("0.23") + sparse_parameter = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" estimator_name = ( - "base_estimator" if LooseVersion(sklearn.__version__) < "1.2" else "estimator" + "base_estimator" if Version(sklearn.__version__) < Version("1.2") else "estimator" ) params = { "Imputer__strategy": impute_strategy, @@ -1532,7 +1532,7 @@ def test_deserialize_complex_with_defaults(self): @pytest.mark.sklearn() def test_openml_param_name_to_sklearn(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) - estimator_name = "base_estimator" if LooseVersion(sklearn.__version__) < "1.4" else "estimator" + estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" boosting = sklearn.ensemble.AdaBoostClassifier( **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, ) @@ -1569,13 +1569,13 @@ def test_openml_param_name_to_sklearn(self): def test_obtain_parameter_values_flow_not_from_server(self): model = sklearn.linear_model.LogisticRegression(solver="lbfgs") flow = self.extension.model_to_flow(model) - logistic_name = "logistic" if LooseVersion(sklearn.__version__) < "0.22" else "_logistic" + logistic_name = "logistic" if Version(sklearn.__version__) < Version("0.22") else "_logistic" msg = f"Flow sklearn.linear_model.{logistic_name}.LogisticRegression has no flow_id!" with pytest.raises(ValueError, match=msg): self.extension.obtain_parameter_values(flow) - estimator_name = "base_estimator" if LooseVersion(sklearn.__version__) < "1.4" else "estimator" + estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" model = sklearn.ensemble.AdaBoostClassifier( **{ estimator_name: sklearn.linear_model.LogisticRegression( @@ -1768,7 +1768,7 @@ def test_run_model_on_fold_classification_1_array(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.21", + Version(sklearn.__version__) < Version("0.21"), reason="SimpleImputer, ColumnTransformer available only after 0.19 and " "Pipeline till 0.20 doesn't support indexing and 'passthrough'", ) @@ -1786,7 +1786,7 @@ def test_run_model_on_fold_classification_1_dataframe(self): y_test = y.iloc[test_indices] # Helper functions to return required columns for ColumnTransformer - sparse = {"sparse" if LooseVersion(sklearn.__version__) < "1.4" else "sparse_output": False} + sparse = {"sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output": False} cat_imp = make_pipeline( SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore", **sparse), @@ -2173,7 +2173,7 @@ def test_trim_flow_name(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.21", + Version(sklearn.__version__) < Version("0.21"), reason="SimpleImputer, ColumnTransformer available only after 0.19 and " "Pipeline till 0.20 doesn't support indexing and 'passthrough'", ) @@ -2230,7 +2230,7 @@ def test_run_on_model_with_empty_steps(self): assert isinstance(flow.components["dummystep"], OpenMLFlow) assert flow.components["dummystep"].name == "passthrough" assert isinstance(flow.components["classifier"], OpenMLFlow) - if LooseVersion(sklearn.__version__) < "0.22": + if Version(sklearn.__version__) < Version("0.22"): assert flow.components["classifier"].name == "sklearn.svm.classes.SVC" else: assert flow.components["classifier"].name == "sklearn.svm._classes.SVC" @@ -2276,7 +2276,7 @@ def test_sklearn_serialization_with_none_step(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) def test_failed_serialization_of_custom_class(self): @@ -2313,7 +2313,7 @@ def test_failed_serialization_of_custom_class(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) def test_setupid_with_column_transformer(self): diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 2e81c7ae3..dafbeaf3c 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -6,7 +6,7 @@ import hashlib import re import time -from distutils.version import LooseVersion +from packaging.version import Version from unittest import mock import pytest @@ -156,7 +156,7 @@ def test_from_xml_to_xml(self): @pytest.mark.sklearn() def test_to_xml_from_xml(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) - estimator_name = "base_estimator" if LooseVersion(sklearn.__version__) < "1.4" else "estimator" + estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" boosting = sklearn.ensemble.AdaBoostClassifier( **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, ) @@ -269,7 +269,7 @@ def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! # should not throw error as it contains two differentiable forms of # Bagging i.e., Bagging(Bagging(J48)) and Bagging(J48) - estimator_name = "base_estimator" if LooseVersion(sklearn.__version__) < "1.4" else "estimator" + estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" semi_legal = sklearn.ensemble.BaggingClassifier( **{ estimator_name: sklearn.ensemble.BaggingClassifier( @@ -311,7 +311,7 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): get_flow_mock.return_value = flow_copy flow_exists_mock.return_value = 1 - if LooseVersion(sklearn.__version__) < "0.22": + if Version(sklearn.__version__) < Version("0.22"): fixture = ( "The flow on the server is inconsistent with the local flow. " "The server flow ID is 1. Please check manually and remove " @@ -375,9 +375,9 @@ def test_existing_flow_exists(self): # create a flow nb = sklearn.naive_bayes.GaussianNB() - sparse = "sparse" if LooseVersion(sklearn.__version__) < "1.4" else "sparse_output" + sparse = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" ohe_params = {sparse: False, "handle_unknown": "ignore"} - if LooseVersion(sklearn.__version__) >= "0.20": + if Version(sklearn.__version__) >= Version("0.20"): ohe_params["categories"] = "auto" steps = [ ("imputation", SimpleImputer(strategy="median")), @@ -418,7 +418,7 @@ def test_sklearn_to_upload_to_flow(self): # Test a more complicated flow ohe_params = {"handle_unknown": "ignore"} - if LooseVersion(sklearn.__version__) >= "0.20": + if Version(sklearn.__version__) >= Version("0.20"): ohe_params["categories"] = "auto" ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params) scaler = sklearn.preprocessing.StandardScaler(with_mean=False) @@ -428,7 +428,7 @@ def test_sklearn_to_upload_to_flow(self): percentile=30, ) fu = sklearn.pipeline.FeatureUnion(transformer_list=[("pca", pca), ("fs", fs)]) - estimator_name = "base_estimator" if LooseVersion(sklearn.__version__) < "1.4" else "estimator" + estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" boosting = sklearn.ensemble.AdaBoostClassifier( **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, ) @@ -499,8 +499,8 @@ def test_sklearn_to_upload_to_flow(self): assert new_flow is not flow # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" - if LooseVersion(sklearn.__version__) < "0.22": + module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + if Version(sklearn.__version__) < Version("0.22"): fixture_name = ( f"{sentinel}sklearn.model_selection._search.RandomizedSearchCV(" "estimator=sklearn.pipeline.Pipeline(" diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 6fd2bb765..f9ce97c2f 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -5,7 +5,7 @@ import functools import unittest from collections import OrderedDict -from distutils.version import LooseVersion +from packaging.version import Version from unittest import mock from unittest.mock import patch @@ -279,7 +279,7 @@ def test_are_flows_equal_ignore_if_older(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="OrdinalEncoder introduced in 0.20. " "No known models with list of lists parameters in older versions.", ) @@ -334,7 +334,7 @@ def test_get_flow_reinstantiate_model_no_extension(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) == "0.19.1", + Version(sklearn.__version__) == Version("0.19.1"), reason="Requires scikit-learn!=0.19.1, because target flow is from that version.", ) @pytest.mark.production() @@ -353,7 +353,7 @@ def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception( @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) >= "1.0.0", + Version(sklearn.__version__) >= Version("1.0.0"), reason="Requires scikit-learn < 1.0.0.", # Because scikit-learn dropped min_impurity_split hyperparameter in 1.0, # and the requested flow is from 1.0.0 exactly. @@ -367,8 +367,8 @@ def test_get_flow_reinstantiate_flow_not_strict_post_1(self): @pytest.mark.sklearn() @unittest.skipIf( - (LooseVersion(sklearn.__version__) < "0.23.2") - or (LooseVersion(sklearn.__version__) >= "1.0"), + (Version(sklearn.__version__) < Version("0.23.2")) + or (Version(sklearn.__version__) >= Version("1.0")), reason="Requires scikit-learn 0.23.2 or ~0.24.", # Because these still have min_impurity_split, but with new scikit-learn module structure." ) @@ -381,7 +381,7 @@ def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) > "0.23", + Version(sklearn.__version__) > Version("0.23"), reason="Requires scikit-learn<=0.23, because the scikit-learn module structure changed.", ) @pytest.mark.production() diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 2106173da..40a778d8b 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -7,7 +7,7 @@ import time import unittest import warnings -from distutils.version import LooseVersion +from packaging.version import Version from unittest import mock import arff @@ -249,7 +249,7 @@ def _perform_run( "sklearn.model_selection._search.GridSearchCV", "sklearn.pipeline.Pipeline", ] - if LooseVersion(sklearn.__version__) < "0.22": + if Version(sklearn.__version__) < Version("0.22"): classes_without_random_state.append("sklearn.linear_model.base.LinearRegression") else: classes_without_random_state.append("sklearn.linear_model._base.LinearRegression") @@ -680,7 +680,7 @@ def test_run_and_upload_pipeline_dummy_pipeline(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) def test_run_and_upload_column_transformer_pipeline(self): @@ -745,7 +745,7 @@ def get_ct_cf(nominal_indices, numeric_indices): @pytest.mark.sklearn() @unittest.skip("https://github.com/openml/OpenML/issues/1180") @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) @mock.patch("warnings.warn") @@ -796,7 +796,7 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock): @pytest.mark.sklearn() def test_run_and_upload_gridsearch(self): - estimator_name = "base_estimator" if LooseVersion(sklearn.__version__) < "1.4" else "estimator" + estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" gridsearch = GridSearchCV( BaggingClassifier(**{estimator_name: SVC()}), {f"{estimator_name}__C": [0.01, 0.1, 10], f"{estimator_name}__gamma": [0.01, 0.1, 10]}, @@ -935,7 +935,7 @@ def test_learning_curve_task_2(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.21", + Version(sklearn.__version__) < Version("0.21"), reason="Pipelines don't support indexing (used for the assert check)", ) def test_initialize_cv_from_run(self): @@ -998,7 +998,7 @@ def _test_local_evaluations(self, run): (sklearn.metrics.precision_score, {"average": "macro"}), (sklearn.metrics.brier_score_loss, {}), ] - if LooseVersion(sklearn.__version__) < "0.23": + if Version(sklearn.__version__) < Version("0.23"): tests.append((sklearn.metrics.jaccard_similarity_score, {})) else: tests.append((sklearn.metrics.jaccard_score, {})) @@ -1030,7 +1030,7 @@ def test_local_run_swapped_parameter_order_model(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) def test_local_run_swapped_parameter_order_flow(self): @@ -1059,7 +1059,7 @@ def test_local_run_swapped_parameter_order_flow(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) def test_local_run_metric_score(self): @@ -1097,7 +1097,7 @@ def test_online_run_metric_score(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) def test_initialize_model_from_run(self): @@ -1159,7 +1159,7 @@ def test_initialize_model_from_run(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) def test__run_exists(self): @@ -1333,7 +1333,7 @@ def test_run_with_illegal_flow_id_1_after_load(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="OneHotEncoder cannot handle mixed type DataFrame as input", ) def test__run_task_get_arffcontent(self): @@ -1341,7 +1341,7 @@ def test__run_task_get_arffcontent(self): num_instances = 3196 num_folds = 10 num_repeats = 1 - loss = "log" if LooseVersion(sklearn.__version__) < "1.3" else "log_loss" + loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss" clf = make_pipeline( OneHotEncoder(handle_unknown="ignore"), @@ -1572,7 +1572,7 @@ def test_get_runs_list_by_tag(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) def test_run_on_dataset_with_missing_labels_dataframe(self): @@ -1609,7 +1609,7 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) def test_run_on_dataset_with_missing_labels_array(self): @@ -1757,7 +1757,7 @@ def test_format_prediction_task_regression(self): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.21", + Version(sklearn.__version__) < Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", ) @mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs") @@ -1767,10 +1767,10 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): x, y = task.get_X_and_y(dataset_format="dataframe") num_instances = x.shape[0] line_length = 6 + len(task.class_labels) - loss = "log" if LooseVersion(sklearn.__version__) < "1.3" else "log_loss" + loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss" clf = SGDClassifier(loss=loss, random_state=1) n_jobs = 2 - backend = "loky" if LooseVersion(joblib.__version__) > "0.11" else "multiprocessing" + backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing" with parallel_backend(backend, n_jobs=n_jobs): res = openml.runs.functions._run_task_get_arffcontent( extension=self.extension, @@ -1815,7 +1815,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.21", + Version(sklearn.__version__) < Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", ) @mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs") @@ -1826,7 +1826,7 @@ def test_joblib_backends(self, parallel_mock): num_instances = x.shape[0] line_length = 6 + len(task.class_labels) - backend_choice = "loky" if LooseVersion(joblib.__version__) > "0.11" else "multiprocessing" + backend_choice = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing" for n_jobs, backend, call_count in [ (1, backend_choice, 10), (2, backend_choice, 10), @@ -1873,7 +1873,7 @@ def test_joblib_backends(self, parallel_mock): assert parallel_mock.call_count == call_count @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", + Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) def test_delete_run(self): diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index b3f418756..9e5cb4e5e 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -2,7 +2,7 @@ from __future__ import annotations import unittest -from distutils.version import LooseVersion +from packaging.version import Version import pytest import sklearn @@ -17,7 +17,7 @@ class TestStudyFunctions(TestBase): @pytest.mark.sklearn() @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.24", + Version(sklearn.__version__) < Version("0.24"), reason="columntransformer introduction in 0.24.0", ) def test_Figure1a(self):