diff --git a/setup.cfg b/setup.cfg
index 60b49b7..e73657a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -57,6 +57,7 @@ install_requires =
     pydantic
     jupyter
     notebook
+    gpboost
 
     importlib-metadata; python_version<"3.8"
 
diff --git a/src/hyperimpute/plugins/prediction/classifiers/plugin_gpboost.py b/src/hyperimpute/plugins/prediction/classifiers/plugin_gpboost.py
new file mode 100644
index 0000000..3b0c472
--- /dev/null
+++ b/src/hyperimpute/plugins/prediction/classifiers/plugin_gpboost.py
@@ -0,0 +1,121 @@
+# stdlib
+import multiprocessing
+from typing import Any, List, Optional
+
+# third party
+from gpboost import GPBoostClassifier
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+# hyperimpute absolute
+import hyperimpute.plugins.core.params as params
+import hyperimpute.plugins.prediction.classifiers.base as base
+
+
+class GPBoostPlugin(base.ClassifierPlugin):
+    """Classification plugin based on the GPBoost classifier.
+
+    Args:
+        n_estimators: int
+            The maximum number of estimators at which boosting is terminated.
+        max_depth: int
+            Maximum depth of a tree.
+        reg_lambda: float
+            L2 regularization term on weights.
+        reg_alpha: float
+            L1 regularization term on weights.
+        colsample_bytree: float
+            Subsample ratio of columns when constructing each tree.
+        subsample: float
+            Subsample ratio of the training instances.
+        learning_rate: float
+            Boosting learning rate.
+        boosting_type: int
+            Index into ["gbdt", "goss", "dart"] selecting the booster.
+        min_child_weight: float
+            Minimum sum of instance weight (Hessian) needed in a child.
+        random_state: int
+            Random number seed.
+
+
+    Example:
+        >>> from hyperimpute.plugins.prediction import Predictions
+        >>> plugin = Predictions(category="classifiers").get("gpboost")
+        >>> from sklearn.datasets import load_iris
+        >>> X, y = load_iris(return_X_y=True)
+        >>> plugin.fit_predict(X, y)
+    """
+
+    boosting_type = ["gbdt", "goss", "dart"]
+
+    def __init__(
+        self,
+        boosting_type: int = 0,
+        max_depth: Optional[int] = 3,
+        n_estimators: int = 100,
+        reg_lambda: float = 0,
+        reg_alpha: float = 0,
+        colsample_bytree: float = 1.0,
+        subsample: float = 1.0,
+        learning_rate: float = 1e-3,
+        min_child_weight: float = 0.001,
+        n_jobs: int = max(1, int(multiprocessing.cpu_count() / 2)),
+        random_state: int = 0,
+        hyperparam_search_iterations: Optional[int] = None,
+        **kwargs: Any
+    ) -> None:
+        super().__init__(**kwargs)
+        if hyperparam_search_iterations:
+            n_estimators = int(hyperparam_search_iterations)
+
+        self.model = GPBoostClassifier(
+            boosting_type=GPBoostPlugin.boosting_type[boosting_type],
+            n_estimators=n_estimators,
+            max_depth=max_depth,
+            reg_lambda=reg_lambda,
+            reg_alpha=reg_alpha,
+            colsample_bytree=colsample_bytree,
+            subsample=subsample,
+            learning_rate=learning_rate,
+            min_child_weight=min_child_weight,
+            random_state=random_state,
+            n_jobs=n_jobs,
+            **kwargs,
+        )
+
+    @staticmethod
+    def name() -> str:
+        return "gpboost"
+
+    @staticmethod
+    def hyperparameter_space(*args: Any, **kwargs: Any) -> List[params.Params]:
+        return [
+            params.Float("reg_lambda", 1e-3, 10.0),
+            params.Float("reg_alpha", 1e-3, 10.0),
+            params.Float("colsample_bytree", 0.1, 0.9),
+            params.Float("subsample", 0.1, 0.9),
+            params.Categorical("learning_rate", [1e-4, 1e-3, 1e-2]),
+            params.Integer("max_depth", 2, 5),
+            params.Integer("n_estimators", 10, 300),
+            params.Integer("min_child_weight", 0, 300),
+            params.Integer("boosting_type", 0, len(GPBoostPlugin.boosting_type) - 1),
+        ]
+
+    def _fit(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> "GPBoostPlugin":
+        y = np.asarray(args[0])
+        self.encoder = LabelEncoder()
+        y = self.encoder.fit_transform(y)
+        self.model.fit(X, y, **kwargs)
+        return self
+
+    def _predict(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> pd.DataFrame:
+        return self.encoder.inverse_transform(self.model.predict(X, *args, **kwargs))
+
+    def _predict_proba(
+        self, X: pd.DataFrame, *args: Any, **kwargs: Any
+    ) -> pd.DataFrame:
+        return self.model.predict_proba(X, *args, **kwargs)
+
+
+plugin = GPBoostPlugin
diff --git a/src/hyperimpute/plugins/prediction/regression/plugin_gpboost_regressor.py b/src/hyperimpute/plugins/prediction/regression/plugin_gpboost_regressor.py
new file mode 100644
index 0000000..cc989f8
--- /dev/null
+++ b/src/hyperimpute/plugins/prediction/regression/plugin_gpboost_regressor.py
@@ -0,0 +1,115 @@
+# stdlib
+import multiprocessing
+from typing import Any, List, Optional
+
+# third party
+from gpboost import GPBoostRegressor
+import pandas as pd
+
+# hyperimpute absolute
+import hyperimpute.plugins.core.params as params
+import hyperimpute.plugins.prediction.regression.base as base
+
+
+class GPBoostRegressorPlugin(base.RegressionPlugin):
+    """Regression plugin based on the GPBoost regressor.
+
+    Args:
+        n_estimators: int
+            The maximum number of estimators at which boosting is terminated.
+        max_depth: int
+            Maximum depth of a tree.
+        reg_lambda: float
+            L2 regularization term on weights.
+        reg_alpha: float
+            L1 regularization term on weights.
+        colsample_bytree: float
+            Subsample ratio of columns when constructing each tree.
+        subsample: float
+            Subsample ratio of the training instances.
+        learning_rate: float
+            Boosting learning rate.
+        boosting_type: int
+            Index into ["gbdt", "goss", "dart"] selecting the booster.
+        min_child_weight: float
+            Minimum sum of instance weight (Hessian) needed in a child.
+        random_state: int
+            Random number seed.
+
+
+    Example:
+        >>> from hyperimpute.plugins.prediction import Predictions
+        >>> plugin = Predictions(category="regression").get("gpboost_regressor")
+        >>> from sklearn.datasets import load_diabetes
+        >>> X, y = load_diabetes(return_X_y=True)
+        >>> plugin.fit_predict(X, y)
+    """
+
+    boosting_type = ["gbdt", "goss", "dart"]
+
+    def __init__(
+        self,
+        boosting_type: int = 0,
+        max_depth: Optional[int] = -1,
+        n_estimators: int = 100,
+        reg_lambda: float = 0,
+        reg_alpha: float = 0,
+        colsample_bytree: float = 1.0,
+        subsample: float = 1.0,
+        learning_rate: float = 1e-3,
+        min_child_weight: float = 0.001,
+        n_jobs: int = max(1, int(multiprocessing.cpu_count() / 2)),
+        random_state: int = 0,
+        hyperparam_search_iterations: Optional[int] = None,
+        **kwargs: Any
+    ) -> None:
+        super().__init__(**kwargs)
+        if hyperparam_search_iterations:
+            n_estimators = int(hyperparam_search_iterations)
+
+        self.model = GPBoostRegressor(
+            boosting_type=GPBoostRegressorPlugin.boosting_type[boosting_type],
+            n_estimators=n_estimators,
+            max_depth=max_depth,
+            reg_lambda=reg_lambda,
+            reg_alpha=reg_alpha,
+            colsample_bytree=colsample_bytree,
+            subsample=subsample,
+            learning_rate=learning_rate,
+            min_child_weight=min_child_weight,
+            random_state=random_state,
+            n_jobs=n_jobs,
+            **kwargs,
+        )
+
+    @staticmethod
+    def name() -> str:
+        return "gpboost_regressor"
+
+    @staticmethod
+    def hyperparameter_space(*args: Any, **kwargs: Any) -> List[params.Params]:
+        return [
+            params.Float("reg_lambda", 1e-3, 10.0),
+            params.Float("reg_alpha", 1e-3, 10.0),
+            params.Float("colsample_bytree", 0.1, 0.9),
+            params.Float("subsample", 0.1, 0.9),
+            params.Categorical("learning_rate", [1e-4, 1e-3, 1e-2]),
+            params.Integer("max_depth", 2, 5),
+            params.Integer("n_estimators", 10, 300),
+            params.Integer("min_child_weight", 0, 300),
+            params.Integer(
+                "boosting_type", 0, len(GPBoostRegressorPlugin.boosting_type) - 1
+            ),
+        ]
+
+    def _fit(
+        self, X: pd.DataFrame, *args: Any, **kwargs: Any
+    ) -> "GPBoostRegressorPlugin":
+        self.model.fit(X, *args, **kwargs)
+        return self
+
+    def _predict(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> pd.DataFrame:
+        return self.model.predict(X, *args, **kwargs)
+
+
+plugin = GPBoostRegressorPlugin
diff --git a/src/hyperimpute/version.py b/src/hyperimpute/version.py
index f1380ee..9cb17e7 100644
--- a/src/hyperimpute/version.py
+++ b/src/hyperimpute/version.py
@@ -1 +1 @@
-__version__ = "0.1.7"
+__version__ = "0.1.8"
diff --git a/tests/prediction/classifiers/test_gpboost.py b/tests/prediction/classifiers/test_gpboost.py
new file mode 100644
index 0000000..b6c90b8
--- /dev/null
+++ b/tests/prediction/classifiers/test_gpboost.py
@@ -0,0 +1,97 @@
+# stdlib
+from typing import Any
+
+# third party
+import numpy as np
+import optuna
+import pytest
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+
+# hyperimpute absolute
+from hyperimpute.plugins.prediction import PredictionPlugin, Predictions
+from hyperimpute.plugins.prediction.classifiers.plugin_gpboost import plugin
+from hyperimpute.utils.serialization import load_model, save_model
+from hyperimpute.utils.tester import evaluate_estimator
+
+
+def from_api() -> PredictionPlugin:
+    return Predictions().get("gpboost")
+
+
+def from_module() -> PredictionPlugin:
+    return plugin()
+
+
+def from_pickle() -> PredictionPlugin:
+    buff = save_model(plugin())
+    return load_model(buff)
+
+
+@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
+def test_gpboost_plugin_sanity(test_plugin: PredictionPlugin) -> None:
+    assert test_plugin is not None
+
+
+@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
+def test_gpboost_plugin_name(test_plugin: PredictionPlugin) -> None:
+    assert test_plugin.name() == "gpboost"
+
+
+@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
+def test_gpboost_plugin_type(test_plugin: PredictionPlugin) -> None:
+    assert test_plugin.type() == "prediction"
+    assert test_plugin.subtype() == "classifier"
+
+
+@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
+def test_gpboost_plugin_hyperparams(test_plugin: PredictionPlugin) -> None:
+    assert len(test_plugin.hyperparameter_space()) == 9
+
+
+@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
+def test_gpboost_plugin_fit_predict(test_plugin: PredictionPlugin) -> None:
+    X, y = load_iris(return_X_y=True, as_frame=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+
+    y_pred = test_plugin.fit(X_train, y_train).predict(X_test)
+
+    assert np.abs(np.subtract(y_pred.values, y_test.values)).mean() < 1
+
+
+@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
+def test_gpboost_plugin_score(test_plugin: PredictionPlugin) -> None:
+    X, y = load_iris(return_X_y=True, as_frame=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+
+    test_plugin.fit(X_train, y_train)
+
+    assert test_plugin.score(X_test, y_test) > 0.5
+
+
+def test_param_search() -> None:
+    if len(plugin.hyperparameter_space()) == 0:
+        return
+
+    X, y = load_iris(return_X_y=True, as_frame=True)
+
+    def evaluate_args(**kwargs: Any) -> float:
+        kwargs["n_estimators"] = 10
+
+        model = plugin(**kwargs)
+        metrics = evaluate_estimator(model, X, y)
+
+        return metrics["clf"]["aucroc"][0]
+
+    def objective(trial: optuna.Trial) -> float:
+        args = plugin.sample_hyperparameters(trial)
+        return evaluate_args(**args)
+
+    study = optuna.create_study(
+        load_if_exists=True,
+        directions=["maximize"],
+        study_name=f"test_param_search_{plugin.name()}",
+    )
+    study.optimize(objective, n_trials=10, timeout=60)
+
+    assert len(study.trials) == 10
diff --git a/tests/prediction/regression/test_gpboost_regression.py b/tests/prediction/regression/test_gpboost_regression.py
new file mode 100644
index 0000000..655eb0c
--- /dev/null
+++ b/tests/prediction/regression/test_gpboost_regression.py
@@ -0,0 +1,84 @@
+# stdlib
+from typing import Any
+
+# third party
+import optuna
+import pytest
+from sklearn.datasets import load_diabetes
+
+# hyperimpute absolute
+from hyperimpute.plugins.prediction import PredictionPlugin, Predictions
+from hyperimpute.plugins.prediction.regression.plugin_gpboost_regressor import plugin
+from hyperimpute.utils.serialization import load_model, save_model
+from hyperimpute.utils.tester import evaluate_regression
+
+
+def from_api() -> PredictionPlugin:
+    return Predictions(category="regression").get("gpboost_regressor")
+
+
+def from_module() -> PredictionPlugin:
+    return plugin()
+
+
+def from_pickle() -> PredictionPlugin:
+    buff = save_model(plugin())
+    return load_model(buff)
+
+
+@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
+def test_gpboost_plugin_sanity(test_plugin: PredictionPlugin) -> None:
+    assert test_plugin is not None
+
+
+@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
+def test_gpboost_plugin_name(test_plugin: PredictionPlugin) -> None:
+    assert test_plugin.name() == "gpboost_regressor"
test_plugin.name() == "gpboost_regressor" + + +@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()]) +def test_gpboost_plugin_type(test_plugin: PredictionPlugin) -> None: + assert test_plugin.type() == "prediction" + assert test_plugin.subtype() == "regression" + + +@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()]) +def test_gpboost_plugin_hyperparams(test_plugin: PredictionPlugin) -> None: + assert len(test_plugin.hyperparameter_space()) == 9 + + +@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()]) +def test_gpboost_plugin_fit_predict(test_plugin: PredictionPlugin) -> None: + X, y = load_diabetes(return_X_y=True) + + score = evaluate_regression(test_plugin, X, y) + + assert score["clf"]["rmse"][0] < 6000 + + +def test_param_search() -> None: + if len(plugin.hyperparameter_space()) == 0: + return + + X, y = load_diabetes(return_X_y=True) + + def evaluate_args(**kwargs: Any) -> float: + kwargs["n_estimators"] = 10 + + model = plugin(**kwargs) + metrics = evaluate_regression(model, X, y) + + return metrics["clf"]["rmse"][0] + + def objective(trial: optuna.Trial) -> float: + args = plugin.sample_hyperparameters(trial) + return evaluate_args(**args) + + study = optuna.create_study( + load_if_exists=True, + directions=["maximize"], + study_name=f"test_param_search_{plugin.name()}", + ) + study.optimize(objective, n_trials=10, timeout=60) + + assert len(study.trials) == 10