GPBoost support (#26)
* gpboost clf

* add GPBoost
bcebere authored Nov 17, 2022
1 parent 85cd746 commit feae8fc
Showing 6 changed files with 419 additions and 1 deletion.
1 change: 1 addition & 0 deletions setup.cfg
@@ -57,6 +57,7 @@ install_requires =
pydantic
jupyter
notebook
gpboost
importlib-metadata; python_version<"3.8"


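The packaging change simply adds gpboost as a runtime dependency. A quick post-install sanity check (a sketch; assumes gpboost exposes __version__ like its LightGBM upstream):

# Confirm the new dependency imports and report its version.
import gpboost

print(gpboost.__version__)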
121 changes: 121 additions & 0 deletions src/hyperimpute/plugins/prediction/classifiers/plugin_gpboost.py
@@ -0,0 +1,121 @@
# stdlib
import multiprocessing
from typing import Any, List, Optional

# third party
from gpboost import GPBoostClassifier
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# hyperimpute absolute
import hyperimpute.plugins.core.params as params
import hyperimpute.plugins.prediction.classifiers.base as base


class GPBoostPlugin(base.ClassifierPlugin):
"""Classification plugin based on the GPBoost classifier.
Args:
n_estimators: int
The maximum number of estimators at which boosting is terminated.
max_depth: int
Maximum depth of a tree.
reg_lambda: float
L2 regularization term on weights (xgb’s lambda).
reg_alpha: float
L1 regularization term on weights (xgb’s alpha).
colsample_bytree: float
Subsample ratio of columns when constructing each tree.
subsample: float
Subsample ratio of the training instance.
learning_rate: float
Boosting learning rate.
boosting_type: int
Index of the booster to use: 0 = "gbdt", 1 = "goss", 2 = "dart".
min_child_weight: float
Minimum sum of instance weight (hessian) needed in a child.
random_state: int
Random number seed.
Example:
>>> from hyperimpute.plugins.prediction import Predictions
>>> plugin = Predictions(category="classifiers").get("gpboost")
>>> from sklearn.datasets import load_iris
>>> X, y = load_iris(return_X_y=True)
>>> plugin.fit_predict(X, y)
"""

boosting_type = ["gbdt", "goss", "dart"]

def __init__(
self,
boosting_type: int = 0,
max_depth: Optional[int] = 3,
n_estimators: int = 100,
reg_lambda: float = 0,
reg_alpha: float = 0,
colsample_bytree: float = 1.0,
subsample: float = 1.0,
learning_rate: float = 1e-3,
min_child_weight: float = 0.001,
n_jobs: int = max(1, int(multiprocessing.cpu_count() / 2)),
random_state: int = 0,
hyperparam_search_iterations: Optional[int] = None,
**kwargs: Any
) -> None:
super().__init__(**kwargs)
if hyperparam_search_iterations:
n_estimators = int(hyperparam_search_iterations)

self.model = GPBoostClassifier(
boosting_type=GPBoostPlugin.boosting_type[boosting_type],
n_estimators=n_estimators,
max_depth=max_depth,
reg_lambda=reg_lambda,
reg_alpha=reg_alpha,
colsample_bytree=colsample_bytree,
subsample=subsample,
learning_rate=learning_rate,
min_child_weight=min_child_weight,
random_state=random_state,
n_jobs=n_jobs,
**kwargs,
)

@staticmethod
def name() -> str:
return "gpboost"

@staticmethod
def hyperparameter_space(*args: Any, **kwargs: Any) -> List[params.Params]:
return [
params.Float("reg_lambda", 1e-3, 10.0),
params.Float("reg_alpha", 1e-3, 10.0),
params.Float("colsample_bytree", 0.1, 0.9),
params.Float("subsample", 0.1, 0.9),
params.Categorical("learning_rate", [1e-4, 1e-3, 1e-2]),
params.Integer("max_depth", 2, 5),
params.Integer("n_estimators", 10, 300),
params.Integer("min_child_weight", 0, 300),
params.Integer("boosting_type", 0, len(GPBoostPlugin.boosting_type) - 1),
]

def _fit(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> "GPBoostPlugin":
y = np.asarray(args[0])
self.encoder = LabelEncoder()
y = self.encoder.fit_transform(y)
self.model.fit(X, y, **kwargs)
return self

def _predict(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> pd.DataFrame:
return self.encoder.inverse_transform(self.model.predict(X, *args, **kwargs))

def _predict_proba(
self, X: pd.DataFrame, *args: Any, **kwargs: Any
) -> pd.DataFrame:
return self.model.predict_proba(X, *args, **kwargs)


plugin = GPBoostPlugin
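For orientation, a minimal usage sketch for the new classifier plugin, expanding the docstring example above with a hold-out split (the split and the printed shapes are illustrative additions, not part of the commit):

# Fit the "gpboost" classifier plugin and inspect both prediction paths.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from hyperimpute.plugins.prediction import Predictions

X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf = Predictions(category="classifiers").get("gpboost")
clf.fit(X_train, y_train)

print(clf.predict(X_test).shape)        # labels decoded via the fitted LabelEncoder
print(clf.predict_proba(X_test).shape)  # one probability column per class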
115 changes: 115 additions & 0 deletions src/hyperimpute/plugins/prediction/regression/plugin_gpboost_regressor.py
@@ -0,0 +1,115 @@
# stdlib
import multiprocessing
from typing import Any, List, Optional

# third party
from gpboost import GPBoostRegressor
import pandas as pd

# hyperimpute absolute
import hyperimpute.plugins.core.params as params
import hyperimpute.plugins.prediction.regression.base as base


class GPBoostRegressorPlugin(base.RegressionPlugin):
"""Classification plugin based on the GPBoost classifier.
Args:
n_estimators: int
The maximum number of estimators at which boosting is terminated.
max_depth: int
Maximum depth of a tree.
reg_lambda: float
L2 regularization term on weights (xgb’s lambda).
reg_alpha: float
L1 regularization term on weights (xgb’s alpha).
colsample_bytree: float
Subsample ratio of columns when constructing each tree.
subsample: float
Subsample ratio of the training instance.
learning_rate: float
Boosting learning rate.
boosting_type: int
Index of the booster to use: 0 = "gbdt", 1 = "goss", 2 = "dart".
min_child_weight: float
Minimum sum of instance weight (hessian) needed in a child.
random_state: int
Random number seed.
Example:
>>> from hyperimpute.plugins.prediction import Predictions
>>> plugin = Predictions(category="regression").get("gpboost_regressor")
>>> from sklearn.datasets import load_diabetes
>>> X, y = load_diabetes(return_X_y=True)
>>> plugin.fit_predict(X, y)
"""

boosting_type = ["gbdt", "goss", "dart"]

def __init__(
self,
boosting_type: int = 0,
max_depth: Optional[int] = -1,
n_estimators: int = 100,
reg_lambda: float = 0,
reg_alpha: float = 0,
colsample_bytree: float = 1.0,
subsample: float = 1.0,
learning_rate: float = 1e-3,
min_child_weight: float = 0.001,
n_jobs: int = max(1, int(multiprocessing.cpu_count() / 2)),
random_state: int = 0,
hyperparam_search_iterations: Optional[int] = None,
**kwargs: Any
) -> None:
super().__init__(**kwargs)
if hyperparam_search_iterations:
n_estimators = int(hyperparam_search_iterations)

self.model = GPBoostRegressor(
boosting_type=GPBoostRegressorPlugin.boosting_type[boosting_type],
n_estimators=n_estimators,
max_depth=max_depth,
reg_lambda=reg_lambda,
reg_alpha=reg_alpha,
colsample_bytree=colsample_bytree,
subsample=subsample,
learning_rate=learning_rate,
min_child_weight=min_child_weight,
random_state=random_state,
n_jobs=n_jobs,
**kwargs,
)

@staticmethod
def name() -> str:
return "gpboost_regressor"

@staticmethod
def hyperparameter_space(*args: Any, **kwargs: Any) -> List[params.Params]:
return [
params.Float("reg_lambda", 1e-3, 10.0),
params.Float("reg_alpha", 1e-3, 10.0),
params.Float("colsample_bytree", 0.1, 0.9),
params.Float("subsample", 0.1, 0.9),
params.Categorical("learning_rate", [1e-4, 1e-3, 1e-2]),
params.Integer("max_depth", 2, 5),
params.Integer("n_estimators", 10, 300),
params.Integer("min_child_weight", 0, 300),
params.Integer(
"boosting_type", 0, len(GPBoostRegressorPlugin.boosting_type) - 1
),
]

def _fit(
self, X: pd.DataFrame, *args: Any, **kwargs: Any
) -> "GPBoostRegressorPlugin":
self.model.fit(X, *args, **kwargs)
return self

def _predict(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> pd.DataFrame:
return self.model.predict(X, *args, **kwargs)


plugin = GPBoostRegressorPlugin
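A matching sketch for the regressor plugin, run on a genuine regression target (load_diabetes is an illustrative choice, not taken from the commit):

# Fit the "gpboost_regressor" plugin on a numeric target.
from sklearn.datasets import load_diabetes

from hyperimpute.plugins.prediction import Predictions

X, y = load_diabetes(return_X_y=True, as_frame=True)

reg = Predictions(category="regression").get("gpboost_regressor")
reg.fit(X, y)
print(reg.predict(X).shape)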
2 changes: 1 addition & 1 deletion src/hyperimpute/version.py
@@ -1 +1 @@
__version__ = "0.1.7"
__version__ = "0.1.8"
97 changes: 97 additions & 0 deletions tests/prediction/classifiers/test_gpboost.py
@@ -0,0 +1,97 @@
# stdlib
from typing import Any

# third party
import numpy as np
import optuna
import pytest
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# hyperimpute absolute
from hyperimpute.plugins.prediction import PredictionPlugin, Predictions
from hyperimpute.plugins.prediction.classifiers.plugin_gpboost import plugin
from hyperimpute.utils.serialization import load_model, save_model
from hyperimpute.utils.tester import evaluate_estimator


def from_api() -> PredictionPlugin:
return Predictions().get("gpboost")


def from_module() -> PredictionPlugin:
return plugin()


def from_pickle() -> PredictionPlugin:
buff = save_model(plugin())
return load_model(buff)


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_sanity(test_plugin: PredictionPlugin) -> None:
assert test_plugin is not None


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_name(test_plugin: PredictionPlugin) -> None:
assert test_plugin.name() == "gpboost"


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_type(test_plugin: PredictionPlugin) -> None:
assert test_plugin.type() == "prediction"
assert test_plugin.subtype() == "classifier"


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_hyperparams(test_plugin: PredictionPlugin) -> None:
assert len(test_plugin.hyperparameter_space()) == 9


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_fit_predict(test_plugin: PredictionPlugin) -> None:
X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

y_pred = test_plugin.fit(X_train, y_train).predict(X_test)

assert np.abs(np.subtract(y_pred.values, y_test.values)).mean() < 1


@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
def test_gpboost_plugin_score(test_plugin: PredictionPlugin) -> None:
X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

test_plugin.fit(X_train, y_train)

assert test_plugin.score(X_test, y_test) > 0.5


def test_param_search() -> None:
if len(plugin.hyperparameter_space()) == 0:
return

X, y = load_iris(return_X_y=True, as_frame=True)

def evaluate_args(**kwargs: Any) -> float:
kwargs["n_estimators"] = 10

model = plugin(**kwargs)
metrics = evaluate_estimator(model, X, y)

return metrics["clf"]["aucroc"][0]

def objective(trial: optuna.Trial) -> float:
args = plugin.sample_hyperparameters(trial)
return evaluate_args(**args)

study = optuna.create_study(
load_if_exists=True,
directions=["maximize"],
study_name=f"test_param_search_{plugin.name()}",
)
study.optimize(objective, n_trials=10, timeout=60)

assert len(study.trials) == 10
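To reproduce the optuna search locally, a plain pytest run over the new file should suffice (the exact flags are a suggestion, assuming gpboost and optuna are installed):

pytest tests/prediction/classifiers/test_gpboost.py -v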
