Commit

* gpboost clf
* add GPBoost
Showing 6 changed files with 419 additions and 1 deletion.
@@ -57,6 +57,7 @@ install_requires =
     pydantic
     jupyter
     notebook
+    gpboost
     importlib-metadata; python_version<"3.8"
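The diff pins no version for the new gpboost requirement. A minimal post-install sanity check (assuming gpboost exposes __version__, as its LightGBM ancestor does):

    # quick sanity check that the new dependency resolves
    # (gpboost.__version__ is assumed to exist, as in LightGBM)
    import gpboost

    print(gpboost.__version__)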
src/hyperimpute/plugins/prediction/classifiers/plugin_gpboost.py (new file, 121 additions)
@@ -0,0 +1,121 @@
    # stdlib
    import multiprocessing
    from typing import Any, List, Optional

    # third party
    from gpboost import GPBoostClassifier
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    # hyperimpute absolute
    import hyperimpute.plugins.core.params as params
    import hyperimpute.plugins.prediction.classifiers.base as base


    class GPBoostPlugin(base.ClassifierPlugin):
        """Classification plugin based on the GPBoost classifier.

        Args:
            n_estimators: int
                The maximum number of estimators at which boosting is terminated.
            max_depth: int
                Maximum depth of a tree.
            reg_lambda: float
                L2 regularization term on weights (xgb's lambda).
            reg_alpha: float
                L1 regularization term on weights (xgb's alpha).
            colsample_bytree: float
                Subsample ratio of columns when constructing each tree.
            subsample: float
                Subsample ratio of the training instances.
            learning_rate: float
                Boosting learning rate.
            boosting_type: int
                Index of the boosting method to use: 0 = "gbdt", 1 = "goss", 2 = "dart".
            min_child_weight: float
                Minimum sum of instance weight (hessian) needed in a child.
            random_state: int
                Random number seed.

        Example:
            >>> from hyperimpute.plugins.prediction import Predictions
            >>> plugin = Predictions(category="classifiers").get("gpboost")
            >>> from sklearn.datasets import load_iris
            >>> X, y = load_iris(return_X_y=True)
            >>> plugin.fit_predict(X, y)
        """

        boosting_type = ["gbdt", "goss", "dart"]

        def __init__(
            self,
            boosting_type: int = 0,
            max_depth: Optional[int] = 3,
            n_estimators: int = 100,
            reg_lambda: float = 0,
            reg_alpha: float = 0,
            colsample_bytree: float = 1.0,
            subsample: float = 1.0,
            learning_rate: float = 1e-3,
            min_child_weight: float = 1e-3,
            n_jobs: int = max(1, int(multiprocessing.cpu_count() / 2)),
            random_state: int = 0,
            hyperparam_search_iterations: Optional[int] = None,
            **kwargs: Any,
        ) -> None:
            super().__init__(**kwargs)
            if hyperparam_search_iterations:
                n_estimators = int(hyperparam_search_iterations)

            self.model = GPBoostClassifier(
                boosting_type=GPBoostPlugin.boosting_type[boosting_type],
                n_estimators=n_estimators,
                max_depth=max_depth,
                reg_lambda=reg_lambda,
                reg_alpha=reg_alpha,
                colsample_bytree=colsample_bytree,
                subsample=subsample,
                learning_rate=learning_rate,
                min_child_weight=min_child_weight,
                random_state=random_state,
                n_jobs=n_jobs,
                **kwargs,
            )

        @staticmethod
        def name() -> str:
            return "gpboost"

        @staticmethod
        def hyperparameter_space(*args: Any, **kwargs: Any) -> List[params.Params]:
            return [
                params.Float("reg_lambda", 1e-3, 10.0),
                params.Float("reg_alpha", 1e-3, 10.0),
                params.Float("colsample_bytree", 0.1, 0.9),
                params.Float("subsample", 0.1, 0.9),
                params.Categorical("learning_rate", [1e-4, 1e-3, 1e-2]),
                params.Integer("max_depth", 2, 5),
                params.Integer("n_estimators", 10, 300),
                params.Integer("min_child_weight", 0, 300),
                params.Integer("boosting_type", 0, len(GPBoostPlugin.boosting_type) - 1),
            ]

        def _fit(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> "GPBoostPlugin":
            # GPBoost expects integer-encoded targets; the encoder is kept so
            # that _predict can restore the original label values.
            y = np.asarray(args[0])
            self.encoder = LabelEncoder()
            y = self.encoder.fit_transform(y)
            self.model.fit(X, y, **kwargs)
            return self

        def _predict(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> pd.DataFrame:
            return self.encoder.inverse_transform(self.model.predict(X, *args, **kwargs))

        def _predict_proba(
            self, X: pd.DataFrame, *args: Any, **kwargs: Any
        ) -> pd.DataFrame:
            return self.model.predict_proba(X, *args, **kwargs)


    plugin = GPBoostPlugin
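Beyond the docstring example, a minimal end-to-end sketch of the classifier plugin (assumes hyperimpute and gpboost are installed; the Predictions accessor and fit/predict calls follow the docstring and the tests below):

    # fit the plugin and inspect both decoded labels and class probabilities
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    from hyperimpute.plugins.prediction import Predictions

    X, y = load_iris(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    clf = Predictions(category="classifiers").get("gpboost")
    clf.fit(X_train, y_train)

    labels = clf.predict(X_test)       # original labels, restored via the internal LabelEncoder
    probs = clf.predict_proba(X_test)  # one column per class

Because _fit routes the target through a LabelEncoder and _predict inverts it, callers get back the original label values even for string or non-contiguous integer targets.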
src/hyperimpute/plugins/prediction/regression/plugin_gpboost_regressor.py (new file, 115 additions)
@@ -0,0 +1,115 @@
    # stdlib
    import multiprocessing
    from typing import Any, List, Optional

    # third party
    from gpboost import GPBoostRegressor
    import pandas as pd

    # hyperimpute absolute
    import hyperimpute.plugins.core.params as params
    import hyperimpute.plugins.prediction.regression.base as base


    class GPBoostRegressorPlugin(base.RegressionPlugin):
        """Regression plugin based on the GPBoost regressor.

        Args:
            n_estimators: int
                The maximum number of estimators at which boosting is terminated.
            max_depth: int
                Maximum depth of a tree.
            reg_lambda: float
                L2 regularization term on weights (xgb's lambda).
            reg_alpha: float
                L1 regularization term on weights (xgb's alpha).
            colsample_bytree: float
                Subsample ratio of columns when constructing each tree.
            subsample: float
                Subsample ratio of the training instances.
            learning_rate: float
                Boosting learning rate.
            boosting_type: int
                Index of the boosting method to use: 0 = "gbdt", 1 = "goss", 2 = "dart".
            min_child_weight: float
                Minimum sum of instance weight (hessian) needed in a child.
            random_state: int
                Random number seed.

        Example:
            >>> from hyperimpute.plugins.prediction import Predictions
            >>> plugin = Predictions(category="regression").get("gpboost_regressor")
            >>> from sklearn.datasets import load_diabetes
            >>> X, y = load_diabetes(return_X_y=True)
            >>> plugin.fit_predict(X, y)
        """

        boosting_type = ["gbdt", "goss", "dart"]

        def __init__(
            self,
            boosting_type: int = 0,
            max_depth: Optional[int] = -1,
            n_estimators: int = 100,
            reg_lambda: float = 0,
            reg_alpha: float = 0,
            colsample_bytree: float = 1.0,
            subsample: float = 1.0,
            learning_rate: float = 1e-3,
            min_child_weight: float = 1e-3,
            n_jobs: int = max(1, int(multiprocessing.cpu_count() / 2)),
            random_state: int = 0,
            hyperparam_search_iterations: Optional[int] = None,
            **kwargs: Any,
        ) -> None:
            super().__init__(**kwargs)
            if hyperparam_search_iterations:
                n_estimators = int(hyperparam_search_iterations)

            self.model = GPBoostRegressor(
                boosting_type=GPBoostRegressorPlugin.boosting_type[boosting_type],
                n_estimators=n_estimators,
                max_depth=max_depth,
                reg_lambda=reg_lambda,
                reg_alpha=reg_alpha,
                colsample_bytree=colsample_bytree,
                subsample=subsample,
                learning_rate=learning_rate,
                min_child_weight=min_child_weight,
                random_state=random_state,
                n_jobs=n_jobs,
                **kwargs,
            )

        @staticmethod
        def name() -> str:
            return "gpboost_regressor"

        @staticmethod
        def hyperparameter_space(*args: Any, **kwargs: Any) -> List[params.Params]:
            return [
                params.Float("reg_lambda", 1e-3, 10.0),
                params.Float("reg_alpha", 1e-3, 10.0),
                params.Float("colsample_bytree", 0.1, 0.9),
                params.Float("subsample", 0.1, 0.9),
                params.Categorical("learning_rate", [1e-4, 1e-3, 1e-2]),
                params.Integer("max_depth", 2, 5),
                params.Integer("n_estimators", 10, 300),
                params.Integer("min_child_weight", 0, 300),
                params.Integer(
                    "boosting_type", 0, len(GPBoostRegressorPlugin.boosting_type) - 1
                ),
            ]

        def _fit(
            self, X: pd.DataFrame, *args: Any, **kwargs: Any
        ) -> "GPBoostRegressorPlugin":
            self.model.fit(X, *args, **kwargs)
            return self

        def _predict(self, X: pd.DataFrame, *args: Any, **kwargs: Any) -> pd.DataFrame:
            return self.model.predict(X, *args, **kwargs)


    plugin = GPBoostRegressorPlugin
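A matching sketch for the regressor; load_diabetes is an arbitrary stand-in regression dataset, any numeric target works:

    # fit the regressor plugin on a standard regression dataset
    from sklearn.datasets import load_diabetes
    from sklearn.model_selection import train_test_split

    from hyperimpute.plugins.prediction import Predictions

    X, y = load_diabetes(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    reg = Predictions(category="regression").get("gpboost_regressor")
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)  # continuous predictions, no label encoding involved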
Version bump:

@@ -1 +1 @@
-__version__ = "0.1.7"
+__version__ = "0.1.8"
New test file for the GPBoost classifier plugin (97 additions):

@@ -0,0 +1,97 @@
    # stdlib
    from typing import Any

    # third party
    import numpy as np
    import optuna
    import pytest
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    # hyperimpute absolute
    from hyperimpute.plugins.prediction import PredictionPlugin, Predictions
    from hyperimpute.plugins.prediction.classifiers.plugin_gpboost import plugin
    from hyperimpute.utils.serialization import load_model, save_model
    from hyperimpute.utils.tester import evaluate_estimator


    def from_api() -> PredictionPlugin:
        return Predictions().get("gpboost")


    def from_module() -> PredictionPlugin:
        return plugin()


    def from_pickle() -> PredictionPlugin:
        buff = save_model(plugin())
        return load_model(buff)


    @pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
    def test_gpboost_plugin_sanity(test_plugin: PredictionPlugin) -> None:
        assert test_plugin is not None


    @pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
    def test_gpboost_plugin_name(test_plugin: PredictionPlugin) -> None:
        assert test_plugin.name() == "gpboost"


    @pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
    def test_gpboost_plugin_type(test_plugin: PredictionPlugin) -> None:
        assert test_plugin.type() == "prediction"
        assert test_plugin.subtype() == "classifier"


    @pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
    def test_gpboost_plugin_hyperparams(test_plugin: PredictionPlugin) -> None:
        assert len(test_plugin.hyperparameter_space()) == 9


    @pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
    def test_gpboost_plugin_fit_predict(test_plugin: PredictionPlugin) -> None:
        X, y = load_iris(return_X_y=True, as_frame=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        y_pred = test_plugin.fit(X_train, y_train).predict(X_test)

        assert np.abs(np.subtract(y_pred.values, y_test.values)).mean() < 1


    @pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_pickle()])
    def test_gpboost_plugin_score(test_plugin: PredictionPlugin) -> None:
        X, y = load_iris(return_X_y=True, as_frame=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        test_plugin.fit(X_train, y_train)

        assert test_plugin.score(X_test, y_test) > 0.5


    def test_param_search() -> None:
        if len(plugin.hyperparameter_space()) == 0:
            return

        X, y = load_iris(return_X_y=True, as_frame=True)

        def evaluate_args(**kwargs: Any) -> float:
            kwargs["n_estimators"] = 10

            model = plugin(**kwargs)
            metrics = evaluate_estimator(model, X, y)

            return metrics["clf"]["aucroc"][0]

        def objective(trial: optuna.Trial) -> float:
            args = plugin.sample_hyperparameters(trial)
            return evaluate_args(**args)

        study = optuna.create_study(
            load_if_exists=True,
            directions=["maximize"],
            study_name=f"test_param_search_{plugin.name()}",
        )
        study.optimize(objective, n_trials=10, timeout=60)

        assert len(study.trials) == 10
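The same search loop runs outside pytest as well; a standalone sketch reusing the pieces of test_param_search above, with n_trials kept small:

    # standalone hyperparameter search mirroring test_param_search
    import optuna
    from sklearn.datasets import load_iris

    from hyperimpute.plugins.prediction.classifiers.plugin_gpboost import plugin
    from hyperimpute.utils.tester import evaluate_estimator

    X, y = load_iris(return_X_y=True, as_frame=True)

    def objective(trial: optuna.Trial) -> float:
        args = plugin.sample_hyperparameters(trial)
        args["n_estimators"] = 10  # keep each trial cheap
        return evaluate_estimator(plugin(**args), X, y)["clf"]["aucroc"][0]

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=5)
    print(study.best_params)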