Hyperparameter sampling fixes (#37)
* sampling bug fixes

* cleanup
bcebere authored Feb 10, 2023
1 parent 825635d commit c62e72f
Showing 14 changed files with 326 additions and 3 deletions.
2 changes: 2 additions & 0 deletions src/hyperimpute/plugins/imputers/plugin_EM.py
@@ -226,6 +226,8 @@ class EMPlugin(base.ImputerPlugin):
def __init__(
self,
random_state: int = 0,
maxit: int = 500,
convergence_threshold: float = 1e-08,
) -> None:
super().__init__(random_state=random_state)

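As an illustration only (not part of this commit), the two new EM arguments can be tuned with Optuna in the same style as the test_param_search tests added further down. The registry call Imputers().get("EM", ...), the plugin name "EM", and the search ranges below are assumptions based on the library's usual usage pattern, not code from this change.

# Hedged sketch, not from this commit: searching the new EM constructor
# arguments with Optuna. Imputers().get("EM") and the ranges are assumed.
import numpy as np
import optuna
from sklearn.datasets import load_iris

from hyperimpute.plugins.imputers import Imputers

X, _ = load_iris(return_X_y=True)
orig_val = X[0, 0]
X[0, 0] = np.nan  # hide one value and score how well it is recovered


def objective(trial: optuna.Trial) -> float:
    maxit = trial.suggest_int("maxit", 100, 1000, step=100)
    threshold = trial.suggest_float("convergence_threshold", 1e-8, 1e-2, log=True)
    imputer = Imputers().get("EM", maxit=maxit, convergence_threshold=threshold)
    X_imp = imputer.fit_transform(X.copy()).values
    return float(np.abs(orig_val - X_imp[0, 0]))


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, timeout=60)
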
1 change: 0 additions & 1 deletion src/hyperimpute/plugins/imputers/plugin_missforest.py
@@ -72,7 +72,6 @@ def hyperparameter_space(*args: Any, **kwargs: Any) -> List[params.Params]:
return [
params.Integer("n_estimators", 10, 50, 10),
params.Integer("max_iter", 100, 300, 100),
params.Integer("max_depth", 1, 3),
]

@decorators.benchmark
2 changes: 1 addition & 1 deletion src/hyperimpute/version.py
@@ -1,4 +1,4 @@
__version__ = "0.1.15"
__version__ = "0.1.16"

MAJOR_VERSION = ".".join(__version__.split(".")[:-1])
MINOR_VERSION = __version__.split(".")[-1]
29 changes: 29 additions & 0 deletions tests/imputers/test_em.py
@@ -3,8 +3,10 @@

# third party
import numpy as np
import optuna
import pandas as pd
import pytest
from sklearn.datasets import load_iris

# hyperimpute absolute
from hyperimpute.plugins.imputers import ImputerPlugin, Imputers
@@ -90,3 +92,30 @@ def test_compare_methods_perf(
rmse_other = RMSE(x_other.to_numpy(), x, mask)

assert rmse_em < rmse_other


def test_param_search() -> None:
if len(plugin.hyperparameter_space()) == 0:
return

X, _ = load_iris(return_X_y=True)
orig_val = X[0, 0]
X[0, 0] = np.nan

def evaluate_args(**kwargs: Any) -> float:
X_imp = plugin(**kwargs).fit_transform(X.copy()).values

return np.abs(orig_val - X_imp[0, 0])

def objective(trial: optuna.Trial) -> float:
args = plugin.sample_hyperparameters(trial)
return evaluate_args(**args)

study = optuna.create_study(
load_if_exists=True,
directions=["minimize"],
study_name=f"test_param_search_{plugin.name()}",
)
study.optimize(objective, n_trials=10, timeout=60)

assert len(study.trials) > 0
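For context on what test_param_search exercises: sample_hyperparameters walks the plugin's hyperparameter_space() and asks the Optuna trial for one suggestion per declared parameter, returning a dict that is splatted into the plugin constructor. The snippet below is a self-contained sketch of that mapping under that assumption; IntegerParam is a stand-in for illustration, not hyperimpute's params.Integer, and the library's actual implementation may differ.

# Self-contained sketch of the Params -> Optuna mapping exercised above.
# IntegerParam is a stand-in, not hyperimpute's params.Integer.
from dataclasses import dataclass
from typing import Any, Dict, List

import optuna


@dataclass
class IntegerParam:
    name: str
    low: int
    high: int
    step: int = 1

    def sample(self, trial: optuna.Trial) -> int:
        return trial.suggest_int(self.name, self.low, self.high, step=self.step)


def sample_hyperparameters(
    space: List[IntegerParam], trial: optuna.Trial
) -> Dict[str, Any]:
    # One suggestion per declared hyperparameter, keyed by name, so the
    # result can be passed straight to the plugin constructor as **kwargs.
    return {p.name: p.sample(trial) for p in space}


def demo_objective(trial: optuna.Trial) -> float:
    space = [
        IntegerParam("n_estimators", 10, 50, 10),
        IntegerParam("max_iter", 100, 300, 100),
    ]
    args = sample_hyperparameters(space, trial)
    return float(sum(args.values()))  # placeholder score for the sketch


study = optuna.create_study(direction="minimize")
study.optimize(demo_objective, n_trials=5)
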
29 changes: 29 additions & 0 deletions tests/imputers/test_gain.py
@@ -3,8 +3,10 @@

# third party
import numpy as np
import optuna
import pandas as pd
import pytest
from sklearn.datasets import load_iris

# hyperimpute absolute
from hyperimpute.plugins.imputers import ImputerPlugin, Imputers
@@ -90,3 +92,30 @@ def test_compare_methods_perf(
rmse_other = RMSE(x_other.to_numpy(), x, mask)

assert rmse_gain < rmse_other


def test_param_search() -> None:
if len(plugin.hyperparameter_space()) == 0:
return

X, _ = load_iris(return_X_y=True)
orig_val = X[0, 0]
X[0, 0] = np.nan

def evaluate_args(**kwargs: Any) -> float:
X_imp = plugin(**kwargs).fit_transform(X.copy()).values

return np.abs(orig_val - X_imp[0, 0])

def objective(trial: optuna.Trial) -> float:
args = plugin.sample_hyperparameters(trial)
return evaluate_args(**args)

study = optuna.create_study(
load_if_exists=True,
directions=["minimize"],
study_name=f"test_param_search_{plugin.name()}",
)
study.optimize(objective, n_trials=10, timeout=60)

assert len(study.trials) > 0
29 changes: 29 additions & 0 deletions tests/imputers/test_hyperimpute.py
@@ -3,8 +3,10 @@

# third party
import numpy as np
import optuna
import pandas as pd
import pytest
from sklearn.datasets import load_iris

# hyperimpute absolute
from hyperimpute.plugins.imputers import ImputerPlugin, Imputers
@@ -184,3 +186,30 @@ def test_imputation_order(
rmse_other = RMSE(x_other.to_numpy(), x, mask)

assert rmse_mf < rmse_other


def test_param_search() -> None:
if len(plugin.hyperparameter_space()) == 0:
return

X, _ = load_iris(return_X_y=True)
orig_val = X[0, 0]
X[0, 0] = np.nan

def evaluate_args(**kwargs: Any) -> float:
X_imp = plugin(**kwargs).fit_transform(X.copy()).values

return np.abs(orig_val - X_imp[0, 0])

def objective(trial: optuna.Trial) -> float:
args = plugin.sample_hyperparameters(trial)
return evaluate_args(**args)

study = optuna.create_study(
load_if_exists=True,
directions=["minimize"],
study_name=f"test_param_search_{plugin.name()}",
)
study.optimize(objective, n_trials=10, timeout=60)

assert len(study.trials) > 0
29 changes: 29 additions & 0 deletions tests/imputers/test_ice.py
@@ -3,8 +3,10 @@

# third party
import numpy as np
import optuna
import pandas as pd
import pytest
from sklearn.datasets import load_iris

# hyperimpute absolute
from hyperimpute.plugins.imputers import ImputerPlugin, Imputers
@@ -93,3 +95,30 @@ def test_compare_methods_perf(
rmse_other = RMSE(x_other.to_numpy(), x, mask)

assert rmse_ice < rmse_other


def test_param_search() -> None:
if len(plugin.hyperparameter_space()) == 0:
return

X, _ = load_iris(return_X_y=True)
orig_val = X[0, 0]
X[0, 0] = np.nan

def evaluate_args(**kwargs: Any) -> float:
X_imp = plugin(**kwargs).fit_transform(X.copy()).values

return np.abs(orig_val - X_imp[0, 0])

def objective(trial: optuna.Trial) -> float:
args = plugin.sample_hyperparameters(trial)
return evaluate_args(**args)

study = optuna.create_study(
load_if_exists=True,
directions=["minimize"],
study_name=f"test_param_search_{plugin.name()}",
)
study.optimize(objective, n_trials=10, timeout=60)

assert len(study.trials) > 0
29 changes: 29 additions & 0 deletions tests/imputers/test_mice.py
@@ -3,8 +3,10 @@

# third party
import numpy as np
import optuna
import pandas as pd
import pytest
from sklearn.datasets import load_iris

# hyperimpute absolute
from hyperimpute.plugins.imputers import ImputerPlugin, Imputers
@@ -100,3 +102,30 @@ def test_compare_methods_perf(
rmse_other = RMSE(x_other.to_numpy(), x, mask)

assert rmse_ice < rmse_other


def test_param_search() -> None:
if len(plugin.hyperparameter_space()) == 0:
return

X, _ = load_iris(return_X_y=True)
orig_val = X[0, 0]
X[0, 0] = np.nan

def evaluate_args(**kwargs: Any) -> float:
X_imp = plugin(**kwargs).fit_transform(X.copy()).values

return np.abs(orig_val - X_imp[0, 0])

def objective(trial: optuna.Trial) -> float:
args = plugin.sample_hyperparameters(trial)
return evaluate_args(**args)

study = optuna.create_study(
load_if_exists=True,
directions=["minimize"],
study_name=f"test_param_search_{plugin.name()}",
)
study.optimize(objective, n_trials=10, timeout=60)

assert len(study.trials) > 0
32 changes: 32 additions & 0 deletions tests/imputers/test_miracle.py
@@ -1,7 +1,12 @@
# stdlib
from typing import Any

# third party
import numpy as np
import optuna
import pandas as pd
import pytest
from sklearn.datasets import load_iris

# hyperimpute absolute
from hyperimpute.plugins.imputers import ImputerPlugin, Imputers
@@ -50,3 +55,30 @@ def test_mean_plugin_fit_transform(test_plugin: ImputerPlugin) -> None:
)

assert res.isnull().values.any() == False # noqa


def test_param_search() -> None:
if len(plugin.hyperparameter_space()) == 0:
return

X, _ = load_iris(return_X_y=True)
orig_val = X[0, 0]
X[0, 0] = np.nan

def evaluate_args(**kwargs: Any) -> float:
X_imp = plugin(**kwargs).fit_transform(X.copy()).values

return np.abs(orig_val - X_imp[0, 0])

def objective(trial: optuna.Trial) -> float:
args = plugin.sample_hyperparameters(trial)
return evaluate_args(**args)

study = optuna.create_study(
load_if_exists=True,
directions=["minimize"],
study_name=f"test_param_search_{plugin.name()}",
)
study.optimize(objective, n_trials=10, timeout=60)

assert len(study.trials) > 0
31 changes: 30 additions & 1 deletion tests/imputers/test_missforest.py
@@ -3,8 +3,10 @@

# third party
import numpy as np
import optuna
import pandas as pd
import pytest
from sklearn.datasets import load_iris

# hyperimpute absolute
from hyperimpute.plugins.imputers import ImputerPlugin, Imputers
@@ -44,7 +46,7 @@ def test_missforest_plugin_type(test_plugin: ImputerPlugin) -> None:

@pytest.mark.parametrize("test_plugin", [from_api(), from_module(), from_serde()])
def test_missforest_plugin_hyperparams(test_plugin: ImputerPlugin) -> None:
assert len(test_plugin.hyperparameter_space()) == 3
assert len(test_plugin.hyperparameter_space()) == 2
assert test_plugin.hyperparameter_space()[0].name == "n_estimators"


@@ -91,3 +93,30 @@ def test_compare_methods_perf(
rmse_other = RMSE(x_other.to_numpy(), x, mask)

assert rmse_mf < rmse_other


def test_param_search() -> None:
if len(plugin.hyperparameter_space()) == 0:
return

X, _ = load_iris(return_X_y=True)
orig_val = X[0, 0]
X[0, 0] = np.nan

def evaluate_args(**kwargs: Any) -> float:
X_imp = plugin(**kwargs).fit_transform(X.copy()).values

return np.abs(orig_val - X_imp[0, 0])

def objective(trial: optuna.Trial) -> float:
args = plugin.sample_hyperparameters(trial)
return evaluate_args(**args)

study = optuna.create_study(
load_if_exists=True,
directions=["minimize"],
study_name=f"test_param_search_{plugin.name()}",
)
study.optimize(objective, n_trials=10, timeout=60)

assert len(study.trials) > 0
29 changes: 29 additions & 0 deletions tests/imputers/test_miwae.py
@@ -3,8 +3,10 @@

# third party
import numpy as np
import optuna
import pandas as pd
import pytest
from sklearn.datasets import load_iris

# hyperimpute absolute
from hyperimpute.plugins.imputers import ImputerPlugin, Imputers
@@ -90,3 +92,30 @@ def test_compare_methods_perf(
rmse_other = RMSE(x_other.to_numpy(), x, mask)

assert rmse_ot < rmse_other


def test_param_search() -> None:
if len(plugin.hyperparameter_space()) == 0:
return

X, _ = load_iris(return_X_y=True)
orig_val = X[0, 0]
X[0, 0] = np.nan

def evaluate_args(**kwargs: Any) -> float:
X_imp = plugin(**kwargs).fit_transform(X.copy()).values

return np.abs(orig_val - X_imp[0, 0])

def objective(trial: optuna.Trial) -> float:
args = plugin.sample_hyperparameters(trial)
return evaluate_args(**args)

study = optuna.create_study(
load_if_exists=True,
directions=["minimize"],
study_name=f"test_param_search_{plugin.name()}",
)
study.optimize(objective, n_trials=10, timeout=60)

assert len(study.trials) > 0