From 53f0805af29495d0c5b2709e33ef96f28210b9a9 Mon Sep 17 00:00:00 2001
From: JunchiXi
Date: Thu, 18 Jan 2024 16:09:49 +0800
Subject: [PATCH] feat: add RidgeRegression algorithm

---
 geochemistrypi/data_mining/constants.py       |   1 +
 .../func/algo_regression/_ridge_regression.py |  39 ++
 .../data_mining/model/regression.py           | 366 +++++++++++++++++-
 geochemistrypi/data_mining/process/regress.py |  11 +
 4 files changed, 416 insertions(+), 1 deletion(-)
 create mode 100644 geochemistrypi/data_mining/model/func/algo_regression/_ridge_regression.py

diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
index 6ccc7a23..1dc79390 100644
--- a/geochemistrypi/data_mining/constants.py
+++ b/geochemistrypi/data_mining/constants.py
@@ -45,6 +45,7 @@
     "Elastic Net",
     "SGD Regression",
     "BayesianRidge Regression",
+    "Ridge Regression",
     # "Bagging Regression",
     # "Decision Tree",
     # Histogram-based Gradient Boosting,
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_ridge_regression.py b/geochemistrypi/data_mining/model/func/algo_regression/_ridge_regression.py
new file mode 100644
index 00000000..2eb4b9b3
--- /dev/null
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_ridge_regression.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+from typing import Dict
+
+from rich import print
+
+from ....constants import SECTION
+from ....data.data_readiness import bool_input, float_input, num_input
+
+
+def ridge_regression_manual_hyper_parameters() -> Dict:
+    """Manually set hyperparameters.
+
+    Returns
+    -------
+    hyper_parameters : dict
+    """
+    print("Alpha: This hyperparameter represents the coefficient of the L2 penalty term, which controls the strength of the shrinkage applied to the coefficients.")
+    print("Please indicate the coefficient of alpha. A good starting range could be between 0.001 and 2, such as 1.")
+    alpha = float_input(1.0, SECTION[2], "@Alpha: ")
+    print("Fit Intercept: This hyperparameter represents whether the intercept (constant term) is estimated for this model.")
+    print("Please indicate whether to fit the intercept. It is generally recommended to leave it set to True.")
+    fit_intercept = bool_input(SECTION[2])
+    print("Max Iter: This hyperparameter represents the maximum number of iterations for the solver to converge.")
+    print("Please indicate the maximum number of iterations. A good starting range could be between 1000 and 10000, such as 1000.")
+    max_iter = num_input(SECTION[2], "@Max Iter: ")
+    print("Tolerance: This hyperparameter represents the convergence tolerance of the solver.")
+    print("Please indicate the tolerance. A good starting range could be between 0.0001 and 0.001, such as 0.0001.")
+    tol = float_input(0.0001, SECTION[2], "@Tolerance: ")
+    # print("Selection: This hyperparameter represents the method of selecting the regularization coefficient.")
+    # print("Please indicate the method of selecting the regularization coefficient. It is generally recommended to leave it set to 'cyclic'.")
+    # selections = ["cyclic", "random"]
+    # selection = str_input(selections, SECTION[2])
+    hyper_parameters = {
+        "alpha": alpha,
+        "fit_intercept": fit_intercept,
+        "max_iter": max_iter,
+        "tol": tol,
+    }
+    return hyper_parameters
diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py
index 2c7d91f5..592e4937 100644
--- a/geochemistrypi/data_mining/model/regression.py
+++ b/geochemistrypi/data_mining/model/regression.py
@@ -11,7 +11,7 @@
 from multipledispatch import dispatch
 from rich import print
 from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
-from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso, LinearRegression, SGDRegressor
+from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso, LinearRegression, Ridge, SGDRegressor
 from sklearn.neighbors import KNeighborsRegressor
 from sklearn.neural_network import MLPRegressor
 from sklearn.preprocessing import PolynomialFeatures
@@ -33,6 +33,7 @@
 from .func.algo_regression._multi_layer_perceptron import multi_layer_perceptron_manual_hyper_parameters
 from .func.algo_regression._polynomial_regression import polynomial_regression_manual_hyper_parameters
 from .func.algo_regression._rf import random_forest_manual_hyper_parameters
+from .func.algo_regression._ridge_regression import ridge_regression_manual_hyper_parameters
 from .func.algo_regression._sgd_regression import sgd_regression_manual_hyper_parameters
 from .func.algo_regression._svr import svr_manual_hyper_parameters
 from .func.algo_regression._xgboost import xgboost_manual_hyper_parameters
@@ -3939,3 +3940,366 @@ def special_components(self, **kwargs) -> None:
     @dispatch(bool)
     def special_components(self, is_automl: bool, **kwargs) -> None:
         pass
+
+
+class RidgeRegression(LinearWorkflowMixin, RegressionWorkflowBase):
+    """The automation workflow of using Ridge Regression to make insightful products."""
+
+    name = "Ridge Regression"
+    special_function = ["Ridge Regression Formula", "2D Scatter Diagram", "3D Scatter Diagram", "2D Line Diagram", "3D Surface Diagram"]
+
+    def __init__(
+        self,
+        alpha: float = 1.0,
+        *,
+        fit_intercept: bool = True,
+        copy_X: bool = True,
+        max_iter: int = 1000,
+        tol: float = 1e-4,
+        solver: str = "auto",
+        positive: bool = False,
+        random_state: Optional[int] = None,
+    ) -> None:
+        """
+        Parameters
+        ----------
+        alpha : {float, ndarray of shape (n_targets,)}, default=1.0
+            Constant that multiplies the L2 term, controlling regularization
+            strength. `alpha` must be a non-negative float i.e. in `[0, inf)`.
+
+            When `alpha = 0`, the objective is equivalent to ordinary least
+            squares, solved by the :class:`LinearRegression` object. For numerical
+            reasons, using `alpha = 0` with the `Ridge` object is not advised.
+            Instead, you should use the :class:`LinearRegression` object.
+
+            If an array is passed, penalties are assumed to be specific to the
+            targets. Hence they must correspond in number.
+
+        fit_intercept : bool, default=True
+            Whether to fit the intercept for this model. If set
+            to false, no intercept will be used in calculations
+            (i.e. ``X`` and ``y`` are expected to be centered).
+
+        copy_X : bool, default=True
+            If True, X will be copied; else, it may be overwritten.
+
+        max_iter : int, default=1000
+            Maximum number of iterations for the conjugate gradient solver.
+            In scikit-learn the default is None, in which case it is determined
+            by scipy.sparse.linalg for the 'sparse_cg' and 'lsqr' solvers, is
+            1000 for the 'sag' solver, and is 15000 for the 'lbfgs' solver.
+
+        tol : float, default=1e-4
+            The precision of the solution (`coef_`) is determined by `tol` which
+            specifies a different convergence criterion for each solver:
+
+            - 'svd': `tol` has no impact.
+
+            - 'cholesky': `tol` has no impact.
+
+            - 'sparse_cg': norm of residuals smaller than `tol`.
+
+            - 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr,
+              which control the norm of the residual vector in terms of the norms of
+              matrix and coefficients.
+
+            - 'sag' and 'saga': relative change of coef smaller than `tol`.
+
+            - 'lbfgs': maximum of the absolute (projected) gradient=max|residuals|
+              smaller than `tol`.
+
+            .. versionchanged:: 1.2
+               Default value changed from 1e-3 to 1e-4 for consistency with other linear
+               models.
+
+        solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \
+                'sag', 'saga', 'lbfgs'}, default='auto'
+            Solver to use in the computational routines:
+
+            - 'auto' chooses the solver automatically based on the type of data.
+
+            - 'svd' uses a Singular Value Decomposition of X to compute the Ridge
+              coefficients. It is the most stable solver, in particular more stable
+              for singular matrices than 'cholesky' at the cost of being slower.
+
+            - 'cholesky' uses the standard scipy.linalg.solve function to
+              obtain a closed-form solution.
+
+            - 'sparse_cg' uses the conjugate gradient solver as found in
+              scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
+              more appropriate than 'cholesky' for large-scale data
+              (possibility to set `tol` and `max_iter`).
+
+            - 'lsqr' uses the dedicated regularized least-squares routine
+              scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative
+              procedure.
+
+            - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
+              its improved, unbiased version named SAGA. Both methods also use an
+              iterative procedure, and are often faster than other solvers when
+              both n_samples and n_features are large. Note that 'sag' and
+              'saga' fast convergence is only guaranteed on features with
+              approximately the same scale. You can preprocess the data with a
+              scaler from sklearn.preprocessing.
+
+            - 'lbfgs' uses L-BFGS-B algorithm implemented in
+              `scipy.optimize.minimize`. It can be used only when `positive`
+              is True.
+
+            All solvers except 'svd' support both dense and sparse data. However, only
+            'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when
+            `fit_intercept` is True.
+
+            .. versionadded:: 0.17
+               Stochastic Average Gradient descent solver.
+            .. versionadded:: 0.19
+               SAGA solver.
+
+        positive : bool, default=False
+            When set to ``True``, forces the coefficients to be positive.
+            Only 'lbfgs' solver is supported in this case.
+
+        random_state : int, RandomState instance, default=None
+            Used when ``solver`` == 'sag' or 'saga' to shuffle the data.
+            See :term:`Glossary <random_state>` for details.
+
+            .. versionadded:: 0.17
+               `random_state` to support Stochastic Average Gradient.
+
+        References
+        ----------
+        Scikit-learn API: sklearn.linear_model.Ridge
+        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
+
+        """
+        super().__init__()
+        self.alpha = alpha
+        self.fit_intercept = fit_intercept
+        self.copy_X = copy_X
+        self.max_iter = max_iter
+        self.tol = tol
+        self.solver = solver
+        self.positive = positive
+        self.random_state = random_state
+
+        self.model = Ridge(
+            alpha=self.alpha,
+            fit_intercept=self.fit_intercept,
+            copy_X=self.copy_X,
+            max_iter=self.max_iter,
+            tol=self.tol,
+            solver=self.solver,
+            positive=self.positive,
+            random_state=self.random_state,
+        )
+
+        self.naming = RidgeRegression.name
+        self.customized = True
+        self.customized_name = "Ridge"
+
+    @property
+    def settings(self) -> Dict:
+        """The configuration of Ridge Regression to implement AutoML by FLAML framework."""
+        configuration = {
+            "time_budget": 10,  # total running time in seconds
+            "metric": "r2",
+            "estimator_list": [self.customized_name],  # list of ML learners
+            "task": "regression",  # task type
+            # "log_file_name": f'{self.naming} - automl.log',  # flaml log file
+            # "log_training_metric": True,  # whether to log training metric
+        }
+        return configuration
+
+    @property
+    def customization(self) -> object:
+        """The customized Ridge Regression of FLAML framework."""
+        from flaml import tune
+        from flaml.data import REGRESSION
+        from flaml.model import SKLearnEstimator
+        from sklearn.linear_model import Ridge
+
+        class MyRidgeRegression(SKLearnEstimator):
+            def __init__(self, task="regression", n_jobs=None, **config):
+                super().__init__(task, **config)
+                if task in REGRESSION:
+                    self.estimator_class = Ridge
+
+            @classmethod
+            def search_space(cls, data_size, task):
+                space = {
+                    "alpha": {"domain": tune.uniform(lower=0.001, upper=10), "init_value": 1},
+                    "fit_intercept": {"domain": tune.choice([True, False])},
+                    "max_iter": {"domain": tune.randint(lower=500, upper=2000), "init_value": 1000},
+                    "tol": {"domain": tune.uniform(lower=1e-5, upper=1e-3), "init_value": 1e-4},
+                    # "selection": {"domain": tune.choice(["cyclic", "random"])},
+                    # "random_state": {'domain': tune.choice(["", ""])}
+                }
+                return space
+
+        return MyRidgeRegression
+
+    @classmethod
+    def manual_hyper_parameters(cls) -> Dict:
+        """Manual hyper-parameters specification."""
+        print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
+        hyper_parameters = ridge_regression_manual_hyper_parameters()
+        clear_output()
+        return hyper_parameters
+
+    @dispatch()
+    def special_components(self, **kwargs) -> None:
+        """Invoke all special application functions for this algorithm by Scikit-learn framework."""
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
+        self._show_formula(
+            coef=[self.model.coef_],
+            intercept=self.model.intercept_,
+            features_name=RidgeRegression.X_train.columns,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
+            mlflow_path="root",
+        )
+        columns_num = RidgeRegression.X.shape[1]
+        if columns_num > 2:
+            # choose one of dimensions to draw
+            two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1)
+            self._plot_2d_scatter_diagram(
+                feature_data=two_dimen_data,
+                target_data=RidgeRegression.y_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+            # choose two of dimensions to draw
+            three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 2)
+            self._plot_3d_scatter_diagram(
+                feature_data=three_dimen_data,
+                target_data=RidgeRegression.y_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+        elif columns_num == 2:
+            # choose one of dimensions to draw
+            two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1)
+            self._plot_2d_scatter_diagram(
+                feature_data=two_dimen_data,
+                target_data=RidgeRegression.y_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+            # no need to choose
+            self._plot_3d_scatter_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+            self._plot_3d_surface_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                y_test_predict=RidgeRegression.y_test_predict,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+        elif columns_num == 1:
+            # no need to choose
+            self._plot_2d_scatter_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+            self._plot_2d_line_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                y_test_predict=RidgeRegression.y_test_predict,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+        else:
+            pass
+
+    @dispatch(bool)
+    def special_components(self, is_automl: bool, **kwargs) -> None:
+        """Invoke all special application functions for this algorithm by FLAML framework."""
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
+        self._show_formula(
+            coef=[self.auto_model.coef_],
+            intercept=self.auto_model.intercept_,
+            features_name=RidgeRegression.X_train.columns,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
+            mlflow_path="root",
+        )
+        columns_num = RidgeRegression.X.shape[1]
+        if columns_num > 2:
+            # choose one of dimensions to draw
+            two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1)
+            self._plot_2d_scatter_diagram(
+                feature_data=two_dimen_data,
+                target_data=RidgeRegression.y_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+            # choose two of dimensions to draw
+            three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 2)
+            self._plot_3d_scatter_diagram(
+                feature_data=three_dimen_data,
+                target_data=RidgeRegression.y_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+        elif columns_num == 2:
+            # choose one of dimensions to draw
+            two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1)
+            self._plot_2d_scatter_diagram(
+                feature_data=two_dimen_data,
+                target_data=RidgeRegression.y_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+            # no need to choose
+            self._plot_3d_scatter_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+            self._plot_3d_surface_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                y_test_predict=RidgeRegression.y_test_predict,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+        elif columns_num == 1:
+            # no need to choose
+            self._plot_2d_scatter_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+            self._plot_2d_line_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                y_test_predict=RidgeRegression.y_test_predict,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            )
+        else:
+            pass
diff --git a/geochemistrypi/data_mining/process/regress.py b/geochemistrypi/data_mining/process/regress.py
index d6b9c07d..36d2a7fa 100644
--- a/geochemistrypi/data_mining/process/regress.py
+++ b/geochemistrypi/data_mining/process/regress.py
@@ -20,6 +20,7 @@
     PolynomialRegression,
     RandomForestRegression,
     RegressionWorkflowBase,
+    RidgeRegression,
     SGDRegression,
     SVMRegression,
     XGBoostRegression,
@@ -197,6 +198,14 @@ def activate(
                 copy_X=hyper_parameters["copy_X"],
                 verbose=hyper_parameters["verbose"],
             )
+        elif self.model_name == "Ridge Regression":
+            hyper_parameters = RidgeRegression.manual_hyper_parameters()
+            self.reg_workflow = RidgeRegression(
+                alpha=hyper_parameters["alpha"],
+                fit_intercept=hyper_parameters["fit_intercept"],
+                max_iter=hyper_parameters["max_iter"],
+                tol=hyper_parameters["tol"],
+            )
 
         self.reg_workflow.show_info()
@@ -270,6 +279,8 @@ def activate(
             self.reg_workflow = SGDRegression()
         elif self.model_name == "BayesianRidge Regression":
             self.reg_workflow = BayesianRidgeRegression()
+        elif self.model_name == "Ridge Regression":
+            self.reg_workflow = RidgeRegression()
 
         self.reg_workflow.show_info()
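
Reviewer note (not part of the patch): stripped of the workflow plumbing, the estimator this class wires up is a plain scikit-learn `Ridge`. Below is a minimal standalone sketch of what the manual path configures, using the same defaults the patch exposes (`alpha=1.0`, `fit_intercept=True`, `max_iter=1000`, `tol=1e-4`, `solver="auto"`); the synthetic data is purely illustrative.

```python
import numpy as np
from sklearn.linear_model import Ridge

# Illustrative synthetic data: y = 3*x0 - 2*x1 + 1 + noise
rng = np.random.default_rng(42)
X = rng.normal(size=(100, 2))
y = 3 * X[:, 0] - 2 * X[:, 1] + 1 + rng.normal(scale=0.1, size=100)

# Same hyperparameter defaults as RidgeRegression.__init__ in this patch
model = Ridge(alpha=1.0, fit_intercept=True, max_iter=1000, tol=1e-4, solver="auto")
model.fit(X, y)

# Mirrors what _show_formula() reports: fitted coefficients and intercept
print("coef:", model.coef_, "intercept:", model.intercept_)
```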
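The `settings`/`customization` pair follows FLAML's custom-estimator pattern: `customized_name` ("Ridge") is the learner key, and `search_space` bounds what the tuner explores. A hedged sketch of how such a learner is registered and tuned outside the workflow; `MyRidgeRegression` and its search space are taken from the patch, while the toy dataset and the `add_learner` registration (standard FLAML API) are illustrative assumptions:

```python
from flaml import AutoML, tune
from flaml.data import REGRESSION
from flaml.model import SKLearnEstimator
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge


class MyRidgeRegression(SKLearnEstimator):
    """Custom FLAML learner wrapping scikit-learn's Ridge, as in the patch."""

    def __init__(self, task="regression", n_jobs=None, **config):
        super().__init__(task, **config)
        if task in REGRESSION:
            self.estimator_class = Ridge

    @classmethod
    def search_space(cls, data_size, task):
        # Same search space as the customization property above
        return {
            "alpha": {"domain": tune.uniform(lower=0.001, upper=10), "init_value": 1},
            "fit_intercept": {"domain": tune.choice([True, False])},
            "max_iter": {"domain": tune.randint(lower=500, upper=2000), "init_value": 1000},
            "tol": {"domain": tune.uniform(lower=1e-5, upper=1e-3), "init_value": 1e-4},
        }


# Illustrative toy dataset
X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=0)

automl = AutoML()
automl.add_learner(learner_name="Ridge", learner_class=MyRidgeRegression)  # register the custom learner
# Mirrors the settings property: 10 s budget, r2 metric, only the custom learner
automl.fit(X_train=X, y_train=y, task="regression", metric="r2", time_budget=10, estimator_list=["Ridge"])
print(automl.best_config)
```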