diff --git a/geochemistrypi/data_mining/model/_base.py b/geochemistrypi/data_mining/model/_base.py
index a629e47..312638c 100644
--- a/geochemistrypi/data_mining/model/_base.py
+++ b/geochemistrypi/data_mining/model/_base.py
@@ -1,400 +1,405 @@
-# -*- coding: utf-8 -*-
-import json
-import os
-from abc import ABCMeta, abstractmethod
-from typing import Dict, List, Optional, Tuple, Union
-
-import mlflow
-import numpy as np
-import pandas as pd
-from multipledispatch import dispatch
-from rich import print
-
-from ..constants import SECTION
-from ..data.data_readiness import limit_num_input, num2option, num_input, show_data_columns
-from ..utils.base import save_data, save_fig, save_model, save_text
-from .func._common_supervised import plot_decision_tree, plot_feature_importance, plot_permutation_importance, show_formula
-from .func.algo_regression._linear_regression import plot_2d_line_diagram, plot_2d_scatter_diagram, plot_3d_scatter_diagram, plot_3d_surface_diagram
-
-
-class WorkflowBase(metaclass=ABCMeta):
-    """Base class for all workflow classes in geochemistry π."""
-
-    # Default for child class. They need to be overwritten in child classes.
-    name = None
-    common_function = []
-    special_function = []
-    X, y = None, None
-    X_train, X_test, y_train, y_test = None, None, None, None
-    y_test_predict = None
-
-    @classmethod
-    def show_info(cls) -> None:
-        """Display what application functions the algorithm will provide."""
-        print("*-*" * 2, cls.name, "is running ...", "*-*" * 2)
-        print("Expected Functionality:")
-        function = cls.common_function + cls.special_function
-        for i in range(len(function)):
-            print("+ ", function[i])
-
-    def __init__(self) -> None:
-        # Default for child class. They need to be overwritten in child classes.
-        self.model = None
-        self.naming = None
-        self.automl = None
-        self.ray_best_model = None
-        self.random_state = 42
-
-    @property
-    def image_config(self):
-        return {
-            # Picture layout
-            "width": 16,  # number of subgraph rows
-            "height": 9,  # number of subgraph columns
-            "dpi": 360,  # resolution
-            # Main content
-            "cmap": "coolwarm_r",  # color setting
-            "cmap2": "Wistia",
-            "marker_angle": "^",  # point type
-            "marker_circle": "o",  # point type
-            "edgecolor": "w",  # point edge color
-            "markersize1": 18,  # point size
-            "markersize2": 6,
-            "alpha1": 0.4,  # point transparency
-            "alpha2": 0.95,
-            "linestyle": "-",
-            # bar
-            "bar_color": "blue",
-            "bar_align": "center",
-            "bar_x": range(len(self.X.columns)),  # the sequence of horizontal coordinates of the bar
-            "bar_height": None,  # the height(s) of the bars
-            "bar_label": self.X.columns,  # The label on the X-axis
-            "bar_width": 0.3,  # the width(s) of the bars
-            "bottom": 0,  # the y coordinate(s) of the bars bases
-            # Convert the font of the axes
-            "labelsize": 5,  # the font size of the axis label
-            "xrotation": 0,  # x axis label rotation Angle
-            "xha": "center",  # x axis 'ha'
-            "rot": 90,  # y axis label rotation Angle
-            "yha": "center",  # y axis 'ha'
-            "axislabelfont": "Times New Roman",  # axis label font
-            # Picture title adjustment
-            "title_label": self.naming,  # picture name
-            "title_size": 15,  # title font size
-            "title_color": "k",
-            "title_location": "center",
-            "title_font": "Times New Roman",
-            "title_pad": 2,
-            # Tree parameter
-            "max_depth": None,  # The maximum depth of the representation
-            "feature_names": None,  # Names of each of the features
-            "class_names": ["class" + str(i) for i in range(1, 1000)],  # Names of each of the target classes in ascending numerical order
-            "label": "all",  # Whether to show informative labels for impurity, etc
-            "filled": True,  # color filling
-            "impurity": True,  # When set to True, show the impurity at each node
-            "node_ids": None,  # When set to True, show the ID number on each node
-            "proportion": False,  # When set to True, change the display of ‘values’ and/or ‘samples’ to be proportions and percentages respectively
-            "rounded": True,  # When set to True, draw node boxes with rounded corners and use Helvetica fonts instead of Times-Roman
-            "precision": 3,  # Number of digits of precision for floating point in the values of impurity, threshold and value attributes of each node
-            "ax": None,  # axes to plot to.
-            "fontsize": None,  # size of text font
-        }
-
-    @abstractmethod
-    def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
-        """Placeholder for fit. child classes should implement this method!
-
-        Parameters
-        ----------
-        X : pd.DataFrame (n_samples, n_features)
-            Training data, where `n_samples` is the number of samples and n_features` is the number of features.
-
-        y : pd.DataFrame, default=None (n_samples,) or (n_samples, n_targets)
-            Target values。
-        """
-        return None
-
-    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
-        """The interface for the child classes."""
-        return pd.DataFrame()
-
-    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
-        """The interface for the child classes."""
-        return pd.DataFrame()
-
-    @abstractmethod
-    def manual_hyper_parameters(cls) -> Dict:
-        """Placeholder for manual_hyper_parameters. child classes should implement this method!
-
-        Parameters
-        ----------
-        kwargs : dict
-            The hyper parameters of the model.
-        """
-        return dict()
-
-    @staticmethod
-    def score(y_true: Union[pd.DataFrame, np.ndarray], y_predict: Union[pd.DataFrame, np.ndarray]) -> Union[int, float]:
-        """The interface for the child classes."""
-        return float()
-
-    @staticmethod
-    def np2pd(array: np.ndarray, columns_name: Union[List[str], pd.Index]) -> pd.DataFrame:
-        """The type of the data set is transformed from numpy.ndarray to pandas.DataFrame.
-
-        Parameters
-        ----------
-        array : np.ndarray (n_samples, n_features)
-            the data set.
-
-        columns_name : list[str] or pd.Index
-            the name of the columns of the data set.
-
-        Returns
-        -------
-        pd.DataFrame (n_samples, n_features)
-            the data set.
-        """
-        return pd.DataFrame(array, columns=columns_name)
-
-    @staticmethod
-    def choose_dimension_data(data: pd.DataFrame, dimensions: int) -> Tuple[List[int], pd.DataFrame]:
-        """Choose a subgroup data from the whole data set to draw 2d or 3d graph.
-
-        Parameters
-        ----------
-        data : pd.DataFrame (n_samples, n_features)
-            the whole data.
-        dimensions : int
-            how much dimensions data to keep.
-
-        Returns
-        -------
-        selected_axis_index : list[int]
-            the index of the that dimension, which is shown as the index of the column of the data set.
-
-        selected_axis_data : pd.DataFrame (n_samples, n_features)
-            the selected data from the whole data set.
-        """
-        print(f"-----* {dimensions} Dimensions Data Selection *-----")
-        print(f"The software is going to draw related {dimensions}d graphs.")
-        print(f"Currently, the data dimension is beyond {dimensions} dimensions.")
-        print(f"Please choose {dimensions} dimensions of the data below.")
-        data = pd.DataFrame(data)
-        selected_axis_index = []
-        selected_axis_name = []
-        for i in range(1, dimensions + 1):
-            num2option(data.columns)
-            print(f"Choose dimension - {i} data:")
-            index_axis = limit_num_input(data.columns, SECTION[3], num_input)
-            selected_axis_index.append(index_axis - 1)
-            selected_axis_name.append(data.columns[index_axis - 1])
-        selected_axis_data = data.loc[:, selected_axis_name]
-        print("The Selected Data Dimension:")
-        show_data_columns(selected_axis_name)
-        return selected_axis_index, selected_axis_data
-
-    @staticmethod
-    def data_upload(
-        X: Optional[pd.DataFrame] = None,
-        y: Optional[pd.DataFrame] = None,
-        X_train: Optional[pd.DataFrame] = None,
-        X_test: Optional[pd.DataFrame] = None,
-        y_train: Optional[pd.DataFrame] = None,
-        y_test: Optional[pd.DataFrame] = None,
-        name_train: Optional[pd.Series] = None,
-        name_test: Optional[pd.Series] = None,
-        name_all: Optional[pd.Series] = None,
-        y_train_predict: Optional[pd.DataFrame] = None,
-        y_test_predict: Optional[pd.DataFrame] = None,
-    ) -> None:
-        """This method loads the required data into the base class's attributes."""
-        if X is not None:
-            WorkflowBase.X = X
-        if y is not None:
-            WorkflowBase.y = y
-        if X_train is not None:
-            WorkflowBase.X_train = X_train
-        if X_test is not None:
-            WorkflowBase.X_test = X_test
-        if y_train is not None:
-            WorkflowBase.y_train = y_train
-        if y_test is not None:
-            WorkflowBase.y_test = y_test
-        if name_train is not None:
-            WorkflowBase.name_train = name_train
-        if name_test is not None:
-            WorkflowBase.name_test = name_test
-        if name_all is not None:
-            WorkflowBase.name_all = name_all
-        if y_test_predict is not None:
-            WorkflowBase.y_test_predict = y_test_predict
-        if y_train_predict is not None:
-            WorkflowBase.y_train_predict = y_train_predict
-
-    @staticmethod
-    def data_save(df: pd.DataFrame, name: str, df_name: str, local_path: str, mlflow_path: str, slogan: str) -> None:
-        """This method saves the data into the local path and the mlflow path.
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            The data to be saved.
-
-        name: str
-            The name.
-        df_name : str
-            The name of the data.
-
-        local_path : str
-            The local path to save the data.
-
-        mlflow_path : str
-            The mlflow path to save the data.
-
-        slogan : str
-            The title of the output section.
-        """
-        print(f"-----* {slogan} *-----")
-        print(df)
-        save_data(df, name, df_name, local_path, mlflow_path)
-
-    @staticmethod
-    def save_hyper_parameters(hyper_parameters_dict: Dict, model_name: str, local_path: str) -> None:
-        """This method saves the hyper parameters into the local path.
-
-        Parameters
-        ----------
-        hyper_parameters_dict : dict
-            The hyper parameters of the model.
-
-        model_name : str
-            The name of the model.
-
-        local_path : str
-            The local path to save the hyper parameters.
-        """
-        hyper_parameters_str = json.dumps(hyper_parameters_dict, indent=4)
-        save_text(hyper_parameters_str, f"Hyper Parameters - {model_name}", local_path)
-        mlflow.log_params(hyper_parameters_dict)
-
-    @dispatch()
-    def model_save(self) -> None:
-        """Persist the model for future use after training the model with Scikit-learn framework."""
-        print("-----* Model Persistence *-----")
-        GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH")
-        save_model(self.model, self.naming, self.X_train.iloc[[0]], GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH)
-
-    @dispatch(bool)
-    def model_save(self, is_automl: bool) -> None:
-        """Persist the model for future use after training the model with FLAML framework."""
-        print("-----* Model Persistence *-----")
-        GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH")
-        save_model(self.auto_model, self.naming, self.X_train.iloc[[0]], GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH)
-
-    @staticmethod
-    def _plot_permutation_importance(
-        X_test: pd.DataFrame,
-        y_test: pd.DataFrame,
-        name_column: str,
-        trained_model: object,
-        image_config: dict,
-        algorithm_name: str,
-        local_path: str,
-        mlflow_path: str,
-    ) -> None:
-        """Permutation importance plot."""
-        print("-----* Permutation Importance Diagram *-----")
-        importances_mean, importances_std, importances = plot_permutation_importance(X_test, y_test, trained_model, image_config)
-        save_fig(f"Permutation Importance - {algorithm_name}", local_path, mlflow_path)
-        save_data(X_test, name_column, "Permutation Importance - X Test", local_path, mlflow_path)
-        save_data(y_test, name_column, "Permutation Importance - Y Test", local_path, mlflow_path)
-        data_dict = {"importances_mean": importances_mean.tolist(), "importances_std": importances_std.tolist(), "importances": importances.tolist()}
-        data_str = json.dumps(data_dict, indent=4)
-        save_text(data_str, f"Permutation Importance - {algorithm_name}", local_path, mlflow_path)
-
-
-class TreeWorkflowMixin:
-    """Mixin class for tree models."""
-
-    @staticmethod
-    def _plot_feature_importance(X_train: pd.DataFrame, name_column: str, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
-        """Draw the feature importance bar diagram."""
-        print("-----* Feature Importance Diagram *-----")
-        columns_name = X_train.columns
-        feature_importances = trained_model.feature_importances_
-        data = plot_feature_importance(columns_name, feature_importances, image_config)
-        save_fig(f"Feature Importance - {algorithm_name}", local_path, mlflow_path)
-        save_data(data, name_column, f"Feature Importance - {algorithm_name}", local_path, mlflow_path, True)
-
-    @staticmethod
-    def _plot_tree(trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
-        """Drawing decision tree diagrams."""
-        print("-----* Single Tree Diagram *-----")
-        plot_decision_tree(trained_model, image_config)
-        save_fig(f"Tree Diagram - {algorithm_name}", local_path, mlflow_path)
-
-
-class LinearWorkflowMixin:
-    """Mixin class for linear models."""
-
-    @staticmethod
-    def _show_formula(
-        coef: np.ndarray, intercept: np.ndarray, features_name: np.ndarray, algorithm_name: str, regression_classification: str, y_train: pd.DataFrame, local_path: str, mlflow_path: str
-    ) -> None:
-        """Show the formula."""
-        print(f"-----* {algorithm_name} Formula *-----")
-        formula = show_formula(coef, intercept, features_name, regression_classification, y_train)
-        formula_str = json.dumps(formula, indent=4)
-        save_text(formula_str, f"{algorithm_name} Formula", local_path, mlflow_path)
-
-    @staticmethod
-    def _plot_2d_scatter_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
-        """Plot the 2D graph of the linear regression model."""
-        print("-----* 2D Scatter Diagram *-----")
-        plot_2d_scatter_diagram(feature_data, target_data)
-        save_fig(f"2D Scatter Diagram - {algorithm_name}", local_path, mlflow_path)
-        data = pd.concat([feature_data, target_data], axis=1)
-        save_data(data, data_name, f"2D Scatter Diagram - {algorithm_name}", local_path, mlflow_path)
-
-    @staticmethod
-    def _plot_2d_line_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, y_test_predict: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
-        """Plot the 2D graph of the linear regression model."""
-        print("-----* 2D Line Diagram *-----")
-        plot_2d_line_diagram(feature_data, target_data, y_test_predict)
-        save_fig(f"2D Line Diagram - {algorithm_name}", local_path, mlflow_path)
-        data = pd.concat([feature_data, target_data, y_test_predict], axis=1)
-        save_data(data, data_name, f"2D Line Diagram - {algorithm_name}", local_path, mlflow_path)
-
-    @staticmethod
-    def _plot_3d_scatter_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
-        """Plot the 3D graph of the linear regression model."""
-        print("-----* 3D Scatter Diagram *-----")
-        plot_3d_scatter_diagram(feature_data, target_data)
-        save_fig(f"3D Scatter Diagram - {algorithm_name}", local_path, mlflow_path)
-        data = pd.concat([feature_data, target_data], axis=1)
-        save_data(data, data_name, f"3D Scatter Diagram - {algorithm_name}", local_path, mlflow_path)
-
-    @staticmethod
-    def _plot_3d_surface_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, y_test_predict: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
-        """Plot the 3D graph of the linear regression model."""
-        print("-----* 3D Surface Diagram *-----")
-        plot_3d_surface_diagram(feature_data, target_data, y_test_predict)
-        save_fig(f"3D Surface Diagram - {algorithm_name}", local_path, mlflow_path)
-        data = pd.concat([feature_data, target_data, y_test_predict], axis=1)
-        save_data(data, data_name, f"3D Surface Diagram - {algorithm_name}", local_path, mlflow_path)
-
-
-class ClusteringMetricsMixin:
-    """Mixin class for clustering metrics."""
-
-    @staticmethod
-    def _get_num_clusters(labels: pd.Series, func_name: str, algorithm_name: str, store_path: str) -> None:
-        """Get and log the number of clusters.
-        It is only used in those algorithms which don't allow to set the number of cluster in advance."""
-        print(f"-----* {func_name} *-----")
-        num_clusters = len(np.unique(labels.to_numpy()))
-        print(f"{func_name}: {num_clusters}")
-        num_clusters_dict = {f"{func_name}": num_clusters}
-        mlflow.log_metrics(num_clusters_dict)
-        num_clusters_str = json.dumps(num_clusters_dict, indent=4)
-        save_text(num_clusters_str, f"{func_name} - {algorithm_name}", store_path)
+# -*- coding: utf-8 -*-
+import json
+import os
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Optional, Tuple, Union
+
+import mlflow
+import numpy as np
+import pandas as pd
+from multipledispatch import dispatch
+from rich import print
+
+from ..constants import SECTION
+from ..data.data_readiness import limit_num_input, num2option, num_input, show_data_columns
+from ..utils.base import save_data, save_fig, save_model, save_text
+from .func._common_supervised import plot_decision_tree, plot_feature_importance, plot_permutation_importance, show_formula
+from .func.algo_regression._linear_regression import plot_2d_line_diagram, plot_2d_scatter_diagram, plot_3d_scatter_diagram, plot_3d_surface_diagram
+
+
+class WorkflowBase(metaclass=ABCMeta):
+    """Base class for all workflow classes in geochemistry π."""
+
+    # Default for child class. They need to be overwritten in child classes.
+    name = None
+    common_function = []
+    special_function = []
+    X, y = None, None
+    X_train, X_test, y_train, y_test = None, None, None, None
+    y_test_predict = None
+
+    @classmethod
+    def show_info(cls) -> None:
+        """Display what application functions the algorithm will provide."""
+        print("*-*" * 2, cls.name, "is running ...", "*-*" * 2)
+        print("Expected Functionality:")
+        function = cls.common_function + cls.special_function
+        for i in range(len(function)):
+            print("+ ", function[i])
+
+    def __init__(self) -> None:
+        # Default for child class. They need to be overwritten in child classes.
+        self.model = None
+        self.naming = None
+        self.automl = None
+        self.ray_best_model = None
+        self.random_state = 42
+
+    @property
+    def image_config(self):
+        return {
+            # Picture layout
+            "width": 16,  # number of subgraph rows
+            "height": 9,  # number of subgraph columns
+            "dpi": 360,  # resolution
+            # Main content
+            "cmap": "coolwarm_r",  # color setting
+            "cmap2": "Wistia",
+            "marker_angle": "^",  # point type
+            "marker_circle": "o",  # point type
+            "edgecolor": "w",  # point edge color
+            "markersize1": 18,  # point size
+            "markersize2": 6,
+            "alpha1": 0.4,  # point transparency
+            "alpha2": 0.95,
+            "linestyle": "-",
+            # bar
+            "bar_color": "blue",
+            "bar_align": "center",
+            "bar_x": range(len(self.X.columns)),  # the sequence of horizontal coordinates of the bar
+            "bar_height": None,  # the height(s) of the bars
+            "bar_label": self.X.columns,  # The label on the X-axis
+            "bar_width": 0.3,  # the width(s) of the bars
+            "bottom": 0,  # the y coordinate(s) of the bars bases
+            # Convert the font of the axes
+            "labelsize": 5,  # the font size of the axis label
+            "xrotation": 0,  # x axis label rotation Angle
+            "xha": "center",  # x axis 'ha'
+            "rot": 90,  # y axis label rotation Angle
+            "yha": "center",  # y axis 'ha'
+            "axislabelfont": "Times New Roman",  # axis label font
+            # Picture title adjustment
+            "title_label": self.naming,  # picture name
+            "title_size": 15,  # title font size
+            "title_color": "k",
+            "title_location": "center",
+            "title_font": "Times New Roman",
+            "title_pad": 2,
+            # Tree parameter
+            "max_depth": None,  # The maximum depth of the representation
+            "feature_names": None,  # Names of each of the features
+            "class_names": ["class" + str(i) for i in range(1, 1000)],  # Names of each of the target classes in ascending numerical order
+            "label": "all",  # Whether to show informative labels for impurity, etc
+            "filled": True,  # color filling
+            "impurity": True,  # When set to True, show the impurity at each node
+            "node_ids": None,  # When set to True, show the ID number on each node
+            "proportion": False,  # When set to True, change the display of ‘values’ and/or ‘samples’ to be proportions and percentages respectively
+            "rounded": True,  # When set to True, draw node boxes with rounded corners and use Helvetica fonts instead of Times-Roman
+            "precision": 3,  # Number of digits of precision for floating point in the values of impurity, threshold and value attributes of each node
+            "ax": None,  # axes to plot to.
+            "fontsize": None,  # size of text font
+        }
+
+    @abstractmethod
+    def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
+        """Placeholder for fit. child classes should implement this method!
+
+        Parameters
+        ----------
+        X : pd.DataFrame (n_samples, n_features)
+            Training data, where `n_samples` is the number of samples and n_features` is the number of features.
+
+        y : pd.DataFrame, default=None (n_samples,) or (n_samples, n_targets)
+            Target values。
+        """
+        return None
+
+    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
+        """The interface for the child classes."""
+        return pd.DataFrame()
+
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+        """The interface for the child classes."""
+        return pd.DataFrame()
+
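As a minimal sketch of the contract this base class imposes (hypothetical DemoRegression class and toy model, not part of this patch; assumes WorkflowBase is importable from geochemistrypi.data_mining.model._base): because WorkflowBase uses ABCMeta, a child class becomes instantiable only once it overrides the two @abstractmethod placeholders, fit and manual_hyper_parameters.

import pandas as pd
from sklearn.linear_model import LinearRegression

from geochemistrypi.data_mining.model._base import WorkflowBase


class DemoRegression(WorkflowBase):
    """Hypothetical child class for illustration only."""

    name = "Demo Regression"

    def __init__(self) -> None:
        super().__init__()
        self.model = LinearRegression()  # a concrete estimator replaces the None default
        self.naming = DemoRegression.name

    def fit(self, X: pd.DataFrame, y=None) -> None:
        self.model.fit(X, y)  # overrides the abstract placeholder

    @classmethod
    def manual_hyper_parameters(cls) -> dict:
        return {}  # overrides the abstract placeholder, as the concrete workflows do


demo = DemoRegression()  # would raise TypeError if either abstract method were missing
demo.fit(pd.DataFrame({"SiO2": [50.1, 60.2]}), pd.DataFrame({"MgO": [7.3, 3.8]}))
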
+ """ + return dict() + + @staticmethod + def score(y_true: Union[pd.DataFrame, np.ndarray], y_predict: Union[pd.DataFrame, np.ndarray]) -> Union[int, float]: + """The interface for the child classes.""" + return float() + + @staticmethod + def np2pd(array: np.ndarray, columns_name: Union[List[str], pd.Index]) -> pd.DataFrame: + """The type of the data set is transformed from numpy.ndarray to pandas.DataFrame. + + Parameters + ---------- + array : np.ndarray (n_samples, n_features) + the data set. + + columns_name : list[str] or pd.Index + the name of the columns of the data set. + + Returns + ------- + pd.DataFrame (n_samples, n_features) + the data set. + """ + return pd.DataFrame(array, columns=columns_name) + + @staticmethod + def choose_dimension_data(data: pd.DataFrame, dimensions: int) -> Tuple[List[int], pd.DataFrame]: + """Choose a subgroup data from the whole data set to draw 2d or 3d graph. + + Parameters + ---------- + data : pd.DataFrame (n_samples, n_features) + the whole data. + dimensions : int + how much dimensions data to keep. + + Returns + ------- + selected_axis_index : list[int] + the index of the that dimension, which is shown as the index of the column of the data set. + + selected_axis_data : pd.DataFrame (n_samples, n_features) + the selected data from the whole data set. + """ + print(f"-----* {dimensions} Dimensions Data Selection *-----") + print(f"The software is going to draw related {dimensions}d graphs.") + print(f"Currently, the data dimension is beyond {dimensions} dimensions.") + print(f"Please choose {dimensions} dimensions of the data below.") + data = pd.DataFrame(data) + selected_axis_index = [] + selected_axis_name = [] + for i in range(1, dimensions + 1): + num2option(data.columns) + print(f"Choose dimension - {i} data:") + index_axis = limit_num_input(data.columns, SECTION[3], num_input) + selected_axis_index.append(index_axis - 1) + selected_axis_name.append(data.columns[index_axis - 1]) + selected_axis_data = data.loc[:, selected_axis_name] + print("The Selected Data Dimension:") + show_data_columns(selected_axis_name) + return selected_axis_index, selected_axis_data + + @staticmethod + def data_upload( + X: Optional[pd.DataFrame] = None, + y: Optional[pd.DataFrame] = None, + X_train: Optional[pd.DataFrame] = None, + X_test: Optional[pd.DataFrame] = None, + y_train: Optional[pd.DataFrame] = None, + y_test: Optional[pd.DataFrame] = None, + name_train: Optional[pd.Series] = None, + name_test: Optional[pd.Series] = None, + name_all: Optional[pd.Series] = None, + y_train_predict: Optional[pd.DataFrame] = None, + y_test_predict: Optional[pd.DataFrame] = None, + ) -> None: + """This method loads the required data into the base class's attributes.""" + if X is not None: + WorkflowBase.X = X + if y is not None: + WorkflowBase.y = y + if X_train is not None: + WorkflowBase.X_train = X_train + if X_test is not None: + WorkflowBase.X_test = X_test + if y_train is not None: + WorkflowBase.y_train = y_train + if y_test is not None: + WorkflowBase.y_test = y_test + if name_train is not None: + WorkflowBase.name_train = name_train + if name_test is not None: + WorkflowBase.name_test = name_test + if name_all is not None: + WorkflowBase.name_all = name_all + if y_test_predict is not None: + WorkflowBase.y_test_predict = y_test_predict + if y_train_predict is not None: + WorkflowBase.y_train_predict = y_train_predict + + @staticmethod + def data_save(df: pd.DataFrame, name: str, df_name: str, local_path: str, mlflow_path: str, slogan: str) -> None: + """This 
method saves the data into the local path and the mlflow path. + + Parameters + ---------- + df : pd.DataFrame + The data to be saved. + + name: str + The name. + df_name : str + The name of the data. + + local_path : str + The local path to save the data. + + mlflow_path : str + The mlflow path to save the data. + + slogan : str + The title of the output section. + """ + print(f"-----* {slogan} *-----") + print(df) + save_data(df, name, df_name, local_path, mlflow_path) + + @staticmethod + def save_hyper_parameters(hyper_parameters_dict: Dict, model_name: str, local_path: str) -> None: + """This method saves the hyper parameters into the local path. + + Parameters + ---------- + hyper_parameters_dict : dict + The hyper parameters of the model. + + model_name : str + The name of the model. + + local_path : str + The local path to save the hyper parameters. + """ + hyper_parameters_str = json.dumps(hyper_parameters_dict, indent=4) + save_text(hyper_parameters_str, f"Hyper Parameters - {model_name}", local_path) + mlflow.log_params(hyper_parameters_dict) + + @dispatch() + def model_save(self) -> None: + """Persist the model for future use after training the model with Scikit-learn framework.""" + print("-----* Model Persistence *-----") + GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH") + save_model(self.model, self.naming, self.X_train.iloc[[0]], GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH) + + @dispatch(bool) + def model_save(self, is_automl: bool) -> None: + """Persist the model for future use after training the model with FLAML framework.""" + print("-----* Model Persistence *-----") + GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH") + save_model(self.auto_model, self.naming, self.X_train.iloc[[0]], GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH) + + @staticmethod + def _plot_permutation_importance( + X_test: pd.DataFrame, + y_test: pd.DataFrame, + name_column: str, + trained_model: object, + image_config: dict, + algorithm_name: str, + grah_name: str, + local_path: str, + mlflow_path: str, + ) -> None: + """Permutation importance plot.""" + print(f"-----* {grah_name} *-----") # Permutation Importance + importances_mean, importances_std, importances = plot_permutation_importance(X_test, y_test, trained_model, image_config) + save_fig(f"{grah_name} - {algorithm_name}", local_path, mlflow_path) + save_data(X_test, name_column, f"{grah_name} - X Test", local_path, mlflow_path) + save_data(y_test, name_column, f"{grah_name} - Y Test", local_path, mlflow_path) + data_dict = {"importances_mean": importances_mean.tolist(), "importances_std": importances_std.tolist(), "importances": importances.tolist()} + data_str = json.dumps(data_dict, indent=4) + save_text(data_str, f"{grah_name} - {algorithm_name}", local_path, mlflow_path) + + +class TreeWorkflowMixin: + """Mixin class for tree models.""" + + @staticmethod + def _plot_feature_importance(X_train: pd.DataFrame, name_column: str, trained_model: object, image_config: dict, algorithm_name: str, func_name: str, local_path: str, mlflow_path: str) -> None: + """Draw the feature importance bar diagram.""" + print(f"-----* {func_name} *-----") # Feature Importance Diagram + columns_name = X_train.columns + feature_importances = trained_model.feature_importances_ + data = plot_feature_importance(columns_name, feature_importances, image_config) + save_fig(f"{func_name} - {algorithm_name}", local_path, mlflow_path) + save_data(data, name_column, f"{func_name} - {algorithm_name}", local_path, mlflow_path, 
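The paired model_save definitions above rely on multipledispatch choosing an overload from the runtime argument types: a bare call resolves to the @dispatch() signature (Scikit-learn path) and a boolean argument resolves to @dispatch(bool) (FLAML path). A standalone sketch of the same mechanism, with a hypothetical report function:

from multipledispatch import dispatch


@dispatch()
def report() -> str:
    # Zero-argument signature, analogous to model_save(self).
    return "scikit-learn path"


@dispatch(bool)
def report(is_automl: bool) -> str:
    # One-boolean signature, analogous to model_save(self, is_automl).
    return "FLAML path"


print(report())      # scikit-learn path
print(report(True))  # FLAML path
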
+    @staticmethod
+    def _plot_permutation_importance(
+        X_test: pd.DataFrame,
+        y_test: pd.DataFrame,
+        name_column: str,
+        trained_model: object,
+        image_config: dict,
+        algorithm_name: str,
+        grah_name: str,
+        local_path: str,
+        mlflow_path: str,
+    ) -> None:
+        """Permutation importance plot."""
+        print(f"-----* {grah_name} *-----")  # Permutation Importance
+        importances_mean, importances_std, importances = plot_permutation_importance(X_test, y_test, trained_model, image_config)
+        save_fig(f"{grah_name} - {algorithm_name}", local_path, mlflow_path)
+        save_data(X_test, name_column, f"{grah_name} - X Test", local_path, mlflow_path)
+        save_data(y_test, name_column, f"{grah_name} - Y Test", local_path, mlflow_path)
+        data_dict = {"importances_mean": importances_mean.tolist(), "importances_std": importances_std.tolist(), "importances": importances.tolist()}
+        data_str = json.dumps(data_dict, indent=4)
+        save_text(data_str, f"{grah_name} - {algorithm_name}", local_path, mlflow_path)
+
+
+class TreeWorkflowMixin:
+    """Mixin class for tree models."""
+
+    @staticmethod
+    def _plot_feature_importance(X_train: pd.DataFrame, name_column: str, trained_model: object, image_config: dict, algorithm_name: str, func_name: str, local_path: str, mlflow_path: str) -> None:
+        """Draw the feature importance bar diagram."""
+        print(f"-----* {func_name} *-----")  # Feature Importance Diagram
+        columns_name = X_train.columns
+        feature_importances = trained_model.feature_importances_
+        data = plot_feature_importance(columns_name, feature_importances, image_config)
+        save_fig(f"{func_name} - {algorithm_name}", local_path, mlflow_path)
+        save_data(data, name_column, f"{func_name} - {algorithm_name}", local_path, mlflow_path, True)
+
+    @staticmethod
+    def _plot_tree(trained_model: object, image_config: dict, algorithm_name: str, func_name: str, local_path: str, mlflow_path: str) -> None:
+        """Drawing decision tree diagrams."""
+        print(f"-----* {func_name} *-----")  # Single Tree Diagram
+        plot_decision_tree(trained_model, image_config)
+        save_fig(f"{func_name} - {algorithm_name}", local_path, mlflow_path)
+
+
+class LinearWorkflowMixin:
+    """Mixin class for linear models."""
+
+    @staticmethod
+    def _show_formula(
+        coef: np.ndarray, intercept: np.ndarray, features_name: np.ndarray, algorithm_name: str, regression_classification: str, y_train: pd.DataFrame, local_path: str, mlflow_path: str
+    ) -> None:
+        """Show the formula."""
+        print(f"-----* {algorithm_name} Formula *-----")
+        formula = show_formula(coef, intercept, features_name, regression_classification, y_train)
+        formula_str = json.dumps(formula, indent=4)
+        save_text(formula_str, f"{algorithm_name} Formula", local_path, mlflow_path)
+
+    @staticmethod
+    def _plot_2d_scatter_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, data_name: str, algorithm_name: str, func_name: str, local_path: str, mlflow_path: str) -> None:
+        """Plot the 2D graph of the linear regression model."""
+        print(f"-----* {func_name} *-----")  # 2D Scatter Diagram
+        plot_2d_scatter_diagram(feature_data, target_data)
+        save_fig(f"{func_name} - {algorithm_name}", local_path, mlflow_path)
+        data = pd.concat([feature_data, target_data], axis=1)
+        save_data(data, data_name, f"{func_name} - {algorithm_name}", local_path, mlflow_path)
+
+    @staticmethod
+    def _plot_2d_line_diagram(
+        feature_data: pd.DataFrame, target_data: pd.DataFrame, y_test_predict: pd.DataFrame, data_name: str, algorithm_name: str, func_name: str, local_path: str, mlflow_path: str
+    ) -> None:
+        """Plot the 2D graph of the linear regression model."""
+        print(f"-----* {func_name} *-----")  # 2D Line Diagram
+        plot_2d_line_diagram(feature_data, target_data, y_test_predict)
+        save_fig(f"{func_name} - {algorithm_name}", local_path, mlflow_path)
+        data = pd.concat([feature_data, target_data, y_test_predict], axis=1)
+        save_data(data, data_name, f"{func_name} - {algorithm_name}", local_path, mlflow_path)
+
+    @staticmethod
+    def _plot_3d_scatter_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, data_name: str, algorithm_name: str, func_name: str, local_path: str, mlflow_path: str) -> None:
+        """Plot the 3D graph of the linear regression model."""
+        print(f"-----* {func_name} *-----")  # 3D Scatter Diagram
+        plot_3d_scatter_diagram(feature_data, target_data)
+        save_fig(f"{func_name} - {algorithm_name}", local_path, mlflow_path)
+        data = pd.concat([feature_data, target_data], axis=1)
+        save_data(data, data_name, f"{func_name} - {algorithm_name}", local_path, mlflow_path)
+
+    @staticmethod
+    def _plot_3d_surface_diagram(
+        feature_data: pd.DataFrame, target_data: pd.DataFrame, y_test_predict: pd.DataFrame, data_name: str, algorithm_name: str, func_name: str, local_path: str, mlflow_path: str
+    ) -> None:
+        """Plot the 3D graph of the linear regression model."""
+        print(f"-----* {func_name} *-----")  # 3D Surface Diagram
+        plot_3d_surface_diagram(feature_data, target_data, y_test_predict)
+        save_fig(f"{func_name} - {algorithm_name}", local_path, mlflow_path)
+        data = pd.concat([feature_data, target_data, y_test_predict], axis=1)
+        save_data(data, data_name, f"{func_name} - {algorithm_name}", local_path, mlflow_path)
+
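The mixins above contribute only plotting helpers; shared state and persistence stay in WorkflowBase. A concrete workflow combines one mixin with a workflow base, exactly as the regression classes later in this patch do (e.g. PolynomialRegression(LinearWorkflowMixin, RegressionWorkflowBase)). A sketch of the composition with a hypothetical class, assuming both names are importable:

from geochemistrypi.data_mining.model._base import TreeWorkflowMixin, WorkflowBase


class DemoTreeWorkflow(TreeWorkflowMixin, WorkflowBase):
    """Hypothetical composition: inherits _plot_feature_importance and _plot_tree
    from the mixin, plus the shared data attributes and common helpers from WorkflowBase."""

    name = "Demo Tree"


# The mixin precedes the base in the MRO, so its helpers resolve first:
print([c.__name__ for c in DemoTreeWorkflow.__mro__])
# ['DemoTreeWorkflow', 'TreeWorkflowMixin', 'WorkflowBase', 'object']
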
+
+class ClusteringMetricsMixin:
+    """Mixin class for clustering metrics."""
+
+    @staticmethod
+    def _get_num_clusters(labels: pd.Series, func_name: str, algorithm_name: str, store_path: str) -> None:
+        """Get and log the number of clusters. It is only used in those algorithms which don't allow to set the number of cluster in advance."""
+        print(f"-----* {func_name} *-----")
+        num_clusters = len(np.unique(labels.to_numpy()))
+        print(f"{func_name}: {num_clusters}")
+        num_clusters_dict = {f"{func_name}": num_clusters}
+        mlflow.log_metrics(num_clusters_dict)
+        num_clusters_str = json.dumps(num_clusters_dict, indent=4)
+        save_text(num_clusters_str, f"{func_name} - {algorithm_name}", store_path)
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_enum.py b/geochemistrypi/data_mining/model/func/algo_regression/_enum.py
index e26ccea..b62b6dc 100644
--- a/geochemistrypi/data_mining/model/func/algo_regression/_enum.py
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_enum.py
@@ -2,11 +2,43 @@ class RegressionCommonFunction(Enum):
-    PREDICTED_VS_ACTUAL_DIAGRAM = "Predicted vs. Actual Diagram"
-    RESIDUALS_DIAGRAM = "Residuals Diagram"
     MODEL_SCORE = "Model Score"
     CROSS_VALIDATION = "Cross Validation"
+    MODEL_PREDICTION = "Model Prediction"
+    MODEL_PERSISTENCE = "Model Persistence"
+    PREDICTED_VS_ACTUAL_DIAGRAM = "Predicted vs. Actual Diagram"
+    RESIDUALS_DIAGRAM = "Residuals Diagram"
+    PERMUTATION_IMPORTANC_DIAGRAM = "Permutation Importance Diagram"
+
+
+class RegressionSpecialFunction(Enum):
+    FEATURE_IMPORTANCE_DIAGRAM = "Feature Importance Diagram"
+    SINGLE_TREE_DIAGRAM = "Single Tree Diagram"
+    TWO_DIMENSIONAL_SCATTER_DIAGRAM = "2D Scatter Diagram"
+    THREE_DIMENSIONAL_SCATTER_DIAGRAM = "3D Scatter Diagram"
+    TWO_DIMENSIONAL_LINE_DIAGRAM = "2D Line Diagram"
+    THREE_DIMENSIONAL_SURFACE_DIAGRAM = "3D Surface Diagram"
 
 
 class MLPSpecialFunction(Enum):
     LOSS_CURVE_DIAGRAM = "Loss Curve Diagram"
+
+
+class ClassicalLinearSpecialFunction(Enum):
+    LINEAR_REGRESSION_FORMULA = "Linear Regression Formula"
+
+
+class LassoSpecialFunction(Enum):
+    LASSO_REGRESSION_FORMULA = "Lasso Regression Formula"
+
+
+class ElasticNetSpecialFunction(Enum):
+    ELASTIC_NET_FORMULA = "Elastic Net Formula"
+
+
+class SGDSpecialFunction(Enum):
+    SGD_REGRESSION_FORMULA = "SGD Regression Formula"
+
+
+class RidgeSpecialFunction(Enum):
+    RIDGE_REGRESSION_FORMULA = "Ridge Regression Formula"
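The _enum.py additions centralize the display names that the refactored helpers above now receive through their grah_name/func_name parameters, so one enum member drives the console banner, the figure file name, and the saved-text name alike. A self-contained sketch using two of the values defined above:

from enum import Enum


class RegressionCommonFunction(Enum):
    MODEL_SCORE = "Model Score"
    PERMUTATION_IMPORTANC_DIAGRAM = "Permutation Importance Diagram"


grah_name = RegressionCommonFunction.PERMUTATION_IMPORTANC_DIAGRAM.value
print(f"-----* {grah_name} *-----")  # the banner printed by the helper
print(f"{grah_name} - XGBoost")      # the artifact name passed to save_fig/save_text
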
diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py
index 2c47784..e9c8433 100644
--- a/geochemistrypi/data_mining/model/regression.py
+++ b/geochemistrypi/data_mining/model/regression.py
@@ -1,4415 +1,4570 @@
-# -*- coding: utf-8 -*-
-import json
-import os
-from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
-
-import mlflow
-import numpy as np
-import pandas as pd
-import xgboost
-from flaml import AutoML
-from multipledispatch import dispatch
-from rich import print
-from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
-from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso, LinearRegression, Ridge, SGDRegressor
-from sklearn.neighbors import KNeighborsRegressor
-from sklearn.neural_network import MLPRegressor
-from sklearn.preprocessing import PolynomialFeatures
-from sklearn.svm import SVR
-from sklearn.tree import DecisionTreeRegressor
-
-from ..constants import MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, RAY_FLAML
-from ..utils.base import clear_output, save_data, save_fig, save_text
-from ._base import LinearWorkflowMixin, TreeWorkflowMixin, WorkflowBase
-from .func.algo_regression._bayesianridge_regression import bayesian_ridge_manual_hyper_parameters
-from .func.algo_regression._common import cross_validation, plot_predicted_vs_actual, plot_residuals, score
-from .func.algo_regression._decision_tree import decision_tree_manual_hyper_parameters
-from .func.algo_regression._elastic_net import elastic_net_manual_hyper_parameters
-from .func.algo_regression._enum import MLPSpecialFunction, RegressionCommonFunction
-from .func.algo_regression._extra_tree import extra_trees_manual_hyper_parameters
-from .func.algo_regression._gradient_boosting import gradient_boosting_manual_hyper_parameters
-from .func.algo_regression._knn import knn_manual_hyper_parameters
-from .func.algo_regression._lasso_regression import lasso_regression_manual_hyper_parameters
-from .func.algo_regression._linear_regression import linear_regression_manual_hyper_parameters
-from .func.algo_regression._multi_layer_perceptron import multi_layer_perceptron_manual_hyper_parameters
-from .func.algo_regression._polynomial_regression import polynomial_regression_manual_hyper_parameters
-from .func.algo_regression._rf import random_forest_manual_hyper_parameters
-from .func.algo_regression._ridge_regression import ridge_regression_manual_hyper_parameters
-from .func.algo_regression._sgd_regression import sgd_regression_manual_hyper_parameters
-from .func.algo_regression._svr import svr_manual_hyper_parameters
-from .func.algo_regression._xgboost import xgboost_manual_hyper_parameters
-
-
-class RegressionWorkflowBase(WorkflowBase):
-    """The base workflow class of regression algorithms."""
-
-    common_function = ["Model Score", "Cross Validation", "Model Prediction", "Model Persistence", "Predicted vs. Actual Diagram", "Residuals Diagram", "Permutation Importance Diagram"]
-
-    def __init__(self) -> None:
-        super().__init__()
-        # These two attributes are used for the customized models of FLAML framework
-        self.customized = False
-        self.customized_name = None
-        self.mode = "Regression"
-
-    @dispatch(object, object)
-    def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
-        """Fit the model by Scikit-learn framework."""
-        self.model.fit(X, y)
-
-    @dispatch(object, object, bool)
-    def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None, is_automl: bool = False) -> None:
-        """Fit the model by FLAML framework and RAY framework."""
-        # print(f"-*-*- {self.naming} - AutoML -*-*-.")
-        if self.naming not in RAY_FLAML:
-            self.automl = AutoML()
-            if self.customized:  # When the model is not built-in in FLAML framwork, use FLAML customization.
-                self.automl.add_learner(learner_name=self.customized_name, learner_class=self.customization)
-            if y.shape[1] == 1:  # FLAML's data format validation mechanism
-                y = y.squeeze()  # Convert a single dataFrame column into a series
-            self.automl.fit(X_train=X, y_train=y, **self.settings)
-        else:
-            # When the model is not built-in in FLAML framework, use RAY + FLAML customization.
-            self.ray_tune(
-                RegressionWorkflowBase.X_train,
-                RegressionWorkflowBase.X_test,
-                RegressionWorkflowBase.y_train,
-                RegressionWorkflowBase.y_test,
-            )
-            self.ray_best_model.fit(X, y)
-
-    @dispatch(object)
-    def predict(self, X: pd.DataFrame) -> np.ndarray:
-        """Perform classification on samples in X by Scikit-learn framework."""
-        y_predict = self.model.predict(X)
-        return y_predict
-
-    @dispatch(object, bool)
-    def predict(self, X: pd.DataFrame, is_automl: bool = False) -> np.ndarray:
-        """Perform classification on samples in X by FLAML framework and RAY framework."""
-        if self.naming not in RAY_FLAML:
-            y_predict = self.automl.predict(X)
-            return y_predict
-        else:
-            y_predict = self.ray_best_model.predict(X)
-            return y_predict
-
-    @property
-    def auto_model(self) -> object:
-        """Get AutoML trained model by FLAML framework and RAY framework."""
-        if self.naming not in RAY_FLAML:
-            return self.automl.model.estimator
-        else:
-            return self.ray_best_model
-
-    @property
-    def settings(self) -> Dict:
-        """The configuration to implement AutoML by FLAML framework."""
-        return dict()
-
-    @property
-    def customization(self) -> object:
-        """The customized model of FLAML framework."""
-        return object
-
-    def ray_tune(self, X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame) -> object:
-        """The customized model of FLAML framework and RAY framework."""
-        return object
-
-    @classmethod
-    def manual_hyper_parameters(cls) -> Dict:
-        """Manual hyper-parameters specification."""
-        return dict()
-
-    @staticmethod
-    def _plot_predicted_vs_actual(y_test_predict: pd.DataFrame, y_test: pd.DataFrame, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str, grah_name: str) -> None:
-        """Plot the predicted vs. actual diagram."""
-        print(f"-----* {grah_name} *-----")
-        plot_predicted_vs_actual(y_test_predict, y_test, algorithm_name)
-        save_fig(f"{grah_name} - {algorithm_name}", local_path, mlflow_path)
-        data = pd.concat([y_test, y_test_predict], axis=1)
-        save_data(data, name_column, f"{grah_name} - {algorithm_name}", local_path, mlflow_path)
-
-    @staticmethod
-    def _plot_residuals(y_test_predict: pd.DataFrame, y_test: pd.DataFrame, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str, grah_name: str) -> None:
-        """Plot the residuals diagram."""
-        print(f"-----* {grah_name} *-----")
-        residuals = plot_residuals(y_test_predict, y_test, algorithm_name)
-        save_fig(f"{grah_name} - {algorithm_name}", local_path, mlflow_path)
-        data = pd.concat([y_test, residuals], axis=1)
-        save_data(data, name_column, f"{grah_name} - {algorithm_name}", local_path, mlflow_path)
-
-    @staticmethod
-    def _score(y_true: pd.DataFrame, y_predict: pd.DataFrame, algorithm_name: str, store_path: str, grah_name: str) -> None:
-        """Calculate the score of the model."""
-        print(f"-----* {grah_name} *-----")
-        scores = score(y_true, y_predict)
-        scores_str = json.dumps(scores, indent=4)
-        save_text(scores_str, f"{grah_name} - {algorithm_name}", store_path)
-        mlflow.log_metrics(scores)
-
-    @staticmethod
-    def _cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, cv_num: int, algorithm_name: str, store_path: str, grah_name: str) -> None:
-        """Cross validation."""
-        print(f"-----* {grah_name} *-----")
-        print(f"K-Folds: {cv_num}")
-        scores = cross_validation(trained_model, X_train, y_train, cv_num=cv_num)
-        scores_str = json.dumps(scores, indent=4)
-        save_text(scores_str, f"{grah_name} - {algorithm_name}", store_path)
-
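_score above follows a persist-twice pattern: the metrics dict is serialized with json.dumps for the local text artifact and mirrored to the tracking server via mlflow.log_metrics. A standalone sketch with toy values standing in for score(y_true, y_predict):

import json

import mlflow

scores = {"R2": 0.87, "RMSE": 1.23}        # toy values, not real results
scores_str = json.dumps(scores, indent=4)  # the string handed to save_text
with mlflow.start_run():                   # the workflow itself runs inside an already-active run
    mlflow.log_metrics(scores)
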
-    @dispatch()
-    def common_components(self) -> None:
-        """Invoke all common application functions for regression algorithms by Scikit-learn framework."""
-        GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH")
-        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
-        self._score(
-            y_true=RegressionWorkflowBase.y_test,
-            y_predict=RegressionWorkflowBase.y_test_predict,
-            algorithm_name=self.naming,
-            store_path=GEOPI_OUTPUT_METRICS_PATH,
-            grah_name=RegressionCommonFunction.MODEL_SCORE.value,
-        )
-        self._cross_validation(
-            trained_model=self.model,
-            X_train=RegressionWorkflowBase.X_train,
-            y_train=RegressionWorkflowBase.y_train,
-            cv_num=10,
-            algorithm_name=self.naming,
-            store_path=GEOPI_OUTPUT_METRICS_PATH,
-            grah_name=RegressionCommonFunction.CROSS_VALIDATION.value,
-        )
-        self._plot_predicted_vs_actual(
-            y_test_predict=RegressionWorkflowBase.y_test_predict,
-            y_test=RegressionWorkflowBase.y_test,
-            name_column=RegressionWorkflowBase.name_test,
-            algorithm_name=self.naming,
-            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
-            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
-            grah_name=RegressionCommonFunction.PREDICTED_VS_ACTUAL_DIAGRAM.value,
-        )
-        self._plot_residuals(
-            y_test_predict=RegressionWorkflowBase.y_test_predict,
-            y_test=RegressionWorkflowBase.y_test,
-            name_column=RegressionWorkflowBase.name_test,
-            algorithm_name=self.naming,
-            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
-            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
-            grah_name=RegressionCommonFunction.RESIDUALS_DIAGRAM.value,
-        )
-        self._plot_permutation_importance(
-            X_test=RegressionWorkflowBase.X_test,
-            y_test=RegressionWorkflowBase.y_test,
-            name_column=RegressionWorkflowBase.name_test,
-            trained_model=self.model,
-            image_config=self.image_config,
-            algorithm_name=self.naming,
-            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
-            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
-        )
-
-    @dispatch(bool)
-    def common_components(self, is_automl: bool = False) -> None:
-        """Invoke all common application functions for regression algorithms by FLAML framework."""
-        GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH")
-        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
-        self._score(
-            y_true=RegressionWorkflowBase.y_test,
-            y_predict=RegressionWorkflowBase.y_test_predict,
-            algorithm_name=self.naming,
-            store_path=GEOPI_OUTPUT_METRICS_PATH,
-            grah_name=RegressionCommonFunction.MODEL_SCORE.value,
-        )
-        self._cross_validation(
-            trained_model=self.auto_model,
-            X_train=RegressionWorkflowBase.X_train,
-            y_train=RegressionWorkflowBase.y_train,
-            cv_num=10,
-            algorithm_name=self.naming,
-            store_path=GEOPI_OUTPUT_METRICS_PATH,
-            grah_name=RegressionCommonFunction.CROSS_VALIDATION.value,
-        )
-        self._plot_predicted_vs_actual(
-            y_test_predict=RegressionWorkflowBase.y_test_predict,
-            y_test=RegressionWorkflowBase.y_test,
-            name_column=RegressionWorkflowBase.name_test,
-            algorithm_name=self.naming,
-            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
-            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
-            grah_name=RegressionCommonFunction.PREDICTED_VS_ACTUAL_DIAGRAM.value,
-        )
-        self._plot_residuals(
-            y_test_predict=RegressionWorkflowBase.y_test_predict,
-            y_test=RegressionWorkflowBase.y_test,
-            name_column=RegressionWorkflowBase.name_test,
-            algorithm_name=self.naming,
-            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
-            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
-            grah_name=RegressionCommonFunction.RESIDUALS_DIAGRAM.value,
-        )
-        self._plot_permutation_importance(
-            X_test=RegressionWorkflowBase.X_test,
-            y_test=RegressionWorkflowBase.y_test,
-            name_column=RegressionWorkflowBase.name_test,
-            trained_model=self.auto_model,
-            image_config=self.image_config,
-            algorithm_name=self.naming,
-            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
-            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
-        )
-
-
-class PolynomialRegression(LinearWorkflowMixin, RegressionWorkflowBase):
-    """The automation workflow of using Polynomial Regression algorithm to make insightful products."""
-
-    name = "Polynomial Regression"
-    special_function = ["Polynomial Regression Formula"]
-
-    def __init__(
-        self,
-        degree: int = 2,
-        interaction_only: bool = False,
-        include_bias: bool = False,
-        order: str = "C",
-        fit_intercept: bool = True,
-        normalize: bool = False,
-        copy_X: bool = True,
-        n_jobs: Optional[int] = None,
-    ) -> None:
-
-        super().__init__()
-        self.degree = degree
-        self.include_bias = include_bias
-        self.interaction_only = interaction_only
-        self.order = order
-        self.fit_intercept = fit_intercept
-        self.normalize = normalize
-        self.copy_X = copy_X
-        self.n_jobs = n_jobs
-        self.model = LinearRegression(fit_intercept=self.fit_intercept, copy_X=self.copy_X, n_jobs=self.n_jobs)
-
-        self._features_name = None
-        self.naming = PolynomialRegression.name
-
-    def poly(self, X_train: pd.DataFrame, X_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
-        """Polynomial features."""
-        poly_features = PolynomialFeatures(degree=self.degree, include_bias=self.include_bias, interaction_only=self.interaction_only, order=self.order)
-        X_train_poly = poly_features.fit_transform(X_train)
-        X_test_poly = poly_features.fit_transform(X_test)
-        poly_config = {type(poly_features).__name__: poly_features.get_params()}
-        try:
-            # scikit-learn >= 1.0
-            self._features_name = poly_features.get_feature_names_out()
-        except AttributeError:
-            self._features_name = poly_features.get_feature_names()
-        X_train_poly = pd.DataFrame(X_train_poly, columns=self._features_name)
-        X_test_poly = pd.DataFrame(X_test_poly, columns=self._features_name)
-        return poly_config, X_train_poly, X_test_poly
-
-    @classmethod
-    def manual_hyper_parameters(cls) -> Dict:
-        """Manual hyper-parameters specification."""
-        print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
-        hyper_parameters = polynomial_regression_manual_hyper_parameters()
-        clear_output()
-        return hyper_parameters
-
-    def special_components(self, **kwargs) -> None:
-        """Invoke all special application functions for this algorithms by Scikit-learn framework."""
-        GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
-        self._show_formula(
-            coef=self.model.coef_,
-            intercept=self.model.intercept_,
-            features_name=self._features_name,
-            regression_classification="Regression",
-            y_train=PolynomialRegression.y,
-            algorithm_name=self.naming,
-            local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
-            mlflow_path="root",
-        )
-
-
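PolynomialRegression.poly above calls fit_transform on both splits; the conventional Scikit-learn pattern is to fit the expander once on the training split and reuse it with transform on the test split, so both share a single fitted feature mapping (for PolynomialFeatures the expansion is deterministic given identical columns, so the outputs coincide here). A sketch with toy data:

import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

X_train = pd.DataFrame({"SiO2": [50.1, 60.2], "MgO": [7.3, 3.8]})  # toy geochemical features
X_test = pd.DataFrame({"SiO2": [55.0], "MgO": [5.1]})

poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = pd.DataFrame(poly.fit_transform(X_train), columns=poly.get_feature_names_out())
X_test_poly = pd.DataFrame(poly.transform(X_test), columns=poly.get_feature_names_out())
print(X_train_poly.columns.tolist())
# ['SiO2', 'MgO', 'SiO2^2', 'SiO2 MgO', 'MgO^2']
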
-class XGBoostRegression(TreeWorkflowMixin, RegressionWorkflowBase):
-    """The automation workflow of using XGBoost algorithm to make insightful products."""
-
-    name = "XGBoost"
-    special_function = ["Feature Importance Diagram"]
-
-    # In fact, it's used for type hint in the original xgboost package.
-    # Hence, we have to copy it here again. Just ignore it
-    _SklObjective = Optional[Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]]
-
-    def __init__(
-        self,
-        max_depth: Optional[int] = 6,
-        learning_rate: Optional[float] = 0.3,
-        n_estimators: int = 100,
-        verbosity: Optional[int] = 1,
-        objective: _SklObjective = None,
-        booster: Optional[str] = None,
-        tree_method: Optional[str] = "auto",
-        n_jobs: Optional[int] = None,
-        gamma: Optional[float] = 0,
-        min_child_weight: Optional[float] = None,
-        max_delta_step: Optional[float] = 0,
-        subsample: Optional[float] = 1,
-        colsample_bytree: Optional[float] = 1,
-        colsample_bylevel: Optional[float] = 1,
-        colsample_bynode: Optional[float] = 1,
-        reg_alpha: Optional[float] = 0,
-        reg_lambda: Optional[float] = 1,
-        scale_pos_weight: Optional[float] = 1,
-        base_score: Optional[float] = None,
-        random_state: Optional[Union[np.random.RandomState, int]] = None,
-        missing: float = np.nan,
-        num_parallel_tree: Optional[int] = 1,
-        monotone_constraints: Optional[Union[Dict[str, int], str]] = None,
-        interaction_constraints: Optional[Union[str, Sequence[Sequence[str]]]] = None,
-        importance_type: Optional[str] = "gain",
-        gpu_id: Optional[int] = None,
-        validate_parameters: Optional[bool] = None,
-        predictor: Optional[str] = None,
-        # enable_categorical: bool = False,
-        eval_metric: Optional[Union[str, List[str], Callable]] = None,
-        early_stopping_rounds: Optional[int] = None,
-        **kwargs: Any,
-    ) -> None:
-        """
-        Parameters
-        ----------
-        max_depth [default=6]
-            Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit.
-            0 indicates no limit on depth. Beware that XGBoost aggressively consumes memory when training a deep tree.
-            exact tree method requires non-zero value.
-            range: [0,∞]
-
-        learning_rate [default=0.3]
-            Step size shrinkage used in update to prevents overfitting.
-            After each boosting step, we can directly get the weights of new features,
-            and eta shrinks the feature weights to make the boosting process more conservative.
-            range: [0,1]
-
-        n_estimators : int
-            Number of gradient boosted trees. Equivalent to number of boosting rounds.
-
-        objective : {SklObjective}
-            Specify the learning task and the corresponding learning objective or
-            a custom objective function to be used (see note below).
-
-        verbosity [default=1]
-            Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3 (debug).
-            Sometimes XGBoost tries to change configurations based on heuristics,
-            which is displayed as warning message.
-            If there’s unexpected behaviour, please try to increase value of verbosity.
-
-        booster [default= gbtree ]
-            Which booster to use. Can be gbtree, gblinear or dart;
-            gbtree and dart use tree based models while gblinear uses linear functions.
-
-        tree_method string [default= auto]
-            The tree construction algorithm used in XGBoost. See description in the reference paper and Tree Methods.
-            XGBoost supports approx, hist and gpu_hist for distributed training. Experimental support for external memory is available for approx and gpu_hist.
-            Choices: auto, exact, approx, hist, gpu_hist, this is a combination of commonly used updaters. For other updaters like refresh, set the parameter updater directly.
-            auto: Use heuristic to choose the fastest method.
-            For small dataset, exact greedy (exact) will be used.
-            For larger dataset, approximate algorithm (approx) will be chosen. It’s recommended to try hist and gpu_hist for higher performance with large dataset.
-            (gpu_hist)has support for external memory.
-            Because old behavior is always use exact greedy in single machine, user will get a message when approximate algorithm is chosen to notify this choice.
-            exact: Exact greedy algorithm. Enumerates all split candidates.
-            approx: Approximate greedy algorithm using quantile sketch and gradient histogram.
-            hist: Faster histogram optimized approximate greedy algorithm.
-            gpu_hist: GPU implementation of hist algorithm.
-
-        n_jobs : Optional[int]
-            Number of parallel threads used to run xgboost. When used with other
-            Scikit-Learn algorithms like grid search, you may choose which algorithm to
-            parallelize and balance the threads. Creating thread contention will
-            significantly slow down both algorithms.
-
-        gamma [default=0, alias: min_split_loss]
-            Minimum loss reduction required to make a further partition on a leaf node of the tree.
-            The larger gamma is, the more conservative the algorithm will be.
-            range: [0,∞]
-
-        min_child_weight [default=1]
-            Minimum sum of instance weight (hessian) needed in a child.
-            If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight,
-            then the building process will give up further partitioning. In linear regression task,
-            this simply corresponds to minimum number of instances needed to be in each node.
-            The larger min_child_weight is, the more conservative the algorithm will be.
-            range: [0,∞]
-
-        max_delta_step [default=0]
-            Maximum delta step we allow each leaf output to be.
-            If the value is set to 0, it means there is no constraint.
-            If it is set to a positive value, it can help making the update step more conservative.
-            Usually this parameter is not needed, but it might help in logistic regression
-            when class is extremely imbalanced. Set it to value of 1-10 might help control the update.
-            range: [0,∞]
-
-        subsample [default=1]
-            Subsample ratio of the training instances.
-            Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees.
-            and this will prevent overfitting.
-            Subsampling will occur once in every boosting iteration.
-            range: (0,1]
-
-        colsample_bytree [default=1]
-            colsample_bytree is the subsample ratio of columns when constructing each tree.
-            Subsampling occurs once for every tree constructed.
-
-        colsample_bylevel [default=1]
-            colsample_bylevel is the subsample ratio of columns for each level.
-            Subsampling occurs once for every new depth level reached in a tree.
-            Columns are subsampled from the set of columns chosen for the current tree.
-
-        colsample_bynode [default=1]
-            colsample_bynode is the subsample ratio of columns for each node (split).
-            Subsampling occurs once every time a new split is evaluated.
-            Columns are subsampled from the set of columns chosen for the current level.
-
-        reg_alpha [default=0]
-            L1 regularization term on weights.
-            Increasing this value will make model more conservative.
-
-        reg_lambda [default=1, alias: reg_lambda]
-            L2 regularization term on weights.
-            Increasing this value will make model more conservative.
-
-        scale_pos_weight [default=1]
-            Control the balance of positive and negative weights, useful for unbalanced classes.
-
-        predictor, [default= auto]
-            The type of predictor algorithm to use.
-            Provides the same results but allows the use of GPU or CPU.
-            auto: Configure predictor based on heuristics.
-            cpu_predictor: Multicore CPU prediction algorithm.
-            gpu_predictor: Prediction using GPU. Used when tree_method is gpu_hist.
-            When predictor is set to default value auto, the gpu_hist tree method is able to provide GPU based prediction
-            without copying training data to GPU memory. If gpu_predictor is explicitly specified,
-            then all data is copied into GPU, only recommended for performing prediction tasks.
-
-        base_score : Optional[float]
-            The initial prediction score of all instances, global bias.
-
-        random_state : Optional[Union[numpy.random.RandomState, int]]
-            Random number seed.
-            .. note::
-
-               Using gblinear booster with shotgun updater is nondeterministic as
-               it uses Hogwild algorithm.
-
-        missing : float, default np.nan
-            Value in the data which needs to be present as a missing value.
-
-        num_parallel_tree: Optional[int]
-            Used for boosting random forest.
-
-        monotone_constraints : Optional[Union[Dict[str, int], str]]
-            Constraint of variable monotonicity. See :doc:`tutorial `
-            for more information.
-
-        interaction_constraints : Optional[Union[str, List[Tuple[str]]]]
-            Constraints for interaction representing permitted interactions. The
-            constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2,
-            3, 4]]``, where each inner list is a group of indices of features that are
-            allowed to interact with each other. See :doc:`tutorial
-            ` for more information
-
-        importance_type: Optional[str]
-            The feature importance type for the feature_importances\\_ property:
-
-            * For tree model, it's either "gain", "weight", "cover", "total_gain" or
-              "total_cover".
-            * For linear model, only "weight" is defined and it's the normalized coefficients
-              without bias.
-
-        gpu_id : Optional[int]
-            Device ordinal.
-
-        validate_parameters : Optional[bool]
-            Give warnings for unknown parameter.
-
-        eval_metric : Optional[Union[str, List[str], Callable]]
-
-        early_stopping_rounds : Optional[int]
-
-        References
-        ----------
-        [1] XGBoost Python API Reference - Scikit-Learn API
-            https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
-
-        [2] XGBoost API for the scikit-learn wrapper:
-            https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py
-        """
-
-        super().__init__()
-        self.n_estimators = n_estimators
-        self.objective = objective
-        self.max_depth = max_depth
-        self.learning_rate = learning_rate
-        self.verbosity = verbosity
-        self.booster = booster
-        self.tree_method = tree_method
-        self.gamma = gamma
-        self.min_child_weight = min_child_weight
-        self.max_delta_step = max_delta_step
-        self.subsample = subsample
-        self.colsample_bytree = colsample_bytree
-        self.colsample_bylevel = colsample_bylevel
-        self.colsample_bynode = colsample_bynode
-        self.reg_alpha = reg_alpha
-        self.reg_lambda = reg_lambda
-        self.scale_pos_weight = scale_pos_weight
-        self.base_score = base_score
-        self.missing = missing
-        self.num_parallel_tree = num_parallel_tree
-        self.n_jobs = n_jobs
-        self.monotone_constraints = monotone_constraints
-        self.interaction_constraints = interaction_constraints
-        self.importance_type = importance_type
-        self.gpu_id = gpu_id
-        self.validate_parameters = validate_parameters
-        self.predictor = predictor
-        # self.enable_categorical = enable_categorical
-        self.eval_metric = eval_metric
-        self.early_stopping_rounds = early_stopping_rounds
-        if kwargs:
-            self.kwargs = kwargs
-
-        if random_state:
-            self.random_state = random_state
-
-        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
-        self.model = xgboost.XGBRegressor(
-            n_estimators=self.n_estimators,
-            objective=self.objective,
-            max_depth=self.max_depth,
-            learning_rate=self.learning_rate,
-            verbosity=self.verbosity,
-            booster=self.booster,
-            tree_method=self.tree_method,
-            gamma=self.gamma,
-            min_child_weight=self.min_child_weight,
-            max_delta_step=self.max_delta_step,
-            subsample=self.subsample,
-            colsample_bytree=self.colsample_bytree,
-            colsample_bylevel=self.colsample_bylevel,
-            colsample_bynode=self.colsample_bynode,
-            reg_alpha=self.reg_alpha,
-            reg_lambda=self.reg_lambda,
-            scale_pos_weight=self.scale_pos_weight,
-            base_score=self.base_score,
-            missing=self.missing,
-            num_parallel_tree=self.num_parallel_tree,
-            random_state=self.random_state,
-            n_jobs=self.n_jobs,
-            monotone_constraints=self.monotone_constraints,
-            interaction_constraints=self.interaction_constraints,
-            importance_type=self.importance_type,
-            gpu_id=self.gpu_id,
-            validate_parameters=self.validate_parameters,
-            predictor=self.predictor,
-            # enable_categorical=self.enable_categorical,
-            eval_metric=self.eval_metric,
-            early_stopping_rounds=self.early_stopping_rounds,
-        )
-
-        self.naming = XGBoostRegression.name
-
-    @property
-    def settings(self) -> Dict:
-        """The configuration to implement AutoML by FLAML framework."""
-        configuration = {
-            "time_budget": 10,  # total running time in seconds
-            "metric": "r2",
-            "estimator_list": ["xgboost"],  # list of ML learners
-            "task": "regression",  # task type
-            # "log_file_name": f"{self.naming} - automl.log",  # flaml log file
-            # "log_training_metric": True,  # whether to log training metric
-        }
-        return configuration
-
algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # self._histograms_feature_weights( - # X=XGBoostRegression.X, - # trained_model=self.auto_model, - # image_config=self.image_config, - # algorithm_name=self.naming, - # local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - # mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - # ) - - -class DecisionTreeRegression(TreeWorkflowMixin, RegressionWorkflowBase): - """The automation workflow of using Decision Tree algorithm to make insightful products.""" - - name = "Decision Tree" - special_function = ["Feature Importance Diagram", "Single Tree Diagram"] - - def __init__( - self, - criterion: str = "squared_error", - splitter: str = "best", - max_depth: Optional[int] = None, - min_samples_split: Union[int, float] = 2, - min_samples_leaf: Union[int, float] = 1, - min_weight_fraction_leaf: float = 0.0, - max_features: Union[int, float, str, None] = None, - random_state: Optional[int] = None, - max_leaf_nodes: Optional[int] = None, - min_impurity_decrease: float = 0.0, - ccp_alpha: float = 0.0, - ) -> None: - """ - Parameters - ---------- - criterion : {"squared_error", "friedman_mse", "absolute_error", \ - "poisson"}, default="squared_error" - The function to measure the quality of a split. Supported criteria - are "squared_error" for the mean squared error, which is equal to - variance reduction as feature selection criterion and minimizes the L2 - loss using the mean of each terminal node, "friedman_mse", which uses - mean squared error with Friedman's improvement score for potential - splits, "absolute_error" for the mean absolute error, which minimizes - the L1 loss using the median of each terminal node, and "poisson" which - uses reduction in Poisson deviance to find splits. - - .. versionadded:: 0.18 - Mean Absolute Error (MAE) criterion. - - .. versionadded:: 0.24 - Poisson deviance criterion. - - .. deprecated:: 1.0 - Criterion "mse" was deprecated in v1.0 and will be removed in - version 1.2. Use `criterion="squared_error"` which is equivalent. - - .. deprecated:: 1.0 - Criterion "mae" was deprecated in v1.0 and will be removed in - version 1.2. Use `criterion="absolute_error"` which is equivalent. - - splitter : {"best", "random"}, default="best" - The strategy used to choose the split at each node. Supported - strategies are "best" to choose the best split and "random" to choose - the best random split. - - max_depth : int, default=None - The maximum depth of the tree. If None, then nodes are expanded until - all leaves are pure or until all leaves contain less than - min_samples_split samples. - - min_samples_split : int or float, default=2 - The minimum number of samples required to split an internal node: - - If int, then consider `min_samples_split` as the minimum number. - - If float, then `min_samples_split` is a fraction and - `ceil(min_samples_split * n_samples)` are the minimum - number of samples for each split. - .. versionchanged:: 0.18 - Added float values for fractions. - - min_samples_leaf : int or float, default=1 - The minimum number of samples required to be at a leaf node. - A split point at any depth will only be considered if it leaves at - least ``min_samples_leaf`` training samples in each of the left and - right branches. This may have the effect of smoothing the model, - especially in regression. - - If int, then consider `min_samples_leaf` as the minimum number. 
- - If float, then `min_samples_leaf` is a fraction and - `ceil(min_samples_leaf * n_samples)` are the minimum - number of samples for each node. - .. versionchanged:: 0.18 - Added float values for fractions. - - min_weight_fraction_leaf : float, default=0.0 - The minimum weighted fraction of the sum total of weights (of all - the input samples) required to be at a leaf node. Samples have - equal weight when sample_weight is not provided. - - max_features : int, float or {"auto", "sqrt", "log2"}, default=None - The number of features to consider when looking for the best split: - - If int, then consider `max_features` features at each split. - - If float, then `max_features` is a fraction and - `max(1, int(max_features * n_features_in_))` features are considered at each - split. - - If "auto", then `max_features=n_features`. - - If "sqrt", then `max_features=sqrt(n_features)`. - - If "log2", then `max_features=log2(n_features)`. - - If None, then `max_features=n_features`. - .. deprecated:: 1.1 - The `"auto"` option was deprecated in 1.1 and will be removed - in 1.3. - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires to - effectively inspect more than ``max_features`` features. - - random_state : int, RandomState instance or None, default=None - Controls the randomness of the estimator. The features are always - randomly permuted at each split, even if ``splitter`` is set to - ``"best"``. When ``max_features < n_features``, the algorithm will - select ``max_features`` at random at each split before finding the best - split among them. But the best found split may vary across different - runs, even if ``max_features=n_features``. That is the case, if the - improvement of the criterion is identical for several splits and one - split has to be selected at random. To obtain a deterministic behaviour - during fitting, ``random_state`` has to be fixed to an integer. - See :term:`Glossary ` for details. - - max_leaf_nodes : int, default=None - Grow a tree with ``max_leaf_nodes`` in best-first fashion. - Best nodes are defined as relative reduction in impurity. - If None then unlimited number of leaf nodes. - - min_impurity_decrease : float, default=0.0 - A node will be split if this split induces a decrease of the impurity - greater than or equal to this value. - The weighted impurity decrease equation is the following:: - N_t / N * (impurity - N_t_R / N_t * right_impurity - - N_t_L / N_t * left_impurity) - where ``N`` is the total number of samples, ``N_t`` is the number of - samples at the current node, ``N_t_L`` is the number of samples in the - left child, and ``N_t_R`` is the number of samples in the right child. - ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, - if ``sample_weight`` is passed. - .. versionadded:: 0.19 - - ccp_alpha : non-negative float, default=0.0 - Complexity parameter used for Minimal Cost-Complexity Pruning. The - subtree with the largest cost complexity that is smaller than - ``ccp_alpha`` will be chosen. By default, no pruning is performed. See - :ref:`minimal_cost_complexity_pruning` for details. - .. 
versionadded:: 0.22 - - References - ---------- - Scikit-learn API: sklearn.tree.DecisionTreeRegressor - https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor - """ - - super().__init__() - self.criterion = criterion - self.splitter = splitter - self.max_depth = max_depth - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - self.min_weight_fraction_leaf = min_weight_fraction_leaf - self.max_features = max_features - self.max_leaf_nodes = max_leaf_nodes - self.min_impurity_decrease = min_impurity_decrease - self.ccp_alpha = ccp_alpha - - if random_state: - self.random_state = random_state - - # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' - self.model = DecisionTreeRegressor( - criterion=self.criterion, - splitter=self.splitter, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - min_weight_fraction_leaf=self.min_weight_fraction_leaf, - max_features=self.max_features, - random_state=self.random_state, - max_leaf_nodes=self.max_leaf_nodes, - min_impurity_decrease=self.min_impurity_decrease, - ccp_alpha=self.ccp_alpha, - ) - self.naming = DecisionTreeRegression.name - self.customized = True - self.customized_name = "Decision Tree" - - @property - def settings(self) -> Dict: - """The configuration of Decision Tree to implement AutoML by FLAML framework.""" - configuration = { - "time_budget": 10, # total running time in seconds - "metric": "r2", - "estimator_list": [self.customized_name], # list of ML learners - "task": "regression", # task type - # "log_file_name": f'{self.naming} - automl.log', # flaml log file - # "log_training_metric": True, # whether to log training metric - } - return configuration - - @property - def customization(self) -> object: - """The customized Decision Tree of FLAML framework.""" - from flaml import tune - from flaml.data import REGRESSION - from flaml.model import SKLearnEstimator - from sklearn.tree import DecisionTreeRegressor - - class MyDTRegression(SKLearnEstimator): - def __init__(self, task="regression", n_jobs=None, **config): - super().__init__(task, **config) - if task in REGRESSION: - self.estimator_class = DecisionTreeRegressor - - @classmethod - def search_space(cls, data_size, task): - space = { - "criterion": {"domain": tune.choice(["squared_error", "friedman_mse", "absolute_error", "poisson"])}, - "max_depth": {"domain": tune.randint(lower=2, upper=20), "init_value": 1, "low_cost_init_value": 1}, - "min_samples_split": { - "domain": tune.randint(lower=2, upper=10), - "init_value": 2, - "low_cost_init_value": 2, - }, - "min_samples_leaf": {"domain": tune.randint(lower=1, upper=10), "init_value": 1, "low_cost_init_value": 1}, - "max_features": {"domain": tune.randint(lower=1, upper=10), "init_value": 1, "low_cost_init_value": 1}, - } - return space - - return MyDTRegression
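How such a customized learner is consumed is worth spelling out. The sketch below is an editor's addition, not code from this module; `X_train` and `y_train` are hypothetical training data, and the key point is that the custom estimator class must be registered under the same name that `settings` places in `estimator_list`:

    from flaml import AutoML

    workflow = DecisionTreeRegression()
    automl = AutoML()
    # Register the custom estimator under the name used in workflow.settings["estimator_list"].
    automl.add_learner(learner_name=workflow.customized_name, learner_class=workflow.customization)
    automl.fit(X_train=X_train, y_train=y_train, **workflow.settings)
    print(automl.best_estimator, automl.best_config)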
- - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = decision_tree_manual_hyper_parameters() - clear_output() - return hyper_parameters - - @dispatch() - def special_components(self, **kwargs): - """Invoke all special application functions for this algorithm by Scikit-learn framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - self._plot_feature_importance( - X_train=DecisionTreeRegression.X_train, - name_column=RegressionWorkflowBase.name_train, - trained_model=self.model, - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_tree( - trained_model=self.model, - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - - @dispatch(bool) - def special_components(self, is_automl: bool = False, **kwargs) -> None: - """Invoke all special application functions for this algorithm by FLAML framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - self._plot_feature_importance( - X_train=DecisionTreeRegression.X_train, - name_column=RegressionWorkflowBase.name_train, - trained_model=self.auto_model, - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_tree( - trained_model=self.auto_model, - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - - class ExtraTreesRegression(TreeWorkflowMixin, RegressionWorkflowBase): - """The automation workflow of using Extra-Trees algorithm to make insightful products.""" - - name = "Extra-Trees" - special_function = ["Feature Importance Diagram", "Single Tree Diagram"] - - def __init__( - self, - n_estimators: int = 100, - criterion: str = "squared_error", - max_depth: Optional[int] = None, - min_samples_split: Union[int, float] = 2, - min_samples_leaf: Union[int, float] = 1, - min_weight_fraction_leaf: float = 0.0, - max_features: Union[int, float, str] = "auto", - max_leaf_nodes: Optional[int] = None, - min_impurity_decrease: float = 0.0, - # min_impurity_split=None, - bootstrap: bool = False, - oob_score: bool = False, - n_jobs: Optional[int] = None, - random_state: Optional[int] = None, - verbose: int = 0, - warm_start: bool = False, - ccp_alpha: float = 0.0, - max_samples: Optional[Union[int, float]] = None, - ) -> None: - """ - Parameters - ---------- - n_estimators : int, default=100 - The number of trees in the forest. - - .. versionchanged:: 0.22 - The default value of ``n_estimators`` changed from 10 to 100 - in 0.22. - - criterion : {"squared_error", "absolute_error"}, default="squared_error" - The function to measure the quality of a split. Supported criteria - are "squared_error" for the mean squared error, which is equal to - variance reduction as feature selection criterion, and "absolute_error" - for the mean absolute error. - - .. versionadded:: 0.18 - Mean Absolute Error (MAE) criterion. - - .. deprecated:: 1.0 - Criterion "mse" was deprecated in v1.0 and will be removed in - version 1.2. Use `criterion="squared_error"` which is equivalent. - - .. deprecated:: 1.0 - Criterion "mae" was deprecated in v1.0 and will be removed in - version 1.2. Use `criterion="absolute_error"` which is equivalent. - - max_depth : int, default=None - The maximum depth of the tree. If None, then nodes are expanded until - all leaves are pure or until all leaves contain less than - min_samples_split samples.
- - min_samples_split : int or float, default=2 - The minimum number of samples required to split an internal node: - - - If int, then consider `min_samples_split` as the minimum number. - - If float, then `min_samples_split` is a fraction and - `ceil(min_samples_split * n_samples)` are the minimum - number of samples for each split. - - .. versionchanged:: 0.18 - Added float values for fractions. - - min_samples_leaf : int or float, default=1 - The minimum number of samples required to be at a leaf node. - A split point at any depth will only be considered if it leaves at - least ``min_samples_leaf`` training samples in each of the left and - right branches. This may have the effect of smoothing the model, - especially in regression. - - - If int, then consider `min_samples_leaf` as the minimum number. - - If float, then `min_samples_leaf` is a fraction and - `ceil(min_samples_leaf * n_samples)` are the minimum - number of samples for each node. - - .. versionchanged:: 0.18 - Added float values for fractions. - - min_weight_fraction_leaf : float, default=0.0 - The minimum weighted fraction of the sum total of weights (of all - the input samples) required to be at a leaf node. Samples have - equal weight when sample_weight is not provided. - - max_features : {"auto", "sqrt", "log2"}, int or float, default="auto" - The number of features to consider when looking for the best split: - - - If int, then consider `max_features` features at each split. - - If float, then `max_features` is a fraction and - `round(max_features * n_features)` features are considered at each - split. - - If "auto", then `max_features=n_features`. - - If "sqrt", then `max_features=sqrt(n_features)`. - - If "log2", then `max_features=log2(n_features)`. - - If None, then `max_features=n_features`. - - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires to - effectively inspect more than ``max_features`` features. - - max_leaf_nodes : int, default=None - Grow trees with ``max_leaf_nodes`` in best-first fashion. - Best nodes are defined as relative reduction in impurity. - If None then unlimited number of leaf nodes. - - min_impurity_decrease : float, default=0.0 - A node will be split if this split induces a decrease of the impurity - greater than or equal to this value. - - The weighted impurity decrease equation is the following:: - - N_t / N * (impurity - N_t_R / N_t * right_impurity - - N_t_L / N_t * left_impurity) - - where ``N`` is the total number of samples, ``N_t`` is the number of - samples at the current node, ``N_t_L`` is the number of samples in the - left child, and ``N_t_R`` is the number of samples in the right child. - - ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, - if ``sample_weight`` is passed. - - .. versionadded:: 0.19 - - bootstrap : bool, default=False - Whether bootstrap samples are used when building trees. If False, the - whole dataset is used to build each tree. - - oob_score : bool, default=False - Whether to use out-of-bag samples to estimate the generalization score. - Only available if bootstrap=True. - - n_jobs : int, default=None - The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, - :meth:`decision_path` and :meth:`apply` are all parallelized over the - trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` - context. ``-1`` means using all processors. See :term:`Glossary - ` for more details. 
- - random_state : int, RandomState instance or None, default=None - Controls 3 sources of randomness: - - - the bootstrapping of the samples used when building trees - (if ``bootstrap=True``) - - the sampling of the features to consider when looking for the best - split at each node (if ``max_features < n_features``) - - the draw of the splits for each of the `max_features` - - See :term:`Glossary ` for details. - - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - - warm_start : bool, default=False - When set to ``True``, reuse the solution of the previous call to fit - and add more estimators to the ensemble, otherwise, just fit a whole - new forest. See :term:`the Glossary `. - - ccp_alpha : non-negative float, default=0.0 - Complexity parameter used for Minimal Cost-Complexity Pruning. The - subtree with the largest cost complexity that is smaller than - ``ccp_alpha`` will be chosen. By default, no pruning is performed. See - :ref:`minimal_cost_complexity_pruning` for details. - - .. versionadded:: 0.22 - - max_samples : int or float, default=None - If bootstrap is True, the number of samples to draw from X - to train each base estimator. - - - If None (default), then draw `X.shape[0]` samples. - - If int, then draw `max_samples` samples. - - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0.0, 1.0]`. - - .. versionadded:: 0.22 - - References - ---------- - Scikit-learn API: sklearn.ensemble.ExtraTreesRegressor - https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html?highlight=extratreesregressor#sklearn.ensemble.ExtraTreesRegressor - """ - - super().__init__() - self.n_estimators = n_estimators - self.criterion = criterion - self.max_depth = max_depth - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - self.min_weight_fraction_leaf = min_weight_fraction_leaf - self.max_features = max_features - self.max_leaf_nodes = max_leaf_nodes - self.min_impurity_decrease = min_impurity_decrease - # self.min_impurity_split = min_impurity_split - self.bootstrap = bootstrap - self.oob_score = oob_score - self.n_jobs = n_jobs - self.verbose = verbose - self.warm_start = warm_start - self.ccp_alpha = ccp_alpha - self.max_samples = max_samples - - if random_state: - self.random_state = random_state - - # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' - self.model = ExtraTreesRegressor( - n_estimators=self.n_estimators, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - min_weight_fraction_leaf=self.min_weight_fraction_leaf, - max_features=self.max_features, - max_leaf_nodes=self.max_leaf_nodes, - min_impurity_decrease=self.min_impurity_decrease, - # min_impurity_split=self.min_impurity_split, - bootstrap=self.bootstrap, - oob_score=self.oob_score, - n_jobs=self.n_jobs, - random_state=self.random_state, - verbose=self.verbose, - warm_start=self.warm_start, - ccp_alpha=self.ccp_alpha, - max_samples=self.max_samples, - ) - - self.naming = ExtraTreesRegression.name - - @property - def settings(self) -> Dict: - """The configuration to implement AutoML by FLAML framework.""" - configuration = { - "time_budget": 10, # total running time in seconds - "metric": "r2", - "estimator_list": ["extra_tree"], # list of ML learners - "task": "regression", # task type - # "log_file_name": f'{self.naming} - automl.log', # 
flaml log file - # "log_training_metric": True, # whether to log training metric - } - return configuration - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = extra_trees_manual_hyper_parameters() - clear_output() - return hyper_parameters - - @dispatch() - def special_components(self, **kwargs) -> None: - """Invoke all special application functions for this algorithm by Scikit-learn framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - self._plot_feature_importance( - X_train=ExtraTreesRegression.X_train, - name_column=RegressionWorkflowBase.name_train, - trained_model=self.model, - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_tree( - trained_model=self.model.estimators_[0], - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - - @dispatch(bool) - def special_components(self, is_automl: bool = False, **kwargs) -> None: - """Invoke all special application functions for this algorithm by FLAML framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - self._plot_feature_importance( - X_train=ExtraTreesRegression.X_train, - name_column=RegressionWorkflowBase.name_train, - trained_model=self.auto_model, - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_tree( - trained_model=self.auto_model.estimators_[0], - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - )
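Before moving on to Random Forest, one practical contrast between the two ensembles is worth a sketch (editor's addition on toy data): Extra-Trees defaults to `bootstrap=False` and fits every tree on the whole dataset, so the out-of-bag estimate below only becomes available once bootstrapping is switched on, whereas Random Forest bootstraps by default:

    from sklearn.datasets import make_regression
    from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

    X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=42)

    # oob_score requires bootstrap=True; Extra-Trees must opt in explicitly.
    et = ExtraTreesRegressor(n_estimators=100, bootstrap=True, oob_score=True, random_state=42).fit(X, y)
    rf = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42).fit(X, y)
    print(et.oob_score_, rf.oob_score_)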
- - class RandomForestRegression(TreeWorkflowMixin, RegressionWorkflowBase): - """The automation workflow of using Random Forest algorithm to make insightful products.""" - - name = "Random Forest" - special_function = ["Feature Importance Diagram", "Single Tree Diagram"] - - def __init__( - self, - n_estimators: int = 100, - criterion: str = "squared_error", - max_depth: Optional[int] = None, - min_samples_split: Union[int, float] = 2, - min_samples_leaf: Union[int, float] = 1, - min_weight_fraction_leaf: float = 0.0, - max_features: Union[int, float, str] = "sqrt", - max_leaf_nodes: Optional[int] = None, - min_impurity_decrease: float = 0.0, - bootstrap: bool = True, - oob_score: bool = False, - n_jobs: Optional[int] = None, - random_state: Optional[int] = None, - verbose: int = 0, - warm_start: bool = False, - # class_weight=None, - ccp_alpha: float = 0.0, - max_samples: Optional[Union[int, float]] = None, - ) -> None: - """ - Parameters - ---------- - n_estimators : int, default=100 - The number of trees in the forest. - - .. versionchanged:: 0.22 - The default value of ``n_estimators`` changed from 10 to 100 - in 0.22. - - criterion : {"squared_error", "absolute_error", "poisson"}, \ - default="squared_error" - The function to measure the quality of a split. Supported criteria - are "squared_error" for the mean squared error, which is equal to - variance reduction as feature selection criterion, "absolute_error" - for the mean absolute error, and "poisson" which uses reduction in - Poisson deviance to find splits. - Training using "absolute_error" is significantly slower - than when using "squared_error". - - .. versionadded:: 0.18 - Mean Absolute Error (MAE) criterion. - - .. versionadded:: 1.0 - Poisson criterion. - - .. deprecated:: 1.0 - Criterion "mse" was deprecated in v1.0 and will be removed in - version 1.2. Use `criterion="squared_error"` which is equivalent. - - .. deprecated:: 1.0 - Criterion "mae" was deprecated in v1.0 and will be removed in - version 1.2. Use `criterion="absolute_error"` which is equivalent. - - max_depth : int, default=None - The maximum depth of the tree. If None, then nodes are expanded until - all leaves are pure or until all leaves contain less than - min_samples_split samples. - - min_samples_split : int or float, default=2 - The minimum number of samples required to split an internal node: - - - If int, then consider `min_samples_split` as the minimum number. - - If float, then `min_samples_split` is a fraction and - `ceil(min_samples_split * n_samples)` are the minimum - number of samples for each split. - - .. versionchanged:: 0.18 - Added float values for fractions. - - min_samples_leaf : int or float, default=1 - The minimum number of samples required to be at a leaf node. - A split point at any depth will only be considered if it leaves at - least ``min_samples_leaf`` training samples in each of the left and - right branches. This may have the effect of smoothing the model, - especially in regression. - - - If int, then consider `min_samples_leaf` as the minimum number. - - If float, then `min_samples_leaf` is a fraction and - `ceil(min_samples_leaf * n_samples)` are the minimum - number of samples for each node. - - .. versionchanged:: 0.18 - Added float values for fractions. - - min_weight_fraction_leaf : float, default=0.0 - The minimum weighted fraction of the sum total of weights (of all - the input samples) required to be at a leaf node. Samples have - equal weight when sample_weight is not provided. - - max_features : {"auto", "sqrt", "log2"}, int or float, default="sqrt" - The number of features to consider when looking for the best split: - - - If int, then consider `max_features` features at each split. - - If float, then `max_features` is a fraction and - `round(max_features * n_features)` features are considered at each - split. - - If "auto", then `max_features=n_features`. - - If "sqrt", then `max_features=sqrt(n_features)`. - - If "log2", then `max_features=log2(n_features)`. - - If None, then `max_features=n_features`. - - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires to - effectively inspect more than ``max_features`` features. - - max_leaf_nodes : int, default=None - Grow trees with ``max_leaf_nodes`` in best-first fashion. - Best nodes are defined as relative reduction in impurity. - If None then unlimited number of leaf nodes. - - min_impurity_decrease : float, default=0.0 - A node will be split if this split induces a decrease of the impurity - greater than or equal to this value.
- - The weighted impurity decrease equation is the following:: - - N_t / N * (impurity - N_t_R / N_t * right_impurity - - N_t_L / N_t * left_impurity) - - where ``N`` is the total number of samples, ``N_t`` is the number of - samples at the current node, ``N_t_L`` is the number of samples in the - left child, and ``N_t_R`` is the number of samples in the right child. - - ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, - if ``sample_weight`` is passed. - - .. versionadded:: 0.19 - - bootstrap : bool, default=True - Whether bootstrap samples are used when building trees. If False, the - whole dataset is used to build each tree. - - oob_score : bool, default=False - Whether to use out-of-bag samples to estimate the generalization score. - Only available if bootstrap=True. - - n_jobs : int, default=None - The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, - :meth:`decision_path` and :meth:`apply` are all parallelized over the - trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` - context. ``-1`` means using all processors. See :term:`Glossary - ` for more details. - - random_state : int, RandomState instance or None, default=None - Controls both the randomness of the bootstrapping of the samples used - when building trees (if ``bootstrap=True``) and the sampling of the - features to consider when looking for the best split at each node - (if ``max_features < n_features``). - See :term:`Glossary ` for details. - - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - - warm_start : bool, default=False - When set to ``True``, reuse the solution of the previous call to fit - and add more estimators to the ensemble, otherwise, just fit a whole - new forest. See :term:`the Glossary `. - - ccp_alpha : non-negative float, default=0.0 - Complexity parameter used for Minimal Cost-Complexity Pruning. The - subtree with the largest cost complexity that is smaller than - ``ccp_alpha`` will be chosen. By default, no pruning is performed. See - :ref:`minimal_cost_complexity_pruning` for details. - - .. versionadded:: 0.22 - - max_samples : int or float, default=None - If bootstrap is True, the number of samples to draw from X - to train each base estimator. - - - If None (default), then draw `X.shape[0]` samples. - - If int, then draw `max_samples` samples. - - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0.0, 1.0]`. - - .. 
versionadded:: 0.22 - - References - ---------- - Scikit-learn API: sklearn.ensemble.RandomForestRegressor - https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html?highlight=randomforestregressor#sklearn.ensemble.RandomForestRegressor - """ - - super().__init__() - self.n_estimators = n_estimators - self.criterion = criterion - self.max_depth = max_depth - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - self.min_weight_fraction_leaf = min_weight_fraction_leaf - self.max_features = max_features - self.max_leaf_nodes = max_leaf_nodes - self.min_impurity_decrease = min_impurity_decrease - self.bootstrap = bootstrap - self.oob_score = oob_score - self.n_jobs = n_jobs - self.verbose = verbose - self.warm_start = warm_start - # self.class_weight = class_weight - self.ccp_alpha = ccp_alpha - self.max_samples = max_samples - - if random_state: - self.random_state = random_state - - # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' - self.model = RandomForestRegressor( - n_estimators=self.n_estimators, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - min_weight_fraction_leaf=self.min_weight_fraction_leaf, - max_features=self.max_features, - max_leaf_nodes=self.max_leaf_nodes, - min_impurity_decrease=self.min_impurity_decrease, - bootstrap=self.bootstrap, - oob_score=self.oob_score, - n_jobs=self.n_jobs, - random_state=self.random_state, - verbose=self.verbose, - warm_start=self.warm_start, - # class_weight=self.class_weight, - ccp_alpha=self.ccp_alpha, - max_samples=self.max_samples, - ) - self.naming = RandomForestRegression.name - - @property - def settings(self) -> Dict: - """The configuration to implement AutoML by FLAML framework.""" - configuration = { - "time_budget": 10, # total running time in seconds - "metric": "r2", - "estimator_list": ["rf"], # list of ML learners - "task": "regression", # task type - # "log_file_name": f'{self.naming} - automl.log', # flaml log file - # "log_training_metric": True, # whether to log training metric - } - return configuration - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = random_forest_manual_hyper_parameters() - clear_output() - return hyper_parameters - - @dispatch() - def special_components(self, **kwargs) -> None: - """Invoke all special application functions for this algorithms by Scikit-learn framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - self._plot_feature_importance( - X_train=RandomForestRegression.X_train, - name_column=RegressionWorkflowBase.name_train, - trained_model=self.model, - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_tree( - trained_model=self.model.estimators_[0], - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - - @dispatch(bool) - def special_components(self, is_automl: bool = False, **kwargs) -> None: - """Invoke all special application functions for this algorithms by FLAML framework.""" - 
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - self._plot_feature_importance( - X_train=RandomForestRegression.X_train, - name_column=RegressionWorkflowBase.name_train, - trained_model=self.auto_model, - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_tree( - trained_model=self.auto_model.estimators_[0], - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - - -class SVMRegression(RegressionWorkflowBase): - """The automation workflow of using SVR algorithm to make insightful products.""" - - name = "Support Vector Machine" - special_function = [] - - def __init__( - self, - kernel: str = "rbf", - degree: int = 3, - gamma: Union[str, float] = "scale", - # coef0: float = 0.0, - tol: float = 1e-3, - C: float = 1.0, - epsilon: float = 0.1, - shrinking: bool = True, - cache_size: float = 200, - verbose: bool = False, - max_iter: int = -1, - ) -> None: - """ - Parameters - ---------- - kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ - default='rbf' - Specifies the kernel type to be used in the algorithm. - If none is given, 'rbf' will be used. If a callable is given it is - used to precompute the kernel matrix. - - degree : int, default=3 - Degree of the polynomial kernel function ('poly'). - Ignored by all other kernels. - - gamma : {'scale', 'auto'} or float, default='scale' - Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - - if ``gamma='scale'`` (default) is passed then it uses - 1 / (n_features * X.var()) as value of gamma, - - if 'auto', uses 1 / n_features. - .. versionchanged:: 0.22 - The default value of ``gamma`` changed from 'auto' to 'scale'. - - coef0 : float, default=0.0 - Independent term in kernel function. - It is only significant in 'poly' and 'sigmoid'. - - tol : float, default=1e-3 - Tolerance for stopping criterion. - - C : float, default=1.0 - Regularization parameter. The strength of the regularization is - inversely proportional to C. Must be strictly positive. - The penalty is a squared l2 penalty. - - epsilon : float, default=0.1 - Epsilon in the epsilon-SVR model. It specifies the epsilon-tube - within which no penalty is associated in the training loss function - with points predicted within a distance epsilon from the actual - value. - shrinking : bool, default=True - Whether to use the shrinking heuristic. - See the :ref:`User Guide `. - - cache_size : float, default=200 - Specify the size of the kernel cache (in MB). - - verbose : bool, default=False - Enable verbose output. Note that this setting takes advantage of a - per-process runtime setting in libsvm that, if enabled, may not work - properly in a multithreaded context. - - max_iter : int, default=-1 - Hard limit on iterations within solver, or -1 for no limit. 
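A minimal end-to-end sketch (editor's addition; the data and parameter values are illustrative only)::

    import numpy as np
    from sklearn.svm import SVR

    rng = np.random.default_rng(42)
    X = rng.uniform(0.0, 5.0, size=(100, 1))
    y = np.sin(X).ravel()

    # Residuals smaller than epsilon (here 0.1) incur no training penalty.
    model = SVR(kernel="rbf", C=1.0, epsilon=0.1, gamma="scale").fit(X, y)
    print(model.predict([[1.0]]))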
- - References - ---------- - Scikit-learn API: sklearn.svm.SVR - https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html - """ - - super().__init__() - self.kernel = kernel - self.degree = degree - self.gamma = gamma - # self.coef0 = coef0 - self.tol = tol - self.C = C - self.epsilon = epsilon - self.shrinking = shrinking - self.cache_size = cache_size - self.verbose = verbose - self.max_iter = max_iter - - self.model = SVR( - kernel=self.kernel, - degree=self.degree, - gamma=self.gamma, - # coef0=self.coef0, - tol=self.tol, - C=self.C, - epsilon=self.epsilon, - shrinking=self.shrinking, - cache_size=self.cache_size, - verbose=self.verbose, - max_iter=self.max_iter, - ) - - self.naming = SVMRegression.name - self.customized = True - self.customized_name = "SVR" - - @property - def settings(self) -> Dict: - """The configuration of SVR to implement AutoML by FLAML framework.""" - configuration = { - "time_budget": 10, # total running time in seconds - "metric": "r2", - "estimator_list": [self.customized_name], # list of ML learners - "task": "regression", # task type - # "log_file_name": f'{self.naming} - automl.log', # flaml log file - # "log_training_metric": True, # whether to log training metric - } - return configuration - - @property - def customization(self) -> object: - """The customized SVR of FLAML framework.""" - from flaml import tune - from flaml.data import REGRESSION - from flaml.model import SKLearnEstimator - from sklearn.svm import SVR - - class MySVMRegression(SKLearnEstimator): - def __init__(self, task="regression", n_jobs=None, **config): - super().__init__(task, **config) - if task in REGRESSION: - self.estimator_class = SVR - - @classmethod - def search_space(cls, data_size, task): - space = { - "C": {"domain": tune.uniform(lower=1, upper=data_size[0]), "init_value": 1, "low_cost_init_value": 1}, - "kernel": {"domain": tune.choice(["poly", "rbf", "sigmoid"])}, - "gamma": {"domain": tune.uniform(lower=1e-5, upper=10), "init_value": 1e-1, "low_cost_init_value": 1e-1}, - "degree": {"domain": tune.quniform(lower=1, upper=5, q=1), "init_value": 3, "low_cost_init_value": 3}, - "coef0": {"domain": tune.uniform(lower=0, upper=1), "init_value": 0, "low_cost_init_value": 0}, - "shrinking": {"domain": tune.choice([True, False])}, - } - return space - - return MySVMRegression - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = svr_manual_hyper_parameters() - clear_output() - return hyper_parameters - - @dispatch() - def special_components(self, **kwargs): - """Invoke all special application functions for this algorithms by Scikit-learn framework.""" - pass - - @dispatch(bool) - def special_components(self, is_automl: bool, **kwargs) -> None: - """Invoke all special application functions for this algorithms by FLAML framework.""" - pass - - -class MLPRegression(RegressionWorkflowBase): - """The automation workflow of using Multi-layer Perceptron algorithm to make insightful products.""" - - name = "Multi-layer Perceptron" - special_function = [func.value for func in MLPSpecialFunction] # Loss Curve Diagram - - def __init__( - self, - hidden_layer_sizes: tuple = (50, 25, 5), - activation: str = "relu", - solver: str = "adam", - alpha: float = 0.0001, - batch_size: Union[int, str] = "auto", - learning_rate: str = "constant", - learning_rate_init: float = 0.001, - max_iter: int = 200, - shuffle: bool = True, - random_state: 
Optional[int] = None, - tol: float = 1e-4, - verbose: bool = False, - warm_start: bool = False, - early_stopping: bool = False, - validation_fraction: float = 0.1, - beta_1: float = 0.9, - beta_2: float = 0.999, - epsilon: float = 1e-8, - n_iter_no_change: int = 10, - ): - """ - Parameters - ---------- - hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,) - The ith element represents the number of neurons in the ith - hidden layer. - - activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' - Activation function for the hidden layer. - - 'identity', no-op activation, useful to implement linear bottleneck, - returns f(x) = x - - 'logistic', the logistic sigmoid function, - returns f(x) = 1 / (1 + exp(-x)). - - 'tanh', the hyperbolic tan function, - returns f(x) = tanh(x). - - 'relu', the rectified linear unit function, - returns f(x) = max(0, x) - - solver : {'lbfgs', 'sgd', 'adam'}, default='adam' - The solver for weight optimization. - - 'lbfgs' is an optimizer in the family of quasi-Newton methods. - - 'sgd' refers to stochastic gradient descent. - - 'adam' refers to a stochastic gradient-based optimizer proposed by - Kingma, Diederik, and Jimmy Ba - Note: The default solver 'adam' works pretty well on relatively - large datasets (with thousands of training samples or more) in terms of - both training time and validation score. - For small datasets, however, 'lbfgs' can converge faster and perform - better. - - alpha : float, default=0.0001 - Strength of the L2 regularization term. The L2 regularization term - is divided by the sample size when added to the loss. - - batch_size : int, default='auto' - Size of minibatches for stochastic optimizers. - If the solver is 'lbfgs', the classifier will not use minibatch. - When set to "auto", `batch_size=min(200, n_samples)`. - - learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' - Learning rate schedule for weight updates. - - 'constant' is a constant learning rate given by - 'learning_rate_init'. - - 'invscaling' gradually decreases the learning rate ``learning_rate_`` - at each time step 't' using an inverse scaling exponent of 'power_t'. - effective_learning_rate = learning_rate_init / pow(t, power_t) - - 'adaptive' keeps the learning rate constant to - 'learning_rate_init' as long as training loss keeps decreasing. - Each time two consecutive epochs fail to decrease training loss by at - least tol, or fail to increase validation score by at least tol if - 'early_stopping' is on, the current learning rate is divided by 5. - Only used when solver='sgd'. - - learning_rate_init : float, default=0.001 - The initial learning rate used. It controls the step-size - in updating the weights. Only used when solver='sgd' or 'adam'. - - power_t : float, default=0.5 - The exponent for inverse scaling learning rate. - It is used in updating effective learning rate when the learning_rate - is set to 'invscaling'. Only used when solver='sgd'. - - max_iter : int, default=200 - Maximum number of iterations. The solver iterates until convergence - (determined by 'tol') or this number of iterations. For stochastic - solvers ('sgd', 'adam'), note that this determines the number of epochs - (how many times each data point will be used), not the number of - gradient steps. - - shuffle : bool, default=True - Whether to shuffle samples in each iteration. Only used when - solver='sgd' or 'adam'. 
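To make the solver dependence concrete (editor's sketch; the values are arbitrary), the minibatch-related arguments only take effect with the stochastic solvers::

    from sklearn.neural_network import MLPRegressor

    # batch_size and shuffle matter here because 'adam' is a stochastic solver...
    stochastic = MLPRegressor(solver="adam", batch_size=32, shuffle=True, random_state=42)

    # ...whereas 'lbfgs' is full-batch and ignores both settings.
    full_batch = MLPRegressor(solver="lbfgs")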
- - random_state : int, RandomState instance, default=None - Determines random number generation for weights and bias - initialization, train-test split if early stopping is used, and batch - sampling when solver='sgd' or 'adam'. - Pass an int for reproducible results across multiple function calls. - See :term:`Glossary `. - - tol : float, default=1e-4 - Tolerance for the optimization. When the loss or score is not improving - by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, - unless ``learning_rate`` is set to 'adaptive', convergence is - considered to be reached and training stops. - - verbose : bool, default=False - Whether to print progress messages to stdout. - - warm_start : bool, default=False - When set to True, reuse the solution of the previous - call to fit as initialization, otherwise, just erase the - previous solution. See :term:`the Glossary `. - - momentum : float, default=0.9 - Momentum for gradient descent update. Should be between 0 and 1. Only - used when solver='sgd'. - - nesterovs_momentum : bool, default=True - Whether to use Nesterov's momentum. Only used when solver='sgd' and - momentum > 0. - - early_stopping : bool, default=False - Whether to use early stopping to terminate training when validation - score is not improving. If set to true, it will automatically set - aside 10% of training data as validation and terminate training when - validation score is not improving by at least ``tol`` for - ``n_iter_no_change`` consecutive epochs. - Only effective when solver='sgd' or 'adam'. - - validation_fraction : float, default=0.1 - The proportion of training data to set aside as validation set for - early stopping. Must be between 0 and 1. - Only used if early_stopping is True. - - beta_1 : float, default=0.9 - Exponential decay rate for estimates of first moment vector in adam, - should be in [0, 1). Only used when solver='adam'. - - beta_2 : float, default=0.999 - Exponential decay rate for estimates of second moment vector in adam, - should be in [0, 1). Only used when solver='adam'. - - epsilon : float, default=1e-8 - Value for numerical stability in adam. Only used when solver='adam'. - - n_iter_no_change : int, default=10 - Maximum number of epochs to not meet ``tol`` improvement. - Only effective when solver='sgd' or 'adam'. - - .. versionadded:: 0.20 - - max_fun : int, default=15000 - Only used when solver='lbfgs'. Maximum number of function calls. - The solver iterates until convergence (determined by 'tol'), number - of iterations reaches max_iter, or this number of function calls. - Note that number of function calls will be greater than or equal to - the number of iterations for the MLPRegressor. - - .. versionadded:: 0.22
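Tying the stochastic-solver parameters together, a short sketch (editor's addition; shapes and values are illustrative)::

    import numpy as np
    from sklearn.neural_network import MLPRegressor

    rng = np.random.default_rng(0)
    X = rng.normal(size=(500, 3))
    y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=500)

    # 10% of the training rows are held out for early stopping.
    model = MLPRegressor(
        hidden_layer_sizes=(50, 25, 5),
        solver="adam",
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        random_state=0,
    ).fit(X, y)
    # loss_curve_ is what the workflow's "Loss Curve Diagram" plots.
    print(model.n_iter_, len(model.loss_curve_))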
- - References - ---------- - Scikit-learn API: sklearn.neural_network.MLPRegressor - https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html - """ - super().__init__() - self.hidden_layer_sizes = hidden_layer_sizes - self.activation = activation - self.solver = solver - self.alpha = alpha - self.batch_size = batch_size - self.learning_rate = learning_rate - self.learning_rate_init = learning_rate_init - self.max_iter = max_iter - self.shuffle = shuffle - self.tol = tol - self.verbose = verbose - self.warm_start = warm_start - self.early_stopping = early_stopping - self.validation_fraction = validation_fraction - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.epsilon = epsilon - self.n_iter_no_change = n_iter_no_change - - if random_state: - self.random_state = random_state - - # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' - self.model = MLPRegressor( - hidden_layer_sizes=self.hidden_layer_sizes, - activation=self.activation, - solver=self.solver, - alpha=self.alpha, - batch_size=self.batch_size, - learning_rate=self.learning_rate, - learning_rate_init=self.learning_rate_init, - max_iter=self.max_iter, - shuffle=self.shuffle, - random_state=self.random_state, - tol=self.tol, - verbose=self.verbose, - warm_start=self.warm_start, - early_stopping=self.early_stopping, - validation_fraction=self.validation_fraction, - beta_1=self.beta_1, - beta_2=self.beta_2, - epsilon=self.epsilon, - n_iter_no_change=self.n_iter_no_change, - ) - - self.naming = MLPRegression.name - - def ray_tune(self, X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame) -> None: - """The customized MLP workflow combining the Ray, FLAML and Scikit-learn frameworks.""" - - from ray import tune - from ray.air import session - from ray.tune.search import ConcurrencyLimiter - from ray.tune.search.flaml import BlendSearch - from sklearn.metrics import mean_squared_error - - def customized_model(l1: int, l2: int, l3: int, batch: int) -> object: - """The customized model by Scikit-learn framework.""" - return MLPRegressor(hidden_layer_sizes=(l1, l2, l3), batch_size=batch) - - def evaluate(l1: int, l2: int, l3: int, batch: int) -> float: - """The evaluation function, simulating a long-running ML experiment - to get the model's performance at every epoch.""" - regr = customized_model(l1, l2, l3, batch) - regr.fit(X_train, y_train) - y_pred = regr.predict(X_test) - mse = mean_squared_error(y_test, y_pred) - rmse = np.sqrt(mse) # Use RMSE score - return rmse - - def objective(config: Dict) -> None: - """Objective function takes a Tune config, evaluates the score of your experiment in a training loop, - and uses session.report to report the score back to Tune.""" - for step in range(config["steps"]): - score = evaluate(config["l1"], config["l2"], config["l3"], config["batch"]) - session.report({"iterations": step, "mean_loss": score}) - - # Search space: The critical assumption is that the optimal hyper-parameters live within this space. - search_config = { - "l1": tune.randint(1, 20), - "l2": tune.randint(1, 30), - "l3": tune.randint(1, 20), - "batch": tune.randint(20, 100), - } - - # Define the time budget in seconds. - time_budget_s = 30 - - # Integrate with FLAML's BlendSearch to implement hyper-parameter optimization.
- algo = BlendSearch(metric="mean_loss", mode="min", space=search_config) - algo.set_search_properties(config={"time_budget_s": time_budget_s}) - algo = ConcurrencyLimiter(algo, max_concurrent=4) - - # Use Ray Tune to run the experiment to "min"imize the “mean_loss” of the "objective" - # by searching "search_config" via "algo", "num_samples" times. - tuner = tune.Tuner( - objective, - tune_config=tune.TuneConfig( - metric="mean_loss", - mode="min", - search_alg=algo, - num_samples=-1, - time_budget_s=time_budget_s, - ), - param_space={"steps": 100}, - ) - results = tuner.fit() - - # The hyper-parameters found to minimize the mean loss of the defined objective and the corresponding model. - best_result = results.get_best_result(metric="mean_loss", mode="min") - self.ray_best_model = customized_model(best_result.config["l1"], best_result.config["l2"], best_result.config["l3"], best_result.config["batch"]) - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = multi_layer_perceptron_manual_hyper_parameters() - clear_output() - return hyper_parameters - - @staticmethod - def _plot_loss_curve(trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str, func_name: str) -> None: - """Plot the learning curve of the trained model.""" - print(f"-----* {func_name} *-----") - data = pd.DataFrame(trained_model.loss_curve_, columns=["Loss"]) - data.plot(title="Loss") - save_fig(f"{func_name} - {algorithm_name}", local_path, mlflow_path) - save_data(data, f"{func_name} - {algorithm_name}", local_path, mlflow_path) - - @dispatch() - def special_components(self, **kwargs) -> None: - """Invoke all special application functions for this algorithms by Scikit-learn framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - if self.model.get_params()["solver"] in ["sgd", "adam"]: - self._plot_loss_curve( - trained_model=self.model, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - func_name=MLPSpecialFunction.LOSS_CURVE_DIAGRAM.value, - ) - - @dispatch(bool) - def special_components(self, is_automl: bool, **kwargs) -> None: - """Invoke all special application functions for this algorithms by FLAML framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - if self.model.get_params()["solver"] in ["sgd", "adam"]: - self._plot_loss_curve( - trained_model=self.auto_model, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - func_name=MLPSpecialFunction.LOSS_CURVE_DIAGRAM.value, - ) - - -class ClassicalLinearRegression(LinearWorkflowMixin, RegressionWorkflowBase): - """The automation workflow of using Linear Regression algorithm to make insightful products.""" - - name = "Linear Regression" - special_function = ["Linear Regression Formula", "2D Scatter Diagram", "3D Scatter Diagram", "2D Line Diagram", "3D Surface Diagram"] - - def __init__( - self, - fit_intercept: bool = True, - copy_X: bool = True, - n_jobs: Optional[int] = None, - positive: bool = False, - ) -> None: - """ - Parameters - ---------- - fit_intercept : bool, default=True - Whether to calculate the intercept for this model. 
If set - to False, no intercept will be used in calculations - (i.e. data is expected to be centered). - - copy_X : bool, default=True - If True, X will be copied; else, it may be overwritten. - - n_jobs : int, default=None - The number of jobs to use for the computation. This will only provide - speedup in case of sufficiently large problems, that is if firstly - `n_targets > 1` and secondly `X` is sparse or if `positive` is set - to `True`. ``None`` means 1 unless in a - :obj:`joblib.parallel_backend` context. ``-1`` means using all - processors. See :term:`Glossary ` for more details. - - positive : bool, default=False - When set to ``True``, forces the coefficients to be positive. This - option is only supported for dense arrays. - .. versionadded:: 0.24 - - References - ---------- - Scikit-learn API: sklearn.linear_model.LinearRegression - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html?highlight=linearregression - """ - super().__init__() - self.fit_intercept = fit_intercept - self.copy_X = copy_X - self.n_jobs = n_jobs - self.positive = positive - - self.model = LinearRegression( - fit_intercept=self.fit_intercept, - copy_X=self.copy_X, - n_jobs=self.n_jobs, - positive=self.positive, - ) - - self.naming = ClassicalLinearRegression.name - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = linear_regression_manual_hyper_parameters() - clear_output() - return hyper_parameters - - def special_components(self, **kwargs) -> None: - """Invoke all special application functions for this algorithms by Scikit-learn framework.""" - GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - self._show_formula( - coef=self.model.coef_, - intercept=self.model.intercept_, - features_name=ClassicalLinearRegression.X_train.columns, - regression_classification="Regression", - y_train=ClassicalLinearRegression.y, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, - mlflow_path="root", - ) - columns_num = ClassicalLinearRegression.X.shape[1] - if columns_num > 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ClassicalLinearRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=ClassicalLinearRegression.y_test, - data_name=ClassicalLinearRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # choose two of dimensions to draw - three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(ClassicalLinearRegression.X_test, 2) - self._plot_3d_scatter_diagram( - feature_data=three_dimen_data, - target_data=ClassicalLinearRegression.y_test, - data_name=ClassicalLinearRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ClassicalLinearRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=ClassicalLinearRegression.y_test, - 
data_name=ClassicalLinearRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # no need to choose - self._plot_3d_scatter_diagram( - feature_data=ClassicalLinearRegression.X_test, - target_data=ClassicalLinearRegression.y_test, - data_name=ClassicalLinearRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_3d_surface_diagram( - feature_data=ClassicalLinearRegression.X_test, - target_data=ClassicalLinearRegression.y_test, - data_name=ClassicalLinearRegression.name_test, - y_test_predict=ClassicalLinearRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 1: - # no need to choose - self._plot_2d_scatter_diagram( - feature_data=ClassicalLinearRegression.X_test, - target_data=ClassicalLinearRegression.y_test, - data_name=ClassicalLinearRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_2d_line_diagram( - feature_data=ClassicalLinearRegression.X_test, - target_data=ClassicalLinearRegression.y_test, - data_name=ClassicalLinearRegression.name_test, - y_test_predict=ClassicalLinearRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - else: - pass - - -class KNNRegression(RegressionWorkflowBase): - """The automation workflow of using KNN algorithm to make insightful products.""" - - name = "K-Nearest Neighbors" - special_function = [] - - def __init__( - self, - n_neighbors: int = 5, - *, - weights: str = "uniform", - algorithm: str = "auto", - leaf_size: int = 30, - p: int = 2, - metric: str = "minkowski", - metric_params: Optional[Dict] = None, - n_jobs: Optional[int] = None, - ) -> None: - """ - Parameters - ---------- - n_neighbors : int, default=5 - Number of neighbors to use by default for :meth:`kneighbors` queries. - - weights : {'uniform', 'distance'}, callable or None, default='uniform' - Weight function used in prediction. Possible values: - - - 'uniform' : uniform weights. All points in each neighborhood - are weighted equally. - - 'distance' : weight points by the inverse of their distance. - in this case, closer neighbors of a query point will have a - greater influence than neighbors which are further away. - - [callable] : a user-defined function which accepts an - array of distances, and returns an array of the same shape - containing the weights. - - Uniform weights are used by default. - - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' - Algorithm used to compute the nearest neighbors: - - - 'ball_tree' will use :class:`BallTree` - - 'kd_tree' will use :class:`KDTree` - - 'brute' will use a brute-force search. - - 'auto' will attempt to decide the most appropriate algorithm - based on the values passed to :meth:`fit` method. - - Note: fitting on sparse input will override the setting of - this parameter, using brute force. - - leaf_size : int, default=30 - Leaf size passed to BallTree or KDTree. This can affect the - speed of the construction and query, as well as the memory - required to store the tree. 
The optimal value depends on the - nature of the problem. - - p : int, default=2 - Power parameter for the Minkowski metric. When p = 1, this is - equivalent to using manhattan_distance (l1), and euclidean_distance - (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - - metric : str or callable, default='minkowski' - Metric to use for distance computation. Default is "minkowski", which - results in the standard Euclidean distance when p = 2. See the - documentation of `scipy.spatial.distance - `_ and - the metrics listed in - :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric - values. - - If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`sparse graph`, in which - case only "nonzero" elements may be considered neighbors. - - If metric is a callable function, it takes two arrays representing 1D - vectors as inputs and must return one value indicating the distance - between those vectors. This works for Scipy's metrics, but is less - efficient than passing the metric name as a string. - - metric_params : dict, default=None - Additional keyword arguments for the metric function. - - n_jobs : int, default=None - The number of parallel jobs to run for neighbors search. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - Doesn't affect :meth:`fit` method. - - References - ---------- - Scikit-learn API: sklearn.neighbors.KNeighborsRegressor - https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html - """ - super().__init__() - self.n_neighbors = n_neighbors - self.weights = weights - self.algorithm = algorithm - self.leaf_size = leaf_size - self.p = p - self.metric = metric - self.metric_params = metric_params - self.n_jobs = n_jobs - - self.model = KNeighborsRegressor( - n_neighbors=self.n_neighbors, - weights=self.weights, - algorithm=self.algorithm, - leaf_size=self.leaf_size, - p=self.p, - metric=self.metric, - metric_params=self.metric_params, - n_jobs=self.n_jobs, - ) - - self.naming = KNNRegression.name - - @property - def settings(self) -> Dict: - """The configuration to implement AutoML by FLAML framework.""" - configuration = { - "time_budget": 10, # total running time in seconds - "metric": "r2", - "estimator_list": ["kneighbor"], # list of ML learners - "task": "regression", # task type - # "log_file_name": f"{self.naming} - automl.log", # flaml log file - # "log_training_metric": True, # whether to log training metric - } - return configuration - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = knn_manual_hyper_parameters() - clear_output() - return hyper_parameters - - @dispatch() - def special_components(self, **kwargs) -> None: - """Invoke all special application functions for this algorithms by Scikit-learn framework.""" - pass - - @dispatch(bool) - def special_components(self, is_automl: bool, **kwargs) -> None: - """Invoke all special application functions for this algorithms by FLAML framework.""" - pass - - -class GradientBoostingRegression(TreeWorkflowMixin, RegressionWorkflowBase): - """The automation workflow of using Gradient Boosting algorithm to make insightful products.""" - - name = "Gradient Boosting" - special_function = ["Feature Importance Diagram", "Single Tree Diagram"] - - def 
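The `@dispatch()` / `@dispatch(bool)` pairs used throughout this file come from the multipledispatch package: inside a class body, `dispatch` selects the overload by the runtime types of the arguments after `self`, so a bare call reaches the scikit-learn branch and a boolean flag reaches the FLAML branch. A minimal, self-contained sketch of the mechanism (the class name is illustrative):

```python
from multipledispatch import dispatch


class Workflow:
    @dispatch()
    def special_components(self) -> None:
        print("scikit-learn branch")

    @dispatch(bool)
    def special_components(self, is_automl: bool) -> None:
        print("FLAML branch, is_automl =", is_automl)


w = Workflow()
w.special_components()      # -> scikit-learn branch
w.special_components(True)  # -> FLAML branch, is_automl = True
```

This is why callers in this codebase toggle between the two code paths simply by passing or omitting the `is_automl` flag.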
__init__( - self, - *, - loss: str = "squared_error", - learning_rate: float = 0.1, - n_estimators: int = 100, - subsample: float = 1.0, - criterion: str = "friedman_mse", - min_samples_split: Union[int, float] = 2, - min_samples_leaf: Union[int, float] = 1, - min_weight_fraction_leaf: float = 0.0, - max_depth: Optional[int] = 3, - min_impurity_decrease: float = 0.0, - init: Optional[object] = None, - random_state: Optional[int] = None, - max_features: Optional[Union[str, int, float]] = None, - alpha: float = 0.9, - verbose: int = 0, - max_leaf_nodes: Optional[int] = None, - warm_start: bool = False, - validation_fraction: float = 0.1, - n_iter_no_change: Optional[int] = None, - tol: float = 1e-4, - ccp_alpha: float = 0.0, - ) -> None: - """ - Parameters - ---------- - loss : {'squared_error', 'absolute_error', 'huber', 'quantile'}, \ - default='squared_error' - Loss function to be optimized. 'squared_error' refers to the squared - error for regression. 'absolute_error' refers to the absolute error of - regression and is a robust loss function. 'huber' is a - combination of the two. 'quantile' allows quantile regression (use - `alpha` to specify the quantile). - - learning_rate : float, default=0.1 - Learning rate shrinks the contribution of each tree by `learning_rate`. - There is a trade-off between learning_rate and n_estimators. - Values must be in the range `[0.0, inf)`. - - n_estimators : int, default=100 - The number of boosting stages to perform. Gradient boosting - is fairly robust to over-fitting so a large number usually - results in better performance. - Values must be in the range `[1, inf)`. - - subsample : float, default=1.0 - The fraction of samples to be used for fitting the individual base - learners. If smaller than 1.0 this results in Stochastic Gradient - Boosting. `subsample` interacts with the parameter `n_estimators`. - Choosing `subsample < 1.0` leads to a reduction of variance - and an increase in bias. - Values must be in the range `(0.0, 1.0]`. - - criterion : {'friedman_mse', 'squared_error'}, default='friedman_mse' - The function to measure the quality of a split. Supported criteria are - "friedman_mse" for the mean squared error with improvement score by - Friedman, "squared_error" for mean squared error. The default value of - "friedman_mse" is generally the best as it can provide a better - approximation in some cases. - - .. versionadded:: 0.18 - - min_samples_split : int or float, default=2 - The minimum number of samples required to split an internal node: - - - If int, values must be in the range `[2, inf)`. - - If float, values must be in the range `(0.0, 1.0]` and `min_samples_split` - will be `ceil(min_samples_split * n_samples)`. - - .. versionchanged:: 0.18 - Added float values for fractions. - - min_samples_leaf : int or float, default=1 - The minimum number of samples required to be at a leaf node. - A split point at any depth will only be considered if it leaves at - least ``min_samples_leaf`` training samples in each of the left and - right branches. This may have the effect of smoothing the model, - especially in regression. - - - If int, values must be in the range `[1, inf)`. - - If float, values must be in the range `(0.0, 1.0)` and `min_samples_leaf` - will be `ceil(min_samples_leaf * n_samples)`. - - .. versionchanged:: 0.18 - Added float values for fractions. - - min_weight_fraction_leaf : float, default=0.0 - The minimum weighted fraction of the sum total of weights (of all - the input samples) required to be at a leaf node. 
Samples have - equal weight when sample_weight is not provided. - Values must be in the range `[0.0, 0.5]`. - - max_depth : int or None, default=3 - Maximum depth of the individual regression estimators. The maximum - depth limits the number of nodes in the tree. Tune this parameter - for best performance; the best value depends on the interaction - of the input variables. If None, then nodes are expanded until - all leaves are pure or until all leaves contain less than - min_samples_split samples. - If int, values must be in the range `[1, inf)`. - - min_impurity_decrease : float, default=0.0 - A node will be split if this split induces a decrease of the impurity - greater than or equal to this value. - Values must be in the range `[0.0, inf)`. - - The weighted impurity decrease equation is the following:: - - N_t / N * (impurity - N_t_R / N_t * right_impurity - - N_t_L / N_t * left_impurity) - - where ``N`` is the total number of samples, ``N_t`` is the number of - samples at the current node, ``N_t_L`` is the number of samples in the - left child, and ``N_t_R`` is the number of samples in the right child. - - ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, - if ``sample_weight`` is passed. - - .. versionadded:: 0.19 - - init : estimator or 'zero', default=None - An estimator object that is used to compute the initial predictions. - ``init`` has to provide :term:`fit` and :term:`predict`. If 'zero', the - initial raw predictions are set to zero. By default a - ``DummyEstimator`` is used, predicting either the average target value - (for loss='squared_error'), or a quantile for the other losses. - - random_state : int, RandomState instance or None, default=None - Controls the random seed given to each Tree estimator at each - boosting iteration. - In addition, it controls the random permutation of the features at - each split (see Notes for more details). - It also controls the random splitting of the training data to obtain a - validation set if `n_iter_no_change` is not None. - Pass an int for reproducible output across multiple function calls. - See :term:`Glossary `. - - max_features : {'sqrt', 'log2'}, int or float, default=None - The number of features to consider when looking for the best split: - - - If int, values must be in the range `[1, inf)`. - - If float, values must be in the range `(0.0, 1.0]` and the features - considered at each split will be `max(1, int(max_features * n_features_in_))`. - - If "sqrt", then `max_features=sqrt(n_features)`. - - If "log2", then `max_features=log2(n_features)`. - - If None, then `max_features=n_features`. - - Choosing `max_features < n_features` leads to a reduction of variance - and an increase in bias. - - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires to - effectively inspect more than ``max_features`` features. - - alpha : float, default=0.9 - The alpha-quantile of the huber loss function and the quantile - loss function. Only if ``loss='huber'`` or ``loss='quantile'``. - Values must be in the range `(0.0, 1.0)`. - - verbose : int, default=0 - Enable verbose output. If 1 then it prints progress and performance - once in a while (the more trees the lower the frequency). If greater - than 1 then it prints progress and performance for every tree. - Values must be in the range `[0, inf)`. - - max_leaf_nodes : int, default=None - Grow trees with ``max_leaf_nodes`` in best-first fashion. 
- Best nodes are defined as relative reduction in impurity. - Values must be in the range `[2, inf)`. - If None, then unlimited number of leaf nodes. - - warm_start : bool, default=False - When set to ``True``, reuse the solution of the previous call to fit - and add more estimators to the ensemble, otherwise, just erase the - previous solution. See :term:`the Glossary `. - - validation_fraction : float, default=0.1 - The proportion of training data to set aside as validation set for - early stopping. Values must be in the range `(0.0, 1.0)`. - Only used if ``n_iter_no_change`` is set to an integer. - - .. versionadded:: 0.20 - - n_iter_no_change : int, default=None - ``n_iter_no_change`` is used to decide if early stopping will be used - to terminate training when validation score is not improving. By - default it is set to None to disable early stopping. If set to a - number, it will set aside ``validation_fraction`` size of the training - data as validation and terminate training when validation score is not - improving in all of the previous ``n_iter_no_change`` numbers of - iterations. - Values must be in the range `[1, inf)`. - - .. versionadded:: 0.20 - - tol : float, default=1e-4 - Tolerance for the early stopping. When the loss is not improving - by at least tol for ``n_iter_no_change`` iterations (if set to a - number), the training stops. - Values must be in the range `[0.0, inf)`. - - .. versionadded:: 0.20 - - ccp_alpha : non-negative float, default=0.0 - Complexity parameter used for Minimal Cost-Complexity Pruning. The - subtree with the largest cost complexity that is smaller than - ``ccp_alpha`` will be chosen. By default, no pruning is performed. - Values must be in the range `[0.0, inf)`. - See :ref:`minimal_cost_complexity_pruning` for details. - - .. 
versionadded:: 0.22 - - References - ---------- - Scikit-learn API: sklearn.ensemble.GradientBoostingRegressor - https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html - """ - super().__init__() - self.loss = loss - self.learning_rate = learning_rate - self.n_estimators = n_estimators - self.subsample = subsample - self.criterion = criterion - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - self.min_weight_fraction_leaf = min_weight_fraction_leaf - self.max_depth = max_depth - self.min_impurity_decrease = min_impurity_decrease - self.init = init - self.max_features = max_features - self.alpha = alpha - self.verbose = verbose - self.max_leaf_nodes = max_leaf_nodes - self.warm_start = warm_start - self.validation_fraction = validation_fraction - self.n_iter_no_change = n_iter_no_change - self.tol = tol - self.ccp_alpha = ccp_alpha - - if random_state: - self.random_state = random_state - - # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' - self.model = GradientBoostingRegressor( - loss=self.loss, - learning_rate=self.learning_rate, - n_estimators=self.n_estimators, - subsample=self.subsample, - criterion=self.criterion, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - min_weight_fraction_leaf=self.min_weight_fraction_leaf, - max_depth=self.max_depth, - min_impurity_decrease=self.min_impurity_decrease, - init=self.init, - random_state=self.random_state, - max_features=self.max_features, - alpha=self.alpha, - verbose=self.verbose, - max_leaf_nodes=self.max_leaf_nodes, - warm_start=self.warm_start, - validation_fraction=self.validation_fraction, - n_iter_no_change=self.n_iter_no_change, - tol=self.tol, - ccp_alpha=self.ccp_alpha, - ) - - self.naming = GradientBoostingRegression.name - self.customized = True - self.customized_name = "Gradient Boosting" - - @property - def settings(self) -> Dict: - """The configuration of Gradient Boosting to implement AutoML by FLAML framework.""" - configuration = { - "time_budget": 10, # total running time in seconds - "metric": "r2", - "estimator_list": [self.customized_name], # list of ML learners - "task": "regression", # task type - # "log_file_name": f'{self.naming} - automl.log', # flaml log file - # "log_training_metric": True, # whether to log training metric - } - return configuration - - @property - def customization(self) -> object: - """The customized Gradient Boosting of FLAML framework.""" - from flaml import tune - from flaml.data import REGRESSION - from flaml.model import SKLearnEstimator - from sklearn.ensemble import GradientBoostingRegressor - - class MyGradientBoostingRegression(SKLearnEstimator): - def __init__(self, task="regression", n_jobs=None, **config): - super().__init__(task, **config) - if task in REGRESSION: - self.estimator_class = GradientBoostingRegressor - - @classmethod - def search_space(cls, data_size, task): - space = { - "n_estimators": {"domain": tune.lograndint(lower=4, upper=512), "init_value": 100}, - "max_depth": {"domain": tune.randint(lower=1, upper=10), "init_value": 3}, - "learning_rate": {"domain": tune.loguniform(lower=0.001, upper=1.0), "init_value": 0.1}, - "subsample": {"domain": tune.uniform(lower=0.1, upper=1.0), "init_value": 1.0}, - "min_samples_split": {"domain": tune.randint(lower=2, upper=20), "init_value": 2}, - "min_samples_leaf": {"domain": tune.randint(lower=1, upper=20), "init_value": 1}, - "min_impurity_decrease": {"domain": 
tune.loguniform(lower=1e-10, upper=1e-2), "init_value": 0.0}, - } - return space - - return MyGradientBoostingRegression - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = gradient_boosting_manual_hyper_parameters() - clear_output() - return hyper_parameters - - @dispatch() - def special_components(self, **kwargs) -> None: - """Invoke all special application functions for this algorithms by Scikit-learn framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - self._plot_feature_importance( - X_train=GradientBoostingRegression.X_train, - name_column=RegressionWorkflowBase.name_train, - trained_model=self.model, - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_tree( - trained_model=self.model.estimators_[0][0], - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - - @dispatch(bool) - def special_components(self, is_automl: bool, **kwargs) -> None: - """Invoke all special application functions for this algorithms by FLAML framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - self._plot_feature_importance( - X_train=GradientBoostingRegression.X_train, - name_column=RegressionWorkflowBase.name_train, - trained_model=self.auto_model, - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_tree( - trained_model=self.auto_model.estimators_[0][0], - image_config=self.image_config, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - - -class LassoRegression(LinearWorkflowMixin, RegressionWorkflowBase): - """The automation workflow of using Lasso to make insightful products.""" - - name = "Lasso Regression" - special_function = ["Lasso Regression Formula", "2D Scatter Diagram", "3D Scatter Diagram", "2D Line Diagram", "3D Surface Diagram"] - - def __init__( - self, - alpha: float = 1.0, - *, - fit_intercept: bool = True, - precompute: bool = False, - copy_X: bool = True, - max_iter: int = 1000, - tol: float = 1e-4, - warm_start: bool = False, - positive: bool = False, - random_state: Optional[int] = None, - selection: str = "cyclic", - ) -> None: - """ - Parameters - ---------- - alpha : float, default=1.0 - Constant that multiplies the L1 term, controlling regularization - strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. - - When `alpha = 0`, the objective is equivalent to ordinary least - squares, solved by the :class:`LinearRegression` object. For numerical - reasons, using `alpha = 0` with the `Lasso` object is not advised. - Instead, you should use the :class:`LinearRegression` object. - - fit_intercept : bool, default=True - Whether to calculate the intercept for this model. If set - to False, no intercept will be used in calculations - (i.e. data is expected to be centered). 
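The `customization` properties in this file all follow FLAML's custom-learner recipe: subclass `SKLearnEstimator`, point `estimator_class` at the underlying scikit-learn model, and declare a `search_space` of `flaml.tune` domains. A hedged end-to-end sketch of how such a class is registered and consumed, on synthetic data (the learner name `my_gbr` and the trimmed search space are illustrative, and the `flaml.model` / `flaml.data` import paths may differ across FLAML versions):

```python
import numpy as np
from flaml import AutoML, tune
from flaml.data import REGRESSION
from flaml.model import SKLearnEstimator
from sklearn.ensemble import GradientBoostingRegressor


class MyGBR(SKLearnEstimator):
    """Illustrative custom learner following the same recipe as above."""

    def __init__(self, task="regression", n_jobs=None, **config):
        super().__init__(task, **config)
        if task in REGRESSION:
            self.estimator_class = GradientBoostingRegressor

    @classmethod
    def search_space(cls, data_size, task):
        return {
            "n_estimators": {"domain": tune.lograndint(lower=4, upper=512), "init_value": 100},
            "learning_rate": {"domain": tune.loguniform(lower=0.001, upper=1.0), "init_value": 0.1},
        }


rng = np.random.default_rng(42)
X = rng.normal(size=(200, 3))                                   # synthetic features
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=200)

automl = AutoML()
# Register the custom learner under the name referenced by "estimator_list".
automl.add_learner(learner_name="my_gbr", learner_class=MyGBR)
automl.fit(X_train=X, y_train=y, task="regression", metric="r2",
           time_budget=10, estimator_list=["my_gbr"])
print(automl.best_config)
```

In this module the registration step is handled by the surrounding workflow machinery, which reads `customized_name`, `settings`, and `customization` together.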
- - precompute : bool or array-like of shape (n_features, n_features),\ - default=False - Whether to use a precomputed Gram matrix to speed up - calculations. The Gram matrix can also be passed as argument. - For sparse input this option is always ``False`` to preserve sparsity. - - copy_X : bool, default=True - If ``True``, X will be copied; else, it may be overwritten. - - max_iter : int, default=1000 - The maximum number of iterations. - - tol : float, default=1e-4 - The tolerance for the optimization: if the updates are - smaller than ``tol``, the optimization code checks the - dual gap for optimality and continues until it is smaller - than ``tol``, see Notes below. - - warm_start : bool, default=False - When set to True, reuse the solution of the previous call to fit as - initialization, otherwise, just erase the previous solution. - See :term:`the Glossary `. - - positive : bool, default=False - When set to ``True``, forces the coefficients to be positive. - - random_state : int, RandomState instance, default=None - The seed of the pseudo random number generator that selects a random - feature to update. Used when ``selection`` == 'random'. - Pass an int for reproducible output across multiple function calls. - See :term:`Glossary `. - - selection : {'cyclic', 'random'}, default='cyclic' - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. - - References - ---------- - Scikit-learn API: sklearn.linear_model.Lasso - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html - """ - super().__init__() - self.alpha = alpha - self.fit_intercept = fit_intercept - self.precompute = precompute - self.copy_X = copy_X - self.max_iter = max_iter - self.tol = tol - self.warm_start = warm_start - self.positive = positive - self.selection = selection - - if random_state: - self.random_state = random_state - - # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' - self.model = Lasso( - alpha=self.alpha, - fit_intercept=self.fit_intercept, - precompute=self.precompute, - copy_X=self.copy_X, - max_iter=self.max_iter, - tol=self.tol, - warm_start=self.warm_start, - positive=self.positive, - random_state=self.random_state, - selection=self.selection, - ) - - self.naming = LassoRegression.name - self.customized = True - self.customized_name = "Lasso" - - @property - def settings(self) -> Dict: - """The configuration of Lasso to implement AutoML by FLAML framework.""" - configuration = { - "time_budget": 10, # total running time in seconds - "metric": "r2", - "estimator_list": [self.customized_name], # list of ML learners - "task": "regression", # task type - # "log_file_name": f'{self.naming} - automl.log', # flaml log file - # "log_training_metric": True, # whether to log training metric - } - return configuration - - @property - def customization(self) -> object: - """The customized Lasso of FLAML framework.""" - from flaml import tune - from flaml.data import REGRESSION - from flaml.model import SKLearnEstimator - from sklearn.linear_model import Lasso - - class MyLassoRegression(SKLearnEstimator): - def __init__(self, task="regression", n_jobs=None, **config): - super().__init__(task, **config) - if task in REGRESSION: - self.estimator_class = Lasso - - @classmethod - def search_space(cls, data_size, task): - space = { - "alpha": {"domain": 
tune.uniform(lower=0.001, upper=10), "init_value": 1}, - "fit_intercept": {"domain": tune.choice([True, False])}, - "max_iter": {"domain": tune.randint(lower=500, upper=2000), "init_value": 1000}, - "tol": {"domain": tune.uniform(lower=1e-5, upper=1e-3), "init_value": 1e-4}, - "selection": {"domain": tune.choice(["cyclic", "random"])}, - } - return space - - return MyLassoRegression - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = lasso_regression_manual_hyper_parameters() - clear_output() - return hyper_parameters - - @dispatch() - def special_components(self, **kwargs) -> None: - """Invoke all special application functions for this algorithms by Scikit-learn framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") - self._show_formula( - coef=[self.model.coef_], - intercept=self.model.intercept_, - features_name=LassoRegression.X_train.columns, - regression_classification="Regression", - y_train=LassoRegression.y, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, - mlflow_path="root", - ) - columns_num = LassoRegression.X.shape[1] - if columns_num > 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(LassoRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # choose two of dimensions to draw - three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(LassoRegression.X_test, 2) - self._plot_3d_scatter_diagram( - feature_data=three_dimen_data, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(LassoRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # no need to choose - self._plot_3d_scatter_diagram( - feature_data=LassoRegression.X_test, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_3d_surface_diagram( - feature_data=LassoRegression.X_test, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - y_test_predict=LassoRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 1: - # no need to choose - self._plot_2d_scatter_diagram( - feature_data=LassoRegression.X_test, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - 
algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_2d_line_diagram( - feature_data=LassoRegression.X_test, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - y_test_predict=LassoRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - else: - pass - - @dispatch(bool) - def special_components(self, is_automl: bool, **kwargs) -> None: - """Invoke all special application functions for this algorithms by FLAML framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") - self._show_formula( - coef=[self.auto_model.coef_], - intercept=self.auto_model.intercept_, - features_name=LassoRegression.X_train.columns, - regression_classification="Regression", - y_train=LassoRegression.y, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, - mlflow_path="root", - ) - columns_num = LassoRegression.X.shape[1] - if columns_num > 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(LassoRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # choose two of dimensions to draw - three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(LassoRegression.X_test, 2) - self._plot_3d_scatter_diagram( - feature_data=three_dimen_data, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(LassoRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # no need to choose - self._plot_3d_scatter_diagram( - feature_data=LassoRegression.X_test, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_3d_surface_diagram( - feature_data=LassoRegression.X_test, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - y_test_predict=LassoRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 1: - # no need to choose - self._plot_2d_scatter_diagram( - feature_data=LassoRegression.X_test, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - 
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_2d_line_diagram( - feature_data=LassoRegression.X_test, - target_data=LassoRegression.y_test, - data_name=LassoRegression.name_test, - y_test_predict=LassoRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - else: - pass - - -class ElasticNetRegression(LinearWorkflowMixin, RegressionWorkflowBase): - """The automation workflow of using Elastic Net algorithm to make insightful products.""" - - name = "Elastic Net" - special_function = ["Elastic Net Formula", "2D Scatter Diagram", "3D Scatter Diagram", "2D Line Diagram", "3D Surface Diagram"] - - def __init__( - self, - alpha: float = 1.0, - l1_ratio: float = 0.5, - fit_intercept: bool = True, - precompute: bool = False, - max_iter: int = 1000, - copy_X: bool = True, - tol: float = 1e-4, - warm_start: bool = False, - positive: bool = False, - random_state: Optional[int] = None, - selection: str = "cyclic", - ) -> None: - """ - Parameters - ---------- - alpha : float, default=1.0 - Constant that multiplies the penalty terms. Defaults to 1.0. - See the notes for the exact mathematical meaning of this - parameter. ``alpha = 0`` is equivalent to an ordinary least square, - solved by the :class:`LinearRegression` object. For numerical - reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. - Given this, you should use the :class:`LinearRegression` object. - - l1_ratio : float, default=0.5 - The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For - ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it - is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a - combination of L1 and L2. - - fit_intercept : bool, default=True - Whether the intercept should be estimated or not. If ``False``, the - data is assumed to be already centered. - - precompute : bool or array-like of shape (n_features, n_features),\ - default=False - Whether to use a precomputed Gram matrix to speed up - calculations. The Gram matrix can also be passed as argument. - For sparse input this option is always ``False`` to preserve sparsity. - - max_iter : int, default=1000 - The maximum number of iterations. - - copy_X : bool, default=True - If ``True``, X will be copied; else, it may be overwritten. - - tol : float, default=1e-4 - The tolerance for the optimization: if the updates are - smaller than ``tol``, the optimization code checks the - dual gap for optimality and continues until it is smaller - than ``tol``, see Notes below. - - warm_start : bool, default=False - When set to ``True``, reuse the solution of the previous call to fit as - initialization, otherwise, just erase the previous solution. - See :term:`the Glossary `. - - positive : bool, default=False - When set to ``True``, forces the coefficients to be positive. - - random_state : int, RandomState instance, default=None - The seed of the pseudo random number generator that selects a random - feature to update. Used when ``selection`` == 'random'. - Pass an int for reproducible output across multiple function calls. - See :term:`Glossary `. - - selection : {'cyclic', 'random'}, default='cyclic' - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. 
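Since `l1_ratio` (documented above) sweeps Elastic Net between a pure L2 penalty at 0 and a pure L1 penalty at 1, the sparsity of the fitted coefficients grows with it. A small illustration on synthetic data where only the first feature matters (scikit-learn will warn that `l1_ratio=0` is better served by ridge regression, consistent with the docstring's advice):

```python
import numpy as np
from sklearn.linear_model import ElasticNet

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
y = X[:, 0] * 3.0 + rng.normal(scale=0.1, size=100)  # only feature 0 matters

for l1_ratio in (0.0, 0.5, 1.0):
    model = ElasticNet(alpha=0.1, l1_ratio=l1_ratio).fit(X, y)
    # Expect more exact zeros in coef_ as l1_ratio grows toward the L1 end.
    print(l1_ratio, np.round(model.coef_, 3))
```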
- - References - ---------- - Scikit-learn API: sklearn.linear_model.ElasticNet - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html - """ - super().__init__() - self.alpha = alpha - self.l1_ratio = l1_ratio - self.fit_intercept = fit_intercept - self.precompute = precompute - self.max_iter = max_iter - self.copy_X = copy_X - self.tol = tol - self.warm_start = warm_start - self.positive = positive - self.selection = selection - - if random_state: - self.random_state = random_state - - # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' - self.model = ElasticNet( - alpha=self.alpha, - l1_ratio=self.l1_ratio, - fit_intercept=self.fit_intercept, - precompute=self.precompute, - max_iter=self.max_iter, - copy_X=self.copy_X, - tol=self.tol, - warm_start=self.warm_start, - positive=self.positive, - random_state=self.random_state, - selection=self.selection, - ) - - self.naming = ElasticNetRegression.name - self.customized = True - self.customized_name = "Elastic Net" - - @property - def settings(self) -> Dict: - """The configuration of Elastic Net to implement AutoML by FLAML framework.""" - configuration = { - "time_budget": 10, # total running time in seconds - "metric": "r2", - "estimator_list": [self.customized_name], # list of ML learners - "task": "regression", # task type - # "log_file_name": f'{self.naming} - automl.log', # flaml log file - # "log_training_metric": True, # whether to log training metric - } - return configuration - - @property - def customization(self) -> object: - """The customized Elastic Net of FLAML framework.""" - from flaml import tune - from flaml.data import REGRESSION - from flaml.model import SKLearnEstimator - from sklearn.linear_model import ElasticNet - - class MyElasticNetRegression(SKLearnEstimator): - def __init__(self, task="regression", n_jobs=None, **config): - super().__init__(task, **config) - if task in REGRESSION: - self.estimator_class = ElasticNet - - @classmethod - def search_space(cls, data_size, task): - space = { - "alpha": {"domain": tune.uniform(lower=0.001, upper=10), "init_value": 1}, - "l1_ratio": {"domain": tune.uniform(lower=0.001, upper=1), "init_value": 0.5}, - "fit_intercept": {"domain": tune.choice([True, False])}, - "max_iter": {"domain": tune.randint(lower=500, upper=2000), "init_value": 1000}, - "tol": {"domain": tune.uniform(lower=1e-5, upper=1e-3), "init_value": 1e-4}, - "selection": {"domain": tune.choice(["cyclic", "random"])}, - } - return space - - return MyElasticNetRegression - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = elastic_net_manual_hyper_parameters() - clear_output() - return hyper_parameters - - @dispatch() - def special_components(self, **kwargs) -> None: - """Invoke all special application functions for this algorithms by Scikit-learn framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") - self._show_formula( - coef=[self.model.coef_], - intercept=self.model.intercept_, - features_name=ElasticNetRegression.X_train.columns, - regression_classification="Regression", - y_train=ElasticNetRegression.y, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, - mlflow_path="root", - ) - columns_num = ElasticNetRegression.X.shape[1] - if 
columns_num > 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # choose two of dimensions to draw - three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 2) - self._plot_3d_scatter_diagram( - feature_data=three_dimen_data, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # no need to choose - self._plot_3d_scatter_diagram( - feature_data=ElasticNetRegression.X_test, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_3d_surface_diagram( - feature_data=ElasticNetRegression.X_test, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - y_test_predict=ElasticNetRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 1: - # no need to choose - self._plot_2d_scatter_diagram( - feature_data=ElasticNetRegression.X_test, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_2d_line_diagram( - feature_data=ElasticNetRegression.X_test, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - y_test_predict=ElasticNetRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - else: - pass - - @dispatch(bool) - def special_components(self, is_automl: bool, **kwargs) -> None: - """Invoke all special application functions for this algorithms by FLAML framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") - self._show_formula( - coef=[self.auto_model.coef_], - intercept=self.auto_model.intercept_, - features_name=ElasticNetRegression.X_train.columns, - regression_classification="Regression", - y_train=ElasticNetRegression.y, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, - mlflow_path="root", - ) - columns_num = ElasticNetRegression.X.shape[1] - if columns_num 
> 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # choose two of dimensions to draw - three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 2) - self._plot_3d_scatter_diagram( - feature_data=three_dimen_data, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # no need to choose - self._plot_3d_scatter_diagram( - feature_data=ElasticNetRegression.X_test, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_3d_surface_diagram( - feature_data=ElasticNetRegression.X_test, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - y_test_predict=ElasticNetRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 1: - # no need to choose - self._plot_2d_scatter_diagram( - feature_data=ElasticNetRegression.X_test, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_2d_line_diagram( - feature_data=ElasticNetRegression.X_test, - target_data=ElasticNetRegression.y_test, - data_name=ElasticNetRegression.name_test, - y_test_predict=ElasticNetRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - else: - pass - - -class SGDRegression(LinearWorkflowMixin, RegressionWorkflowBase): - """The automation workflow of using Stochastic Gradient Descent - SGD algorithm to make insightful products.""" - - name = "SGD Regression" - special_function = ["SGD Regression Formula", "2D Scatter Diagram", "3D Scatter Diagram", "2D Line Diagram", "3D Surface Diagram"] - - def __init__( - self, - loss: str = "squared_error", - penalty: str = "l2", - alpha: float = 0.0001, - l1_ratio: float = 0.15, - fit_intercept: bool = True, - max_iter: int = 1000, - tol: Union[float, None] = 0.001, - shuffle: bool = True, - verbose: int = 0, - epsilon: float = 0.1, - random_state: Optional[int] = None, - learning_rate: str = "invscaling", - eta0: float = 0.01, - power_t: float = 0.25, - 
early_stopping: bool = False, - validation_fraction: float = 0.1, - n_iter_no_change: int = 5, - warm_start: bool = False, - average: Union[bool, int] = False, - ) -> None: - """ - Parameters - ---------- - loss : str, default='squared_error' - The loss function to be used. The possible values are 'squared_error', - 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive' - The 'squared_error' refers to the ordinary least squares fit. - 'huber' modifies 'squared_error' to focus less on getting outliers - correct by switching from squared to linear loss past a distance of - epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is - linear past that; this is the loss function used in SVR. - 'squared_epsilon_insensitive' is the same but becomes squared loss past - a tolerance of epsilon. - More details about the losses formulas can be found in the - :ref:`User Guide `. - - penalty : {'l2', 'l1', 'elasticnet', None}, default='l2' - The penalty (aka regularization term) to be used. Defaults to 'l2' - which is the standard regularizer for linear SVM models. 'l1' and - 'elasticnet' might bring sparsity to the model (feature selection) - not achievable with 'l2'. No penalty is added when set to `None`. - - alpha : float, default=0.0001 - Constant that multiplies the regularization term. The higher the - value, the stronger the regularization. - Also used to compute the learning rate when set to `learning_rate` is - set to 'optimal'. - - l1_ratio : float, default=0.15 - The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. - l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. - Only used if `penalty` is 'elasticnet'. - - fit_intercept : bool, default=True - Whether the intercept should be estimated or not. If False, the - data is assumed to be already centered. - - max_iter : int, default=1000 - The maximum number of passes over the training data (aka epochs). - It only impacts the behavior in the ``fit`` method, and not the - :meth:`partial_fit` method. - .. versionadded:: 0.19 - - tol : float or None, default=1e-3 - The stopping criterion. If it is not None, training will stop - when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive - epochs. - Convergence is checked against the training loss or the - validation loss depending on the `early_stopping` parameter. - .. versionadded:: 0.19 - - shuffle : bool, default=True - Whether or not the training data should be shuffled after each epoch. - - verbose : int, default=0 - The verbosity level. - - epsilon : float, default=0.1 - Epsilon in the epsilon-insensitive loss functions; only if `loss` is - 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. - For 'huber', determines the threshold at which it becomes less - important to get the prediction exactly right. - For epsilon-insensitive, any differences between the current prediction - and the correct label are ignored if they are less than this threshold. - - random_state : int, RandomState instance, default=None - Used for shuffling the data, when ``shuffle`` is set to ``True``. - Pass an int for reproducible output across multiple function calls. - See :term:`Glossary `. - - learning_rate : str, default='invscaling' - The learning rate schedule: - - 'constant': `eta = eta0` - - 'optimal': `eta = 1.0 / (alpha * (t + t0))` - where t0 is chosen by a heuristic proposed by Leon Bottou. - - 'invscaling': `eta = eta0 / pow(t, power_t)` - - 'adaptive': eta = eta0, as long as the training keeps decreasing. 
- Each time n_iter_no_change consecutive epochs fail to decrease the - training loss by tol or fail to increase validation score by tol if - early_stopping is True, the current learning rate is divided by 5. - .. versionadded:: 0.20 - Added 'adaptive' option - - eta0 : float, default=0.01 - The initial learning rate for the 'constant', 'invscaling' or - 'adaptive' schedules. The default value is 0.01. - - power_t : float, default=0.25 - The exponent for inverse scaling learning rate. - - early_stopping : bool, default=False - Whether to use early stopping to terminate training when validation - score is not improving. If set to True, it will automatically set aside - a fraction of training data as validation and terminate - training when validation score returned by the `score` method is not - improving by at least `tol` for `n_iter_no_change` consecutive - epochs. - .. versionadded:: 0.20 - Added 'early_stopping' option - - validation_fraction : float, default=0.1 - The proportion of training data to set aside as validation set for - early stopping. Must be between 0 and 1. - Only used if `early_stopping` is True. - .. versionadded:: 0.20 - Added 'validation_fraction' option - - n_iter_no_change : int, default=5 - Number of iterations with no improvement to wait before stopping - fitting. - Convergence is checked against the training loss or the - validation loss depending on the `early_stopping` parameter. - .. versionadded:: 0.20 - Added 'n_iter_no_change' option - - warm_start : bool, default=False - When set to True, reuse the solution of the previous call to fit as - initialization, otherwise, just erase the previous solution. - See :term:`the Glossary `. - Repeatedly calling fit or partial_fit when warm_start is True can - result in a different solution than when calling fit a single time - because of the way the data is shuffled. - If a dynamic learning rate is used, the learning rate is adapted - depending on the number of samples already seen. Calling ``fit`` resets - this counter, while ``partial_fit`` will result in increasing the - existing counter. - - average : bool or int, default=False - When set to True, computes the averaged SGD weights across all - updates and stores the result in the ``coef_`` attribute. If set to - an int greater than 1, averaging will begin once the total number of - samples seen reaches `average`. So ``average=10`` will begin - averaging after seeing 10 samples. 
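The early-stopping options documented above work together: with `early_stopping=True`, `validation_fraction` of the training data is held out, and fitting stops once the validation score fails to improve by at least `tol` for `n_iter_no_change` consecutive epochs. A small sketch wiring those knobs together on synthetic data (parameter values are illustrative):

```python
import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.1, size=500)

model = SGDRegressor(
    loss="squared_error",
    learning_rate="adaptive",   # eta divided by 5 after stalls, per the docs above
    eta0=0.01,
    early_stopping=True,        # hold out validation_fraction of the data
    validation_fraction=0.1,
    n_iter_no_change=5,
    tol=1e-3,
    random_state=42,
)
model.fit(X, y)
print(model.n_iter_, model.coef_)   # epochs actually run, fitted coefficients
```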
- - References - ---------- - Scikit-learn API: sklearn.linear_model.SGDRegressor - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html - """ - super().__init__() - self.loss = loss - self.penalty = penalty - self.alpha = alpha - self.l1_ratio = l1_ratio - self.fit_intercept = fit_intercept - self.max_iter = max_iter - self.tol = tol - self.shuffle = shuffle - self.verbose = verbose - self.epsilon = epsilon - self.learning_rate = learning_rate - self.eta0 = eta0 - self.power_t = power_t - self.early_stopping = early_stopping - self.validation_fraction = validation_fraction - self.n_iter_no_change = n_iter_no_change - self.warm_start = warm_start - self.average = average - - if random_state: - self.random_state = random_state - - # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' - self.model = SGDRegressor( - loss=self.loss, - penalty=self.penalty, - alpha=self.alpha, - l1_ratio=self.l1_ratio, - fit_intercept=self.fit_intercept, - max_iter=self.max_iter, - tol=self.tol, - shuffle=self.shuffle, - verbose=self.verbose, - epsilon=self.epsilon, - random_state=self.random_state, - learning_rate=self.learning_rate, - eta0=self.eta0, - power_t=self.power_t, - early_stopping=self.early_stopping, - validation_fraction=self.validation_fraction, - n_iter_no_change=self.n_iter_no_change, - warm_start=self.warm_start, - average=self.average, - ) - - self.naming = SGDRegression.name - self.customized = True - self.customized_name = "SGD Regression" - - @property - def settings(self) -> Dict: - """The configuration of SGD to implement AutoML by FLAML framework.""" - configuration = { - "time_budget": 10, # total running time in seconds - "metric": "r2", - "estimator_list": [self.customized_name], # list of ML learners - "task": "regression", # task type - # "log_file_name": f'{self.naming} - automl.log', # flaml log file - # "log_training_metric": True, # whether to log training metric - } - return configuration - - @property - def customization(self) -> object: - """The customized SGD of FLAML framework.""" - from flaml import tune - from flaml.data import REGRESSION - from flaml.model import SKLearnEstimator - from sklearn.linear_model import SGDRegressor - - class MySGDRegression(SKLearnEstimator): - def __init__(self, task="regression", n_jobs=None, **config): - super().__init__(task, **config) - if task in REGRESSION: - self.estimator_class = SGDRegressor - - @classmethod - def search_space(cls, data_size, task): - space = { - "loss": {"domain": tune.choice(["squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"]), "init_value": "squared_error"}, - "penalty": {"domain": tune.choice(["l2", "l1", "elasticnet", None]), "init_value": "l2"}, - "alpha": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.0001}, - "fit_intercept": {"domain": tune.choice([True, False]), "init_value": True}, - "max_iter": {"domain": tune.randint(lower=50, upper=1000), "init_value": 1000}, - "tol": {"domain": tune.loguniform(lower=0.000001, upper=0.001), "init_value": 0.001}, - "shuffle": {"domain": tune.choice([True, False]), "init_value": True}, - "learning_rate": {"domain": tune.choice(["constant", "optimal", "invscaling", "adaptive"]), "init_value": "invscaling"}, - "eta0": {"domain": tune.loguniform(lower=0.0001, upper=0.1), "init_value": 0.01}, - "power_t": {"domain": tune.uniform(lower=0.1, upper=0.9), "init_value": 0.25}, - "l1_ratio": {"domain": tune.uniform(lower=0, upper=1), "init_value": 0.15}, - } 
- return space - - return MySGDRegression - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = sgd_regression_manual_hyper_parameters() - clear_output() - return hyper_parameters - - @dispatch() - def special_components(self, **kwargs) -> None: - """Invoke all special application functions for this algorithm by Scikit-learn framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") - self._show_formula( - coef=[self.model.coef_], - intercept=self.model.intercept_, - features_name=SGDRegression.X_train.columns, - regression_classification="Regression", - y_train=SGDRegression.y, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, - mlflow_path="root", - ) - columns_num = SGDRegression.X.shape[1] - if columns_num > 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=SGDRegression.y_test, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # choose two of dimensions to draw - three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 2) - self._plot_3d_scatter_diagram( - feature_data=three_dimen_data, - target_data=SGDRegression.y_test, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=SGDRegression.y_test, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # no need to choose - self._plot_3d_scatter_diagram( - feature_data=SGDRegression.X_test, - target_data=SGDRegression.y_test, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_3d_surface_diagram( - feature_data=SGDRegression.X_test, - target_data=SGDRegression.y_test, - y_test_predict=SGDRegression.y_test_predict, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 1: - # no need to choose - self._plot_2d_scatter_diagram( - feature_data=SGDRegression.X_test, - target_data=SGDRegression.y_test, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_2d_line_diagram( - feature_data=SGDRegression.X_test, - target_data=SGDRegression.y_test, - y_test_predict=SGDRegression.y_test_predict, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, -
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - else: - pass - - @dispatch(bool) - def special_components(self, is_automl: bool, **kwargs) -> None: - """Invoke all special application functions for this algorithm by FLAML framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") - self._show_formula( - coef=[self.auto_model.coef_], - intercept=self.auto_model.intercept_, - features_name=SGDRegression.X_train.columns, - regression_classification="Regression", - y_train=SGDRegression.y, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, - mlflow_path="root", - ) - columns_num = SGDRegression.X.shape[1] - if columns_num > 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=SGDRegression.y_test, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # choose two of dimensions to draw - three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 2) - self._plot_3d_scatter_diagram( - feature_data=three_dimen_data, - target_data=SGDRegression.y_test, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=SGDRegression.y_test, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # no need to choose - self._plot_3d_scatter_diagram( - feature_data=SGDRegression.X_test, - target_data=SGDRegression.y_test, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_3d_surface_diagram( - feature_data=SGDRegression.X_test, - target_data=SGDRegression.y_test, - y_test_predict=SGDRegression.y_test_predict, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 1: - # no need to choose - self._plot_2d_scatter_diagram( - feature_data=SGDRegression.X_test, - target_data=SGDRegression.y_test, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_2d_line_diagram( - feature_data=SGDRegression.X_test, - target_data=SGDRegression.y_test, - y_test_predict=SGDRegression.y_test_predict, - data_name=SGDRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - else: - pass - - -class
BayesianRidgeRegression(RegressionWorkflowBase): - """The automation workflow of using Bayesian ridge regression algorithm to make insightful products.""" - - name = "BayesianRidge Regression" - special_function = [] - - def __init__( - self, - tol: float = 0.001, - alpha_1: float = 0.000001, - alpha_2: float = 0.000001, - lambda_1: float = 0.000001, - lambda_2: float = 0.000001, - alpha_init: float = None, - lambda_init: float = None, - compute_score: bool = False, - fit_intercept: bool = True, - copy_X: bool = True, - verbose: bool = False, - ) -> None: - """ - Parameters - ------------ - max_iter : int, default=None - Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion. If `None`, it corresponds to `max_iter=300`. - - Changed in version 1.3. - - tol : float, default=1e-3 - Stop the algorithm if w has converged. - - alpha_1 : float, default=1e-6 - Hyper-parameter : shape parameter for the Gamma distribution prior over the alpha parameter. - - alpha_2 : float, default=1e-6 - Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the alpha parameter. - - lambda_1 : float, default=1e-6 - Hyper-parameter : shape parameter for the Gamma distribution prior over the lambda parameter. - - lambda_2 : float, default=1e-6 - Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the lambda parameter. - - alpha_init : float, default=None - Initial value for alpha (precision of the noise). If not set, alpha_init is 1/Var(y). - - New in version 0.22. - - lambda_init : float, default=None - Initial value for lambda (precision of the weights). If not set, lambda_init is 1. - - New in version 0.22. - - compute_score : bool, default=False - If True, compute the log marginal likelihood at each iteration of the optimization. - - fit_intercept : bool, default=True - Whether to calculate the intercept for this model. The intercept is not treated as a probabilistic parameter and - thus has no associated variance. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). - - copy_X : bool, default=True - If True, X will be copied; else, it may be overwritten. - - verbose : bool, default=False - Verbose mode when fitting the model. - - n_iter : int - Maximum number of iterations. Should be greater than or equal to 1.
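To make the Gamma-prior hyper-parameters above concrete, here is a minimal, self-contained sketch (synthetic data, not tied to this workflow) of what scikit-learn's BayesianRidge exposes once fitted:

import numpy as np
from sklearn.linear_model import BayesianRidge

rng = np.random.RandomState(42)
X = rng.rand(100, 3)
y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)

model = BayesianRidge(alpha_1=1e-6, alpha_2=1e-6, lambda_1=1e-6, lambda_2=1e-6, compute_score=True)
model.fit(X, y)
print(model.coef_, model.intercept_)  # posterior mean of the weights and the intercept
print(model.alpha_)   # estimated precision of the noise
print(model.lambda_)  # estimated precision of the weights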
- - References - ---------- - Scikit-learn API: sklearn.linear_model.BayesianRidge - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.BayesianRidge.html - """ - super().__init__() - self.tol = tol - self.alpha_1 = alpha_1 - self.alpha_2 = alpha_2 - self.lambda_1 = lambda_1 - self.lambda_2 = lambda_2 - self.alpha_init = alpha_init - self.lambda_init = lambda_init - self.compute_score = compute_score - self.fit_intercept = fit_intercept - self.copy_X = copy_X - self.verbose = verbose - - self.model = BayesianRidge( - tol=self.tol, - alpha_1=self.alpha_1, - alpha_2=self.alpha_2, - lambda_1=self.lambda_1, - lambda_2=self.lambda_2, - alpha_init=self.alpha_init, - lambda_init=self.lambda_init, - compute_score=self.compute_score, - fit_intercept=self.fit_intercept, - copy_X=self.copy_X, - verbose=self.verbose, - ) - - self.naming = BayesianRidgeRegression.name - self.customized = True - self.customized_name = "BayesianRidge Regression" - - @property - def settings(self) -> Dict: - """The configuration of BayesianRidge to implement AutoML by FLAML framework.""" - configuration = { - "time_budget": 10, # total running time in seconds - "metric": "r2", - "estimator_list": [self.customized_name], # list of ML learners - "task": "regression", # task type - # "log_file_name": f'{self.naming} - automl.log', # flaml log file - # "log_training_metric": True, # whether to log training metric - } - return configuration - - @property - def customization(self) -> object: - """The customized BayesianRidge of FLAML framework.""" - from flaml import tune - from flaml.data import REGRESSION - from flaml.model import SKLearnEstimator - from sklearn.linear_model import BayesianRidge - - class MyBayesianRidgeRegression(SKLearnEstimator): - def __init__(self, task="regression", n_jobs=None, **config): - super().__init__(task, **config) - if task in REGRESSION: - self.estimator_class = BayesianRidge - - @classmethod - def search_space(cls, data_size, task): - space = { - "tol": {"domain": tune.loguniform(lower=0.000001, upper=0.001), "init_value": 0.001}, - "alpha_1": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001}, - "alpha_2": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001}, - "lambda_1": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001}, - "lambda_2": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001}, - "alpha_init": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001}, - "lambda_init": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001}, - "compute_score": {"domain": tune.choice([True, False]), "init_value": False}, - "fit_intercept": {"domain": tune.choice([True, False]), "init_value": True}, - "copy_X": {"domain": tune.choice([True, False]), "init_value": True}, - "verbose": {"domain": tune.choice([True, False]), "init_value": False}, - } - return space - - return MyBayesianRidgeRegression - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = bayesian_ridge_manual_hyper_parameters() - clear_output() - return hyper_parameters - - @dispatch() - def special_components(self, **kwargs) -> None: - pass - - @dispatch(bool) - def special_components(self, is_automl: bool, **kwargs) -> None: - pass - - -class RidgeRegression(LinearWorkflowMixin, RegressionWorkflowBase): - """The automation workflow of using Ridge regression algorithm
to make insightful products.""" - - name = "Ridge Regression" - special_function = ["Ridge Regression Formula", "2D Scatter Diagram", "3D Scatter Diagram", "2D Line Diagram", "3D Surface Diagram"] - - def __init__( - self, - alpha: float = 1.0, - *, - fit_intercept: bool = True, - copy_X: bool = True, - max_iter: int = 1000, - tol: float = 1e-4, - solver: str = "auto", - positive: bool = False, - random_state: Optional[int] = None, - ) -> None: - """ - Parameters - ---------- - alpha : {float, ndarray of shape (n_targets,)}, default=1.0 - Constant that multiplies the L2 term, controlling regularization - strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. - - When `alpha = 0`, the objective is equivalent to ordinary least - squares, solved by the :class:`LinearRegression` object. For numerical - reasons, using `alpha = 0` with the `Ridge` object is not advised. - Instead, you should use the :class:`LinearRegression` object. - - If an array is passed, penalties are assumed to be specific to the - targets. Hence they must correspond in number. - - fit_intercept : bool, default=True - Whether to fit the intercept for this model. If set - to false, no intercept will be used in calculations - (i.e. ``X`` and ``y`` are expected to be centered). - - copy_X : bool, default=True - If True, X will be copied; else, it may be overwritten. - - max_iter : int, default=None - Maximum number of iterations for conjugate gradient solver. - For 'sparse_cg' and 'lsqr' solvers, the default value is determined - by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. - For 'lbfgs' solver, the default value is 15000. - - tol : float, default=1e-4 - The precision of the solution (`coef_`) is determined by `tol` which - specifies a different convergence criterion for each solver: - - - 'svd': `tol` has no impact. - - - 'cholesky': `tol` has no impact. - - - 'sparse_cg': norm of residuals smaller than `tol`. - - - 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr, - which control the norm of the residual vector in terms of the norms of - matrix and coefficients. - - - 'sag' and 'saga': relative change of coef smaller than `tol`. - - - 'lbfgs': maximum of the absolute (projected) gradient=max|residuals| - smaller than `tol`. - - .. versionchanged:: 1.2 - Default value changed from 1e-3 to 1e-4 for consistency with other linear - models. - - solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \ - 'sag', 'saga', 'lbfgs'}, default='auto' - Solver to use in the computational routines: - - - 'auto' chooses the solver automatically based on the type of data. - - - 'svd' uses a Singular Value Decomposition of X to compute the Ridge - coefficients. It is the most stable solver, in particular more stable - for singular matrices than 'cholesky' at the cost of being slower. - - - 'cholesky' uses the standard scipy.linalg.solve function to - obtain a closed-form solution. - - - 'sparse_cg' uses the conjugate gradient solver as found in - scipy.sparse.linalg.cg. As an iterative algorithm, this solver is - more appropriate than 'cholesky' for large-scale data - (possibility to set `tol` and `max_iter`). - - - 'lsqr' uses the dedicated regularized least-squares routine - scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative - procedure. - - - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses - its improved, unbiased version named SAGA. 
Both methods also use an - iterative procedure, and are often faster than other solvers when - both n_samples and n_features are large. Note that 'sag' and - 'saga' fast convergence is only guaranteed on features with - approximately the same scale. You can preprocess the data with a - scaler from sklearn.preprocessing. - - - 'lbfgs' uses L-BFGS-B algorithm implemented in - `scipy.optimize.minimize`. It can be used only when `positive` - is True. - - All solvers except 'svd' support both dense and sparse data. However, only - 'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when - `fit_intercept` is True. - - .. versionadded:: 0.17 - Stochastic Average Gradient descent solver. - .. versionadded:: 0.19 - SAGA solver. - - positive : bool, default=False - When set to ``True``, forces the coefficients to be positive. - Only 'lbfgs' solver is supported in this case. - - random_state : int, RandomState instance, default=None - Used when ``solver`` == 'sag' or 'saga' to shuffle the data. - See :term:`Glossary ` for details. - - .. versionadded:: 0.17 - `random_state` to support Stochastic Average Gradient. - - References - ---------- - Scikit-learn API: sklearn.linear_model.Ridge - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html - - """ - super().__init__() - self.alpha = alpha - self.fit_intercept = fit_intercept - self.copy_X = copy_X - self.max_iter = max_iter - self.tol = tol - self.solver = solver - self.positive = positive - self.random_state = random_state - - self.model = Ridge( - alpha=self.alpha, - fit_intercept=self.fit_intercept, - copy_X=self.copy_X, - max_iter=self.max_iter, - tol=self.tol, - solver=self.solver, - positive=self.positive, - random_state=self.random_state, - ) - - self.naming = RidgeRegression.name - self.customized = True - self.customized_name = "Ridge" - - @property - def settings(self) -> Dict: - """The configuration of Ridge to implement AutoML by FLAML framework.""" - configuration = { - "time_budget": 10, # total running time in seconds - "metric": "r2", - "estimator_list": [self.customized_name], # list of ML learners - "task": "regression", # task type - # "log_file_name": f'{self.naming} - automl.log', # flaml log file - # "log_training_metric": True, # whether to log training metric - } - return configuration - - @property - def customization(self) -> object: - """The customized Ridge of FLAML framework.""" - from flaml import tune - from flaml.data import REGRESSION - from flaml.model import SKLearnEstimator - from sklearn.linear_model import Ridge - - class MyRidgeRegression(SKLearnEstimator): - def __init__(self, task="regression", n_jobs=None, **config): - super().__init__(task, **config) - if task in REGRESSION: - self.estimator_class = Ridge - - @classmethod - def search_space(cls, data_size, task): - space = { - "alpha": {"domain": tune.uniform(lower=0.001, upper=10), "init_value": 1}, - "fit_intercept": {"domain": tune.choice([True, False])}, - "max_iter": {"domain": tune.randint(lower=500, upper=2000), "init_value": 1000}, - "tol": {"domain": tune.uniform(lower=1e-5, upper=1e-3), "init_value": 1e-4}, - # "selection": {"domain": tune.choice(["cyclic", "random"])}, - # "random_state": {'domain': tune.choice(["", ""])} - } - return space - - return MyRidgeRegression - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = ridge_regression_manual_hyper_parameters() -
clear_output() - return hyper_parameters - - @dispatch() - def special_components(self, **kwargs) -> None: - """Invoke all special application functions for this algorithm by Scikit-learn framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") - self._show_formula( - coef=[self.model.coef_], - intercept=self.model.intercept_, - features_name=RidgeRegression.X_train.columns, - regression_classification="Regression", - y_train=RidgeRegression.y, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, - mlflow_path="root", - ) - columns_num = RidgeRegression.X.shape[1] - if columns_num > 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # choose two of dimensions to draw - three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 2) - self._plot_3d_scatter_diagram( - feature_data=three_dimen_data, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # no need to choose - self._plot_3d_scatter_diagram( - feature_data=RidgeRegression.X_test, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_3d_surface_diagram( - feature_data=RidgeRegression.X_test, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - y_test_predict=RidgeRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 1: - # no need to choose - self._plot_2d_scatter_diagram( - feature_data=RidgeRegression.X_test, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_2d_line_diagram( - feature_data=RidgeRegression.X_test, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - y_test_predict=RidgeRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - else: - pass - - @dispatch(bool) - def special_components(self, is_automl: bool, **kwargs) -> None: - """Invoke all special application
functions for this algorithm by FLAML framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") - self._show_formula( - coef=[self.auto_model.coef_], - intercept=self.auto_model.intercept_, - features_name=RidgeRegression.X_train.columns, - regression_classification="Regression", - y_train=RidgeRegression.y, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, - mlflow_path="root", - ) - columns_num = RidgeRegression.X.shape[1] - if columns_num > 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # choose two of dimensions to draw - three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 2) - self._plot_3d_scatter_diagram( - feature_data=three_dimen_data, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 2: - # choose one of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1) - self._plot_2d_scatter_diagram( - feature_data=two_dimen_data, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - # no need to choose - self._plot_3d_scatter_diagram( - feature_data=RidgeRegression.X_test, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_3d_surface_diagram( - feature_data=RidgeRegression.X_test, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - y_test_predict=RidgeRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif columns_num == 1: - # no need to choose - self._plot_2d_scatter_diagram( - feature_data=RidgeRegression.X_test, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - self._plot_2d_line_diagram( - feature_data=RidgeRegression.X_test, - target_data=RidgeRegression.y_test, - data_name=RidgeRegression.name_test, - y_test_predict=RidgeRegression.y_test_predict, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - else: - pass +# -*- coding: utf-8 -*- +import json +import os +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union + +import mlflow +import numpy as np +import pandas as pd +import xgboost +from flaml import AutoML +from multipledispatch import
dispatch +from rich import print +from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor +from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso, LinearRegression, Ridge, SGDRegressor +from sklearn.neighbors import KNeighborsRegressor +from sklearn.neural_network import MLPRegressor +from sklearn.preprocessing import PolynomialFeatures +from sklearn.svm import SVR +from sklearn.tree import DecisionTreeRegressor + +from ..constants import MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, RAY_FLAML +from ..utils.base import clear_output, save_data, save_fig, save_text +from ._base import LinearWorkflowMixin, TreeWorkflowMixin, WorkflowBase +from .func.algo_regression._bayesianridge_regression import bayesian_ridge_manual_hyper_parameters +from .func.algo_regression._common import cross_validation, plot_predicted_vs_actual, plot_residuals, score +from .func.algo_regression._decision_tree import decision_tree_manual_hyper_parameters +from .func.algo_regression._elastic_net import elastic_net_manual_hyper_parameters +from .func.algo_regression._enum import ( + ClassicalLinearSpecialFunction, + ElasticNetSpecialFunction, + LassoSpecialFunction, + MLPSpecialFunction, + RegressionCommonFunction, + RegressionSpecialFunction, + RidgeSpecialFunction, + SGDSpecialFunction, +) +from .func.algo_regression._extra_tree import extra_trees_manual_hyper_parameters +from .func.algo_regression._gradient_boosting import gradient_boosting_manual_hyper_parameters +from .func.algo_regression._knn import knn_manual_hyper_parameters +from .func.algo_regression._lasso_regression import lasso_regression_manual_hyper_parameters +from .func.algo_regression._linear_regression import linear_regression_manual_hyper_parameters +from .func.algo_regression._multi_layer_perceptron import multi_layer_perceptron_manual_hyper_parameters +from .func.algo_regression._polynomial_regression import polynomial_regression_manual_hyper_parameters +from .func.algo_regression._rf import random_forest_manual_hyper_parameters +from .func.algo_regression._ridge_regression import ridge_regression_manual_hyper_parameters +from .func.algo_regression._sgd_regression import sgd_regression_manual_hyper_parameters +from .func.algo_regression._svr import svr_manual_hyper_parameters +from .func.algo_regression._xgboost import xgboost_manual_hyper_parameters + + +class RegressionWorkflowBase(WorkflowBase): + """The base workflow class of regression algorithms.""" + + common_function = [graph.value for graph in RegressionCommonFunction] + # ["Model Score", "Cross Validation", "Model Prediction", "Model Persistence", "Predicted vs. Actual Diagram", "Residuals Diagram", "Permutation Importance Diagram"] + + def __init__(self) -> None: + super().__init__() + # These two attributes are used for the customized models of FLAML framework + self.customized = False + self.customized_name = None + self.mode = "Regression" + + @dispatch(object, object) + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None: + """Fit the model by Scikit-learn framework.""" + self.model.fit(X, y) + + @dispatch(object, object, bool) + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None, is_automl: bool = False) -> None: + """Fit the model by FLAML framework and RAY framework.""" + # print(f"-*-*- {self.naming} - AutoML -*-*-.") + if self.naming not in RAY_FLAML: + self.automl = AutoML() + if self.customized: # When the model is not built-in in FLAML framework, use FLAML customization.
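+ # 'add_learner' registers the wrapper class built by the 'customization' property
+ # under 'customized_name', so that 'estimator_list' in 'settings' can reference it
+ # exactly like a built-in FLAML learner.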
+ self.automl.add_learner(learner_name=self.customized_name, learner_class=self.customization) + if y.shape[1] == 1: # FLAML's data format validation mechanism + y = y.squeeze() # Convert a single DataFrame column into a Series + self.automl.fit(X_train=X, y_train=y, **self.settings) + else: + # When the model is not built-in in FLAML framework, use RAY + FLAML customization. + self.ray_tune( + RegressionWorkflowBase.X_train, + RegressionWorkflowBase.X_test, + RegressionWorkflowBase.y_train, + RegressionWorkflowBase.y_test, + ) + self.ray_best_model.fit(X, y) + + @dispatch(object) + def predict(self, X: pd.DataFrame) -> np.ndarray: + """Perform regression on samples in X by Scikit-learn framework.""" + y_predict = self.model.predict(X) + return y_predict + + @dispatch(object, bool) + def predict(self, X: pd.DataFrame, is_automl: bool = False) -> np.ndarray: + """Perform regression on samples in X by FLAML framework and RAY framework.""" + if self.naming not in RAY_FLAML: + y_predict = self.automl.predict(X) + return y_predict + else: + y_predict = self.ray_best_model.predict(X) + return y_predict + + @property + def auto_model(self) -> object: + """Get AutoML trained model by FLAML framework and RAY framework.""" + if self.naming not in RAY_FLAML: + return self.automl.model.estimator + else: + return self.ray_best_model + + @property + def settings(self) -> Dict: + """The configuration to implement AutoML by FLAML framework.""" + return dict() + + @property + def customization(self) -> object: + """The customized model of FLAML framework.""" + return object + + def ray_tune(self, X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame) -> object: + """Tune the customized model by FLAML framework and RAY framework.""" + return object + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + return dict() + + @staticmethod + def _plot_predicted_vs_actual(y_test_predict: pd.DataFrame, y_test: pd.DataFrame, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str, grah_name: str) -> None: + """Plot the predicted vs.
actual diagram.""" + print(f"-----* {grah_name} *-----") + plot_predicted_vs_actual(y_test_predict, y_test, algorithm_name) + save_fig(f"{grah_name} - {algorithm_name}", local_path, mlflow_path) + data = pd.concat([y_test, y_test_predict], axis=1) + save_data(data, name_column, f"{grah_name} - {algorithm_name}", local_path, mlflow_path) + + @staticmethod + def _plot_residuals(y_test_predict: pd.DataFrame, y_test: pd.DataFrame, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str, grah_name: str) -> None: + """Plot the residuals diagram.""" + print(f"-----* {grah_name} *-----") + residuals = plot_residuals(y_test_predict, y_test, algorithm_name) + save_fig(f"{grah_name} - {algorithm_name}", local_path, mlflow_path) + data = pd.concat([y_test, residuals], axis=1) + save_data(data, name_column, f"{grah_name} - {algorithm_name}", local_path, mlflow_path) + + @staticmethod + def _score(y_true: pd.DataFrame, y_predict: pd.DataFrame, algorithm_name: str, store_path: str, grah_name: str) -> None: + """Calculate the score of the model.""" + print(f"-----* {grah_name} *-----") + scores = score(y_true, y_predict) + scores_str = json.dumps(scores, indent=4) + save_text(scores_str, f"{grah_name} - {algorithm_name}", store_path) + mlflow.log_metrics(scores) + + @staticmethod + def _cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, cv_num: int, algorithm_name: str, store_path: str, grah_name: str) -> None: + """Cross validation.""" + print(f"-----* {grah_name} *-----") + print(f"K-Folds: {cv_num}") + scores = cross_validation(trained_model, X_train, y_train, cv_num=cv_num) + scores_str = json.dumps(scores, indent=4) + save_text(scores_str, f"{grah_name} - {algorithm_name}", store_path) + + @dispatch() + def common_components(self) -> None: + """Invoke all common application functions for regression algorithms by Scikit-learn framework.""" + GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH") + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + self._score( + y_true=RegressionWorkflowBase.y_test, + y_predict=RegressionWorkflowBase.y_test_predict, + algorithm_name=self.naming, + store_path=GEOPI_OUTPUT_METRICS_PATH, + grah_name=RegressionCommonFunction.MODEL_SCORE.value, + ) + self._cross_validation( + trained_model=self.model, + X_train=RegressionWorkflowBase.X_train, + y_train=RegressionWorkflowBase.y_train, + cv_num=10, + algorithm_name=self.naming, + store_path=GEOPI_OUTPUT_METRICS_PATH, + grah_name=RegressionCommonFunction.CROSS_VALIDATION.value, + ) + self._plot_predicted_vs_actual( + y_test_predict=RegressionWorkflowBase.y_test_predict, + y_test=RegressionWorkflowBase.y_test, + name_column=RegressionWorkflowBase.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + grah_name=RegressionCommonFunction.PREDICTED_VS_ACTUAL_DIAGRAM.value, + ) + self._plot_residuals( + y_test_predict=RegressionWorkflowBase.y_test_predict, + y_test=RegressionWorkflowBase.y_test, + name_column=RegressionWorkflowBase.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + grah_name=RegressionCommonFunction.RESIDUALS_DIAGRAM.value, + ) + self._plot_permutation_importance( + X_test=RegressionWorkflowBase.X_test, + y_test=RegressionWorkflowBase.y_test, + 
name_column=RegressionWorkflowBase.name_test, + trained_model=self.model, + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + grah_name=RegressionCommonFunction.PERMUTATION_IMPORTANC_DIAGRAM.value, + ) + + @dispatch(bool) + def common_components(self, is_automl: bool = False) -> None: + """Invoke all common application functions for regression algorithms by FLAML framework.""" + GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH") + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + self._score( + y_true=RegressionWorkflowBase.y_test, + y_predict=RegressionWorkflowBase.y_test_predict, + algorithm_name=self.naming, + store_path=GEOPI_OUTPUT_METRICS_PATH, + grah_name=RegressionCommonFunction.MODEL_SCORE.value, + ) + self._cross_validation( + trained_model=self.auto_model, + X_train=RegressionWorkflowBase.X_train, + y_train=RegressionWorkflowBase.y_train, + cv_num=10, + algorithm_name=self.naming, + store_path=GEOPI_OUTPUT_METRICS_PATH, + grah_name=RegressionCommonFunction.CROSS_VALIDATION.value, + ) + self._plot_predicted_vs_actual( + y_test_predict=RegressionWorkflowBase.y_test_predict, + y_test=RegressionWorkflowBase.y_test, + name_column=RegressionWorkflowBase.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + grah_name=RegressionCommonFunction.PREDICTED_VS_ACTUAL_DIAGRAM.value, + ) + self._plot_residuals( + y_test_predict=RegressionWorkflowBase.y_test_predict, + y_test=RegressionWorkflowBase.y_test, + name_column=RegressionWorkflowBase.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + grah_name=RegressionCommonFunction.RESIDUALS_DIAGRAM.value, + ) + self._plot_permutation_importance( + X_test=RegressionWorkflowBase.X_test, + y_test=RegressionWorkflowBase.y_test, + name_column=RegressionWorkflowBase.name_test, + trained_model=self.auto_model, + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + grah_name=RegressionCommonFunction.PERMUTATION_IMPORTANC_DIAGRAM.value, + ) + + +class PolynomialRegression(LinearWorkflowMixin, RegressionWorkflowBase): + """The automation workflow of using Polynomial Regression algorithm to make insightful products.""" + + name = "Polynomial Regression" + special_function = [] + + def __init__( + self, + degree: int = 2, + interaction_only: bool = False, + include_bias: bool = False, + order: str = "C", + fit_intercept: bool = True, + normalize: bool = False, + copy_X: bool = True, + n_jobs: Optional[int] = None, + ) -> None: + + super().__init__() + self.degree = degree + self.include_bias = include_bias + self.interaction_only = interaction_only + self.order = order + self.fit_intercept = fit_intercept + self.normalize = normalize + self.copy_X = copy_X + self.n_jobs = n_jobs + self.model = LinearRegression(fit_intercept=self.fit_intercept, copy_X=self.copy_X, n_jobs=self.n_jobs) + + self._features_name = None + self.naming = PolynomialRegression.name + + def poly(self, X_train: pd.DataFrame, X_test: pd.DataFrame) -> Tuple[Dict, pd.DataFrame, pd.DataFrame]: + """Polynomial features.""" + poly_features =
PolynomialFeatures(degree=self.degree, include_bias=self.include_bias, interaction_only=self.interaction_only, order=self.order) + X_train_poly = poly_features.fit_transform(X_train) + X_test_poly = poly_features.transform(X_test) # transform only: reuse the expansion fitted on the training data + poly_config = {type(poly_features).__name__: poly_features.get_params()} + try: + # scikit-learn >= 1.0 + self._features_name = poly_features.get_feature_names_out() + except AttributeError: + self._features_name = poly_features.get_feature_names() + X_train_poly = pd.DataFrame(X_train_poly, columns=self._features_name) + X_test_poly = pd.DataFrame(X_test_poly, columns=self._features_name) + return poly_config, X_train_poly, X_test_poly + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = polynomial_regression_manual_hyper_parameters() + clear_output() + return hyper_parameters + + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithm by Scikit-learn framework.""" + GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") + self._show_formula( + coef=self.model.coef_, + intercept=self.model.intercept_, + features_name=self._features_name, + regression_classification="Regression", + y_train=PolynomialRegression.y, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, + mlflow_path="root", + ) + + +class XGBoostRegression(TreeWorkflowMixin, RegressionWorkflowBase): + """The automation workflow of using XGBoost algorithm to make insightful products.""" + + name = "XGBoost" + special_function = [RegressionSpecialFunction.FEATURE_IMPORTANCE_DIAGRAM.value] # Feature Importance Diagram + + # This type alias is used for type hints in the original xgboost package. + # It is private to xgboost, so it is copied here for our own annotations. + _SklObjective = Optional[Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]] + + def __init__( + self, + max_depth: Optional[int] = 6, + learning_rate: Optional[float] = 0.3, + n_estimators: int = 100, + verbosity: Optional[int] = 1, + objective: _SklObjective = None, + booster: Optional[str] = None, + tree_method: Optional[str] = "auto", + n_jobs: Optional[int] = None, + gamma: Optional[float] = 0, + min_child_weight: Optional[float] = None, + max_delta_step: Optional[float] = 0, + subsample: Optional[float] = 1, + colsample_bytree: Optional[float] = 1, + colsample_bylevel: Optional[float] = 1, + colsample_bynode: Optional[float] = 1, + reg_alpha: Optional[float] = 0, + reg_lambda: Optional[float] = 1, + scale_pos_weight: Optional[float] = 1, + base_score: Optional[float] = None, + random_state: Optional[Union[np.random.RandomState, int]] = None, + missing: float = np.nan, + num_parallel_tree: Optional[int] = 1, + monotone_constraints: Optional[Union[Dict[str, int], str]] = None, + interaction_constraints: Optional[Union[str, Sequence[Sequence[str]]]] = None, + importance_type: Optional[str] = "gain", + gpu_id: Optional[int] = None, + validate_parameters: Optional[bool] = None, + predictor: Optional[str] = None, + # enable_categorical: bool = False, + eval_metric: Optional[Union[str, List[str], Callable]] = None, + early_stopping_rounds: Optional[int] = None, + **kwargs: Any, + ) -> None: + """ + Parameters + ---------- + max_depth [default=6] + Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. + 0 indicates no limit on depth.
Beware that XGBoost aggressively consumes memory when training a deep tree. + exact tree method requires non-zero value. + range: [0,∞] + + learning_rate [default=0.3] + Step size shrinkage used in update to prevents overfitting. + After each boosting step, we can directly get the weights of new features, + and eta shrinks the feature weights to make the boosting process more conservative. + range: [0,1] + + n_estimators : int + Number of gradient boosted trees. Equivalent to number of boosting rounds. + + objective : {SklObjective} + Specify the learning task and the corresponding learning objective or + a custom objective function to be used (see note below). + + verbosity [default=1] + Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3 (debug). + Sometimes XGBoost tries to change configurations based on heuristics, + which is displayed as warning message. + If there’s unexpected behaviour, please try to increase value of verbosity. + + booster [default= gbtree ] + Which booster to use. Can be gbtree, gblinear or dart; + gbtree and dart use tree based models while gblinear uses linear functions. + + tree_method string [default= auto] + The tree construction algorithm used in XGBoost. See description in the reference paper and Tree Methods. + XGBoost supports approx, hist and gpu_hist for distributed training. Experimental support for external memory is available for approx and gpu_hist. + Choices: auto, exact, approx, hist, gpu_hist, this is a combination of commonly used updaters. For other updaters like refresh, set the parameter updater directly. + auto: Use heuristic to choose the fastest method. + For small dataset, exact greedy (exact) will be used. + For larger dataset, approximate algorithm (approx) will be chosen. It’s recommended to try hist and gpu_hist for higher performance with large dataset. + (gpu_hist)has support for external memory. + Because old behavior is always use exact greedy in single machine, user will get a message when approximate algorithm is chosen to notify this choice. + exact: Exact greedy algorithm. Enumerates all split candidates. + approx: Approximate greedy algorithm using quantile sketch and gradient histogram. + hist: Faster histogram optimized approximate greedy algorithm. + gpu_hist: GPU implementation of hist algorithm. + + n_jobs : Optional[int] + Number of parallel threads used to run xgboost. When used with other + Scikit-Learn algorithms like grid search, you may choose which algorithm to + parallelize and balance the threads. Creating thread contention will + significantly slow down both algorithms. + + gamma [default=0, alias: min_split_loss] + Minimum loss reduction required to make a further partition on a leaf node of the tree. + The larger gamma is, the more conservative the algorithm will be. + range: [0,∞] + + min_child_weight [default=1] + Minimum sum of instance weight (hessian) needed in a child. + If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, + then the building process will give up further partitioning. In linear regression task, + this simply corresponds to minimum number of instances needed to be in each node. + The larger min_child_weight is, the more conservative the algorithm will be. + range: [0,∞] + + max_delta_step [default=0] + Maximum delta step we allow each leaf output to be. + If the value is set to 0, it means there is no constraint. 
+ If it is set to a positive value, it can help making the update step more conservative. + Usually this parameter is not needed, but it might help in logistic regression + when class is extremely imbalanced. Set it to value of 1-10 might help control the update. + range: [0,∞] + + subsample [default=1] + Subsample ratio of the training instances. + Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees. + and this will prevent overfitting. + Subsampling will occur once in every boosting iteration. + range: (0,1] + + colsample_bytree [default=1] + colsample_bytree is the subsample ratio of columns when constructing each tree. + Subsampling occurs once for every tree constructed. + + colsample_bylevel [default=1] + colsample_bylevel is the subsample ratio of columns for each level. + Subsampling occurs once for every new depth level reached in a tree. + Columns are subsampled from the set of columns chosen for the current tree. + + colsample_bynode [default=1] + colsample_bynode is the subsample ratio of columns for each node (split). + Subsampling occurs once every time a new split is evaluated. + Columns are subsampled from the set of columns chosen for the current level. + + reg_alpha [default=0] + L1 regularization term on weights. + Increasing this value will make model more conservative. + + reg_lambda [default=1, alias: reg_lambda] + L2 regularization term on weights. + Increasing this value will make model more conservative. + + scale_pos_weight [default=1] + Control the balance of positive and negative weights, useful for unbalanced classes. + + predictor, [default= auto] + The type of predictor algorithm to use. + Provides the same results but allows the use of GPU or CPU. + auto: Configure predictor based on heuristics. + cpu_predictor: Multicore CPU prediction algorithm. + gpu_predictor: Prediction using GPU. Used when tree_method is gpu_hist. + When predictor is set to default value auto, the gpu_hist tree method is able to provide GPU based prediction + without copying training data to GPU memory. If gpu_predictor is explicitly specified, + then all data is copied into GPU, only recommended for performing prediction tasks. + + base_score : Optional[float] + The initial prediction score of all instances, global bias. + + random_state : Optional[Union[numpy.random.RandomState, int]] + Random number seed. + .. note:: + + Using gblinear booster with shotgun updater is nondeterministic as + it uses Hogwild algorithm. + + missing : float, default np.nan + Value in the data which needs to be present as a missing value. + + num_parallel_tree: Optional[int] + Used for boosting random forest. + + monotone_constraints : Optional[Union[Dict[str, int], str]] + Constraint of variable monotonicity. See :doc:`tutorial ` + for more information. + + interaction_constraints : Optional[Union[str, List[Tuple[str]]]] + Constraints for interaction representing permitted interactions. The + constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2, + 3, 4]]``, where each inner list is a group of indices of features that are + allowed to interact with each other. See :doc:`tutorial + ` for more information + + importance_type: Optional[str] + The feature importance type for the feature_importances\\_ property: + + * For tree model, it's either "gain", "weight", "cover", "total_gain" or + "total_cover". + * For linear model, only "weight" is defined and it's the normalized coefficients + without bias. 
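As an aside on importance_type above: in the scikit-learn wrapper, the chosen type determines what feature_importances_ reports after fitting. A small hedged sketch with synthetic data:

import numpy as np
import xgboost

rng = np.random.RandomState(0)
X = rng.rand(60, 4)
y = 2.0 * X[:, 0] + X[:, 1] + rng.normal(scale=0.05, size=60)

model = xgboost.XGBRegressor(n_estimators=20, importance_type="gain")
model.fit(X, y)
print(model.feature_importances_)  # normalized "gain" scores, one entry per feature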
+ + gpu_id : Optional[int] + Device ordinal. + + validate_parameters : Optional[bool] + Give warnings for unknown parameter. + + eval_metric : Optional[Union[str, List[str], Callable]] + + early_stopping_rounds : Optional[int] + + References + ---------- + [1] XGBoost Python API Reference - Scikit-Learn API + https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn + + [2] XGBoost API for the scikit-learn wrapper: + https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py + """ + + super().__init__() + self.n_estimators = n_estimators + self.objective = objective + self.max_depth = max_depth + self.learning_rate = learning_rate + self.verbosity = verbosity + self.booster = booster + self.tree_method = tree_method + self.gamma = gamma + self.min_child_weight = min_child_weight + self.max_delta_step = max_delta_step + self.subsample = subsample + self.colsample_bytree = colsample_bytree + self.colsample_bylevel = colsample_bylevel + self.colsample_bynode = colsample_bynode + self.reg_alpha = reg_alpha + self.reg_lambda = reg_lambda + self.scale_pos_weight = scale_pos_weight + self.base_score = base_score + self.missing = missing + self.num_parallel_tree = num_parallel_tree + self.n_jobs = n_jobs + self.monotone_constraints = monotone_constraints + self.interaction_constraints = interaction_constraints + self.importance_type = importance_type + self.gpu_id = gpu_id + self.validate_parameters = validate_parameters + self.predictor = predictor + # self.enable_categorical = enable_categorical + self.eval_metric = eval_metric + self.early_stopping_rounds = early_stopping_rounds + if kwargs: + self.kwargs = kwargs + + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' + self.model = xgboost.XGBRegressor( + n_estimators=self.n_estimators, + objective=self.objective, + max_depth=self.max_depth, + learning_rate=self.learning_rate, + verbosity=self.verbosity, + booster=self.booster, + tree_method=self.tree_method, + gamma=self.gamma, + min_child_weight=self.min_child_weight, + max_delta_step=self.max_delta_step, + subsample=self.subsample, + colsample_bytree=self.colsample_bytree, + colsample_bylevel=self.colsample_bylevel, + colsample_bynode=self.colsample_bynode, + reg_alpha=self.reg_alpha, + reg_lambda=self.reg_lambda, + scale_pos_weight=self.scale_pos_weight, + base_score=self.base_score, + missing=self.missing, + num_parallel_tree=self.num_parallel_tree, + random_state=self.random_state, + n_jobs=self.n_jobs, + monotone_constraints=self.monotone_constraints, + interaction_constraints=self.interaction_constraints, + importance_type=self.importance_type, + gpu_id=self.gpu_id, + validate_parameters=self.validate_parameters, + predictor=self.predictor, + # enable_categorical=self.enable_categorical, + eval_metric=self.eval_metric, + early_stopping_rounds=self.early_stopping_rounds, + ) + + self.naming = XGBoostRegression.name + self.func = XGBoostRegression.special_function + + @property + def settings(self) -> Dict: + """The configuration to implement AutoML by FLAML framework.""" + configuration = { + "time_budget": 10, # total running time in seconds + "metric": "r2", + "estimator_list": ["xgboost"], # list of ML learners + "task": "regression", # task type + # "log_file_name": f"{self.naming} - automl.log", # flaml log file + # "log_training_metric": True, # whether to log training metric + } + return configuration + + @classmethod + def
manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = xgboost_manual_hyper_parameters() + clear_output() + return hyper_parameters + + # @staticmethod + # def _histograms_feature_weights(X: pd.DataFrame, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + # """Histograms of feature weights plot.""" + # histograms_feature_weights(X, trained_model, image_config) + # save_fig(f"Feature Importance Score - {algorithm_name}", local_path, mlflow_path) + + @dispatch() + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithm by Scikit-learn framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + self._plot_feature_importance( + X_train=XGBoostRegression.X_train, + name_column=RegressionWorkflowBase.name_train, + trained_model=self.model, + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[0], + ) + # self._histograms_feature_weights( + # X=XGBoostRegression.X, + # trained_model=self.model, + # image_config=self.image_config, + # algorithm_name=self.naming, + # local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + # mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + # ) + + @dispatch(bool) + def special_components(self, is_automl: bool = False, **kwargs) -> None: + """Invoke all special application functions for this algorithm by FLAML framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + self._plot_feature_importance( + X_train=XGBoostRegression.X_train, + name_column=RegressionWorkflowBase.name_train, + trained_model=self.auto_model, + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[0], + ) + # self._histograms_feature_weights( + # X=XGBoostRegression.X, + # trained_model=self.auto_model, + # image_config=self.image_config, + # algorithm_name=self.naming, + # local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + # mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + # ) + + +class DecisionTreeRegression(TreeWorkflowMixin, RegressionWorkflowBase): + """The automation workflow of using Decision Tree algorithm to make insightful products.""" + + name = "Decision Tree" + special_function = [ + RegressionSpecialFunction.FEATURE_IMPORTANCE_DIAGRAM.value, + RegressionSpecialFunction.SINGLE_TREE_DIAGRAM.value, + ] + # ["Feature Importance Diagram", "Single Tree Diagram"] + + def __init__( + self, + criterion: str = "squared_error", + splitter: str = "best", + max_depth: Optional[int] = None, + min_samples_split: Union[int, float] = 2, + min_samples_leaf: Union[int, float] = 1, + min_weight_fraction_leaf: float = 0.0, + max_features: Union[int, float, str, None] = None, + random_state: Optional[int] = None, + max_leaf_nodes: Optional[int] = None, + min_impurity_decrease: float = 0.0, + ccp_alpha: float = 0.0, + ) -> None: + """ + Parameters + ---------- + criterion : {"squared_error", "friedman_mse", "absolute_error", \ "poisson"}, default="squared_error" + The function to measure the quality of a split.
Supported criteria + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion and minimizes the L2 + loss using the mean of each terminal node, "friedman_mse", which uses + mean squared error with Friedman's improvement score for potential + splits, "absolute_error" for the mean absolute error, which minimizes + the L1 loss using the median of each terminal node, and "poisson" which + uses reduction in Poisson deviance to find splits. + + .. versionadded:: 0.18 + Mean Absolute Error (MAE) criterion. + + .. versionadded:: 0.24 + Poisson deviance criterion. + + .. deprecated:: 1.0 + Criterion "mse" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="squared_error"` which is equivalent. + + .. deprecated:: 1.0 + Criterion "mae" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="absolute_error"` which is equivalent. + + splitter : {"best", "random"}, default="best" + The strategy used to choose the split at each node. Supported + strategies are "best" to choose the best split and "random" to choose + the best random split. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : int, float or {"auto", "sqrt", "log2"}, default=None + The number of features to consider when looking for the best split: + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `max(1, int(max_features * n_features_in_))` features are considered at each + split. + - If "auto", then `max_features=n_features`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + .. deprecated:: 1.1 + The `"auto"` option was deprecated in 1.1 and will be removed + in 1.3. + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. 
The features are always + randomly permuted at each split, even if ``splitter`` is set to + ``"best"``. When ``max_features < n_features``, the algorithm will + select ``max_features`` at random at each split before finding the best + split among them. But the best found split may vary across different + runs, even if ``max_features=n_features``. That is the case, if the + improvement of the criterion is identical for several splits and one + split has to be selected at random. To obtain a deterministic behaviour + during fitting, ``random_state`` has to be fixed to an integer. + See :term:`Glossary ` for details. + + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + The weighted impurity decrease equation is the following:: + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + .. versionadded:: 0.19 + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. See + :ref:`minimal_cost_complexity_pruning` for details. + .. 
versionadded:: 0.22
+
+        References
+        ----------
+        Scikit-learn API: sklearn.tree.DecisionTreeRegressor
+            https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
+        """
+
+        super().__init__()
+        self.criterion = criterion
+        self.splitter = splitter
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_features = max_features
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.ccp_alpha = ccp_alpha
+
+        if random_state:
+            self.random_state = random_state
+
+        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
+        self.model = DecisionTreeRegressor(
+            criterion=self.criterion,
+            splitter=self.splitter,
+            max_depth=self.max_depth,
+            min_samples_split=self.min_samples_split,
+            min_samples_leaf=self.min_samples_leaf,
+            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
+            max_features=self.max_features,
+            random_state=self.random_state,
+            max_leaf_nodes=self.max_leaf_nodes,
+            min_impurity_decrease=self.min_impurity_decrease,
+            ccp_alpha=self.ccp_alpha,
+        )
+        self.naming = DecisionTreeRegression.name
+        self.func = DecisionTreeRegression.special_function
+        self.customized = True
+        self.customized_name = "Decision Tree"
+
+    @property
+    def settings(self) -> Dict:
+        """The configuration of Decision Tree to implement AutoML by FLAML framework."""
+        configuration = {
+            "time_budget": 10,  # total running time in seconds
+            "metric": "r2",
+            "estimator_list": [self.customized_name],  # list of ML learners
+            "task": "regression",  # task type
+            # "log_file_name": f'{self.naming} - automl.log',  # flaml log file
+            # "log_training_metric": True,  # whether to log training metric
+        }
+        return configuration
+
+    @property
+    def customization(self) -> object:
+        """The customized Decision Tree of FLAML framework."""
+        from flaml import tune
+        from flaml.data import REGRESSION
+        from flaml.model import SKLearnEstimator
+        from sklearn.tree import DecisionTreeRegressor
+
+        class MyDTRegression(SKLearnEstimator):
+            def __init__(self, task="regression", n_jobs=None, **config):
+                super().__init__(task, **config)
+                if task in REGRESSION:
+                    self.estimator_class = DecisionTreeRegressor
+
+            @classmethod
+            def search_space(cls, data_size, task):
+                space = {
+                    "criterion": {"domain": tune.choice(["squared_error", "friedman_mse", "absolute_error", "poisson"])},
+                    "max_depth": {"domain": tune.randint(lower=2, upper=20), "init_value": 1, "low_cost_init_value": 1},
+                    "min_samples_split": {
+                        "domain": tune.randint(lower=2, upper=10),
+                        "init_value": 2,
+                        "low_cost_init_value": 2,
+                    },
+                    "min_samples_leaf": {"domain": tune.randint(lower=1, upper=10), "init_value": 1, "low_cost_init_value": 1},
+                    "max_features": {"domain": tune.randint(lower=1, upper=10), "init_value": 1, "low_cost_init_value": 1},
+                }
+                return space
+
+        return MyDTRegression
+
+    @classmethod
+    def manual_hyper_parameters(cls) -> Dict:
+        """Manual hyper-parameters specification."""
+        print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
+        hyper_parameters = decision_tree_manual_hyper_parameters()
+        clear_output()
+        return hyper_parameters
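+
+    # NOTE: A sketch of how a customized learner such as `MyDTRegression` above
+    # is meant to be registered with FLAML before tuning (assuming FLAML's
+    # documented `AutoML.add_learner` API; the real registration happens in the
+    # shared workflow base class, and `X_train`/`y_train`/`workflow` are
+    # hypothetical stand-ins):
+    #
+    #     from flaml import AutoML
+    #
+    #     automl = AutoML()
+    #     # The learner name must match the entry in settings["estimator_list"].
+    #     automl.add_learner(learner_name="Decision Tree", learner_class=MyDTRegression)
+    #     automl.fit(X_train=X_train, y_train=y_train, **workflow.settings)
+
+    @dispatch()
+    def special_components(self, **kwargs):
+        """Invoke all special application functions for this 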
algorithm by the Scikit-learn framework."""
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        self._plot_feature_importance(
+            X_train=DecisionTreeRegression.X_train,
+            name_column=RegressionWorkflowBase.name_train,
+            trained_model=self.model,
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            func_name=self.func[0],
+        )
+        self._plot_tree(
+            trained_model=self.model,
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            func_name=self.func[1],
+        )
+
+    @dispatch(bool)
+    def special_components(self, is_automl: bool = False, **kwargs) -> None:
+        """Invoke all special application functions for this algorithm by the FLAML framework."""
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        self._plot_feature_importance(
+            X_train=DecisionTreeRegression.X_train,
+            name_column=RegressionWorkflowBase.name_train,
+            trained_model=self.auto_model,
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            func_name=self.func[0],
+        )
+        self._plot_tree(
+            trained_model=self.auto_model,
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            func_name=self.func[1],
+        )
+
+
+class ExtraTreesRegression(TreeWorkflowMixin, RegressionWorkflowBase):
+    """The automation workflow of using Extra-Trees algorithm to make insightful products."""
+
+    name = "Extra-Trees"
+    special_function = [
+        RegressionSpecialFunction.FEATURE_IMPORTANCE_DIAGRAM.value,
+        RegressionSpecialFunction.SINGLE_TREE_DIAGRAM.value,
+    ]
+    # ["Feature Importance Diagram", "Single Tree Diagram"]
+
+    def __init__(
+        self,
+        n_estimators: int = 100,
+        criterion: str = "squared_error",
+        max_depth: Optional[int] = None,
+        min_samples_split: Union[int, float] = 2,
+        min_samples_leaf: Union[int, float] = 1,
+        min_weight_fraction_leaf: float = 0.0,
+        max_features: Union[int, float, str] = "auto",
+        max_leaf_nodes: Optional[int] = None,
+        min_impurity_decrease: float = 0.0,
+        # min_impurity_split=None,
+        bootstrap: bool = False,
+        oob_score: bool = False,
+        n_jobs: Optional[int] = None,
+        random_state: Optional[int] = None,
+        verbose: int = 0,
+        warm_start: bool = False,
+        ccp_alpha: float = 0.0,
+        max_samples: Optional[Union[int, float]] = None,
+    ) -> None:
+        """
+        Parameters
+        ----------
+        n_estimators : int, default=100
+            The number of trees in the forest.
+
+            .. versionchanged:: 0.22
+               The default value of ``n_estimators`` changed from 10 to 100
+               in 0.22.
+
+        criterion : {"squared_error", "absolute_error"}, default="squared_error"
+            The function to measure the quality of a split. Supported criteria
+            are "squared_error" for the mean squared error, which is equal to
+            variance reduction as feature selection criterion, and "absolute_error"
+            for the mean absolute error.
+
+            .. versionadded:: 0.18
+               Mean Absolute Error (MAE) criterion.
+
+            .. deprecated:: 1.0
+                Criterion "mse" was deprecated in v1.0 and will be removed in
+                version 1.2. Use `criterion="squared_error"` which is equivalent.
+
+            .. 
deprecated:: 1.0 + Criterion "mae" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="absolute_error"` which is equivalent. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"auto", "sqrt", "log2"}, int or float, default="auto" + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `round(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=n_features`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + bootstrap : bool, default=False + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. + + oob_score : bool, default=False + Whether to use out-of-bag samples to estimate the generalization score. + Only available if bootstrap=True. 
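+
+            For example (an illustrative note, not from the upstream API
+            reference): ``ExtraTreesRegressor(bootstrap=True, oob_score=True)``
+            exposes an ``oob_score_`` attribute after fitting, estimated from
+            the samples each tree did not see.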
+ + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls 3 sources of randomness: + + - the bootstrapping of the samples used when building trees + (if ``bootstrap=True``) + - the sampling of the features to consider when looking for the best + split at each node (if ``max_features < n_features``) + - the draw of the splits for each of the `max_features` + + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`the Glossary `. + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. See + :ref:`minimal_cost_complexity_pruning` for details. + + .. versionadded:: 0.22 + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + .. versionadded:: 0.22 + + References + ---------- + Scikit-learn API: sklearn.ensemble.ExtraTreesRegressor + https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html?highlight=extratreesregressor#sklearn.ensemble.ExtraTreesRegressor + """ + + super().__init__() + self.n_estimators = n_estimators + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + # self.min_impurity_split = min_impurity_split + self.bootstrap = bootstrap + self.oob_score = oob_score + self.n_jobs = n_jobs + self.verbose = verbose + self.warm_start = warm_start + self.ccp_alpha = ccp_alpha + self.max_samples = max_samples + + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' + self.model = ExtraTreesRegressor( + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + # min_impurity_split=self.min_impurity_split, + bootstrap=self.bootstrap, + oob_score=self.oob_score, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose, + warm_start=self.warm_start, + ccp_alpha=self.ccp_alpha, + max_samples=self.max_samples, + ) + + self.naming = ExtraTreesRegression.name + self.func = 
ExtraTreesRegression.special_function
+
+    @property
+    def settings(self) -> Dict:
+        """The configuration to implement AutoML by FLAML framework."""
+        configuration = {
+            "time_budget": 10,  # total running time in seconds
+            "metric": "r2",
+            "estimator_list": ["extra_tree"],  # list of ML learners
+            "task": "regression",  # task type
+            # "log_file_name": f'{self.naming} - automl.log',  # flaml log file
+            # "log_training_metric": True,  # whether to log training metric
+        }
+        return configuration
+
+    @classmethod
+    def manual_hyper_parameters(cls) -> Dict:
+        """Manual hyper-parameters specification."""
+        print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
+        hyper_parameters = extra_trees_manual_hyper_parameters()
+        clear_output()
+        return hyper_parameters
+
+    @dispatch()
+    def special_components(self, **kwargs) -> None:
+        """Invoke all special application functions for this algorithm by the Scikit-learn framework."""
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        self._plot_feature_importance(
+            X_train=ExtraTreesRegression.X_train,
+            name_column=RegressionWorkflowBase.name_train,
+            trained_model=self.model,
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            func_name=self.func[0],
+        )
+        self._plot_tree(
+            trained_model=self.model.estimators_[0],
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            func_name=self.func[1],
+        )
+
+    @dispatch(bool)
+    def special_components(self, is_automl: bool = False, **kwargs) -> None:
+        """Invoke all special application functions for this algorithm by the FLAML framework."""
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        self._plot_feature_importance(
+            X_train=ExtraTreesRegression.X_train,
+            name_column=RegressionWorkflowBase.name_train,
+            trained_model=self.auto_model,
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            func_name=self.func[0],
+        )
+        self._plot_tree(
+            trained_model=self.auto_model.estimators_[0],
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+            func_name=self.func[1],
+        )
+
+
+class RandomForestRegression(TreeWorkflowMixin, RegressionWorkflowBase):
+    """The automation workflow of using Random Forest algorithm to make insightful products."""
+
+    name = "Random Forest"
+    special_function = [
+        RegressionSpecialFunction.FEATURE_IMPORTANCE_DIAGRAM.value,
+        RegressionSpecialFunction.SINGLE_TREE_DIAGRAM.value,
+    ]
+    # ["Feature Importance Diagram", "Single Tree Diagram"]
+
+    def __init__(
+        self,
+        n_estimators: int = 100,
+        criterion: str = "squared_error",
+        max_depth: Optional[int] = None,
+        min_samples_split: Union[int, float] = 2,
+        min_samples_leaf: Union[int, float] = 1,
+        min_weight_fraction_leaf: float = 0.0,
+        max_features: Union[int, float, str] = "sqrt",
+        max_leaf_nodes: Optional[int] = None,
+        min_impurity_decrease: float = 0.0,
+        bootstrap: bool = True,
+        oob_score: bool = False,
+        n_jobs: Optional[int] = None,
+        random_state: Optional[int] = None,
+        verbose: int = 0,
+        warm_start: bool = False,
+        # class_weight=None,
+ ccp_alpha: float = 0.0, + max_samples: Union[int, float] = None, + ) -> None: + """ + Parameters + ---------- + n_estimators : int, default=100 + The number of trees in the forest. + + .. versionchanged:: 0.22 + The default value of ``n_estimators`` changed from 10 to 100 + in 0.22. + + criterion : {"squared_error", "absolute_error", "poisson"}, \ + default="squared_error" + The function to measure the quality of a split. Supported criteria + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion, "absolute_error" + for the mean absolute error, and "poisson" which uses reduction in + Poisson deviance to find splits. + Training using "absolute_error" is significantly slower + than when using "squared_error". + + .. versionadded:: 0.18 + Mean Absolute Error (MAE) criterion. + + .. versionadded:: 1.0 + Poisson criterion. + + .. deprecated:: 1.0 + Criterion "mse" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="squared_error"` which is equivalent. + + .. deprecated:: 1.0 + Criterion "mae" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="absolute_error"` which is equivalent. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"auto", "sqrt", "log2"}, int or float, default="auto" + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `round(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=n_features`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. 
+ Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + bootstrap : bool, default=True + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. + + oob_score : bool, default=False + Whether to use out-of-bag samples to estimate the generalization score. + Only available if bootstrap=True. + + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls both the randomness of the bootstrapping of the samples used + when building trees (if ``bootstrap=True``) and the sampling of the + features to consider when looking for the best split at each node + (if ``max_features < n_features``). + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`the Glossary `. + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. See + :ref:`minimal_cost_complexity_pruning` for details. + + .. versionadded:: 0.22 + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + .. 
versionadded:: 0.22 + + References + ---------- + Scikit-learn API: sklearn.ensemble.RandomForestRegressor + https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html?highlight=randomforestregressor#sklearn.ensemble.RandomForestRegressor + """ + + super().__init__() + self.n_estimators = n_estimators + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.bootstrap = bootstrap + self.oob_score = oob_score + self.n_jobs = n_jobs + self.verbose = verbose + self.warm_start = warm_start + # self.class_weight = class_weight + self.ccp_alpha = ccp_alpha + self.max_samples = max_samples + + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' + self.model = RandomForestRegressor( + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + bootstrap=self.bootstrap, + oob_score=self.oob_score, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose, + warm_start=self.warm_start, + # class_weight=self.class_weight, + ccp_alpha=self.ccp_alpha, + max_samples=self.max_samples, + ) + self.naming = RandomForestRegression.name + self.func = RandomForestRegression.special_function + + @property + def settings(self) -> Dict: + """The configuration to implement AutoML by FLAML framework.""" + configuration = { + "time_budget": 10, # total running time in seconds + "metric": "r2", + "estimator_list": ["rf"], # list of ML learners + "task": "regression", # task type + # "log_file_name": f'{self.naming} - automl.log', # flaml log file + # "log_training_metric": True, # whether to log training metric + } + return configuration + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = random_forest_manual_hyper_parameters() + clear_output() + return hyper_parameters + + @dispatch() + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithms by Scikit-learn framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + self._plot_feature_importance( + X_train=RandomForestRegression.X_train, + name_column=RegressionWorkflowBase.name_train, + trained_model=self.model, + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[0], + ) + self._plot_tree( + trained_model=self.model.estimators_[0], + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + + @dispatch(bool) + def special_components(self, is_automl: bool = False, **kwargs) -> None: + 
"""Invoke all special application functions for this algorithms by FLAML framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + self._plot_feature_importance( + X_train=RandomForestRegression.X_train, + name_column=RegressionWorkflowBase.name_train, + trained_model=self.auto_model, + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[0], + ) + self._plot_tree( + trained_model=self.auto_model.estimators_[0], + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + + +class SVMRegression(RegressionWorkflowBase): + """The automation workflow of using SVR algorithm to make insightful products.""" + + name = "Support Vector Machine" + special_function = [] + + def __init__( + self, + kernel: str = "rbf", + degree: int = 3, + gamma: Union[str, float] = "scale", + # coef0: float = 0.0, + tol: float = 1e-3, + C: float = 1.0, + epsilon: float = 0.1, + shrinking: bool = True, + cache_size: float = 200, + verbose: bool = False, + max_iter: int = -1, + ) -> None: + """ + Parameters + ---------- + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features. + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. + The penalty is a squared l2 penalty. + + epsilon : float, default=0.1 + Epsilon in the epsilon-SVR model. It specifies the epsilon-tube + within which no penalty is associated in the training loss function + with points predicted within a distance epsilon from the actual + value. + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. 
+ + References + ---------- + Scikit-learn API: sklearn.svm.SVR + https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html + """ + + super().__init__() + self.kernel = kernel + self.degree = degree + self.gamma = gamma + # self.coef0 = coef0 + self.tol = tol + self.C = C + self.epsilon = epsilon + self.shrinking = shrinking + self.cache_size = cache_size + self.verbose = verbose + self.max_iter = max_iter + + self.model = SVR( + kernel=self.kernel, + degree=self.degree, + gamma=self.gamma, + # coef0=self.coef0, + tol=self.tol, + C=self.C, + epsilon=self.epsilon, + shrinking=self.shrinking, + cache_size=self.cache_size, + verbose=self.verbose, + max_iter=self.max_iter, + ) + + self.naming = SVMRegression.name + self.customized = True + self.customized_name = "SVR" + + @property + def settings(self) -> Dict: + """The configuration of SVR to implement AutoML by FLAML framework.""" + configuration = { + "time_budget": 10, # total running time in seconds + "metric": "r2", + "estimator_list": [self.customized_name], # list of ML learners + "task": "regression", # task type + # "log_file_name": f'{self.naming} - automl.log', # flaml log file + # "log_training_metric": True, # whether to log training metric + } + return configuration + + @property + def customization(self) -> object: + """The customized SVR of FLAML framework.""" + from flaml import tune + from flaml.data import REGRESSION + from flaml.model import SKLearnEstimator + from sklearn.svm import SVR + + class MySVMRegression(SKLearnEstimator): + def __init__(self, task="regression", n_jobs=None, **config): + super().__init__(task, **config) + if task in REGRESSION: + self.estimator_class = SVR + + @classmethod + def search_space(cls, data_size, task): + space = { + "C": {"domain": tune.uniform(lower=1, upper=data_size[0]), "init_value": 1, "low_cost_init_value": 1}, + "kernel": {"domain": tune.choice(["poly", "rbf", "sigmoid"])}, + "gamma": {"domain": tune.uniform(lower=1e-5, upper=10), "init_value": 1e-1, "low_cost_init_value": 1e-1}, + "degree": {"domain": tune.quniform(lower=1, upper=5, q=1), "init_value": 3, "low_cost_init_value": 3}, + "coef0": {"domain": tune.uniform(lower=0, upper=1), "init_value": 0, "low_cost_init_value": 0}, + "shrinking": {"domain": tune.choice([True, False])}, + } + return space + + return MySVMRegression + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = svr_manual_hyper_parameters() + clear_output() + return hyper_parameters + + @dispatch() + def special_components(self, **kwargs): + """Invoke all special application functions for this algorithms by Scikit-learn framework.""" + pass + + @dispatch(bool) + def special_components(self, is_automl: bool, **kwargs) -> None: + """Invoke all special application functions for this algorithms by FLAML framework.""" + pass + + +class MLPRegression(RegressionWorkflowBase): + """The automation workflow of using Multi-layer Perceptron algorithm to make insightful products.""" + + name = "Multi-layer Perceptron" + special_function = [MLPSpecialFunction.LOSS_CURVE_DIAGRAM.value] # "Loss Curve Diagram" + + def __init__( + self, + hidden_layer_sizes: tuple = (50, 25, 5), + activation: str = "relu", + solver: str = "adam", + alpha: float = 0.0001, + batch_size: Union[int, str] = "auto", + learning_rate: str = "constant", + learning_rate_init: float = 0.001, + max_iter: int = 200, + shuffle: bool = True, + 
random_state: Optional[int] = None, + tol: float = 1e-4, + verbose: bool = False, + warm_start: bool = False, + early_stopping: bool = False, + validation_fraction: float = 0.1, + beta_1: float = 0.9, + beta_2: float = 0.999, + epsilon: float = 1e-8, + n_iter_no_change: int = 10, + ): + """ + Parameters + ---------- + hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,) + The ith element represents the number of neurons in the ith + hidden layer. + + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' + Activation function for the hidden layer. + - 'identity', no-op activation, useful to implement linear bottleneck, + returns f(x) = x + - 'logistic', the logistic sigmoid function, + returns f(x) = 1 / (1 + exp(-x)). + - 'tanh', the hyperbolic tan function, + returns f(x) = tanh(x). + - 'relu', the rectified linear unit function, + returns f(x) = max(0, x) + + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' + The solver for weight optimization. + - 'lbfgs' is an optimizer in the family of quasi-Newton methods. + - 'sgd' refers to stochastic gradient descent. + - 'adam' refers to a stochastic gradient-based optimizer proposed by + Kingma, Diederik, and Jimmy Ba + Note: The default solver 'adam' works pretty well on relatively + large datasets (with thousands of training samples or more) in terms of + both training time and validation score. + For small datasets, however, 'lbfgs' can converge faster and perform + better. + + alpha : float, default=0.0001 + Strength of the L2 regularization term. The L2 regularization term + is divided by the sample size when added to the loss. + + batch_size : int, default='auto' + Size of minibatches for stochastic optimizers. + If the solver is 'lbfgs', the classifier will not use minibatch. + When set to "auto", `batch_size=min(200, n_samples)`. + + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' + Learning rate schedule for weight updates. + - 'constant' is a constant learning rate given by + 'learning_rate_init'. + - 'invscaling' gradually decreases the learning rate ``learning_rate_`` + at each time step 't' using an inverse scaling exponent of 'power_t'. + effective_learning_rate = learning_rate_init / pow(t, power_t) + - 'adaptive' keeps the learning rate constant to + 'learning_rate_init' as long as training loss keeps decreasing. + Each time two consecutive epochs fail to decrease training loss by at + least tol, or fail to increase validation score by at least tol if + 'early_stopping' is on, the current learning rate is divided by 5. + Only used when solver='sgd'. + + learning_rate_init : float, default=0.001 + The initial learning rate used. It controls the step-size + in updating the weights. Only used when solver='sgd' or 'adam'. + + power_t : float, default=0.5 + The exponent for inverse scaling learning rate. + It is used in updating effective learning rate when the learning_rate + is set to 'invscaling'. Only used when solver='sgd'. + + max_iter : int, default=200 + Maximum number of iterations. The solver iterates until convergence + (determined by 'tol') or this number of iterations. For stochastic + solvers ('sgd', 'adam'), note that this determines the number of epochs + (how many times each data point will be used), not the number of + gradient steps. + + shuffle : bool, default=True + Whether to shuffle samples in each iteration. Only used when + solver='sgd' or 'adam'. 
+
+        random_state : int, RandomState instance, default=None
+            Determines random number generation for weights and bias
+            initialization, train-test split if early stopping is used, and batch
+            sampling when solver='sgd' or 'adam'.
+            Pass an int for reproducible results across multiple function calls.
+            See :term:`Glossary <random_state>`.
+
+        tol : float, default=1e-4
+            Tolerance for the optimization. When the loss or score is not improving
+            by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,
+            unless ``learning_rate`` is set to 'adaptive', convergence is
+            considered to be reached and training stops.
+
+        verbose : bool, default=False
+            Whether to print progress messages to stdout.
+
+        warm_start : bool, default=False
+            When set to True, reuse the solution of the previous
+            call to fit as initialization, otherwise, just erase the
+            previous solution. See :term:`the Glossary <warm_start>`.
+
+        momentum : float, default=0.9
+            Momentum for gradient descent update. Should be between 0 and 1. Only
+            used when solver='sgd'.
+
+        nesterovs_momentum : bool, default=True
+            Whether to use Nesterov's momentum. Only used when solver='sgd' and
+            momentum > 0.
+
+        early_stopping : bool, default=False
+            Whether to use early stopping to terminate training when validation
+            score is not improving. If set to true, it will automatically set
+            aside 10% of training data as validation and terminate training when
+            validation score is not improving by at least ``tol`` for
+            ``n_iter_no_change`` consecutive epochs.
+            Only effective when solver='sgd' or 'adam'.
+
+        validation_fraction : float, default=0.1
+            The proportion of training data to set aside as validation set for
+            early stopping. Must be between 0 and 1.
+            Only used if early_stopping is True.
+
+        beta_1 : float, default=0.9
+            Exponential decay rate for estimates of first moment vector in adam,
+            should be in [0, 1). Only used when solver='adam'.
+
+        beta_2 : float, default=0.999
+            Exponential decay rate for estimates of second moment vector in adam,
+            should be in [0, 1). Only used when solver='adam'.
+
+        epsilon : float, default=1e-8
+            Value for numerical stability in adam. Only used when solver='adam'.
+
+        n_iter_no_change : int, default=10
+            Maximum number of epochs to not meet ``tol`` improvement.
+            Only effective when solver='sgd' or 'adam'.
+
+            .. versionadded:: 0.20
+
+        max_fun : int, default=15000
+            Only used when solver='lbfgs'. Maximum number of function calls.
+            The solver iterates until convergence (determined by 'tol'), number
+            of iterations reaches max_iter, or this number of function calls.
+            Note that number of function calls will be greater than or equal to
+            the number of iterations for the MLPRegressor.
+
+            .. 
versionadded:: 0.22 + + References + ---------- + Scikit-learn API: sklearn.neural_network.MLPRegressor + https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html + """ + super().__init__() + self.hidden_layer_sizes = hidden_layer_sizes + self.activation = activation + self.solver = solver + self.alpha = alpha + self.batch_size = batch_size + self.learning_rate = learning_rate + self.learning_rate_init = learning_rate_init + self.max_iter = max_iter + self.shuffle = shuffle + self.tol = tol + self.verbose = verbose + self.warm_start = warm_start + self.early_stopping = early_stopping + self.validation_fraction = validation_fraction + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.n_iter_no_change = n_iter_no_change + + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' + self.model = MLPRegressor( + hidden_layer_sizes=self.hidden_layer_sizes, + activation=self.activation, + solver=self.solver, + alpha=self.alpha, + batch_size=self.batch_size, + learning_rate=self.learning_rate, + learning_rate_init=self.learning_rate_init, + max_iter=self.max_iter, + shuffle=self.shuffle, + random_state=self.random_state, + tol=self.tol, + verbose=self.verbose, + warm_start=self.warm_start, + early_stopping=self.early_stopping, + validation_fraction=self.validation_fraction, + beta_1=self.beta_1, + beta_2=self.beta_2, + epsilon=self.epsilon, + n_iter_no_change=self.n_iter_no_change, + ) + + self.naming = MLPRegression.name + self.func = MLPRegression.special_function + + def ray_tune(self, X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame) -> None: + """The customized MLP of the combinations of Ray, FLAML and Scikit-learn framework.""" + + from ray import tune + from ray.air import session + from ray.tune.search import ConcurrencyLimiter + from ray.tune.search.flaml import BlendSearch + from sklearn.metrics import mean_squared_error + + def customized_model(l1: int, l2: int, l3: int, batch: int) -> object: + """The customized model by Scikit-learn framework.""" + return MLPRegressor(hidden_layer_sizes=(l1, l2, l3), batch_size=batch) + + def evaluate(l1: int, l2: int, l3: int, batch: int) -> float: + """The evaluation function by simulating a long-running ML experiment + to get the model's performance at every epoch.""" + regr = customized_model(l1, l2, l3, batch) + regr.fit(X_train, y_train) + y_pred = regr.predict(X_test) + mse = mean_squared_error(y_test, y_pred) + rmse = np.sqrt(mse) # Use RMSE score + return rmse + + def objective(config: Dict) -> None: + """Objective function takes a Tune config, evaluates the score of your experiment in a training loop, + and uses session.report to report the score back to Tune.""" + for step in range(config["steps"]): + score = evaluate(config["l1"], config["l2"], config["l3"], config["batch"]) + session.report({"iterations": step, "mean_loss": score}) + + # Search space: The critical assumption is that the optimal hyper-parameters live within this space. + search_config = { + "l1": tune.randint(1, 20), + "l2": tune.randint(1, 30), + "l3": tune.randint(1, 20), + "batch": tune.randint(20, 100), + } + + # Define the time budget in seconds. + time_budget_s = 30 + + # Integrate with FLAML's BlendSearch to implement hyper-parameters optimization . 
+ algo = BlendSearch(metric="mean_loss", mode="min", space=search_config) + algo.set_search_properties(config={"time_budget_s": time_budget_s}) + algo = ConcurrencyLimiter(algo, max_concurrent=4) + + # Use Ray Tune to run the experiment to "min"imize the “mean_loss” of the "objective" + # by searching "search_config" via "algo", "num_samples" times. + tuner = tune.Tuner( + objective, + tune_config=tune.TuneConfig( + metric="mean_loss", + mode="min", + search_alg=algo, + num_samples=-1, + time_budget_s=time_budget_s, + ), + param_space={"steps": 100}, + ) + results = tuner.fit() + + # The hyper-parameters found to minimize the mean loss of the defined objective and the corresponding model. + best_result = results.get_best_result(metric="mean_loss", mode="min") + self.ray_best_model = customized_model(best_result.config["l1"], best_result.config["l2"], best_result.config["l3"], best_result.config["batch"]) + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = multi_layer_perceptron_manual_hyper_parameters() + clear_output() + return hyper_parameters + + @staticmethod + def _plot_loss_curve(trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str, func_name: str) -> None: + """Plot the learning curve of the trained model.""" + print(f"-----* {func_name} *-----") + data = pd.DataFrame(trained_model.loss_curve_, columns=["Loss"]) + data.plot(title="Loss") + save_fig(f"{func_name} - {algorithm_name}", local_path, mlflow_path) + save_data(data, f"{func_name} - {algorithm_name}", local_path, mlflow_path) + + @dispatch() + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithms by Scikit-learn framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + if self.model.get_params()["solver"] in ["sgd", "adam"]: + self._plot_loss_curve( + trained_model=self.model, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[0], + ) + + @dispatch(bool) + def special_components(self, is_automl: bool, **kwargs) -> None: + """Invoke all special application functions for this algorithms by FLAML framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + if self.model.get_params()["solver"] in ["sgd", "adam"]: + self._plot_loss_curve( + trained_model=self.auto_model, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[0], + ) + + +class ClassicalLinearRegression(LinearWorkflowMixin, RegressionWorkflowBase): + """The automation workflow of using Linear Regression algorithm to make insightful products.""" + + name = "Linear Regression" + special_function = [ + ClassicalLinearSpecialFunction.LINEAR_REGRESSION_FORMULA.value, + RegressionSpecialFunction.TWO_DIMENSIONAL_SCATTER_DIAGRAM.value, + RegressionSpecialFunction.THREE_DIMENSIONAL_SCATTER_DIAGRAM.value, + RegressionSpecialFunction.TWO_DIMENSIONAL_LINE_DIAGRAM.value, + RegressionSpecialFunction.THREE_DIMENSIONAL_SURFACE_DIAGRAM.value, + ] + # [Linear Regression Formula", "2D Scatter Diagram", "3D Scatter Diagram", "2D Line Diagram", "3D Surface Diagram"] + + def __init__( + self, + 
fit_intercept: bool = True, + copy_X: bool = True, + n_jobs: Optional[int] = None, + positive: bool = False, + ) -> None: + """ + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + n_jobs : int, default=None + The number of jobs to use for the computation. This will only provide + speedup in case of sufficiently large problems, that is if firstly + `n_targets > 1` and secondly `X` is sparse or if `positive` is set + to `True`. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. This + option is only supported for dense arrays. + .. versionadded:: 0.24 + + References + ---------- + Scikit-learn API: sklearn.linear_model.LinearRegression + https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html?highlight=linearregression + """ + super().__init__() + self.fit_intercept = fit_intercept + self.copy_X = copy_X + self.n_jobs = n_jobs + self.positive = positive + + self.model = LinearRegression( + fit_intercept=self.fit_intercept, + copy_X=self.copy_X, + n_jobs=self.n_jobs, + positive=self.positive, + ) + + self.naming = ClassicalLinearRegression.name + self.func = ClassicalLinearRegression.special_function + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = linear_regression_manual_hyper_parameters() + clear_output() + return hyper_parameters + + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithms by Scikit-learn framework.""" + GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + self._show_formula( + coef=self.model.coef_, + intercept=self.model.intercept_, + features_name=ClassicalLinearRegression.X_train.columns, + regression_classification="Regression", + y_train=ClassicalLinearRegression.y, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, + mlflow_path="root", + ) + columns_num = ClassicalLinearRegression.X.shape[1] + if columns_num > 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ClassicalLinearRegression.X_test, 1) + self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # choose two of dimensions to draw + three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(ClassicalLinearRegression.X_test, 2) + self._plot_3d_scatter_diagram( + feature_data=three_dimen_data, + target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, 
+ func_name=self.func[2], + ) + elif columns_num == 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ClassicalLinearRegression.X_test, 1) + self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # no need to choose + self._plot_3d_scatter_diagram( + feature_data=ClassicalLinearRegression.X_test, + target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[2], + ) + self._plot_3d_surface_diagram( + feature_data=ClassicalLinearRegression.X_test, + target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, + y_test_predict=ClassicalLinearRegression.y_test_predict, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[4], + ) + elif columns_num == 1: + # no need to choose + self._plot_2d_scatter_diagram( + feature_data=ClassicalLinearRegression.X_test, + target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + self._plot_2d_line_diagram( + feature_data=ClassicalLinearRegression.X_test, + target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, + y_test_predict=ClassicalLinearRegression.y_test_predict, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[3], + ) + else: + pass + + +class KNNRegression(RegressionWorkflowBase): + """The automation workflow of using KNN algorithm to make insightful products.""" + + name = "K-Nearest Neighbors" + special_function = [] + + def __init__( + self, + n_neighbors: int = 5, + *, + weights: str = "uniform", + algorithm: str = "auto", + leaf_size: int = 30, + p: int = 2, + metric: str = "minkowski", + metric_params: Optional[Dict] = None, + n_jobs: Optional[int] = None, + ) -> None: + """ + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. 
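+
+            For example (an illustrative note, not from the upstream API
+            reference), a callable such as ``lambda dist: 1.0 / (1.0 + dist)``
+            maps an array of distances to an array of weights of the same
+            shape.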
+ + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : int, default=2 + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + Doesn't affect :meth:`fit` method. 
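# Editor's note: a hedged usage sketch for this workflow class, with
# illustrative argument values; fitting and prediction plumbing is inherited
# from RegressionWorkflowBase, and the wrapped
# sklearn.neighbors.KNeighborsRegressor is exposed as `.model`.
knn_workflow = KNNRegression(n_neighbors=8, weights="distance", p=1, n_jobs=-1)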
+ + References + ---------- + Scikit-learn API: sklearn.neighbors.KNeighborsRegressor + https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html + """ + super().__init__() + self.n_neighbors = n_neighbors + self.weights = weights + self.algorithm = algorithm + self.leaf_size = leaf_size + self.p = p + self.metric = metric + self.metric_params = metric_params + self.n_jobs = n_jobs + + self.model = KNeighborsRegressor( + n_neighbors=self.n_neighbors, + weights=self.weights, + algorithm=self.algorithm, + leaf_size=self.leaf_size, + p=self.p, + metric=self.metric, + metric_params=self.metric_params, + n_jobs=self.n_jobs, + ) + + self.naming = KNNRegression.name + + @property + def settings(self) -> Dict: + """The configuration to implement AutoML by FLAML framework.""" + configuration = { + "time_budget": 10, # total running time in seconds + "metric": "r2", + "estimator_list": ["kneighbor"], # list of ML learners + "task": "regression", # task type + # "log_file_name": f"{self.naming} - automl.log", # flaml log file + # "log_training_metric": True, # whether to log training metric + } + return configuration + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = knn_manual_hyper_parameters() + clear_output() + return hyper_parameters + + @dispatch() + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithm by Scikit-learn framework.""" + pass + + @dispatch(bool) + def special_components(self, is_automl: bool, **kwargs) -> None: + """Invoke all special application functions for this algorithm by FLAML framework.""" + pass + + + class GradientBoostingRegression(TreeWorkflowMixin, RegressionWorkflowBase): + """The automation workflow of using Gradient Boosting algorithm to make insightful products.""" + + name = "Gradient Boosting" + special_function = [ + RegressionSpecialFunction.FEATURE_IMPORTANCE_DIAGRAM.value, + RegressionSpecialFunction.SINGLE_TREE_DIAGRAM.value, + ] + # ["Feature Importance Diagram", "Single Tree Diagram"] + + def __init__( + self, + *, + loss: str = "squared_error", + learning_rate: float = 0.1, + n_estimators: int = 100, + subsample: float = 1.0, + criterion: str = "friedman_mse", + min_samples_split: Union[int, float] = 2, + min_samples_leaf: Union[int, float] = 1, + min_weight_fraction_leaf: float = 0.0, + max_depth: Optional[int] = 3, + min_impurity_decrease: float = 0.0, + init: Optional[object] = None, + random_state: Optional[int] = None, + max_features: Optional[Union[str, int, float]] = None, + alpha: float = 0.9, + verbose: int = 0, + max_leaf_nodes: Optional[int] = None, + warm_start: bool = False, + validation_fraction: float = 0.1, + n_iter_no_change: Optional[int] = None, + tol: float = 1e-4, + ccp_alpha: float = 0.0, + ) -> None: + """ + Parameters + ---------- + loss : {'squared_error', 'absolute_error', 'huber', 'quantile'}, \ + default='squared_error' + Loss function to be optimized. 'squared_error' refers to the squared + error for regression. 'absolute_error' refers to the absolute error of + regression and is a robust loss function. 'huber' is a + combination of the two. 'quantile' allows quantile regression (use + `alpha` to specify the quantile). + + learning_rate : float, default=0.1 + Learning rate shrinks the contribution of each tree by `learning_rate`. + There is a trade-off between learning_rate and n_estimators.
+ Values must be in the range `[0.0, inf)`. + + n_estimators : int, default=100 + The number of boosting stages to perform. Gradient boosting + is fairly robust to over-fitting so a large number usually + results in better performance. + Values must be in the range `[1, inf)`. + + subsample : float, default=1.0 + The fraction of samples to be used for fitting the individual base + learners. If smaller than 1.0 this results in Stochastic Gradient + Boosting. `subsample` interacts with the parameter `n_estimators`. + Choosing `subsample < 1.0` leads to a reduction of variance + and an increase in bias. + Values must be in the range `(0.0, 1.0]`. + + criterion : {'friedman_mse', 'squared_error'}, default='friedman_mse' + The function to measure the quality of a split. Supported criteria are + "friedman_mse" for the mean squared error with improvement score by + Friedman, "squared_error" for mean squared error. The default value of + "friedman_mse" is generally the best as it can provide a better + approximation in some cases. + + .. versionadded:: 0.18 + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, values must be in the range `[2, inf)`. + - If float, values must be in the range `(0.0, 1.0]` and `min_samples_split` + will be `ceil(min_samples_split * n_samples)`. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, values must be in the range `[1, inf)`. + - If float, values must be in the range `(0.0, 1.0)` and `min_samples_leaf` + will be `ceil(min_samples_leaf * n_samples)`. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + Values must be in the range `[0.0, 0.5]`. + + max_depth : int or None, default=3 + Maximum depth of the individual regression estimators. The maximum + depth limits the number of nodes in the tree. Tune this parameter + for best performance; the best value depends on the interaction + of the input variables. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + If int, values must be in the range `[1, inf)`. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + Values must be in the range `[0.0, inf)`. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. 
versionadded:: 0.19 + + init : estimator or 'zero', default=None + An estimator object that is used to compute the initial predictions. + ``init`` has to provide :term:`fit` and :term:`predict`. If 'zero', the + initial raw predictions are set to zero. By default a + ``DummyEstimator`` is used, predicting either the average target value + (for loss='squared_error'), or a quantile for the other losses. + + random_state : int, RandomState instance or None, default=None + Controls the random seed given to each Tree estimator at each + boosting iteration. + In addition, it controls the random permutation of the features at + each split (see Notes for more details). + It also controls the random splitting of the training data to obtain a + validation set if `n_iter_no_change` is not None. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + max_features : {'sqrt', 'log2'}, int or float, default=None + The number of features to consider when looking for the best split: + + - If int, values must be in the range `[1, inf)`. + - If float, values must be in the range `(0.0, 1.0]` and the features + considered at each split will be `max(1, int(max_features * n_features_in_))`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Choosing `max_features < n_features` leads to a reduction of variance + and an increase in bias. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + alpha : float, default=0.9 + The alpha-quantile of the huber loss function and the quantile + loss function. Only if ``loss='huber'`` or ``loss='quantile'``. + Values must be in the range `(0.0, 1.0)`. + + verbose : int, default=0 + Enable verbose output. If 1 then it prints progress and performance + once in a while (the more trees the lower the frequency). If greater + than 1 then it prints progress and performance for every tree. + Values must be in the range `[0, inf)`. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + Values must be in the range `[2, inf)`. + If None, then unlimited number of leaf nodes. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just erase the + previous solution. See :term:`the Glossary `. + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Values must be in the range `(0.0, 1.0)`. + Only used if ``n_iter_no_change`` is set to an integer. + + .. versionadded:: 0.20 + + n_iter_no_change : int, default=None + ``n_iter_no_change`` is used to decide if early stopping will be used + to terminate training when validation score is not improving. By + default it is set to None to disable early stopping. If set to a + number, it will set aside ``validation_fraction`` size of the training + data as validation and terminate training when validation score is not + improving in all of the previous ``n_iter_no_change`` numbers of + iterations. + Values must be in the range `[1, inf)`. + + .. versionadded:: 0.20 + + tol : float, default=1e-4 + Tolerance for the early stopping. 
When the loss is not improving + by at least tol for ``n_iter_no_change`` iterations (if set to a + number), the training stops. + Values must be in the range `[0.0, inf)`. + + .. versionadded:: 0.20 + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. + Values must be in the range `[0.0, inf)`. + See :ref:`minimal_cost_complexity_pruning` for details. + + .. versionadded:: 0.22 + + References + ---------- + Scikit-learn API: sklearn.ensemble.GradientBoostingRegressor + https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html + """ + super().__init__() + self.loss = loss + self.learning_rate = learning_rate + self.n_estimators = n_estimators + self.subsample = subsample + self.criterion = criterion + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_depth = max_depth + self.min_impurity_decrease = min_impurity_decrease + self.init = init + self.max_features = max_features + self.alpha = alpha + self.verbose = verbose + self.max_leaf_nodes = max_leaf_nodes + self.warm_start = warm_start + self.validation_fraction = validation_fraction + self.n_iter_no_change = n_iter_no_change + self.tol = tol + self.ccp_alpha = ccp_alpha + + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' + self.model = GradientBoostingRegressor( + loss=self.loss, + learning_rate=self.learning_rate, + n_estimators=self.n_estimators, + subsample=self.subsample, + criterion=self.criterion, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_depth=self.max_depth, + min_impurity_decrease=self.min_impurity_decrease, + init=self.init, + random_state=self.random_state, + max_features=self.max_features, + alpha=self.alpha, + verbose=self.verbose, + max_leaf_nodes=self.max_leaf_nodes, + warm_start=self.warm_start, + validation_fraction=self.validation_fraction, + n_iter_no_change=self.n_iter_no_change, + tol=self.tol, + ccp_alpha=self.ccp_alpha, + ) + + self.naming = GradientBoostingRegression.name + self.func = GradientBoostingRegression.special_function + self.customized = True + self.customized_name = "Gradient Boosting" + + @property + def settings(self) -> Dict: + """The configuration of Gradient Boosting to implement AutoML by FLAML framework.""" + configuration = { + "time_budget": 10, # total running time in seconds + "metric": "r2", + "estimator_list": [self.customized_name], # list of ML learners + "task": "regression", # task type + # "log_file_name": f'{self.naming} - automl.log', # flaml log file + # "log_training_metric": True, # whether to log training metric + } + return configuration + + @property + def customization(self) -> object: + """The customized Gradient Boosting of FLAML framework.""" + from flaml import tune + from flaml.data import REGRESSION + from flaml.model import SKLearnEstimator + from sklearn.ensemble import GradientBoostingRegressor + + class MyGradientBoostingRegression(SKLearnEstimator): + def __init__(self, task="regression", n_jobs=None, **config): + super().__init__(task, **config) + if task in REGRESSION: + self.estimator_class = GradientBoostingRegressor + + 
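# Editor's note: in FLAML's SKLearnEstimator contract, `search_space` returns
# a dict that maps each tunable constructor argument of the wrapped estimator
# to a sampling spec: a `domain` built from `flaml.tune` (e.g. lograndint,
# loguniform, randint) plus an optional `init_value` used as the starting
# point of the search, as the method below illustrates.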
@classmethod + def search_space(cls, data_size, task): + space = { + "n_estimators": {"domain": tune.lograndint(lower=4, upper=512), "init_value": 100}, + "max_depth": {"domain": tune.randint(lower=1, upper=10), "init_value": 3}, + "learning_rate": {"domain": tune.loguniform(lower=0.001, upper=1.0), "init_value": 0.1}, + "subsample": {"domain": tune.uniform(lower=0.1, upper=1.0), "init_value": 1.0}, + "min_samples_split": {"domain": tune.randint(lower=2, upper=20), "init_value": 2}, + "min_samples_leaf": {"domain": tune.randint(lower=1, upper=20), "init_value": 1}, + "min_impurity_decrease": {"domain": tune.loguniform(lower=1e-10, upper=1e-2), "init_value": 0.0}, + } + return space + + return MyGradientBoostingRegression + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = gradient_boosting_manual_hyper_parameters() + clear_output() + return hyper_parameters + + @dispatch() + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithm by Scikit-learn framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + self._plot_feature_importance( + X_train=GradientBoostingRegression.X_train, + name_column=RegressionWorkflowBase.name_train, + trained_model=self.model, + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[0], + ) + self._plot_tree( + trained_model=self.model.estimators_[0][0], + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + + @dispatch(bool) + def special_components(self, is_automl: bool, **kwargs) -> None: + """Invoke all special application functions for this algorithm by FLAML framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + self._plot_feature_importance( + X_train=GradientBoostingRegression.X_train, + name_column=RegressionWorkflowBase.name_train, + trained_model=self.auto_model, + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[0], + ) + self._plot_tree( + trained_model=self.auto_model.estimators_[0][0], + image_config=self.image_config, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + + + class LassoRegression(LinearWorkflowMixin, RegressionWorkflowBase): + """The automation workflow of using Lasso to make insightful products.""" + + name = "Lasso Regression" + special_function = [ + LassoSpecialFunction.LASSO_REGRESSION_FORMULA.value, + RegressionSpecialFunction.TWO_DIMENSIONAL_SCATTER_DIAGRAM.value, + RegressionSpecialFunction.THREE_DIMENSIONAL_SCATTER_DIAGRAM.value, + RegressionSpecialFunction.TWO_DIMENSIONAL_LINE_DIAGRAM.value, + RegressionSpecialFunction.THREE_DIMENSIONAL_SURFACE_DIAGRAM.value, + ] + # ["Lasso Regression Formula", "2D Scatter Diagram", "3D Scatter Diagram", "2D Line Diagram", "3D Surface Diagram"] + + def __init__( + self, + alpha: float
= 1.0, + *, + fit_intercept: bool = True, + precompute: bool = False, + copy_X: bool = True, + max_iter: int = 1000, + tol: float = 1e-4, + warm_start: bool = False, + positive: bool = False, + random_state: Optional[int] = None, + selection: str = "cyclic", + ) -> None: + """ + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the L1 term, controlling regularization + strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. + + When `alpha = 0`, the objective is equivalent to ordinary least + squares, solved by the :class:`LinearRegression` object. For numerical + reasons, using `alpha = 0` with the `Lasso` object is not advised. + Instead, you should use the :class:`LinearRegression` object. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + precompute : bool or array-like of shape (n_features, n_features),\ + default=False + Whether to use a precomputed Gram matrix to speed up + calculations. The Gram matrix can also be passed as argument. + For sparse input this option is always ``False`` to preserve sparsity. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``, see Notes below. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. 
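# Editor's note: for orientation, the objective minimized by Lasso (as stated
# in the scikit-learn documentation) is
#
#     (1 / (2 * n_samples)) * ||y - X @ w||_2^2 + alpha * ||w||_1
#
# so a larger `alpha` drives more coefficients exactly to zero.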
+ + References + ---------- + Scikit-learn API: sklearn.linear_model.Lasso + https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html + """ + super().__init__() + self.alpha = alpha + self.fit_intercept = fit_intercept + self.precompute = precompute + self.copy_X = copy_X + self.max_iter = max_iter + self.tol = tol + self.warm_start = warm_start + self.positive = positive + self.selection = selection + + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' + self.model = Lasso( + alpha=self.alpha, + fit_intercept=self.fit_intercept, + precompute=self.precompute, + copy_X=self.copy_X, + max_iter=self.max_iter, + tol=self.tol, + warm_start=self.warm_start, + positive=self.positive, + random_state=self.random_state, + selection=self.selection, + ) + + self.naming = LassoRegression.name + self.func = LassoRegression.special_function + self.customized = True + self.customized_name = "Lasso" + + @property + def settings(self) -> Dict: + """The configuration of Lasso to implement AutoML by FLAML framework.""" + configuration = { + "time_budget": 10, # total running time in seconds + "metric": "r2", + "estimator_list": [self.customized_name], # list of ML learners + "task": "regression", # task type + # "log_file_name": f'{self.naming} - automl.log', # flaml log file + # "log_training_metric": True, # whether to log training metric + } + return configuration + + @property + def customization(self) -> object: + """The customized Lasso of FLAML framework.""" + from flaml import tune + from flaml.data import REGRESSION + from flaml.model import SKLearnEstimator + from sklearn.linear_model import Lasso + + class MyLassoRegression(SKLearnEstimator): + def __init__(self, task="regression", n_jobs=None, **config): + super().__init__(task, **config) + if task in REGRESSION: + self.estimator_class = Lasso + + @classmethod + def search_space(cls, data_size, task): + space = { + "alpha": {"domain": tune.uniform(lower=0.001, upper=10), "init_value": 1}, + "fit_intercept": {"domain": tune.choice([True, False])}, + "max_iter": {"domain": tune.randint(lower=500, upper=2000), "init_value": 1000}, + "tol": {"domain": tune.uniform(lower=1e-5, upper=1e-3), "init_value": 1e-4}, + "selection": {"domain": tune.choice(["cyclic", "random"])}, + } + return space + + return MyLassoRegression + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = lasso_regression_manual_hyper_parameters() + clear_output() + return hyper_parameters + + @dispatch() + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithm by Scikit-learn framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") + self._show_formula( + coef=[self.model.coef_], + intercept=self.model.intercept_, + features_name=LassoRegression.X_train.columns, + regression_classification="Regression", + y_train=LassoRegression.y, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, + mlflow_path="root", + ) + columns_num = LassoRegression.X.shape[1] + if columns_num > 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(LassoRegression.X_test, 1) +
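# Editor's note: `choose_dimension_data` (defined on WorkflowBase) returns the
# selected column indices together with the reduced frame; only the reduced
# frame feeds the plotting helpers below, while the indices are left unused.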
self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # choose two of dimensions to draw + three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(LassoRegression.X_test, 2) + self._plot_3d_scatter_diagram( + feature_data=three_dimen_data, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[2], + ) + elif columns_num == 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(LassoRegression.X_test, 1) + self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # no need to choose + self._plot_3d_scatter_diagram( + feature_data=LassoRegression.X_test, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[2], + ) + self._plot_3d_surface_diagram( + feature_data=LassoRegression.X_test, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + y_test_predict=LassoRegression.y_test_predict, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[4], + ) + elif columns_num == 1: + # no need to choose + self._plot_2d_scatter_diagram( + feature_data=LassoRegression.X_test, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + self._plot_2d_line_diagram( + feature_data=LassoRegression.X_test, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + y_test_predict=LassoRegression.y_test_predict, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[3], + ) + else: + pass + + @dispatch(bool) + def special_components(self, is_automl: bool, **kwargs) -> None: + """Invoke all special application functions for this algorithm by FLAML framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") + self._show_formula( + coef=[self.auto_model.coef_], + intercept=self.auto_model.intercept_, + features_name=LassoRegression.X_train.columns, + regression_classification="Regression", + y_train=LassoRegression.y, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, + mlflow_path="root", + ) + columns_num = LassoRegression.X.shape[1] + if columns_num > 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data =
self.choose_dimension_data(LassoRegression.X_test, 1) + self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # choose two of dimensions to draw + three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(LassoRegression.X_test, 2) + self._plot_3d_scatter_diagram( + feature_data=three_dimen_data, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[2], + ) + elif columns_num == 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(LassoRegression.X_test, 1) + self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # no need to choose + self._plot_3d_scatter_diagram( + feature_data=LassoRegression.X_test, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[2], + ) + self._plot_3d_surface_diagram( + feature_data=LassoRegression.X_test, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + y_test_predict=LassoRegression.y_test_predict, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[4], + ) + elif columns_num == 1: + # no need to choose + self._plot_2d_scatter_diagram( + feature_data=LassoRegression.X_test, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + self._plot_2d_line_diagram( + feature_data=LassoRegression.X_test, + target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, + y_test_predict=LassoRegression.y_test_predict, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[3], + ) + else: + pass + + +class ElasticNetRegression(LinearWorkflowMixin, RegressionWorkflowBase): + """The automation workflow of using Elastic Net algorithm to make insightful products.""" + + name = "Elastic Net" + special_function = [ + ElasticNetSpecialFunction.ELASTIC_NET_FORMULA.value, + RegressionSpecialFunction.TWO_DIMENSIONAL_SCATTER_DIAGRAM.value, + RegressionSpecialFunction.THREE_DIMENSIONAL_SCATTER_DIAGRAM.value, + RegressionSpecialFunction.TWO_DIMENSIONAL_LINE_DIAGRAM.value, + RegressionSpecialFunction.THREE_DIMENSIONAL_SURFACE_DIAGRAM.value, + ] + # ["Elastic Net Formula", "2D Scatter Diagram", "3D Scatter Diagram", "2D Line Diagram", "3D Surface Diagram"] + + def __init__( + self, + alpha: float = 1.0, + l1_ratio: float = 0.5, + fit_intercept: bool = True, + precompute: 
bool = False, + max_iter: int = 1000, + copy_X: bool = True, + tol: float = 1e-4, + warm_start: bool = False, + positive: bool = False, + random_state: Optional[int] = None, + selection: str = "cyclic", + ) -> None: + """ + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the penalty terms. Defaults to 1.0. + See the notes for the exact mathematical meaning of this + parameter. ``alpha = 0`` is equivalent to an ordinary least square, + solved by the :class:`LinearRegression` object. For numerical + reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. + Given this, you should use the :class:`LinearRegression` object. + + l1_ratio : float, default=0.5 + The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If ``False``, the + data is assumed to be already centered. + + precompute : bool or array-like of shape (n_features, n_features),\ + default=False + Whether to use a precomputed Gram matrix to speed up + calculations. The Gram matrix can also be passed as argument. + For sparse input this option is always ``False`` to preserve sparsity. + + max_iter : int, default=1000 + The maximum number of iterations. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``, see Notes below. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. 
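# Editor's note: per the scikit-learn documentation, ElasticNet minimizes
#
#     (1 / (2 * n_samples)) * ||y - X @ w||_2^2
#         + alpha * l1_ratio * ||w||_1
#         + 0.5 * alpha * (1 - l1_ratio) * ||w||_2^2
#
# which recovers Lasso at `l1_ratio=1` and pure L2 shrinkage at `l1_ratio=0`.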
+ + References + ---------- + Scikit-learn API: sklearn.linear_model.ElasticNet + https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html + """ + super().__init__() + self.alpha = alpha + self.l1_ratio = l1_ratio + self.fit_intercept = fit_intercept + self.precompute = precompute + self.max_iter = max_iter + self.copy_X = copy_X + self.tol = tol + self.warm_start = warm_start + self.positive = positive + self.selection = selection + + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' + self.model = ElasticNet( + alpha=self.alpha, + l1_ratio=self.l1_ratio, + fit_intercept=self.fit_intercept, + precompute=self.precompute, + max_iter=self.max_iter, + copy_X=self.copy_X, + tol=self.tol, + warm_start=self.warm_start, + positive=self.positive, + random_state=self.random_state, + selection=self.selection, + ) + + self.naming = ElasticNetRegression.name + self.func = ElasticNetRegression.special_function + self.customized = True + self.customized_name = "Elastic Net" + + @property + def settings(self) -> Dict: + """The configuration of Elastic Net to implement AutoML by FLAML framework.""" + configuration = { + "time_budget": 10, # total running time in seconds + "metric": "r2", + "estimator_list": [self.customized_name], # list of ML learners + "task": "regression", # task type + # "log_file_name": f'{self.naming} - automl.log', # flaml log file + # "log_training_metric": True, # whether to log training metric + } + return configuration + + @property + def customization(self) -> object: + """The customized Elastic Net of FLAML framework.""" + from flaml import tune + from flaml.data import REGRESSION + from flaml.model import SKLearnEstimator + from sklearn.linear_model import ElasticNet + + class MyElasticNetRegression(SKLearnEstimator): + def __init__(self, task="regression", n_jobs=None, **config): + super().__init__(task, **config) + if task in REGRESSION: + self.estimator_class = ElasticNet + + @classmethod + def search_space(cls, data_size, task): + space = { + "alpha": {"domain": tune.uniform(lower=0.001, upper=10), "init_value": 1}, + "l1_ratio": {"domain": tune.uniform(lower=0.001, upper=1), "init_value": 0.5}, + "fit_intercept": {"domain": tune.choice([True, False])}, + "max_iter": {"domain": tune.randint(lower=500, upper=2000), "init_value": 1000}, + "tol": {"domain": tune.uniform(lower=1e-5, upper=1e-3), "init_value": 1e-4}, + "selection": {"domain": tune.choice(["cyclic", "random"])}, + } + return space + + return MyElasticNetRegression + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = elastic_net_manual_hyper_parameters() + clear_output() + return hyper_parameters + + @dispatch() + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithm by Scikit-learn framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") + self._show_formula( + coef=[self.model.coef_], + intercept=self.model.intercept_, + features_name=ElasticNetRegression.X_train.columns, + regression_classification="Regression", + y_train=ElasticNetRegression.y, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, + mlflow_path="root", + ) +
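# Editor's note: the branching below mirrors the other linear workflows. With
# more than two features, one column and two columns are sampled for the 2D
# and 3D scatter plots; with exactly two features, the full test matrix feeds
# the 3D scatter and surface plots; with a single feature, the 2D scatter and
# line plots are drawn directly.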
columns_num = ElasticNetRegression.X.shape[1] + if columns_num > 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 1) + self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # choose two of dimensions to draw + three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 2) + self._plot_3d_scatter_diagram( + feature_data=three_dimen_data, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[2], + ) + elif columns_num == 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 1) + self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # no need to choose + self._plot_3d_scatter_diagram( + feature_data=ElasticNetRegression.X_test, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[2], + ) + self._plot_3d_surface_diagram( + feature_data=ElasticNetRegression.X_test, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + y_test_predict=ElasticNetRegression.y_test_predict, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[4], + ) + elif columns_num == 1: + # no need to choose + self._plot_2d_scatter_diagram( + feature_data=ElasticNetRegression.X_test, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + self._plot_2d_line_diagram( + feature_data=ElasticNetRegression.X_test, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + y_test_predict=ElasticNetRegression.y_test_predict, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[3], + ) + else: + pass + + @dispatch(bool) + def special_components(self, is_automl: bool, **kwargs) -> None: + """Invoke all special application functions for this algorithm by FLAML framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") + self._show_formula( + coef=[self.auto_model.coef_], + intercept=self.auto_model.intercept_, + features_name=ElasticNetRegression.X_train.columns, +
regression_classification="Regression", + y_train=ElasticNetRegression.y, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, + mlflow_path="root", + ) + columns_num = ElasticNetRegression.X.shape[1] + if columns_num > 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 1) + self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # choose two of dimensions to draw + three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 2) + self._plot_3d_scatter_diagram( + feature_data=three_dimen_data, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[2], + ) + elif columns_num == 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(ElasticNetRegression.X_test, 1) + self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # no need to choose + self._plot_3d_scatter_diagram( + feature_data=ElasticNetRegression.X_test, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[2], + ) + self._plot_3d_surface_diagram( + feature_data=ElasticNetRegression.X_test, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + y_test_predict=ElasticNetRegression.y_test_predict, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[4], + ) + elif columns_num == 1: + # no need to choose + self._plot_2d_scatter_diagram( + feature_data=ElasticNetRegression.X_test, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + self._plot_2d_line_diagram( + feature_data=ElasticNetRegression.X_test, + target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, + y_test_predict=ElasticNetRegression.y_test_predict, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[3], + ) + else: + pass + + +class SGDRegression(LinearWorkflowMixin, RegressionWorkflowBase): + """The automation workflow of using Stochastic Gradient Descent - SGD algorithm to make insightful products.""" + + name = "SGD Regression" + special_function = [ + SGDSpecialFunction.SGD_REGRESSION_FORMULA.value, + 
RegressionSpecialFunction.TWO_DIMENSIONAL_SCATTER_DIAGRAM.value, + RegressionSpecialFunction.THREE_DIMENSIONAL_SCATTER_DIAGRAM.value, + RegressionSpecialFunction.TWO_DIMENSIONAL_LINE_DIAGRAM.value, + RegressionSpecialFunction.THREE_DIMENSIONAL_SURFACE_DIAGRAM.value, + ] + # ["SGD Regression Formula", "2D Scatter Diagram", "3D Scatter Diagram", "2D Line Diagram", "3D Surface Diagram"] + + def __init__( + self, + loss: str = "squared_error", + penalty: str = "l2", + alpha: float = 0.0001, + l1_ratio: float = 0.15, + fit_intercept: bool = True, + max_iter: int = 1000, + tol: Union[float, None] = 0.001, + shuffle: bool = True, + verbose: int = 0, + epsilon: float = 0.1, + random_state: Optional[int] = None, + learning_rate: str = "invscaling", + eta0: float = 0.01, + power_t: float = 0.25, + early_stopping: bool = False, + validation_fraction: float = 0.1, + n_iter_no_change: int = 5, + warm_start: bool = False, + average: Union[bool, int] = False, + ) -> None: + """ + Parameters + ---------- + loss : str, default='squared_error' + The loss function to be used. The possible values are 'squared_error', + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. + The 'squared_error' refers to the ordinary least squares fit. + 'huber' modifies 'squared_error' to focus less on getting outliers + correct by switching from squared to linear loss past a distance of + epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is + linear past that; this is the loss function used in SVR. + 'squared_epsilon_insensitive' is the same but becomes squared loss past + a tolerance of epsilon. + More details about the losses formulas can be found in the + :ref:`User Guide `. + + penalty : {'l2', 'l1', 'elasticnet', None}, default='l2' + The penalty (aka regularization term) to be used. Defaults to 'l2' + which is the standard regularizer for linear SVM models. 'l1' and + 'elasticnet' might bring sparsity to the model (feature selection) + not achievable with 'l2'. No penalty is added when set to `None`. + + alpha : float, default=0.0001 + Constant that multiplies the regularization term. The higher the + value, the stronger the regularization. + Also used to compute the learning rate when `learning_rate` is + set to 'optimal'. + + l1_ratio : float, default=0.15 + The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. + l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. + Only used if `penalty` is 'elasticnet'. + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`partial_fit` method. + .. versionadded:: 0.19 + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, training will stop + when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive + epochs. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + .. versionadded:: 0.19 + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + + verbose : int, default=0 + The verbosity level. + + epsilon : float, default=0.1 + Epsilon in the epsilon-insensitive loss functions; only if `loss` is + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.
+ For 'huber', determines the threshold at which it becomes less + important to get the prediction exactly right. + For epsilon-insensitive, any differences between the current prediction + and the correct label are ignored if they are less than this threshold. + + random_state : int, RandomState instance, default=None + Used for shuffling the data, when ``shuffle`` is set to ``True``. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + learning_rate : str, default='invscaling' + The learning rate schedule: + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where t0 is chosen by a heuristic proposed by Leon Bottou. + - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. + .. versionadded:: 0.20 + Added 'adaptive' option + + eta0 : float, default=0.01 + The initial learning rate for the 'constant', 'invscaling' or + 'adaptive' schedules. The default value is 0.01. + + power_t : float, default=0.25 + The exponent for inverse scaling learning rate. + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to True, it will automatically set aside + a fraction of training data as validation and terminate + training when validation score returned by the `score` method is not + improving by at least `tol` for `n_iter_no_change` consecutive + epochs. + .. versionadded:: 0.20 + Added 'early_stopping' option + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if `early_stopping` is True. + .. versionadded:: 0.20 + Added 'validation_fraction' option + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before stopping + fitting. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + .. versionadded:: 0.20 + Added 'n_iter_no_change' option + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + If a dynamic learning rate is used, the learning rate is adapted + depending on the number of samples already seen. Calling ``fit`` resets + this counter, while ``partial_fit`` will result in increasing the + existing counter. + + average : bool or int, default=False + When set to True, computes the averaged SGD weights across all + updates and stores the result in the ``coef_`` attribute. If set to + an int greater than 1, averaging will begin once the total number of + samples seen reaches `average`. So ``average=10`` will begin + averaging after seeing 10 samples. 
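# Editor's note: SGD is sensitive to feature scale; a hedged sketch of
# driving the underlying estimator with standardization (this pipeline is an
# editorial illustration, not part of this workflow class):
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

sgd_pipeline = make_pipeline(
    StandardScaler(),
    SGDRegressor(loss="huber", epsilon=0.1, learning_rate="adaptive", eta0=0.01),
)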
+ + References + ---------- + Scikit-learn API: sklearn.linear_model.SGDRegressor + https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html + """ + super().__init__() + self.loss = loss + self.penalty = penalty + self.alpha = alpha + self.l1_ratio = l1_ratio + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.tol = tol + self.shuffle = shuffle + self.verbose = verbose + self.epsilon = epsilon + self.learning_rate = learning_rate + self.eta0 = eta0 + self.power_t = power_t + self.early_stopping = early_stopping + self.validation_fraction = validation_fraction + self.n_iter_no_change = n_iter_no_change + self.warm_start = warm_start + self.average = average + + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' + self.model = SGDRegressor( + loss=self.loss, + penalty=self.penalty, + alpha=self.alpha, + l1_ratio=self.l1_ratio, + fit_intercept=self.fit_intercept, + max_iter=self.max_iter, + tol=self.tol, + shuffle=self.shuffle, + verbose=self.verbose, + epsilon=self.epsilon, + random_state=self.random_state, + learning_rate=self.learning_rate, + eta0=self.eta0, + power_t=self.power_t, + early_stopping=self.early_stopping, + validation_fraction=self.validation_fraction, + n_iter_no_change=self.n_iter_no_change, + warm_start=self.warm_start, + average=self.average, + ) + + self.naming = SGDRegression.name + self.func = SGDRegression.special_function + self.customized = True + self.customized_name = "SGD Regression" + + @property + def settings(self) -> Dict: + """The configuration of SGD to implement AutoML by FLAML framework.""" + configuration = { + "time_budget": 10, # total running time in seconds + "metric": "r2", + "estimator_list": [self.customized_name], # list of ML learners + "task": "regression", # task type + # "log_file_name": f'{self.naming} - automl.log', # flaml log file + # "log_training_metric": True, # whether to log training metric + } + return configuration + + @property + def customization(self) -> object: + """The customized SGD of FLAML framework.""" + from flaml import tune + from flaml.data import REGRESSION + from flaml.model import SKLearnEstimator + from sklearn.linear_model import SGDRegressor + + class MySGDRegression(SKLearnEstimator): + def __init__(self, task="regression", n_jobs=None, **config): + super().__init__(task, **config) + if task in REGRESSION: + self.estimator_class = SGDRegressor + + @classmethod + def search_space(cls, data_size, task): + space = { + "loss": {"domain": tune.choice(["squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"]), "init_value": "squared_error"}, + "penalty": {"domain": tune.choice(["l2", "l1", "elasticnet", None]), "init_value": "l2"}, + "alpha": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.0001}, + "fit_intercept": {"domain": tune.choice([True, False]), "init_value": True}, + "max_iter": {"domain": tune.randint(lower=50, upper=1000), "init_value": 1000}, + "tol": {"domain": tune.loguniform(lower=0.000001, upper=0.001), "init_value": 0.001}, + "shuffle": {"domain": tune.choice([True, False]), "init_value": True}, + "learning_rate": {"domain": tune.choice(["constant", "optimal", "invscaling", "adaptive"]), "init_value": "invscaling"}, + "eta0": {"domain": tune.loguniform(lower=0.0001, upper=0.1), "init_value": 0.01}, + "power_t": {"domain": tune.uniform(lower=0.1, upper=0.9), "init_value": 0.25}, + "l1_ratio": {"domain": 
tune.uniform(lower=0, upper=1), "init_value": 0.15}, + } + return space + + return MySGDRegression + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = sgd_regression_manual_hyper_parameters() + clear_output() + return hyper_parameters + + @dispatch() + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithm by Scikit-learn framework.""" + GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") + GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") + self._show_formula( + coef=[self.model.coef_], + intercept=self.model.intercept_, + features_name=SGDRegression.X_train.columns, + regression_classification="Regression", + y_train=SGDRegression.y, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_PATH, + mlflow_path="root", + ) + columns_num = SGDRegression.X.shape[1] + if columns_num > 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 1) + self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # choose two of dimensions to draw + three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 2) + self._plot_3d_scatter_diagram( + feature_data=three_dimen_data, + target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[2], + ) + elif columns_num == 2: + # choose one of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 1) + self._plot_2d_scatter_diagram( + feature_data=two_dimen_data, + target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + # no need to choose + self._plot_3d_scatter_diagram( + feature_data=SGDRegression.X_test, + target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[2], + ) + self._plot_3d_surface_diagram( + feature_data=SGDRegression.X_test, + target_data=SGDRegression.y_test, + y_test_predict=SGDRegression.y_test_predict, + data_name=SGDRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[4], + ) + elif columns_num == 1: + # no need to choose + self._plot_2d_scatter_diagram( + feature_data=SGDRegression.X_test, + target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + func_name=self.func[1], + ) + self._plot_2d_line_diagram( +
+                feature_data=SGDRegression.X_test,
+                target_data=SGDRegression.y_test,
+                y_test_predict=SGDRegression.y_test_predict,
+                data_name=SGDRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[3],
+            )
+        else:
+            pass
+
+    @dispatch(bool)
+    def special_components(self, is_automl: bool, **kwargs) -> None:
+        """Invoke all special application functions for this algorithm by FLAML framework."""
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
+        self._show_formula(
+            coef=[self.auto_model.coef_],
+            intercept=self.auto_model.intercept_,
+            features_name=SGDRegression.X_train.columns,
+            regression_classification="Regression",
+            y_train=SGDRegression.y,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
+            mlflow_path="root",
+        )
+        columns_num = SGDRegression.X.shape[1]
+        if columns_num > 2:
+            # choose one of dimensions to draw
+            two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 1)
+            self._plot_2d_scatter_diagram(
+                feature_data=two_dimen_data,
+                target_data=SGDRegression.y_test,
+                data_name=SGDRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[1],
+            )
+            # choose two of dimensions to draw
+            three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 2)
+            self._plot_3d_scatter_diagram(
+                feature_data=three_dimen_data,
+                target_data=SGDRegression.y_test,
+                data_name=SGDRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[2],
+            )
+        elif columns_num == 2:
+            # choose one of dimensions to draw
+            two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(SGDRegression.X_test, 1)
+            self._plot_2d_scatter_diagram(
+                feature_data=two_dimen_data,
+                target_data=SGDRegression.y_test,
+                data_name=SGDRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[1],
+            )
+            # no need to choose
+            self._plot_3d_scatter_diagram(
+                feature_data=SGDRegression.X_test,
+                target_data=SGDRegression.y_test,
+                data_name=SGDRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[2],
+            )
+            self._plot_3d_surface_diagram(
+                feature_data=SGDRegression.X_test,
+                target_data=SGDRegression.y_test,
+                y_test_predict=SGDRegression.y_test_predict,
+                data_name=SGDRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[4],
+            )
+        elif columns_num == 1:
+            # no need to choose
+            self._plot_2d_scatter_diagram(
+                feature_data=SGDRegression.X_test,
+                target_data=SGDRegression.y_test,
+                data_name=SGDRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[1],
+            )
+            self._plot_2d_line_diagram(
+                feature_data=SGDRegression.X_test,
+                target_data=SGDRegression.y_test,
+                y_test_predict=SGDRegression.y_test_predict,
+                data_name=SGDRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[3],
+            )
+        else:
+            pass
+
+
+class BayesianRidgeRegression(RegressionWorkflowBase):
+    """The automation workflow of using Bayesian ridge regression algorithm to make insightful products."""
+
+    name = "BayesianRidge Regression"
+    special_function = []
+
+    def __init__(
+        self,
+        tol: float = 0.001,
+        alpha_1: float = 0.000001,
+        alpha_2: float = 0.000001,
+        lambda_1: float = 0.000001,
+        lambda_2: float = 0.000001,
+        alpha_init: Optional[float] = None,
+        lambda_init: Optional[float] = None,
+        compute_score: bool = False,
+        fit_intercept: bool = True,
+        copy_X: bool = True,
+        verbose: bool = False,
+    ) -> None:
+        """
+        Parameters
+        ----------
+        max_iter : int, default=None
+            Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion. If `None`, it corresponds to `max_iter=300`.
+
+            Changed in version 1.3.
+
+        tol : float, default=1e-3
+            Stop the algorithm if w has converged.
+
+        alpha_1 : float, default=1e-6
+            Hyper-parameter : shape parameter for the Gamma distribution prior over the alpha parameter.
+
+        alpha_2 : float, default=1e-6
+            Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the alpha parameter.
+
+        lambda_1 : float, default=1e-6
+            Hyper-parameter : shape parameter for the Gamma distribution prior over the lambda parameter.
+
+        lambda_2 : float, default=1e-6
+            Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the lambda parameter.
+
+        alpha_init : float, default=None
+            Initial value for alpha (precision of the noise). If not set, alpha_init is 1/Var(y).
+
+            New in version 0.22.
+
+        lambda_init : float, default=None
+            Initial value for lambda (precision of the weights). If not set, lambda_init is 1.
+
+            New in version 0.22.
+
+        compute_score : bool, default=False
+            If True, compute the log marginal likelihood at each iteration of the optimization.
+
+        fit_intercept : bool, default=True
+            Whether to calculate the intercept for this model. The intercept is not treated as a probabilistic parameter and
+            thus has no associated variance. If set to False, no intercept will be used in calculations (i.e. data is expected
+            to be centered).
+
+        copy_X : bool, default=True
+            If True, X will be copied; else, it may be overwritten.
+
+        verbose : bool, default=False
+            Verbose mode when fitting the model.
+
+        References
+        ----------
+        Scikit-learn API: sklearn.linear_model.BayesianRidge
+        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.BayesianRidge.html
+        """
+        super().__init__()
+        self.tol = tol
+        self.alpha_1 = alpha_1
+        self.alpha_2 = alpha_2
+        self.lambda_1 = lambda_1
+        self.lambda_2 = lambda_2
+        self.alpha_init = alpha_init
+        self.lambda_init = lambda_init
+        self.compute_score = compute_score
+        self.fit_intercept = fit_intercept
+        self.copy_X = copy_X
+        self.verbose = verbose
+
+        self.model = BayesianRidge(
+            tol=self.tol,
+            alpha_1=self.alpha_1,
+            alpha_2=self.alpha_2,
+            lambda_1=self.lambda_1,
+            lambda_2=self.lambda_2,
+            alpha_init=self.alpha_init,
+            lambda_init=self.lambda_init,
+            compute_score=self.compute_score,
+            fit_intercept=self.fit_intercept,
+            copy_X=self.copy_X,
+            verbose=self.verbose,
+        )
+
+        self.naming = BayesianRidgeRegression.name
+        self.customized = True
+        self.customized_name = "BayesianRidge Regression"
+
+    @property
+    def settings(self) -> Dict:
+        """The configuration of BayesianRidge to implement AutoML by FLAML framework."""
+        configuration = {
+            "time_budget": 10,  # total running time in seconds
+            "metric": "r2",
+            "estimator_list": [self.customized_name],  # list of ML learners
+            "task": "regression",  # task type
+            # "log_file_name": f'{self.naming} - automl.log',  # flaml log file
+            # "log_training_metric": True,  # whether to log training metric
+        }
+        return configuration
+
+    @property
+    def customization(self) -> object:
+        """The customized BayesianRidge of FLAML framework."""
+        from flaml import tune
+        from flaml.data import REGRESSION
+        from flaml.model import SKLearnEstimator
+        from sklearn.linear_model import BayesianRidge
+
+        class MyBayesianRidgeRegression(SKLearnEstimator):
+            def __init__(self, task="regression", n_jobs=None, **config):
+                super().__init__(task, **config)
+                if task in REGRESSION:
+                    self.estimator_class = BayesianRidge
+
+            @classmethod
+            def search_space(cls, data_size, task):
+                space = {
+                    "tol": {"domain": tune.loguniform(lower=0.000001, upper=0.001), "init_value": 0.001},
+                    "alpha_1": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001},
+                    "alpha_2": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001},
+                    "lambda_1": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001},
+                    "lambda_2": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001},
+                    "alpha_init": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001},
+                    "lambda_init": {"domain": tune.loguniform(lower=0.0001, upper=1), "init_value": 0.000001},
+                    "compute_score": {"domain": tune.choice([True, False]), "init_value": False},
+                    "fit_intercept": {"domain": tune.choice([True, False]), "init_value": True},
+                    "copy_X": {"domain": tune.choice([True, False]), "init_value": True},
+                    "verbose": {"domain": tune.choice([True, False]), "init_value": False},
+                }
+                return space
+
+        return MyBayesianRidgeRegression
+
+    @classmethod
+    def manual_hyper_parameters(cls) -> Dict:
+        """Manual hyper-parameters specification."""
+        print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
+        hyper_parameters = bayesian_ridge_manual_hyper_parameters()
+        clear_output()
+        return hyper_parameters
+
+    @dispatch()
+    def special_components(self, **kwargs) -> None:
+        pass
+
+    @dispatch(bool)
+    def special_components(self, is_automl: bool, **kwargs) -> None:
+        pass
+
+
+class RidgeRegression(LinearWorkflowMixin, RegressionWorkflowBase):
+    """The automation workflow of using Ridge regression algorithm to make insightful products."""
+
+    name = "Ridge Regression"
+    special_function = [
+        RidgeSpecialFunction.RIDGE_REGRESSION_FORMULA.value,
+        RegressionSpecialFunction.TWO_DIMENSIONAL_SCATTER_DIAGRAM.value,
+        RegressionSpecialFunction.THREE_DIMENSIONAL_SCATTER_DIAGRAM.value,
+        RegressionSpecialFunction.TWO_DIMENSIONAL_LINE_DIAGRAM.value,
+        RegressionSpecialFunction.THREE_DIMENSIONAL_SURFACE_DIAGRAM.value,
+    ]
+    # ["Ridge Regression Formula", "2D Scatter Diagram", "3D Scatter Diagram", "2D Line Diagram", "3D Surface Diagram"]
+
+    def __init__(
+        self,
+        alpha: float = 1.0,
+        *,
+        fit_intercept: bool = True,
+        copy_X: bool = True,
+        max_iter: int = 1000,
+        tol: float = 1e-4,
+        solver: str = "auto",
+        positive: bool = False,
+        random_state: Optional[int] = None,
+    ) -> None:
+        """
+        Parameters
+        ----------
+        alpha : {float, ndarray of shape (n_targets,)}, default=1.0
+            Constant that multiplies the L2 term, controlling regularization
+            strength. `alpha` must be a non-negative float i.e. in `[0, inf)`.
+
+            When `alpha = 0`, the objective is equivalent to ordinary least
+            squares, solved by the :class:`LinearRegression` object. For numerical
+            reasons, using `alpha = 0` with the `Ridge` object is not advised.
+            Instead, you should use the :class:`LinearRegression` object.
+
+            If an array is passed, penalties are assumed to be specific to the
+            targets. Hence they must correspond in number.
+
+        fit_intercept : bool, default=True
+            Whether to fit the intercept for this model. If set
+            to false, no intercept will be used in calculations
+            (i.e. ``X`` and ``y`` are expected to be centered).
+
+        copy_X : bool, default=True
+            If True, X will be copied; else, it may be overwritten.
+
+        max_iter : int, default=1000
+            Maximum number of iterations for conjugate gradient solver.
+            For 'sparse_cg' and 'lsqr' solvers, the default value is determined
+            by scipy.sparse.linalg. For 'sag' solver, the default value is 1000.
+            For 'lbfgs' solver, the default value is 15000.
+
+        tol : float, default=1e-4
+            The precision of the solution (`coef_`) is determined by `tol` which
+            specifies a different convergence criterion for each solver:
+
+            - 'svd': `tol` has no impact.
+
+            - 'cholesky': `tol` has no impact.
+
+            - 'sparse_cg': norm of residuals smaller than `tol`.
+
+            - 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr,
+              which control the norm of the residual vector in terms of the norms of
+              matrix and coefficients.
+
+            - 'sag' and 'saga': relative change of coef smaller than `tol`.
+
+            - 'lbfgs': maximum of the absolute (projected) gradient=max|residuals|
+              smaller than `tol`.
+
+            .. versionchanged:: 1.2
+                Default value changed from 1e-3 to 1e-4 for consistency with other linear
+                models.
+
+        solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \
+                'sag', 'saga', 'lbfgs'}, default='auto'
+            Solver to use in the computational routines:
+
+            - 'auto' chooses the solver automatically based on the type of data.
+
+            - 'svd' uses a Singular Value Decomposition of X to compute the Ridge
+              coefficients. It is the most stable solver, in particular more stable
+              for singular matrices than 'cholesky' at the cost of being slower.
+
+            - 'cholesky' uses the standard scipy.linalg.solve function to
+              obtain a closed-form solution.
+
+            - 'sparse_cg' uses the conjugate gradient solver as found in
+              scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
+              more appropriate than 'cholesky' for large-scale data
+              (possibility to set `tol` and `max_iter`).
+
+            - 'lsqr' uses the dedicated regularized least-squares routine
+              scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative
+              procedure.
+
+            - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
+              its improved, unbiased version named SAGA. Both methods also use an
+              iterative procedure, and are often faster than other solvers when
+              both n_samples and n_features are large. Note that 'sag' and
+              'saga' fast convergence is only guaranteed on features with
+              approximately the same scale. You can preprocess the data with a
+              scaler from sklearn.preprocessing.
+
+            - 'lbfgs' uses L-BFGS-B algorithm implemented in
+              `scipy.optimize.minimize`. It can be used only when `positive`
+              is True.
+
+            All solvers except 'svd' support both dense and sparse data. However, only
+            'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when
+            `fit_intercept` is True.
+
+            .. versionadded:: 0.17
+                Stochastic Average Gradient descent solver.
+            .. versionadded:: 0.19
+                SAGA solver.
+
+        positive : bool, default=False
+            When set to ``True``, forces the coefficients to be positive.
+            Only 'lbfgs' solver is supported in this case.
+
+        random_state : int, RandomState instance, default=None
+            Used when ``solver`` == 'sag' or 'saga' to shuffle the data.
+            See :term:`Glossary <random_state>` for details.
+
+            .. versionadded:: 0.17
+                `random_state` to support Stochastic Average Gradient.
+
+        References
+        ----------
+        Scikit-learn API: sklearn.linear_model.Ridge
+        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
+        """
+        super().__init__()
+        self.alpha = alpha
+        self.fit_intercept = fit_intercept
+        self.copy_X = copy_X
+        self.max_iter = max_iter
+        self.tol = tol
+        self.solver = solver
+        self.positive = positive
+        self.random_state = random_state
+
+        self.model = Ridge(
+            alpha=self.alpha,
+            fit_intercept=self.fit_intercept,
+            copy_X=self.copy_X,
+            max_iter=self.max_iter,
+            tol=self.tol,
+            solver=self.solver,
+            positive=self.positive,
+            random_state=self.random_state,
+        )
+
+        self.naming = RidgeRegression.name
+        self.func = RidgeRegression.special_function
+        self.customized = True
+        self.customized_name = "Ridge"
+
+    @property
+    def settings(self) -> Dict:
+        """The configuration of Ridge to implement AutoML by FLAML framework."""
+        configuration = {
+            "time_budget": 10,  # total running time in seconds
+            "metric": "r2",
+            "estimator_list": [self.customized_name],  # list of ML learners
+            "task": "regression",  # task type
+            # "log_file_name": f'{self.naming} - automl.log',  # flaml log file
+            # "log_training_metric": True,  # whether to log training metric
+        }
+        return configuration
+
+    @property
+    def customization(self) -> object:
+        """The customized Ridge of FLAML framework."""
+        from flaml import tune
+        from flaml.data import REGRESSION
+        from flaml.model import SKLearnEstimator
+        from sklearn.linear_model import Ridge
+
+        class MyRidgeRegression(SKLearnEstimator):
+            def __init__(self, task="regression", n_jobs=None, **config):
+                super().__init__(task, **config)
+                if task in REGRESSION:
+                    self.estimator_class = Ridge
+
+            @classmethod
+            def search_space(cls, data_size, task):
+                space = {
+                    "alpha": {"domain": tune.uniform(lower=0.001, upper=10), "init_value": 1},
+                    "fit_intercept": {"domain": tune.choice([True, False])},
+                    "max_iter": {"domain": tune.randint(lower=500, upper=2000), "init_value": 1000},
+                    "tol": {"domain": tune.uniform(lower=1e-5, upper=1e-3), "init_value": 1e-4},
+                }
+                return space
+
+        return MyRidgeRegression
+
+    @classmethod
+    def manual_hyper_parameters(cls) -> Dict:
+        """Manual hyper-parameters specification."""
+        print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
+        hyper_parameters = ridge_regression_manual_hyper_parameters()
+        clear_output()
+        return hyper_parameters
+
+    @dispatch()
+    def special_components(self, **kwargs) -> None:
+        """Invoke all special application functions for this algorithm by Scikit-learn framework."""
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
+        self._show_formula(
+            coef=[self.model.coef_],
+            intercept=self.model.intercept_,
+            features_name=RidgeRegression.X_train.columns,
+            regression_classification="Regression",
+            y_train=RidgeRegression.y,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
+            mlflow_path="root",
+        )
+        columns_num = RidgeRegression.X.shape[1]
+        if columns_num > 2:
+            # choose one of dimensions to draw
+            two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1)
+            self._plot_2d_scatter_diagram(
+                feature_data=two_dimen_data,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[1],
+            )
+            # choose two of dimensions to draw
+            three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 2)
+            self._plot_3d_scatter_diagram(
+                feature_data=three_dimen_data,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[2],
+            )
+        elif columns_num == 2:
+            # choose one of dimensions to draw
+            two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1)
+            self._plot_2d_scatter_diagram(
+                feature_data=two_dimen_data,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[1],
+            )
+            # no need to choose
+            self._plot_3d_scatter_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[2],
+            )
+            self._plot_3d_surface_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                y_test_predict=RidgeRegression.y_test_predict,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[4],
+            )
+        elif columns_num == 1:
+            # no need to choose
+            self._plot_2d_scatter_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[1],
+            )
+            self._plot_2d_line_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                y_test_predict=RidgeRegression.y_test_predict,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[3],
+            )
+        else:
+            pass
+
+    @dispatch(bool)
+    def special_components(self, is_automl: bool, **kwargs) -> None:
+        """Invoke all special application functions for this algorithm by FLAML framework."""
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
+        self._show_formula(
+            coef=[self.auto_model.coef_],
+            intercept=self.auto_model.intercept_,
+            features_name=RidgeRegression.X_train.columns,
+            regression_classification="Regression",
+            y_train=RidgeRegression.y,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
+            mlflow_path="root",
+        )
+        columns_num = RidgeRegression.X.shape[1]
+        if columns_num > 2:
+            # choose one of dimensions to draw
+            two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1)
+            self._plot_2d_scatter_diagram(
+                feature_data=two_dimen_data,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[1],
+            )
+            # choose two of dimensions to draw
+            three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 2)
+            self._plot_3d_scatter_diagram(
+                feature_data=three_dimen_data,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[2],
+            )
+        elif columns_num == 2:
+            # choose one of dimensions to draw
+            two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(RidgeRegression.X_test, 1)
+            self._plot_2d_scatter_diagram(
+                feature_data=two_dimen_data,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[1],
+            )
+            # no need to choose
+            self._plot_3d_scatter_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[2],
+            )
+            self._plot_3d_surface_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                y_test_predict=RidgeRegression.y_test_predict,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[4],
+            )
+        elif columns_num == 1:
+            # no need to choose
+            self._plot_2d_scatter_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[1],
+            )
+            self._plot_2d_line_diagram(
+                feature_data=RidgeRegression.X_test,
+                target_data=RidgeRegression.y_test,
+                data_name=RidgeRegression.name_test,
+                y_test_predict=RidgeRegression.y_test_predict,
+                algorithm_name=self.naming,
+                local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+                mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+                func_name=self.func[3],
+            )
+        else:
+            pass
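
A minimal sketch of how the `settings` dict and the estimator class returned by each `customization` property are expected to be consumed by FLAML, assuming the parent workflow wires them up roughly as below (this wiring is not shown in the patch itself, and `X_train`/`y_train` are hypothetical placeholders for the training split):

    from flaml import AutoML

    workflow = RidgeRegression()
    automl = AutoML()
    # Register the customized learner under the same name that appears in
    # workflow.settings["estimator_list"] ("Ridge" for this class).
    automl.add_learner(learner_name=workflow.customized_name, learner_class=workflow.customization)
    # workflow.settings expands to: time_budget=10, metric="r2",
    # estimator_list=["Ridge"], task="regression".
    automl.fit(X_train=X_train, y_train=y_train, **workflow.settings)

Keeping `customized_name` identical to the name in `estimator_list` is what lets FLAML restrict the search to the single customized learner and its `search_space`.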