diff --git a/README.md b/README.md
index 51892e14..aba250bd 100644
--- a/README.md
+++ b/README.md
@@ -92,7 +92,7 @@ On Jupyter Notebook / Google Colab:
```
**Note**: There are four built-in data sets corresponding to four kinds of model pattern.
-### Case 2: Run with your own data set
+### Case 2: Run with your own data set without model inference
On command line:
```
diff --git a/geochemistrypi/cli.py b/geochemistrypi/cli.py
index 17d2e6dc..20bb44c7 100644
--- a/geochemistrypi/cli.py
+++ b/geochemistrypi/cli.py
@@ -40,7 +40,7 @@ def main(version: Optional[bool] = typer.Option(None, "--version", "-v", help="S
def data_mining(
    data: str = typer.Option("", help="The path of the training data without model inference."),
    training: str = typer.Option("", help="The path of the training data."),
-    inference: str = typer.Option("", help="The path of the inference data."),
+    application: str = typer.Option("", help="The path of the application data."),
    mlflow: bool = typer.Option(False, help="Start the mlflow server."),
    web: bool = False,
) -> None:
@@ -81,11 +81,11 @@ def start_mlflow():
    if data:
        cli_pipeline(data)
    # If the training data and inference data are provided, start the CLI pipeline with continuous training and inference
-    elif training and inference:
-        cli_pipeline(training, inference)
+    elif training and application:
+        cli_pipeline(training, application)
    # If no data is provided, use built-in data to start the CLI pipeline with continuous training and inference
    else:
-        cli_pipeline(training, inference)
+        cli_pipeline(training, application)
@app.command()
diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py
index f7b7e336..1859ce73 100644
--- a/geochemistrypi/data_mining/cli_pipeline.py
+++ b/geochemistrypi/data_mining/cli_pipeline.py
@@ -45,7 +45,7 @@
from .utils.mlflow_utils import retrieve_previous_experiment_id
-def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = None) -> None:
+def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = None) -> None:
    """The command line interface software for Geochemistry π. The business logic of this CLI software can be found in the figures in the README.md file.
    It provides three MLOps core functionalities:
@@ -58,11 +58,15 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
    training_data_path : str
        The path of the training data.
-    inference_data_path : str, optional
-        The path of the inference data, by default None
+    application_data_path : str, optional
+        The path of the application data, by default None
    """
-    # TODO: If the argument is False, hide all Python level warnings. Developers can turn it on by setting the argument to True.
+    # Local test: Uncomment the following two lines to use a built-in data set to test the pipeline. Remember to change the paths to match your own location.
+    # training_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx"
+    # application_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx"
+
+    # If the argument is False, hide all Python level warnings. Developers can turn it on by setting the argument to True.
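For reviewers who want to exercise the renamed entry point without touching the local-test block above, a minimal sketch (not part of the patch; both file names are placeholders for your own data sets):

```python
# Hypothetical quick check of the renamed signature; paths are placeholders.
from geochemistrypi.data_mining.cli_pipeline import cli_pipeline

cli_pipeline(
    training_data_path="my_training_data.xlsx",
    application_data_path="my_application_data.xlsx",  # optional: omit to skip model application
)
```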
    show_warning(False)
    os.makedirs(OUTPUT_PATH, exist_ok=True)
@@ -85,22 +89,22 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
            print("[bold red]No Training Data File Provided![/bold red]")
            print("[bold green]Built-in Data Loading.[/bold green]")
-    # <-- User Inference Data Loading -->
-    with console.status("[bold green]Inference Data Loading...[/bold green]", spinner="dots"):
+    # <-- User Application Data Loading -->
+    with console.status("[bold green]Application Data Loading...[/bold green]", spinner="dots"):
        sleep(0.75)
        is_built_in_inference_data = False
-        if training_data_path and inference_data_path:
+        if training_data_path and application_data_path:
            # If the user provides file name, then load the inference data from the file.
-            inference_data = read_data(file_path=inference_data_path, is_own_data=1)
-            print("[bold green]Successfully Loading Own Inference Data![bold green]")
-        elif training_data_path and (not inference_data_path):
+            inference_data = read_data(file_path=application_data_path, is_own_data=1)
+            print("[bold green]Successfully Loading Own Application Data![/bold green]")
+        elif training_data_path and (not application_data_path):
            # If the user doesn't provide the inference data path, it means that the user doesn't want to run the model inference.
            inference_data = None
-            print("[bold red]No Inference Data File Provided![/bold red]")
-        elif (not training_data_path) and (not inference_data_path):
+            print("[bold red]No Application Data File Provided![/bold red]")
+        elif (not training_data_path) and (not application_data_path):
            is_built_in_inference_data = True
-            print("[bold red]No Inference Data File Provided![/bold red]")
-            print("[bold green]Built-in Inference Data Loading.[/bold green]")
+            print("[bold red]No Application Data File Provided![/bold red]")
+            print("[bold green]Built-in Application Data Loading.[/bold green]")
    # <-- Dependency Checking -->
    with console.status("[bold green]Denpendency Checking...[/bold green]", spinner="dots"):
@@ -194,23 +198,23 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
        show_data_columns(data.columns)
        clear_output()
-    # <--- Built-in Inference Data Loading --->
-    logger.debug("Built-in Inference Data Loading")
+    # <--- Built-in Application Data Loading --->
+    logger.debug("Built-in Application Data Loading")
    # If the user doesn't provide training data path and inference data path, then use the built-in inference data.
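The loading messages above use rich's console markup, where every opening tag needs a matching `[/...]` closing tag, hence the `[/bold green]` fix on the `+` line. A self-contained sketch of the status-spinner pattern, assuming only that rich is installed:

```python
from time import sleep

from rich.console import Console

console = Console()
with console.status("[bold green]Application Data Loading...[/bold green]", spinner="dots"):
    sleep(0.75)  # stand-in for the real loading work
console.print("[bold green]Successfully Loading Own Application Data![/bold green]")
```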
    if is_built_in_inference_data:
-        print("-*-*- Built-in Inference Data Option-*-*-")
+        print("-*-*- Built-in Application Data Option -*-*-")
        num2option(TEST_DATA_OPTION)
        built_in_inference_data_num = limit_num_input(TEST_DATA_OPTION, SECTION[0], num_input)
        if built_in_inference_data_num == 1:
-            inference_data_path = "InferenceData_Regression.xlsx"
+            application_data_path = "InferenceData_Regression.xlsx"
        elif built_in_inference_data_num == 2:
-            inference_data_path = "InferenceData_Classification.xlsx"
+            application_data_path = "InferenceData_Classification.xlsx"
        elif built_in_inference_data_num == 3:
-            inference_data_path = "InferenceData_Clustering.xlsx"
+            application_data_path = "InferenceData_Clustering.xlsx"
        elif built_in_inference_data_num == 4:
-            inference_data_path = "InferenceData_Decomposition.xlsx"
-        inference_data = read_data(file_path=inference_data_path)
-        print(f"Successfully loading the built-in inference data set '{inference_data_path}'.")
+            application_data_path = "InferenceData_Decomposition.xlsx"
+        inference_data = read_data(file_path=application_data_path)
+        print(f"Successfully loading the built-in application data set '{application_data_path}'.")
        show_data_columns(inference_data.columns)
        clear_output()
@@ -260,14 +264,14 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
    # 2. Don't drop the rows with missing values, before implementing the model inference, the inference data set should be imputed as well.
    # Because dropping the rows with missing values use pandas.DataFrame.dropna() method, while imputing the missing values use sklearn.impute.SimpleImputer() method.
    drop_rows_with_missing_value_flag = False
-    clear_output()
+    # clear_output()
    if missing_value_flag:
+        clear_output()
        # Ask the user whether to use imputation techniques to deal with the missing values.
-        print("-*-*- Missing Values Process-*-*-")
+        print("-*-*- Missing Values Process -*-*-")
        print("Do you want to deal with the missing values?")
        num2option(OPTION)
        is_process_missing_value = limit_num_input(OPTION, SECTION[1], num_input)
-        clear_output()
        if is_process_missing_value == 1:
            process_missing_value_flag = True
            # If the user wants to deal with the missing values, then ask the user which strategy to use.
@@ -279,6 +283,8 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
            if missing_value_strategy_num == 1:
                # Drop the rows with missing values
                data_selected_dropped = data_selected.dropna()
+                # Reset the index of the data set after dropping the rows with missing values.
+                data_selected_dropped = data_selected_dropped.reset_index(drop=True)
                print("Successfully drop the rows with missing values.")
                print("The Selected Data Set After Dropping:")
                print(data_selected_dropped)
@@ -295,10 +301,12 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
            # Don't deal with the missing values, which means neither drop the rows with missing values nor use imputation techniques.
            imputed_flag = False
            save_data(data_selected, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+            clear_output()
    else:
        # If the selected data set doesn't have missing values, then don't deal with the missing values.
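The `reset_index(drop=True)` added above matters because `pandas.DataFrame.dropna()` keeps the original row labels; the resulting gaps can misalign later concatenations with freshly indexed results. A minimal illustration with toy oxide values:

```python
import pandas as pd

df = pd.DataFrame({"SiO2": [45.3, None, 51.0], "Al2O3": [17.1, 13.2, 14.8]})
dropped = df.dropna()                     # row labels keep a gap: [0, 2]
dropped = dropped.reset_index(drop=True)  # contiguous labels [0, 1], as in the patch
print(dropped)
```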
        imputed_flag = False
        save_data(data_selected, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+        clear_output()
    data_selected = data_selected_dropped if drop_rows_with_missing_value_flag else data_selected
    # If the selected data set contains missing values and the user wants to deal with the missing values and choose not to drop the rows with missing values,
    # then use imputation techniques to deal with the missing values.
@@ -368,10 +376,10 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
    # <--- Data Segmentation --->
    # divide X and y data set when it is supervised learning
-    logger.debug("Data Split")
+    logger.debug("Data Division")
    if mode_num == 1 or mode_num == 2:
        # Supervised learning
-        print("-*-*- Data Split - X Set and Y Set -*-*-")
+        print("-*-*- Data Segmentation - X Set and Y Set -*-*-")
        print("Divide the processing data set into X (feature value) and Y (target value) respectively.")
        # create X data set
        print("Selected sub data set to create X data set:")
@@ -386,6 +394,22 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
        save_data(X, "X Without Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
        clear_output()
+        # Create Y data set
+        print("-*-*- Data Segmentation - X Set and Y Set -*-*-")
+        print("Selected sub data set to create Y data set:")
+        show_data_columns(data_selected_imputed_fe.columns)
+        print("The selected Y data set:")
+        print("Notice: Normally, please choose only one column to be tag column Y, not multiple columns.")
+        print("Notice: For classification model training, please choose the label column which has distinctive integers.")
+        y = create_sub_data_set(data_selected_imputed_fe)
+        print("Successfully create Y data set.")
+        print("The Selected Data Set:")
+        print(y)
+        print("Basic Statistical Information: ")
+        basic_statistic(y)
+        save_data(y, "Y", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+        clear_output()
+
        # <--- Feature Scaling --->
        print("-*-*- Feature Scaling on X Set -*-*-")
        num2option(OPTION)
@@ -406,24 +430,8 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
            feature_scaling_config = {}
        clear_output()
-        # Create Y data set
-        print("-*-*- Data Split - X Set and Y Set-*-*-")
-        print("Selected sub data set to create Y data set:")
-        show_data_columns(data_selected_imputed_fe.columns)
-        print("The selected Y data set:")
-        print("Notice: Normally, please choose only one column to be tag column Y, not multiple columns.")
-        print("Notice: For classification model training, please choose the label column which has distinctive integers.")
-        y = create_sub_data_set(data_selected_imputed_fe)
-        print("Successfully create Y data set.")
-        print("The Selected Data Set:")
-        print(y)
-        print("Basic Statistical Information: ")
-        basic_statistic(y)
-        save_data(y, "Y", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
-        clear_output()
-
        # <--- Feature Selection --->
-        print("-*-*- Feature Selection -*-*-")
+        print("-*-*- Feature Selection on X Set -*-*-")
        num2option(OPTION)
        is_feature_selection = limit_num_input(OPTION, SECTION[1], num_input)
        if is_feature_selection == 1:
@@ -535,7 +543,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
    is_inference = False
    # If the model is supervised learning, then allow the user to use model inference.
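The reordering above moves the Y-set creation ahead of feature scaling so that only X is ever scaled. A hedged sketch of that ordering with stand-in column names (the pipeline's own `create_sub_data_set` is interactive, so plain pandas selection is used here instead):

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

data = pd.DataFrame({"SiO2": [45.3, 51.0, 48.2], "Label": [0, 1, 1]})
X, y = data[["SiO2"]], data[["Label"]]  # segment X and Y first
X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)  # scale X only; y stays untouched
```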
if mode_num == 1 or mode_num == 2: - print("-*-*- Feature Engineering on Inference Data -*-*-") + print("-*-*- Feature Engineering on Application Data -*-*-") is_inference = True selected_columns = X_train.columns if inference_data is not None: @@ -545,15 +553,15 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N new_feature_builder = FeatureConstructor(inference_data) inference_data_fe = new_feature_builder.batch_build(feature_engineering_config) inference_data_fe_selected = inference_data_fe[selected_columns] - save_data(inference_data, "Inference Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) - save_data(inference_data_fe, "Inference Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) - save_data(inference_data_fe_selected, "Inference Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data, "Application Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data_fe, "Application Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data_fe_selected, "Application Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) else: print("You have not applied feature engineering to the training data.") print("Hence, no feature engineering operation will be applied to the inference data.") inference_data_fe_selected = inference_data[selected_columns] - save_data(inference_data, "Inference Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) - save_data(inference_data_fe_selected, "Inference Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data, "Application Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data_fe_selected, "Application Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) else: # If the user doesn't provide the inference data path, it means that the user doesn't want to run the model inference. 
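The `selected_columns = X_train.columns` indexing above is the schema contract between training and application data: selecting with the training columns reorders features, drops extras, and raises a `KeyError` on a missing column. A toy illustration:

```python
import pandas as pd

X_train = pd.DataFrame({"SiO2": [45.3], "Al2O3": [17.1]})
application = pd.DataFrame({"Al2O3": [13.2], "SiO2": [51.0], "MgO": [7.6]})
aligned = application[X_train.columns]  # columns reordered to SiO2, Al2O3; MgO dropped
```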
print("You did not enter inference data.") @@ -596,7 +604,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N if drop_rows_with_missing_value_flag: inference_data_fe_selected_dropped = inference_data_fe_selected.dropna() model_inference(inference_data_fe_selected_dropped, is_inference, run, transformer_config, transform_pipeline) - save_data(inference_data_fe_selected_dropped, "Inference Data Feature-Engineering Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data_fe_selected_dropped, "Application Data Feature-Engineering Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) else: model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline) clear_output() @@ -633,7 +641,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N if drop_rows_with_missing_value_flag: inference_data_fe_selected_dropped = inference_data_fe_selected.dropna() model_inference(inference_data_fe_selected_dropped, is_inference, run, transformer_config, transform_pipeline) - save_data(inference_data_fe_selected_dropped, "Inference Data Feature-Engineering Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data_fe_selected_dropped, "Application Data Feature-Engineering Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) else: model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline) clear_output() diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py index 2b8d1fc6..8686c72f 100644 --- a/geochemistrypi/data_mining/constants.py +++ b/geochemistrypi/data_mining/constants.py @@ -100,6 +100,6 @@ CUSTOMIZE_LABEL_STRATEGY = ["Automatic Coding", "Custom Numeric Labels", "Custom Non-numeric Labels"] -FEATURE_SELECTION_STRATEGY = ["GenericUnivariateSelect", "SelectKBest"] +FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"] CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"] diff --git a/geochemistrypi/data_mining/data/inference.py b/geochemistrypi/data_mining/data/inference.py index 0ebd6bce..db78c434 100644 --- a/geochemistrypi/data_mining/data/inference.py +++ b/geochemistrypi/data_mining/data/inference.py @@ -129,7 +129,7 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: objec """ # If is_inference is True, then run the model inference. if is_inference is True: - print("Use the trained model to make predictions on the inference data.") + print("Use the trained model to make predictions on the application data.") # If transformer_config is not {}, then transform the inference data with the transform pipeline. 
if transformer_config: inference_data_transformed = transform_pipeline.transform(inference_data) @@ -139,4 +139,4 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: objec inference_data_predicted_np = loaded_model.predict(inference_data_transformed) inference_data_predicted = np2pd(inference_data_predicted_np, ["Predicted Value"]) GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH") - save_data(inference_data_predicted, "Inference Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data_predicted, "Application Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) diff --git a/geochemistrypi/data_mining/data/preprocessing.py b/geochemistrypi/data_mining/data/preprocessing.py index 4587c7c8..95a6a7a3 100644 --- a/geochemistrypi/data_mining/data/preprocessing.py +++ b/geochemistrypi/data_mining/data/preprocessing.py @@ -73,7 +73,7 @@ def feature_selector(X: pd.DataFrame, y: pd.DataFrame, feature_selection_task: i X_selected : pd.DataFrame The feature dataset after selecting. """ - print("--Original Features-") + print("-- Original Features --") show_data_columns(X.columns) features_num = len(X.columns) @@ -85,9 +85,9 @@ def feature_selector(X: pd.DataFrame, y: pd.DataFrame, feature_selection_task: i elif feature_selection_task == 2: score_func = f_classif - if method[method_idx] == "GenericUnivariateSelect": + if method[method_idx] == "Generic Univariate Select": selector = GenericUnivariateSelect(score_func=score_func, mode="k_best", param=features_retain_num) - elif method[method_idx] == "SelectKBest": + elif method[method_idx] == "Select K Best": selector = SelectKBest(score_func=score_func, k=features_retain_num) try: diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py index 910f34d3..c2e3f280 100644 --- a/geochemistrypi/data_mining/model/classification.py +++ b/geochemistrypi/data_mining/model/classification.py @@ -533,6 +533,10 @@ def __init__( self.decision_function_shape = decision_function_shape self.break_ties = break_ties + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = SVC( C=self.C, kernel=self.kernel, @@ -784,6 +788,10 @@ def __init__( self.class_weight = class_weight self.ccp_alpha = ccp_alpha + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = DecisionTreeClassifier( criterion=self.criterion, splitter=self.splitter, @@ -916,7 +924,7 @@ def __init__( bootstrap: bool = True, oob_score: bool = False, n_jobs: Optional[int] = -1, - random_state: Optional[int] = 42, + random_state: Optional[int] = None, verbose: int = 0, warm_start: bool = False, class_weight: Union[str, dict, list[dict], None] = None, @@ -1118,13 +1126,16 @@ def __init__( self.bootstrap = bootstrap self.oob_score = oob_score self.n_jobs = n_jobs - self.random_state = random_state self.verbose = verbose self.warm_start = warm_start self.class_weight = class_weight self.ccp_alpha = ccp_alpha self.max_samples = max_samples + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = RandomForestClassifier( n_estimators=self.n_estimators, criterion=self.criterion, @@ -1449,7 +1460,6 @@ def __init__( 
self.base_score = base_score self.missing = missing self.num_parallel_tree = num_parallel_tree - self.random_state = random_state self.n_jobs = n_jobs self.monotone_constraints = monotone_constraints self.interaction_constraints = interaction_constraints @@ -1460,9 +1470,14 @@ def __init__( self.enable_categorical = enable_categorical self.eval_metric = eval_metric self.early_stopping_rounds = early_stopping_rounds + if kwargs: self.kwargs = kwargs + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = xgboost.XGBClassifier( n_estimators=self.n_estimators, objective=self.objective, @@ -1750,17 +1765,19 @@ def __init__( self.fit_intercept = fit_intercept self.intercept_scaling = intercept_scaling self.class_weight = class_weight - self.random_state = random_state self.solver = solver self.max_iter = max_iter self.multi_class = multi_class self.n_jobs = n_jobs - self.random_state = random_state self.verbose = verbose self.warm_start = warm_start self.class_weight = class_weight self.l1_ratio = l1_ratio + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = LogisticRegression( penalty=self.penalty, dual=self.dual, @@ -2046,7 +2063,6 @@ def __init__( self.power_t = (power_t,) self.max_iter = (max_iter,) self.shuffle = (shuffle,) - self.random_state = (random_state,) self.tol = (tol,) self.verbose = (verbose,) self.warm_start = (warm_start,) @@ -2060,6 +2076,12 @@ def __init__( self.n_iter_no_change = (n_iter_no_change,) self.max_fun = (max_fun,) + if random_state: + self.random_state = (random_state,) + else: + self.random_state = (self.random_state,) + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = MLPClassifier( hidden_layer_sizes=self.hidden_layer_sizes[0], activation=self.activation[0], @@ -2394,13 +2416,16 @@ def __init__( self.bootstrap = bootstrap self.oob_score = oob_score self.n_jobs = n_jobs - self.random_state = random_state self.verbose = verbose self.warm_start = warm_start self.class_weight = class_weight self.ccp_alpha = ccp_alpha self.max_samples = max_samples + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = ExtraTreesClassifier( n_estimators=self.n_estimators, criterion=self.criterion, @@ -2715,7 +2740,6 @@ def __init__( self.init = (init,) self.subsample = (subsample,) self.max_features = (max_features,) - self.random_state = (random_state,) self.verbose = (verbose,) self.max_leaf_nodes = (max_leaf_nodes,) self.min_impurity_decrease = (min_impurity_decrease,) @@ -2725,6 +2749,12 @@ def __init__( self.tol = (tol,) self.ccp_alpha = (ccp_alpha,) + if random_state: + self.random_state = (random_state,) + else: + self.random_state = (self.random_state,) + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = GradientBoostingClassifier( loss=self.loss[0], learning_rate=self.learning_rate[0], @@ -3218,7 +3248,6 @@ def __init__( self.verbose = verbose self.epsilon = epsilon self.n_jobs = n_jobs - self.random_state = random_state self.learning_rate = learning_rate self.eta0 = eta0 self.power_t = power_t @@ -3229,6 +3258,10 @@ def __init__( self.warm_start = warm_start self.average = average + if random_state: + self.random_state = random_state + + # 
If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = SGDClassifier( loss=self.loss, penalty=self.penalty, diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py index 8aeb3cd0..99098c6b 100644 --- a/geochemistrypi/data_mining/model/clustering.py +++ b/geochemistrypi/data_mining/model/clustering.py @@ -64,10 +64,10 @@ def _score(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, store_ mlflow.log_metrics(scores) @staticmethod - def _scatter2d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _scatter2d(data: pd.DataFrame, labels: pd.DataFrame, cluster_centers_: np.ndarray, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the two-dimensional diagram of the clustering result.""" print("-----* Cluster Two-Dimensional Diagram *-----") - scatter2d(data, labels, algorithm_name) + scatter2d(data, labels, cluster_centers_, algorithm_name) save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) data_with_labels = pd.concat([data, labels], axis=1) save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) @@ -118,6 +118,7 @@ def common_components(self) -> None: self._scatter2d( data=two_dimen_data, labels=self.clustering_result["clustering result"], + cluster_centers_=self.get_cluster_centers(), algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -138,6 +139,7 @@ def common_components(self) -> None: self._scatter2d( data=two_dimen_data, labels=self.clustering_result["clustering result"], + cluster_centers_=self.get_cluster_centers(), algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -155,6 +157,7 @@ def common_components(self) -> None: self._scatter2d( data=self.X, labels=self.clustering_result["clustering result"], + cluster_centers_=self.get_cluster_centers(), algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -277,10 +280,13 @@ def __init__( self.tol = tol self.n_init = n_init self.verbose = verbose - self.random_state = random_state self.copy_x = copy_x self.algorithm = algorithm + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = KMeans( n_clusters=self.n_clusters, init=self.init, @@ -438,16 +444,20 @@ def __init__( self.verbose = verbose self.preference = preference self.affinity = affinity - self.random_state = random_state + + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = AffinityPropagation( damping=self.damping, max_iter=self.max_iter, convergence_iter=self.convergence_iter, copy=self.copy, - preference=None, - affinity="euclidean", - verbose=False, - random_state=None, + preference=self.preference, + affinity=self.affinity, + verbose=self.verbose, + random_state=self.random_state, ) self.naming = AffinityPropagationClustering.name diff --git a/geochemistrypi/data_mining/model/decomposition.py b/geochemistrypi/data_mining/model/decomposition.py index d9490456..ab954230 100644 --- 
a/geochemistrypi/data_mining/model/decomposition.py +++ b/geochemistrypi/data_mining/model/decomposition.py @@ -189,8 +189,11 @@ def __init__( self.iterated_power = iterated_power # self.n_oversamples = n_oversamples # self.power_iteration_normalizer = power_iteration_normalizer - self.random_state = random_state + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = PCA( n_components=self.n_components, copy=self.copy, @@ -488,12 +491,15 @@ def __init__( self.metric_params = metric_params self.init = init self.verbose = verbose - self.random_state = random_state self.method = method self.angle = angle self.n_jobs = n_jobs self.square_distances = square_distances + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = TSNE( n_components=self.n_components, perplexity=self.perplexity, @@ -618,10 +624,13 @@ def __init__( self.verbose = verbose self.eps = eps self.n_jobs = n_jobs - self.random_state = random_state self.dissimilarity = dissimilarity # self.normalized_stress = normalized_stress + if random_state: + self.random_state = random_state + + # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase' self.model = MDS( n_components=self.n_components, metric=self.metric, diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_extra_trees.py b/geochemistrypi/data_mining/model/func/algo_classification/_extra_trees.py index e4a0eb2b..75ae43f3 100644 --- a/geochemistrypi/data_mining/model/func/algo_classification/_extra_trees.py +++ b/geochemistrypi/data_mining/model/func/algo_classification/_extra_trees.py @@ -34,7 +34,7 @@ def extra_trees_manual_hyper_parameters() -> Dict: "Bootstrap: Whether bootstrap samples are used when building trees. Bootstrapping is a technique where a random subset of the data is sampled with replacement to create a new dataset" " of the same size as the original. This new dataset is then used to construct a decision tree in the ensemble. If False, the whole dataset is used to build each tree." ) - print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it set to True.") + print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it as True.") bootstrap = bool_input(SECTION[2]) max_samples = None if bootstrap: @@ -45,7 +45,7 @@ def extra_trees_manual_hyper_parameters() -> Dict: "oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy. When the oob_score hyperparameter is set to True, Extra Trees will use a random subset of the data" " to train each decision tree in the ensemble, and the remaining data that was not used for training (the out-of-bag samples) will be used to calculate the OOB score. " ) - print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it set to True.") + print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. 
It is generally recommended to leave it as True.") oob_score = bool_input(SECTION[2]) hyper_parameters = { "n_estimators": n_estimators, diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_logistic_regression.py b/geochemistrypi/data_mining/model/func/algo_classification/_logistic_regression.py index 432ddbf7..5ff1f9d5 100644 --- a/geochemistrypi/data_mining/model/func/algo_classification/_logistic_regression.py +++ b/geochemistrypi/data_mining/model/func/algo_classification/_logistic_regression.py @@ -17,7 +17,7 @@ def logistic_regression_manual_hyper_parameters() -> Dict: hyper_parameters : dict """ print("Penalty: This hyperparameter specifies the norm used in the penalization.") - print("Please specify the norm used in the penalization. It is generally recommended to leave it set to l2.") + print("Please specify the norm used in the penalization. It is generally recommended to leave it as 'l2'.") penalties = ["l1", "l2", "elasticnet", "None"] penalty = str_input(penalties, SECTION[2]) if penalty == "None": @@ -28,12 +28,12 @@ def logistic_regression_manual_hyper_parameters() -> Dict: l1_ratio = None if penalty == "l1": print("Solver: This hyperparameter specifies the algorithm to use in the optimization problem.") - print("Please specify the algorithm to use in the optimization problem. It is generally recommended to leave it set to liblinear.") + print("Please specify the algorithm to use in the optimization problem. It is generally recommended to leave it as 'liblinear'.") solvers = ["liblinear", "saga"] solver = str_input(solvers, SECTION[2]) elif penalty == "l2" or penalty == "none": print("Solver: This hyperparameter specifies the algorithm to use in the optimization problem.") - print("Please specify the algorithm to use in the optimization problem. It is generally recommended to leave it set to lbfgs.") + print("Please specify the algorithm to use in the optimization problem. It is generally recommended to leave it as 'lbfgs'.") solvers = ["newton-cg", "lbfgs", "sag", "saga"] solver = str_input(solvers, SECTION[2]) elif penalty == "elasticnet": @@ -48,7 +48,7 @@ def logistic_regression_manual_hyper_parameters() -> Dict: "Class Weight: This hyperparameter specifies the weights associated with classes. It can be set to 'balanced'" " to automatically adjust the weights inversely proportional to the class frequencies in the input data." ) - print("Please specify the weights associated with classes. It is generally recommended to leave it set to None.") + print("Please specify the weights associated with classes. It is generally recommended to leave it as None.") class_weights = ["None", "balanced"] class_weight = str_input(class_weights, SECTION[2]) if class_weight == "None": diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_multi_layer_perceptron.py b/geochemistrypi/data_mining/model/func/algo_classification/_multi_layer_perceptron.py index e7769eb5..8d805288 100644 --- a/geochemistrypi/data_mining/model/func/algo_classification/_multi_layer_perceptron.py +++ b/geochemistrypi/data_mining/model/func/algo_classification/_multi_layer_perceptron.py @@ -18,18 +18,18 @@ def multi_layer_perceptron_manual_hyper_parameters() -> Dict: print("Please specify the size of hidden layer and the number of neurons in the each hidden layer.") hidden_layer = tuple_input((50, 25, 5), SECTION[2], "@Hidden Layer Sizes: ") print("Activation: Activation function for the hidden layer.") - print("Please specify the activation function for the hidden layer. 
It is generally recommended to leave it set to ReLU.") + print("Please specify the activation function for the hidden layer. It is generally recommended to leave it as 'ReLU'.") activations = ["identity", "logistic", "tanh", "relu"] activation = str_input(activations, SECTION[2]) print("Solver: The solver for weight optimization.") - print("Please specify the solver for weight optimization. It is generally recommended to leave it set to Adam.") + print("Please specify the solver for weight optimization. It is generally recommended to leave it as 'Adam'.") solvers = ["lbfgs", "sgd", "adam"] solver = str_input(solvers, SECTION[2]) print("Alpha: L2 penalty (regularization term) parameter.") print("Please specify the L2 penalty (regularization term) parameter. A good starting range could be between 0.0001 and 10, such as 0.0001.") alpha = float_input(0.0001, SECTION[2], "@Alpha: ") print("Learning Rate: It controls the step-size in updating the weights.") - print("Please specify the learning rate. It is generally recommended to leave it set to Adaptive.") + print("Please specify the learning rate. It is generally recommended to leave it as 'Adaptive'.") learning_rates = ["constant", "invscaling", "adaptive"] learning_rate = str_input(learning_rates, SECTION[2]) print("Max Iterations: Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations.") diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_rf.py b/geochemistrypi/data_mining/model/func/algo_classification/_rf.py index 6d73e305..6de6a97d 100644 --- a/geochemistrypi/data_mining/model/func/algo_classification/_rf.py +++ b/geochemistrypi/data_mining/model/func/algo_classification/_rf.py @@ -35,7 +35,7 @@ def random_forest_manual_hyper_parameters() -> Dict: "Bootstrap: Whether bootstrap samples are used when building trees. Bootstrapping is a technique where a random subset of the data is sampled with replacement" " to create a new dataset of the same size as the original. This new dataset is then used to construct a decision tree in the ensemble. If False, the whole dataset is used to build each tree." ) - print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it set to True.") + print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it as True.") bootstrap = bool_input(SECTION[2]) max_samples = None if bootstrap: @@ -46,7 +46,7 @@ def random_forest_manual_hyper_parameters() -> Dict: "oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy. When the oob_score hyperparameter is set to True, Extra Trees will use a random subset of the data" " to train each decision tree in the ensemble, and the remaining data that was not used for training (the out-of-bag samples) will be used to calculate the OOB score. " ) - print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it set to True.") + print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. 
It is generally recommended to leave it as True.")
    oob_score = bool_input(SECTION[2])
    hyper_parameters = {
        "n_estimators": n_estimators,
diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_svc.py b/geochemistrypi/data_mining/model/func/algo_classification/_svc.py
index 45cb16cf..a58634a4 100644
--- a/geochemistrypi/data_mining/model/func/algo_classification/_svc.py
+++ b/geochemistrypi/data_mining/model/func/algo_classification/_svc.py
@@ -15,7 +15,7 @@ def svc_manual_hyper_parameters() -> Dict:
    hyper_parameters : dict
    """
    print("Kernel: This hyperparameter specifies the kernel function to be used for mapping the input data to a higher-dimensional feature space.")
-    print("Please specify the kernel type to be used in the algorithm. It is generally recommended to leave it set to Radial basis function (RBF) kernel.")
+    print("Please specify the kernel type to be used in the algorithm. It is generally recommended to leave it as 'Radial Basis Function (RBF) Kernel'.")
    kernels = ["linear", "poly", "rbf", "sigmoid"]
    kernel = str_input(kernels, SECTION[2])
    degree = None
@@ -41,7 +41,7 @@ def svc_manual_hyper_parameters() -> Dict:
    C = float_input(1, SECTION[2], "@C: ")
    print("Shrinking: This hyperparameter specifies whether to use the shrinking heuristic.")
    print("The shrinking heuristic is a technique that speeds up the training process by only considering the support vectors in the decision function.")
-    print("Please specify whether to use the shrinking heuristic. It is generally recommended to leave it set to True.")
+    print("Please specify whether to use the shrinking heuristic. It is generally recommended to leave it as True.")
    shrinking = bool_input(SECTION[2])
    hyper_parameters = {"kernel": kernel, "C": C, "shrinking": shrinking}
    if not degree:
diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_common.py b/geochemistrypi/data_mining/model/func/algo_clustering/_common.py
index 855d6a3f..75848442 100644
--- a/geochemistrypi/data_mining/model/func/algo_clustering/_common.py
+++ b/geochemistrypi/data_mining/model/func/algo_clustering/_common.py
@@ -38,22 +38,25 @@ def score(data: pd.DataFrame, labels: pd.DataFrame) -> Dict:
    return scores
-def scatter2d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str) -> None:
+def scatter2d(data: pd.DataFrame, labels: pd.DataFrame, cluster_centers_: np.ndarray, algorithm_name: str) -> None:
    """
    Draw the result-2D diagram for analysis.
    Parameters
    ----------
    data : pd.DataFrame (n_samples, n_components)
-        The true values.
+        The features of the data.
    labels : pd.DataFrame (n_samples,)
        Labels of each point.
+    cluster_centers_: np.ndarray (n_clusters, n_features)
+        Coordinates of cluster centers. If the algorithm stops before fully converging (see tol and max_iter), these will not be consistent with labels_.
+ algorithm_name : str the name of the algorithm """ - markers = ["+", "v", ".", "d", "o", "s", "1", "D", "X", "^", "p", "<", "*", "H", "3", "P"] + # markers = ["+", "v", ".", "d", "o", "s", "1", "D", "X", "^", "p", "<", "*", "H", "3", "P"] colors = [ "#1f77b4", "#ff7f0e", @@ -77,17 +80,29 @@ def scatter2d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str) -> "#bcbd22", ] - marker_cycle = cycle(markers) + # marker_cycle = cycle(markers) color_cycle = cycle(colors) fig = plt.figure() fig.set_size_inches(18, 10) plt.subplot(111) + # Plot the data for i, label in enumerate(set(labels)): cluster_data = data[labels == label] color = next(color_cycle) - marker = next(marker_cycle) + # marker = next(marker_cycle) + marker = "." plt.scatter(cluster_data.iloc[:, 0], cluster_data.iloc[:, 1], c=color, marker=marker) + + # Plot the cluster centers + if not isinstance(cluster_centers_, str): + # Draw white circles at cluster centers + plt.scatter(cluster_centers_[:, 0], cluster_centers_[:, 1], c="white", marker="o", alpha=1, s=200, edgecolor="k") + + # Label the cluster centers + for i, c in enumerate(cluster_centers_): + plt.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k") + plt.xlabel(f"{data.columns[0]}") plt.ylabel(f"{data.columns[1]}") plt.title(f"Cluster Data Bi-plot - {algorithm_name}") @@ -100,7 +115,7 @@ def scatter3d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str) -> Parameters ---------- data : pd.DataFrame (n_samples, n_components) - The true values. + The features of the data. labels : pd.DataFrame (n_samples,) Labels of each point. @@ -113,15 +128,16 @@ def scatter3d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str) -> fig = plt.figure(figsize=(12, 6), facecolor="w") plt.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.9) + # Plot the data without cluster results ax = fig.add_subplot(121, projection="3d") - ax.scatter(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2], alpha=0.3, c="#FF0000", s=6) + ax.scatter(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2], alpha=0.3, c="#FF0000", marker=".") ax.set_xlabel(namelist[0]) ax.set_ylabel(namelist[1]) ax.set_zlabel(namelist[2]) plt.grid(True) ax2 = fig.add_subplot(122, projection="3d") - markers = ["+", "v", ".", "d", "o", "s", "1", "D", "X", "^", "p", "<", "*", "H", "3", "P"] + # markers = ["+", "v", ".", "d", "o", "s", "1", "D", "X", "^", "p", "<", "*", "H", "3", "P"] colors = [ "#1f77b4", "#ff7f0e", @@ -144,13 +160,15 @@ def scatter3d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str) -> "#7f7f7f", "#bcbd22", ] - marker_cycle = cycle(markers) + # marker_cycle = cycle(markers) color_cycle = cycle(colors) + # Plot the data with cluster results for i, label in enumerate(set(labels)): cluster_data = data[labels == label] color = next(color_cycle) - marker = next(marker_cycle) + # marker = next(marker_cycle) + marker = "." 
ax2.scatter(cluster_data.iloc[:, 0], cluster_data.iloc[:, 1], cluster_data.iloc[:, 2], c=color, marker=marker, s=6, cmap=plt.cm.Paired, edgecolors="none") ax2.set_xlabel(namelist[0]) @@ -271,12 +289,10 @@ def plot_silhouette_diagram(data: pd.DataFrame, labels: pd.DataFrame, cluster_ce ax2.scatter(data.iloc[:, 0], data.iloc[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k") if not isinstance(cluster_centers_, str): - # Labeling the clusters - centers = cluster_centers_ # Draw white circles at cluster centers ax2.scatter( - centers[:, 0], - centers[:, 1], + cluster_centers_[:, 0], + cluster_centers_[:, 1], marker="o", c="white", alpha=1, @@ -284,7 +300,8 @@ def plot_silhouette_diagram(data: pd.DataFrame, labels: pd.DataFrame, cluster_ce edgecolor="k", ) - for i, c in enumerate(centers): + # Label the cluster centers + for i, c in enumerate(cluster_centers_): ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k") ax2.set_title("The visualization of the clustered data.") diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py index 003334bf..9918713b 100644 --- a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py +++ b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py @@ -16,17 +16,51 @@ def dbscan_manual_hyper_parameters() -> Dict: print("Eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.") print("Please specify the maximum distance. A good starting range could be between 0.1 and 1.0, such as 0.5.") eps = float_input(0.5, SECTION[2], "Eps: ") + print("Min Samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.") print("Please specify the number of samples. A good starting range could be between 5 and 20, such as 5.") min_samples = num_input(SECTION[2], "Min Samples: ") - print("Metric: The metric to use when calculating distance between instances in a feature array.") - print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it as 'euclidean'.") - metrics = ["euclidean", "manhattan", "chebyshev", "minkowski", "cosine", "correlation"] - metric = str_input(metrics, SECTION[2]) + print("Algorithm: The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors.") print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.") algorithms = ["auto", "ball_tree", "kd_tree", "brute"] algorithm = str_input(algorithms, SECTION[2]) + + # Reference: + # https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html + # https://scikit-learn.org/stable/modules/neighbors.html + print("Metric: The metric to use when calculating distance between instances in a feature array.") + print("Please specify the metric to use when calculating distance between instances in a feature array. 
It is generally recommended to leave it as 'euclidean'.") + if algorithm == "kd_tree": + metrics = ["euclidean", "l2", "minkowski", "p", "manhattan", "cityblock", "l1", "chebyshev", "infinity"] + elif algorithm == "ball_tree": + metrics = [ + "euclidean", + "l2", + "minkowski", + "p", + "manhattan", + "cityblock", + "l1", + "chebyshev", + "infinity", + "seuclidean", + "mahalanobis", + "hamming", + "canberra", + "braycurtis", + "jaccard", + "dice", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", + "haversine", + ] + else: + metrics = ["euclidean", "manhattan", "chebyshev", "minkowski", "cosine", "correlation"] + metric = str_input(metrics, SECTION[2]) + print("Leaf Size: Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree.") print("Please specify the leaf size. A good starting range could be between 10 and 30, such as 30.") leaf_size = num_input(SECTION[2], "Leaf Size: ") diff --git a/geochemistrypi/data_mining/model/func/algo_decomposition/_mds.py b/geochemistrypi/data_mining/model/func/algo_decomposition/_mds.py index 9a1abb9c..30ec1918 100644 --- a/geochemistrypi/data_mining/model/func/algo_decomposition/_mds.py +++ b/geochemistrypi/data_mining/model/func/algo_decomposition/_mds.py @@ -17,7 +17,7 @@ def mds_manual_hyper_parameters() -> Dict: print("Please specify the number of components to retain. A good starting range could be between 2 and 10, such as 4.") n_components = num_input(SECTION[2], "N Components: ") print("Metric: This parameter specifies the metric to be used when calculating distance between instances in a feature array.") - print("Please specify whether the metric is used when measuring the pairwise distances between data points in the input space. It is generally recommended to leave it set to True.") + print("Please specify whether the metric is used when measuring the pairwise distances between data points in the input space. It is generally recommended to leave it as True.") metric = bool_input(SECTION[2]) print("N Init: This parameter specifies the number of times the SMACOF algorithm will be run with different initializations.") print("Please specify the number of times. A good starting range could be between 1 and 10, such as 4.") diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_extra_tree.py b/geochemistrypi/data_mining/model/func/algo_regression/_extra_tree.py index d440b274..4476f892 100644 --- a/geochemistrypi/data_mining/model/func/algo_regression/_extra_tree.py +++ b/geochemistrypi/data_mining/model/func/algo_regression/_extra_tree.py @@ -35,7 +35,7 @@ def extra_trees_manual_hyper_parameters() -> Dict: "Bootstrap: Whether bootstrap samples are used when building trees. Bootstrapping is a technique where a random subset of the data is sampled with replacement" " to create a new dataset of the same size as the original. This new dataset is then used to construct a decision tree in the ensemble. If False, the whole dataset is used to build each tree." ) - print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it set to True.") + print("Please specify whether bootstrap samples are used when building trees. 
It is generally recommended to leave it as True.") bootstrap = bool_input(SECTION[2]) max_samples = None if bootstrap: @@ -46,7 +46,7 @@ def extra_trees_manual_hyper_parameters() -> Dict: "oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy. When the oob_score hyperparameter is set to True, Extra Trees will use a random subset of the data" " to train each decision tree in the ensemble, and the remaining data that was not used for training (the out-of-bag samples) will be used to calculate the OOB score. " ) - print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it set to True.") + print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it as True.") oob_score = bool_input(SECTION[2]) hyper_parameters = { "n_estimators": n_estimators, diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_lasso_regression.py b/geochemistrypi/data_mining/model/func/algo_regression/_lasso_regression.py index a8a1d293..9b450f19 100644 --- a/geochemistrypi/data_mining/model/func/algo_regression/_lasso_regression.py +++ b/geochemistrypi/data_mining/model/func/algo_regression/_lasso_regression.py @@ -18,7 +18,7 @@ def lasso_regression_manual_hyper_parameters() -> Dict: print("Please indicate the coefficient of alpha. A good starting range could be between 0.001 and 2, such as 1.") alpha = float_input(0.01, SECTION[2], "@Alpha: ") print("Fit Intercept: This hyperparameter represents whether the model is evaluated with constant terms.") - print("Please indicate whether there is a parameter entry. It is generally recommended to leave it set to True.") + print("Please indicate whether there is a parameter entry. It is generally recommended to leave it as True.") fit_intercept = bool_input(SECTION[2]) print("Max Iter: This hyperparameter represents the maximum number of iterations for the solver to converge.") print("Please indicate the maximum number of iterations. A good starting range could be between 1000 and 10000, such as 1000.") @@ -27,7 +27,7 @@ def lasso_regression_manual_hyper_parameters() -> Dict: print("Please indicate the tolerance. A good starting range could be between 0.0001 and 0.001, such as 0.0001.") tol = float_input(0.0001, SECTION[2], "@Tolerance: ") print("Selection: This hyperparameter represents the method of selecting the regularization coefficient.") - print("Please indicate the method of selecting the regularization coefficient. It is generally recommended to leave it set to 'cyclic'.") + print("Please indicate the method of selecting the regularization coefficient. It is generally recommended to leave it as 'cyclic'.") selections = ["cyclic", "random"] selection = str_input(selections, SECTION[2]) hyper_parameters = { diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_linear_regression.py b/geochemistrypi/data_mining/model/func/algo_regression/_linear_regression.py index ae806d03..2cd39529 100644 --- a/geochemistrypi/data_mining/model/func/algo_regression/_linear_regression.py +++ b/geochemistrypi/data_mining/model/func/algo_regression/_linear_regression.py @@ -19,7 +19,7 @@ def linear_regression_manual_hyper_parameters() -> Dict: hyper_parameters : dict """ print("Fit Intercept: This hyperparameter specifies whether to calculate the intercept (also called the bias term) for this model.") - print("Please specify whether to calculate the intercept for this model. 
It is generally recommended to leave it set to True.") + print("Please specify whether to calculate the intercept for this model. It is generally recommended to leave it as True.") fit_intercept = bool_input(SECTION[2]) hyper_parameters = {"fit_intercept": fit_intercept} return hyper_parameters diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_multi_layer_perceptron.py b/geochemistrypi/data_mining/model/func/algo_regression/_multi_layer_perceptron.py index e7769eb5..8d805288 100644 --- a/geochemistrypi/data_mining/model/func/algo_regression/_multi_layer_perceptron.py +++ b/geochemistrypi/data_mining/model/func/algo_regression/_multi_layer_perceptron.py @@ -18,18 +18,18 @@ def multi_layer_perceptron_manual_hyper_parameters() -> Dict: print("Please specify the size of hidden layer and the number of neurons in the each hidden layer.") hidden_layer = tuple_input((50, 25, 5), SECTION[2], "@Hidden Layer Sizes: ") print("Activation: Activation function for the hidden layer.") - print("Please specify the activation function for the hidden layer. It is generally recommended to leave it set to ReLU.") + print("Please specify the activation function for the hidden layer. It is generally recommended to leave it as 'ReLU'.") activations = ["identity", "logistic", "tanh", "relu"] activation = str_input(activations, SECTION[2]) print("Solver: The solver for weight optimization.") - print("Please specify the solver for weight optimization. It is generally recommended to leave it set to Adam.") + print("Please specify the solver for weight optimization. It is generally recommended to leave it as 'Adam'.") solvers = ["lbfgs", "sgd", "adam"] solver = str_input(solvers, SECTION[2]) print("Alpha: L2 penalty (regularization term) parameter.") print("Please specify the L2 penalty (regularization term) parameter. A good starting range could be between 0.0001 and 10, such as 0.0001.") alpha = float_input(0.0001, SECTION[2], "@Alpha: ") print("Learning Rate: It controls the step-size in updating the weights.") - print("Please specify the learning rate. It is generally recommended to leave it set to Adaptive.") + print("Please specify the learning rate. It is generally recommended to leave it as 'Adaptive'.") learning_rates = ["constant", "invscaling", "adaptive"] learning_rate = str_input(learning_rates, SECTION[2]) print("Max Iterations: Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations.") diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py b/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py index 63818221..55a0f13c 100644 --- a/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py +++ b/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py @@ -18,10 +18,10 @@ def polynomial_regression_manual_hyper_parameters() -> Dict: print("Please specify the degree of the polynomial features. A good starting range could be between 1 and 5, such as 2.") degree = num_input(SECTION[2], "@Degree: ") print("Interaction Only: This hyperparameter specifies whether to only include interaction features.") - print("Please specify whether to only include interaction features. It is generally recommended to leave it set to False.") + print("Please specify whether to only include interaction features. 
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py b/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py
index 63818221..55a0f13c 100644
--- a/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py
@@ -18,10 +18,10 @@ def polynomial_regression_manual_hyper_parameters() -> Dict:
     print("Please specify the degree of the polynomial features. A good starting range could be between 1 and 5, such as 2.")
     degree = num_input(SECTION[2], "@Degree: ")
     print("Interaction Only: This hyperparameter specifies whether to only include interaction features.")
-    print("Please specify whether to only include interaction features. It is generally recommended to leave it set to False.")
+    print("Please specify whether to only include interaction features. It is generally recommended to leave it as False.")
     interaction_only = bool_input(SECTION[2])
     print("Include Bias: This hyperparameter specifies whether to include a bias (also called the intercept) term in the model.")
-    print("Please specify whether to include a bias term in the model. It is generally recommended to leave it set to True.")
+    print("Please specify whether to include a bias term in the model. It is generally recommended to leave it as True.")
     include_bias = bool_input(SECTION[2])
     hyper_parameters = {"degree": degree, "interaction_only": interaction_only, "include_bias": include_bias}
     return hyper_parameters
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_rf.py b/geochemistrypi/data_mining/model/func/algo_regression/_rf.py
index 8cf08715..03bd7039 100644
--- a/geochemistrypi/data_mining/model/func/algo_regression/_rf.py
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_rf.py
@@ -35,7 +35,7 @@ def random_forest_manual_hyper_parameters() -> Dict:
         "Bootstrap: Whether bootstrap samples are used when building trees. Bootstrapping is a technique where a random subset of the data is sampled with replacement"
         " to create a new dataset of the same size as the original. This new dataset is then used to construct a decision tree in the ensemble. If False, the whole dataset is used to build each tree."
     )
-    print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it set to True.")
+    print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it as True.")
     bootstrap = bool_input(SECTION[2])
     max_samples = None
     if bootstrap:
@@ -46,7 +46,7 @@ def random_forest_manual_hyper_parameters() -> Dict:
         "oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy. When the oob_score hyperparameter is set to True, Extra Trees will use a random subset of the data"
         " to train each decision tree in the ensemble, and the remaining data that was not used for training (the out-of-bag samples) will be used to calculate the OOB score. "
     )
-    print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it set to True.")
+    print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it as True.")
     oob_score = bool_input(SECTION[2])
     hyper_parameters = {
         "n_estimators": n_estimators,
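The bootstrap and oob_score prompts above are coupled: scikit-learn only computes an out-of-bag estimate when bootstrapping is enabled, and `RandomForestRegressor` raises a `ValueError` if `oob_score=True` is combined with `bootstrap=False`. A small self-contained sketch on toy data, with assumed values:

```
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=5, random_state=42)
model = RandomForestRegressor(n_estimators=100, bootstrap=True, oob_score=True, random_state=42)
model.fit(X, y)
print(model.oob_score_)  # R^2 estimated on the out-of-bag samples
```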
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_svr.py b/geochemistrypi/data_mining/model/func/algo_regression/_svr.py
index 13bed965..0f120f5d 100644
--- a/geochemistrypi/data_mining/model/func/algo_regression/_svr.py
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_svr.py
@@ -15,7 +15,7 @@ def svr_manual_hyper_parameters() -> Dict:
     hyper_parameters : dict
     """
     print("Kernel: This hyperparameter specifies the kernel function to be used for mapping the input data to a higher-dimensional feature space.")
-    print("Please specify the kernel type to be used in the algorithm. It is generally recommended to leave it set to Radial basis function (RBF) kernel.")
+    print("Please specify the kernel type to be used in the algorithm. It is generally recommended to leave it as 'Radial Basis Function (RBF) Kernel'.")
     kernels = ["linear", "poly", "rbf", "sigmoid"]
     kernel = str_input(kernels, SECTION[2])
     degree = None
@@ -41,7 +41,7 @@ def svr_manual_hyper_parameters() -> Dict:
     C = float_input(1, SECTION[2], "@C: ")
     print("Shrinking: This hyperparameter specifies whether to use the shrinking heuristic.")
     print("The shrinking heuristic is a technique that speeds up the training process by only considering the support vectors in the decision function.")
-    print("Please specify whether to use the shrinking heuristic. It is generally recommended to leave it set to True.")
+    print("Please specify whether to use the shrinking heuristic. It is generally recommended to leave it as True.")
     shrinking = bool_input(SECTION[2])
     hyper_parameters = {"kernel": kernel, "C": C, "shrinking": shrinking}
     if not degree:
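As with the other option lists, the lowercase `"rbf"` entry is what scikit-learn's `SVR` expects; the prompt merely displays the kernel's full name. A minimal sketch with assumed values:

```
from sklearn.svm import SVR

model = SVR(kernel="rbf", C=1.0, shrinking=True)
```

The `degree = None` initialization and the trailing `if not degree:` guard suggest that `degree` is collected only when a kernel that uses it (polynomial) is chosen, so the returned dict omits it otherwise.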
diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py
index b17aacae..2c7d91f5 100644
--- a/geochemistrypi/data_mining/model/regression.py
+++ b/geochemistrypi/data_mining/model/regression.py
@@ -545,7 +545,6 @@ def __init__(
         self.base_score = base_score
         self.missing = missing
         self.num_parallel_tree = num_parallel_tree
-        self.random_state = random_state
         self.n_jobs = n_jobs
         self.monotone_constraints = monotone_constraints
         self.interaction_constraints = interaction_constraints
@@ -559,6 +558,10 @@ def __init__(
         if kwargs:
             self.kwargs = kwargs
 
+        if random_state:
+            self.random_state = random_state
+
+        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
         self.model = xgboost.XGBRegressor(
             n_estimators=self.n_estimators,
             objective=self.objective,
@@ -818,11 +821,16 @@ def __init__(
         self.min_samples_leaf = (min_samples_leaf,)
         self.min_weight_fraction_leaf = (min_weight_fraction_leaf,)
         self.max_features = (max_features,)
-        self.random_state = (random_state,)
         self.max_leaf_nodes = (max_leaf_nodes,)
         self.min_impurity_decrease = (min_impurity_decrease,)
-        self.ccp_alpha = ccp_alpha
+        self.ccp_alpha = (ccp_alpha,)
+
+        if random_state:
+            self.random_state = (random_state,)
+        else:
+            self.random_state = (self.random_state,)
+        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
 
         self.model = DecisionTreeRegressor(
             criterion=self.criterion[0],
             splitter=self.splitter[0],
@@ -834,7 +842,7 @@ def __init__(
             random_state=self.random_state[0],
             max_leaf_nodes=self.max_leaf_nodes[0],
             min_impurity_decrease=self.min_impurity_decrease[0],
-            ccp_alpha=self.ccp_alpha,
+            ccp_alpha=self.ccp_alpha[0],
         )
         self.naming = DecisionTreeRegression.name
         self.customized = True
@@ -1135,12 +1143,15 @@ def __init__(
         self.bootstrap = bootstrap
         self.oob_score = oob_score
         self.n_jobs = n_jobs
-        self.random_state = random_state
         self.verbose = verbose
         self.warm_start = warm_start
         self.ccp_alpha = ccp_alpha
         self.max_samples = max_samples
 
+        if random_state:
+            self.random_state = random_state
+
+        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
         self.model = ExtraTreesRegressor(
             n_estimators=self.n_estimators,
             criterion=self.criterion,
@@ -1246,7 +1257,7 @@ def __init__(
         bootstrap: bool = True,
         oob_score: bool = False,
         n_jobs: int = None,
-        random_state: int = None,
+        random_state: Optional[int] = None,
         verbose: int = 0,
         warm_start: bool = False,
         # class_weight=None,
@@ -1430,13 +1441,16 @@ def __init__(
         self.bootstrap = bootstrap
         self.oob_score = oob_score
         self.n_jobs = n_jobs
-        self.random_state = random_state
         self.verbose = verbose
         self.warm_start = warm_start
         # self.class_weight = class_weight
         self.ccp_alpha = ccp_alpha
         self.max_samples = max_samples
 
+        if random_state:
+            self.random_state = random_state
+
+        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
         self.model = RandomForestRegressor(
             n_estimators=self.n_estimators,
             criterion=self.criterion,
@@ -1875,7 +1889,6 @@ def __init__(
         self.learning_rate_init = learning_rate_init
         self.max_iter = max_iter
         self.shuffle = shuffle
-        self.random_state = random_state
         self.tol = tol
         self.verbose = verbose
         self.warm_start = warm_start
@@ -1886,6 +1899,10 @@ def __init__(
         self.epsilon = epsilon
         self.n_iter_no_change = n_iter_no_change
 
+        if random_state:
+            self.random_state = random_state
+
+        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
         self.model = MLPRegressor(
             hidden_layer_sizes=self.hidden_layer_sizes,
             activation=self.activation,
@@ -2547,7 +2564,6 @@ def __init__(
         self.max_depth = max_depth
         self.min_impurity_decrease = min_impurity_decrease
         self.init = init
-        self.random_state = random_state
         self.max_features = max_features
         self.alpha = alpha
         self.verbose = verbose
@@ -2558,6 +2574,10 @@ def __init__(
         self.tol = tol
         self.ccp_alpha = ccp_alpha
 
+        if random_state:
+            self.random_state = random_state
+
+        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
         self.model = GradientBoostingRegressor(
             loss=self.loss,
             learning_rate=self.learning_rate,
@@ -2766,9 +2786,12 @@ def __init__(
         self.tol = tol
         self.warm_start = warm_start
         self.positive = positive
-        self.random_state = random_state
         self.selection = selection
 
+        if random_state:
+            self.random_state = random_state
+
+        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
         self.model = Lasso(
             alpha=self.alpha,
             fit_intercept=self.fit_intercept,
@@ -3087,9 +3110,12 @@ def __init__(
         self.tol = tol
         self.warm_start = warm_start
         self.positive = positive
-        self.random_state = random_state
         self.selection = selection
 
+        if random_state:
+            self.random_state = random_state
+
+        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
         self.model = ElasticNet(
             alpha=self.alpha,
             l1_ratio=self.l1_ratio,
@@ -3494,7 +3520,6 @@ def __init__(
         self.shuffle = shuffle
         self.verbose = verbose
         self.epsilon = epsilon
-        self.random_state = random_state
         self.learning_rate = learning_rate
         self.eta0 = eta0
         self.power_t = power_t
@@ -3504,6 +3529,10 @@ def __init__(
         self.warm_start = warm_start
         self.average = average
 
+        if random_state:
+            self.random_state = random_state
+
+        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
         self.model = SGDRegressor(
             loss=self.loss,
             penalty=self.penalty,
diff --git a/geochemistrypi/start_cli_pipeline.py b/geochemistrypi/start_cli_pipeline.py
index 3f6fbac5..b3834d90 100644
--- a/geochemistrypi/start_cli_pipeline.py
+++ b/geochemistrypi/start_cli_pipeline.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
 from data_mining.cli_pipeline import cli_pipeline
 
-# Used for internal testing
+# Used for internal testing; run it in debug mode in an IDE to inspect the pipeline
 cli_pipeline("", "")
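The recurring edit in regression.py replaces the unconditional `self.random_state = random_state` with a guarded assignment, so a caller who passes nothing keeps the attribute inherited from `WorkflowBase` (the `DecisionTreeRegression` variant adds an else branch only because it stores every value as a 1-tuple). The sketch below isolates that pattern: `DemoRegression` and the default seed of 42 are hypothetical, only `WorkflowBase` and `random_state` come from the patch. It also illustrates one caveat: `if random_state:` is falsy for 0, so `random_state=0` silently falls back to the default, whereas `if random_state is not None:` would treat 0 as a real seed.

```
# Standalone sketch of the guarded-assignment pattern used throughout
# regression.py. 'DemoRegression' is hypothetical; the default of 42 is
# assumed for illustration.
from typing import Optional


class WorkflowBase:
    random_state = 42  # shared default seed (value assumed here)


class DemoRegression(WorkflowBase):
    def __init__(self, random_state: Optional[int] = None) -> None:
        if random_state:
            self.random_state = random_state
        # If 'random_state' is None, 'self.random_state' resolves to the
        # class attribute on 'WorkflowBase'.


print(DemoRegression().random_state)   # 42, inherited default
print(DemoRegression(7).random_state)  # 7, caller override
print(DemoRegression(0).random_state)  # 42 (!), because 0 is falsy
```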