perf: optimize model inference, model hyperparameters and clustering drawing #297

Merged
merged 1 commit into from Jan 13, 2024
2 changes: 1 addition & 1 deletion README.md
@@ -92,7 +92,7 @@ On Jupyter Notebook / Google Colab:
```
**Note**: There are four built-in data sets corresponding to four kinds of model pattern.

### Case 2: Run with your own data set
### Case 2: Run with your own data set without model inference

On command line:
```
8 changes: 4 additions & 4 deletions geochemistrypi/cli.py
@@ -40,7 +40,7 @@ def main(version: Optional[bool] = typer.Option(None, "--version", "-v", help="S
def data_mining(
data: str = typer.Option("", help="The path of the training data without model inference."),
training: str = typer.Option("", help="The path of the training data."),
inference: str = typer.Option("", help="The path of the inference data."),
application: str = typer.Option("", help="The path of the inference data."),
mlflow: bool = typer.Option(False, help="Start the mlflow server."),
web: bool = False,
) -> None:
@@ -81,11 +81,11 @@ def start_mlflow():
if data:
cli_pipeline(data)
# If the training data and inference data are provided, start the CLI pipeline with continuous training and inference
elif training and inference:
cli_pipeline(training, inference)
elif training and application:
cli_pipeline(training, application)
# If no data is provided, use built-in data to start the CLI pipeline with continuous training and inference
else:
cli_pipeline(training, inference)
cli_pipeline(training, application)


@app.command()
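The rename from `inference` to `application` is mechanical, but the three-way dispatch is worth spelling out. A minimal sketch of the branching above (import path assumed from this PR's file list; empty strings are the option defaults):

```python
from geochemistrypi.data_mining.cli_pipeline import cli_pipeline  # path assumed from the file list above

def dispatch(data: str = "", training: str = "", application: str = "") -> None:
    if data:
        # Training data alone: run the pipeline without model inference.
        cli_pipeline(data)
    elif training and application:
        # Both paths supplied: continuous training plus inference on the
        # application (formerly "inference") data.
        cli_pipeline(training, application)
    else:
        # Nothing supplied: both arguments are empty strings, and the
        # pipeline falls back to the built-in data sets.
        cli_pipeline(training, application)
```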
114 changes: 61 additions & 53 deletions geochemistrypi/data_mining/cli_pipeline.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/constants.py
@@ -100,6 +100,6 @@

CUSTOMIZE_LABEL_STRATEGY = ["Automatic Coding", "Custom Numeric Labels", "Custom Non-numeric Labels"]

FEATURE_SELECTION_STRATEGY = ["GenericUnivariateSelect", "SelectKBest"]
FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"]

CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"]
4 changes: 2 additions & 2 deletions geochemistrypi/data_mining/data/inference.py
@@ -129,7 +129,7 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: objec
"""
# If is_inference is True, then run the model inference.
if is_inference is True:
print("Use the trained model to make predictions on the inference data.")
print("Use the trained model to make predictions on the application data.")
# If transformer_config is not {}, then transform the inference data with the transform pipeline.
if transformer_config:
inference_data_transformed = transform_pipeline.transform(inference_data)
@@ -139,4 +139,4 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: objec
inference_data_predicted_np = loaded_model.predict(inference_data_transformed)
inference_data_predicted = np2pd(inference_data_predicted_np, ["Predicted Value"])
GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
save_data(inference_data_predicted, "Inference Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(inference_data_predicted, "Application Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
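Only the user-facing strings change here; the flow itself is untouched. A condensed sketch of that flow, using plain pandas in place of the project helpers `np2pd` and `save_data`:

```python
import pandas as pd

def apply_model(application_data: pd.DataFrame, loaded_model, transform_pipeline, transformer_config: dict) -> pd.DataFrame:
    # Reapply the training-time transformers (if any were fitted) so the
    # application data passes through identical preprocessing.
    if transformer_config:
        transformed = transform_pipeline.transform(application_data)
    else:
        transformed = application_data
    predicted = loaded_model.predict(transformed)
    # Equivalent of np2pd(...): wrap the ndarray in a named column.
    return pd.DataFrame(predicted, columns=["Predicted Value"])
```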
6 changes: 3 additions & 3 deletions geochemistrypi/data_mining/data/preprocessing.py
@@ -73,7 +73,7 @@ def feature_selector(X: pd.DataFrame, y: pd.DataFrame, feature_selection_task: i
X_selected : pd.DataFrame
The feature dataset after selecting.
"""
print("--Original Features-")
print("-- Original Features --")
show_data_columns(X.columns)

features_num = len(X.columns)
@@ -85,9 +85,9 @@
elif feature_selection_task == 2:
score_func = f_classif

if method[method_idx] == "GenericUnivariateSelect":
if method[method_idx] == "Generic Univariate Select":
selector = GenericUnivariateSelect(score_func=score_func, mode="k_best", param=features_retain_num)
elif method[method_idx] == "SelectKBest":
elif method[method_idx] == "Select K Best":
selector = SelectKBest(score_func=score_func, k=features_retain_num)

try:
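The renamed menu entries still map onto the same two scikit-learn selectors, and with `mode="k_best"` the two are interchangeable: they retain exactly the same top-k features. A standalone check:

```python
from sklearn.datasets import make_classification
from sklearn.feature_selection import GenericUnivariateSelect, SelectKBest, f_classif

X, y = make_classification(n_samples=100, n_features=10, random_state=0)
a = GenericUnivariateSelect(score_func=f_classif, mode="k_best", param=5).fit(X, y)
b = SelectKBest(score_func=f_classif, k=5).fit(X, y)
assert (a.get_support() == b.get_support()).all()  # identical feature masks
```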
51 changes: 42 additions & 9 deletions geochemistrypi/data_mining/model/classification.py
@@ -533,6 +533,10 @@ def __init__(
self.decision_function_shape = decision_function_shape
self.break_ties = break_ties

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = SVC(
C=self.C,
kernel=self.kernel,
@@ -784,6 +788,10 @@ def __init__(
self.class_weight = class_weight
self.ccp_alpha = ccp_alpha

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = DecisionTreeClassifier(
criterion=self.criterion,
splitter=self.splitter,
@@ -916,7 +924,7 @@ def __init__(
bootstrap: bool = True,
oob_score: bool = False,
n_jobs: Optional[int] = -1,
random_state: Optional[int] = 42,
random_state: Optional[int] = None,
verbose: int = 0,
warm_start: bool = False,
class_weight: Union[str, dict, list[dict], None] = None,
@@ -1118,13 +1126,16 @@ def __init__(
self.bootstrap = bootstrap
self.oob_score = oob_score
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.ccp_alpha = ccp_alpha
self.max_samples = max_samples

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = RandomForestClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
@@ -1449,7 +1460,6 @@ def __init__(
self.base_score = base_score
self.missing = missing
self.num_parallel_tree = num_parallel_tree
self.random_state = random_state
self.n_jobs = n_jobs
self.monotone_constraints = monotone_constraints
self.interaction_constraints = interaction_constraints
@@ -1460,9 +1470,14 @@
self.enable_categorical = enable_categorical
self.eval_metric = eval_metric
self.early_stopping_rounds = early_stopping_rounds

if kwargs:
self.kwargs = kwargs

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = xgboost.XGBClassifier(
n_estimators=self.n_estimators,
objective=self.objective,
@@ -1750,17 +1765,19 @@ def __init__(
self.fit_intercept = fit_intercept
self.intercept_scaling = intercept_scaling
self.class_weight = class_weight
self.random_state = random_state
self.solver = solver
self.max_iter = max_iter
self.multi_class = multi_class
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.l1_ratio = l1_ratio

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = LogisticRegression(
penalty=self.penalty,
dual=self.dual,
@@ -2046,7 +2063,6 @@ def __init__(
self.power_t = (power_t,)
self.max_iter = (max_iter,)
self.shuffle = (shuffle,)
self.random_state = (random_state,)
self.tol = (tol,)
self.verbose = (verbose,)
self.warm_start = (warm_start,)
@@ -2060,6 +2076,12 @@
self.n_iter_no_change = (n_iter_no_change,)
self.max_fun = (max_fun,)

if random_state:
self.random_state = (random_state,)
else:
self.random_state = (self.random_state,)

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = MLPClassifier(
hidden_layer_sizes=self.hidden_layer_sizes[0],
activation=self.activation[0],
@@ -2394,13 +2416,16 @@ def __init__(
self.bootstrap = bootstrap
self.oob_score = oob_score
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.ccp_alpha = ccp_alpha
self.max_samples = max_samples

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = ExtraTreesClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
@@ -2715,7 +2740,6 @@ def __init__(
self.init = (init,)
self.subsample = (subsample,)
self.max_features = (max_features,)
self.random_state = (random_state,)
self.verbose = (verbose,)
self.max_leaf_nodes = (max_leaf_nodes,)
self.min_impurity_decrease = (min_impurity_decrease,)
@@ -2725,6 +2749,12 @@
self.tol = (tol,)
self.ccp_alpha = (ccp_alpha,)

if random_state:
self.random_state = (random_state,)
else:
self.random_state = (self.random_state,)

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = GradientBoostingClassifier(
loss=self.loss[0],
learning_rate=self.learning_rate[0],
@@ -3218,7 +3248,6 @@ def __init__(
self.verbose = verbose
self.epsilon = epsilon
self.n_jobs = n_jobs
self.random_state = random_state
self.learning_rate = learning_rate
self.eta0 = eta0
self.power_t = power_t
@@ -3229,6 +3258,10 @@
self.warm_start = warm_start
self.average = average

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = SGDClassifier(
loss=self.loss,
penalty=self.penalty,
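Every classifier in this file gets the same treatment: the unconditional `self.random_state = random_state` is dropped, and the seed is only overridden when the caller actually passes one, so the default defined on `WorkflowBase` survives otherwise. A minimal sketch of the pattern (the parent's default value here is an assumption):

```python
from typing import Optional

class WorkflowBase:
    random_state: int = 42  # assumed shared default on the parent class

class ExampleClassifier(WorkflowBase):
    def __init__(self, random_state: Optional[int] = None) -> None:
        if random_state:
            self.random_state = random_state
        # Otherwise no instance attribute is created, and attribute lookup
        # falls through to WorkflowBase.random_state.
```

Two side effects of the pattern are worth noting: the truthiness test treats `random_state=0` the same as `None` (an `is not None` check would distinguish them), and in the MLP and gradient-boosting classes, where hyperparameters are stored as one-element tuples and unpacked with `[0]`, the fallback branch re-wraps the inherited scalar as `self.random_state = (self.random_state,)`.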
26 changes: 18 additions & 8 deletions geochemistrypi/data_mining/model/clustering.py
@@ -64,10 +64,10 @@ def _score(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, store_
mlflow.log_metrics(scores)

@staticmethod
def _scatter2d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
def _scatter2d(data: pd.DataFrame, labels: pd.DataFrame, cluster_centers_: np.ndarray, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
"""Plot the two-dimensional diagram of the clustering result."""
print("-----* Cluster Two-Dimensional Diagram *-----")
scatter2d(data, labels, algorithm_name)
scatter2d(data, labels, cluster_centers_, algorithm_name)
save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
data_with_labels = pd.concat([data, labels], axis=1)
save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
@@ -118,6 +118,7 @@ def common_components(self) -> None:
self._scatter2d(
data=two_dimen_data,
labels=self.clustering_result["clustering result"],
cluster_centers_=self.get_cluster_centers(),
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -138,6 +139,7 @@ def common_components(self) -> None:
self._scatter2d(
data=two_dimen_data,
labels=self.clustering_result["clustering result"],
cluster_centers_=self.get_cluster_centers(),
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -155,6 +157,7 @@ def common_components(self) -> None:
self._scatter2d(
data=self.X,
labels=self.clustering_result["clustering result"],
cluster_centers_=self.get_cluster_centers(),
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -277,10 +280,13 @@ def __init__(
self.tol = tol
self.n_init = n_init
self.verbose = verbose
self.random_state = random_state
self.copy_x = copy_x
self.algorithm = algorithm

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = KMeans(
n_clusters=self.n_clusters,
init=self.init,
@@ -438,16 +444,20 @@ def __init__(
self.verbose = verbose
self.preference = preference
self.affinity = affinity
self.random_state = random_state

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = AffinityPropagation(
damping=self.damping,
max_iter=self.max_iter,
convergence_iter=self.convergence_iter,
copy=self.copy,
preference=None,
affinity="euclidean",
verbose=False,
random_state=None,
preference=self.preference,
affinity=self.affinity,
verbose=self.verbose,
random_state=self.random_state,
)
self.naming = AffinityPropagationClustering.name

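Passing `cluster_centers_` through `_scatter2d` lets the two-dimensional diagram mark the fitted centers alongside the points. The real `scatter2d` lives in the project's plotting utilities; a matplotlib sketch of the idea, with styling assumed:

```python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def scatter2d_sketch(data: pd.DataFrame, labels: pd.Series, cluster_centers_: np.ndarray, algorithm_name: str) -> None:
    # Color each sample by its cluster label.
    plt.scatter(data.iloc[:, 0], data.iloc[:, 1], c=labels, s=10)
    # Overlay the fitted centers (not every algorithm exposes them).
    if cluster_centers_ is not None:
        plt.scatter(cluster_centers_[:, 0], cluster_centers_[:, 1], marker="x", c="red", s=80, label="cluster centers")
        plt.legend()
    plt.title(f"Cluster Two-Dimensional Diagram - {algorithm_name}")
```

The AffinityPropagation hunk is a related fix: the constructor previously discarded the stored hyperparameters and passed hard-coded defaults (`preference=None`, `affinity="euclidean"`, `verbose=False`, `random_state=None`); it now forwards `self.preference`, `self.affinity`, `self.verbose`, and `self.random_state`.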
15 changes: 12 additions & 3 deletions geochemistrypi/data_mining/model/decomposition.py
@@ -189,8 +189,11 @@ def __init__(
self.iterated_power = iterated_power
# self.n_oversamples = n_oversamples
# self.power_iteration_normalizer = power_iteration_normalizer
self.random_state = random_state

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = PCA(
n_components=self.n_components,
copy=self.copy,
@@ -488,12 +491,15 @@ def __init__(
self.metric_params = metric_params
self.init = init
self.verbose = verbose
self.random_state = random_state
self.method = method
self.angle = angle
self.n_jobs = n_jobs
self.square_distances = square_distances

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = TSNE(
n_components=self.n_components,
perplexity=self.perplexity,
@@ -618,10 +624,13 @@ def __init__(
self.verbose = verbose
self.eps = eps
self.n_jobs = n_jobs
self.random_state = random_state
self.dissimilarity = dissimilarity
# self.normalized_stress = normalized_stress

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = MDS(
n_components=self.n_components,
metric=self.metric,
@@ -34,7 +34,7 @@ def extra_trees_manual_hyper_parameters() -> Dict:
"Bootstrap: Whether bootstrap samples are used when building trees. Bootstrapping is a technique where a random subset of the data is sampled with replacement to create a new dataset"
" of the same size as the original. This new dataset is then used to construct a decision tree in the ensemble. If False, the whole dataset is used to build each tree."
)
print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it set to True.")
print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it as True.")
bootstrap = bool_input(SECTION[2])
max_samples = None
if bootstrap:
@@ -45,7 +45,7 @@ def extra_trees_manual_hyper_parameters() -> Dict:
"oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy. When the oob_score hyperparameter is set to True, Extra Trees will use a random subset of the data"
" to train each decision tree in the ensemble, and the remaining data that was not used for training (the out-of-bag samples) will be used to calculate the OOB score. "
)
print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it set to True.")
print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it as True.")
oob_score = bool_input(SECTION[2])
hyper_parameters = {
"n_estimators": n_estimators,
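The two reworded prompts concern coupled hyperparameters: out-of-bag scoring only works when bootstrapping is enabled, since the OOB estimate comes from the samples each tree did not see. A standalone sketch:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
# oob_score=True requires bootstrap=True; scikit-learn raises a ValueError otherwise.
clf = ExtraTreesClassifier(n_estimators=100, bootstrap=True, oob_score=True, random_state=0).fit(X, y)
print(clf.oob_score_)  # generalization accuracy estimated from the out-of-bag samples
```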