perf: optimize model inference, model hyperparameters and clustering drawing #297

Merged
merged 1 commit into from Jan 13, 2024
2 changes: 1 addition & 1 deletion README.md
@@ -92,7 +92,7 @@ On Jupyter Notebook / Google Colab:
```
**Note**: There are four built-in data sets corresponding to four kinds of model pattern.

### Case 2: Run with your own data set
### Case 2: Run with your own data set without model inference

On command line:
```
8 changes: 4 additions & 4 deletions geochemistrypi/cli.py
@@ -40,7 +40,7 @@ def main(version: Optional[bool] = typer.Option(None, "--version", "-v", help="S
def data_mining(
data: str = typer.Option("", help="The path of the training data without model inference."),
training: str = typer.Option("", help="The path of the training data."),
inference: str = typer.Option("", help="The path of the inference data."),
application: str = typer.Option("", help="The path of the inference data."),
mlflow: bool = typer.Option(False, help="Start the mlflow server."),
web: bool = False,
) -> None:
@@ -81,11 +81,11 @@ def start_mlflow():
if data:
cli_pipeline(data)
# If the training data and inference data are provided, start the CLI pipeline with continuous training and inference
elif training and inference:
cli_pipeline(training, inference)
elif training and application:
cli_pipeline(training, application)
# If no data is provided, use built-in data to start the CLI pipeline with continuous training and inference
else:
cli_pipeline(training, inference)
cli_pipeline(training, application)


@app.command()
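The rename from `inference` to `application` is mechanical, but the three-way dispatch is worth spelling out. A minimal sketch of the branching above (import path assumed from this PR's file list; empty strings are the option defaults):

```python
from geochemistrypi.data_mining.cli_pipeline import cli_pipeline  # path assumed from the file list above

def dispatch(data: str = "", training: str = "", application: str = "") -> None:
    if data:
        # Training data alone: run the pipeline without model inference.
        cli_pipeline(data)
    elif training and application:
        # Both paths supplied: continuous training plus inference on the
        # application (formerly "inference") data.
        cli_pipeline(training, application)
    else:
        # Nothing supplied: both arguments are empty strings, and the
        # pipeline falls back to the built-in data sets.
        cli_pipeline(training, application)
```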
114 changes: 61 additions & 53 deletions geochemistrypi/data_mining/cli_pipeline.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/constants.py
@@ -100,6 +100,6 @@

CUSTOMIZE_LABEL_STRATEGY = ["Automatic Coding", "Custom Numeric Labels", "Custom Non-numeric Labels"]

FEATURE_SELECTION_STRATEGY = ["GenericUnivariateSelect", "SelectKBest"]
FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"]

CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"]
4 changes: 2 additions & 2 deletions geochemistrypi/data_mining/data/inference.py
@@ -129,7 +129,7 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: objec
"""
# If is_inference is True, then run the model inference.
if is_inference is True:
print("Use the trained model to make predictions on the inference data.")
print("Use the trained model to make predictions on the application data.")
# If transformer_config is not {}, then transform the inference data with the transform pipeline.
if transformer_config:
inference_data_transformed = transform_pipeline.transform(inference_data)
@@ -139,4 +139,4 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: objec
inference_data_predicted_np = loaded_model.predict(inference_data_transformed)
inference_data_predicted = np2pd(inference_data_predicted_np, ["Predicted Value"])
GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
save_data(inference_data_predicted, "Inference Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(inference_data_predicted, "Application Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
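Only the user-facing strings change here; the flow itself is untouched. A condensed sketch of that flow, using plain pandas in place of the project helpers `np2pd` and `save_data`:

```python
import pandas as pd

def apply_model(application_data: pd.DataFrame, loaded_model, transform_pipeline, transformer_config: dict) -> pd.DataFrame:
    # Reapply the training-time transformers (if any were fitted) so the
    # application data passes through identical preprocessing.
    if transformer_config:
        transformed = transform_pipeline.transform(application_data)
    else:
        transformed = application_data
    predicted = loaded_model.predict(transformed)
    # Equivalent of np2pd(...): wrap the ndarray in a named column.
    return pd.DataFrame(predicted, columns=["Predicted Value"])
```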
6 changes: 3 additions & 3 deletions geochemistrypi/data_mining/data/preprocessing.py
@@ -73,7 +73,7 @@ def feature_selector(X: pd.DataFrame, y: pd.DataFrame, feature_selection_task: i
X_selected : pd.DataFrame
The feature dataset after selecting.
"""
print("--Original Features-")
print("-- Original Features --")
show_data_columns(X.columns)

features_num = len(X.columns)
@@ -85,9 +85,9 @@
elif feature_selection_task == 2:
score_func = f_classif

if method[method_idx] == "GenericUnivariateSelect":
if method[method_idx] == "Generic Univariate Select":
selector = GenericUnivariateSelect(score_func=score_func, mode="k_best", param=features_retain_num)
elif method[method_idx] == "SelectKBest":
elif method[method_idx] == "Select K Best":
selector = SelectKBest(score_func=score_func, k=features_retain_num)

try:
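The renamed menu entries still map onto the same two scikit-learn selectors, and with `mode="k_best"` the two are interchangeable: they retain exactly the same top-k features. A standalone check:

```python
from sklearn.datasets import make_classification
from sklearn.feature_selection import GenericUnivariateSelect, SelectKBest, f_classif

X, y = make_classification(n_samples=100, n_features=10, random_state=0)
a = GenericUnivariateSelect(score_func=f_classif, mode="k_best", param=5).fit(X, y)
b = SelectKBest(score_func=f_classif, k=5).fit(X, y)
assert (a.get_support() == b.get_support()).all()  # identical feature masks
```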
51 changes: 42 additions & 9 deletions geochemistrypi/data_mining/model/classification.py
@@ -533,6 +533,10 @@ def __init__(
self.decision_function_shape = decision_function_shape
self.break_ties = break_ties

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = SVC(
C=self.C,
kernel=self.kernel,
@@ -784,6 +788,10 @@ def __init__(
self.class_weight = class_weight
self.ccp_alpha = ccp_alpha

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = DecisionTreeClassifier(
criterion=self.criterion,
splitter=self.splitter,
@@ -916,7 +924,7 @@ def __init__(
bootstrap: bool = True,
oob_score: bool = False,
n_jobs: Optional[int] = -1,
random_state: Optional[int] = 42,
random_state: Optional[int] = None,
verbose: int = 0,
warm_start: bool = False,
class_weight: Union[str, dict, list[dict], None] = None,
@@ -1118,13 +1126,16 @@ def __init__(
self.bootstrap = bootstrap
self.oob_score = oob_score
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.ccp_alpha = ccp_alpha
self.max_samples = max_samples

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = RandomForestClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
@@ -1449,7 +1460,6 @@ def __init__(
self.base_score = base_score
self.missing = missing
self.num_parallel_tree = num_parallel_tree
self.random_state = random_state
self.n_jobs = n_jobs
self.monotone_constraints = monotone_constraints
self.interaction_constraints = interaction_constraints
@@ -1460,9 +1470,14 @@
self.enable_categorical = enable_categorical
self.eval_metric = eval_metric
self.early_stopping_rounds = early_stopping_rounds

if kwargs:
self.kwargs = kwargs

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = xgboost.XGBClassifier(
n_estimators=self.n_estimators,
objective=self.objective,
@@ -1750,17 +1765,19 @@ def __init__(
self.fit_intercept = fit_intercept
self.intercept_scaling = intercept_scaling
self.class_weight = class_weight
self.random_state = random_state
self.solver = solver
self.max_iter = max_iter
self.multi_class = multi_class
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.l1_ratio = l1_ratio

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = LogisticRegression(
penalty=self.penalty,
dual=self.dual,
@@ -2046,7 +2063,6 @@ def __init__(
self.power_t = (power_t,)
self.max_iter = (max_iter,)
self.shuffle = (shuffle,)
self.random_state = (random_state,)
self.tol = (tol,)
self.verbose = (verbose,)
self.warm_start = (warm_start,)
@@ -2060,6 +2076,12 @@
self.n_iter_no_change = (n_iter_no_change,)
self.max_fun = (max_fun,)

if random_state:
self.random_state = (random_state,)
else:
self.random_state = (self.random_state,)

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = MLPClassifier(
hidden_layer_sizes=self.hidden_layer_sizes[0],
activation=self.activation[0],
@@ -2394,13 +2416,16 @@ def __init__(
self.bootstrap = bootstrap
self.oob_score = oob_score
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.ccp_alpha = ccp_alpha
self.max_samples = max_samples

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = ExtraTreesClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
@@ -2715,7 +2740,6 @@ def __init__(
self.init = (init,)
self.subsample = (subsample,)
self.max_features = (max_features,)
self.random_state = (random_state,)
self.verbose = (verbose,)
self.max_leaf_nodes = (max_leaf_nodes,)
self.min_impurity_decrease = (min_impurity_decrease,)
@@ -2725,6 +2749,12 @@
self.tol = (tol,)
self.ccp_alpha = (ccp_alpha,)

if random_state:
self.random_state = (random_state,)
else:
self.random_state = (self.random_state,)

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = GradientBoostingClassifier(
loss=self.loss[0],
learning_rate=self.learning_rate[0],
@@ -3218,7 +3248,6 @@ def __init__(
self.verbose = verbose
self.epsilon = epsilon
self.n_jobs = n_jobs
self.random_state = random_state
self.learning_rate = learning_rate
self.eta0 = eta0
self.power_t = power_t
@@ -3229,6 +3258,10 @@
self.warm_start = warm_start
self.average = average

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = SGDClassifier(
loss=self.loss,
penalty=self.penalty,
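Every classifier in this file gets the same treatment: the unconditional `self.random_state = random_state` is dropped, and the seed is only overridden when the caller actually passes one, so the default defined on `WorkflowBase` survives otherwise. A minimal sketch of the pattern (the parent's default value here is an assumption):

```python
from typing import Optional

class WorkflowBase:
    random_state: int = 42  # assumed shared default on the parent class

class ExampleClassifier(WorkflowBase):
    def __init__(self, random_state: Optional[int] = None) -> None:
        if random_state:
            self.random_state = random_state
        # Otherwise no instance attribute is created, and attribute lookup
        # falls through to WorkflowBase.random_state.
```

Two side effects of the pattern are worth noting: the truthiness test treats `random_state=0` the same as `None` (an `is not None` check would distinguish them), and in the MLP and gradient-boosting classes, where hyperparameters are stored as one-element tuples and unpacked with `[0]`, the fallback branch re-wraps the inherited scalar as `self.random_state = (self.random_state,)`.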
26 changes: 18 additions & 8 deletions geochemistrypi/data_mining/model/clustering.py
@@ -64,10 +64,10 @@ def _score(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, store_
mlflow.log_metrics(scores)

@staticmethod
def _scatter2d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
def _scatter2d(data: pd.DataFrame, labels: pd.DataFrame, cluster_centers_: np.ndarray, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
"""Plot the two-dimensional diagram of the clustering result."""
print("-----* Cluster Two-Dimensional Diagram *-----")
scatter2d(data, labels, algorithm_name)
scatter2d(data, labels, cluster_centers_, algorithm_name)
save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
data_with_labels = pd.concat([data, labels], axis=1)
save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
@@ -118,6 +118,7 @@ def common_components(self) -> None:
self._scatter2d(
data=two_dimen_data,
labels=self.clustering_result["clustering result"],
cluster_centers_=self.get_cluster_centers(),
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -138,6 +139,7 @@ def common_components(self) -> None:
self._scatter2d(
data=two_dimen_data,
labels=self.clustering_result["clustering result"],
cluster_centers_=self.get_cluster_centers(),
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -155,6 +157,7 @@ def common_components(self) -> None:
self._scatter2d(
data=self.X,
labels=self.clustering_result["clustering result"],
cluster_centers_=self.get_cluster_centers(),
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -277,10 +280,13 @@ def __init__(
self.tol = tol
self.n_init = n_init
self.verbose = verbose
self.random_state = random_state
self.copy_x = copy_x
self.algorithm = algorithm

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = KMeans(
n_clusters=self.n_clusters,
init=self.init,
@@ -438,16 +444,20 @@ def __init__(
self.verbose = verbose
self.preference = preference
self.affinity = affinity
self.random_state = random_state

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = AffinityPropagation(
damping=self.damping,
max_iter=self.max_iter,
convergence_iter=self.convergence_iter,
copy=self.copy,
preference=None,
affinity="euclidean",
verbose=False,
random_state=None,
preference=self.preference,
affinity=self.affinity,
verbose=self.verbose,
random_state=self.random_state,
)
self.naming = AffinityPropagationClustering.name

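Passing `cluster_centers_` through `_scatter2d` lets the two-dimensional diagram mark the fitted centers alongside the points. The real `scatter2d` lives in the project's plotting utilities; a matplotlib sketch of the idea, with styling assumed:

```python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def scatter2d_sketch(data: pd.DataFrame, labels: pd.Series, cluster_centers_: np.ndarray, algorithm_name: str) -> None:
    # Color each sample by its cluster label.
    plt.scatter(data.iloc[:, 0], data.iloc[:, 1], c=labels, s=10)
    # Overlay the fitted centers (not every algorithm exposes them).
    if cluster_centers_ is not None:
        plt.scatter(cluster_centers_[:, 0], cluster_centers_[:, 1], marker="x", c="red", s=80, label="cluster centers")
        plt.legend()
    plt.title(f"Cluster Two-Dimensional Diagram - {algorithm_name}")
```

The AffinityPropagation hunk is a related fix: the constructor previously discarded the stored hyperparameters and passed hard-coded defaults (`preference=None`, `affinity="euclidean"`, `verbose=False`, `random_state=None`); it now forwards `self.preference`, `self.affinity`, `self.verbose`, and `self.random_state`.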
15 changes: 12 additions & 3 deletions geochemistrypi/data_mining/model/decomposition.py
@@ -189,8 +189,11 @@ def __init__(
self.iterated_power = iterated_power
# self.n_oversamples = n_oversamples
# self.power_iteration_normalizer = power_iteration_normalizer
self.random_state = random_state

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = PCA(
n_components=self.n_components,
copy=self.copy,
@@ -488,12 +491,15 @@ def __init__(
self.metric_params = metric_params
self.init = init
self.verbose = verbose
self.random_state = random_state
self.method = method
self.angle = angle
self.n_jobs = n_jobs
self.square_distances = square_distances

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = TSNE(
n_components=self.n_components,
perplexity=self.perplexity,
@@ -618,10 +624,13 @@ def __init__(
self.verbose = verbose
self.eps = eps
self.n_jobs = n_jobs
self.random_state = random_state
self.dissimilarity = dissimilarity
# self.normalized_stress = normalized_stress

if random_state:
self.random_state = random_state

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = MDS(
n_components=self.n_components,
metric=self.metric,
@@ -34,7 +34,7 @@ def extra_trees_manual_hyper_parameters() -> Dict:
"Bootstrap: Whether bootstrap samples are used when building trees. Bootstrapping is a technique where a random subset of the data is sampled with replacement to create a new dataset"
" of the same size as the original. This new dataset is then used to construct a decision tree in the ensemble. If False, the whole dataset is used to build each tree."
)
print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it set to True.")
print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it as True.")
bootstrap = bool_input(SECTION[2])
max_samples = None
if bootstrap:
@@ -45,7 +45,7 @@ def extra_trees_manual_hyper_parameters() -> Dict:
"oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy. When the oob_score hyperparameter is set to True, Extra Trees will use a random subset of the data"
" to train each decision tree in the ensemble, and the remaining data that was not used for training (the out-of-bag samples) will be used to calculate the OOB score. "
)
print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it set to True.")
print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it as True.")
oob_score = bool_input(SECTION[2])
hyper_parameters = {
"n_estimators": n_estimators,
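The two reworded prompts concern coupled hyperparameters: out-of-bag scoring only works when bootstrapping is enabled, since the OOB estimate comes from the samples each tree did not see. A standalone sketch:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
# oob_score=True requires bootstrap=True; scikit-learn raises a ValueError otherwise.
clf = ExtraTreesClassifier(n_estimators=100, bootstrap=True, oob_score=True, random_state=0).fit(X, y)
print(clf.oob_score_)  # generalization accuracy estimated from the out-of-bag samples
```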