Skip to content

Commit

Permalink
Merge pull request #287 from ZJUEarthData/web
Browse files Browse the repository at this point in the history
build: v0.4.0
  • Loading branch information
SanyHe authored Dec 15, 2023
2 parents 48f0e9e + 2d7a697 commit 559869b
Show file tree
Hide file tree
Showing 13 changed files with 98 additions and 39 deletions.
2 changes: 1 addition & 1 deletion geochemistrypi/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.0"
__version__ = "0.4.0"
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/cli_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
del data_selected
clear_output()
else:
# if the selected data set doesn't need imputation, which means there are no missing values.
# The selected data set has no missing values, so no imputation is needed.
imputation_config = {}
data_selected_imputed = data_selected

Expand Down
Binary file not shown.
Binary file not shown.
40 changes: 21 additions & 19 deletions geochemistrypi/data_mining/model/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def __init__(self) -> None:
# These two attributes are used for the customized models of FLAML framework
self.customized = False
self.customized_name = None
self.mode = "Classification"

@dispatch(object, object)
def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
Expand Down Expand Up @@ -2995,7 +2996,8 @@ class SGDClassification(LinearWorkflowMixin, ClassificationWorkflowBase):
"""The automation workflow of using Stochastic Gradient Descent - SGD algorithm to make insightful products."""

name = "Stochastic Gradient Descent"
special_function = ["SGD Formula"]
# special_function = ["SGD Formula"]
special_function = []

def __init__(
self,
Expand Down Expand Up @@ -3315,25 +3317,25 @@ def manual_hyper_parameters(cls) -> Dict:
@dispatch()
def special_components(self, **kwargs) -> None:
"""Invoke all special application functions for this algorithm using the Scikit-learn framework."""
GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
self._show_formula(
coef=[self.model.coef_],
intercept=self.model.intercept_,
features_name=SGDClassification.X_train.columns,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
mlflow_path="root",
)
# GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
# self._show_formula(
# coef=[self.model.coef_],
# intercept=self.model.intercept_,
# features_name=SGDClassification.X_train.columns,
# algorithm_name=self.naming,
# local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
# mlflow_path="root",
# )

@dispatch(bool)
def special_components(self, is_automl: bool = False, **kwargs) -> None:
"""Invoke all special application functions for this algorithm using the FLAML framework."""
GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
self._show_formula(
coef=self.auto_model.coef_,
intercept=self.auto_model.intercept_,
features_name=SGDClassification.X.columns,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
mlflow_path="root",
)
# GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
# self._show_formula(
# coef=self.auto_model.coef_,
# intercept=self.auto_model.intercept_,
# features_name=SGDClassification.X.columns,
# algorithm_name=self.naming,
# local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
# mlflow_path="root",
# )
61 changes: 52 additions & 9 deletions geochemistrypi/data_mining/model/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ._base import WorkflowBase
from .func.algo_clustering._common import plot_results, plot_silhouette_diagram, score
from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters, dbscan_result_plot
from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters, scatter2d, scatter3d
from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters, plot_silhouette_diagram_kmeans, scatter2d, scatter3d


class ClusteringWorkflowBase(WorkflowBase):
Expand All @@ -25,6 +25,7 @@ class ClusteringWorkflowBase(WorkflowBase):
def __init__(self):
super().__init__()
self.clustering_result = None
self.mode = "Clustering"

def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
"""Fit the model according to the given training data."""
Expand Down Expand Up @@ -93,14 +94,14 @@ def common_components(self) -> None:
algorithm_name=self.naming,
store_path=GEOPI_OUTPUT_METRICS_PATH,
)
self._plot_results(
data=self.X,
labels=self.clustering_result["clustering result"],
cluster_centers_=self.get_cluster_centers(),
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
# self._plot_results(
# data=self.X,
# labels=self.clustering_result["clustering result"],
# cluster_centers_=self.get_cluster_centers(),
# algorithm_name=self.naming,
# local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
# mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
# )
self._plot_silhouette_diagram(
data=self.X,
labels=self.clustering_result["clustering result"],
Expand Down Expand Up @@ -226,6 +227,15 @@ def __init__(

self.naming = KMeansClustering.name

def _get_inertia_scores(self, algorithm_name: str, store_path: str) -> None:
    """Report the inertia of the fitted model.

    The value of ``self.model.inertia_`` is printed to stdout, logged to
    MLflow as the metric ``"Inertia Score"`` and passed, as an indented JSON
    string, to ``save_text``.

    Parameters
    ----------
    algorithm_name : str
        Name appended to the title under which the text is saved.

    store_path : str
        Forwarded unchanged to ``save_text`` (presumably the output
        directory for metrics -- confirm against ``save_text``).
    """
    print("-----* KMeans Inertia Scores *-----")
    inertia = self.model.inertia_
    print("Inertia Score: ", inertia)

    # The same one-entry mapping feeds both MLflow and the saved text.
    metrics = {"Inertia Score": inertia}
    mlflow.log_metrics(metrics)

    title = f"KMeans Inertia Scores - {algorithm_name}"
    save_text(json.dumps(metrics, indent=4), title, store_path)

@classmethod
def manual_hyper_parameters(cls) -> Dict:
"""Manual hyper-parameters specification."""
Expand All @@ -234,6 +244,25 @@ def manual_hyper_parameters(cls) -> Dict:
clear_output()
return hyper_parameters

@staticmethod
def _plot_silhouette_diagram_kmeans(
    data: pd.DataFrame,
    cluster_labels: pd.DataFrame,
    cluster_centers_: np.ndarray,
    n_clusters: int,
    algorithm_name: str,
    local_path: str,
    mlflow_path: str,
) -> None:
    """Draw the KMeans silhouette diagram and save the figure with its underlying data.

    Steps, in order:
      1. ``plot_silhouette_diagram_kmeans`` draws the diagram.
      2. ``save_fig`` stores the figure.
      3. ``save_data`` stores ``data`` with ``cluster_labels`` appended as
         extra column(s).
      4. ``save_data`` stores the cluster centers as a frame that reuses the
         column names of ``data``.

    ``local_path`` and ``mlflow_path`` are forwarded unchanged to ``save_fig``
    and ``save_data``.
    """
    print("-----* KMeans's Silhouette Diagram *-----")
    plot_silhouette_diagram_kmeans(data, cluster_labels, cluster_centers_, n_clusters, algorithm_name)
    figure_title = f"KMeans's Silhouette Diagram - {algorithm_name}"
    save_fig(figure_title, local_path, mlflow_path)

    # Samples side by side with the label(s) assigned to them.
    labelled_samples = pd.concat([data, cluster_labels], axis=1)
    save_data(labelled_samples, "KMeans's Silhouette Diagram - Data With Labels", local_path, mlflow_path)

    # One row per cluster center, expressed in the same feature columns as ``data``.
    centers_frame = pd.DataFrame(cluster_centers_, columns=data.columns)
    save_data(centers_frame, "KMeans's Silhouette Diagram - Cluster Centers", local_path, mlflow_path)

@staticmethod
def _scatter2d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
"""Plot the two-dimensional diagram of the clustering result."""
Expand All @@ -254,7 +283,21 @@ def _scatter3d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name:

def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH")
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._get_inertia_scores(
algorithm_name=self.naming,
store_path=GEOPI_OUTPUT_METRICS_PATH,
)
self._plot_silhouette_diagram_kmeans(
data=self.X,
cluster_labels=self.clustering_result["clustering result"],
cluster_centers_=self.get_cluster_centers(),
n_clusters=self.n_clusters,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)

# Draw graphs when the data has at least 3 columns (self.X.shape[1] >= 3)
if self.X.shape[1] >= 3:
Expand Down
1 change: 1 addition & 0 deletions geochemistrypi/data_mining/model/decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def __init__(self) -> None:

# the extra attributes that decomposition algorithm needs
self.X_reduced = None
self.mode = "Decomposition"

def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
"""Fit the model."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import mlflow
import numpy as np
import pandas as pd
from data_mining.constants import CALCULATION_METHOD_OPTION, SECTION
from data_mining.data.data_readiness import limit_num_input, num2option, num_input
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
Expand All @@ -16,6 +14,9 @@
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder

from ....constants import CALCULATION_METHOD_OPTION, SECTION
from ....data.data_readiness import limit_num_input, num2option, num_input


def score(y_true: pd.DataFrame, y_predict: pd.DataFrame) -> tuple[str, Dict]:
"""Calculate the scores of the classification model.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,14 @@ def sgd_classificaiton_manual_hyper_parameters() -> Dict:
early_stopping = bool_input(SECTION[2])

print("Validation Fraction: The proportion of training data to set aside as validation set for early stopping.")
print("A good starting value could be between 0.000001 and 1, such as 0.1. The default is 0.1.")
validation_fraction = float_input(0.1, SECTION[2], "@Validation Fraction: ")

print("It must be in range (0, 1). A good starting value could be between 0.000001 and 1, such as 0.1. The default is 0.1.")
is_valid = False
while not is_valid:
validation_fraction = float_input(0.1, SECTION[2], "@Validation Fraction: ")
if 0 < validation_fraction < 1:
is_valid = True
else:
print("The validation fraction must be in range (0, 1).")
print("Number of Iterations With No Improvement: Number of iterations with no improvement to wait before stopping fitting.")
print("A good starting value could be between 1 and maximum number of iterations, such as 5. The default is 5.")
n_iter_no_change = num_input(SECTION[2], "@Iterations With No Improvement: ")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def kmeans_manual_hyper_parameters() -> Dict:
return hyper_parameters


def plot_silhouette_diagram(data: pd.DataFrame, cluster_labels: pd.DataFrame, cluster_centers_: np.ndarray, n_clusters: int, algorithm_name: str) -> None:
def plot_silhouette_diagram_kmeans(data: pd.DataFrame, cluster_labels: pd.DataFrame, cluster_centers_: np.ndarray, n_clusters: int, algorithm_name: str) -> None:
"""
Draw the silhouette diagram for analysis.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,14 @@ def tsne_manual_hyper_parameters() -> Dict:
print("Please specify the learning rate. A good starting range could be between 10 and 1000, such as 200.")
learning_rate = float_input(200, SECTION[2], "Learning Rate: ")
print("Number of Iterations: This parameter controls how many iterations the optimization will run for.")
print("Please specify the number of iterations. A good starting range could be between 250 and 1000, such as 500.")
n_iter = num_input(SECTION[2], "Number of Iterations: ")
print("Please specify the number of iterations. A good starting range could be between 250 and 1000, such as 500. The minimum is 250.")
is_valid = False
while not is_valid:
n_iter = num_input(SECTION[2], "Number of Iterations: ")
if n_iter >= 250:
is_valid = True
else:
print("Please enter a number greater than or equal to 250.")
print("Early Exaggeration: This parameter controls how tight natural clusters in the original space are in the embedded space and how much space will be between them.")
print("Please specify the early exaggeration. A good starting range could be between 5 and 50, such as 12.")
early_exaggeration = float_input(12, SECTION[2], "Early Exaggeration: ")
Expand Down
1 change: 1 addition & 0 deletions geochemistrypi/data_mining/model/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(self) -> None:
# These two attributes are used for the customized models of FLAML framework
self.customized = False
self.customized_name = None
self.mode = "Regression"

@dispatch(object, object)
def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "geochemistrypi"
version = "0.4.0.dev2"
version = "0.4.0"
authors = [
{ name="Can He", email="[email protected]" },
]
Expand Down

0 comments on commit 559869b

Please sign in to comment.