From 6d463c3b0c98d9ee36a9834d875f70ee504e9fe7 Mon Sep 17 00:00:00 2001
From: YmY
Date: Fri, 1 Mar 2024 11:45:29 +0800
Subject: [PATCH] feat: extract decomposition public functions

---
 .../data_mining/model/decomposition.py        |  49 ++++++++
 .../model/func/algo_decomposition/_common.py  | 117 ++++++++++++++++++
 .../data_mining/process/decompose.py          |   3 +
 3 files changed, 169 insertions(+)

diff --git a/geochemistrypi/data_mining/model/decomposition.py b/geochemistrypi/data_mining/model/decomposition.py
index ab954230..f204f81b 100644
--- a/geochemistrypi/data_mining/model/decomposition.py
+++ b/geochemistrypi/data_mining/model/decomposition.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+
 import os
 from typing import Dict, Optional, Union
 
@@ -11,6 +12,7 @@
 from ..constants import MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH
 from ..utils.base import clear_output, save_data, save_fig
 from ._base import WorkflowBase
+from .func.algo_decomposition._common import plot_2d_scatter_diagram, plot_contour, plot_heatmap
 from .func.algo_decomposition._mds import mds_manual_hyper_parameters
 from .func.algo_decomposition._pca import biplot, pca_manual_hyper_parameters, triplot
 from .func.algo_decomposition._tsne import tsne_manual_hyper_parameters
@@ -63,6 +65,53 @@ def _reduced_data2pd(self, reduced_data: np.ndarray, components_num: int) -> Non
         self.X_reduced = pd.DataFrame(reduced_data)
         self.X_reduced.columns = pa_name
 
+    @staticmethod
+    def _plot_2d_scatter_diagram(data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+        """Plot the two-dimensional diagram of the decomposition result."""
+        print("-----* Decomposition Two-Dimensional Diagram *-----")
+        plot_2d_scatter_diagram(data, algorithm_name)
+        save_fig(f"Decomposition Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
+        save_data(data, f"Decomposition Two-Dimensional Data - {algorithm_name}", local_path, mlflow_path)
+
+    @staticmethod
+    def _plot_heatmap(data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+        """Plot a heatmap for the decomposition result."""
+        print("-----* Decomposition Heatmap *-----")
+        plot_heatmap(data, algorithm_name)
+        save_fig(f"Decomposition Heatmap - {algorithm_name}", local_path, mlflow_path)
+        save_data(data, f"Decomposition Heatmap Data - {algorithm_name}", local_path, mlflow_path)
+
+    @staticmethod
+    def _plot_contour(data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+        """Plot a contour plot for dimensionality reduction results."""
+        print("-----* Dimensionality Reduction Contour Plot *-----")
+        plot_contour(data, algorithm_name)
+        save_fig(f"Dimensionality Reduction Contour Plot - {algorithm_name}", local_path, mlflow_path)
+        save_data(data, f"Dimensionality Reduction Contour Plot Data - {algorithm_name}", local_path, mlflow_path)
+
+    def common_components(self) -> None:
+        """Invoke all common application functions for decomposition algorithms by Scikit-learn framework."""
+
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        self._plot_2d_scatter_diagram(
+            data=self.X,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+        )
+        self._plot_heatmap(
+            data=self.X,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+        )
+        self._plot_contour(
+            data=self.X,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+        )
+
 
 class PCADecomposition(DecompositionWorkflowBase):
     """The automation workflow of using PCA algorithm to make insightful products."""
diff --git a/geochemistrypi/data_mining/model/func/algo_decomposition/_common.py b/geochemistrypi/data_mining/model/func/algo_decomposition/_common.py
index 40a96afc..519cb1d0 100644
--- a/geochemistrypi/data_mining/model/func/algo_decomposition/_common.py
+++ b/geochemistrypi/data_mining/model/func/algo_decomposition/_common.py
@@ -1 +1,118 @@
 # -*- coding: utf-8 -*-
+from itertools import cycle
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from scipy.stats import gaussian_kde
+
+
+def plot_2d_scatter_diagram(data: pd.DataFrame, algorithm_name: str) -> None:
+    """
+    Plot a 2D scatter diagram for dimensionality reduction results.
+
+    Every sample is drawn once, with its own marker/color pair and a legend entry named after its index label.
+
+    Parameters
+    ----------
+    data : pd.DataFrame (n_samples, n_components)
+        Data after dimensionality reduction. Only the first two columns are plotted.
+
+    algorithm_name : str
+        The name of the dimensionality reduction algorithm.
+    """
+    markers = ["+", "v", ".", "d", "o", "s", "1", "D", "X", "^", "p", "<", "*", "H", "3", "P"]
+    colors = [
+        "#1f77b4",
+        "#ff7f0e",
+        "#2ca02c",
+        "#d62728",
+        "#9467bd",
+        "#8c564b",
+        "#e377c2",
+        "#7f7f7f",
+        "#bcbd22",
+        "#17becf",
+        "#33a02c",
+        "#1f77b4",
+        "#ff7f0e",
+        "#2ca02c",
+        "#d62728",
+        "#9467bd",
+        "#8c564b",
+        "#e377c2",
+        "#7f7f7f",
+        "#bcbd22",
+    ]
+
+    marker_cycle = cycle(markers)
+    color_cycle = cycle(colors)
+
+    fig = plt.figure(figsize=(10, 8))
+    ax = fig.add_subplot(111)
+
+    # Draw each sample exactly once; zip stops at the last sample while the style cycles repeat as needed
+    for label, x, y, color, marker in zip(data.index, data.iloc[:, 0], data.iloc[:, 1], color_cycle, marker_cycle):
+        ax.scatter(x, y, color=color, marker=marker, label=label)
+
+    ax.set_xlabel("Component 1")
+    ax.set_ylabel("Component 2")
+    ax.set_title(f"{algorithm_name} Dimensionality Reduction Results")
+    ax.legend(loc="upper right")
+
+    plt.grid(True)
+
+
+def plot_heatmap(data: pd.DataFrame, algorithm_name: str) -> None:
+    """
+    Plot a heatmap for dimensionality reduction results.
+
+    Parameters
+    ----------
+    data : pd.DataFrame (n_samples, n_components)
+        Data after dimensionality reduction.
+
+    algorithm_name : str
+        The name of the dimensionality reduction algorithm.
+    """
+    plt.figure(figsize=(10, 8))
+    sns.heatmap(data, cmap="viridis")
+    plt.title(f"{algorithm_name} Dimensionality Reduction Heatmap")
+    plt.xlabel("Component")
+    plt.ylabel("Sample")
+
+
+def plot_contour(data: pd.DataFrame, algorithm_name: str) -> None:
+    """
+    Plot a contour plot for dimensionality reduction results.
+
+    Parameters
+    ----------
+    data : pd.DataFrame (n_samples, n_components)
+        Data after dimensionality reduction.
+
+    algorithm_name : str
+        The name of the dimensionality reduction algorithm.
+    """
+    # Calculate the density
+    x = data.iloc[:, 0]
+    y = data.iloc[:, 1]
+    xmin, xmax = x.min(), x.max()
+    ymin, ymax = y.min(), y.max()
+
+    xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
+    positions = np.vstack([xx.ravel(), yy.ravel()])
+    values = np.vstack([x, y])
+    kernel = gaussian_kde(values)
+    f = np.reshape(kernel(positions).T, xx.shape)
+
+    # Plot the contour
+    plt.figure(figsize=(10, 8))
+    plt.contourf(xx, yy, f, cmap="viridis", alpha=0.5)
+    plt.colorbar(label="Density")
+    plt.scatter(x, y, marker="o", color="black", alpha=0.5)
+    plt.xlabel(f"{data.columns[0]}")
+    plt.ylabel(f"{data.columns[1]}")
+    plt.title(f"{algorithm_name} Dimensionality Reduction Contour Plot")
+    plt.grid(True)
diff --git a/geochemistrypi/data_mining/process/decompose.py b/geochemistrypi/data_mining/process/decompose.py
index d1e7922b..91cb1a17 100644
--- a/geochemistrypi/data_mining/process/decompose.py
+++ b/geochemistrypi/data_mining/process/decompose.py
@@ -61,6 +61,9 @@ def activate(
         # Save the model hyper-parameters
         self.dcp_workflow.save_hyper_parameters(hyper_parameters, self.model_name, os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH"))
 
+        # Common components for every decomposition algorithm
+        self.dcp_workflow.common_components()
+
         # special components of different algorithms
         self.dcp_workflow.special_components(components_num=hyper_parameters["n_components"], reduced_data=X_reduced)
 