Skip to content

Commit

Permalink
feat: extract decomposition public functions
Browse files Browse the repository at this point in the history
  • Loading branch information
luzhu-star committed Mar 1, 2024
1 parent f7d0151 commit 6d463c3
Show file tree
Hide file tree
Showing 3 changed files with 175 additions and 0 deletions.
49 changes: 49 additions & 0 deletions geochemistrypi/data_mining/model/decomposition.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-

import os
from typing import Dict, Optional, Union

Expand All @@ -11,6 +12,7 @@
from ..constants import MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH
from ..utils.base import clear_output, save_data, save_fig
from ._base import WorkflowBase
from .func.algo_decomposition._common import plot_2d_scatter_diagram, plot_contour, plot_heatmap
from .func.algo_decomposition._mds import mds_manual_hyper_parameters
from .func.algo_decomposition._pca import biplot, pca_manual_hyper_parameters, triplot
from .func.algo_decomposition._tsne import tsne_manual_hyper_parameters
Expand Down Expand Up @@ -63,6 +65,53 @@ def _reduced_data2pd(self, reduced_data: np.ndarray, components_num: int) -> Non
self.X_reduced = pd.DataFrame(reduced_data)
self.X_reduced.columns = pa_name

@staticmethod
def _plot_2d_scatter_diagram(data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
    """Draw the two-dimensional scatter diagram, then hand the figure and its data to the save helpers.

    Parameters
    ----------
    data : pd.DataFrame
        The table forwarded to ``plot_2d_scatter_diagram`` and to ``save_data``.
    algorithm_name : str
        Name of the algorithm; it is appended to the figure and data titles.
    local_path : str
        Forwarded unchanged to ``save_fig`` and ``save_data``.
    mlflow_path : str
        Forwarded unchanged to ``save_fig`` and ``save_data``.
    """
    print("-----* Decomposition Two-Dimensional Diagram *-----")
    plot_2d_scatter_diagram(data, algorithm_name)

    figure_title = f"Decomposition Two-Dimensional Diagram - {algorithm_name}"
    data_title = f"Decomposition Two-Dimensional Data - {algorithm_name}"
    save_fig(figure_title, local_path, mlflow_path)
    save_data(data, data_title, local_path, mlflow_path)

@staticmethod
def _plot_heatmap(data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
    """Draw the heatmap, then hand the figure and its data to the save helpers.

    Parameters
    ----------
    data : pd.DataFrame
        The table forwarded to ``plot_heatmap`` and to ``save_data``.
    algorithm_name : str
        Name of the algorithm; it is appended to the figure and data titles.
    local_path : str
        Forwarded unchanged to ``save_fig`` and ``save_data``.
    mlflow_path : str
        Forwarded unchanged to ``save_fig`` and ``save_data``.
    """
    print("-----* Decomposition Heatmap *-----")
    plot_heatmap(data, algorithm_name)

    figure_title = f"Decomposition Heatmap - {algorithm_name}"
    data_title = f"Decomposition Heatmap Data - {algorithm_name}"
    save_fig(figure_title, local_path, mlflow_path)
    save_data(data, data_title, local_path, mlflow_path)

@staticmethod
def _plot_contour(data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
    """Draw the contour plot, then hand the figure and its data to the save helpers.

    Parameters
    ----------
    data : pd.DataFrame
        The table forwarded to ``plot_contour`` and to ``save_data``.
    algorithm_name : str
        Name of the algorithm; it is appended to the figure and data titles.
    local_path : str
        Forwarded unchanged to ``save_fig`` and ``save_data``.
    mlflow_path : str
        Forwarded unchanged to ``save_fig`` and ``save_data``.
    """
    print("-----* Dimensionality Reduction Contour Plot *-----")
    plot_contour(data, algorithm_name)

    figure_title = f"Dimensionality Reduction Contour Plot - {algorithm_name}"
    data_title = f"Dimensionality Reduction Contour Plot Data - {algorithm_name}"
    save_fig(figure_title, local_path, mlflow_path)
    save_data(data, data_title, local_path, mlflow_path)

def common_components(self) -> None:
    """Run the plots shared by every decomposition algorithm.

    Calls, in order, the 2D scatter diagram, the heatmap and the contour plot
    helpers, each with the same data, algorithm name and output locations.
    The local output directory is read from the
    ``GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH`` environment variable.
    """
    local_output_path = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")

    # NOTE(review): `self.X` is what gets plotted here, while the plotting
    # helpers describe their input as data after dimensionality reduction.
    # Confirm that this is the intended table.
    shared_plotters = (
        self._plot_2d_scatter_diagram,
        self._plot_heatmap,
        self._plot_contour,
    )
    for plotter in shared_plotters:
        plotter(
            data=self.X,
            algorithm_name=self.naming,
            local_path=local_output_path,
            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
        )


class PCADecomposition(DecompositionWorkflowBase):
"""The automation workflow of using PCA algorithm to make insightful products."""
Expand Down
123 changes: 123 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_decomposition/_common.py
Original file line number Diff line number Diff line change
@@ -1 +1,124 @@
# -*- coding: utf-8 -*-
from itertools import cycle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import kde


def plot_2d_scatter_diagram(data: pd.DataFrame, algorithm_name: str) -> None:
    """
    Plot a 2D scatter diagram for dimensionality reduction results.

    Each row of ``data`` is drawn exactly once, at the position given by its
    first two columns. Rows take their colour and marker from two cycling
    palettes and use their index label as the legend entry.

    Parameters
    ----------
    data : pd.DataFrame (n_samples, n_components)
        Data after dimensionality reduction. Only the first two columns are
        plotted, so at least two columns are required.
    algorithm_name : str
        The name of the dimensionality reduction algorithm, used in the title.
    """
    markers = ["+", "v", ".", "d", "o", "s", "1", "D", "X", "^", "p", "<", "*", "H", "3", "P"]
    colors = [
        "#1f77b4",
        "#ff7f0e",
        "#2ca02c",
        "#d62728",
        "#9467bd",
        "#8c564b",
        "#e377c2",
        "#7f7f7f",
        "#bcbd22",
        "#17becf",
        "#33a02c",
        "#1f77b4",
        "#ff7f0e",
        "#2ca02c",
        "#d62728",
        "#9467bd",
        "#8c564b",
        "#e377c2",
        "#7f7f7f",
        "#bcbd22",
    ]

    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)

    x_values = data.iloc[:, 0].to_numpy()
    y_values = data.iloc[:, 1].to_numpy()

    # Plot the data: one point per sample. The previous loop passed the whole
    # column to every scatter call, so each sample was redrawn n_samples times
    # and every legend entry pointed at all of the points.
    samples = zip(data.index, x_values, y_values, cycle(colors), cycle(markers))
    for label, x_value, y_value, color, marker in samples:
        ax.scatter(x_value, y_value, c=color, marker=marker, label=label)

    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    ax.set_title(f"{algorithm_name} Dimensionality Reduction Results")
    # NOTE(review): the legend has one entry per sample, which gets unwieldy
    # for large data sets. Kept as in the original behaviour.
    ax.legend(loc="upper right")

    ax.grid(True)


def plot_heatmap(data: pd.DataFrame, algorithm_name: str) -> None:
    """
    Draw a heatmap of the given table on a new 10x8 figure.

    Parameters
    ----------
    data : pd.DataFrame (n_samples, n_components)
        Data after dimensionality reduction; passed as-is to ``sns.heatmap``.
    algorithm_name : str
        The name of the dimensionality reduction algorithm, used in the title.
    """
    plt.figure(figsize=(10, 8))

    # sns.heatmap draws on the current axes and returns them, so the labels
    # can be set on that object directly.
    heatmap_axes = sns.heatmap(data, cmap="viridis")
    heatmap_axes.set_title(f"{algorithm_name} Dimensionality Reduction Heatmap")
    heatmap_axes.set_xlabel("Component")
    heatmap_axes.set_ylabel("Sample")


def plot_contour(data: pd.DataFrame, algorithm_name: str) -> None:
    """
    Plot a contour plot for dimensionality reduction results.

    A Gaussian kernel density estimate is fitted to the first two columns of
    ``data`` and evaluated on a 100 x 100 grid spanning their min/max range.
    The density is drawn as filled contours with the samples overlaid as a
    scatter plot.

    Parameters
    ----------
    data : pd.DataFrame (n_samples, n_components)
        Data after dimensionality reduction. Only the first two columns are
        used; their column names become the axis labels.
    algorithm_name : str
        The name of the dimensionality reduction algorithm, used in the title.

    Notes
    -----
    Any error raised by ``gaussian_kde`` for input it cannot fit is propagated
    unchanged.
    """
    # Use the public entry point: the `scipy.stats.kde` namespace is
    # deprecated, so `kde.gaussian_kde` should no longer be relied on.
    from scipy.stats import gaussian_kde

    # Calculate the density
    x = data.iloc[:, 0]
    y = data.iloc[:, 1]
    xmin, xmax = x.min(), x.max()
    ymin, ymax = y.min(), y.max()

    # A complex step (100j) makes mgrid produce 100 evenly spaced points,
    # both ends included.
    xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = np.vstack([xx.ravel(), yy.ravel()])
    values = np.vstack([x, y])
    kernel = gaussian_kde(values)
    f = np.reshape(kernel(positions).T, xx.shape)

    # Plot the contour
    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, f, cmap="viridis", alpha=0.5)
    plt.colorbar(label="Density")
    plt.scatter(x, y, marker="o", color="black", alpha=0.5)
    plt.xlabel(f"{data.columns[0]}")
    plt.ylabel(f"{data.columns[1]}")
    plt.title(f"{algorithm_name} Dimensionality Reduction Contour Plot")
    plt.grid(True)
3 changes: 3 additions & 0 deletions geochemistrypi/data_mining/process/decompose.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ def activate(
# Save the model hyper-parameters
self.dcp_workflow.save_hyper_parameters(hyper_parameters, self.model_name, os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH"))

# Common components for every decomposition algorithm
self.dcp_workflow.common_components()

# special components of different algorithms
self.dcp_workflow.special_components(components_num=hyper_parameters["n_components"], reduced_data=X_reduced)

Expand Down

0 comments on commit 6d463c3

Please sign in to comment.