From b0cd8e89f2056184b031fcc954b4f95af5817e6b Mon Sep 17 00:00:00 2001 From: jmz <45778832+PotatoXi@users.noreply.github.com> Date: Sat, 13 Jan 2024 21:45:24 +0800 Subject: [PATCH] feat: add agglomerative clustering algorithm again. --- README.md | 72 +++++---- geochemistrypi/data_mining/cli_pipeline.py | 4 +- geochemistrypi/data_mining/constants.py | 2 +- .../data_mining/model/clustering.py | 138 +++++++++++++++++- .../func/algo_clustering/_agglomerative.py | 27 ++++ geochemistrypi/data_mining/process/cluster.py | 8 +- 6 files changed, 214 insertions(+), 37 deletions(-) create mode 100644 geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py diff --git a/README.md b/README.md index aba250bd..711db838 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,12 @@

---

-**Documentation**: https://geochemistrypi.readthedocs.io
-**Source Code**: https://github.com/ZJUEarthData/geochemistrypi
-___
+**Documentation**: ``https://geochemistrypi.readthedocs.io``
+
+**Source Code**: ``https://github.com/ZJUEarthData/geochemistrypi``
+
+---

Geochemistry π is an **open-source, highly automated machine learning Python framework** dedicated to building an MLOps level 1 software product for data-driven geochemistry discovery on tabular data.
@@ -24,19 +26,20 @@ Core capabilities are:
+ **Model Inference**

Key features are:
+
+ **Easy to use:** The automation of the data mining process provides users with simple numeric options to choose from.
+ **Extensible:** It allows appending new algorithms through Scikit-learn, with automatic hyperparameter searching by FLAML and Ray.
+ **Traceable**: It integrates MLflow to build a special storage mechanism that streamlines the end-to-end machine learning lifecycle.

Latest Update: follow us by clicking `Starred` and `Watch` on our [GitHub repository](https://github.com/ZJUEarthData/geochemistrypi) to get email notifications of the newest features automatically.

-The following figure is the simplified overview of Geochemistry π:
+The following figure is the simplified overview of Geochemistry π:
+
[Figure: Overview of workflow]
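The **Traceable** feature above refers to MLflow's experiment-tracking API. The snippet below is only a minimal, hypothetical sketch — it is not Geochemistry π's internal code, and the KMeans model on toy data is a stand-in — showing the kind of parameter, metric, and model logging MLflow performs for every run:

```python
# Minimal, hypothetical sketch of MLflow tracking (not Geochemistry π's internal code).
import mlflow
import mlflow.sklearn
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# Toy stand-in for a tabular geochemical dataset.
X, _ = make_blobs(n_samples=300, centers=4, random_state=42)

with mlflow.start_run(run_name="kmeans-demo"):
    n_clusters = 4
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = model.fit_predict(X)

    mlflow.log_param("n_clusters", n_clusters)                           # hyperparameter of the run
    mlflow.log_metric("silhouette_score", silhouette_score(X, labels))   # evaluation metric of the run
    mlflow.sklearn.log_model(model, "model")                             # serialized model artifact
```

Every run logged this way can later be compared in the MLflow web interface, which is the traceability the framework builds on.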
-The following figure is the frontend-backend separation architecture of Geochemistry:
+The following figure is the frontend-backend separation architecture of Geochemistry π:
+
Frontend-backend separation architecture of Geochemistry @@ -45,37 +48,45 @@ The following figure is the frontend-backend separation architecture of Geochemi ## Quick Installation One instruction to download on **command line**, such as Terminal on macOS, Power Shell on Windows. + ``` pip install geochemistrypi ``` + One instruction to download on **Jupyter Notebook** or **Google Colab**. + ``` !pip install geochemistrypi ``` + Check the latest version of our software: + ``` geochemistrypi --version ``` -**Note**: For more detail on installation, please refer to our online documentation in **Installation Manual** under the section of **FOR USER**. Over there, we highly recommend to use virtual environment (Conda) to avoid dependency version problems. +**Note**: For more detail on installation, please refer to our online documentation in **Installation Manual** under the section of **FOR USER**. Over there, we highly recommend to use virtual environment (Conda) to avoid dependency version problems. ## Quick Update One instruction to update the software to the latest version on **command line**, such as Terminal on macOS, Power Shell on Windows. + ``` pip install --upgrade geochemistrypi ``` + One instruction to download on **Jupyter Notebook** or **Google Colab**. + ``` !pip install --upgrade geochemistrypi ``` + Check the latest version of our software: + ``` geochemistrypi --version ``` - - ## Example **How to run:** After successfully downloading, run this instruction on **command line / Jupyter Notebook / Google Colab** whatever directory it is. @@ -83,25 +94,33 @@ geochemistrypi --version ### Case 1: Run with built-in data set for testing On command line: + ``` geochemistrypi data-mining ``` + On Jupyter Notebook / Google Colab: + ``` !geochemistrypi data-mining ``` + **Note**: There are four built-in data sets corresponding to four kinds of model pattern. ### Case 2: Run with your own data set without model inference On command line: + ``` geochemistrypi data-mining --data your_own_data_set.xlsx ``` + On Jupyter Notebook / Google Colab: + ``` !geochemistrypi data-mining --data your_own_data_set.xlsx ``` + **Note**: Currently, `.xlsx` and `.csv` files are supported. Please specify the path your data file exists. For Google Colab, don't forget to upload your dataset first. ### Case 3: Implement model inference on application data @@ -117,11 +136,11 @@ On Jupyter Notebook / Google Colab: ``` !geochemistrypi data-mining --training your_own_training_data.xlsx --inference your_own_inference_data.xlsx ``` + **Note**: Please make sure the column names (data schema) in both training data file and inference data file are the same. Because the operations you perform via our software on the training data will be record automatically and subsequently applied to the inference data in the same order. The training data in our pipeline will be divided into the train set and test set used for training the ML model and evaluating the model's performance. The score includes two types. The first type is the scores from the prediction on the test set while the second type is cv scores from the cross validation on the train set. - ### Case 4: Activate MLflow web interface On command line: @@ -135,6 +154,7 @@ On Jupyter Notebook / Google Colab: ``` !geochemistrypi data-mining --mlflow ``` + **Note**: Once you run our software, there are two folders (`geopi_output` and `geopi_tracking`) generated automatically. 
Make sure the directory where you execute the above command contains the generated `geopi_tracking` folder. Copy the URL shown on the console into any browser to open the MLflow web interface. The URL is normally like this: http://127.0.0.1:5000. Search MLflow online for more operations and usage.
@@ -146,8 +166,6 @@ For more details: Please refer to:
+ [Geochemistry π - Download and Run the Beta Version (International - Youtube)](https://www.youtube.com/watch?v=EeVaJ3H7_AU&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=9)
+ [Geochemistry π - Download and Run the Beta Version (China - Bilibili)](https://www.bilibili.com/video/BV1UM4y1Q7Ju/?spm_id_from=333.999.0.0&vd_source=27944ab3b73a78970c1a52a5dcbb9140)
-
-
## Roadmap

### First Phase
@@ -165,6 +183,7 @@
Its data section provides feature engineering based on **arithmetic operations**. Its models section provides both **supervised learning** and **unsupervised learning** methods from the **Scikit-learn** framework, including four types of algorithms: regression, classification, clustering, and dimensionality reduction. Integrated with the **FLAML** and **Ray** frameworks, it allows the users to run AutoML easily, quickly, and cost-effectively on the built-in supervised learning algorithms in our framework.

The following figure is the hierarchical architecture of Geochemistry π:
+

[Figure: Hierarchical Architecture]

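As a rough illustration of the AutoML integration mentioned above — this is FLAML's public interface on a synthetic regression task, not the package's internal FLAML/Ray wiring — a time-budgeted search looks like this:

```python
# Illustrative only: FLAML's AutoML interface on a synthetic regression task,
# not Geochemistry π's internal FLAML/Ray wiring.
from flaml import AutoML
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

automl = AutoML()
# Search learners and hyperparameters for 30 seconds, optimizing R^2.
automl.fit(X_train, y_train, task="regression", time_budget=30, metric="r2")

print(automl.best_estimator)                      # name of the best learner found
print(r2_score(y_test, automl.predict(X_test)))   # hold-out performance
```

The time budget is the key design lever: the search stops when the budget is spent, which is what makes AutoML runs cheap and predictable for users.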
@@ -173,23 +192,22 @@ The following figure is the hierarchical architecture of Geochemistry π:
### Second Phase

Currently, we are building three access ways to provide a more user-friendly service, including a **web portal**, a **CLI package**, and an **API**. It allows the user to perform **continuous training** and **model inference** by automating the ML pipeline, and **machine learning lifecycle management** through a unique storage mechanism in the different access layers.

-The following figure is the system architecture diagram:
+The following figure is the system architecture diagram:
+
![System Architecture Diagram](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/20b5a2a4-f2de-492d-a2df-9282196d8c4f)

-The following figure is the customized automated ML pipeline:
+The following figure is the customized automated ML pipeline:
+
[Figure: Customized automated ML pipeline]
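A minimal sketch of the idea behind continuous training and model inference (this is not the actual Geochemistry π implementation; the column names and values are hypothetical): the preprocessing steps fitted on the training data are persisted and then replayed, unchanged, on the application data.

```python
# Illustrative sketch only (not the actual Geochemistry π pipeline mechanism).
import joblib
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Hypothetical training and application data sharing the same schema.
training_df = pd.DataFrame({"SiO2": [45.1, 52.3, 48.7, 61.2], "MgO": [8.2, 4.1, 6.5, 2.3]})
application_df = pd.DataFrame({"SiO2": [50.0, 47.2], "MgO": [5.0, 7.1]})

pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="mean")),   # recorded preprocessing step
    ("scale", StandardScaler()),                  # recorded preprocessing step
    ("model", KMeans(n_clusters=2, n_init=10, random_state=42)),
])
pipeline.fit(training_df)                     # fitted on the training data only
joblib.dump(pipeline, "pipeline.joblib")      # persisted for later inference

reloaded = joblib.load("pipeline.joblib")
print(reloaded.predict(application_df))       # same steps replayed on application data
```

This is also why the README insists that the training and inference files share the same column names: a recorded pipeline can only be replayed on data with an identical schema.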
-The following figure is the design pattern hierarchical architecture:
+The following figure is the design pattern hierarchical architecture:
+
![Design Pattern](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/aa84ab12-c95e-4282-a60e-64ba2858c437)
![Workflow Object](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/f08885bf-1bec-4045-bf6b-82c5c18d3f8f)

-The following figure is the storage mechanism:
+The following figure is the storage mechanism:
+
[Figure: Storage Mechanism]
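As a rough illustration of the storage idea — this is generic MLflow behaviour, not the exact layout of `geopi_tracking` — MLflow can write its tracking data to a local folder that the web interface later reads:

```python
# Rough illustration of a file-based MLflow tracking store (not the exact geopi layout).
import mlflow

mlflow.set_tracking_uri("file:./geopi_tracking")   # runs are written under this local folder
mlflow.set_experiment("demo-experiment")

with mlflow.start_run():
    mlflow.log_param("algorithm", "KMeans")
    mlflow.log_metric("silhouette_score", 0.42)

# The logged runs can be read back programmatically as a DataFrame,
# or browsed in the web UI (typically: mlflow ui --backend-store-uri file:./geopi_tracking).
print(mlflow.search_runs())
```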
The whole package is under construction and the documentation is progressively evolving. - - ## Team Info + **Leader:** + + Can He (Sany, National University of Singapore, Singapore) Email: sanyhew1097618435@163.com **Technical Group:** -+ Jianming Zhao (Jamie, Jilin University, Changchun, China) + ++ Jianming Zhao (Jamie, Zhejiang University, China) + Jianhao Sun (Jin, China University of Geosciences, Wuhan, China) + Kaixin Zheng (Hayne, Sun Yat-sen University, China) + Jianing Wang (National University of Singapore, Singapore) @@ -215,6 +234,7 @@ The whole package is under construction and the documentation is progressively e + Chengtu Li(Trenki, Henan Polytechnic University, Beijing, China) **Product Group**: + + Yang Lyu (Daisy, Zhejiang University, China) + Wenyu Zhao (Molly, Zhejiang University, China) + Keran Li (Kirk, Chengdu University of Technology, China) @@ -225,8 +245,6 @@ The whole package is under construction and the documentation is progressively e + Zhenglin Xu (Garry, Jilin University, China) + Junchi Liao(Roceda, University of Electronic Science and Technology of China, China) - - ## Join Us :) **The recruitment of research interns is ongoing !!!** @@ -234,6 +252,7 @@ The whole package is under construction and the documentation is progressively e **Key Point: All things are done online, remote work (\*^▽^\*)** **What can you learn?** + + Learning the full cycle of data mining (Scikit-learn, Ray, Mlflow) on tabular data, including the algorithms in regression,classification, clustering, and decomposition. + Learning to be a qualified Python developer, including any Python programing contents towards data mining, basic software engineering techniques like frontend (React, Typescript, Ant Design scaffold) and backend (SQL & NoSQL database, RESFful API, FastAPI) development, and cooperation tools like Git. @@ -245,6 +264,7 @@ The whole package is under construction and the documentation is progressively e + Bonus depending on your performance. **Current Working Pattern:** + + Online working and cooperation + Three weeks per working cycle -> One online meeting per working cycle + One cycle report (see below) per cycle - 5 mins to finish @@ -259,11 +279,10 @@ Chinese Page: https://person.zju.edu.cn/zhangzhou#0 **Do you want to contribute to this open-source program?** Contact with your CV: sanyhew1097618435@163.com - - ## In-house Materials Materials are in both Chinese and English. Others unshown below are internal materials. + 1. [Guideline Manual – Geochemistry π (International - Google drive)](https://docs.google.com/document/d/1LjwB5Lazk33E5vbtnFPJio_MyjYQxjEu/edit?usp=sharing&ouid=110717816678586054594&rtpof=true&sd=true) 2. [Guideline Manual – Geochemistry π (China - Tencent Docs)](https://docs.qq.com/doc/DQ21IZUdVQktqRWpm?&u=6868f96d4a384b309036e04e637e367a) 3. [Learning Steps for Newbies – Geochemistry π (International - Google drive)](https://docs.google.com/document/d/1GQO-SXwEx_8midr362pqfxNZtfUf-nA6/edit?usp=sharing&ouid=110717816678586054594&rtpof=true&sd=true) @@ -277,15 +296,14 @@ Materials are in both Chinese and English. Others unshown below are internal mat Technical record videos are on Bilibili and Youtube synchronously while other meeting videos are internal materials. More Videos will be recorded soon. + 1. [ZJU_Earth_Data Introduction (Geochemical Data, Python, Geochemistry π) - Prof. Zhang](https://www.bilibili.com/video/BV1Lf4y1w7EK?spm_id_from=333.999.0.0) 2. 
[How to Collaborate and Provide Bug Report on Geochemistry π Through GitHub - Can He (Sany)](https://www.youtube.com/watch?v=1DWoEsqsfvQ&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=3) 3. [Geochemistry π - Download and Run the Beta Version](https://www.youtube.com/watch?v=EeVaJ3H7_AU&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=9) 4. [How to Create and Use Virtual Environment on Geochemistry π - Can He (Sany)](https://www.youtube.com/watch?v=4KFi7OXxD-c&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=4) 5. [How to use Github-Desktop in conflict resolution - Qiuhao Zhao (Brad)](https://www.youtube.com/watch?v=KT1g5JpuUVI&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM) -6. [Virtual Environment & Packages On Windows - Jianming Zhao (Jamie)](https://www.youtube.com/watch?v=e4VqSBuNp_o&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=2) -7. [Git Workflow & Coordinating Synchronization - Jianming Zhao (Jamie)](https://www.bilibili.com/video/BV1Sa4y1f74k?spm_id_from=333.999.0.0&vd_source=9adcf2c5fdeffe1d11c89d441ef598ba) - - +6. [Virtual Environment & Packages On Windows - Jianming Zhao (Jamie)](https://www.youtube.com/watch?v=e4VqSBuNp_o&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=2) +7. [Git Workflow & Coordinating Synchronization - Jianming Zhao (Jamie)](https://www.bilibili.com/video/BV1Sa4y1f74k?spm_id_from=333.999.0.0&vd_source=9adcf2c5fdeffe1d11c89d441ef598ba) ## Contributors diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py index 1859ce73..66b3ad71 100644 --- a/geochemistrypi/data_mining/cli_pipeline.py +++ b/geochemistrypi/data_mining/cli_pipeline.py @@ -63,8 +63,8 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = """ # Local test: Uncomment the following line to utilize built-in datasets to test the pipeline. Don't forget to modify the path value to be consistent with your own location. - training_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx" - application_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx" + # training_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx" + # application_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx" # Local test: If the argument is False, hide all Python level warnings. Developers can turn it on by setting the argument to True. 
show_warning(False) diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py index 8686c72f..6ccc7a23 100644 --- a/geochemistrypi/data_mining/constants.py +++ b/geochemistrypi/data_mining/constants.py @@ -64,7 +64,7 @@ # "Decision Tree", # Histogram-based Gradient Boosting, ] -CLUSTERING_MODELS = ["KMeans", "DBSCAN"] +CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative"] DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"] # The model can deal with missing values diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py index 99098c6b..fbbe8afa 100644 --- a/geochemistrypi/data_mining/model/clustering.py +++ b/geochemistrypi/data_mining/model/clustering.py @@ -6,12 +6,14 @@ import mlflow import numpy as np import pandas as pd +from numpy.typing import ArrayLike from rich import print -from sklearn.cluster import DBSCAN, AffinityPropagation, KMeans +from sklearn.cluster import DBSCAN, AffinityPropagation, AgglomerativeClustering, KMeans from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH from ..utils.base import clear_output, save_data, save_fig, save_text from ._base import WorkflowBase +from .func.algo_clustering._agglomerative import agglomerative_manual_hyper_parameters from .func.algo_clustering._common import plot_silhouette_diagram, plot_silhouette_value_diagram, scatter2d, scatter3d, score from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters @@ -420,6 +422,135 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None: """Invoke all special application functions for this algorithms by Scikit-learn framework.""" +class Agglomerative(ClusteringWorkflowBase): + """The automation workflow of using Agglomerative Clustering to make insightful products.""" + + name = "Agglomerative" + special_function = [] + + def __init__( + self, + n_clusters: int = 2, + *, + affinity: str = "euclidean", + memory: str = None, + connectivity: ArrayLike = None, + compute_full_tree: str = "auto", + linkage: str = "ward", + distance_threshold: float = None, + compute_distances: bool = False, + ) -> None: + """ + Parameters + ---------- + n_clusters : int or None, default=2 + The number of clusters to find. It must be ``None`` if + ``distance_threshold`` is not ``None``. + + affinity : str or callable, default='euclidean' + Metric used to compute the linkage. Can be "euclidean", "l1", "l2", + "manhattan", "cosine", or "precomputed". + If linkage is "ward", only "euclidean" is accepted. + If "precomputed", a distance matrix (instead of a similarity matrix) + is needed as input for the fit method. + + memory : str or object with the joblib.Memory interface, default=None + Used to cache the output of the computation of the tree. + By default, no caching is done. If a string is given, it is the + path to the caching directory. + + connectivity : array-like or callable, default=None + Connectivity matrix. Defines for each sample the neighboring + samples following a given structure of the data. + This can be a connectivity matrix itself or a callable that transforms + the data into a connectivity matrix, such as derived from + `kneighbors_graph`. Default is ``None``, i.e, the + hierarchical clustering algorithm is unstructured. + + compute_full_tree : 'auto' or bool, default='auto' + Stop early the construction of the tree at ``n_clusters``. 
This is + useful to decrease computation time if the number of clusters is not + small compared to the number of samples. This option is useful only + when specifying a connectivity matrix. Note also that when varying the + number of clusters and using caching, it may be advantageous to compute + the full tree. It must be ``True`` if ``distance_threshold`` is not + ``None``. By default `compute_full_tree` is "auto", which is equivalent + to `True` when `distance_threshold` is not `None` or that `n_clusters` + is inferior to the maximum between 100 or `0.02 * n_samples`. + Otherwise, "auto" is equivalent to `False`. + + linkage : {'ward', 'complete', 'average', 'single'}, default='ward' + Which linkage criterion to use. The linkage criterion determines which + distance to use between sets of observation. The algorithm will merge + the pairs of cluster that minimize this criterion. + + - 'ward' minimizes the variance of the clusters being merged. + - 'average' uses the average of the distances of each observation of + the two sets. + - 'complete' or 'maximum' linkage uses the maximum distances between + all observations of the two sets. + - 'single' uses the minimum of the distances between all observations + of the two sets. + + .. versionadded:: 0.20 + Added the 'single' option + + distance_threshold : float, default=None + The linkage distance threshold above which, clusters will not be + merged. If not ``None``, ``n_clusters`` must be ``None`` and + ``compute_full_tree`` must be ``True``. + + .. versionadded:: 0.21 + + compute_distances : bool, default=False + Computes distances between clusters even if `distance_threshold` is not + used. This can be used to make dendrogram visualization, but introduces + a computational and memory overhead. + + .. 
versionadded:: 0.24 + + References + ---------- + sklearn.cluster.AgglomerativeClustering + https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html + """ + + super().__init__() + self.n_clusters = n_clusters + self.affinity = affinity + self.distance_threshold = distance_threshold + self.memory = memory + self.connectivity = connectivity + self.compute_full_tree = compute_full_tree + self.linkage = linkage + self.compute_distances = compute_distances + + self.model = AgglomerativeClustering( + n_clusters=self.n_clusters, + affinity=self.affinity, + memory=self.memory, + connectivity=self.connectivity, + compute_full_tree=self.compute_full_tree, + linkage=self.linkage, + distance_threshold=self.distance_threshold, + compute_distances=self.compute_distances, + ) + + self.naming = Agglomerative.name + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = agglomerative_manual_hyper_parameters() + clear_output() + return hyper_parameters + + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithms by Scikit-learn framework.""" + pass + + class AffinityPropagationClustering(ClusteringWorkflowBase): name = "AffinityPropagation" @@ -479,11 +610,6 @@ class WardHierarchicalClustering(ClusteringWorkflowBase): pass -class AgglomerativeClustering(ClusteringWorkflowBase): - name = "Agglomerative" - pass - - class OPTICSClustering(ClusteringWorkflowBase): name = "OPTICS" pass diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py b/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py new file mode 100644 index 00000000..f847cdb3 --- /dev/null +++ b/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py @@ -0,0 +1,27 @@ +from typing import Dict + +from rich import print + +from ....constants import SECTION +from ....data.data_readiness import num_input, str_input + + +def agglomerative_manual_hyper_parameters() -> Dict: + """Manually set hyperparameters. + + Returns + ------- + hyper_parameters : dict + """ + print("N Clusters: The number of clusters to form as well as the number of centroids to generate.") + print("Please specify the number of clusters for agglomerative. A good starting range could be between 2 and 10, such as '4'.") + n_clusters = num_input(SECTION[2], "N Clusters: ") + print("linkage: The linkage criterion determines which distance to use between sets of observation. ") + print("Please specify the linkage criterion. 
It is generally recommended to leave it set to 'ward'.") + linkages = ["ward", "complete", "average", "single"] + linkage = str_input(linkages, SECTION[2]) + hyper_parameters = { + "n_clusters": n_clusters, + "linkage": linkage, + } + return hyper_parameters diff --git a/geochemistrypi/data_mining/process/cluster.py b/geochemistrypi/data_mining/process/cluster.py index e7a38ed5..03686770 100644 --- a/geochemistrypi/data_mining/process/cluster.py +++ b/geochemistrypi/data_mining/process/cluster.py @@ -4,7 +4,7 @@ import pandas as pd -from ..model.clustering import ClusteringWorkflowBase, DBSCANClustering, KMeansClustering +from ..model.clustering import Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering from ._base import ModelSelectionBase @@ -48,6 +48,12 @@ def activate( leaf_size=hyper_parameters["leaf_size"], p=hyper_parameters["p"], ) + elif self.model_name == "Agglomerative": + hyper_parameters = Agglomerative.manual_hyper_parameters() + self.clt_workflow = Agglomerative( + n_clusters=hyper_parameters["n_clusters"], + linkage=hyper_parameters["linkage"], + ) elif self.model_name == "": pass
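For readers who want to see what the new workflow boils down to outside the class hierarchy, here is a standalone sketch: `process/cluster.py` feeds the two manually chosen hyperparameters (`n_clusters`, `linkage`) into `Agglomerative`, which wraps `sklearn.cluster.AgglomerativeClustering`. The toy data below is a placeholder, not part of the package.

```python
# Standalone sketch of what the new Agglomerative workflow ultimately runs:
# scikit-learn's AgglomerativeClustering configured with the two manually
# chosen hyperparameters (n_clusters, linkage). Toy data is a placeholder.
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=200, centers=4, random_state=42)
data = pd.DataFrame(X, columns=["feature_1", "feature_2"])

hyper_parameters = {"n_clusters": 4, "linkage": "ward"}   # what the CLI prompts collect
model = AgglomerativeClustering(
    n_clusters=hyper_parameters["n_clusters"],
    linkage=hyper_parameters["linkage"],
)
labels = model.fit_predict(data)

# The clustering workflow base class scores the fitted model; silhouette is one such metric.
print("silhouette:", silhouette_score(data, labels))
print(pd.Series(labels).value_counts())
```

Note that with the default `linkage="ward"` only Euclidean distances are supported, which matches the `affinity="euclidean"` default exposed by the new class.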