From 0666ea3c73a96c8c41a079a2ed01852668e29476 Mon Sep 17 00:00:00 2001 From: sanyhe Date: Sat, 29 Jun 2024 17:42:40 +0800 Subject: [PATCH 01/12] docs: update v0.6.0 progress in changelog. --- README.md | 5 +++-- docs/source/Home/CHANGELOG.md | 9 +++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9ac5b200..227fedbe 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ pip install geochemistrypi Download the latest version to avoid some old version issues, such as dependency downloading. ``` -pip install "geochemistrypi==0.5.0" +pip install "geochemistrypi==0.6.0" ``` One instruction to download on **Jupyter Notebook** or **Google Colab**. @@ -81,7 +81,7 @@ One instruction to download on **Jupyter Notebook** or **Google Colab**. ``` Download the latest version to avoid some old version issues, such as dependency downloading. ``` -!pip install "geochemistrypi==0.5.0" +!pip install "geochemistrypi==0.6.0" ``` Check the downloaded version of our software: @@ -316,6 +316,7 @@ The whole package is under construction and the documentation is progressively e + Zhelan Lin(Lan, Fuzhou University, China) + ShuYi Li (Communication University Of China, Beijing, China) + Junbo Wang (China University Of Geosciences, Beijing, China) ++ Haibin Wang(Watson, University of Sydney, Australia) ## Join Us :) diff --git a/docs/source/Home/CHANGELOG.md b/docs/source/Home/CHANGELOG.md index dece2df9..314ce913 100644 --- a/docs/source/Home/CHANGELOG.md +++ b/docs/source/Home/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] + MLOps core of continuous training in web interface ++ More new algorithms and new processing techniques ## [0.6.0] - 2024-06-02 @@ -25,15 +26,23 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), + New Mode: + Abnormal Detection + Isolation Forest ++ Docs: + + Mind map of all options in README + + Citation info + + Abnormal detection algorithm example ### Changed + Showing formula function for linear models in both regression and classifiction in terms of the number of the target values' type + Built-in inferenc data only for regression and classification ++ Docs: + + Installation manual + + Clustering algorithm example ### Fixed + Invalid YAML file when launching MLflow interface ++ Online docs layout mismatch ## [0.5.0] - 2024-01-14 From fcbedd52d896ee682d26818c736b194b13612309 Mon Sep 17 00:00:00 2001 From: sanyhe Date: Thu, 4 Jul 2024 21:53:39 +0800 Subject: [PATCH 02/12] feat: add precision-recall curve. 
--- .../data_mining/model/classification.py | 73 ++++++++++++++----- .../model/func/algo_classification/_common.py | 54 +++++++++++++- .../model/func/algo_classification/_enum.py | 14 ++++ geochemistrypi/data_mining/process/cluster.py | 1 + 4 files changed, 119 insertions(+), 23 deletions(-) create mode 100644 geochemistrypi/data_mining/model/func/algo_classification/_enum.py diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py index 2b4e1fd0..b81edd66 100644 --- a/geochemistrypi/data_mining/model/classification.py +++ b/geochemistrypi/data_mining/model/classification.py @@ -23,8 +23,19 @@ from ..plot.statistic_plot import basic_statistic from ..utils.base import clear_output, save_data, save_fig, save_text from ._base import LinearWorkflowMixin, TreeWorkflowMixin, WorkflowBase -from .func.algo_classification._common import cross_validation, plot_2d_decision_boundary, plot_confusion_matrix, plot_precision_recall, plot_ROC, resampler, reset_label, score +from .func.algo_classification._common import ( + cross_validation, + plot_2d_decision_boundary, + plot_confusion_matrix, + plot_precision_recall, + plot_precision_recall_threshold, + plot_ROC, + resampler, + reset_label, + score, +) from .func.algo_classification._decision_tree import decision_tree_manual_hyper_parameters +from .func.algo_classification._enum import ClassificationCommonFunction from .func.algo_classification._extra_trees import extra_trees_manual_hyper_parameters from .func.algo_classification._gradient_boosting import gradient_boosting_manual_hyper_parameters from .func.algo_classification._knn import knn_manual_hyper_parameters @@ -39,17 +50,7 @@ class ClassificationWorkflowBase(WorkflowBase): """The base workflow class of classification algorithms.""" - common_function = [ - "Model Score", - "Confusion Matrix", - "Cross Validation", - "Model Prediction", - "Model Persistence", - "Precision Recall Curve", - "ROC Curve", - "Two-dimensional Decision Boundary Diagram", - "Permutation Importance Diagram", - ] + common_function = [func.value for func in ClassificationCommonFunction] def __init__(self) -> None: super().__init__() @@ -163,18 +164,30 @@ def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, t save_data(data, f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path, True) @staticmethod - def _plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None: - print("-----* Precision Recall Curve *-----") - y_probs, precisions, recalls, thresholds = plot_precision_recall(X_test, y_test, trained_model, algorithm_name) - save_fig(f"Precision Recall Curve - {algorithm_name}", local_path, mlflow_path) + def _plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + print(f"-----* {graph_name} *-----") + y_probs, precisions, recalls, thresholds = plot_precision_recall(X_test, y_test, trained_model, graph_name, algorithm_name) + save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) y_probs = pd.DataFrame(y_probs, columns=["Probabilities"]) precisions = pd.DataFrame(precisions, columns=["Precisions"]) recalls = pd.DataFrame(recalls, columns=["Recalls"]) thresholds = pd.DataFrame(thresholds, columns=["Thresholds"]) - save_data(y_probs, "Precision Recall Curve - Probabilities", local_path, mlflow_path) - 
save_data(precisions, "Precision Recall Curve - Precisions", local_path, mlflow_path) - save_data(recalls, "Precision Recall Curve - Recalls", local_path, mlflow_path) - save_data(thresholds, "Precision Recall Curve - Thresholds", local_path, mlflow_path) + save_data(precisions, f"{graph_name} - Precisions", local_path, mlflow_path) + save_data(recalls, f"{graph_name} - Recalls", local_path, mlflow_path) + + @staticmethod + def _plot_precision_recall_threshold(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + print(f"-----* {graph_name} *-----") + y_probs, precisions, recalls, thresholds = plot_precision_recall_threshold(X_test, y_test, trained_model, graph_name, algorithm_name) + save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) + y_probs = pd.DataFrame(y_probs, columns=["Probabilities"]) + precisions = pd.DataFrame(precisions, columns=["Precisions"]) + recalls = pd.DataFrame(recalls, columns=["Recalls"]) + thresholds = pd.DataFrame(thresholds, columns=["Thresholds"]) + save_data(y_probs, f"{graph_name} - Probabilities", local_path, mlflow_path) + save_data(precisions, f"{graph_name} - Precisions", local_path, mlflow_path) + save_data(recalls, f"{graph_name} - Recalls", local_path, mlflow_path) + save_data(thresholds, f"{graph_name} - Thresholds", local_path, mlflow_path) @staticmethod def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None: @@ -285,6 +298,16 @@ def common_components(self) -> None: X_test=ClassificationWorkflowBase.X_test, y_test=ClassificationWorkflowBase.y_test, trained_model=self.model, + graph_name=ClassificationCommonFunction.PRECISION_RECALL_CURVE.value, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + ) + self._plot_precision_recall_threshold( + X_test=ClassificationWorkflowBase.X_test, + y_test=ClassificationWorkflowBase.y_test, + trained_model=self.model, + graph_name=ClassificationCommonFunction.PRECISION_RECALL_THRESHOLD_DIAGRAM.value, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -356,6 +379,16 @@ def common_components(self, is_automl: bool) -> None: X_test=ClassificationWorkflowBase.X_test, y_test=ClassificationWorkflowBase.y_test, trained_model=self.auto_model, + graph_name=ClassificationCommonFunction.PRECISION_RECALL_CURVE.value, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + ) + self._plot_precision_recall_threshold( + X_test=ClassificationWorkflowBase.X_test, + y_test=ClassificationWorkflowBase.y_test, + trained_model=self.auto_model, + graph_name=ClassificationCommonFunction.PRECISION_RECALL_THRESHOLD_DIAGRAM.value, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_common.py b/geochemistrypi/data_mining/model/func/algo_classification/_common.py index 94d05535..8d2058cb 100644 --- a/geochemistrypi/data_mining/model/func/algo_classification/_common.py +++ b/geochemistrypi/data_mining/model/func/algo_classification/_common.py @@ -196,8 +196,8 @@ def cross_validation(trained_model: object, X_train: 
pd.DataFrame, y_train: pd.D return scores_result -def plot_precision_recall(X_test, y_test, trained_model: object, algorithm_name: str) -> tuple: - """Plot the precision-recall curve. +def plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str) -> tuple: + """Plot the precision vs. recall diagram. Parameters ---------- @@ -210,6 +210,54 @@ def plot_precision_recall(X_test, y_test, trained_model: object, algorithm_name: trained_model : object The model trained. + graph_name : str + The name of the graph. + + algorithm_name : str + The name of the algorithm. + + Returns + ------- + y_probs : np.ndarray + The probabilities of the model. + + precisions : np.ndarray + The precision of the model. + + recalls : np.ndarray + The recall of the model. + + thresholds : np.ndarray + The thresholds of the model. + """ + # Predict probabilities for the positive class + y_probs = trained_model.predict_proba(X_test)[:, 1] + precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs) + plt.figure() + plt.plot(recalls, precisions, "b-") + plt.xlabel("Recall") + plt.ylabel("Precision") + plt.title(f"{graph_name} - {algorithm_name}") + return y_probs, precisions, recalls, thresholds + + +def plot_precision_recall_threshold(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str) -> tuple: + """Plot the precision-recall vs. threshold diagram. + + Parameters + ---------- + X_test : pd.DataFrame (n_samples, n_components) + The testing feature data. + + y_test : pd.DataFrame (n_samples, n_components) + The testing target values. + + trained_model : object + The model trained. + + graph_name : str + The name of the graph. + algorithm_name : str The name of the algorithm. @@ -234,7 +282,7 @@ def plot_precision_recall(X_test, y_test, trained_model: object, algorithm_name: plt.plot(thresholds, precisions[:-1], "b--", label="Precision") plt.plot(thresholds, recalls[:-1], "g-", label="Recall") plt.legend(labels=["Precision", "Recall"], loc="best") - plt.title(f"Precision Recall Curve - {algorithm_name}") + plt.title(f"{graph_name} - {algorithm_name}") return y_probs, precisions, recalls, thresholds diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_enum.py b/geochemistrypi/data_mining/model/func/algo_classification/_enum.py new file mode 100644 index 00000000..2552ea0c --- /dev/null +++ b/geochemistrypi/data_mining/model/func/algo_classification/_enum.py @@ -0,0 +1,14 @@ +from enum import Enum + + +class ClassificationCommonFunction(Enum): + MODEL_SCORE = "Model Score" + CONFUSION_MATRIX = "Confusion Matrix" + CROSS_VALIDATION = "Cross Validation" + MODEL_PREDICTION = "Model Prediction" + MODEL_PERSISTENCE = "Model Persistence" + PRECISION_RECALL_CURVE = "Precision-Recall Curve" + PRECISION_RECALL_THRESHOLD_DIAGRAM = "Precision-Recall vs. 
Threshold Diagram" + ROC_CURVE = "ROC Curve" + TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM = "Two-dimensional Decision Boundary Diagram" + PERMUTATION_IMPORTANCE_DIAGRAM = "Permutation Importance Diagram" diff --git a/geochemistrypi/data_mining/process/cluster.py b/geochemistrypi/data_mining/process/cluster.py index db87eb41..5ae50b57 100644 --- a/geochemistrypi/data_mining/process/cluster.py +++ b/geochemistrypi/data_mining/process/cluster.py @@ -69,6 +69,7 @@ def activate( # Use Scikit-learn style API to process input data self.clt_workflow.fit(X) + # TODO: Move this into common_components() self.clt_workflow.get_cluster_centers() self.clt_workflow.get_labels() From 65d5ad2884a6ff9ead9b2c6a8c8748f7937014a5 Mon Sep 17 00:00:00 2001 From: sanyhe Date: Fri, 5 Jul 2024 11:17:52 +0800 Subject: [PATCH 03/12] build: v0.6.1 --- README.md | 4 ++-- geochemistrypi/_version.py | 2 +- geochemistrypi/data_mining/utils/base.py | 2 +- pyproject.toml | 18 ++++++++++++++---- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 227fedbe..b236c6d7 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ pip install geochemistrypi Download the latest version to avoid some old version issues, such as dependency downloading. ``` -pip install "geochemistrypi==0.6.0" +pip install "geochemistrypi==0.6.1" ``` One instruction to download on **Jupyter Notebook** or **Google Colab**. @@ -81,7 +81,7 @@ One instruction to download on **Jupyter Notebook** or **Google Colab**. ``` Download the latest version to avoid some old version issues, such as dependency downloading. ``` -!pip install "geochemistrypi==0.6.0" +!pip install "geochemistrypi==0.6.1" ``` Check the downloaded version of our software: diff --git a/geochemistrypi/_version.py b/geochemistrypi/_version.py index 906d362f..43c4ab00 100644 --- a/geochemistrypi/_version.py +++ b/geochemistrypi/_version.py @@ -1 +1 @@ -__version__ = "0.6.0" +__version__ = "0.6.1" diff --git a/geochemistrypi/data_mining/utils/base.py b/geochemistrypi/data_mining/utils/base.py index 31a01124..7f9f0f69 100644 --- a/geochemistrypi/data_mining/utils/base.py +++ b/geochemistrypi/data_mining/utils/base.py @@ -140,7 +140,7 @@ def install_package(package_name: str) -> None: """ import subprocess - subprocess.check_call(["python", "-m", "pip", "install", package_name]) + subprocess.check_call(["python", "-m", "pip", "install", "--quiet", package_name]) def clear_output() -> None: diff --git a/pyproject.toml b/pyproject.toml index c896ad0f..95e9cb2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,12 +4,22 @@ build-backend = "hatchling.build" [project] name = "geochemistrypi" -version = "0.6.0" +version = "0.6.1" authors = [ - { name="Can He", email="sanyhew1097618435@163.com" }, + { name="Can He", email="sanyhew1097618435@163.com"}, + { name="Jianhao Sun", email="sjh20171502@gmail.com"}, + { name="Jianming Zhao", email="zhaojianming@zju.edu.cn"}, + { name="Yang Lyu", email="lyuyang1007@zju.edu.cn"} ] -maintainers = [] -description = "A highly automated machine learning Python framework for data-driven geochemistry discovery" +maintainers = [ + { name="Can He", email="sanyhew1097618435@163.com"}, + { name="Jianhao Sun", email="sjh20171502@gmail.com"}, + { name="Jianming Zhao", email="zhaojianming@zju.edu.cn"}, + { name="Yongkang Shan", email="kk1361207571@163.com"}, + { name="Mengqi Gao", email="2534671415@qq.com"} +] +description = "A highly automated machine learning Python framework dedicating to build up MLOps level 1 software product for 
data-driven geochemistry discovery" +keywords = ["Geochemistry π", "Automated", "Machine Learning", "MLOps", "Geochemistry Discovery", "Continuous Training", "Machine Learning Lifecycle Management", "Model Inference", "Data Mining"] readme = "README.md" license = { file="LICENSE" } requires-python = "~=3.9" From 6e7e89d58b6310608e0069c710a5d9bdc21253e3 Mon Sep 17 00:00:00 2001 From: sanyhe Date: Fri, 5 Jul 2024 11:33:27 +0800 Subject: [PATCH 04/12] docs: v0.6.1 new features summary. --- docs/source/Home/CHANGELOG.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/source/Home/CHANGELOG.md b/docs/source/Home/CHANGELOG.md index 314ce913..e225674b 100644 --- a/docs/source/Home/CHANGELOG.md +++ b/docs/source/Home/CHANGELOG.md @@ -10,6 +10,22 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), + MLOps core of continuous training in web interface + More new algorithms and new processing techniques + +## [0.6.1] - 2024-07-05 + +### Added + ++ Precision-recall curve + +### Changed + ++ Silence of dependency downloading when first launching + +### Fixed + ++ Precision-recall vs. threshold diagram + + ## [0.6.0] - 2024-06-02 ### Added @@ -163,6 +179,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), [ unreleased ]: https://github.com/ZJUEarthData/geochemistrypi +[ 0.6.1 ]: https://github.com/ZJUEarthData/geochemistrypi/compare/v0.6.0...v0.6.1 [ 0.6.0 ]: https://github.com/ZJUEarthData/geochemistrypi/compare/v0.5.0...v0.6.0 [ 0.5.0 ]: https://github.com/ZJUEarthData/geochemistrypi/compare/v0.4.0...v0.5.0 [ 0.4.0 ]: https://github.com/ZJUEarthData/geochemistrypi/compare/v0.3.0...v0.4.0 From 67df2f5da3fdfb11235ce2a418697a1e8eb188b6 Mon Sep 17 00:00:00 2001 From: sanyhe Date: Sun, 7 Jul 2024 15:52:31 +0800 Subject: [PATCH 05/12] docs: rewrite the tutorial of framework. --- README.md | 2 + .../Add New Model To Framework.md | 955 +++++++++++------- geochemistrypi/data_mining/model/_base.py | 2 +- .../data_mining/model/clustering.py | 25 +- .../model/func/algo_clustering/_enum.py | 12 + .../data_mining/model/regression.py | 4 +- .../data_mining/process/classify.py | 2 + geochemistrypi/data_mining/process/detect.py | 1 - 8 files changed, 613 insertions(+), 390 deletions(-) create mode 100644 geochemistrypi/data_mining/model/func/algo_clustering/_enum.py diff --git a/README.md b/README.md index b236c6d7..a8c64248 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,8 @@ Geochemistry π was selected for featuring as an Editor’s Highlight in EOS Eos Website: https://eos.org/editor-highlights/machine-learning-for-geochemists-who-dont-want-to-code. +![Geochemistry pi news](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/bdd33a31-824a-492e-adcf-e660da4eaf1d) + ## Quick Installation Our software is well tested on **macOS** and **Windows** system with **Python 3.9**. Other systems and Python version are not guranteed. diff --git a/docs/source/For Developer/Add New Model To Framework.md b/docs/source/For Developer/Add New Model To Framework.md index 3b3eb0c2..d2276ea9 100644 --- a/docs/source/For Developer/Add New Model To Framework.md +++ b/docs/source/For Developer/Add New Model To Framework.md @@ -1,276 +1,230 @@ # Add New Model To Framework +## Table of Contents +- [1. Framework - Design Pattern and Hierarchical Pipeline Architecture](#1-design-pattern) +- [2. Understand Machine Learning Algorithm](#2-understand-ml) +- [3. 
Construct Model Workflow Class](#3-construct-model)
  - [3.1 Add Basic Elements](#3-1-add-basic-element)
    - [3.1.1 Find File](#3-1-1-find-file)
    - [3.1.2 Define Class Attributes and Constructor](#3-1-2-define-class-attributes-and-constructors)
  - [3.2 Add Manual Hyperparameter Tuning Functionality](#3-2-add-manual)
    - [3.2.1 Define manual_hyper_parameters Method](#3-2-1-define-manual-method)
    - [3.2.2 Create _algorithm.py File](#3-2-2-create-file)
  - [3.3 Add Automated Hyperparameter Tuning (AutoML) Functionality](#3-3-add-automl)
    - [3.3.1 Add AutoML Code to Model Workflow Class](#3-3-1-add-automl-code)
  - [3.4 Add Application Function to Model Workflow Class](#3-4-add-application-function)
    - [3.4.1 Add Common Application Functions and common_components Method](#3-4-1-add-common-function)
    - [3.4.2 Add Special Application Functions and special_components Method](#3-4-2-add-special-function)
    - [3.4.3 Add @dispatch() to Component Method](#3-4-3-add-dispatch)
  - [3.5 Storage Mechanism](#3-5-storage-mechanism)
- [4. Instantiate Model Workflow Class](#4-instantiate-model-workflow-class)
  - [4.1 Find File](#4-1-find-file)
  - [4.2 Import Module](#4-2-import-module)
  - [4.3 Define activate Method](#4-3-define-activate-method)
  - [4.4 Create Model Workflow Object](#4-4-create-model-workflow-object)
  - [4.5 Invoke Other Methods in Scikit-learn API Style](#4-5-invoke-other-methods)
  - [4.6 Add model_name to MODE_MODELS or NON_AUTOML_MODELS](#4-6-add-model-name)
- [5. Test Model Workflow Class](#5-test-model)
- [6. Completed Pull Request](#6-completed-pull-request)
- [7. Precautions](#7-precautions)

## 1. Framework - Design Pattern and Hierarchical Pipeline Architecture

Geochemistry π adopts the "Abstract Factory" software design pattern, which serves as the foundation upon which our advanced automated ML capabilities are built.

![Design Pattern](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/aa84ab12-c95e-4282-a60e-64ba2858c437)
![Workflow Object](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/f08885bf-1bec-4045-bf6b-82c5c18d3f8f)

The framework is a four-layer hierarchical pipeline architecture that promotes the creation of workflow objects through a set of model selection interfaces. The critical layers of this architecture are as follows:

1. Layer 1: the realization of ML model-associated functionalities with specific dependencies or libraries.
2. Layer 2: the abstract components of the ML model workflow classes, covering regression, classification, clustering, and decomposition.
3. Layer 3: the scikit-learn API-style model selection interface that implements the creation of ML model workflow objects.
4. Layer 4: the customized automated ML pipeline operated at the command line or through a web interface, covering the complete data-mining process.

+ Hierarchical Architecture +

- - [2.3 Get the hyperparameter value through interactive methods](#23-get-the-hyperparameter-value-through-interactive-methods) - - [2.3.1 Find file](#231-find-file) - - [2.3.2 Create the .py file and add content](#232-create-the-py-file-and-add-content) - - [2.3.3 Import in the file that defines the model class](#233-import-in-the-file-that-defines-the-model-class) +This pattern-driven architecture offers developers a standardized and intuitive way to create a ML model workflow class in Layer 2 by using a unified and consistent approach to object creation in Layer 3. Furthermore, it ensures the interchangeability of different model applications, allowing for seamless transitions between methodologies in Layer 1. - - [2.4 Call Model](#24-call-model) - - [2.4.1 Find file](#241-find-file) - - [2.4.2 Import module](#242-import-module) - - [2.4.3 Call model](#243-call-model) +image - - [2.5 Add the algorithm list and set NON\_AUTOML\_MODELS](#25-add-the-algorithm-list-and-set-non_automl_models) - - [2.5.1 Find file](#251-find-file) +The code of each layer lies as shown above. - - [2.6 Add Functionality](#26-add-functionality) +**Notice**: in our framework, a **model workflow class** refers to an **algorithm workflow class** and a **mode** includes multiple model workflow classes. - - [2.6.1 Model Research](#261-model-research) +Now, we will take KMeans algorithm as an example to illustrate the connection between each layer. Don't get too hung up on ths part. Once you finish reading the whole article, you can come back to here again. - - [2.6.2 Add Common_component](#262-add-common_component) +After reading this article, you are recommended to refer to this publication also for more details on the whole scope of our framework: - - [2.6.3 Add Special_component](#263-add-special_component) +https://agupubs.onlinelibrary.wiley.com/doi/10.1029/2023GC011324 -- [3. Test model](#3-test-model) -- [4. Completed Pull Request](#4-completed-pull-request) +## 2. Understand Machine Learning Algorithm +You need to understand the general meaning of the machine learning algorithm you are responsible for. Then you encapsultate it as an algorithm workflow in our framework and put it under the directory `geochemistrypi/data_mining/model`. Then you need to determine which **mode** this algorithm belongs to and the role of each parameter. For example, linear regression algorithm belongs to regression mode in our framework. -- [5. Precautions](#5-precautions) ++ When learning the ML algorithm, you can refer to the relevant knowledge on the [scikit-learn official website](https://scikit-learn.org/stable/index.html). -## 1. Understand the model -You need to understand the general meaning of the model, determine which algorithm the model belongs to and the role of each parameter. -+ You can choose to learn about the relevant knowledge on the [scikit-learn official website](https://scikit-learn.org/stable/index.html). +## 3. Construct Model Workflow Class +**Noted**: You can reference any existing model workflow classes in our framework to implement your own model workflow class. -## 2. Add Model -### 2.1 Add The Model Class -#### 2.1.1 Find Add File -First, you need to define the model class that you need to complete in the corresponding algorithm file. The corresponding algorithm file is in the `model` folder in the `data_mining` folder in the `geochemistrypi` folder. +### 3.1 Add Basic Elements + +#### 3.1.1 Find File +First, you need to construct the algorithm workflow class in the corresponding model file. 
The corresponding model file is located under the path `geochemistrypi/data_mining/model`.

![image1](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/3c7d4e53-1a99-4e7e-87b6-fdcb94a9e510)

**E.g.,** if you want to add a model for the regression mode, you need to add it in the `regression.py` file.

#### 3.1.2 Define Class Attributes and Constructor

(1) Define the algorithm workflow class and its base class

```
class ModelWorkflowClassName(BaseModelWorkflowClassName):
```
+ You can refer to the class names of other models; the format (upper camel case, with the mode name as the suffix) needs to be consistent. E.g., `XGBoostRegression`.
+ The base class to inherit from is determined by the mode the model belongs to.

```
"""The automation workflow of using "ModelWorkflowClassName" algorithm to make insightful products."""
```
+ Class docstring; you can refer to other classes. The template is shown above.

(2) Define the class attribute `name`

```
name = "algorithm terminology"
```
+ The class attribute `name` is different from ModelWorkflowClassName. E.g., the name `XGBoost` in the `XGBoostRegression` model workflow class.
+ This name needs to be added to the corresponding constant variable in the `geochemistrypi/data_mining/constants.py` file and the corresponding mode processing file under the `geochemistrypi/data_mining/process` folder. Note that these name values must be identical. It will be further explained in a later section.
+ For example, the name value `XGBoost` should be included in the constant variable `REGRESSION_MODELS` in the `geochemistrypi/data_mining/constants.py` file, and it will be used in `geochemistrypi/data_mining/process/regress.py`.

(3) Define the class attribute `common_function` or `special_function`

If this model workflow class is a base class, you need to define the class attribute `common_function`. For example, the class attribute `common_function` in the base workflow class `RegressionWorkflowBase`.

The values of `common_function` are the descriptions of the functionalities shared by the models belonging to the same mode. It means the child classes (all regression models) can share the same common functionalities as well.

```
common_function = []
```

If this model workflow class is a specific model workflow class, you need to define the class attribute `special_function`. For example, the class attribute `special_function` in the model workflow class `XGBoostRegression`.

The values of `special_function` are the descriptions of the functionalities owned by that specific model. Those special functions cannot be reused by other models.
+ ``` special_function = [] ``` -+ special_function is added according to the specific situation of the model, you can refer to other similar models. -(3) Define constructor +More detail will be explained in the later section. + +(4) Define the signature of the constructor ``` def __init__( self, - parameter:type=Default parameter value, + parameter: type = Default parameter value, ) -> None: ``` -+ All parameters in the corresponding model function need to be written out. -+ Default parameter value needs to be set according to official documents. ++ The parameters in the constructor is from the algorithm library you depend on. For example, you use **Lasso** algorithm from Sckit-learn library. You can reference its introduction ([Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)) in Scikit-learn website. ++ Default parameter value needs to be set according to scikit-learn official documents also. + +![image2](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/0f02b7bb-bef1-4b56-9c84-6162e86e2093) + ``` - """ +""" Parameters ---------- -parameter:type,default = Dedault +parameter: type,default = Dedault References ---------- Scikit-learn API: sklearn.model.name https://scikit-learn.org/...... ``` -+ Parameters is in the source of the corresponding model on the official website of sklearn - -**eg:** Take the [Lasso algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) as a column. - -![image2](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/0f02b7bb-bef1-4b56-9c84-6162e86e2093) ++ Parameters docstring are in the source code of the corresponding algorithm on the official website of sklearn. ![image3](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/0926a7e5-7243-4f4b-a3bb-bc4393b9633d) -+ References is your model's official website. -(4) The constructor of Base class is called +(5) The constructor of Base class is called ``` super().__init__() ``` -(5) Initializes the instance's state by assigning the parameter values passed to the constructor to the instance's properties. + +(6) Initializes the instance's state by assigning the parameter values passed to the constructor to the instance's attributes. ``` self.parameter=parameter ``` -(6) Create the model and assign + +(7) Instantiate the algorithm class you depend on and assign. For example, `Lasso` from the library `sklearn.linear_model`. ``` -self.model=modelname( +self.model = modelname( parameter=self.parameter ) ``` -**Note:** Don't forget to import Model from scikit-learn +**Note:** Don't forget to import the class from scikit-learn library ![image4](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/38e64144-fa19-4ef2-83d1-709504ba8001) -(7) Define other class properties +(8) Define the instance attribute `naming` ``` -self.properties=... -``` - -#### 2.1.3 Define manual_hyper_parameters -manual_hyper_parameters gets the hyperparameter value by calling the manual hyperparameter function, and returns hyper_parameters. +self.naming = Class.name ``` -hyper_parameters = name_manual_hyper_parameters() -``` -+ This function calls the corresponding function in the `func` folder (needs to be written, see 2.2.2) to get the hyperparameter value. - -+ This function is called in the corresponding file of the `Process` folder (need to be written, see 2.3). -+ Can be written with reference to similar classes - +This one will be use to print the name of the class and to activate the AutoML functionality. 
E.g., `self.naming = LassoRegression.name`. Further explanation is in section 3.3.

(9) Define the instance attributes `customized` and `customized_name`
```
self.customized = True
self.customized_name = "Algorithm Name"
```

These will be used to leverage the customization of the AutoML functionality. E.g., `self.customized_name = "Lasso"`. Further explanation is in section 3.3.1.

(10) Define other instance attributes
```
self.attributes = ...
```

### 3.2 Add Manual Hyperparameter Tuning Functionality

Our framework allows the user to set the algorithm hyperparameters manually or automatically. In this part, we implement the manual functionality.

Sometimes the users want to input the hyperparameter values for model training manually, so you need to establish an interactive way to get the user's input.

#### 3.2.1 Define manual_hyper_parameters Method

The manual operation is controlled by the **manual_hyper_parameters** method. Inside this method, we encapsulate a lower-level application function called algorithm_manual_hyper_parameters().

```
@classmethod
def manual_hyper_parameters(cls) -> Dict:
    """Manual hyper-parameters specification."""
    print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
    hyper_parameters = algorithm_manual_hyper_parameters()
    clear_output()
    return hyper_parameters
```

+ The **manual_hyper_parameters** method is called in the corresponding mode operation file under the `geochemistrypi/data_mining/process` folder.
+ This lower-level application function is located in the `geochemistrypi/data_mining/model/func/specific_mode` folder, which limits the hyperparameters the user can set manually. E.g., if the model workflow class `LassoRegression` belongs to the regression mode, you need to add the `_lasso_regression.py` file under the folder `geochemistrypi/data_mining/model/func/algo_regression`. Here, `_lasso_regression.py` contains all encapsulated application functions specific to the Lasso algorithm.

![image5](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/9d7b44d0-fd85-4a6a-a2a8-3f531475f3f6)

#### 3.2.2 Create `_algorithm.py` File

(1) Create a _algorithm.py file

**Note:** Keep the name format consistent.
(2) Import module
```
from typing import Dict
```
```
from ....constants import SECTION
```
```
from ....data.data_readiness import bool_input, float_input, num_input
```
+ You need to choose the appropriate common utility functions according to the input type of each hyperparameter.

(3) Define the application function
```
def algorithm_manual_hyper_parameters() -> Dict:
```

(4) Interactive format
```
print("Hyperparameters: Explanation")
print("A good starting value ...")
Hyperparameters = type_input(Default Value, SECTION[2], "@Hyperparameters: ")
```
**Note:** You can query ChatGPT for the recommended good starting value. The default value can come from the one in the imported library. For example, check the default value of the specific parameter for the `Lasso` algorithm on the [Scikit-learn website](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html).

(5) Integrate all hyperparameters into a dictionary and return it.
```
hyper_parameters = {
    "Hyperparameters1": Hyperparameters1,
    "Hyperparameters2": Hyperparameters2,
}
return hyper_parameters
```

#### 3.2.3 Import in The Model Workflow Class File
```
from .func.algo_mode._algorithm import algorithm_manual_hyper_parameters
```

![image6](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/27e74d2c-8539-41e6-bca9-f0dd50d4ed74)

### 3.3 Add Automated Hyperparameter Tuning (AutoML) Functionality

#### 3.3.1 Add AutoML Code to Model Workflow Class

Currently, only the supervised learning modes (regression and classification) support AutoML. Hence, only the algorithms belonging to these two modes need to implement the AutoML functionality.

Our framework leverages FLAML + Ray to build the AutoML functionality. FLAML has already encapsulated some algorithms, so it is easy to operate with those built-in algorithms. However, algorithms without such encapsulation need our own customization.
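As background, here is a minimal, self-contained sketch of plain FLAML usage with one of its built-in estimators (the "encapsulated" case). It is illustrative only, not our framework's code; the dataset and time budget are arbitrary:

```python
# Minimal FLAML sketch (illustrative background, not framework code).
# "xgboost" is one of FLAML's built-in estimator names.
from flaml import AutoML
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=5, random_state=42)

automl = AutoML()
automl.fit(X, y, task="regression", estimator_list=["xgboost"], metric="r2", time_budget=10)
print(automl.best_estimator, automl.best_config)
```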
-![image8](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/36fabb07-10b0-419a-b31d-31c036493b7b) +There are three cases in total: ++ C1: Encapsulated -> FLAML (Good example: `XGBoostRegression` in `regression.py`) ++ C2: Unencapsulated -> FLAML (Good example: `SVMRegression` in `regression.py`) ++ C3: Unencapsulated -> FLAML + RAY (Good example: `MLPRegression` in `regression.py`) -#### 2.4.3 Call model -There are two activate methods defined in the Regression and Classification algorithms, the first method uses the Scikit-learn framework, and the second method uses the FLAML and RAY frameworks. Decomposition and Clustering algorithms only use the Scikit-learn framework. Therefore, in the call, Regression and Classification need to add related codes to implement the call in both methods, and only one time is needed in Clustering and Decomposition. +Here, we only talk about 2 cases, C1 and C2. C3 is a special case and it is only implemented in MLP algorithm. -(1) Call model in the first activate method(Including Classification, Regression,Decomposition,Clustering) -``` -elif self.model_name == "name": - hyper_parameters = NAME.manual_hyper_parameters() - self.dcp_workflow = NAME( - Hyperparameters1=hyper_parameters["Hyperparameters2"], - Hyperparameters1=hyper_parameters["Hyperparameters2"], - ... - ) -``` -+ The name needs to be the same as the name in 2.4 -+ The hyperparameters in NAME() are the hyperparameters obtained interactively in 2.2 -**eg:** -![image9](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/d4d3c208-e7a5-4e5c-a403-1fa6646bf7a7) +Noted: -(2)Call model in the second activate method(Including Classification, Regression) ++ The calling method **fit** is defined in the base class, hence, no need to define it again in the specific model workflow class. You can refrence the **fit** method of `RegressionWorkflowBase` in `regression.py` + +The following two steps is needed to implement AutoML functionality in the model workflow class. But for C1 it only requires the first step while C2 needs two step both. + +(1) Create `settings` method ``` -elif self.model_name == "name": - self.reg_workflow = NAME() +@property +def settings(self) -> Dict: + """The configuration of your model to implement AutoML by FLAML framework.""" + configuration = { + "time_budget": '...' + "metric": '...', + "estimator_list": '...' + "task": '...' + } + return configuration ``` -+ The name needs to be the same as the name in 2.4 -**eg:** -![image10](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/0eae64d1-8e50-4a02-bf08-c9fc543130d0) ++ "time_budget" represents total running time in seconds ++ "metric" represents Running metric ++ "estimator_list" represents list of ML learners ++ "task" represents task type -### 2.5 Add the algorithm list and set NON_AUTOML_MODELS +For C1, the value of "estimator_list" should come from the specified name in [FLAML library](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML). For example, the specified name `xgboost` in the model workflow class `XGBoostRegression`. Also we need to put this specified value inside a list. -#### 2.5.1 Find file -Find the constants file to add the model name,The constants file is in the `data_mining` folder in the `geochemistrypi` folder. +image -![image11](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/84544ad9-44aa-4fb4-b0f1-668f4c3da65f) +For C2, the value of "estimator_list" should be the instance attribute `self.customized_name`. 
For example, `self.customized_name = "SVR"` in the model workflow class `SVMRegression`. Also, we need to put this specified value inside a list.

**Note:** You can keep the other key-value pairs consistent with other existing model workflow classes.

(2) Create `customization` method
You can add the parameter tuning code according to the following template:
```
@property
def customization(self) -> object:
    """The customized 'Your model' of FLAML framework."""
    from flaml import tune
    from flaml.data import 'TYPE'
    from flaml.model import SKLearnEstimator
    from 'sklearn' import 'model_name'

    class 'Model_Name'(SKLearnEstimator):
        def __init__(self, task=type, n_jobs=None, **config):
            super().__init__(task, **config)
            if task in 'TYPE':
                self.estimator_class = 'model_name'

        @classmethod
        def search_space(cls, data_size, task):
            space = {
                "'parameters1'": {"domain": tune.uniform(lower='...', upper='...'), "init_value": '...'},
                "'parameters2'": {"domain": tune.choice([True, False])},
                "'parameters3'": {"domain": tune.randint(lower='...', upper='...'), "init_value": '...'},
            }
            return space

    return "Model_Name"
```
**Note1:** The content in ' ' needs to be modified according to your specific code. You can reference the one in the model workflow class `SVMRegression`.
**Note2:**
```
space = {
    "'parameters1'": {"domain": tune.uniform(lower='...', upper='...'), "init_value": '...'},
    "'parameters2'": {"domain": tune.choice([True, False])},
    "'parameters3'": {"domain": tune.randint(lower='...', upper='...'), "init_value": '...'},
}
```
+ tune.uniform represents a float
+ tune.choice represents a bool
+ tune.randint represents an int
+ lower represents the minimum value of the range, upper represents the maximum value of the range, and init_value represents the initial value
**Note:** You need to select the parameters based on the actual situation of the model.

### 3.4 Add Application Function to Model Workflow Class

We treat insightful outputs (indexes, scores) or diagrams that help analyze and understand the algorithm as useful applications. For example, the XGBoost algorithm can produce feature importance scores; hence, drawing a feature importance diagram is an **application function** we can add to the model workflow class `XGBoostRegression`.

Conduct research on the corresponding model and look for useful application functions that could be added; a quick inspection trick is sketched after the list below.

+ You can confirm the functions that need to be added on the official website of the model (such as scikit-learn), search engines (such as Google), ChatGPT, etc.
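Besides documentation and search engines, one quick way to spot candidate application functions is to inspect the attributes a fitted estimator exposes. Below is a minimal, hypothetical sketch (arbitrary estimator and synthetic data, not framework code); in scikit-learn, learned quantities end with a trailing underscore, and those are natural candidates (e.g., `feature_importances_`):

```python
# Hypothetical sketch (not framework code): list the fitted attributes of an
# arbitrary scikit-learn estimator to spot candidate application outputs.
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=100, n_features=5, random_state=42)
model = RandomForestRegressor(random_state=42).fit(X, y)

# Learned attributes follow scikit-learn's trailing-underscore convention.
print([attr for attr in dir(model) if attr.endswith("_") and not attr.startswith("_")])
```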
In our framework, we define two types of application functions: **common application functions** and **special application functions**.

A common application function can be shared among the model workflow classes which belong to the same mode. It is placed inside the base model workflow class. For example, `classification_report` is a common application function placed inside the base class `ClassificationWorkflowBase`. Notice that it is encapsulated in the **private** instance method `_classification_report`.

Likewise, a special application function is a functionality owned by the algorithm itself; hence, it is placed inside a specific model workflow class. For example, for the KMeans algorithm, we can get the inertia scores from it. Hence, inside the model workflow class `KMeansClustering`, we have a **private** instance method `_get_inertia_scores`.

Now, the next question is how to invoke these application functions in our framework.

In fact, we put the invocation of the application functions in the component methods. Accordingly, we have two types of components:

(1) `common_components` is a public method in the base class, and all common application functions are invoked inside it.

(2) `special_components` is unique to the algorithm, so it needs to be added in a specific model workflow class. All special application functions related to this algorithm are invoked inside it.

![Image1](https://github.com/ZJUEarthData/geochemistrypi/assets/113361635/3f983a7a-3b0d-4c7b-b7b7-31b317f4d9d0)

For more details, you can refer to the brief illustration of the framework in section 1.

#### 3.4.1 Add Common Application Functions and `common_components` Method

`common_components` invokes the common application functions used by all its child model workflow classes, so it is necessary to consider the situation of each child model workflow class when adding an application function to it. The safer way is to put the application function inside a specific child model workflow class first if you are not sure it can be classified as a common application function.

**1. Add common application function to the base class**

Once you've identified the functionality you want to add, you can define the corresponding function in the base class.

The steps to implement are:

(1) Define the private function name and add the required parameters.
(2) Use annotations to decorate the function.
(3) Add the docstring to explain the use of this functionality.
(4) Reference specific libraries (e.g., Scikit-learn) to implement the functionality.
(5) Change the format of data acquisition and save the produced data or images, etc.

**2. Encapsulate the concrete code in Layer 1**

Please refer to our framework's definition of **Layer 1** in section 1.

Some functions may require a large amount of code due to their complexity. To ensure the style and readability of the codebase, you need to put the specific function implementation into the corresponding `geochemistrypi/data_mining/model/func/mode/_common` files and call it.

The steps to implement are:

(1) Define the public function name, add the required parameters and the proper decorator.
(2) Add the docstring to explain the use of this functionality, the significance of each parameter, and the related reference.
(3) Implement the functionality.
(4) Return the value used in **Layer 2**.

**3. Define `common_components` Method**

The steps to implement are:

(1) Define the path to store the data and images, etc.
(2) Invoke the common application functions one by one.

**4. Append The Name of Functionality in Class Attribute `common_function`**

The steps to implement are:

(1) Create a class attribute `common_function` list in the base class.
(2) Create an enum class to include the name of the functionality.
(3) Append the value of the enum class into the `common_function` list.

**Example**

The following is an example of adding a model evaluation score to the clustering base class.

First, you need to find the base class of clustering.

![Image2](https://github.com/ZJUEarthData/geochemistrypi/assets/113361635/b41a5af8-6cf3-4747-8c83-e613a3fee04b)

image

**1. Add `_score` function in base class `ClusteringWorkflowBase(WorkflowBase)`**

```python
@staticmethod
def _score(data: pd.DataFrame, labels: pd.DataFrame, func_name: str, algorithm_name: str, store_path: str) -> None:
    """Calculate the score of the model."""
    print(f"-----* {func_name} *-----")
    scores = score(data, labels)
    scores_str = json.dumps(scores, indent=4)
    save_text(scores_str, f"{func_name}- {algorithm_name}", store_path)
    mlflow.log_metrics(scores)
```

**2. Encapsulate the concrete code of `score` in Layer 1**

You need to add the specific function implementation `score` to the corresponding `geochemistrypi/data_mining/model/func/algo_clustering/_common` file.
+![Image5](https://github.com/ZJUEarthData/geochemistrypi/assets/113361635/ee6bb43e-f30e-47b6-8d78-13f017994a44)

```python
+def score(data: pd.DataFrame, labels: pd.DataFrame) -> Dict:
+    """Calculate the scores of the clustering model.

-
-@staticmethod
-def _score(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, store_path: str) -> None:
+    Parameters
+    ----------
+    data : pd.DataFrame (n_samples, n_components)
+        The true values.

-    """Calculate the score of the model."""
+    labels : pd.DataFrame (n_samples, n_components)
+        Labels of each point.
+
+    Returns
+    -------
+    scores : dict
+        The scores of the clustering model.
+    """
+    silhouette = silhouette_score(data, labels)
+    calinski_harabaz = calinski_harabasz_score(data, labels)
+    print("silhouette_score: ", silhouette)
+    print("calinski_harabasz_score:", calinski_harabaz)
+    scores = {
+        "silhouette_score": silhouette,
+        "calinski_harabasz_score": calinski_harabaz,
+    }
+    return scores
+```

-    print("-----* Model Score *-----")
+**3. Define `common_components` Method in class `ClusteringWorkflowBase(WorkflowBase)`**

-    scores = score(data, labels)
+```python
+def common_components(self) -> None:
+    """Invoke all common application functions for clustering algorithms."""
+    GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH")
+    GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+    self._score(
+        data=self.X,
+        labels=self.clustering_result["clustering result"],
+        func_name=ClusteringCommonFunction.MODEL_SCORE.value,
+        algorithm_name=self.naming,
+        store_path=GEOPI_OUTPUT_METRICS_PATH,
+    )
+```

-    scores_str = json.dumps(scores, indent=4)
+**4. Append the Name of the Functionality to the Class Attribute `common_function`**

-    save_text(scores_str, f"Model Score - {algorithm_name}", store_path)
+Create a class attribute `common_function` in `ClusteringWorkflowBase`.

-    mlflow.log_metrics(scores)
+```
+class ClusteringWorkflowBase(WorkflowBase):
+    """The base workflow class of clustering algorithms."""

+    common_function = [func.value for func in ClusteringCommonFunction]
```

+The enum class should be put in the corresponding path `geochemistrypi/data_mining/model/func/algo_clustering/_enum.py` (a sketch of this file is shown at the end of section 3.4.2).

-(1) Define the function name and add the required parameters.

-(2) Use annotations to describe function functionsUse annotations to describe function functions.
-(3) Referencing specific functions to implement functionality (Reference 3.2.3).
+#### 3.4.2 Add Special Application Functions and `special_components` Method

-(4) Change the format of data acquisition and save data or images.
+A special application function is a feature that is unique to each specific model. The whole process is similar to that of the previous section for common functionalities.

-***\*Note:\**** Make sure that the code style of the added function is consistent.
+The process is as follows:

-***\*2. Define common_components below the added function to define the output position and parameter source for the added function.\****
+1. Add the special application function with a proper decorator to the child model workflow class
+2. Encapsulate the concrete code in Layer 1
+3. Define the `special_components` method
+4. Append the name of the functionality to the class attribute `special_function`

-```python
+**Example**

-def common_components(self) -> None:
+Each algorithm has its own characteristics. Hence, they have different special functionalities as well. For example, for the KMeans algorithm, we can get the inertia score from it.
Hence, inside the model workflow class `KMeansClustering`, we have a **private** instance method `_get_inertia_scores`.

-    """Invoke all common application functions for clustering algorithms."""
+First, you need to find the child model workflow class for the KMeans algorithm.

-    GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH")
+![Image2](https://github.com/ZJUEarthData/geochemistrypi/assets/113361635/b41a5af8-6cf3-4747-8c83-e613a3fee04b)

-    GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")

-    self._score(
+**1. Add `_get_inertia_scores` function in child model workflow class `KMeansClustering(ClusteringWorkflowBase)`**

-        data=self.X,
+```python
+@staticmethod
+def _get_inertia_scores(func_name: str, algorithm_name: str, trained_model: object, store_path: str) -> None:
+    """Get the scores of the clustering result."""
+    print(f"-----* {func_name} *-----")
+    print(f"{func_name}: ", trained_model.inertia_)
+    inertia_scores = {f"{func_name}": trained_model.inertia_}
+    mlflow.log_metrics(inertia_scores)
+    inertia_scores_str = json.dumps(inertia_scores, indent=4)
+    save_text(inertia_scores_str, f"{func_name} - {algorithm_name}", store_path)
+```

-        labels=self.clustering_result["clustering result"],
+**2. Encapsulate the concrete code in Layer 1**

-        algorithm_name=self.naming,
+Getting the inertia score is only one line of code, hence there is no need to encapsulate it further.

-        store_path=GEOPI_OUTPUT_METRICS_PATH,
+**3. Define `special_components` Method in class `KMeansClustering(ClusteringWorkflowBase)`**
+```python
+def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
+    """Invoke all special application functions for this algorithms by Scikit-learn framework."""
+    GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH")
+    self._get_inertia_scores(
+        func_name=KMeansSpecialFunction.INERTIA_SCORE.value,
+        algorithm_name=self.naming,
+        trained_model=self.model,
+        store_path=GEOPI_OUTPUT_METRICS_PATH,
    )
 ```

-The positional relationship is shown in Figure 4.
+![Image7](https://github.com/ZJUEarthData/geochemistrypi/assets/113361635/18dec84b-44ae-4883-a5b8-db2c6e0ef5c8)

++ Also, if only part of the models share a functionality (for example, feature importance in tree-based algorithms including XGBoost, Decision Tree, etc.), you can create a Mixin class to include that application function and let the tree-based model workflow classes inherit it, such as `ExtraTreesRegression(TreeWorkflowMixin, RegressionWorkflowBase)`.

-***\*3. You need to add the specific function implementation to the corresponding `_commom` file.\****
+**4. Append the Name of the Functionality to the Class Attribute `special_function`**

-![Image5](https://github.com/ZJUEarthData/geochemistrypi/assets/113361635/ee6bb43e-f30e-47b6-8d78-13f017994a44)
+Create a class attribute `special_function` list in `KMeansClustering`.

-```python
+```
+class KMeansClustering(ClusteringWorkflowBase):
+    """The automation workflow of using KMeans algorithm to make insightful products."""

-def score(data: pd.DataFrame, labels: pd.DataFrame) -> Dict:
+    name = "KMeans"
+    special_function = [func.value for func in KMeansSpecialFunction]
+```

-    """Calculate the scores of the clustering model.
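+For reference, both enum classes used in this example live in a single `_enum.py` file. A sketch consistent with the codebase (the same definitions appear in the `_enum.py` patch further below in this series):

+```python
+from enum import Enum
+
+
+class ClusteringCommonFunction(Enum):
+    CLUSTER_CENTERS = "Cluster Centers"
+    CLUSTER_LABELS = "Cluster Labels"
+    MODEL_PERSISTENCE = "Model Persistence"
+    MODEL_SCORE = "Model Score"
+
+
+class KMeansSpecialFunction(Enum):
+    INERTIA_SCORE = "Inertia Score"
+```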
+The enum class should be put in the corresponding path `geochemistrypi/data_mining/model/func/algo_clustering/_enum.py`.

-    Parameters

-    ----------
-    data : pd.DataFrame (n_samples, n_components)
+#### 3.4.3 Add `@dispatch()` to Component Method

-        The true values.
+However, in **regression** mode and **classification** mode, there are two different scenarios (AutoML and manual ML) when defining either the `common_components` or the `special_components` method, because we need to differentiate AutoML from manual ML. For example, inside the base model workflow class `RegressionWorkflowBase`, there are two `common_components` methods but with different decorators. Also, in its child model workflow class `ExtraTreesRegression`, there are two `special_components` methods but with different decorators.

-    labels : pd.DataFrame (n_samples, n_components)
+Inside our framework, we leverage the idea of **method overloading**, which is not supported by Python natively, but we can achieve it through the library **multipledispatch** (a minimal sketch is shown at the end of this sub-section). The invocation of the `common_components` and `special_components` methods is located in Layer 3, which will be explained in a later section.

-        Labels of each point.
+The differences between AutoML and manual ML are as follows:

-    Returns
+**1. The decorator**

-    -------
++ For manual ML: add @dispatch() to decorate the component method.
++ For AutoML: add @dispatch(bool) to decorate the component method.

-    scores : dict
+**2. The signature of the component method**

-        The scores of the clustering model.
+For the `common_components` method:
+```
+Manual ML:
+@dispatch()
+def common_components(self) -> None:

-    """
+AutoML:
+@dispatch(bool)
+def common_components(self, is_automl: bool = False) -> None:
+```

-    silhouette = silhouette_score(data, labels)
+For the `special_components` method:
+```
+Manual ML:
+@dispatch()
+def special_components(self, **kwargs) -> None:

-    calinski_harabaz = calinski_harabasz_score(data, labels)
+AutoML:
+@dispatch(bool)
+def special_components(self, is_automl: bool = False, **kwargs) -> None:
+```

-    print("silhouette_score: ", silhouette)
+**3. The trained model instance variable**

-    print("calinski_harabasz_score:", calinski_harabaz)
+Usually, inside the component method, we will pass the trained model instance variable to the application function. For example, for `common_components` in `RegressionWorkflowBase(WorkflowBase)`, be careful about the value passed to the parameter `trained_model`.

-    scores = {
+```
+Manual ML:
+@dispatch()
+def common_components(self) -> None:
+    self._cross_validation(
+        trained_model=self.model,
+        X_train=RegressionWorkflowBase.X_train,
+        y_train=RegressionWorkflowBase.y_train,
+        cv_num=10,
+        algorithm_name=self.naming,
+        store_path=GEOPI_OUTPUT_METRICS_PATH,
+    )
+
+AutoML:
+@dispatch(bool)
+def common_components(self, is_automl: bool = False) -> None:
+    self._cross_validation(
+        trained_model=self.auto_model,
+        X_train=RegressionWorkflowBase.X_train,
+        y_train=RegressionWorkflowBase.y_train,
+        cv_num=10,
+        algorithm_name=self.naming,
+        store_path=GEOPI_OUTPUT_METRICS_PATH,
+    )
+```

-        "silhouette_score": silhouette,
+**Note:** The content of this part needs to be adapted to the actual situation of your own model. You can refer to similar classes.

-        "calinski_harabasz_score": calinski_harabaz,
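+To make the overloading mechanism concrete, here is a minimal, standalone sketch. The class name and printed strings are hypothetical; only the `@dispatch()` / `@dispatch(bool)` usage mirrors the codebase:

+```python
+from multipledispatch import dispatch
+
+
+class DemoWorkflow:
+    @dispatch()
+    def common_components(self) -> None:
+        # Manual ML branch: chosen when the method is called with no extra argument.
+        print("manual ML")
+
+    @dispatch(bool)
+    def common_components(self, is_automl: bool = False) -> None:
+        # AutoML branch: chosen when the method is called with a bool argument.
+        print("AutoML")
+
+
+workflow = DemoWorkflow()
+workflow.common_components()      # prints "manual ML"
+workflow.common_components(True)  # prints "AutoML"
+```

+The dispatch is driven by the runtime types of the arguments after `self`, which is why the manual and AutoML variants can coexist under one method name.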
+### 3.5 Storage Mechanism

-    }
+In Geochemistry π, the storage mechanism consists of two components: the **geopi_tracking** folder and the **geopi_output** folder. MLflow uses the geopi_tracking folder as the store for visualized operations in the web interface, which researchers cannot modify directly. The geopi_output folder is a regular folder aligning with MLflow's storage structure, which researchers can operate directly. Overall, this unique storage mechanism is purpose-built to track each experiment and its corresponding runs in order to create an organized and coherent record of researchers' scientific explorations.

-    return scores

+

+Storage Mechanism

-```
+In the codebase, we use Python's open() function to store data into the **geopi_output** folder, while MLflow's methods store data into the **geopi_tracking** folder.

-(1) Explain the significance of each parameter.
+The commonly used MLflow methods include:

-(2) Implement functionality.
++ mlflow.log_param(): Log a parameter (e.g. model hyperparameter) under the current run.
++ mlflow.log_params(): Log a batch of params for the current run.
++ mlflow.log_metric(): Log a metric under the current run.
++ mlflow.log_metrics(): Log multiple metrics for the current run.
++ mlflow.log_artifact(): Log a local file or directory as an artifact of the currently active run. In our software, we use it to store the images, data and text.

-(3) Returns the required parameters.
+You can refer to the MLflow API documentation for more details.

+Actually, we have encapsulated a bunch of saving functions in `geochemistrypi/data_mining/utils/base.py`, which can be used to store the data into the **geopi_output** folder and the **geopi_tracking** folder at the same time. These include the functions `save_fig`, `save_data`, `save_text` and `save_model`.

+Usually, when you want to use the saving functions, you only need to pass them the storage path and the data to store (see the sketch after the notes below).

-#### 2.6.3 Add Special_component
+For example, consider the case of adding a common application function into the base clustering model workflow class.

-Special_components is a feature that is unique to each specific model.
+![Image4](https://github.com/ZJUEarthData/geochemistrypi/assets/113361635/5e3eac82-19f8-4ef3-87a6-701ce6f9ac1b)

-The process of adding a Special_components is similar to that of a Common_component.
+```
+GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH")
+```
++ This line of code gets the metrics output path from the environment variable.
+```
+GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+```
++ This line of code gets the image model output path from the environment variable.
+```
+GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
+```
++ This line of code gets the general output artifacts path from the environment variable.
+**Note:** You need to add the corresponding path according to how it is used in the following functions. You can look up the pre-defined paths created inside the function `create_geopi_output_dir` in `geochemistrypi/data_mining/utils/base.py`.
+**Note:** You can refer to other similar model workflow classes to complete your implementation.
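+For instance, a minimal sketch of calling one saving function inside a workflow module (the DataFrame `df` below is hypothetical; the `save_data` signature follows its usage in the codebase):

+```python
+import os
+
+import pandas as pd
+
+from ..constants import MLFLOW_ARTIFACT_DATA_PATH
+from ..utils.base import save_data
+
+df = pd.DataFrame({"clustering result": [0, 1, 1, 0]})  # hypothetical result data
+GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
+# One call stores the data under geopi_output locally and logs it to geopi_tracking via MLflow.
+save_data(df, "Demo Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+```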
-The process is as follows:
-(1) Find the location that needs to be added.
+## 4. Instantiate Model Workflow Class

-(2) Defined function.
+### 4.1 Find File

-(3) Define Special_components and add a parametric function to it.
+Instantiating a model workflow class is the responsibility of Layer 3. Layer 3 is represented by the scikit-learn API-style model selection interface in the corresponding mode file under the `geochemistrypi/data_mining/process` folder.

-(4) Add the corresponding specific function implementation function to the `corresponding manual parameter tuning` file.
+![image7](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/36e8f6ee-ae21-4f86-b000-0a373ea63cca)

+**eg:** If your model workflow class belongs to regression mode, you need to implement the creation of ML model workflow objects in the `regress.py` file.

+### 4.2 Import Module

-***\*eg:\**** An example is to add a score evaluation function to k-means clustering.
+For example, for a model workflow class belonging to regression, you need to import your model inside the `regress.py` file by using `from ..model.regression import (...)`.

-***\*1. Find the location that needs to be added.\****
+```
+from ..model.regression import(
+    ...
+    ModelWorkflowClass,
+)
+```

-We add his own unique score to the k-means.
+![image8](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/36fabb07-10b0-419a-b31d-31c036493b7b)

-![Image2](https://github.com/ZJUEarthData/geochemistrypi/assets/113361635/b41a5af8-6cf3-4747-8c83-e613a3fee04b)
+### 4.3 Define `activate` Method

-![Image6](https://github.com/ZJUEarthData/geochemistrypi/assets/113361635/34f1b0f8-9809-4ba6-86d5-aa28a565abc9)
+The `activate` method defined in Layer 3 will be invoked in Layer 4.

-***\*2. Defined function.\****
+For supervised learning (regression and classification), the signature of the `activate` method is:
+```
+def activate(
+    self,
+    X: pd.DataFrame,
+    y: pd.DataFrame,
+    X_train: pd.DataFrame,
+    X_test: pd.DataFrame,
+    y_train: pd.DataFrame,
+    y_test: pd.DataFrame,
+) -> None:
+    """Train by Scikit-learn framework."""
+```

-```python
+For unsupervised learning (clustering, decomposition and abnormal detection), the signature of the `activate` method is:
+```
+def activate(
+    self,
+    X: pd.DataFrame,
+    y: Optional[pd.DataFrame] = None,
+    X_train: Optional[pd.DataFrame] = None,
+    X_test: Optional[pd.DataFrame] = None,
+    y_train: Optional[pd.DataFrame] = None,
+    y_test: Optional[pd.DataFrame] = None,
+) -> None:
+    """Train by Scikit-learn framework."""
+```

-def _get_inertia_scores(self, algorithm_name: str, store_path: str) -> None:
+The difference is that for unsupervised learning, there is no need to separate y or split the training-testing set. But for consistency, we keep the same signature.

-    """Get the scores of the clustering result."""
+In **regression** mode and **classification** mode, there are two different scenarios (AutoML and manual ML) when defining the `activate` method, because we need to differentiate AutoML from manual ML. Hence, we still use @dispatch to decorate it. For example, in the `RegressionModelSelection` class, we need to define two `activate` methods with different decorators.

-    print("-----* KMeans Inertia Scores *-----")
+```
+Manual ML:
+@dispatch(object, object, object, object, object, object)
+def activate(
+    self,
+    X: pd.DataFrame,
+    y: pd.DataFrame,
+    X_train: pd.DataFrame,
+    X_test: pd.DataFrame,
+    y_train: pd.DataFrame,
+    y_test: pd.DataFrame,
+) -> None:

-    print("Inertia Score: ", self.model.inertia_)
+AutoML:
+@dispatch(object, object, object, object, object, object, bool)
+def activate(
+    self,
+    X: pd.DataFrame,
+    y: pd.DataFrame,
+    X_train: pd.DataFrame,
+    X_test: pd.DataFrame,
+    y_train: pd.DataFrame,
+    y_test: pd.DataFrame,
+    is_automl: bool,
+) -> None:
+```

-    inertia_scores = {"Inertia Score": self.model.inertia_}
+The differences above include the signature of @dispatch and the signature of the `activate` method.

-    mlflow.log_metrics(inertia_scores)
+### 4.4 Create Model Workflow Object

-    inertia_scores_str = json.dumps(inertia_scores, indent=4)
+There are two `activate` methods defined in the Regression and Classification modes: the first method uses the Scikit-learn framework, and the second method uses the FLAML and Ray frameworks. Decomposition and Clustering algorithms only use the Scikit-learn framework. The instantiation of the model workflow class inside the `activate` method builds the connection between Layer 3 and Layer 2.
-    save_text(inertia_scores_str, f"KMeans Inertia Scores - {algorithm_name}", store_path)
+(1) The invocation of the model workflow class in the first `activate` method (used in classification, regression, decomposition, clustering, abnormal detection) needs to pass the hyperparameters for manual ML:
+```
+elif self.model_name == "ModelName":
+    hyper_parameters = ModelWorkflowClass.manual_hyper_parameters()
+    self.dcp_workflow = ModelWorkflowClass(
+        Hyperparameter1=hyper_parameters["Hyperparameter1"],
+        Hyperparameter2=hyper_parameters["Hyperparameter2"],
+        ...
+    )
+```
++ This "ModelName" needs to be added to the corresponding constant variable in the `geochemistrypi/data_mining/constants.py` file. It will be further explained in a later section.
+
+**eg:**
+![image9](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/d4d3c208-e7a5-4e5c-a403-1fa6646bf7a7)
+
+(2) The invocation of the model workflow class in the second `activate` method (used in classification, regression) for AutoML:
```
+elif self.model_name == "ModelName":
+    self.reg_workflow = ModelWorkflowClass()
+```
++ This "ModelName" needs to be added to the corresponding constant variable in the `geochemistrypi/data_mining/constants.py` file. It will be further explained in a later section.

-(1) Define the function name and add the required parameters.
+**eg:**
+![image10](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/0eae64d1-8e50-4a02-bf08-c9fc543130d0)

-(2) Use annotations to describe function functionsUse annotations to describe function functions.
+### 4.5 Invoke Other Methods in Scikit-learn API Style

-(3) Referencing specific functions to implement functionality.
+It should contain at least the functions below:

-(4) Change the format of data acquisition and save data or images.
++ data_upload(): Load the required data into the base class's attributes.
++ show_info(): Display what application functions the algorithm will provide.
++ fit(): Fit the model.
++ save_hyper_parameters(): Save the model hyper-parameters into the storage.
++ common_components(): Invoke all common application functions.
++ special_components(): Invoke all special application functions.
++ model_save(): Save the trained model.

-***\*3. Define Special_components and add a parametric function to it.\****
+You can refer to other existing modes inside `geochemistrypi/data_mining/process/mode.py` to see what else you need.

+### 4.6 Add `model_name` to `MODE_MODELS` or `NON_AUTOML_MODELS`

-```python
+Find the `constants.py` file under the `geochemistrypi/data_mining` folder and add the model name, which should be identical to that in `geochemistrypi/data_mining/process/mode.py` and in `geochemistrypi/data_mining/model/mode.py`.

-def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
+![image11](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/84544ad9-44aa-4fb4-b0f1-668f4c3da65f)

-    """Invoke all special application functions for this algorithms by Scikit-learn framework."""
+**(1) Add `model_name` to `MODE_MODELS`**

-    GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH")
+Append `model_name` to the `MODE_MODELS` list corresponding to the mode in the constants file.

-    self._get_inertia_scores(
+**eg:** Add the name of the Lasso regression algorithm to the `REGRESSION_MODELS` list.

-        algorithm_name=self.naming,
+![image12](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/ec647037-2467-4a86-b7bb-e009a48cb964)
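+For instance, after the Local Outlier Factor algorithm is added (see the `constants.py` patch further below in this series), the abnormal detection constant reads:

+```python
+ABNORMALDETECTION_MODELS = ["Isolation Forest", "Local Outlier Factor"]
+```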
-        store_path=GEOPI_OUTPUT_METRICS_PATH,
+**(2) Add `model_name` to `NON_AUTOML_MODELS`**

-    )
+This is only for those algorithms that belong to either regression or classification but don't need to provide AutoML functionality. For them, append `model_name` to the `NON_AUTOML_MODELS` list.

-```
+**eg:** Add the name of the Linear Regression algorithm to the `NON_AUTOML_MODELS` list.

-The positional relationship is shown in Figure 7.
+![image13](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/d6b03566-a833-4868-8738-be09d7356c9c)

-![Image7](https://github.com/ZJUEarthData/geochemistrypi/assets/113361635/18dec84b-44ae-4883-a5b8-db2c6e0ef5c8)

-***\*4. Add the corresponding specific function implementation function to the `corresponding manual parameter tuning` file.\****
+## 5. Test Model Workflow Class
+
+After the model workflow class is added, you can test it by running the command `python start_cli_pipeline.py` on the terminal. If the test reports an error, you need to debug and fix it. If there is no error, it can be submitted.

-If the defined function has complex functions, it is necessary to further improve its function content in the manual parameter file, and the code format should refer to Common_component.

-![Image](https://github.com/ZJUEarthData/geochemistrypi/assets/113361635/a3ea82c2-9c20-49f4-bf3e-354b012aff7c)
+## 6. Completed Pull Request

-## 3. Test model

-After the model is added, it can be tested. If the test reports an error, it needs to be checked. If there is no error, it can be submitted.
+After the test is correct, you can complete the pull request according to the instructions in [Geochemistry π - Completed Pull Request](https://geochemistrypi.readthedocs.io/en/latest/index.html)

-## 4. Completed Pull Request

-After the model test is correct, you can complete the pull request according to the puu document instructions in [Geochemistry π](https://geochemistrypi.readthedocs.io/en/latest/index.html)

![image](https://github.com/ZJUEarthData/geochemistrypi/assets/97781484/e95c2e44-21f7-44af-8e32-e857189a5204)

-## 5. Precautions
+
+## 7. Precautions

**Note1:** This tutorial only discusses the general process of adding a model, and the specific addition needs to be combined with the actual situation of the model to accurately add relevant codes.

**Note2:** If there are unclear situations and problems during the adding process, communicate with other people in time to solve them.
diff --git a/geochemistrypi/data_mining/model/_base.py b/geochemistrypi/data_mining/model/_base.py index c57d14c8..8e701a3f 100644 --- a/geochemistrypi/data_mining/model/_base.py +++ b/geochemistrypi/data_mining/model/_base.py @@ -30,7 +30,7 @@ class WorkflowBase(metaclass=ABCMeta): @classmethod def show_info(cls) -> None: - """Display how many functions the algorithm will provide.""" + """Display what application functions the algorithm will provide.""" print("*-*" * 2, cls.name, "is running ...", "*-*" * 2) print("Expected Functionality:") function = cls.common_function + cls.special_function diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py index 6dc456ae..5a8b34d0 100644 --- a/geochemistrypi/data_mining/model/clustering.py +++ b/geochemistrypi/data_mining/model/clustering.py @@ -17,13 +17,14 @@ from .func.algo_clustering._agglomerative import agglomerative_manual_hyper_parameters from .func.algo_clustering._common import plot_silhouette_diagram, plot_silhouette_value_diagram, scatter2d, scatter3d, score from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters +from .func.algo_clustering._enum import ClusteringCommonFunction, KMeansSpecialFunction from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters class ClusteringWorkflowBase(WorkflowBase): """The base workflow class of clustering algorithms.""" - common_function = ["Cluster Centers", "Cluster Labels", "Model Persistence"] + common_function = [func.value for func in ClusteringCommonFunction] def __init__(self): super().__init__() @@ -58,12 +59,12 @@ def get_labels(self): save_data(self.clustering_result, f"{self.naming} Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) @staticmethod - def _score(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, store_path: str) -> None: + def _score(data: pd.DataFrame, labels: pd.DataFrame, func_name: str, algorithm_name: str, store_path: str) -> None: """Calculate the score of the model.""" - print("-----* Model Score *-----") + print(f"-----* {func_name} *-----") scores = score(data, labels) scores_str = json.dumps(scores, indent=4) - save_text(scores_str, f"Model Score - {algorithm_name}", store_path) + save_text(scores_str, f"{func_name}- {algorithm_name}", store_path) mlflow.log_metrics(scores) @staticmethod @@ -112,6 +113,7 @@ def common_components(self) -> None: self._score( data=self.X, labels=self.clustering_result["clustering result"], + func_name=ClusteringCommonFunction.MODEL_SCORE.value, algorithm_name=self.naming, store_path=GEOPI_OUTPUT_METRICS_PATH, ) @@ -190,7 +192,7 @@ class KMeansClustering(ClusteringWorkflowBase): """The automation workflow of using KMeans algorithm to make insightful products.""" name = "KMeans" - special_function = ["KMeans Score"] + special_function = [func.value for func in KMeansSpecialFunction] def __init__( self, @@ -304,14 +306,15 @@ def __init__( self.naming = KMeansClustering.name - def _get_inertia_scores(self, algorithm_name: str, store_path: str) -> None: + @staticmethod + def _get_inertia_scores(func_name: str, algorithm_name: str, trained_model: object, store_path: str) -> None: """Get the scores of the clustering result.""" - print("-----* KMeans Inertia Scores *-----") - print("Inertia Score: ", self.model.inertia_) - inertia_scores = {"Inertia Score": self.model.inertia_} + print(f"-----* {func_name} *-----") + print(f"{func_name}: ", trained_model.inertia_) + inertia_scores = {f"{func_name}": trained_model.inertia_} 
mlflow.log_metrics(inertia_scores) inertia_scores_str = json.dumps(inertia_scores, indent=4) - save_text(inertia_scores_str, f"KMeans Inertia Scores - {algorithm_name}", store_path) + save_text(inertia_scores_str, f"{func_name} - {algorithm_name}", store_path) @classmethod def manual_hyper_parameters(cls) -> Dict: @@ -325,7 +328,9 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None: """Invoke all special application functions for this algorithms by Scikit-learn framework.""" GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH") self._get_inertia_scores( + func_name=KMeansSpecialFunction.INERTIA_SCORE.value, algorithm_name=self.naming, + trained_model=self.model, store_path=GEOPI_OUTPUT_METRICS_PATH, ) diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_enum.py b/geochemistrypi/data_mining/model/func/algo_clustering/_enum.py new file mode 100644 index 00000000..22293cac --- /dev/null +++ b/geochemistrypi/data_mining/model/func/algo_clustering/_enum.py @@ -0,0 +1,12 @@ +from enum import Enum + + +class ClusteringCommonFunction(Enum): + CLUSTER_CENTERS = "Cluster Centers" + CLUSTER_LABELS = "Cluster Labels" + MODEL_PERSISTENCE = "Model Persistence" + MODEL_SCORE = "Model Score" + + +class KMeansSpecialFunction(Enum): + INERTIA_SCORE = "Inertia Score" diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py index e78cda95..06a33c5b 100644 --- a/geochemistrypi/data_mining/model/regression.py +++ b/geochemistrypi/data_mining/model/regression.py @@ -200,7 +200,7 @@ def common_components(self) -> None: ) @dispatch(bool) - def common_components(self, is_automl: bool) -> None: + def common_components(self, is_automl: bool = False) -> None: """Invoke all common application functions for regression algorithms by FLAML framework.""" GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH") GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") @@ -904,7 +904,7 @@ def manual_hyper_parameters(cls) -> Dict: return hyper_parameters @dispatch() - def special_components(self): + def special_components(self, **kwargs): """Invoke all special application functions for this algorithms by Scikit-learn framework.""" GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( diff --git a/geochemistrypi/data_mining/process/classify.py b/geochemistrypi/data_mining/process/classify.py index 42768353..324cb591 100644 --- a/geochemistrypi/data_mining/process/classify.py +++ b/geochemistrypi/data_mining/process/classify.py @@ -41,6 +41,7 @@ def activate( ) -> None: """Train by Scikit-learn framework.""" + # Load the required data into the base class's attributes self.clf_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) # Customize label @@ -163,6 +164,7 @@ def activate( validation_fraction=hyper_parameters["validation_fraction"], n_iter_no_change=hyper_parameters["n_iter_no_change"], ) + # Display what application functions the algorithm will provide self.clf_workflow.show_info() # Use Scikit-learn style API to process input data diff --git a/geochemistrypi/data_mining/process/detect.py b/geochemistrypi/data_mining/process/detect.py index cf49df5d..94313758 100644 --- a/geochemistrypi/data_mining/process/detect.py +++ b/geochemistrypi/data_mining/process/detect.py @@ -16,7 +16,6 @@ def __init__(self, model_name: str) -> None: 
self.ad_workflow = AbnormalDetectionWorkflowBase() self.transformer_config = {} - # @dispatch(object, object, object, object, object, object) def activate( self, X: pd.DataFrame, From 6ec99a795a197ee9b1369efc843a52a314e54a5c Mon Sep 17 00:00:00 2001 From: Panyan Weng Date: Thu, 11 Jul 2024 14:19:17 +0800 Subject: [PATCH 06/12] feat: Add local outlier factor algorithm to abnormal detection work flow base --- geochemistrypi/data_mining/constants.py | 2 +- geochemistrypi/data_mining/model/detection.py | 162 +++++++++++++++++- .../func/algo_abnormaldetection/_iforest.py | 38 ++++ geochemistrypi/data_mining/process/detect.py | 12 +- 4 files changed, 211 insertions(+), 3 deletions(-) diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py index b3d8ccb7..1303f60e 100644 --- a/geochemistrypi/data_mining/constants.py +++ b/geochemistrypi/data_mining/constants.py @@ -68,7 +68,7 @@ ] CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation"] DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"] -ABNORMALDETECTION_MODELS = ["Isolation Forest"] +ABNORMALDETECTION_MODELS = ["Isolation Forest", "Local Outlier Factor"] # The model can deal with missing values # Reference: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values diff --git a/geochemistrypi/data_mining/model/detection.py b/geochemistrypi/data_mining/model/detection.py index cce5abcf..930d703a 100644 --- a/geochemistrypi/data_mining/model/detection.py +++ b/geochemistrypi/data_mining/model/detection.py @@ -6,10 +6,11 @@ import pandas as pd from rich import print from sklearn.ensemble import IsolationForest +from sklearn.neighbors import LocalOutlierFactor from ..utils.base import clear_output from ._base import WorkflowBase -from .func.algo_abnormaldetection._iforest import isolation_forest_manual_hyper_parameters +from .func.algo_abnormaldetection._iforest import isolation_forest_manual_hyper_parameters, local_outlier_factor_manual_hyper_parameters class AbnormalDetectionWorkflowBase(WorkflowBase): @@ -223,3 +224,162 @@ def manual_hyper_parameters(cls) -> Dict: def special_components(self, **kwargs) -> None: """Invoke all special application functions for this algorithms by Scikit-learn framework.""" pass + + +class LocalOutlierFactorAbnormalDetection(AbnormalDetectionWorkflowBase): + """The automation workflow of using Local Outlier Factor algorithm to make insightful products.""" + + name = "Local Outlier Factor" + # special_function = [] + + def __init__( + self, + n_neighbors: int = 20, + algorithm: str = "auto", + leaf_size: int = 30, + metric: Union[str, callable] = "minkowski", + p: float = 2.0, + metric_params: dict = None, + contamination: Union[str, float] = "auto", + novelty: bool = True, # Change this variable from False to True inorder to make this function work + n_jobs: int = None, + ) -> None: + """ + Unsupervised Outlier Detection using the Local Outlier Factor (LOF). + + The anomaly score of each sample is called the Local Outlier Factor. + It measures the local deviation of the density of a given sample with respect + to its neighbors. + It is local in that the anomaly score depends on how isolated the object + is with respect to the surrounding neighborhood. + More precisely, locality is given by k-nearest neighbors, whose distance + is used to estimate the local density. 
+ By comparing the local density of a sample to the local densities of its + neighbors, one can identify samples that have a substantially lower density + than their neighbors. These are considered outliers. + + .. versionadded:: 0.19 + + Parameters + ---------- + n_neighbors : int, default=20 + Number of neighbors to use by default for :meth:`kneighbors` queries. + If n_neighbors is larger than the number of samples provided, + all samples will be used. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf is size passed to :class:`BallTree` or :class:`KDTree`. This can + affect the speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + p : float, default=2 + Parameter for the Minkowski metric from + :func:`sklearn.metrics.pairwise_distances`. When p = 1, this + is equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + contamination : 'auto' or float, default='auto' + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. When fitting this is used to define the + threshold on the scores of the samples. + + - if 'auto', the threshold is determined as in the + original paper, + - if a float, the contamination should be in the range (0, 0.5]. + + .. versionchanged:: 0.22 + The default value of ``contamination`` changed from 0.1 + to ``'auto'``. + + novelty : bool, default=False + By default, LocalOutlierFactor is only meant to be used for outlier + detection (novelty=False). Set novelty to True if you want to use + LocalOutlierFactor for novelty detection. In this case be aware that + you should only use predict, decision_function and score_samples + on new unseen data and not on the training set; and note that the + results obtained this way may differ from the standard LOF results. + + .. versionadded:: 0.20 + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. 
See :term:`Glossary ` + for more details. + + References + ---------- + Scikit-learn API: sklearn.neighbors.LocalOutlierFactor + https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html# + """ + + super().__init__() + self.n_neighbors = n_neighbors + self.algorithm = algorithm + self.leaf_size = leaf_size + self.metric = metric + self.p = p + self.metric_params = metric_params + self.contamination = contamination + self.novelty = novelty + self.n_jobs = n_jobs + + self.model = LocalOutlierFactor( + n_neighbors=self.n_neighbors, + algorithm=self.algorithm, + leaf_size=self.leaf_size, + metric=self.metric, + p=self.p, + metric_params=self.metric_params, + contamination=self.contamination, + novelty=self.novelty, + n_jobs=self.n_jobs, + ) + + self.naming = LocalOutlierFactorAbnormalDetection.name + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = local_outlier_factor_manual_hyper_parameters() + clear_output() + return hyper_parameters + + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithms by Scikit-learn framework.""" + pass diff --git a/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_iforest.py b/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_iforest.py index 41994f9e..385b8832 100644 --- a/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_iforest.py +++ b/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_iforest.py @@ -46,3 +46,41 @@ def isolation_forest_manual_hyper_parameters() -> Dict: else: hyper_parameters["max_samples"] = max_samples return hyper_parameters + + +def local_outlier_factor_manual_hyper_parameters() -> Dict: + """Manually set hyperparameters. + + Returns + ------- + hyper_parameters : dict + """ + print("N neighbors: The number of neighbors to use.") + print("Please specify the number of neighbors. A good starting range could be between 10 and 50, such as 20.") + n_neighbors = num_input(SECTION[2], "@N Neighbors: ") + + print("Leaf size: The leaf size used in the ball tree or KD tree.") + print("Please specify the leaf size. A good starting range could be between 20 and 50, such as 30.") + leaf_size = num_input(SECTION[2], "@Leaf Size: ") + + print("P: The power parameter for the Minkowski metric.") + print("Please specify the power parameter. A good starting range could be between 1 and 3, such as 2.0.") + p = float_input(2.0, SECTION[2], "@P: ") + + print("Contamination: The amount of contamination of the data set.") + print("Please specify the contamination of the data set. A good starting range could be between 0.1 and 0.5, such as 0.3.") + contamination = float_input(0.3, SECTION[2], "@Contamination: ") + + print("N jobs: The number of parallel jobs to run.") + print("Please specify the number of jobs. Use -1 to use all available CPUs, 1 for no parallelism, or specify the number of CPUs to use. 
A good starting value is None.") + n_jobs = num_input(SECTION[2], "@N Jobs: ") + + hyper_parameters = { + "n_neighbors": n_neighbors, + "leaf_size": leaf_size, + "p": p, + "contamination": contamination, + "n_jobs": n_jobs, + } + + return hyper_parameters diff --git a/geochemistrypi/data_mining/process/detect.py b/geochemistrypi/data_mining/process/detect.py index 94313758..0c3e45dd 100644 --- a/geochemistrypi/data_mining/process/detect.py +++ b/geochemistrypi/data_mining/process/detect.py @@ -4,7 +4,7 @@ import pandas as pd from ..constants import MLFLOW_ARTIFACT_DATA_PATH -from ..model.detection import AbnormalDetectionWorkflowBase, IsolationForestAbnormalDetection +from ..model.detection import AbnormalDetectionWorkflowBase, IsolationForestAbnormalDetection, LocalOutlierFactorAbnormalDetection from ._base import ModelSelectionBase @@ -40,6 +40,16 @@ def activate( max_samples=hyper_parameters["max_samples"], ) + if self.model_name == "Local Outlier Factor": + hyper_parameters = LocalOutlierFactorAbnormalDetection.manual_hyper_parameters() + self.ad_workflow = LocalOutlierFactorAbnormalDetection( + n_neighbors=hyper_parameters["n_neighbors"], + contamination=hyper_parameters["contamination"], + leaf_size=hyper_parameters["leaf_size"], + n_jobs=hyper_parameters["n_jobs"], + p=hyper_parameters["p"], + ) + self.ad_workflow.show_info() # Use Scikit-learn style API to process input data From 05383aeeeee3d1a3b034ef403682b61ca957f71a Mon Sep 17 00:00:00 2001 From: Panyan Weng Date: Fri, 12 Jul 2024 19:58:44 +0800 Subject: [PATCH 07/12] fix the description of input and reformat manual hyper parameter function for local outlier factor --- geochemistrypi/data_mining/model/detection.py | 3 +- .../func/algo_abnormaldetection/_iforest.py | 38 ------------------ .../_local_outlier_factor.py | 39 +++++++++++++++++++ 3 files changed, 41 insertions(+), 39 deletions(-) create mode 100644 geochemistrypi/data_mining/model/func/algo_abnormaldetection/_local_outlier_factor.py diff --git a/geochemistrypi/data_mining/model/detection.py b/geochemistrypi/data_mining/model/detection.py index 930d703a..9dff73f2 100644 --- a/geochemistrypi/data_mining/model/detection.py +++ b/geochemistrypi/data_mining/model/detection.py @@ -10,7 +10,8 @@ from ..utils.base import clear_output from ._base import WorkflowBase -from .func.algo_abnormaldetection._iforest import isolation_forest_manual_hyper_parameters, local_outlier_factor_manual_hyper_parameters +from .func.algo_abnormaldetection._iforest import isolation_forest_manual_hyper_parameters +from .func.algo_abnormaldetection._local_outlier_factor import local_outlier_factor_manual_hyper_parameters class AbnormalDetectionWorkflowBase(WorkflowBase): diff --git a/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_iforest.py b/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_iforest.py index 385b8832..41994f9e 100644 --- a/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_iforest.py +++ b/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_iforest.py @@ -46,41 +46,3 @@ def isolation_forest_manual_hyper_parameters() -> Dict: else: hyper_parameters["max_samples"] = max_samples return hyper_parameters - - -def local_outlier_factor_manual_hyper_parameters() -> Dict: - """Manually set hyperparameters. - - Returns - ------- - hyper_parameters : dict - """ - print("N neighbors: The number of neighbors to use.") - print("Please specify the number of neighbors. 
A good starting range could be between 10 and 50, such as 20.") - n_neighbors = num_input(SECTION[2], "@N Neighbors: ") - - print("Leaf size: The leaf size used in the ball tree or KD tree.") - print("Please specify the leaf size. A good starting range could be between 20 and 50, such as 30.") - leaf_size = num_input(SECTION[2], "@Leaf Size: ") - - print("P: The power parameter for the Minkowski metric.") - print("Please specify the power parameter. A good starting range could be between 1 and 3, such as 2.0.") - p = float_input(2.0, SECTION[2], "@P: ") - - print("Contamination: The amount of contamination of the data set.") - print("Please specify the contamination of the data set. A good starting range could be between 0.1 and 0.5, such as 0.3.") - contamination = float_input(0.3, SECTION[2], "@Contamination: ") - - print("N jobs: The number of parallel jobs to run.") - print("Please specify the number of jobs. Use -1 to use all available CPUs, 1 for no parallelism, or specify the number of CPUs to use. A good starting value is None.") - n_jobs = num_input(SECTION[2], "@N Jobs: ") - - hyper_parameters = { - "n_neighbors": n_neighbors, - "leaf_size": leaf_size, - "p": p, - "contamination": contamination, - "n_jobs": n_jobs, - } - - return hyper_parameters diff --git a/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_local_outlier_factor.py b/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_local_outlier_factor.py new file mode 100644 index 00000000..984b3750 --- /dev/null +++ b/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_local_outlier_factor.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +from typing import Dict + +from rich import print + +from ....constants import SECTION +from ....data.data_readiness import float_input, num_input + + +def local_outlier_factor_manual_hyper_parameters() -> Dict: + """Manually set hyperparameters. + + Returns + ------- + hyper_parameters : dict + """ + print("N neighbors: The number of neighbors to use.") + print("Please specify the number of neighbors. A good starting range could be between 10 and 50, such as 20.") + n_neighbors = num_input(SECTION[2], "@N Neighbors: ") + print("Leaf size: The leaf size used in the ball tree or KD tree.") + print("Please specify the leaf size. A good starting range could be between 20 and 50, such as 30.") + leaf_size = num_input(SECTION[2], "@Leaf Size: ") + print("P: The power parameter for the Minkowski metric.") + print("Please specify the power parameter. When p = 1, this is equivalent to using manhattan_distance, and when p = 2 euclidean_distance is applied. For arbitrary p, minkowski_distance is used.") + p = float_input(2.0, SECTION[2], "@P: ") + print("Contamination: The amount of contamination of the data set.") + print("Please specify the contamination of the data set. A good starting range could be between 0.1 and 0.5, such as 0.3.") + contamination = float_input(0.3, SECTION[2], "@Contamination: ") + print("N jobs: The number of parallel jobs to run.") + print("Please specify the number of jobs. Use -1 to use all available CPUs, 1 for no parallelism, or specify the number of CPUs to use. 
A good starting value is 1.") + n_jobs = num_input(SECTION[2], "@N Jobs: ") + hyper_parameters = { + "n_neighbors": n_neighbors, + "leaf_size": leaf_size, + "p": p, + "contamination": contamination, + "n_jobs": n_jobs, + } + return hyper_parameters From 5025813c6d94e04b7c7c33575519ba83336d9034 Mon Sep 17 00:00:00 2001 From: jin <571979568@qq.com> Date: Mon, 15 Jul 2024 07:52:02 +0800 Subject: [PATCH 08/12] docs:Update Regression.md of Model Example --- .../Model Example/Regression/Regression.md | 490 +++++++++--------- 1 file changed, 249 insertions(+), 241 deletions(-) diff --git a/docs/source/For User/Model Example/Regression/Regression.md b/docs/source/For User/Model Example/Regression/Regression.md index 731b7df7..e36745cc 100644 --- a/docs/source/For User/Model Example/Regression/Regression.md +++ b/docs/source/For User/Model Example/Regression/Regression.md @@ -24,19 +24,11 @@ By running this line of command, the following output should show up on your scr 2 - Data For Classification 3 - Data For Clustering 4 - Data For Dimensional Reduction -(User) ➜ @Number: +(User) ➜ @Number: 1 ``` Enter the serial number of the sub-menu you want to choose and press `Enter`. In this doc, we will focus on the usage of Regression function, to do that, enter `1` and press `Enter`. -```python --*-*- Built-in Data Option-*-*- -1 - Data For Regression -2 - Data For Classification -3 - Data For Clustering -4 - Data For Dimensional Reduction -(User) ➜ @Number: 1 -``` ### 2.2 Generate a map projection @@ -103,7 +95,7 @@ After pressing `Enter`to move forward, you will see a question pops up enquiring World Map Projection for A Specific Element Option: 1 - Yes 2 - No -(Plot) ➜ @Number: +(Plot) ➜ @Number: 1 ``` By choosing “Yes”, you can then choose one element to be projected in the world map; By choosing “No”, you can skip to the next mode. For demonstrating, we choose “Yes” in this case: @@ -161,16 +153,17 @@ Index - Column Name 46 - TH(PPM) 47 - U(PPM) -------------------- -(Plot) ➜ @Number: +(Plot) ➜ @Number: 10 ``` Here, we choose “10 - AL2O3(WT%)” as an example, after this, the path to save the image will be presented: ```python -Save figure 'Map Projection - AL2O3(WT%)' in /home/yucheng/output/images/ma +Save figure 'Map Projection - AL2O3(WT%)' in C:\Users\YSQ\geopi_output\Regression\test\artifacts\image\map. +Successfully store 'Map Projection - AL2O3(WT%)'in 'Map Projection - AL2O3(WT%).xlsx' inC:\Users\YSQ\geopi_output\Regression\test\artifacts\image\map. ``` -![Map Projection - AL2O3(WT%)](https://github.com/ZJUEarthData/geochemistrypi/assets/66779478/0edf28b3-3006-49e6-a2b4-8ddcc7f94306) +![Map Projection - AL2O3(WT%)](https://github.com/ZJUEarthData/geochemistrypi/assets/162782014/5b642790-99a6-4421-9422-7fd482a6d425)
Map Projection - AL2O3(WT%)
When you see the following instruction: @@ -179,10 +172,10 @@ When you see the following instruction: Do you want to continue to project a new element in the World Map? 1 - Yes 2 - No -(Plot) ➜ @Number: +(Plot) ➜ @Number: 2 ``` -You can choose “Yes” to map another element or choose “No” to exit map mode. +You can choose “Yes” to map another element or choose “No” to exit map mode. Here, we choose 2 to skip this step. ### 2.3 Enter the range of data and check the output @@ -193,7 +186,7 @@ Select the data range you want to process. Input format: Format 1: "[**, **]; **; [**, **]", such as "[1, 3]; 7; [10, 13]" --> you want to deal with the columns 1, 2, 3, 7, 10, 11, 12, 13 Format 2: "xx", such as "7" --> you want to deal with the columns 7 -@input: +@input: [10,13] ``` Here, we use “[10, 13]” as an example. The values of the elements we choose would be shown on the screen. @@ -253,30 +246,25 @@ min 0.230000 0.000000 1.371100 13.170000 50% 4.720000 0.925000 2.690000 21.223500 75% 6.233341 1.243656 3.330000 22.185450 max 8.110000 3.869550 8.145000 25.362000 -Successfully calculate the pair-wise correlation coefficient among the selected columns. -Save figure 'Correlation Plot' in C:\Users\86188\geopi_output\GeoPi - Rock Classification\Xgboost Algorithm - Test 1\artifacts\image\statistic. -Successfully store 'Correlation Plot' in 'Correlation Plot.xlsx' in C:\Users\86188\geopi_output\GeoPi - Rock Classification\Xgboost Algorithm - Test 1\artifacts\image\statistic. -Successfully draw the distribution plot of the selected columns. -Save figure 'Distribution Histogram' in C:\Users\86188\geopi_output\GeoPi - Rock Classification\Xgboost Algorithm - Test 1\artifacts\image\statistic. -Successfully store 'Distribution Histogram' in 'Distribution Histogram.xlsx' in C:\Users\86188\geopi_output\GeoPi - Rock Classification\Xgboost Algorithm - Test 1\artifacts\image\statistic. -Successfully draw the distribution plot after log transformation of the selected columns. -Save figure 'Distribution Histogram After Log Transformation' in C:\Users\86188\geopi_output\GeoPi - Rock Classification\Xgboost Algorithm - Test 1\artifacts\image\statistic. -Successfully store 'Distribution Histogram After Log Transformation' in 'Distribution Histogram After Log Transformation.xlsx' in C:\Users\86188\geopi_output\GeoPi - Rock Classification\Xgboost Algorithm - Test -1\artifacts\image\statistic. -Successfully store 'Data Original' in 'Data Original.xlsx' in C:\Users\86188\geopi_output\GeoPi - Rock Classification\Xgboost Algorithm - Test 1\artifacts\data. -Successfully store 'Data Selected' in 'Data Selected.xlsx' in C:\Users\86188\geopi_output\GeoPi - Rock Classification\Xgboost Algorithm - Test 1\artifacts\data. -(Press Enter key to move forward.) +Successfully calculate the pair-wise correlation coefficient among the selected columns. Save figure 'Correlation Plot' in C:\Users\YSQ\geopi_output\Regression\test\artifacts\image\statistic. +Successfully store 'Correlation Plot' in 'Correlation Plot.xlsx' in C:\Users\YSQ\geopi_output\Regression\test\artifacts\image\statistic. +... +Successfully store 'Data Original' in 'DataOriginal.xlsx' in C:\Users\YSQ\geopi_output\Regression\test\artifacts\data. +Successfully store 'Data Selected' in 'DataSelected.xlsx' in C:\Users\YSQ\geopi_output\Regression\test\artifacts\data. ``` The function calculates the pairwise correlation coefficients among these elements and create a distribution plot for each element. 
Here are the plots generated by our example: -![Correlation Plot](https://github.com/ZJUEarthData/geochemistrypi/assets/66779478/be72b8da-aaca-4420-9d78-e8575d6ed8b4) + +![Correlation Plot](https://github.com/ZJUEarthData/geochemistrypi/assets/162782014/5774e386-c1ab-4347-8be0-592e00ab004f)
Correlation Plot
-![Distribution Histogram](https://github.com/ZJUEarthData/geochemistrypi/assets/66779478/96079dfe-8194-4412-af13-fe44fa1a3dd0) + +![Distribution Histogram](https://github.com/ZJUEarthData/geochemistrypi/assets/162782014/cfdd5c8b-2428-493d-98be-712885a1cde8)
Distribution Histogram
-![Distribution Histogram After Log Transformation](https://github.com/ZJUEarthData/geochemistrypi/assets/66779478/2156dc68-2989-44af-aa0b-f47e0ed56012) + +![Distribution Histogram After Log Transformation](https://github.com/ZJUEarthData/geochemistrypi/assets/162782014/7ebd82fa-1fb9-4cfe-9b59-9a479a59ca2b)
Distribution Histogram After Log Transformation
@@ -306,71 +294,27 @@ Note: you'd better use imputation techniques to deal with the missing values. (Press Enter key to move forward.) ``` -Here, we choose ”1 - Mean Values” as our strategy: - ```python --*-*- Strategy for Missing Values -*-*- -1 - Mean Value -2 - Median Value -3 - Most Frequent Value -4 - Constant(Specified Value) -Which strategy do you want to apply? +-*-*- Missing Values Process -*-*- +Do you want to deal with the missing values? +1 - Yes +2 - No (Data) ➜ @Number: 1 -Successfully fill the missing values with the mean value of each feature column respectively. -(Press Enter key to move forward.) ``` - -Here, the pragram is performing a hypothesis testing on the imputation method used to fill missing values in a dataset. The null hypothesis is that the distribution of the data set before and after imputing remains the same. The Kruskal Test is used to test this hypothesis, with a significance level of 0.05. Monte Carlo simulation is used with 100 iterations, each with a sample size of half the dataset (54 in this case). The p-values are calculated for each column and the columns that reject the null hypothesis are identified. +Here, let's choose 1 to deal with the missing values. ```python --*-*- Hypothesis Testing on Imputation Method -*-*- -Null Hypothesis: The distributions of the data set before and after imputing remain the same. -Thoughts: Check which column rejects null hypothesis. -Statistics Test Method: kruskal Test -Significance Level: 0.05 -The number of iterations of Monte Carlo simulation: 100 -The size of the sample for each iteration (half of the whole data set): 54 -Average p-value: -AL2O3(WT%) 1.0 -CR2O3(WT%) 0.9327453056346102 -FEOT(WT%) 1.0 -CAO(WT%) 1.0 -Note: 'p-value < 0.05' means imputation method doesn't apply to that column. -The columns which rejects null hypothesis: None -Successfully draw the respective probability plot (origin vs. impute) of the selected columns -Save figure 'Probability Plot' in C:\Users\86188\geopi_output\GeoPi - Rock Classification\Xgboost Algorithm - Test 1\artifacts\image\statistic. -Successfully store 'Probability Plot' in 'Probability Plot.xlsx' in C:\Users\86188\geopi_output\GeoPi - Rock Classification\Xgboost Algorithm - Test 1\artifacts\image\statistic. - -RangeIndex: 109 entries, 0 to 108 -Data columns (total 4 columns): - # Column Non-Null Count Dtype ---- ------ -------------- ----- - 0 AL2O3(WT%) 109 non-null float64 - 1 CR2O3(WT%) 109 non-null float64 - 2 FEOT(WT%) 109 non-null float64 - 3 CAO(WT%) 109 non-null float64 -dtypes: float64(4) -memory usage: 3.5 KB -None -Some basic statistic information of the designated data set: - AL2O3(WT%) CR2O3(WT%) FEOT(WT%) CAO(WT%) -count 109.000000 109.000000 109.000000 109.000000 -mean 4.554212 0.956426 2.962310 21.115756 -std 1.969756 0.524695 1.133967 1.964380 -min 0.230000 0.000000 1.371100 13.170000 -25% 3.110977 0.680000 2.350000 20.310000 -50% 4.720000 0.956426 2.690000 21.223500 -75% 6.233341 1.170000 3.330000 22.185450 -max 8.110000 3.869550 8.145000 25.362000 -Successfully store 'Data Selected Imputed' in 'Data Selected Imputed.xlsx' in C:\Users\86188\geopi_output\GeoPi - Rock Classification\Xgboost Algorithm - Test 1\artifacts\data. -(Press Enter key to move forward.) +-*-*- Strategy for Missing Values -*-*- +1 - Drop Rows with Missing Values +2 - Impute Missing Values +Notice: Drop the rows with missing values may lead to a significant loss of data if too many +features are chosen. +Which strategy do you want to apply? 
+(Data) ➜ @Number:1
```
+Here we simply drop the rows that contain missing values to keep the example straightforward.

-A probability plot of the selected columns is also drawn and saved in a specified location.
-![Probability Plot](https://github.com/ZJUEarthData/geochemistrypi/assets/66779478/8cab64ac-593a-4f46-bf58-475a59e993e8)
-
-<figure>
Probability Plot
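
Conceptually, the dropping strategy selected above is a single pandas call. Below is a hedged sketch; the file name is assumed for illustration, and only the column names come from this walkthrough:

```python
import pandas as pd

# The four columns selected earlier in this walkthrough (file name assumed).
df = pd.read_excel("Data_Regression.xlsx")[["AL2O3(WT%)", "CR2O3(WT%)", "FEOT(WT%)", "CAO(WT%)"]]

# Strategy "1 - Drop Rows with Missing Values": discard any row containing a NaN.
df = df.dropna(axis=0, how="any").reset_index(drop=True)

# In this walkthrough the row count drops from 109 to 98,
# which is why later outputs report "Length: 98".
print(len(df))
```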
### 2.5 Feature Engineering

@@ -389,7 +333,7 @@ Index - Column Name
 Feature Engineering Option:
 1 - Yes
 2 - No
-(Data) ➜ @Number:
+(Data) ➜ @Number: 1
 ```

After entering “1”, we are now ready to name the constructed feature and build the formula. In this example, we use “new Feature” as the name and build the formula “b*c+d”:

@@ -402,6 +346,8 @@ c - FEOT(WT%)
 d - CAO(WT%)
 Name the constructed feature (column name), like 'NEW-COMPOUND':
 @input: new Feature
+```
+```python
 Build up new feature with the combination of basic arithmatic operators, including '+', '-', '*', '/', '()'.
 Input example 1: a * b - c
 --> Step 1: Multiply a column with b column;
@@ -422,51 +368,66 @@ You can use mean(x) to calculate the average value.
 @input: b*c+d
 ```

-The output is as below:
+In this step, we enter b*c+d, and the output is as below:

```python
-Successfully construct a new feature "new Feature".
-0 23.005680
-1 22.084600
-2 23.126441
-3 24.392497
-4 23.394575
- ...
-104 23.801200
-105 22.017500
-106 27.033200
-107 23.825000
-108 22.656000
-Name: new Feature, Length: 109, dtype: float64
+Successfully construct a new feature new Feature.
+0 23.00568
+1 22.08460
+2 25.43000
+3 23.39590
+4 22.90900
+ ...
+93 23.80120
+94 22.01750
+95 27.03320
+96 23.82500
+97 22.65600
+Name: new Feature, Length: 98, dtype: float64
(Press Enter key to move forward.)
------------------
-RangeIndex: 109 entries, 0 to 108
+RangeIndex: 98 entries, 0 to 97
Data columns (total 5 columns):
 # Column Non-Null Count Dtype
--- ------ -------------- -----
- 0 AL2O3(WT%) 109 non-null float64
- 1 CR2O3(WT%) 109 non-null float64
- 2 FEOT(WT%) 109 non-null float64
- 3 CAO(WT%) 109 non-null float64
- 4 new Feature 109 non-null float64
+ 0 AL2O3(WT%) 98 non-null float64
+ 1 CR2O3(WT%) 98 non-null float64
+ 2 FEOT(WT%) 98 non-null float64
+ 3 CAO(WT%) 98 non-null float64
+ 4 new Feature 98 non-null float64
dtypes: float64(5)
-memory usage: 4.4 KB
+memory usage: 4.0 KB
None
Some basic statistic information of the designated data set:
- AL2O3(WT%) CR2O3(WT%) FEOT(WT%) CAO(WT%) new Feature
-count 109.000000 109.000000 109.000000 109.000000 109.000000
-mean 4.554212 0.956426 2.962310 21.115756 23.853732
-std 1.969756 0.524695 1.133967 1.964380 1.596076
-min 0.230000 0.000000 1.371100 13.170000 18.474000
-25% 3.110977 0.680000 2.350000 20.310000 22.909000
-50% 4.720000 0.956426 2.690000 21.223500 23.904360
-75% 6.233341 1.170000 3.330000 22.185450 24.763500
-max 8.110000 3.869550 8.145000 25.362000 29.231800
+ AL2O3(WT%) CR2O3(WT%) FEOT(WT%) CAO(WT%) new Feature
+count 98.000000 98.000000 98.000000 98.000000 98.000000
+mean 4.444082 0.956426 2.929757 21.187116 23.883266
+std 1.996912 0.553647 1.072481 1.891933 1.644173
+min 0.230000 0.000000 1.371100 13.170000 18.474000
+25% 3.051456 0.662500 2.347046 20.310000 22.872800
+50% 4.621250 0.925000 2.650000 21.310000 23.907180
+75% 6.222500 1.243656 3.346500 22.284019 24.795747
+max 8.110000 3.869550 8.145000 25.362000 29.231800
(Press Enter key to move forward.)
```

-After building the new feature, we can choose the mode to process data, in this doc, we choose “1 - Regression”:
+Once you have built all the features you need, select 'No' to proceed to the next step. Here, we choose 2.
+```python
+Do you want to continue to build a new feature?
+1 - Yes
+2 - No
+(Data) ➜ @Number:2
+```
+
+```
+Successfully store 'Data Selected Dropped-Imputed Feature-Engineering' in 'Data Selected
+Dropped-Imputed Feature-Engineering.xlsx' in
+C:\Users\YSQ\geopi_output\Regression\test\artifacts\data.
+Exit Feature Engineering Mode.
+(Press Enter key to move forward.)
+```
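
Under the hood, the constructed feature is plain column arithmetic. Continuing the pandas sketch above, with the letter mapping from the prompt (a - AL2O3(WT%), b - CR2O3(WT%), c - FEOT(WT%), d - CAO(WT%)):

```python
# "b*c+d" therefore evaluates to CR2O3 * FEOT + CAO, row by row.
df["new Feature"] = df["CR2O3(WT%)"] * df["FEOT(WT%)"] + df["CAO(WT%)"]

# Summary statistics comparable to the ones printed above.
print(df["new Feature"].describe())
```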
+
+After building the feature, we can choose the mode to process the data. In this example, we choose “1 - Regression”:

```python
-*-*- Mode Selection -*-*-
@@ -482,7 +443,7 @@ After entering the Regression menu, we are going to input X Set and Y Set separately; note that the new feature we just created is also in the list:

```python
--*-*- Data Split - X Set and Y Set-*-*-
+-*-*- Data Segmentation - X Set and Y Set -*-*-
 Divide the processing data set into X (feature value) and Y (target value) respectively.
 Selected sub data set to create X data set:
 --------------------
@@ -498,42 +459,101 @@ Select the data range you want to process.
 Input format:
 Format 1: "[**, **]; **; [**, **]", such as "[1, 3]; 7; [10, 13]" --> you want to deal with the columns 1, 2, 3, 7, 10, 11, 12, 13
 Format 2: "xx", such as "7" --> you want to deal with the columns 7
-@input:
+@input: 1
 ```

After entering the X Set, a confirmation message and basic statistical information will be shown:

```python
-uccessfully create X data set.
+Successfully create X data set.
The Selected Data Set:
- AL2O3(WT%)
-0 3.936000
-1 3.040000
-2 7.016561
-3 3.110977
-4 6.971044
-.. ...
-104 2.740000
-105 5.700000
-106 0.230000
-107 2.580000
-108 6.490000
-
-[109 rows x 1 columns]
+ AL2O3(WT%)
+0 3.936
+1 3.040
+2 4.220
+3 6.980
+4 6.250
+.. ...
+93 2.740
+94 5.700
+95 0.230
+96 2.580
+97 6.490
+
+[98 rows x 1 columns]
Basic Statistical Information:
Some basic statistic information of the designated data set:
 AL2O3(WT%)
-count 109.000000
-mean 4.554212
-std 1.969756
+count 98.000000
+mean 4.444082
+std 1.996912
min 0.230000
-25% 3.110977
-50% 4.720000
-75% 6.233341
+25% 3.051456
+50% 4.621250
+75% 6.222500
max 8.110000
-Successfully store 'X Without Scaling' in 'X Without Scaling.xlsx' in /home/yucheng/output/data.
+Successfully store 'X Without Scaling' in 'X Without Scaling.xlsx' in
+C:\Users\YSQ\geopi_output\Regression\test\artifacts\data.
(Press Enter key to move forward.)
```

+Then, input the Y Set in the same way, choosing "2 - CR2O3(WT%)".
+```python
+-*-*- Data Segmentation - X Set and Y Set -*-*-
+Selected sub data set to create Y data set:
+--------------------
+Index - Column Name
+1 - AL2O3(WT%)
+2 - CR2O3(WT%)
+3 - FEOT(WT%)
+4 - CAO(WT%)
+5 - new Feature
+--------------------
+The selected Y data set:
+Notice: Normally, please choose only one column to be tag column Y, not multiple columns.
+Notice: For classification model training, please choose the label column which has
+distinctive integers.
+Select the data range you want to process.
+Input format:
+Format 1: "[**, **]; **; [**, **]", such as "[1, 3]; 7; [10, 13]" --> you want to deal with the columns 1, 2, 3, 7, 10, 11, 12, 13
+Format 2: "xx", such as "7" --> you want to deal with the columns 7
+@input:2
+```
+
+A confirmation message and basic statistical information will be shown:
+
+```python
+Successfully create Y data set.
+The Selected Data Set:
+ CR2O3(WT%)
+0 1.440
+1 0.578
+2 1.000
+3 0.830
+4 0.740
+.. ...
+93 0.060
+94 0.690
+95 2.910
+96 0.750
+97 0.800
+
+[98 rows x 1 columns]
+Basic Statistical Information:
+Some basic statistic information of the designated data set:
+ CR2O3(WT%)
+count 98.000000
+mean 0.956426
+std 0.553647
+min 0.000000
+25% 0.662500
+50% 0.925000
+75% 1.243656
+max 3.869550
+Successfully store 'Y' in 'Y.xlsx' in
+C:\Users\YSQ\geopi_output\Regression\test\artifacts\data.
+(Press Enter key to move forward.)
+```
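
The X/Y segmentation and the upcoming 20% test split are equally small operations conceptually. Continuing the same hypothetical sketch (not the tool's internal code):

```python
from sklearn.model_selection import train_test_split

# Column 1 as the feature set X, column 2 as the target Y, mirroring the prompts.
X = df[["AL2O3(WT%)"]]
y = df[["CR2O3(WT%)"]]

# A test ratio of 0.2 leaves roughly 78 training rows and 20 test rows out of 98.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(len(X_train), len(X_test))
```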
+

After this, you may choose to process feature scaling on X Set or not:

@@ -541,10 +561,18 @@
 -*-*- Feature Scaling on X Set -*-*-
 1 - Yes
 2 - No
-(Data) ➜ @Number:
+(Data) ➜ @Number: 2
+```
+
+In this example, we choose 2. For the next step, feature selection, we also choose option 2.
+
+```python
+-*-*- Feature Selection on X set -*-*-
+1 - Yes
+2 - No
+(Data) ➜ @Number:2
 ```

-In the similar manner, we then set Y Set and check the related information generated onto the screen.
The next step is to split the data into a training set and a test set. The test set will be used to evaluate the performance of the machine learning model that will be trained on the training set. In this example, we set aside 20% of the data as the test set. This means that 80% of the data will be used as the training set. The data split is important to prevent overfitting of the model on the training data and to ensure that the model's performance can be generalized to new, unseen data:

@@ -554,10 +582,17 @@
 Note: Normally, set 20% of the dataset aside as test set, such as 0.2
 (Data) ➜ @Test Ratio: 0.2
 ```

-After checking the output, you should be able to see a menu to choose a machine learning model for your data, in this example, we are going to use “2 - Polynomial Regression”:
+
+
+
+//
+
+
+
+After checking the output, you should be able to see a menu to choose a machine learning model for your data. In this example, we are going to use “7 - Extra-Trees”.

```python
--*-*- Model Selection -*-*-:
+-*-*- Model Selection -*-*-
1 - Linear Regression
2 - Polynomial Regression
3 - K-Nearest Neighbors
@@ -566,106 +601,79 @@ After checking the output, you should be able to see a menu
6 - Random Forest
7 - Extra-Trees
8 - Gradient Boosting
-9 - Xgboost
+9 - XGBoost
10 - Multi-layer Perceptron
11 - Lasso Regression
-12 - All models above to be trained
+12 - Elastic Net
+13 - SGD Regression
+14 - BayesianRidge Regression
+15 - All models above to be trained
Which model do you want to apply?(Enter the Corresponding Number)
-(Model) ➜ @Number: 2
+(Model) ➜ @Number:7
```

-After choosing the model, the command line may prompt you to provide more specific options in terms of the model you choose, after offering the options, the program is good to go! And you may check the output like this after processing:
+The program ships with an automated machine learning option; simply choose '1' to use it.
+```python
+Do you want to employ automated machine learning with respect
+to this algorithm?(Enter the Corresponding Number):
+1 - Yes
+2 - No
+(Model) ➜ @Number:1
+```

```python
-*-**-* Polynomial Regression is running ... *-**-*
-Expected Functionality:
-+ Model Score
-+ Cross Validation
-+ Model Prediction
-+ Model Persistence
-+ Predicted vs.
Actual Diagram -+ Residuals Diagram -+ Permutation Importance Diagram -+ Polynomial Regression Formula ------* Model Score *----- -Root Mean Square Error: 1.2981800081993564 -Mean Absolute Error: 0.8666537321359384 -R2 Score: -0.5692041761356125 -Explained Variance Score: -0.5635060495257759 ------* Cross Validation *----- -K-Folds: 10 -* Fit Time * -Scores: [0.00217414 0.00214863 0.00225115 0.00212574 0.00201654 0.00203323 - 0.00196433 0.00200295 0.00195527 0.00195432] -Mean: 0.0020626306533813475 -Standard deviation: 9.940905756158756e-05 -------------- -* Score Time * -Scores: [0.00440168 0.00398946 0.00407624 0.0041182 0.00420284 0.00452423 - 0.00406241 0.00427079 0.00406742 0.00404215] -Mean: 0.004175543785095215 -Standard deviation: 0.0001651057611709732 -------------- -* Root Mean Square Error * -Scores: [1.15785222 1.29457522 2.71100276 3.38856833 0.94791697 1.0329962 - 1.54759602 1.8725529 1.82623562 0.84039699] -Mean: 1.6619693228088945 -Standard deviation: 0.7833005136355865 -------------- -* Mean Absolute Error * -Scores: [0.86020769 0.85255076 1.71707909 2.17595274 0.73042456 0.8864327 - 1.2754413 1.32740744 1.48587525 0.67660019] -Mean: 1.1987971734378662 -Standard deviation: 0.4639441214337496 -------------- -* R2 Score * -Scores: [ 0.3821429 -0.12200627 -0.58303497 -0.98544835 0.3240076 0.02309755 - -0.93382518 -9.20857756 -1.11023532 -0.50902637] -Mean: -1.2722905973773913 -Standard deviation: 2.6935459556340082 -------------- -* Explained Variance Score * -Scores: [ 0.42490745 -0.01768215 -0.54672932 -0.90106814 0.32644583 0.18391296 - -0.92481771 -7.4016756 -0.39601889 0.24420376] -Mean: -0.9008521815781642 -Standard deviation: 2.2175052662305945 -------------- ------* Predicted Value Evaluation *----- -Save figure 'Predicted Value Evaluation - Polynomial Regression' in /home/yucheng/output/images/model_output. ------* True Value vs. Predicted Value *----- -Save figure 'True Value vs. Predicted Value - Polynomial Regression' in /home/yucheng/output/images/model_output. ------* Polynomial Regression Formula *----- -y = 1.168AL2O3(WT%)+4.677CR2O3(WT%)-0.085AL2O3(WT%)^2-2.572AL2O3(WT%) CR2O3(WT%)-2.229CR2O3(WT%)^2+0.002AL2O3(WT%)^3+0.14AL2O3(WT%)^2 CR2O3(WT%)+0.762AL2O3(WT%) CR2O3(WT%)^2+0.232CR2O3(WT%)^3+1.4708950432993957 ------* Model Prediction *----- - FEOT(WT%) CAO(WT%) -0 6.234901 21.516655 -1 3.081208 20.471231 -2 3.082333 19.539309 -3 2.838430 20.666521 -4 2.434649 21.558533 -5 2.478282 21.784115 -6 2.689378 20.075947 -7 2.744644 21.954583 -8 3.336340 22.054664 -9 3.033059 20.288637 -10 3.268753 21.438835 -11 3.129242 22.290128 -12 2.451531 21.640214 -13 2.984390 19.752188 -14 2.513781 21.035197 -15 2.699384 20.676107 -16 2.641574 21.844654 -17 3.449548 20.632201 -18 3.134386 22.138135 -19 2.986511 21.673300 -20 2.899159 19.943711 -21 2.606604 22.146161 -Successfully store 'Y Test Predict' in 'Y Test Predict.xlsx' in /home/yucheng/output/data. ------* Model Persistence *----- -Successfully store the trained model 'Polynomial Regression' in 'Polynomial_Regression_2023-02-24.pkl' in /home/yucheng/output/trained_models. -Successfully store the trained model 'Polynomial Regression' in 'Polynomial_Regression_2023-02-24.joblib' in /home/yucheng/output/trained_models. +-*-*- Feature Engineering on Application Data -*-*- +The same feature engineering operation will be applied to the +inference data. +Successfully construct a new feature new Feature. +0 NaN +1 NaN +2 NaN +3 25.430000 +4 22.909000 +5 23.211800 +... 
+49 25.158800 +50 23.342814 +51 21.512000 +52 25.668000 +53 23.801200 +54 23.825000 +Name: new Feature, dtype: float64 +Successfully store 'Application Data Original' in 'Application Data Original.xlsx' in C:\Users\YSQ\geopi_output\GeoPi - Rock Classification\Regression\artifacts\data. +Successfully store 'Application Data Feature-Engineering' in 'Application Data Feature-Engineering.xlsx' in C:\Users\YSQ\geopi_output\GeoPi - Rock Classification\Regression\artifacts\data. +Successfully store 'Application Data Feature-Engineering Selected' in 'Application Data Feature-Engineering Selected.xlsx' in C:\Users\YSQ\geopi_output\GeoPi - Rock Classification\Regression\artifacts\data. +(Press Enter key to move forward.) ``` + +After moving to the next step, the Extra-Trees algorithm training will run automatically. +This includes functionalities such as Model Scoring, Cross Validation, Predicted vs. Actual Diagram, Residuals Diagram, Permutation Importance Diagram, Feature Importance Diagram, Single Tree Diagram, Model Prediction, and Model Persistence. +You can find the output stored in the specified path. + +```python +-*-*- Transform Pipeline Construction -*-*- +Build the transform pipeline according to the previous operations. +Successfully store 'Transform Pipeline Configuration' in 'Transform Pipeline Configuration.txt' in C:\Users\YSQ\geopi_output\GeoPi - Rock Classification\Regression\artifacts. +(Press Enter key to move forward.) + +-*-*- Model Inference -*-*- +Use the trained model to make predictions on the application data. +Successfully store 'Application Data Predicted' in 'Application Data Predicted.xlsx' in C:\Users\YSQ\geopi_output\GeoPi - Rock Classification\Regression\artifacts\data. +Successfully store 'Application Data Feature-Engineering Selected Dropped-Imputed' in 'Application Data Feature-Engineering Selected Dropped-Imputed.xlsx' in C:\Users\YSQ\geopi_output\GeoPi - Rock Classification\Regression\artifacts\data. +(Press Enter key to move forward.) ``` + + +![Feature Importance - Extra-Trees](https://github.com/user-attachments/assets/1d3f6177-8495-445f-ae6b-7bee8ab002a7) +
Feature Importance - Extra-Trees
+![Permutation Importance - Extra-Trees](https://github.com/user-attachments/assets/22d825b4-c58d-4ead-ad6a-35cf7179a426) +
Permutation Importance - Extra-Trees
+![Predicted vs Actual Diagram - Extra-Trees](https://github.com/user-attachments/assets/a1adc20d-cfc0-459d-8a05-83c8b877af93) +
Predicted vs Actual Diagram - Extra-Trees
+![Residuals Diagram - Extra-Trees](https://github.com/user-attachments/assets/08cf261b-a60f-4bf5-94b1-c7f03e9c3358) +
Residuals Diagram - Extra-Trees
+![Tree Diagram - Extra-Trees](https://github.com/user-attachments/assets/619d9a3d-9b60-4560-bdfd-3623ba293bff) +
Tree Diagram - Extra-Trees
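
Before moving on to the next patch, it may help to see the core of what the Extra-Trees run above computes. The sketch below is self-contained on synthetic data and is not the program's internal code; it only mirrors the fit, score, and importance steps whose outputs are pictured above, with illustrative hyperparameters rather than the values the automated mode would select:

```python
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(42)
# Synthetic stand-ins for the tutorial's feature column and target.
X = rng.uniform(0.2, 8.1, size=(98, 1))             # e.g. AL2O3(WT%)
y = 0.3 * X[:, 0] + rng.normal(scale=0.4, size=98)  # e.g. CR2O3(WT%)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Illustrative hyperparameters; the automated mode tunes these instead.
model = ExtraTreesRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print("R2 on the test set:", model.score(X_test, y_test))
print("Feature importances:", model.feature_importances_)  # basis of the diagram above
```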
From f1cbe82ec53963df27613abd354e452b7e420701 Mon Sep 17 00:00:00 2001 From: jin <571979568@qq.com> Date: Mon, 15 Jul 2024 12:14:44 +0800 Subject: [PATCH 09/12] fix:Optimization of Regression.md format problem --- .../Model Example/Regression/Regression.md | 28 ++++++++----------- .../geochemistrypi.data_mining.rst | 8 ++++++ 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/docs/source/For User/Model Example/Regression/Regression.md b/docs/source/For User/Model Example/Regression/Regression.md index e36745cc..b57debd9 100644 --- a/docs/source/For User/Model Example/Regression/Regression.md +++ b/docs/source/For User/Model Example/Regression/Regression.md @@ -12,7 +12,7 @@ There are several types of regression models, including linear regression, polyn Overall, regression is a powerful tool for predicting numerical values, and is used in a wide range of applications, from finance and economics to healthcare and social sciences. -## 2. Introduction to Regression function of `Geochemistry π` +## 2. Introduction to Regression function of Geochemistry π ### 2.1 Enter the sub-menu of Regression @@ -259,7 +259,6 @@ The function calculates the pairwise correlation coefficients among these elemen ![Correlation Plot](https://github.com/ZJUEarthData/geochemistrypi/assets/162782014/5774e386-c1ab-4347-8be0-592e00ab004f)
Correlation Plot
- ![Distribution Histogram](https://github.com/ZJUEarthData/geochemistrypi/assets/162782014/cfdd5c8b-2428-493d-98be-712885a1cde8)
Distribution Histogram
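
The correlation plot and distribution histograms above are standard pandas diagnostics. A self-contained sketch on synthetic data, not the program's plotting code:

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
# Synthetic stand-ins for the selected oxide columns.
df = pd.DataFrame({
    "AL2O3(WT%)": rng.uniform(0.2, 8.1, 98),
    "CAO(WT%)": rng.normal(21.2, 1.9, 98),
})
df["FEOT(WT%)"] = 0.2 * df["AL2O3(WT%)"] + rng.normal(2.9, 1.0, 98)

# Pairwise Pearson correlation coefficients, as in the correlation plot.
print(df.corr(method="pearson"))

# One histogram per column, as in the distribution histogram.
df.hist(bins=20, figsize=(8, 6))
plt.tight_layout()
plt.savefig("Distribution Histogram.png")
```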
@@ -314,8 +313,6 @@ Which strategy do you want to apply?
 ```

 Here we simply drop the rows that contain missing values to keep the example straightforward.
-
-
 ### 2.5 Feature Engineering

 Next, you can choose “Yes” for the feature engineering option to construct a new feature from the dataset we selected earlier, or “No” to exit Feature Engineering mode:
@@ -438,7 +435,7 @@ After building the feature, we can choose the mode to process the data. In this
 (Model) ➜ @Number: 1
 ```

-## 3. Regression Model-Running
+## 3. Model Selection

 After entering the Regression menu, we are going to input X Set and Y Set separately; note that the new feature we just created is also in the list:

@@ -582,13 +579,6 @@ Note: Normally, set 20% of the dataset aside as test set, such as 0.2
 (Data) ➜ @Test Ratio: 0.2
 ```

-
-
-
-//
-
-
-
 After checking the output, you should be able to see a menu to choose a machine learning model for your data. In this example, we are going to use “7 - Extra-Trees”.

 ```python
@@ -668,12 +658,16 @@ Successfully store 'Application Data Feature-Engineering Selected Dropped-Impute

 ![Feature Importance - Extra-Trees](https://github.com/user-attachments/assets/1d3f6177-8495-445f-ae6b-7bee8ab002a7)
Feature Importance - Extra-Trees
+
Feature Importance - Extra-Trees
+ ![Permutation Importance - Extra-Trees](https://github.com/user-attachments/assets/22d825b4-c58d-4ead-ad6a-35cf7179a426) -
Permutation Importance - Extra-Trees
+
Permutation Importance - Extra-Trees
+ ![Predicted vs Actual Diagram - Extra-Trees](https://github.com/user-attachments/assets/a1adc20d-cfc0-459d-8a05-83c8b877af93) -
Predicted vs Actual Diagram - Extra-Trees
+
Predicted vs Actual Diagram - Extra-Trees
+ ![Residuals Diagram - Extra-Trees](https://github.com/user-attachments/assets/08cf261b-a60f-4bf5-94b1-c7f03e9c3358) -
Residuals Diagram - Extra-Trees
+
Residuals Diagram - Extra-Trees
+ ![Tree Diagram - Extra-Trees](https://github.com/user-attachments/assets/619d9a3d-9b60-4560-bdfd-3623ba293bff) -
Tree Diagram - Extra-Trees
+
Tree Diagram - Extra-Trees
diff --git a/docs/source/python_apis/geochemistrypi.data_mining.rst b/docs/source/python_apis/geochemistrypi.data_mining.rst index aea8e203..f8417a50 100644 --- a/docs/source/python_apis/geochemistrypi.data_mining.rst +++ b/docs/source/python_apis/geochemistrypi.data_mining.rst @@ -41,6 +41,14 @@ geochemistrypi.data\_mining.dash\_pipeline module :undoc-members: :show-inheritance: +geochemistrypi.data\_mining.enum module +--------------------------------------- + +.. automodule:: geochemistrypi.data_mining.enum + :members: + :undoc-members: + :show-inheritance: + geochemistrypi.data\_mining.router module ----------------------------------------- From c6de3771fbba576f30308ed50515fe3f6768f44e Mon Sep 17 00:00:00 2001 From: Mengqi <2534671415@qq.com> Date: Sat, 20 Jul 2024 16:35:52 +0800 Subject: [PATCH 10/12] perf: change 'abnormal' to 'anomaly'. --- README.md | 2 +- .../Add New Model To Framework.md | 4 +- .../anomaly_detection.md} | 26 ++++++------ .../Data_Preprocessing/Data Preprocessing.md | 2 +- docs/source/Home/CHANGELOG.md | 4 +- docs/source/Home/Introduction.md | 2 +- docs/source/model example.rst | 2 +- ...ning.model.func.algo_anomalydetection.rst} | 4 +- .../geochemistrypi.data_mining.model.func.rst | 2 +- geochemistrypi/data_mining/cli_pipeline.py | 12 +++--- geochemistrypi/data_mining/constants.py | 6 +-- ...ection.xlsx => Data_AnomalyDetection.xlsx} | Bin geochemistrypi/data_mining/enum.py | 2 +- geochemistrypi/data_mining/model/detection.py | 37 +++++++++--------- .../__init__.py | 0 .../_iforest.py | 0 .../_local_outlier_factor.py | 0 geochemistrypi/data_mining/process/detect.py | 26 ++++++------ 18 files changed, 65 insertions(+), 66 deletions(-) rename docs/source/For User/Model Example/{Abnormal_Detection/abnormal_detection.md => Anomaly_Detection/anomaly_detection.md} (96%) rename docs/source/python_apis/{geochemistrypi.data_mining.model.func.algo_abnormaldetection.rst => geochemistrypi.data_mining.model.func.algo_anomalydetection.rst} (51%) rename geochemistrypi/data_mining/data/dataset/{Data_AbnormalDetection.xlsx => Data_AnomalyDetection.xlsx} (100%) rename geochemistrypi/data_mining/model/func/{algo_abnormaldetection => algo_anomalydetection}/__init__.py (100%) rename geochemistrypi/data_mining/model/func/{algo_abnormaldetection => algo_anomalydetection}/_iforest.py (100%) rename geochemistrypi/data_mining/model/func/{algo_abnormaldetection => algo_anomalydetection}/_local_outlier_factor.py (100%) diff --git a/README.md b/README.md index a8c64248..0eb5627d 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ https://docs.qq.com/document/DQ3BDeHhxRGNzSXZN) + Data_Decomposition.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1kix82qj5--vhnm8-KhuUBH9dqYH6zcY8/edit?usp=sharing&ouid=110717816678586054594&rtpof=true&sd=true) | [[Tencent Docs]](https://docs.qq.com/document/DQ29oZ0lhUGtZUmdN?&u=6868f96d4a384b309036e04e637e367a) -+ Data_AbnormalDetectioon.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]]( ++ Data_AnomalyDetection.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]]( https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT) **Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**. 
diff --git a/docs/source/For Developer/Add New Model To Framework.md b/docs/source/For Developer/Add New Model To Framework.md
index d2276ea9..5f4a2a08 100644
--- a/docs/source/For Developer/Add New Model To Framework.md
+++ b/docs/source/For Developer/Add New Model To Framework.md
@@ -749,7 +749,7 @@ def activate(
     """Train by Scikit-learn framework."""
 ```

-For unsupervised learning (clustering, decomposition and abnormaly detection), the signature of `activate` method is:
+For unsupervised learning (clustering, decomposition and anomaly detection), the signature of `activate` method is:
 ```
 def activate(
     self,
@@ -800,7 +800,7 @@ The differences above include the signature of @dispatch and the signature of `a
 There are two `activate` methods defined in the Regression and Classification mode, the first method uses the Scikit-learn framework, and the second method uses the FLAML and RAY frameworks. Decomposition and Clustering algorithms only use the Scikit-learn framework. The instantiation of the model workflow class inside the `activate` method builds the connection between Layer 3 and Layer 2.

-(1) The invocatioin of model workflow class in the first activate method (Used in classification, regression,decomposition, clustering, abnormaly detection) needs to pass the hyperparameters for manual ML:
+(1) The invocation of the model workflow class in the first activate method (used in classification, regression, decomposition, clustering, anomaly detection) needs to pass the hyperparameters for manual ML:
 ```
 elif self.model_name == "ModelName":
     hyper_parameters = ModelWorkflowClass.manual_hyper_parameters()
diff --git a/docs/source/For User/Model Example/Abnormal_Detection/abnormal_detection.md b/docs/source/For User/Model Example/Anomaly_Detection/anomaly_detection.md
similarity index 96%
rename from docs/source/For User/Model Example/Abnormal_Detection/abnormal_detection.md
rename to docs/source/For User/Model Example/Anomaly_Detection/anomaly_detection.md
index 426e6ad6..ea789a2d 100644
--- a/docs/source/For User/Model Example/Abnormal_Detection/abnormal_detection.md
+++ b/docs/source/For User/Model Example/Anomaly_Detection/anomaly_detection.md
@@ -1,4 +1,4 @@
-# Abnomal Detection - Isolation Forest
+# Anomaly Detection - Isolation Forest

 Anomaly detection is a broad problem-solving strategy that encompasses various algorithms, each with its own approach to identifying unusual data points. One such algorithm is the Isolation Forest, which distinguishes itself by constructing an ensemble of decision trees to isolate anomalies. The algorithm's core principle is that anomalies are more easily isolated, requiring fewer splits in the trees compared to normal data points.

@@ -75,19 +75,19 @@ After pressing the Enter key, the program propts the following options to let yo

 4 - Data For Dimensional Reduction

-5 - Data For Abnormal Detection
+5 - Data For Anomaly Detection

 (User) ➜ @Number: 5
 ```

-Here, we choose *_5 - Data For Abnormal Detection_* and press the Enter key to move forward.
+Here, we choose *_5 - Data For Anomaly Detection_* and press the Enter key to move forward.

 Now, you should see the output below on your screen:

 ```bash
 Successfully loading the built-in training data set
-'Data_AbnormalDetection.xlsx'.
+'Data_AnomalyDetection.xlsx'.
-------------------- @@ -778,7 +778,7 @@ Successfully store 'Data Selected Dropped-Imputed Feature-Engineering' in 'Data -We select **5 - Abnormal Detection** as our model: +We select **5 - Anomaly Detection** as our model: @@ -794,7 +794,7 @@ We select **5 - Abnormal Detection** as our model: 4 - Dimensional Reduction -5 - Abnormal Detection +5 - Anomaly Detection (Model) ➜ @Number: 5 (Press Enter key to move forward.) @@ -904,9 +904,9 @@ Expected Functionality: Successfully store 'Hyper Parameters - Isolation Forest' in 'Hyper Parameters - Isolation Forest.txt' in Users/geopi/geopi_output/GeoPi-Rock Isolation Forest/Algorithm - Test 1/parameters. ------* Abnormal Detection Data *----- +-----* Anomaly Detection Data *----- - SIO2(WT%) TIO2(WT%) ... MNO(WT%) NA2O(WT%) is_abnormal + SIO2(WT%) TIO2(WT%) ... MNO(WT%) NA2O(WT%) is_anomaly 0 53.536000 0.291000 ... 0.083000 0.861000 -1 @@ -932,11 +932,11 @@ Successfully store 'Hyper Parameters - Isolation Forest' in 'Hyper Parameters - [109 rows x 10 columns] -Successfully store 'X Abnormal Detection' in 'X Abnormal Detection.xlsx' in Users/geopi/geopi_output/GeoPi-Rock Isolation Forest/Algorithm - Test 1/data. +Successfully store 'X Anomaly Detection' in 'X Anomaly Detection.xlsx' in Users/geopi/geopi_output/GeoPi-Rock Isolation Forest/Algorithm - Test 1/data. -----* Normal Data *----- - SIO2(WT%) TIO2(WT%) ... MNO(WT%) NA2O(WT%) is_abnormal + SIO2(WT%) TIO2(WT%) ... MNO(WT%) NA2O(WT%) is_anomaly 2 50.873065 0.720622 ... 0.102185 1.920395 1 @@ -965,9 +965,9 @@ Successfully store 'X Abnormal Detection' in 'X Abnormal Detection.xlsx' in User Successfully store 'X Normal' in 'X Normal.xlsx' in Users/geopi/geopi_output/GeoPi-Rock Isolation Forest/Algorithm - Test 1/data. ------* Abnormal Data *----- +-----* Anomaly Data *----- - SIO2(WT%) TIO2(WT%) ... MNO(WT%) NA2O(WT%) is_abnormal + SIO2(WT%) TIO2(WT%) ... MNO(WT%) NA2O(WT%) is_anomaly 0 53.536000 0.291000 ... 0.083000 0.861000 -1 @@ -1035,7 +1035,7 @@ Successfully store 'X Normal' in 'X Normal.xlsx' in Users/geopi/geopi_output/Geo 106 54.200000 0.100000 ... 0.130000 1.430000 -1 ``` -Successfully store 'X Abnormal' in 'X Abnormal.xlsx' in Users/geopi/geopi_output/GeoPi-Rock Isolation Forest/Algorithm - Test 1/data. +Successfully store 'X Anomaly' in 'X Anomaly.xlsx' in Users/geopi/geopi_output/GeoPi-Rock Isolation Forest/Algorithm - Test 1/data. -----* Model Persistence *----- Successfully store 'Isolation Forest' in 'Isolation Forest.pkl' in Users/geopi/geopi_output/GeoPi-Rock Isolation Forest/Algorithm - Test 1/artifacts/model. 
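
A note on the labels shown above: they follow scikit-learn's convention of 1 for normal points and -1 for anomalies, and the splitting into normal and anomalous frames matches the `_detect_data` helper changed later in this patch. The following self-contained sketch mirrors that detect-and-split flow on synthetic data; the hyperparameter values are illustrative, not the ones the CLI would prompt for:

```python
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(0)
# Synthetic stand-ins for the selected oxide columns.
X = pd.DataFrame(rng.normal(size=(109, 3)), columns=["SIO2(WT%)", "TIO2(WT%)", "NA2O(WT%)"])

# Illustrative hyperparameters; the CLI collects these interactively.
model = IsolationForest(n_estimators=100, contamination=0.1, random_state=0)
labels = model.fit(X).predict(X)  # 1 = normal, -1 = anomaly

# Same bookkeeping as the "Anomaly Detection Data" / "Normal Data" / "Anomaly Data" outputs.
X_detected = X.copy()
X_detected["is_anomaly"] = labels
X_normal = X_detected[X_detected["is_anomaly"] == 1]
X_anomaly = X_detected[X_detected["is_anomaly"] == -1]
print(len(X_normal), "normal rows,", len(X_anomaly), "anomalous rows")
```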
diff --git a/docs/source/For User/Model Example/Data_Preprocessing/Data Preprocessing.md b/docs/source/For User/Model Example/Data_Preprocessing/Data Preprocessing.md index ea905770..6344deba 100644 --- a/docs/source/For User/Model Example/Data_Preprocessing/Data Preprocessing.md +++ b/docs/source/For User/Model Example/Data_Preprocessing/Data Preprocessing.md @@ -39,7 +39,7 @@ https://docs.qq.com/document/DQ3BDeHhxRGNzSXZN) + Data_Decomposition.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1kix82qj5--vhnm8-KhuUBH9dqYH6zcY8/edit?usp=sharing&ouid=110717816678586054594&rtpof=true&sd=true) | [[Tencent Docs]](https://docs.qq.com/document/DQ29oZ0lhUGtZUmdN?&u=6868f96d4a384b309036e04e637e367a) -+ Data_AbnormalDetectioon.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]]( ++ Data_AnomalyDetection.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]]( https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT) diff --git a/docs/source/Home/CHANGELOG.md b/docs/source/Home/CHANGELOG.md index e225674b..39b9c66e 100644 --- a/docs/source/Home/CHANGELOG.md +++ b/docs/source/Home/CHANGELOG.md @@ -40,12 +40,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), + Clustering Models + Affinity Propagation Clustering + New Mode: - + Abnormal Detection + + Anomaly Detection + Isolation Forest + Docs: + Mind map of all options in README + Citation info - + Abnormal detection algorithm example + + Anomaly detection algorithm example ### Changed diff --git a/docs/source/Home/Introduction.md b/docs/source/Home/Introduction.md index 7063c32e..469c3456 100644 --- a/docs/source/Home/Introduction.md +++ b/docs/source/Home/Introduction.md @@ -146,7 +146,7 @@ https://docs.qq.com/document/DQ3BDeHhxRGNzSXZN) + Data_Decomposition.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1kix82qj5--vhnm8-KhuUBH9dqYH6zcY8/edit?usp=sharing&ouid=110717816678586054594&rtpof=true&sd=true) | [[Tencent Docs]](https://docs.qq.com/document/DQ29oZ0lhUGtZUmdN?&u=6868f96d4a384b309036e04e637e367a) -+ Data_AbnormalDetectioon.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]]( ++ Data_AnomalyDetection.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]]( https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT) **Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**. 
diff --git a/docs/source/model example.rst b/docs/source/model example.rst index 87ae671f..165b7ee9 100644 --- a/docs/source/model example.rst +++ b/docs/source/model example.rst @@ -9,5 +9,5 @@ Model Example Regression Clustering Decomposition - Abnormal Detection + Anomaly Detection Network Analysis \ No newline at end of file diff --git a/docs/source/python_apis/geochemistrypi.data_mining.model.func.algo_abnormaldetection.rst b/docs/source/python_apis/geochemistrypi.data_mining.model.func.algo_anomalydetection.rst similarity index 51% rename from docs/source/python_apis/geochemistrypi.data_mining.model.func.algo_abnormaldetection.rst rename to docs/source/python_apis/geochemistrypi.data_mining.model.func.algo_anomalydetection.rst index 9fc56457..011e4b83 100644 --- a/docs/source/python_apis/geochemistrypi.data_mining.model.func.algo_abnormaldetection.rst +++ b/docs/source/python_apis/geochemistrypi.data_mining.model.func.algo_anomalydetection.rst @@ -1,10 +1,10 @@ -geochemistrypi.data\_mining.model.func.algo\_abnormaldetection package +geochemistrypi.data\_mining.model.func.algo\_anomalydetection package ====================================================================== Module contents --------------- -.. automodule:: geochemistrypi.data_mining.model.func.algo_abnormaldetection +.. automodule:: geochemistrypi.data_mining.model.func.algo_anomalydetection :members: :undoc-members: :show-inheritance: diff --git a/docs/source/python_apis/geochemistrypi.data_mining.model.func.rst b/docs/source/python_apis/geochemistrypi.data_mining.model.func.rst index 3183d2dd..895fbbdf 100644 --- a/docs/source/python_apis/geochemistrypi.data_mining.model.func.rst +++ b/docs/source/python_apis/geochemistrypi.data_mining.model.func.rst @@ -7,7 +7,7 @@ Subpackages .. 
toctree:: :maxdepth: 4 - geochemistrypi.data_mining.model.func.algo_abnormaldetection + geochemistrypi.data_mining.model.func.algo_anomalydetection geochemistrypi.data_mining.model.func.algo_classification geochemistrypi.data_mining.model.func.algo_clustering geochemistrypi.data_mining.model.func.algo_decomposition diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py index 3dd117b6..631d0ec6 100644 --- a/geochemistrypi/data_mining/cli_pipeline.py +++ b/geochemistrypi/data_mining/cli_pipeline.py @@ -9,7 +9,7 @@ from rich.prompt import Confirm, Prompt from .constants import ( - ABNORMALDETECTION_MODELS, + ANOMALYDETECTION_MODELS, CLASSIFICATION_MODELS, CLASSIFICATION_MODELS_WITH_MISSING_VALUES, CLUSTERING_MODELS, @@ -43,7 +43,7 @@ from .process.classify import ClassificationModelSelection from .process.cluster import ClusteringModelSelection from .process.decompose import DecompositionModelSelection -from .process.detect import AbnormalDetectionModelSelection +from .process.detect import AnomalyDetectionModelSelection from .process.regress import RegressionModelSelection from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning from .utils.mlflow_utils import retrieve_previous_experiment_id @@ -198,7 +198,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = elif built_in_training_data_num == 4: training_data_path = "Data_Decomposition.xlsx" elif built_in_training_data_num == 5: - training_data_path = "Data_AbnormalDetection.xlsx" + training_data_path = "Data_AnomalyDetection.xlsx" data = read_data(file_path=training_data_path) print(f"Successfully loading the built-in training data set '{training_data_path}'.") show_data_columns(data.columns) @@ -403,7 +403,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # If the selected data set is with missing values and is not been imputed, then only allow the user to choose regression, classification and clustering models. # Otherwise, allow the user to choose decomposition models. if missing_value_flag and not process_missing_value_flag: - # The abnormal detection mode and decomposition mode don't support missing values. + # The anomaly detection mode and decomposition mode don't support missing values. 
num2option(MODE_OPTION_WITH_MISSING_VALUES) mode_num = limit_num_input(MODE_OPTION_WITH_MISSING_VALUES, SECTION[2], num_input) else: @@ -545,13 +545,13 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = Modes2Models = {1: REGRESSION_MODELS_WITH_MISSING_VALUES, 2: CLASSIFICATION_MODELS_WITH_MISSING_VALUES, 3: CLUSTERING_MODELS_WITH_MISSING_VALUES} Modes2Initiators = {1: RegressionModelSelection, 2: ClassificationModelSelection, 3: ClusteringModelSelection} else: - Modes2Models = {1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS, 5: ABNORMALDETECTION_MODELS} + Modes2Models = {1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS, 5: ANOMALYDETECTION_MODELS} Modes2Initiators = { 1: RegressionModelSelection, 2: ClassificationModelSelection, 3: ClusteringModelSelection, 4: DecompositionModelSelection, - 5: AbnormalDetectionModelSelection, + 5: AnomalyDetectionModelSelection, } MODELS = Modes2Models[mode_num] num2option(MODELS) diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py index 1303f60e..9acece9d 100644 --- a/geochemistrypi/data_mining/constants.py +++ b/geochemistrypi/data_mining/constants.py @@ -26,8 +26,8 @@ OPTION = ["Yes", "No"] DATA_OPTION = ["Own Data", "Testing Data (Built-in)"] -TEST_DATA_OPTION = ["Data For Regression", "Data For Classification", "Data For Clustering", "Data For Dimensional Reduction", "Data For Abnormal Detection"] -MODE_OPTION = ["Regression", "Classification", "Clustering", "Dimensional Reduction", "Abnormal Detection"] +TEST_DATA_OPTION = ["Data For Regression", "Data For Classification", "Data For Clustering", "Data For Dimensional Reduction", "Data For Anomaly Detection"] +MODE_OPTION = ["Regression", "Classification", "Clustering", "Dimensional Reduction", "Anomaly Detection"] MODE_OPTION_WITH_MISSING_VALUES = ["Regression", "Classification", "Clustering"] # The model provided to use @@ -68,7 +68,7 @@ ] CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation"] DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"] -ABNORMALDETECTION_MODELS = ["Isolation Forest", "Local Outlier Factor"] +ANOMALYDETECTION_MODELS = ["Isolation Forest", "Local Outlier Factor"] # The model can deal with missing values # Reference: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values diff --git a/geochemistrypi/data_mining/data/dataset/Data_AbnormalDetection.xlsx b/geochemistrypi/data_mining/data/dataset/Data_AnomalyDetection.xlsx similarity index 100% rename from geochemistrypi/data_mining/data/dataset/Data_AbnormalDetection.xlsx rename to geochemistrypi/data_mining/data/dataset/Data_AnomalyDetection.xlsx diff --git a/geochemistrypi/data_mining/enum.py b/geochemistrypi/data_mining/enum.py index b14a5cbb..f861a8a1 100644 --- a/geochemistrypi/data_mining/enum.py +++ b/geochemistrypi/data_mining/enum.py @@ -6,7 +6,7 @@ class ModeOption(Enum): CLASSIFICATION = "Classification" CLUSTERING = "Clustering" DIMENSIONAL_REDUCTION = "Dimensional Reduction" - ABNORMAL_DETECTION = "Abnormal Detection" + ANOMALY_DETECTION = "Anomaly Detection" class ModeOptionWithMissingValues(Enum): diff --git a/geochemistrypi/data_mining/model/detection.py b/geochemistrypi/data_mining/model/detection.py index 9dff73f2..e73b818d 100644 --- a/geochemistrypi/data_mining/model/detection.py +++ b/geochemistrypi/data_mining/model/detection.py @@ -10,18 +10,17 @@ from ..utils.base import clear_output from 
._base import WorkflowBase -from .func.algo_abnormaldetection._iforest import isolation_forest_manual_hyper_parameters -from .func.algo_abnormaldetection._local_outlier_factor import local_outlier_factor_manual_hyper_parameters +from .func.algo_anomalydetection._iforest import isolation_forest_manual_hyper_parameters, local_outlier_factor_manual_hyper_parameters -class AbnormalDetectionWorkflowBase(WorkflowBase): - """The base workflow class of abnormal detection algorithms.""" +class AnomalyDetectionWorkflowBase(WorkflowBase): + """The base workflow class of anomaly detection algorithms.""" # common_function = [] def __init__(self) -> None: super().__init__() - self.mode = "Abnormal Detection" + self.mode = "Anomaly Detection" def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None: """Fit the model by Scikit-learn framework.""" @@ -29,7 +28,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None: self.model.fit(X) def predict(self, X: pd.DataFrame) -> np.ndarray: - """Perform Abnormal Detection on samples in X by Scikit-learn framework.""" + """Perform Anomaly Detection on samples in X by Scikit-learn framework.""" y_predict = self.model.predict(X) return y_predict @@ -52,29 +51,29 @@ def _detect_data(X: pd.DataFrame, detect_label: np.ndarray) -> tuple[pd.DataFram Returns ------- - X_abnormal_detection : pd.DataFrame + X_anomaly_detection : pd.DataFrame DataFrame containing the original data with detection results. X_normal : pd.DataFrame DataFrame containing the normal data points. - X_abnormal : pd.DataFrame - DataFrame containing the abnormal data points. + X_anomaly : pd.DataFrame + DataFrame containing the anomaly data points. """ - X_abnormal_detection = X.copy() + X_anomaly_detection = X.copy() # Merge detection results into the source data - X_abnormal_detection["is_abnormal"] = detect_label - X_normal = X_abnormal_detection[X_abnormal_detection["is_abnormal"] == 1] - X_abnormal = X_abnormal_detection[X_abnormal_detection["is_abnormal"] == -1] + X_anomaly_detection["is_anomaly"] = detect_label + X_normal = X_anomaly_detection[X_anomaly_detection["is_anomaly"] == 1] + X_anomaly = X_anomaly_detection[X_anomaly_detection["is_anomaly"] == -1] - return X_abnormal_detection, X_normal, X_abnormal + return X_anomaly_detection, X_normal, X_anomaly def common_components(self) -> None: - """Invoke all common application functions for abnormal detection algorithms by Scikit-learn framework.""" + """Invoke all common application functions for anomaly detection algorithms by Scikit-learn framework.""" pass -class IsolationForestAbnormalDetection(AbnormalDetectionWorkflowBase): +class IsolationForestAnomalyDetection(AnomalyDetectionWorkflowBase): """The automation workflow of using Isolation Forest algorithm to make insightful products.""" name = "Isolation Forest" @@ -212,7 +211,7 @@ def __init__( warm_start=self.warm_start, ) - self.naming = IsolationForestAbnormalDetection.name + self.naming = IsolationForestAnomalyDetection.name @classmethod def manual_hyper_parameters(cls) -> Dict: @@ -227,7 +226,7 @@ def special_components(self, **kwargs) -> None: pass -class LocalOutlierFactorAbnormalDetection(AbnormalDetectionWorkflowBase): +class LocalOutlierFactorAnomalyDetection(AnomalyDetectionWorkflowBase): """The automation workflow of using Local Outlier Factor algorithm to make insightful products.""" name = "Local Outlier Factor" @@ -371,7 +370,7 @@ def __init__( n_jobs=self.n_jobs, ) - self.naming = LocalOutlierFactorAbnormalDetection.name + self.naming = 
LocalOutlierFactorAnomalyDetection.name @classmethod def manual_hyper_parameters(cls) -> Dict: diff --git a/geochemistrypi/data_mining/model/func/algo_abnormaldetection/__init__.py b/geochemistrypi/data_mining/model/func/algo_anomalydetection/__init__.py similarity index 100% rename from geochemistrypi/data_mining/model/func/algo_abnormaldetection/__init__.py rename to geochemistrypi/data_mining/model/func/algo_anomalydetection/__init__.py diff --git a/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_iforest.py b/geochemistrypi/data_mining/model/func/algo_anomalydetection/_iforest.py similarity index 100% rename from geochemistrypi/data_mining/model/func/algo_abnormaldetection/_iforest.py rename to geochemistrypi/data_mining/model/func/algo_anomalydetection/_iforest.py diff --git a/geochemistrypi/data_mining/model/func/algo_abnormaldetection/_local_outlier_factor.py b/geochemistrypi/data_mining/model/func/algo_anomalydetection/_local_outlier_factor.py similarity index 100% rename from geochemistrypi/data_mining/model/func/algo_abnormaldetection/_local_outlier_factor.py rename to geochemistrypi/data_mining/model/func/algo_anomalydetection/_local_outlier_factor.py diff --git a/geochemistrypi/data_mining/process/detect.py b/geochemistrypi/data_mining/process/detect.py index 0c3e45dd..c0424f40 100644 --- a/geochemistrypi/data_mining/process/detect.py +++ b/geochemistrypi/data_mining/process/detect.py @@ -4,16 +4,16 @@ import pandas as pd from ..constants import MLFLOW_ARTIFACT_DATA_PATH -from ..model.detection import AbnormalDetectionWorkflowBase, IsolationForestAbnormalDetection, LocalOutlierFactorAbnormalDetection +from ..model.detection import AnomalyDetectionWorkflowBase, IsolationForestAnomalyDetection, LocalOutlierFactorAnomalyDetection from ._base import ModelSelectionBase -class AbnormalDetectionModelSelection(ModelSelectionBase): - """Simulate the normal way of invoking scikit-learn abnormal detection algorithms.""" +class AnomalyDetectionModelSelection(ModelSelectionBase): + """Simulate the normal way of invoking scikit-learn anomaly detection algorithms.""" def __init__(self, model_name: str) -> None: self.model_name = model_name - self.ad_workflow = AbnormalDetectionWorkflowBase() + self.ad_workflow = AnomalyDetectionWorkflowBase() self.transformer_config = {} def activate( @@ -31,8 +31,8 @@ def activate( # Model option if self.model_name == "Isolation Forest": - hyper_parameters = IsolationForestAbnormalDetection.manual_hyper_parameters() - self.ad_workflow = IsolationForestAbnormalDetection( + hyper_parameters = IsolationForestAnomalyDetection.manual_hyper_parameters() + self.ad_workflow = IsolationForestAnomalyDetection( n_estimators=hyper_parameters["n_estimators"], contamination=hyper_parameters["contamination"], max_features=hyper_parameters["max_features"], @@ -41,8 +41,8 @@ def activate( ) if self.model_name == "Local Outlier Factor": - hyper_parameters = LocalOutlierFactorAbnormalDetection.manual_hyper_parameters() - self.ad_workflow = LocalOutlierFactorAbnormalDetection( + hyper_parameters = LocalOutlierFactorAnomalyDetection.manual_hyper_parameters() + self.ad_workflow = LocalOutlierFactorAnomalyDetection( n_neighbors=hyper_parameters["n_neighbors"], contamination=hyper_parameters["contamination"], leaf_size=hyper_parameters["leaf_size"], @@ -55,23 +55,23 @@ def activate( # Use Scikit-learn style API to process input data self.ad_workflow.fit(X) y_predict = self.ad_workflow.predict(X) - X_abnormal_detection, X_normal, X_abnormal = 
self.ad_workflow._detect_data(X, y_predict) + X_anomaly_detection, X_normal, X_anomaly = self.ad_workflow._detect_data(X, y_predict) self.ad_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) # Save the model hyper-parameters self.ad_workflow.save_hyper_parameters(hyper_parameters, self.model_name, os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH")) - # Common components for every abnormal detection algorithm + # Common components for every anomaly detection algorithm self.ad_workflow.common_components() # special components of different algorithms self.ad_workflow.special_components() - # Save abnormal detection result - self.ad_workflow.data_save(X_abnormal_detection, "X Abnormal Detection", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Abnormal Detection Data") + # Save anomaly detection result + self.ad_workflow.data_save(X_anomaly_detection, "X Anomaly Detection", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Anomaly Detection Data") self.ad_workflow.data_save(X_normal, "X Normal", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Normal Data") - self.ad_workflow.data_save(X_abnormal, "X Abnormal", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Abnormal Data") + self.ad_workflow.data_save(X_anomaly, "X Anomaly", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Anomaly Data") # Save the trained model self.ad_workflow.model_save() From 4957c6bf475a95f60ce3f42caae23b473fde9bbd Mon Sep 17 00:00:00 2001 From: sanyhe Date: Sat, 20 Jul 2024 21:47:42 +0800 Subject: [PATCH 11/12] fix: adjust the clean output command in missing value process. --- geochemistrypi/data_mining/cli_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py index 3dd117b6..3997f6b0 100644 --- a/geochemistrypi/data_mining/cli_pipeline.py +++ b/geochemistrypi/data_mining/cli_pipeline.py @@ -330,11 +330,11 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = missing_value_flag = check_missing_value(data_selected_dropped) if missing_value_flag: process_missing_value_flag = False + clear_output() elif missing_value_strategy_num == 2: # Don't drop the rows with missing values but use imputation techniques to deal with the missing values later. # No need to save the data set here because it will be saved after imputation. imputed_flag = True - clear_output() else: # Don't deal with the missing values, which means neither drop the rows with missing values nor use imputation techniques. imputed_flag = False From 7d1dbbb54a099b8f49711c06abed0e5a8b092bd4 Mon Sep 17 00:00:00 2001 From: sanyhe Date: Sat, 20 Jul 2024 22:00:49 +0800 Subject: [PATCH 12/12] fix: fix the path of manual hyperparameter of lof algorithm. 
--- geochemistrypi/data_mining/model/detection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/geochemistrypi/data_mining/model/detection.py b/geochemistrypi/data_mining/model/detection.py index e73b818d..e380b048 100644 --- a/geochemistrypi/data_mining/model/detection.py +++ b/geochemistrypi/data_mining/model/detection.py @@ -10,7 +10,8 @@ from ..utils.base import clear_output from ._base import WorkflowBase -from .func.algo_anomalydetection._iforest import isolation_forest_manual_hyper_parameters, local_outlier_factor_manual_hyper_parameters +from .func.algo_anomalydetection._iforest import isolation_forest_manual_hyper_parameters +from .func.algo_anomalydetection._local_outlier_factor import local_outlier_factor_manual_hyper_parameters class AnomalyDetectionWorkflowBase(WorkflowBase):
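
With the import paths untangled, both detectors share the same fit-and-label workflow. As a closing reference, here is a self-contained sketch of the Local Outlier Factor counterpart on synthetic data; note that plain LOF exposes its labels through `fit_predict`, and the hyperparameters below are illustrative, not values from the repository:

```python
import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 2)), columns=["feature_1", "feature_2"])

# Illustrative hyperparameters mirroring the ones the workflow class accepts.
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05, leaf_size=30, n_jobs=-1)
labels = lof.fit_predict(X)  # 1 = normal, -1 = anomaly, same convention as Isolation Forest

print((labels == -1).sum(), "points flagged as anomalies out of", len(X))
```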