diff --git a/.gitignore b/.gitignore index c36569b..23c99de 100644 --- a/.gitignore +++ b/.gitignore @@ -40,7 +40,6 @@ output.txt # Web Dependency node_modules -package-lock.json # yarn v2 yarn.lock diff --git a/README.md b/README.md index b5fdff8..b934b3d 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ pip install geochemistrypi Download the latest version to avoid some old version issues, such as dependency downloading. ``` -pip install "geochemistrypi==0.6.1" +pip install "geochemistrypi==0.7.0" ``` One instruction to download on **Jupyter Notebook** or **Google Colab**. @@ -85,7 +85,7 @@ One instruction to download on **Jupyter Notebook** or **Google Colab**. ``` Download the latest version to avoid some old version issues, such as dependency downloading. ``` -!pip install "geochemistrypi==0.6.1" +!pip install "geochemistrypi==0.7.0" ``` Check the downloaded version of our software: @@ -95,6 +95,14 @@ geochemistrypi --version **Note**: For more detail on installation, please refer to our online documentation in **Installation Manual** under the section of **FOR USER**. Over there, we highly recommend to use virtual environment (Conda) to avoid dependency version problems. + +The following screenshot shows the downloads and launching of our software on macOS: + +

+ Downloads and Launching on macOS +

+ + ## Quick Update One instruction to update the software to the latest version on **command line**, such as Terminal on macOS, Power Shell on Windows. @@ -156,9 +164,15 @@ https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT) ## Running Example -**How to run:** After successfully downloading, run this instruction on **command line / Jupyter Notebook / Google Colab** whatever directory it is. +**How to run:** After successfully downloading, run the instructions as the following examples shown on **command line / Jupyter Notebook / Google Colab**. + +Once the software starts, there are two folders `geopi_output` and `geopi_tracking` generated automatically for result storage. -### Case 1: Run with built-in data set for testing +`geopi_tracking`: It is used by MLflow as the storage for visualized operations in the web interface, which users cannot modify directly. + +`geopi_output`: It is a regular folder aligning with MLflow's storage structure, which users can operate. + +### Case 1: Run with built-in data set for model training and model inference On command line: @@ -172,9 +186,34 @@ On Jupyter Notebook / Google Colab: !geochemistrypi data-mining ``` -**Note**: There are four built-in data sets corresponding to four kinds of model pattern. +**Note**: + ++ There are five built-in data sets corresponding to five kinds of model pattern. + ++ The generated output directory `geopi_output` and `geopi_tracking` will be on desktop by default. + + +### Case 2: Run with your own data set on desktop for model training and model inference + +On command line: + +``` +geochemistrypi data-mining --desktop +``` + +On Jupyter Notebook / Google Colab: + +``` +!geochemistrypi data-mining --desktop +``` + +**Note**: -### Case 2: Run with your own data set without model inference ++ You need to create a directory `geopi_input` on desktop and put the date sets in it. + ++ The generated output directory `geopi_output` and `geopi_tracking` will be on desktop by default. 
+ +### Case 3: Run with your own data set without model inference On command line: @@ -188,9 +227,13 @@ On Jupyter Notebook / Google Colab: !geochemistrypi data-mining --data your_own_data_set.xlsx ``` -**Note**: Currently, `.xlsx` and `.csv` files are supported. Please specify the path your data file exists. For Google Colab, don't forget to upload your dataset first. +**Note**: + ++ Currently, `.xlsx` and `.csv` files are supported. Please specify the path your data file exists. For Google Colab, don't forget to upload your dataset first. -### Case 3: Implement model inference on application data ++ The generated output directory `geopi_output` and `geopi_tracking` will be on the directory where you run this command. + +### Case 4: Implement model inference on application data On command line: @@ -204,11 +247,15 @@ On Jupyter Notebook / Google Colab: !geochemistrypi data-mining --training your_own_training_data.xlsx --application your_own_application_data.xlsx ``` -**Note**: Please make sure the column names (data schema) in both training data file and application data file are the same. Because the operations you perform via our software on the training data will be record automatically and subsequently applied to the application data in the same order. +**Note**: + ++ Please make sure the column names (data schema) in both training data file and application data file are the same. Because the operations you perform via our software on the training data will be record automatically and subsequently applied to the application data in the same order. + ++ The training data in our pipeline will be divided into the train set and test set used for training the ML model and evaluating the model's performance. The score includes two types. The first type is the scores from the prediction on the test set while the second type is cv scores from the cross validation on the train set. 
-The training data in our pipeline will be divided into the train set and test set used for training the ML model and evaluating the model's performance. The score includes two types. The first type is the scores from the prediction on the test set while the second type is cv scores from the cross validation on the train set. ++ The generated output directory 'geopi_output' and 'geopi_tracking' will be on the directory where you run this command. -### Case 4: Activate MLflow web interface +### Case 5: Activate MLflow web interface On command line: @@ -222,9 +269,11 @@ On Jupyter Notebook / Google Colab: !geochemistrypi data-mining --mlflow ``` -**Note**: Once you run our software, there are two folders (`geopi_output` and `geopi_tracking`) generated automatically. Make sure the directory where you execute using the above command should have the genereted file `geopi_tracking`. +**Note**: -Copy the URL shown on the console into any browser to open the MLflow web interface. The URL is normally like this http://127.0.0.1:5000. Search MLflow online to see more operations and usages. ++ Once the command is executed, our software will search `geopi_tracking` directory from the current working directory. If it doesn't exist, then our software will search it on desktop. + ++ Copy the URL shown on the console into any browser to open the MLflow web interface. The URL is normally like this http://127.0.0.1:5000. Search MLflow online to see more operations and usages. For more details: Please refer to: @@ -232,11 +281,6 @@ For more details: Please refer to: - MLflow UI user guide - Geochemistry π v0.5.0 [[Bilibili]](https://b23.tv/CW5Rjmo) | [[YouTube]](https://www.youtube.com/watch?v=Yu1nzNeLfRY) -The following screenshot shows the downloads and launching of our software on macOS: - -

- Downloads and Launching on macOS -

## Roadmap @@ -315,21 +359,20 @@ The whole package is under construction and the documentation is progressively e + Jianhao Sun (Jin, Nanjing University, China) + Mengying Ye (Mary, Jilin University, China) + Chengtu Li(Trenki, Henan Polytechnic University, Beijing, China) -+ Yucheng Yan (Andy, University of Sydney, Australia) -+ Ruitao Chang (China University of Geosciences Beijing, China) + Panyan Weng (The University of Sydney, Australia) ++ Haibin Lai (Michael, Southern University of Science and Technology, China) ++ Siqi Yao (Clara, Dongguan University of Technology, China) **Product Group**: -+ Siqi Yao (Clara, Dongguan University of Technology, China) + Zhelan Lin(Lan, Fuzhou University, China) + ShuYi Li (Communication University Of China, Beijing, China) + Junbo Wang (China University of Geosciences, Beijing, China) + Haibin Wang(Watson, University of Sydney, Australia) + Guoqiang Qiu(Elsen, Fuzhou University, China) + Yating Dong (Yetta,Dongguan University of Technology,China) -+ Haibin Lai (Michael, Southern University of Science and Technology, China) + Bailun Jiang (EPSI / Lille University, France) ++ Chufan Zhou (Yoko, Institute of Geochemistry, Chinese Academy of Sciences; University of Chinese Academy of Sciences, China) ## Join Us :) @@ -398,6 +441,8 @@ More Videos will be recorded soon. 
+ Wenyu Zhao (Molly, Zhejiang University, China) + Qiuhao Zhao (Brad, Zhejiang University, China) + Kaixin Zheng (Hayne, Sun Yat-sen University, China) ++ Ruitao Chang (China University of Geosciences Beijing, China) ++ Yucheng Yan (Andy, University of Sydney, Australia) + Anzhou Li (Andrian, Zhejiang University, China) + Keran Li (Kirk, Chengdu University of Technology, China) + Dan Hu (Notre Dame University, United States) diff --git a/docs/source/Home/Introduction.md b/docs/source/Home/Introduction.md index 0ae3260..94c6a97 100644 --- a/docs/source/Home/Introduction.md +++ b/docs/source/Home/Introduction.md @@ -1,9 +1,14 @@ -![Geochemistry π.png](https://github.com/ZJUEarthData/geochemistrypi/assets/66779478/8b8c9a61-68bb-40ca-8545-c96e6802bda5) +# Introduction + +

+ +

- + +

--- @@ -72,7 +77,7 @@ pip install geochemistrypi Download the latest version to avoid some old version issues, such as dependency downloading. ``` -pip install "geochemistrypi==0.6.1" +pip install "geochemistrypi==0.7.0" ``` One instruction to download on **Jupyter Notebook** or **Google Colab**. @@ -82,7 +87,7 @@ One instruction to download on **Jupyter Notebook** or **Google Colab**. ``` Download the latest version to avoid some old version issues, such as dependency downloading. ``` -!pip install "geochemistrypi==0.6.1" +!pip install "geochemistrypi==0.7.0" ``` Check the downloaded version of our software: @@ -92,6 +97,14 @@ geochemistrypi --version **Note**: For more detail on installation, please refer to our online documentation in **Installation Manual** under the section of **FOR USER**. Over there, we highly recommend to use virtual environment (Conda) to avoid dependency version problems. + +The following screenshot shows the downloads and launching of our software on macOS: + +

+ Downloads and Launching on macOS +

+ + ## Quick Update One instruction to update the software to the latest version on **command line**, such as Terminal on macOS, Power Shell on Windows. @@ -153,9 +166,15 @@ https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT) ## Running Example -**How to run:** After successfully downloading, run this instruction on **command line / Jupyter Notebook / Google Colab** whatever directory it is. +**How to run:** After successfully downloading, run the instructions as the following examples shown on **command line / Jupyter Notebook / Google Colab**. + +Once the software starts, there are two folders `geopi_output` and `geopi_tracking` generated automatically for result storage. -### Case 1: Run with built-in data set for testing +`geopi_tracking`: It is used by MLflow as the storage for visualized operations in the web interface, which users cannot modify directly. + +`geopi_output`: It is a regular folder aligning with MLflow's storage structure, which users can operate. + +### Case 1: Run with built-in data set for model training and model inference On command line: @@ -169,9 +188,34 @@ On Jupyter Notebook / Google Colab: !geochemistrypi data-mining ``` -**Note**: There are four built-in data sets corresponding to four kinds of model pattern. +**Note**: + ++ There are five built-in data sets corresponding to five kinds of model pattern. + ++ The generated output directory `geopi_output` and `geopi_tracking` will be on desktop by default. + + +### Case 2: Run with your own data set on desktop for model training and model inference + +On command line: + +``` +geochemistrypi data-mining --desktop +``` + +On Jupyter Notebook / Google Colab: + +``` +!geochemistrypi data-mining --desktop +``` + +**Note**: -### Case 2: Run with your own data set without model inference ++ You need to create a directory `geopi_input` on desktop and put the date sets in it. + ++ The generated output directory `geopi_output` and `geopi_tracking` will be on desktop by default. 
+ +### Case 3: Run with your own data set without model inference On command line: @@ -185,9 +229,13 @@ On Jupyter Notebook / Google Colab: !geochemistrypi data-mining --data your_own_data_set.xlsx ``` -**Note**: Currently, `.xlsx` and `.csv` files are supported. Please specify the path your data file exists. For Google Colab, don't forget to upload your dataset first. +**Note**: + ++ Currently, `.xlsx` and `.csv` files are supported. Please specify the path your data file exists. For Google Colab, don't forget to upload your dataset first. + ++ The generated output directory `geopi_output` and `geopi_tracking` will be on the directory where you run this command. -### Case 3: Implement model inference on application data +### Case 4: Implement model inference on application data On command line: @@ -201,11 +249,15 @@ On Jupyter Notebook / Google Colab: !geochemistrypi data-mining --training your_own_training_data.xlsx --application your_own_application_data.xlsx ``` -**Note**: Please make sure the column names (data schema) in both training data file and application data file are the same. Because the operations you perform via our software on the training data will be record automatically and subsequently applied to the application data in the same order. +**Note**: -The training data in our pipeline will be divided into the train set and test set used for training the ML model and evaluating the model's performance. The score includes two types. The first type is the scores from the prediction on the test set while the second type is cv scores from the cross validation on the train set. ++ Please make sure the column names (data schema) in both training data file and application data file are the same. Because the operations you perform via our software on the training data will be record automatically and subsequently applied to the application data in the same order. 
-### Case 4: Activate MLflow web interface ++ The training data in our pipeline will be divided into the train set and test set used for training the ML model and evaluating the model's performance. The score includes two types. The first type is the scores from the prediction on the test set while the second type is cv scores from the cross validation on the train set. + ++ The generated output directory 'geopi_output' and 'geopi_tracking' will be on the directory where you run this command. + +### Case 5: Activate MLflow web interface On command line: @@ -219,9 +271,11 @@ On Jupyter Notebook / Google Colab: !geochemistrypi data-mining --mlflow ``` -**Note**: Once you run our software, there are two folders (`geopi_output` and `geopi_tracking`) generated automatically. Make sure the directory where you execute using the above command should have the genereted file `geopi_tracking`. +**Note**: + ++ Once the command is executed, our software will search `geopi_tracking` directory from the current working directory. If it doesn't exist, then our software will search it on desktop. -Copy the URL shown on the console into any browser to open the MLflow web interface. The URL is normally like this http://127.0.0.1:5000. Search MLflow online to see more operations and usages. ++ Copy the URL shown on the console into any browser to open the MLflow web interface. The URL is normally like this http://127.0.0.1:5000. Search MLflow online to see more operations and usages. For more details: Please refer to: @@ -229,11 +283,6 @@ For more details: Please refer to: - MLflow UI user guide - Geochemistry π v0.5.0 [[Bilibili]](https://b23.tv/CW5Rjmo) | [[YouTube]](https://www.youtube.com/watch?v=Yu1nzNeLfRY) -The following screenshot shows the downloads and launching of our software on macOS: - -

- Downloads and Launching on macOS -

## Roadmap @@ -312,21 +361,20 @@ The whole package is under construction and the documentation is progressively e + Jianhao Sun (Jin, Nanjing University, China) + Mengying Ye (Mary, Jilin University, China) + Chengtu Li(Trenki, Henan Polytechnic University, Beijing, China) -+ Yucheng Yan (Andy, University of Sydney, Australia) -+ Ruitao Chang (China University of Geosciences Beijing, China) + Panyan Weng (The University of Sydney, Australia) ++ Haibin Lai (Michael, Southern University of Science and Technology, China) ++ Siqi Yao (Clara, Dongguan University of Technology, China) **Product Group**: -+ Siqi Yao (Clara, Dongguan University of Technology, China) + Zhelan Lin(Lan, Fuzhou University, China) + ShuYi Li (Communication University Of China, Beijing, China) + Junbo Wang (China University of Geosciences, Beijing, China) + Haibin Wang(Watson, University of Sydney, Australia) + Guoqiang Qiu(Elsen, Fuzhou University, China) + Yating Dong (Yetta,Dongguan University of Technology,China) -+ Haibin Lai (Michael, Southern University of Science and Technology, China) + Bailun Jiang (EPSI / Lille University, France) ++ Chufan Zhou (Yoko, Institute of Geochemistry, Chinese Academy of Sciences; University of Chinese Academy of Sciences, China) ## Join Us :) @@ -395,6 +443,8 @@ More Videos will be recorded soon. + Wenyu Zhao (Molly, Zhejiang University, China) + Qiuhao Zhao (Brad, Zhejiang University, China) + Kaixin Zheng (Hayne, Sun Yat-sen University, China) ++ Ruitao Chang (China University of Geosciences Beijing, China) ++ Yucheng Yan (Andy, University of Sydney, Australia) + Anzhou Li (Andrian, Zhejiang University, China) + Keran Li (Kirk, Chengdu University of Technology, China) + Dan Hu (Notre Dame University, United States) @@ -407,12 +457,3 @@ More Videos will be recorded soon. 
+ Zhenglin Xu (Garry, Jilin University, China) + Jianing Wang (National University of Singapore, Singapore) + Junchi Liao(Roceda, University of Electronic Science and Technology of China, China) -ame University, United States) -+ Xunxin Liu (Tante, China University of Geosciences, Wuhan, China) -+ Fang Li (liv, Shenzhen University, China) -+ Xin Li (The University of Manchester, United Kingdom) -+ Ting Liu (Kira, Sun Yat-sen University, China) -+ Xirui Zhu (Rae, University of York, United Kingdom) -+ Aixiwake·Janganuer (Ayshuak, Sun Yat-sen University, China) -+ Zhenglin Xu (Garry, Jilin University, China) -+ Jianing Wang (National University of Singapore, Singapore) diff --git a/geochemistrypi/cli.py b/geochemistrypi/cli.py index d385011..db48fdc 100644 --- a/geochemistrypi/cli.py +++ b/geochemistrypi/cli.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os -import platform + +# import platform import subprocess import threading from typing import Optional @@ -31,7 +32,7 @@ def _version_callback(value: bool) -> None: def main(version: Optional[bool] = typer.Option(None, "--version", "-v", help="Show version.", callback=_version_callback, is_eager=True)) -> None: """ Geochemistry π is an open-sourced highly automated machine learning Python framework for data-driven geochemistry discovery. - It has the cores components of continous training, machine learning lifecycle management and model serving. + It has the cores components of continous training, machine learning lifecycle management and model inference. 
""" return @@ -39,11 +40,11 @@ def main(version: Optional[bool] = typer.Option(None, "--version", "-v", help="S @app.command() def data_mining( data: str = typer.Option("", help="The path of the training data without model inference."), - desktop: bool = typer.Option(False, help="Use the data in the directory 'geopi_input' on the desktop for training and inference."), + desktop: bool = typer.Option(False, help="Use the data in the directory 'geopi_input' on the desktop for model training and model inference."), training: str = typer.Option("", help="The path of the training data."), application: str = typer.Option("", help="The path of the inference data."), mlflow: bool = typer.Option(False, help="Start the mlflow server."), - web: bool = False, + # web: bool = False, ) -> None: """Implement the customized automated machine learning pipeline for geochemistry data mining.""" @@ -62,21 +63,28 @@ def start_mlflow(): # Check if the current working directory has the 'geopi_tracking' directory to store the tracking data for mlflow # If yes, set the MLFLOW_STORE_PATH to the current working directory # If no, set the MLFLOW_STORE_PATH to the desktop - geopi_tracking_dir = os.path.join(os.getcwd(), "geopi_tracking") + cur_working_dir = os.getcwd() + geopi_tracking_dir = os.path.join(cur_working_dir, "geopi_tracking") if not os.path.exists(geopi_tracking_dir): - print("[bold red]The 'geopi_tracking' directory is not found in the current working directory.[bold red]") + print(f"[bold red]The 'geopi_tracking' directory is not found in the current working directory '{cur_working_dir}'.[bold red]") geopi_tracking_dir = os.path.join(os.path.expanduser("~"), "Desktop", "geopi_tracking") if not os.path.exists(geopi_tracking_dir): print("[bold red]The 'geopi_tracking' directory is not found on the desktop.[bold red]") - print("[bold red]Our software will create a 'geopi_tracking' directory on the desktop to store the tracking data for mlflow.[bold red]") + print("[bold 
green]Creating the 'geopi_tracking' directory ...[/bold green]") + print("[bold green]Successfully create 'geopi_tracking' directory on the desktop to store the tracking data for mlflow.[/bold green]") else: print("[bold green]The 'geopi_tracking' directory is found on the desktop.[bold green]") print("[bold green]Our software will use the 'geopi_tracking' directory on the desktop to store the tracking data for mlflow.[bold green]") + else: + print(f"[bold green]The 'geopi_tracking' directory is found in the current working directory '{cur_working_dir}'.[bold green]") + print("[bold green]Our software will use the 'geopi_tracking' directory in the current working directory to store the tracking data for mlflow.[bold green]") MLFLOW_STORE_PATH = os.path.join("file:", geopi_tracking_dir) print("[bold green]Press [bold magenta]Ctrl + C[/bold magenta] to close mlflow server at any time.[bold green]") start_mlflow_command = f"mlflow ui --backend-store-uri {MLFLOW_STORE_PATH} " subprocess.run(start_mlflow_command, shell=True) + # TODO: Currently, the web application is not fully implemented. It is disabled by default. 
+ web = False if web: # Start the backend and frontend in parallel backend_thread = threading.Thread(target=start_backend) @@ -87,63 +95,68 @@ def start_mlflow(): backend_thread.join() frontend_thread.join() else: - # If mlflow is enabled, start the mlflow server, otherwise start the CLI pipeline if mlflow: - # Start mlflow server to track the experiment + # If mlflow is enabled, start the mlflow server, otherwise start the CLI pipeline mlflow_thread = threading.Thread(target=start_mlflow) mlflow_thread.start() elif desktop: # Start the CLI pipeline with the data in the directory 'geopi_input' on the desktop + # - Both continuous training and model inference + # - Continuous training only cli_pipeline(training_data_path="", application_data_path="", data_source=DataSource.DESKTOP) else: - # If the data is provided, start the CLI pipeline with continuous training if data: + # If the data is provided, start the CLI pipeline with continuous training cli_pipeline(training_data_path=data, application_data_path="", data_source=DataSource.ANY_PATH) - # If the training data and inference data are provided, start the CLI pipeline with continuous training and inference elif training and application: + # If the training data and inference data are provided, start the CLI pipeline with continuous training and inference cli_pipeline(training_data_path=training, application_data_path=application, data_source=DataSource.ANY_PATH) - # If no data is provided, look for the data in the desktop to start the CLI pipeline with continuous training and inference + elif training and not application: + # If the training data is provided, start the CLI pipeline with continuous training + cli_pipeline(training_data_path=training, application_data_path="", data_source=DataSource.ANY_PATH) else: + # If no data is provided, use built-in data to start the CLI pipeline with continuous training and inference cli_pipeline(training_data_path="", application_data_path="", 
data_source=DataSource.BUILT_IN) -@app.command() -def web_setup() -> None: - """Set up the dependency of the web application.""" - my_os = platform.system() - if my_os == "Windows": - # Define the command to download and install Yarn on Windows using Chocolatey package manager - download_yarn = "choco install yarn" - subprocess.run(download_yarn, shell=True) - # Define the command to download and install Node.js on Windows using Chocolatey package manager - download_node = "choco install nodejs" - subprocess.run(download_node, shell=True) - elif my_os == "Linux": - # Define the command to download and install Yarn on Linux using npm - download_yarn = "apt-get install -y yarn" - subprocess.run(download_yarn, shell=True) - # Define the command to download and install Node.js on Linux using npm - download_node = "apt-get install -y nodejs" - subprocess.run(download_node, shell=True) - elif my_os == "Darwin": - try: - check_node = "node --version" - subprocess.run(check_node, shell=True) - print("Node.js is already installed.") - except subprocess.CalledProcessError: - # Define the command to download and install Node.js on macOS using Homebrew - download_node = "brew install node" - subprocess.run(download_node, shell=True) - try: - # Define the command to check if Yarn is installed - check_yarn = "yarn --version" - subprocess.run(check_yarn, shell=True) - print("Yarn is already installed.") - except subprocess.CalledProcessError: - # Define the command to download and install Yarn on macOS using Homebrew - download_yarn = "brew install yarn" - subprocess.run(download_yarn, shell=True) - - # Define the command to install the frontend dependencies - install_frontend_dependency_cmd = f"cd {FRONTEND_PATH} && yarn install" - subprocess.run(install_frontend_dependency_cmd, shell=True) +# TODO: Currently, the web application is not fully implemented. It is disabled by default. 
+# @app.command() +# def web_setup() -> None: +# """Set up the dependency of the web application.""" +# my_os = platform.system() +# if my_os == "Windows": +# # Define the command to download and install Yarn on Windows using Chocolatey package manager +# download_yarn = "choco install yarn" +# subprocess.run(download_yarn, shell=True) +# # Define the command to download and install Node.js on Windows using Chocolatey package manager +# download_node = "choco install nodejs" +# subprocess.run(download_node, shell=True) +# elif my_os == "Linux": +# # Define the command to download and install Yarn on Linux using npm +# download_yarn = "apt-get install -y yarn" +# subprocess.run(download_yarn, shell=True) +# # Define the command to download and install Node.js on Linux using npm +# download_node = "apt-get install -y nodejs" +# subprocess.run(download_node, shell=True) +# elif my_os == "Darwin": +# try: +# check_node = "node --version" +# subprocess.run(check_node, shell=True) +# print("Node.js is already installed.") +# except subprocess.CalledProcessError: +# # Define the command to download and install Node.js on macOS using Homebrew +# download_node = "brew install node" +# subprocess.run(download_node, shell=True) +# try: +# # Define the command to check if Yarn is installed +# check_yarn = "yarn --version" +# subprocess.run(check_yarn, shell=True) +# print("Yarn is already installed.") +# except subprocess.CalledProcessError: +# # Define the command to download and install Yarn on macOS using Homebrew +# download_yarn = "brew install yarn" +# subprocess.run(download_yarn, shell=True) + +# # Define the command to install the frontend dependencies +# install_frontend_dependency_cmd = f"cd {FRONTEND_PATH} && yarn install" +# subprocess.run(install_frontend_dependency_cmd, shell=True) diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py index 59842af..08e6e1e 100644 --- a/geochemistrypi/data_mining/cli_pipeline.py +++ 
b/geochemistrypi/data_mining/cli_pipeline.py @@ -11,6 +11,7 @@ from .constants import ( ANOMALYDETECTION_MODELS, + BUILT_IN_DATASET_PATH, CLASSIFICATION_MODELS, CLASSIFICATION_MODELS_WITH_MISSING_VALUES, CLUSTERING_MODELS, @@ -58,7 +59,19 @@ from .process.decompose import DecompositionModelSelection from .process.detect import AnomalyDetectionModelSelection from .process.regress import RegressionModelSelection -from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, list_excel_files, log, save_data, show_warning +from .utils.base import ( + check_package, + clear_output, + copy_files, + copy_files_from_source_dir_to_dest_dir, + create_geopi_output_dir, + get_os, + install_package, + list_excel_files, + log, + save_data, + show_warning, +) from .utils.mlflow_utils import retrieve_previous_experiment_id @@ -96,33 +109,44 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # If the user uses the desktop data, the working path is the desktop, the output path is the desktop. # If the user uses the any path, the working path is the current working directory, the output path is the current working directory. if data_source == DataSource.BUILT_IN: - # If the user uses the built-in data, the working path is the desktop. + # If the user uses the built-in data, the working path is the desktop, the output path is the desktop. WORKING_PATH = os.path.join(os.path.expanduser("~"), "Desktop") elif data_source == DataSource.DESKTOP: + # If the user uses the desktop data, the working path is the desktop, the output path is the desktop. 
WORKING_PATH = os.path.join(os.path.expanduser("~"), "Desktop") INPUT_PATH = os.path.join(WORKING_PATH, "geopi_input") + + with console.status("[bold green] Data Direcotry Checking ...[/bold green]", spinner="dots"): + sleep(1) + + def _data_requirement_print(): + print("[bold green]Please restart the software after putting the data in the 'geopi_input' directory.[/bold green]") + print("[bold green]Currently, the data file format only supports '.xlsx', '.xls', '.csv'.[/bold green]") + print("[bold green]If you want to activate the model inference, please put the 'application data' in it as well.[/bold green]") + print("[bold green]Check our online documentation for more information on the format of the 'application data'.[/bold green]") + if not os.path.exists(INPUT_PATH): print("[bold red]The 'geopi_input' directory is not found on the desktop.[/bold red]") os.makedirs(INPUT_PATH, exist_ok=True) print("[bold green]Creating the 'geopi_input' directory ...[/bold green]") print("[bold green]Successfully create 'geopi_input' directory on the desktop.[/bold green]") - print("Please restart the software after putting the data in the 'geopi_input' directory.") - print("Currently, the data file format only supports '.xlsx', '.xls', '.csv'.") - print("If you want to activate the model inference, please put the 'application data' in it as well.") - print("Check our online documentation for more information on the format of the 'application data'.") - clear_output("(Press Enter key to exit)") - exit(1) + # Copy the built-in datasets to the 'geopi_input' directory on the desktop. 
+ copy_files_from_source_dir_to_dest_dir(BUILT_IN_DATASET_PATH, INPUT_PATH) + print("[bold green]Successfully copy the built-in datasets to the 'geopi_input' directory on the desktop.[/bold green]") - with console.status("[bold green]Data Loading...[/bold green]", spinner="dots"): + with console.status("[bold green]Data Loading ...[/bold green]", spinner="dots"): sleep(1) # List all existing Excel files in the 'geopi_input' directory on the desktop. existing_excel_files = list_excel_files(INPUT_PATH) if len(existing_excel_files) == 0: print("[bold red]No data files found in the 'geopi_input' directory on the desktop.[/bold red]") - print("[bold green]Please put the data files in the 'geopi_input' directory on the desktop.[/bold green]") + _data_requirement_print() clear_output("(Press Enter key to exit)") exit(1) + else: + print("[bold green]Data files are found in the 'geopi_input' directory on the desktop.[/bold green]") + print(f"[bold green]Total Number of Data Files: {len(existing_excel_files)}[/bold green]") show_excel_columns(existing_excel_files) # Read the training data from the Excel file. @@ -136,6 +160,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # Limit the user input to a number within the range of available files and assign the result to application_data_path application_data_path = existing_excel_files[limit_num_input(range(1, len(existing_excel_files) + 1), SECTION[0], num_input) - 1] elif data_source == DataSource.ANY_PATH: + # If the user uses the any path, the working path is the current working directory, the output path is the current working directory. WORKING_PATH = os.getcwd() # Set the output path to the working path @@ -257,7 +282,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = logger.debug("Built-in Training Data Loading") # If the user doesn't provide the training data path, then use the built-in training data. 
if not training_data_path: - print("-*-*- Built-in Training Data Option-*-*-") + print("[bold green]-*-*- Built-in Training Data Option -*-*-[/bold green]") num2option(TEST_DATA_OPTION) built_in_training_data_num = limit_num_input(TEST_DATA_OPTION, SECTION[0], num_input) if built_in_training_data_num == 1: @@ -304,7 +329,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # <--- Name Selection ---> logger.debug("Output Data Identifier Column Selection") - print("-*-*- Output Data Identifier Column Selection -*-*-") + print("[bold green]-*-*- Output Data Identifier Column Selection -*-*-[/bold green]") show_data_columns(data.columns) NAME = select_column_name(data) clear_output() @@ -313,19 +338,19 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # <--- World Map Projection ---> logger.debug("World Map Projection") - print("-*-*- World Map Projection -*-*-") + print("[bold green]-*-*- World Map Projection -*-*-[/bold green]") process_world_map(data, name_column_select) # <--- Data Selection ---> logger.debug("Data Selection") - print("-*-*- Data Selection -*-*-") + print("[bold green]-*-*- Data Selection -*-*-[/bold green]") show_data_columns(data.columns) data_selected = create_sub_data_set(data, allow_empty_columns=False) clear_output() print("The Selected Data Set:") print(data_selected) clear_output() - print("-*-*- Basic Statistical Information -*-*-") + print("[bold green]-*-*- Basic Statistical Information -*-*-[/bold green]") basic_info(data_selected) basic_statistic(data_selected) correlation_plot(data_selected.columns, data_selected, name_column_select) @@ -345,7 +370,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # 3. Impute the missing values with one of the imputation techniques. 
# Reference: https://scikit-learn.org/stable/modules/impute.html logger.debug("Missing Value") - print("-*-*- Missing Value Check -*-*-") + print("[bold green]-*-*- Missing Value Check -*-*-[/bold green]") is_null_value(data_selected) ratio_null_vs_filled(data_selected) # missing_value_flag and process_missing_value_flag will be used in mode selection and model selection to differeniate two scenarios. @@ -362,7 +387,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = if missing_value_flag: clear_output() # Ask the user whether to use imputation techniques to deal with the missing values. - print("-*-*- Missing Values Process -*-*-") + print("[bold green]-*-*- Missing Values Process -*-*-[/bold green]") print("[bold red]Caution: Only some algorithms can process the data with missing value, such as XGBoost for regression and classification![/bold red]") print("Do you want to deal with the missing values?") num2option(OPTION) @@ -371,14 +396,14 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = process_missing_value_flag = True # If the user wants to deal with the missing values, then ask the user which strategy to use. 
clear_output() - print("-*-*- Strategy for Missing Values -*-*-") + print("[bold green]-*-*- Strategy for Missing Values -*-*-[/bold green]") num2option(MISSING_VALUE_STRATEGY) print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.") print("Which strategy do you want to apply?") missing_value_strategy_num = limit_num_input(MISSING_VALUE_STRATEGY, SECTION[1], num_input) clear_output() if missing_value_strategy_num == 1: - print("-*-*- Drop the rows with Missing Values -*-*-") + print("[bold green]-*-*- Drop the rows with Missing Values -*-*-[/bold green]") num2option(DROP_MISSING_VALUE_STRATEGY) print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.") print("Which strategy do you want to apply?") @@ -447,7 +472,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # If the selected data set contains missing values and the user wants to deal with the missing values and choose not to drop the rows with missing values, # then use imputation techniques to deal with the missing values. 
if imputed_flag: - print("-*-*- Imputation Method Option -*-*-") + print("[bold green]-*-*- Imputation Method Option -*-*-[/bold green]") num2option(IMPUTING_STRATEGY) print("Which method do you want to apply?") strategy_num = limit_num_input(IMPUTING_STRATEGY, SECTION[1], num_input) @@ -455,7 +480,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = data_selected_imputed = np2pd(data_selected_imputed_np, data_selected.columns) del data_selected_imputed_np clear_output() - print("-*-*- Hypothesis Testing on Imputation Method -*-*-") + print("[bold green]-*-*- Hypothesis Testing on Imputation Method -*-*-[/bold green]") print("Null Hypothesis: The distributions of the data set before and after imputing remain the same.") print("Thoughts: Check which column rejects null hypothesis.") print("Statistics Test Method: Kruskal Test") @@ -480,7 +505,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # <--- Feature Engineering ---> logger.debug("Feature Engineering") - print("-*-*- Feature Engineering -*-*-") + print("[bold green]-*-*- Feature Engineering -*-*-[/bold green]") feature_builder = FeatureConstructor(data_selected_imputed, process_name_column) data_selected_imputed_fe = feature_builder.build() # feature_engineering_config is possible to be {} @@ -489,7 +514,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # <--- Mode Selection ---> logger.debug("Mode Selection") - print("-*-*- Mode Selection -*-*-") + print("[bold green]-*-*- Mode Selection -*-*-[/bold green]") # The following scenarios support three modes (regression, classification and clustering) with the models that support missing values. # Because finally, the data contains missing values. # 1. 
missing value flag = True, process_missing_value_flag = False, drop rows with missing values flag = Flase, imputed flag = False @@ -515,7 +540,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = name_all = process_name_column if mode_num == 1 or mode_num == 2: # Supervised learning - print("-*-*- Data Segmentation - X Set and Y Set -*-*-") + print("[bold green]-*-*- Data Segmentation - X Set and Y Set -*-*-[/bold green]") print("Divide the processing data set into X (feature value) and Y (target value) respectively.") # create X data set print("Selected sub data set to create X data set:") @@ -531,7 +556,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = clear_output() # Create Y data set - print("-*-*- Data Segmentation - X Set and Y Set-*-*-") + print("[bold green]-*-*- Data Segmentation - X Set and Y Set -*-*-[/bold green]") print("Selected sub data set to create Y data set:") show_data_columns(data_selected_imputed_fe.columns) print("The selected Y data set:") @@ -547,7 +572,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = clear_output() # <--- Feature Scaling ---> - print("-*-*- Feature Scaling on X Set -*-*-") + print("[bold green]-*-*- Feature Scaling on X Set -*-*-[/bold green]") num2option(OPTION) is_feature_scaling = limit_num_input(OPTION, SECTION[1], num_input) if is_feature_scaling == 1: @@ -567,7 +592,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = clear_output() # <--- Feature Selection ---> - print("-*-*- Feature Selection on X set -*-*-") + print("[bold green]-*-*- Feature Selection on X set -*-*-[/bold green]") num2option(OPTION) is_feature_selection = limit_num_input(OPTION, SECTION[1], num_input) if is_feature_selection == 1: @@ -583,7 +608,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = clear_output() # Create training data and testing data - print("-*-*- Data Split 
- Train Set and Test Set -*-*-") + print("[bold green]-*-*- Data Split - Train Set and Test Set -*-*-[/bold green]") print("Notice: Normally, set 20% of the dataset aside as test set, such as 0.2.") test_ratio = float_input(default=0.2, prefix=SECTION[1], slogan="@Test Ratio: ") train_test_data = data_split(X, y, process_name_column, test_ratio) @@ -610,7 +635,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # Create X data set without data split because it is unsupervised learning X = data_selected_imputed_fe # <--- Feature Scaling ---> - print("-*-*- Feature Scaling on X Set -*-*-") + print("[bold green]-*-*- Feature Scaling on X Set -*-*-[/bold green]") num2option(OPTION) is_feature_scaling = limit_num_input(OPTION, SECTION[1], num_input) if is_feature_scaling == 1: @@ -636,7 +661,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = name_all = process_name_column # <--- Model Selection ---> logger.debug("Model Selection") - print("-*-*- Model Selection -*-*-") + print("[bold green]-*-*- Model Selection -*-*-[/bold green]") # The following scenarios support three modes (regression, classification and clustering) with the models that support missing values. # Because finally, the data contains missing values. # 1. missing value flag = True, process_missing_value_flag = False, drop rows with missing values flag = Flase, imputed flag = False @@ -687,7 +712,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = is_inference = False # If the model is supervised learning, then allow the user to use model inference. 
if mode_num == 1 or mode_num == 2: - print("-*-*- Feature Engineering on Application Data -*-*-") + print("[bold green]-*-*- Feature Engineering on Application Data -*-*-[/bold green]") is_inference = True selected_columns = X_train.columns if inference_data is not None: @@ -710,7 +735,8 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = save_data(inference_data_fe_selected, inference_name_column, "Application Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) else: # If the user doesn't provide the inference data path, it means that the user doesn't want to run the model inference. - print("You did not enter inference data.") + print("You did not provide application data.") + print("Hence, this part will be skipped.") inference_data_fe_selected = None clear_output() else: @@ -737,7 +763,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # <--- Transform Pipeline ---> # Construct the transform pipeline using sklearn.pipeline.make_pipeline method. logger.debug("Transform Pipeline") - print("-*-*- Transform Pipeline Construction -*-*-") + print("[bold green]-*-*- Transform Pipeline Construction -*-*-[/bold green]") transformer_config, transform_pipeline = build_transform_pipeline(imputation_config, feature_scaling_config, feature_selection_config, run, X_train, y_train) clear_output() @@ -746,7 +772,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # If the user chooses to drop the rows with missing values, then before running the model inference, need to drop the rows with missing values in inference data either. 
logger.debug("Model Inference") if inference_data_fe_selected is not None: - print("-*-*- Model Inference -*-*-") + print("[bold green]-*-*- Model Inference -*-*-[/bold green]") if drop_rows_with_missing_value_flag: inference_name_column = inference_data[NAME] inference_data_name = pd.concat([inference_name_column, inference_data_fe_selected], axis=1) @@ -795,7 +821,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # <--- Transform Pipeline ---> # Construct the transform pipeline using sklearn.pipeline.make_pipeline method. logger.debug("Transform Pipeline") - print("-*-*- Transform Pipeline Construction -*-*-") + print("[bold green]-*-*- Transform Pipeline Construction -*-*-[/bold green]") transformer_config, transform_pipeline = build_transform_pipeline(imputation_config, feature_scaling_config, feature_selection_config, run, X_train, y_train) # <--- Model Inference ---> @@ -803,7 +829,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # If the user chooses to drop the rows with missing values, then before running the model inference, need to drop the rows with missing values in inference data either. 
logger.debug("Model Inference") if inference_data_fe_selected is not None: - print("-*-*- Model Inference -*-*-") + print("[bold green]-*-*- Model Inference -*-*-[/bold green]") if drop_rows_with_missing_value_flag: inference_name_column = inference_data[NAME] inference_data_name = pd.concat([inference_name_column, inference_data_fe_selected], axis=1) diff --git a/geochemistrypi/data_mining/model/_base.py b/geochemistrypi/data_mining/model/_base.py index a629e47..b4c2791 100644 --- a/geochemistrypi/data_mining/model/_base.py +++ b/geochemistrypi/data_mining/model/_base.py @@ -31,11 +31,11 @@ class WorkflowBase(metaclass=ABCMeta): @classmethod def show_info(cls) -> None: """Display what application functions the algorithm will provide.""" - print("*-*" * 2, cls.name, "is running ...", "*-*" * 2) - print("Expected Functionality:") + print(f"[bold green]-*-*- {cls.name} Training Process -*-*-[/bold green]") + print("[bold green]Expected Functionality:[/bold green]") function = cls.common_function + cls.special_function for i in range(len(function)): - print("+ ", function[i]) + print(f"[bold green]+ {function[i]}[/bold green]") def __init__(self) -> None: # Default for child class. They need to be overwritten in child classes. @@ -43,6 +43,7 @@ def __init__(self) -> None: self.naming = None self.automl = None self.ray_best_model = None + # Set the random state fixed value for reproducibility of the results. 
self.random_state = 42 @property diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py index 93a9c6d..bbb7e33 100644 --- a/geochemistrypi/data_mining/model/classification.py +++ b/geochemistrypi/data_mining/model/classification.py @@ -221,7 +221,7 @@ def _plot_2d_decision_boundary( @staticmethod def sample_balance(X_train: pd.DataFrame, y_train: pd.DataFrame, name_column: str, local_path: str, mlflow_path: str) -> tuple: """Use this method when the sample size is unbalanced.""" - print("-*-*- Sample Balance on Train Set -*-*-") + print("[bold green]-*-*- Sample Balance on Train Set -*-*-[/bold green]") num2option(OPTION) is_sample_balance = limit_num_input(OPTION, SECTION[1], num_input) if is_sample_balance == 1: @@ -246,7 +246,7 @@ def customize_label( y: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame, name_column1: str, name_column2: str, name_column3: str, local_path: str, mlflow_path: str ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """Using this function to customize the label to which samples of each category belong.""" - print("-*-*- Customize Label on Label Set -*-*-") + print("[bold green]-*-*- Customize Label on Label Set -*-*-[/bold green]") num2option(OPTION) is_customize_label = limit_num_input(OPTION, SECTION[1], num_input) if is_customize_label == 1: @@ -667,7 +667,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = svc_manual_hyper_parameters() clear_output() return hyper_parameters @@ -921,7 +921,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification 
-*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = decision_tree_manual_hyper_parameters() clear_output() return hyper_parameters @@ -1240,7 +1240,7 @@ def settings(self) -> Dict: @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = random_forest_manual_hyper_parameters() clear_output() return hyper_parameters @@ -1599,7 +1599,7 @@ def settings(self) -> Dict: @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = xgboost_manual_hyper_parameters() clear_output() return hyper_parameters @@ -1883,7 +1883,7 @@ def settings(self) -> Dict: @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = logistic_regression_manual_hyper_parameters() clear_output() return hyper_parameters @@ -2252,7 +2252,7 @@ def objective(config: Dict) -> None: @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = multi_layer_perceptron_manual_hyper_parameters() clear_output() return hyper_parameters @@ -2540,7 +2540,7 @@ def settings(self) -> Dict: @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters 
specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = extra_trees_manual_hyper_parameters() clear_output() return hyper_parameters @@ -2905,7 +2905,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = gradient_boosting_manual_hyper_parameters() clear_output() return hyper_parameters @@ -3085,7 +3085,7 @@ def settings(self) -> Dict: @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = knn_manual_hyper_parameters() clear_output() return hyper_parameters @@ -3421,7 +3421,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = sgd_classificaiton_manual_hyper_parameters() clear_output() return hyper_parameters diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py index e646d54..9df245b 100644 --- a/geochemistrypi/data_mining/model/clustering.py +++ b/geochemistrypi/data_mining/model/clustering.py @@ -364,7 +364,7 @@ def _get_inertia_scores(func_name: str, algorithm_name: str, trained_model: obje @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - 
Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = kmeans_manual_hyper_parameters() clear_output() return hyper_parameters @@ -464,7 +464,7 @@ def __init__( @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = dbscan_manual_hyper_parameters() clear_output() return hyper_parameters @@ -599,7 +599,7 @@ def __init__( @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = agglomerative_manual_hyper_parameters() clear_output() return hyper_parameters @@ -701,7 +701,7 @@ def __init__( @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = affinitypropagation_manual_hyper_parameters() clear_output() return hyper_parameters @@ -802,7 +802,7 @@ def __init__( @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = meanshift_manual_hyper_parameters() clear_output() return hyper_parameters diff --git a/geochemistrypi/data_mining/model/decomposition.py b/geochemistrypi/data_mining/model/decomposition.py index 5737e71..25cb611 100644 --- a/geochemistrypi/data_mining/model/decomposition.py +++ 
b/geochemistrypi/data_mining/model/decomposition.py @@ -270,7 +270,7 @@ def __init__( @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = pca_manual_hyper_parameters() clear_output() return hyper_parameters @@ -594,7 +594,7 @@ def __init__( @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = tsne_manual_hyper_parameters() clear_output() return hyper_parameters @@ -719,7 +719,7 @@ def __init__( @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = mds_manual_hyper_parameters() clear_output() return hyper_parameters diff --git a/geochemistrypi/data_mining/model/detection.py b/geochemistrypi/data_mining/model/detection.py index f162c5f..377070e 100644 --- a/geochemistrypi/data_mining/model/detection.py +++ b/geochemistrypi/data_mining/model/detection.py @@ -292,7 +292,7 @@ def __init__( @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = isolation_forest_manual_hyper_parameters() clear_output() return hyper_parameters @@ -451,7 +451,7 @@ def __init__( @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} 
- Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = local_outlier_factor_manual_hyper_parameters() clear_output() return hyper_parameters diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py index 7055218..394612c 100644 --- a/geochemistrypi/data_mining/model/regression.py +++ b/geochemistrypi/data_mining/model/regression.py @@ -59,7 +59,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None: @dispatch(object, object, bool) def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None, is_automl: bool = False) -> None: """Fit the model by FLAML framework and RAY framework.""" - # print(f"-*-*- {self.naming} - AutoML -*-*-.") + # print(f"[bold green]-*-*- {self.naming} - AutoML -*-*-[/bold green].") if self.naming not in RAY_FLAML: self.automl = AutoML() if self.customized: # When the model is not built-in in FLAML framwork, use FLAML customization. 
@@ -299,7 +299,7 @@ def poly(self, X_train: pd.DataFrame, X_test: pd.DataFrame) -> Tuple[pd.DataFram @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = polynomial_regression_manual_hyper_parameters() clear_output() return hyper_parameters @@ -623,7 +623,7 @@ def settings(self) -> Dict: @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = xgboost_manual_hyper_parameters() clear_output() return hyper_parameters @@ -906,7 +906,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = decision_tree_manual_hyper_parameters() clear_output() return hyper_parameters @@ -1204,7 +1204,7 @@ def settings(self) -> Dict: @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = extra_trees_manual_hyper_parameters() clear_output() return hyper_parameters @@ -1504,7 +1504,7 @@ def settings(self) -> Dict: @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold 
green]") hyper_parameters = random_forest_manual_hyper_parameters() clear_output() return hyper_parameters @@ -1706,7 +1706,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = svr_manual_hyper_parameters() clear_output() return hyper_parameters @@ -2012,7 +2012,7 @@ def objective(config: Dict) -> None: @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = multi_layer_perceptron_manual_hyper_parameters() clear_output() return hyper_parameters @@ -2111,7 +2111,7 @@ def __init__( @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = linear_regression_manual_hyper_parameters() clear_output() return hyper_parameters @@ -2337,7 +2337,7 @@ def settings(self) -> Dict: @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = knn_manual_hyper_parameters() clear_output() return hyper_parameters @@ -2677,7 +2677,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold 
green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = gradient_boosting_manual_hyper_parameters() clear_output() return hyper_parameters @@ -2880,7 +2880,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = lasso_regression_manual_hyper_parameters() clear_output() return hyper_parameters @@ -3224,7 +3224,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = elastic_net_manual_hyper_parameters() clear_output() return hyper_parameters @@ -3672,7 +3672,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = sgd_regression_manual_hyper_parameters() clear_output() return hyper_parameters @@ -4009,7 +4009,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = bayesian_ridge_manual_hyper_parameters() clear_output() return hyper_parameters @@ -4222,7 +4222,7 @@ def search_space(cls, data_size, task): @classmethod def manual_hyper_parameters(cls) -> Dict: """Manual 
hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]") hyper_parameters = ridge_regression_manual_hyper_parameters() clear_output() return hyper_parameters diff --git a/geochemistrypi/data_mining/plot/map_plot.py b/geochemistrypi/data_mining/plot/map_plot.py index fef1c4b..a47c708 100644 --- a/geochemistrypi/data_mining/plot/map_plot.py +++ b/geochemistrypi/data_mining/plot/map_plot.py @@ -160,7 +160,7 @@ def process_world_map(data: pd.DataFrame, name_column: str) -> None: clear_output() if is_map_projection == 1: # If the user chooses to project the data on the world map, then the user can select the element to be projected. - print("-*-*- Distribution in World Map -*-*-") + print("[bold green]-*-*- Distribution in World Map -*-*-[/bold green]") print("Select one of the elements below to be projected in the World Map: ") show_data_columns(data.columns) elm_num = limit_num_input(data.columns, SECTION[3], num_input) diff --git a/geochemistrypi/data_mining/utils/base.py b/geochemistrypi/data_mining/utils/base.py index 2a5f794..d2a6a7a 100644 --- a/geochemistrypi/data_mining/utils/base.py +++ b/geochemistrypi/data_mining/utils/base.py @@ -392,6 +392,23 @@ def copy_files(GEOPI_OUTPUT_ARTIFACTS_PATH: str, GEOPI_OUTPUT_METRICS_PATH: str, shutil.copy2(source_file_path, GEOPI_OUTPUT_SUMMARY_PATH) +def copy_files_from_source_dir_to_dest_dir(source_dir: str, dest_dir: str) -> None: + """Copy all files from the source folder to the destination folder. + + Parameters + ---------- + source_dir: str + Source folder path. 
+ + dest_dir: str + Destination folder path. + """ + for root, dirs, files in os.walk(source_dir): + for file in files: + source_file_path = os.path.join(root, file) + shutil.copy2(source_file_path, dest_dir) + + def list_excel_files(directory: str) -> list: """Recursively lists all Excel files (including .xlsx, .xls, and .csv) in the specified directory and its subdirectories. diff --git a/geochemistrypi/start_cli_pipeline.py b/geochemistrypi/start_cli_pipeline.py index 7d5dcc6..f6cd579 100644 --- a/geochemistrypi/start_cli_pipeline.py +++ b/geochemistrypi/start_cli_pipeline.py @@ -8,12 +8,12 @@ # Mock the scenario where the user uses the built-in dataset for both training and application # - Test both continuous training and model inference -# cli_pipeline(training_data_path="", application_data_path="", data_source=DataSource.BUILT_IN) +cli_pipeline(training_data_path="", application_data_path="", data_source=DataSource.BUILT_IN) # Mock the scenario where the user uses the desktop dataset for both training and application # - Test both continuous training and model inference # - Test continuous training only -cli_pipeline(training_data_path="", application_data_path="", data_source=DataSource.DESKTOP) +# cli_pipeline(training_data_path="", application_data_path="", data_source=DataSource.DESKTOP) # Mock the scenario where the user uses the provided dataset for both training and application # - Test both continuous training and model inference diff --git a/pyproject.toml b/pyproject.toml index 95e9cb2..f46d55f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "geochemistrypi" -version = "0.6.1" +version = "0.7.0" authors = [ { name="Can He", email="sanyhew1097618435@163.com"}, { name="Jianhao Sun", email="sjh20171502@gmail.com"}, @@ -15,8 +15,7 @@ maintainers = [ { name="Can He", email="sanyhew1097618435@163.com"}, { name="Jianhao Sun", email="sjh20171502@gmail.com"}, { name="Jianming Zhao",
email="zhaojianming@zju.edu.cn"}, - { name="Yongkang Shan", email="kk1361207571@163.com"}, - { name="Mengqi Gao", email="2534671415@qq.com"} + { name="Yongkang Shan", email="kk1361207571@163.com"} ] description = "A highly automated machine learning Python framework dedicating to build up MLOps level 1 software product for data-driven geochemistry discovery" keywords = ["Geochemistry π", "Automated", "Machine Learning", "MLOps", "Geochemistry Discovery", "Continuous Training", "Machine Learning Lifecycle Management", "Model Inference", "Data Mining"]