diff --git a/geochemistrypi/_version.py b/geochemistrypi/_version.py index 43c4ab0..49e0fc1 100644 --- a/geochemistrypi/_version.py +++ b/geochemistrypi/_version.py @@ -1 +1 @@ -__version__ = "0.6.1" +__version__ = "0.7.0" diff --git a/geochemistrypi/cli.py b/geochemistrypi/cli.py index 20bb44c..d385011 100644 --- a/geochemistrypi/cli.py +++ b/geochemistrypi/cli.py @@ -6,10 +6,11 @@ from typing import Optional import typer +from rich import print from ._version import __version__ from .data_mining.cli_pipeline import cli_pipeline -from .data_mining.constants import WORKING_PATH +from .data_mining.enum import DataSource app = typer.Typer() @@ -17,7 +18,6 @@ FRONTEND_PATH = os.path.join(CURRENT_PATH, "frontend") BACKEND_PATH = os.path.join(CURRENT_PATH, "start_dash_pipeline.py") PIPELINE_PATH = os.path.join(CURRENT_PATH, "start_cli_pipeline.py") -MLFLOW_STORE_PATH = os.path.join(f"file:{WORKING_PATH}", "geopi_tracking") def _version_callback(value: bool) -> None: @@ -39,6 +39,7 @@ def main(version: Optional[bool] = typer.Option(None, "--version", "-v", help="S @app.command() def data_mining( data: str = typer.Option("", help="The path of the training data without model inference."), + desktop: bool = typer.Option(False, help="Use the data in the directory 'geopi_input' on the desktop for training and inference."), training: str = typer.Option("", help="The path of the training data."), application: str = typer.Option("", help="The path of the inference data."), mlflow: bool = typer.Option(False, help="Start the mlflow server."), @@ -58,6 +59,21 @@ def start_frontend(): def start_mlflow(): """Start the mlflow server.""" + # Check if the current working directory has the 'geopi_tracking' directory to store the tracking data for mlflow + # If yes, set the MLFLOW_STORE_PATH to the current working directory + # If no, set the MLFLOW_STORE_PATH to the desktop + geopi_tracking_dir = os.path.join(os.getcwd(), "geopi_tracking") + if not os.path.exists(geopi_tracking_dir): + print("[bold red]The 'geopi_tracking' directory is not found in the current working directory.[bold red]") + geopi_tracking_dir = os.path.join(os.path.expanduser("~"), "Desktop", "geopi_tracking") + if not os.path.exists(geopi_tracking_dir): + print("[bold red]The 'geopi_tracking' directory is not found on the desktop.[bold red]") + print("[bold red]Our software will create a 'geopi_tracking' directory on the desktop to store the tracking data for mlflow.[bold red]") + else: + print("[bold green]The 'geopi_tracking' directory is found on the desktop.[bold green]") + print("[bold green]Our software will use the 'geopi_tracking' directory on the desktop to store the tracking data for mlflow.[bold green]") + MLFLOW_STORE_PATH = os.path.join("file:", geopi_tracking_dir) + print("[bold green]Press [bold magenta]Ctrl + C[/bold magenta] to close mlflow server at any time.[bold green]") start_mlflow_command = f"mlflow ui --backend-store-uri {MLFLOW_STORE_PATH} " subprocess.run(start_mlflow_command, shell=True) @@ -76,16 +92,19 @@ def start_mlflow(): # Start mlflow server to track the experiment mlflow_thread = threading.Thread(target=start_mlflow) mlflow_thread.start() + elif desktop: + # Start the CLI pipeline with the data in the directory 'geopi_input' on the desktop + cli_pipeline(training_data_path="", application_data_path="", data_source=DataSource.DESKTOP) else: # If the data is provided, start the CLI pipeline with continuous training if data: - cli_pipeline(data) + cli_pipeline(training_data_path=data, application_data_path="", data_source=DataSource.ANY_PATH) # If the training data and inference data are provided, start the CLI pipeline with continuous training and inference elif training and application: - cli_pipeline(training, application) - # If no data is provided, use built-in data to start the CLI pipeline with continuous training and inference + cli_pipeline(training_data_path=training, application_data_path=application, data_source=DataSource.ANY_PATH) + # If no data is provided, look for the data in the desktop to start the CLI pipeline with continuous training and inference else: - cli_pipeline(training, application) + cli_pipeline(training_data_path="", application_data_path="", data_source=DataSource.BUILT_IN) @app.command() diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py index ba66e9e..a7333f7 100644 --- a/geochemistrypi/data_mining/cli_pipeline.py +++ b/geochemistrypi/data_mining/cli_pipeline.py @@ -26,13 +26,10 @@ MODE_OPTION_WITH_MISSING_VALUES, NON_AUTOML_MODELS, OPTION, - OUTPUT_PATH, REGRESSION_MODELS, REGRESSION_MODELS_WITH_MISSING_VALUES, SECTION, TEST_DATA_OPTION, - TOGGLE_ADDRESS_STATUS, - WORKING_PATH, ) from .data.data_readiness import ( basic_info, @@ -53,6 +50,7 @@ from .data.inference import build_transform_pipeline, model_inference from .data.preprocessing import feature_scaler, feature_selector from .data.statistic import monte_carlo_simulator +from .enum import DataSource from .plot.map_plot import process_world_map from .plot.statistic_plot import basic_statistic, check_missing_value, correlation_plot, distribution_plot, is_null_value, log_distribution_plot, probability_plot, ratio_null_vs_filled from .process.classify import ClassificationModelSelection @@ -60,12 +58,11 @@ from .process.decompose import DecompositionModelSelection from .process.detect import AnomalyDetectionModelSelection from .process.regress import RegressionModelSelection -from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning +from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, list_excel_files, log, save_data, show_warning from .utils.mlflow_utils import retrieve_previous_experiment_id -from .utils.toggle_address_status import toggle_address_status -def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = None) -> None: +def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = None, data_source: Optional[DataSource] = None) -> None: """The command line interface software for Geochemistry π. The business logic of this CLI software can be found in the figures in the README.md file. It provides three MLOps core functionalities: @@ -82,37 +79,76 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = The path of the application data, by default None """ - # Local test: Uncomment the following line to utilize built-in datasets to test the pipeline. Don't forget to modify the path value to be consistent with your own location. - # training_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx" - # application_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx" - # Local test: If the argument is False, hide all Python level warnings. Developers can turn it on by setting the argument to True. show_warning(False) - os.makedirs(OUTPUT_PATH, exist_ok=True) - logger = log(OUTPUT_PATH, "geopi_inner_test.log") - logger.info("Geochemistry Pi is running.") - # Display the interactive splash screen when launching the CLI software console = Console() print("\n[bold blue]Welcome to Geochemistry π![/bold blue]") - print("[bold]Initializing...[/bold]") + print("[bold blue]Three cores components:[/bold blue]") + print("✨ [bold blue]Continuous Training[/bold blue]") + print("✨ [bold blue]Model Inference[/bold blue]") + print("✨ [bold blue]Machine Learning Lifecycle Management[/bold blue]") + print("[bold green]Initializing...[/bold green]") + + # Set the working path based on the data source + # If the user uses the built-in data, the working path is the desktop, the output path is the desktop. + # If the user uses the desktop data, the working path is the desktop, the output path is the desktop. + # If the user uses the any path, the working path is the current working directory, the output path is the current working directory. + if data_source == DataSource.BUILT_IN: + # If the user uses the built-in data, the working path is the desktop. + WORKING_PATH = os.path.join(os.path.expanduser("~"), "Desktop") + elif data_source == DataSource.DESKTOP: + WORKING_PATH = os.path.join(os.path.expanduser("~"), "Desktop") + INPUT_PATH = os.path.join(WORKING_PATH, "geopi_input") + if not os.path.exists(INPUT_PATH): + print("[bold red]The 'geopi_input' directory is not found on the desktop.[/bold red]") + os.makedirs(INPUT_PATH, exist_ok=True) + print("[bold green]Creating the 'geopi_input' directory ...[/bold green]") + print("[bold green]Successfully create 'geopi_input' directory on the desktop.[/bold green]") + print("Please restart the software after putting the data in the 'geopi_input' directory.") + print("Currently, the data file format only supports '.xlsx', '.xls', '.csv'.") + print("If you want to activate the model inference, please put the 'application data' in it as well.") + print("Check our online documentation for more information on the format of the 'application data'.") + clear_output("(Press Enter key to exit)") + exit(1) + + with console.status("[bold green]Data Loading...[/bold green]", spinner="dots"): + sleep(1) + + # List all existing Excel files in the 'geopi_input' directory on the desktop. + existing_excel_files = list_excel_files(INPUT_PATH) + if len(existing_excel_files) == 0: + print("[bold red]No data files found in the 'geopi_input' directory on the desktop.[/bold red]") + print("[bold green]Please put the data files in the 'geopi_input' directory on the desktop.[/bold green]") + clear_output("(Press Enter key to exit)") + exit(1) + show_excel_columns(existing_excel_files) + + # Read the training data from the Excel file. + print("Please select the training data by index:") + # Limit the user input to a number within the range of available files and assign the result to training_data_path + training_data_path = existing_excel_files[limit_num_input(range(1, len(existing_excel_files) + 1), SECTION[0], num_input) - 1] + is_application_data = Confirm.ask("Do you want to activate the inference functionality", default=False) + if is_application_data: + # Read the application data from the Excel file. + print("Please select the application data by index:") + # Limit the user input to a number within the range of available files and assign the result to application_data_path + application_data_path = existing_excel_files[limit_num_input(range(1, len(existing_excel_files) + 1), SECTION[0], num_input) - 1] + elif data_source == DataSource.ANY_PATH: + WORKING_PATH = os.getcwd() + + # Set the output path to the working path + OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output") + os.makedirs(OUTPUT_PATH, exist_ok=True) + + # Set the log file path + logger = log(OUTPUT_PATH, "geopi_inner_test.log") + logger.info("Geochemistry Pi is running.") # <-- User Training Data Loading --> with console.status("[bold green]Training Data Loading...[/bold green]", spinner="dots"): sleep(0.75) - - # Call toggle_address_status and pass status and training_data_path as parameters to obtain the address of the training data - training_data_path = toggle_address_status(status=TOGGLE_ADDRESS_STATUS, training_data_path=training_data_path)[0] - - # Check if the length of training_data_path is greater than 1 - if len(training_data_path) > 1: - # Display the columns of the Excel file located at training_data_path - show_excel_columns(training_data_path) - print("Please select only one file that you want to process:") - # Limit the user input to a number within the range of available files and assign the result to training_data_path - training_data_path = training_data_path[limit_num_input(range(1, len(training_data_path) + 1), SECTION[0], num_input) - 1] - if training_data_path: # If the user provides file name, then load the training data from the file. data = read_data(file_path=training_data_path, is_own_data=1) @@ -124,6 +160,13 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # <-- User Application Data Loading --> with console.status("[bold green]Application Data Loading...[/bold green]", spinner="dots"): sleep(0.75) + # Three scenarios for the application data loading: + # 1. The user provides the training data path and the application data path. + # - The user wants to use the model inference. + # 2. The user provides the training data path but doesn't provide the application data path. + # - The user doesn't want to use the model inference. + # 3. The user doesn't provide the training data path and the application data path. + # - The continuous training and model inference will use the built-in data. is_built_in_inference_data = False if training_data_path and application_data_path: # If the user provides file name, then load the inference data from the file. @@ -169,8 +212,8 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # Create a new experiment or use the previous experiment is_used_previous_experiment = Confirm.ask("✨ Use Previous Experiment", default=False) # Set the tracking uri to the local directory, in the future, we can set it to the remote server. - experiments_localtion = f"file:{WORKING_PATH}/geopi_tracking" - mlflow.set_tracking_uri(experiments_localtion) + experiments_location = os.path.join("file:", WORKING_PATH, "geopi_tracking") + mlflow.set_tracking_uri(experiments_location) # Print the tracking uri for debugging. # print("tracking uri:", mlflow.get_tracking_uri()) if is_used_previous_experiment: @@ -207,7 +250,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.") # mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description}) mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id) - create_geopi_output_dir(experiment.name, run_name) + create_geopi_output_dir(OUTPUT_PATH, experiment.name, run_name) clear_output() # <--- Built-in Training Data Loading ---> @@ -235,6 +278,11 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # <--- Built-in Application Data Loading ---> logger.debug("Built-in Application Data Loading") # If the user doesn't provide training data path and inference data path, then use the built-in inference data. + # There are two scenarios for the built-in inference data loading: + # 1. The user chooses the built-in training data for regression or classification. + # - Only the supervised learning mode supports model inference. + # 2. The user chooses the built-in training data for clustering, decomposition or anomaly detection. + # - The unsupervised learning mode doesn't support model inference. if is_built_in_inference_data and built_in_training_data_num == 1: application_data_path = "ApplicationData_Regression.xlsx" inference_data = read_data(file_path=application_data_path) @@ -616,7 +664,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # Add the option of all models all_models_num = len(MODELS) + 1 print(str(all_models_num) + " - All models above to be trained") - print("Which model do you want to apply?(Enter the Corresponding Number)") + print("Which model do you want to apply?") MODELS.append("all_models") model_num = limit_num_input(MODELS, SECTION[2], num_input) clear_output() @@ -628,7 +676,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = if mode_num == 1 or mode_num == 2: # If the model is not in the NON_AUTOML_MODELS, then ask the user whether to use AutoML. if model_name not in NON_AUTOML_MODELS: - print("Do you want to employ automated machine learning with respect to this algorithm?" "(Enter the Corresponding Number):") + print("Do you want to employ automated machine learning with respect to this algorithm?") num2option(OPTION) automl_num = limit_num_input(OPTION, SECTION[2], num_input) if automl_num == 1: @@ -728,7 +776,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = for i in range(len(MODELS) - 1): # Start a nested MLflow run within the current MLflow run with mlflow.start_run(run_name=MODELS[i], experiment_id=experiment.experiment_id, nested=True): - create_geopi_output_dir(experiment.name, run_name, MODELS[i]) + create_geopi_output_dir(OUTPUT_PATH, experiment.name, run_name, MODELS[i]) run = Modes2Initiators[mode_num](MODELS[i]) # If is_automl is False, then run all models without AutoML. if not is_automl: diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py index 0dde2f3..7863392 100644 --- a/geochemistrypi/data_mining/constants.py +++ b/geochemistrypi/data_mining/constants.py @@ -1,10 +1,5 @@ import os -from .utils.toggle_address_status import toggle_address_status - -# Adjust the path of project data flow: The number 1 indicates standard mode, and the number 2 indicates APP mode. -TOGGLE_ADDRESS_STATUS = 1 - # The number of uploading dataset per user is limited to 5. MAX_UPLOADS_PER_USER = 5 @@ -14,12 +9,6 @@ # the directory where the built-in data set to be processed stays BUILT_IN_DATASET_PATH = os.path.join(PACKAGEDIR, "data", "dataset") -# current working directory in which the user activates the application -WORKING_PATH = toggle_address_status(status=TOGGLE_ADDRESS_STATUS)[1] - -# the root directory where all the output stays -OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output") - # the directory where the artifact is saved within the MLflow run's artifact directory MLFLOW_ARTIFACT_DATA_PATH = "data" MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH = os.path.join("image", "statistic") diff --git a/geochemistrypi/data_mining/data/data_readiness.py b/geochemistrypi/data_mining/data/data_readiness.py index 927f212..08ec01d 100644 --- a/geochemistrypi/data_mining/data/data_readiness.py +++ b/geochemistrypi/data_mining/data/data_readiness.py @@ -162,10 +162,10 @@ def select_column_name(data: pd.DataFrame) -> str: The data set to be selected name. """ print( - "You need to choose the number of the column above as the output data identifier column.\n" + "You need to choose the number of the column above as [bold red]the output data identifier column[/bold red].\n" "The data identifier column helps identify uniquely each row of data point in the output data.\n" - "** For example, when using built-in dataset, you can choose the column ‘SAMPLE NAME’.**\n" - "Once finishing the whole run, in the output data file, all data point will have the value in the column ‘SAMPLE NAME’ as its unique identifier.\n" + "For example, when using built-in dataset, you can choose the column [bold red]SAMPLE NAME[/bold red].\n" + "Once finishing the whole run, in the output data files, each row of data will have the value in the column [bold red]SAMPLE NAME[/bold red] as its unique identifier.\n" "Enter the number of the output data identifier column." ) while True: diff --git a/geochemistrypi/data_mining/enum.py b/geochemistrypi/data_mining/enum.py index f861a8a..30b1601 100644 --- a/geochemistrypi/data_mining/enum.py +++ b/geochemistrypi/data_mining/enum.py @@ -13,3 +13,9 @@ class ModeOptionWithMissingValues(Enum): REGRESSION = "Regression" CLASSIFICATION = "Classification" CLUSTERING = "Clustering" + + +class DataSource(Enum): + BUILT_IN = "Built-in" + DESKTOP = "Desktop" + ANY_PATH = "Any Path" diff --git a/geochemistrypi/data_mining/utils/base.py b/geochemistrypi/data_mining/utils/base.py index 202f049..2a5f794 100644 --- a/geochemistrypi/data_mining/utils/base.py +++ b/geochemistrypi/data_mining/utils/base.py @@ -12,14 +12,15 @@ from matplotlib import pyplot as plt from rich import print -from ..constants import OUTPUT_PATH - -def create_geopi_output_dir(experiment_name: str, run_name: str, sub_run_name: Optional[str] = None) -> None: +def create_geopi_output_dir(output_path: str, experiment_name: str, run_name: str, sub_run_name: Optional[str] = None) -> None: """Create the output directory for the current run and store the related pathes as environment variable. Parameters ---------- + output_path : str + The root path to store the output. + experiment_name : str The name of the experiment. @@ -32,9 +33,9 @@ def create_geopi_output_dir(experiment_name: str, run_name: str, sub_run_name: O # Set the output path for the current run # timestamp = datetime.datetime.now().strftime("%m-%d-%H-%M") if sub_run_name: - geopi_output_path = os.path.join(OUTPUT_PATH, experiment_name, f"{run_name}", sub_run_name) + geopi_output_path = os.path.join(output_path, experiment_name, f"{run_name}", sub_run_name) else: - geopi_output_path = os.path.join(OUTPUT_PATH, experiment_name, f"{run_name}") + geopi_output_path = os.path.join(output_path, experiment_name, f"{run_name}") os.environ["GEOPI_OUTPUT_PATH"] = geopi_output_path os.makedirs(geopi_output_path, exist_ok=True) @@ -143,9 +144,12 @@ def install_package(package_name: str) -> None: subprocess.check_call(["python", "-m", "pip", "install", "--quiet", package_name]) -def clear_output() -> None: +def clear_output(text: str = None) -> None: """Clear the console output.""" - flag = input("(Press Enter key to move forward.)") + if text: + flag = input(text) + else: + flag = input("(Press Enter key to move forward)") my_os = platform.system() if flag == "": if my_os == "Windows": @@ -386,3 +390,29 @@ def copy_files(GEOPI_OUTPUT_ARTIFACTS_PATH: str, GEOPI_OUTPUT_METRICS_PATH: str, for file in files: source_file_path = os.path.join(root, file) shutil.copy2(source_file_path, GEOPI_OUTPUT_SUMMARY_PATH) + + +def list_excel_files(directory: str) -> list: + """Recursively lists all Excel files (including .xlsx, .xls, and .csv) in the specified directory and its subdirectories. + + Parameters + ---------- + directory : str + The path to the directory to search for Excel files. + + Returns + ------- + excel_files : list + A list of file paths for all Excel files found. + + Notes + ----- + (1) The function uses `os.walk` to traverse the directory and its subdirectories. + (2) Only files with extensions .xlsx, .xls, and .csv are considered as Excel files. + """ + excel_files = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith(".xlsx") or file.endswith(".xls") or file.endswith(".csv"): + excel_files.append(os.path.join(root, file)) + return excel_files diff --git a/geochemistrypi/data_mining/utils/toggle_address_status.py b/geochemistrypi/data_mining/utils/toggle_address_status.py index 4d96872..d7a5849 100644 --- a/geochemistrypi/data_mining/utils/toggle_address_status.py +++ b/geochemistrypi/data_mining/utils/toggle_address_status.py @@ -1,34 +1,11 @@ import os +from ..enum import DataSource +from .base import list_excel_files -def list_excel_files(directory: str) -> list: - """Recursively lists all Excel files (including .xlsx, .xls, and .csv) in the specified directory and its subdirectories. - Parameters - ---------- - directory : str - The path to the directory to search for Excel files. - - Returns - ------- - excel_files : list - A list of file paths for all Excel files found. - - Notes - ----- - (1) The function uses `os.walk` to traverse the directory and its subdirectories. - (2) Only files with extensions .xlsx, .xls, and .csv are considered as Excel files. - """ - excel_files = [] - for root, dirs, files in os.walk(directory): - for file in files: - if file.endswith(".xlsx") or file.endswith(".xls") or file.endswith(".csv"): - excel_files.append(os.path.join(root, file)) - return excel_files - - -def toggle_address_status(status: str = None, training_data_path: str = None) -> list: - """Toggles the training data path and output path based on the provided status. +def toggle_data_source(data_source: DataSource = None) -> list: + """Toggle the training data path and output path based on the provided status. Parameters ---------- @@ -36,6 +13,7 @@ def toggle_address_status(status: str = None, training_data_path: str = None) -> The status value, which can be "1" or "2". - "1": Use the input and output paths in command line mode. - "2": Retrieves all Excel files from the "data" folder on the desktop as the training data path, and sets the output path to the desktop. + training_data_path : str, optional The path to the training data. This parameter is used when `status` is "1". @@ -46,13 +24,11 @@ def toggle_address_status(status: str = None, training_data_path: str = None) -> """ - if int(status) == 1: + if data_source == DataSource.BUILT_IN: working_path = os.path.dirname(os.getcwd()) - elif int(status) == 2: + elif data_source == DataSource.DESKTOP: desktop_path = os.path.join(os.path.expanduser("~"), "Desktop") - training_data_path = list_excel_files(os.path.join(desktop_path, "data")) + existing_excel_files = list_excel_files(os.path.join(desktop_path, "geopi_input")) working_path = desktop_path - else: - raise ValueError("Invalid status value. It should be '1' or '2'.") - return [training_data_path, working_path] + return [existing_excel_files, working_path] diff --git a/geochemistrypi/start_cli_pipeline.py b/geochemistrypi/start_cli_pipeline.py index b3834d9..7d5dcc6 100644 --- a/geochemistrypi/start_cli_pipeline.py +++ b/geochemistrypi/start_cli_pipeline.py @@ -1,5 +1,25 @@ # -*- coding: utf-8 -*- from data_mining.cli_pipeline import cli_pipeline +from data_mining.enum import DataSource -# Used for internal testing, run in debug mode in IDE to inspect the pipeline -cli_pipeline("", "") +""" +Used for internal testing, run in debug mode in IDE to inspect the pipeline +""" + +# Mock the scenario where the user uses the built-in dataset for both training and application +# - Test both continuous training and model inference +# cli_pipeline(training_data_path="", application_data_path="", data_source=DataSource.BUILT_IN) + +# Mock the scenario where the user uses the desktop dataset for both training and application +# - Test both continuous training and model inference +# - Test continuous training only +cli_pipeline(training_data_path="", application_data_path="", data_source=DataSource.DESKTOP) + +# Mock the scenario where the user uses the provided dataset for both training and application +# - Test both continuous training and model inference +# - Test continuous training only +# Uncomment the following line to utilize built-in datasets to test the pipeline. Don't forget to modify the path value to be consistent with your own location. +# training_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx" +# application_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx" +# cli_pipeline(training_data_path=training_data_path, application_data_path=application_data_path, data_source=DataSource.ANY_PATH) +# cli_pipeline(training_data_path=training_data_path, application_data_path="", data_source=DataSource.ANY_PATH)