diff --git a/MANIFEST.in b/MANIFEST.in index 09b9d627da..046e779d28 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,3 +2,4 @@ include versioneer.py include nvflare/_version.py include nvflare/libs/*.so include nvflare/fuel/utils/*.json +include nvflare/private/fed/app/simulator/log_config.json diff --git a/docs/resources/log_config.json b/docs/resources/log_config.json index 92c4f9890a..997d9d0420 100644 --- a/docs/resources/log_config.json +++ b/docs/resources/log_config.json @@ -6,7 +6,7 @@ "()": "nvflare.fuel.utils.log_utils.BaseFormatter", "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" }, - "colorFormatter": { + "consoleFormatter": { "()": "nvflare.fuel.utils.log_utils.ColorFormatter", "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" }, @@ -25,7 +25,7 @@ "consoleHandler": { "class": "logging.StreamHandler", "level": "DEBUG", - "formatter": "colorFormatter", + "formatter": "consoleFormatter", "filters": [], "stream": "ext://sys.stdout" }, diff --git a/docs/user_guide/configurations/logging_configuration.rst b/docs/user_guide/configurations/logging_configuration.rst index 640bf7398b..8c86ba4703 100644 --- a/docs/user_guide/configurations/logging_configuration.rst +++ b/docs/user_guide/configurations/logging_configuration.rst @@ -33,7 +33,7 @@ See the `configuration dictionary schema <(https://docs.python.org/3/library/log "()": "nvflare.fuel.utils.log_utils.BaseFormatter", "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" }, - "colorFormatter": { + "consoleFormatter": { "()": "nvflare.fuel.utils.log_utils.ColorFormatter", "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" }, @@ -52,7 +52,7 @@ See the `configuration dictionary schema <(https://docs.python.org/3/library/log "consoleHandler": { "class": "logging.StreamHandler", "level": "DEBUG", - "formatter": "colorFormatter", + "formatter": "consoleFormatter", "filters": [], "stream": "ext://sys.stdout" }, @@ 
-147,7 +147,7 @@ Example configuration: .. code-block:: json - "colorFormatter": { + "consoleFormatter": { "()": "nvflare.fuel.utils.log_utils.ColorFormatter", "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s", "level_colors": { @@ -221,7 +221,7 @@ Example configuration: "consoleHandler": { "class": "logging.StreamHandler", "level": "DEBUG", - "formatter": "colorFormatter", + "formatter": "consoleFormatter", "filters": ["FLFilter"], "stream": "ext://sys.stdout" } diff --git a/examples/advanced/brats18/README.md b/examples/advanced/brats18/README.md index 881dbb5a72..f3494a276a 100644 --- a/examples/advanced/brats18/README.md +++ b/examples/advanced/brats18/README.md @@ -30,8 +30,8 @@ First, we add the image and datalist directory roots to `config_train.json` file ``` for alg in brats_central brats_fedavg brats_fedavg_dp do - sed -i "s|DATASET_ROOT|${PWD}/dataset_brats18/dataset|g" configs/${alg}/config/config_train.json - sed -i "s|DATALIST_ROOT|${PWD}/dataset_brats18/datalist|g" configs/${alg}/config/config_train.json + sed -i "s|DATASET_ROOT|${PWD}/dataset_brats18/dataset|g" configs/${alg}/app/config/config_train.json + sed -i "s|DATALIST_ROOT|${PWD}/dataset_brats18/datalist|g" configs/${alg}/app/config/config_train.json done ``` diff --git a/examples/advanced/llm_hf/requirements.txt b/examples/advanced/llm_hf/requirements.txt index b5ef99c4c0..a288bbff5b 100644 --- a/examples/advanced/llm_hf/requirements.txt +++ b/examples/advanced/llm_hf/requirements.txt @@ -1,8 +1,8 @@ nvflare -torch +torch==2.5.1 datasets tensorboard -transformers -peft -trl -bitsandbytes \ No newline at end of file +transformers==4.48.0 +peft==0.14.0 +trl==0.13.0 +bitsandbytes diff --git a/examples/advanced/llm_hf/src/hf_sft_peft_fl.py b/examples/advanced/llm_hf/src/hf_sft_peft_fl.py index 49113190ff..96667151bc 100755 --- a/examples/advanced/llm_hf/src/hf_sft_peft_fl.py +++ b/examples/advanced/llm_hf/src/hf_sft_peft_fl.py @@ -23,7 +23,7 @@ import numpy as np import 
torch from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, set_peft_model_state_dict, utils -from transformers import AutoModelForCausalLM, AutoTokenizer, trainer_utils +from transformers import AutoModelForCausalLM, trainer_utils from trl import SFTConfig, SFTTrainer import nvflare.client as flare @@ -126,11 +126,6 @@ def main(): model = get_peft_model(model, peft_config) model.config.pretraining_tp = 1 - # Set tokenizer - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - tokenizer.pad_token = tokenizer.eos_token - tokenizer.padding_side = "right" - # Training arguments train_args = SFTConfig( output_dir=args.output_path, @@ -159,8 +154,6 @@ def main(): train_dataset=dataset_train, eval_dataset=dataset_valid, peft_config=peft_config, - tokenizer=tokenizer, - packing=False, formatting_func=format_instruction, args=train_args, ) diff --git a/examples/advanced/llm_hf/utils/hf_sft_peft.py b/examples/advanced/llm_hf/utils/hf_sft_peft.py index ae1d429281..862148368c 100755 --- a/examples/advanced/llm_hf/utils/hf_sft_peft.py +++ b/examples/advanced/llm_hf/utils/hf_sft_peft.py @@ -21,7 +21,7 @@ import numpy as np import torch from peft import LoraConfig, get_peft_model -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM from trl import SFTConfig, SFTTrainer torch.manual_seed(0) @@ -114,11 +114,6 @@ def main(): model = get_peft_model(model, peft_config) model.config.pretraining_tp = 1 - # Set tokenizer - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - tokenizer.pad_token = tokenizer.eos_token - tokenizer.padding_side = "right" - # Training arguments train_args = SFTConfig( output_dir=args.output_path, @@ -144,8 +139,6 @@ def main(): train_dataset=dataset_train, eval_dataset=dataset_valid, peft_config=peft_config, - tokenizer=tokenizer, - packing=False, formatting_func=format_instruction, args=train_args, ) diff --git a/examples/advanced/llm_hf/utils/hf_sft_peft_iter.py 
b/examples/advanced/llm_hf/utils/hf_sft_peft_iter.py index 065280f3f2..c6f34283c1 100755 --- a/examples/advanced/llm_hf/utils/hf_sft_peft_iter.py +++ b/examples/advanced/llm_hf/utils/hf_sft_peft_iter.py @@ -22,7 +22,7 @@ import numpy as np import torch from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, set_peft_model_state_dict, utils -from transformers import AutoModelForCausalLM, AutoTokenizer, trainer_utils +from transformers import AutoModelForCausalLM, trainer_utils from trl import SFTConfig, SFTTrainer torch.manual_seed(0) @@ -115,11 +115,6 @@ def main(): model = get_peft_model(model, peft_config) model.config.pretraining_tp = 1 - # Set tokenizer - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - tokenizer.pad_token = tokenizer.eos_token - tokenizer.padding_side = "right" - # Training arguments train_args = SFTConfig( output_dir=args.output_path, @@ -147,8 +142,6 @@ def main(): train_dataset=dataset_train, eval_dataset=dataset_valid, peft_config=peft_config, - tokenizer=tokenizer, - packing=False, formatting_func=format_instruction, args=train_args, ) diff --git a/examples/tutorials/custom_log_config.json b/examples/tutorials/custom_log_config.json new file mode 100644 index 0000000000..69c7cc59d8 --- /dev/null +++ b/examples/tutorials/custom_log_config.json @@ -0,0 +1,84 @@ +{ + "version": 1, + "disable_existing_loggers": false, + "formatters": { + "baseFormatter": { + "()": "nvflare.fuel.utils.log_utils.BaseFormatter", + "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" + }, + "consoleFormatter": { + "()": "nvflare.fuel.utils.log_utils.ColorFormatter", + "fmt": "%(asctime)s - %(identity)s - %(name)s - %(levelname)s - %(message)s", + "datefmt": "%Y-%m-%d %H:%M:%S", + "logger_colors": { + "NPModelPersistor": "blue" + } + }, + "jsonFormatter": { + "()": "nvflare.fuel.utils.log_utils.JsonFormatter", + "fmt": "%(asctime)s - %(name)s - %(fullName)s - %(levelname)s - %(fl_ctx)s - %(message)s" + } + }, + 
"filters": { + "FLFilter": { + "()": "nvflare.fuel.utils.log_utils.LoggerNameFilter", + "logger_names": ["custom", "nvflare.app_common", "nvflare.app_opt"] + } + }, + "handlers": { + "consoleHandler": { + "class": "logging.StreamHandler", + "level": "DEBUG", + "formatter": "consoleFormatter", + "filters": ["FLFilter"], + "stream": "ext://sys.stdout" + }, + "logFileHandler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEBUG", + "formatter": "baseFormatter", + "filename": "log.txt", + "mode": "a", + "maxBytes": 20971520, + "backupCount": 10 + }, + "errorFileHandler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "ERROR", + "formatter": "baseFormatter", + "filename": "log_error.txt", + "mode": "a", + "maxBytes": 20971520, + "backupCount": 10 + }, + "jsonFileHandler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEBUG", + "formatter": "jsonFormatter", + "filename": "log.json", + "mode": "a", + "maxBytes": 20971520, + "backupCount": 10 + }, + "FLFileHandler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEBUG", + "formatter": "baseFormatter", + "filters": ["FLFilter"], + "filename": "log_fl.txt", + "mode": "a", + "maxBytes": 20971520, + "backupCount": 10, + "delay": true + } + }, + "loggers": { + "root": { + "level": "INFO", + "handlers": ["consoleHandler", "logFileHandler", "errorFileHandler", "jsonFileHandler", "FLFileHandler"] + }, + "nvflare.app_common.aggregators": { + "level": "DEBUG" + } + } +} \ No newline at end of file diff --git a/examples/tutorials/logging.ipynb b/examples/tutorials/logging.ipynb new file mode 100644 index 0000000000..35d07f86e9 --- /dev/null +++ b/examples/tutorials/logging.ipynb @@ -0,0 +1,394 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "654a158f-42c6-4706-84c6-4bbbec79be26", + "metadata": {}, + "source": [ + "# FLARE Logging\n", + "\n", + "This tutorial covers how to configure logging in FLARE for different use cases and modes.\n", + "\n", + "To 
learn more, see our [Logging Configuration Documentation](https://nvflare.readthedocs.io/en/main/user_guide/configurations/logging_configuration.html) for a more comprehensive description of the various features.\n", + "\n", + "## Setup\n", + "\n", + "The NVFlare [Quickstart Guide](https://nvflare.readthedocs.io/en/main/quickstart.html#installation) provides instructions for setting up FLARE on a local system or in a Docker image. We've also cloned the NVFlare GitHub in our top-level working directory." + ] + }, + { + "cell_type": "markdown", + "id": "e6f4f315", + "metadata": {}, + "source": [ + "## Simulator Logging\n", + "\n", + "To get started, let's run the **hello-numpy-sag** job in the simulator and take a look at the default logging output:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d6a1942", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p hello-numpy-sag-workspace\n", + "!nvflare simulator -w hello-numpy-sag-workspace -n 2 -t 2 ../hello-world/hello-numpy-sag/jobs/hello-numpy-sag" + ] + }, + { + "cell_type": "markdown", + "id": "80ee3335", + "metadata": {}, + "source": [ + "Notice how the output contains lots of logs from both the FLARE system, as well as the training workflow.\n", + "Additionally, the different level of logs (eg. INFO, WARNING, ERROR) have different console colors.\n", + "\n", + "We can view the default configuration used in this run and the generated log files in the workspace:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f93cac84", + "metadata": {}, + "outputs": [], + "source": [ + "!tree hello-numpy-sag-workspace" + ] + }, + { + "cell_type": "markdown", + "id": "a28e85f2", + "metadata": {}, + "source": [ + "### Default Log Config\n", + "\n", + "The **log_config.json** is the default logging configuration used. 
\n", + "This configuration comes with pre-configured handlers for console level colors, logs, error logs, structured json logs, and fl training logs using the following main sections:\n", + "\n", + "- formatters: formatting the structure of the log records\n", + "- filters: filters the log based on a specified criteria\n", + "- handler: sends logs to a destination, can use formatter and filters\n", + "- loggers: configure root logger with handlers, and any other loggers\n", + "\n", + "Let's take a look at the configuration under the server (will be the same for all sites by default):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "befa1e2a", + "metadata": {}, + "outputs": [], + "source": [ + "!cat hello-numpy-sag-workspace/server/local/log_config.json" + ] + }, + { + "cell_type": "markdown", + "id": "8ba41cfb", + "metadata": {}, + "source": [ + "### Default Log Files\n", + "\n", + "Next we can look at the various log files that are created by the FileHandlers.\n", + "Below we look at the server logs, but feel free to also check out the site logs as well." 
+ ] + }, + { + "cell_type": "markdown", + "id": "36d04d8e", + "metadata": {}, + "source": [ + "#### log.txt\n", + "\n", + "The logFileHandler uses the baseFormatter to write all logs to log.txt.\n", + "This is the default log that we see in the console:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fc69590", + "metadata": {}, + "outputs": [], + "source": [ + "!cat hello-numpy-sag-workspace/server/log.txt" + ] + }, + { + "cell_type": "markdown", + "id": "51a63e1f", + "metadata": {}, + "source": [ + "#### log.json\n", + "\n", + "The jsonFileHandler uses the jsonFormatter to write json formatted logs to log.json.\n", + "This is useful for leveraging the structured logs (ie with a 3rd party observability package):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf576cf0", + "metadata": {}, + "outputs": [], + "source": [ + "!cat hello-numpy-sag-workspace/server/log.json" + ] + }, + { + "cell_type": "markdown", + "id": "74eb7441", + "metadata": {}, + "source": [ + "#### log_error.txt\n", + "\n", + "The errorFileHandler uses the baseFormatter and level “ERROR” to write error level logs to log_error.txt.\n", + "This allows users to easily see when errors are logged:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bb3436da", + "metadata": {}, + "outputs": [], + "source": [ + "!cat hello-numpy-sag-workspace/server/log_error.txt" + ] + }, + { + "cell_type": "markdown", + "id": "a396c920", + "metadata": {}, + "source": [ + "#### log_fl.txt\n", + "\n", + "The FLFileHandler uses the baseFormatter and FLFilter (uses LoggerNameFilter allowing certain logger names) to write fl training and custom logs to log_fl.txt.\n", + "This removes the system and communication related logs and clearly shows logs related to FL training:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "526434bf", + "metadata": {}, + "outputs": [], + "source": [ + "!cat hello-numpy-sag-workspace/server/log_fl.txt" + 
] + }, + { + "cell_type": "markdown", + "id": "d30ee805", + "metadata": {}, + "source": [ + "### Customization\n", + "\n", + "The log config file can easily be customized for different use cases.\n", + "\n", + "For this example, let's assume we are a federated learning researcher mainly interested in the algorithm parts of the log.\n", + "By default, we provide the FLFileHandler to generate the log_fl.txt. However, say we would also like to have a cleaner output in the console, as well as reduce any unnecessary log fields.\n", + "\n", + "#### custom_log_config.json\n", + "\n", + "In custom_log_config.json, let's see how we modify the consoleHandler and consoleFormatter to achieve our goal of a cleaner console output.\n", + "Below are some example changes; however, feel free to experiment with different configurations:\n", + "\n", + "**Filters:** In the consoleHandler, we add the FLFilter which only allows logs related to FL training to pass through using the LoggerNameFilter:\n", + "\n", + "- ``\"filters\": [\"FLFilter\"],``\n", + "\n", + "**Log Format:** In the consoleFormatter, we remove the fl_ctx field and add the identity field for a cleaner log structure:\n", + "\n", + "- `\"fmt\": \"%(asctime)s - %(identity)s - %(name)s - %(levelname)s - %(message)s\",`\n", + "\n", + "**Log Date Format:** In the consoleFormatter, we configure the datefmt to only use seconds rather than milliseconds:\n", + "\n", + "- ``\"datefmt\": \"%Y-%m-%d %H:%M:%S\"``\n", + "\n", + "**Color Format**: In the consoleFormatter, to highlight the NPModelPersistor for example, we can color it blue to make it stand out in the console:\n", + "- ```\n", + " \"logger_colors\": {\n", + " \"NPModelPersistor\": \"blue\"\n", + " }\n", + " ```\n", + "\n", + "**Logger Hierarchy**: To turn all loggers under nvflare.app_common.aggregators to DEBUG level for example, we can configure it under loggers (note: FLARE loggers are organized matching package hierarchy with dot separated 
name, allowing for configuration at different granularities. Additionally, logs from children loggers will by default propagate up to parent loggers and their handlers):\n", + "- ```\n", + " \"nvflare.app_common.aggregators\": {\n", + " \"level\": \"DEBUG\"\n", + " }\n", + " ```\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cea18ebc", + "metadata": {}, + "outputs": [], + "source": [ + "!cat custom_log_config.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6caecd4", + "metadata": {}, + "outputs": [], + "source": [ + "!diff custom_log_config.json hello-numpy-sag-workspace/server/local/log_config.json" + ] + }, + { + "cell_type": "markdown", + "id": "2d563228", + "metadata": {}, + "source": [ + "Now let's run the simulator with the custom_log_config.json using the ``-l`` option:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b6e2cd8", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare simulator -w hello-numpy-sag-workspace -n 2 -t 2 -l custom_log_config.json ../hello-world/hello-numpy-sag/jobs/hello-numpy-sag" + ] + }, + { + "cell_type": "markdown", + "id": "d3b43521", + "metadata": {}, + "source": [ + "Compare this to the original output from the first command, and note the differences in the log output.\n", + "\n", + "In addition to the consoleHandler, all the other formatters, filters, handlers, and loggers can all also be customized just as easily.\n", + "\n", + "See the [Logging Configuration Documentation](https://nvflare.readthedocs.io/en/main/user_guide/configurations/logging_configuration.html) for more information on how to customize the different sections." 
+ ] + }, + { + "cell_type": "markdown", + "id": "6460bdd9", + "metadata": {}, + "source": [ + "## Provisioned System and Logging Configuration Commands\n", + "\n", + "For this part of the example, we will showcase how to use the [Dynamic Logging Configuration Commands](https://nvflare.readthedocs.io/en/main/user_guide/configurations/logging_configuration.html#dynamic-logging-configuration-commands) with a running FLARE system.\n", + "\n", + "To provision and start an FL system, you can use [POC mode](setup_poc.ipynb) to quickly get started. Feel free to use an existing **provisioned** FLARE project if you have that available. Remember that we recommend starting the system in a separate terminal. \n", + "\n", + "Once the system is running and you have logged into the admin console you are ready to try out the commands. We provide two admin commands to enable users to dynamically configure the site or job level logging:" + ] + }, + { + "cell_type": "markdown", + "id": "d4838020", + "metadata": {}, + "source": [ + "### configure_site_log\n", + "\n", + "Configures the site level logs, but does not affect the job logs.\n", + "\n", + "Usage: ``configure_site_log <target> <config>``\n", + "\n", + "- **target**: server, client <clients>..., or all\n", + "- **config**: log configuration\n", + " - path to a json log configuration file (/path/to/my_log_config.json)\n", + " - log level name/number (debug, INFO, 30)\n", + " - read the current log configuration file (reload)\n", + "\n", + "Try and experiment with the following commands in the admin console:" + ] + }, + { + "cell_type": "markdown", + "id": "3da8d9da", + "metadata": {}, + "source": [ + "- ``configure_site_log server debug``\n", + "- ``configure_site_log client site-1 debug``\n", + "- ``configure_site_log all info``" + ] + }, + { + "cell_type": "markdown", + "id": "390a1f1b", + "metadata": {}, + "source": [ + "### configure_job_log\n", + "\n", + "Configures the job logs, does not affect site logs.\n", + "\n", + "Usage: ``configure_job_log <job_id> <target> <config>
``\n", + "\n", + "- **job_id**: id of a running job\n", + "- **target**: server, client <clients>..., or all\n", + "- **config**: log configuration (see above)\n", + "\n", + "Submit a job with ``submit_job <path>/<to>/<job>``,\n", + "then try and experiment with the following commands in the admin console:\n" + ] + }, + { + "cell_type": "markdown", + "id": "5b4f284f", + "metadata": {}, + "source": [ + "- ``configure_job_log <job_id> server debug``\n", + "- ``configure_job_log <job_id> client site-1 debug``\n", + "- ``configure_job_log <job_id> all info``\n", + "- ``configure_job_log <job_id> all <path>/<to>/custom_log_config.json``" + ] + }, + { + "cell_type": "markdown", + "id": "cd631629", + "metadata": {}, + "source": [ + "Lastly, take a look at the generated log files in the workspace, noting the difference between the site and job logs.\n", + "For example if using POC mode:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ed5fe67", + "metadata": {}, + "outputs": [], + "source": [ + "!tree /tmp/nvflare/poc" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.0_introduction/introduction.ipynb deleted file mode 100644 index b3d2194744..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.0_introduction/introduction.ipynb +++ 
/dev/null @@ -1,48 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Runing Federated Learning Applications\n", - "\n", - "\n", - "In this chapter, we will explore the process of running federated learning applications. We will start by setting up the environment and preparing the data, followed by training a classifier using PyTorch. We will then convert deep learning models to federated learning, customize server and client logic, and setup track experiments. Finally, we will delve into the job structure and configurations, including running a simulator, and conclude with a recap of the covered topics.\n", - "\n", - "\n", - "1. **Running federated learning job**\n", - " * [Installation, prepare data](../01.1.1_running_federated_learning_job/setup.ipynb)\n", - " * [traing classifier with pytorch](../01.1.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb)\n", - "\n", - "2. [Convert deep learning with pytorch to federated leraning](../01.1.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb)\n", - "\n", - "2. [customize server logics](../01.1.3_customize_server_logics/)\n", - "\n", - "4. [customize client logics](../01.1.4_customize_client_training/customize_client_training.ipynb)\n", - "\n", - "5. [experiment tracking](../01.1.5_experiment_tracking/experiment_tracking.ipynb)\n", - "\n", - "6. **Job structure and configurations**\n", - "\n", - " * [Federated job ](../01.1.6_job_structure_and_configuration/01.1.5.1_understanding_fl_job.ipynb)\n", - "\n", - " * [job structure & configuration](../01.1.6_job_structure_and_configuration/01.1.5.2_job_structure_and_config.ipynb)\n", - "\n", - " * [running simulator](../01.1.6_job_structure_and_configuration/01.1.5.3_running_simulator.ipynb)\n", - "\n", - "7. 
[Recap of the covered topics](./01.1.8_recap.ipynb)\n", - "\n", - "\n", - "\n", - "Let's get started with [Installation & data preparation](../01.1.1_running_federated_learning_job/setup.ipynb)\n" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb deleted file mode 100644 index f6ab181ac0..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb +++ /dev/null @@ -1,177 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "b75b2253-cba8-4579-907b-09311e0da587", - "metadata": {}, - "source": [ - "# PyTorch Deep Learning to Federated Learning Conversion\n", - "\n", - "One common question frequently heard from data scientists is how do I wrote a federated learning ? If I already have training code already for deep learning? 
how do I write an federated learning training code for the same problem?\n", - "\n", - "In this section, we will look at the classification training code we ran earlier and see how to convert the existing the pytorch training script to federated Learning client training code\n", - "\n", - "\n", - "#### Orginal Deep learning Training Script" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78422d7e", - "metadata": {}, - "outputs": [], - "source": [ - "%cd 01.1.3.code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d86225e", - "metadata": {}, - "outputs": [], - "source": [ - "! python3 src/client_origin.py" - ] - }, - { - "cell_type": "markdown", - "id": "4d9db032", - "metadata": {}, - "source": [ - "#### Convert the Deep learning Training Script\n", - "\n", - "Now let's convert it to federated learning training code with NVIDIA FLARE's Client API\n" - ] - }, - { - "cell_type": "markdown", - "id": "62a0ce53", - "metadata": {}, - "source": [ - "**Step-1** import\n", - "\n", - "```\n", - "import nvflare.client as flare\n", - "\n", - "```\n", - "\n", - "**Step-2** init\n", - "\n", - "we call \n", - "\n", - "```\n", - "flare.init()\n", - "```\n", - "\n", - "Once the flare is initialized, we will recieve some system metadata for example\n", - "```\n", - " sys_info = flare.system_info()\n", - " client_name = sys_info[\"site_name\"]\n", - "\n", - "```\n", - "We can get current client's \"identity\". \n", - "\n", - "Next we need to extends the trainig beyond local iterations. Image the Federated Learning is like the following for-loop: \n", - "\n", - "```\n", - "rounds = 5\n", - "for current_round in ranage (rounds):\n", - " \n", - " \n", - "\n", - "```\n", - "\n", - "Therefore we need to additional loop for the Federated Learning training. 
This can be expressed \n", - "\n", - "**Step 3** global round loop \n", - "\n", - " while flare.is_running():\n", - " \n", - "\n", - "\n", - "For each round: we need to receive and evaluate the global model. \n", - "\n", - "\n", - "**Step-4** Recive global model \n", - "\n", - "```\n", - " input_model = flare.receive()\n", - " round=input_model.current_round\n", - "\n", - " # update model based on global model\n", - " model.load_state_dict(input_model.params)\n", - "```\n", - "\n", - "**Step-5** Eveluate Global Model\n", - "\n", - " Since the local model is being updated with global model, the training procedue caclate the loss which evaluate the model \n", - "\n", - "**Step-6** Send the local trained model back to aggregator\n", - "\n", - " we take the newly trained local model parameters as well as metadata, sned it back to aggregator. \n", - "\n", - "```\n", - "\n", - " output_model = flare.FLModel( params=model.cpu().state_dict(), meta={\"NUM_STEPS_CURRENT_ROUND\": steps},)\n", - "\n", - " flare.send(output_model)\n", - "```\n", - "\n", - "\n", - "With above steps, just a few lines of code changes, no code structural changes, we converted the pytorch deep learning code to federated learning with NVIDIA FLARE\n", - "\n", - "The complete code can be found at client.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7249afc0", - "metadata": {}, - "outputs": [], - "source": [ - "!cat src/client.py" - ] - }, - { - "cell_type": "markdown", - "id": "7f1824bf", - "metadata": {}, - "source": [ - "Now, we converted the client pytorch training script to federated learning code, lets explore further and see how can we customize the server and client training logics. 
\n", - "\n", - "Go to [customize server logics](../01.1.3_customize_server_logics/customize_server_logics.ipynb)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "7b024c86", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.3_customize_server_logics/code/src/fedavg_v1.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.3_customize_server_logics/code/src/fedavg_v1.py deleted file mode 100644 index 077f8dc33b..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.3_customize_server_logics/code/src/fedavg_v1.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from nvflare.app_common.utils.math_utils import parse_compare_criteria -from nvflare.app_common.workflows.base_fedavg import BaseFedAvg - - -class FedAvgV1(BaseFedAvg): - """FedAvg with Early Stopping - - Args: - num_clients (int, optional): The number of clients. Defaults to 3. - num_rounds (int, optional): The total number of training rounds. Defaults to 5. - stop_cond (str, optional): early stopping condition based on metric. - string literal in the format of " " (e.g. "accuracy >= 80") - """ - - def __init__( - self, - *args, - stop_cond: str = None, - **kwargs, - ): - super().__init__(*args, **kwargs) - - self.stop_cond = stop_cond - if stop_cond: - self.stop_condition = parse_compare_criteria(stop_cond) - else: - self.stop_condition = None - - def run(self) -> None: - - # self.info("Start FedAvg v1.") - - # if self.initial_model: - # # Use FOBS for serializing/deserializing PyTorch tensors (self.initial_model) - # fobs.register(TensorDecomposer) - # # PyTorch weights - # initial_weights = self.initial_model.state_dict() - # else: - # initial_weights = {} - - # model = FLModel(params=initial_weights) - - # model.start_round = self.start_round - # model.total_rounds = self.num_rounds - - # for self.current_round in range(self.start_round, self.start_round + self.num_rounds): - # self.info(f"Round {self.current_round} started.") - # model.current_round = self.current_round - - # clients = self.sample_clients(self.num_clients) - - # results = self.send_model_and_wait(targets=clients, data=model) - - # aggregate_results = self.aggregate( - # results, aggregate_fn=self.aggregate_fn - # ) # using default aggregate_fn with `WeightedAggregationHelper`. 
Can overwrite self.aggregate_fn with signature Callable[List[FLModel], FLModel] - - # model = self.update_model(model, aggregate_results) - - # self.info(f"Round {self.current_round} global metrics: {model.metrics}") - - # self.select_best_model(model) - - # self.save_model(self.best_model, os.path.join(os.getcwd(), self.save_filename)) - - # if self.should_stop(model.metrics, self.stop_condition): - # self.info( - # f"Stopping at round={self.current_round} out of total_rounds={self.num_rounds}. Early stop condition satisfied: {self.stop_condition}" - # ) - # break - - self.info("Finished FedAvg.") diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.3_customize_server_logics/customize_server_logics.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.3_customize_server_logics/customize_server_logics.ipynb deleted file mode 100644 index 272c9b839d..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.3_customize_server_logics/customize_server_logics.ipynb +++ /dev/null @@ -1,141 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "f64188d2", - "metadata": {}, - "source": [ - "\n", - "# Customizing Federated Learning Server logics\n", - "\n", - "\n", - "In previous notebooks, we are able to run federated pytorch image classification code with NVIDIA FLARE builtin FedAvg algorithm. \n", - "What if we want to build my own algorithms or modify the existing algorithm ? 
\n", - "\n", - "In the following, using FedAvg as starting point, we like to make a few changes to FedAvg to fit our needs: \n", - "\n", - "* Instead of rely on the internal best model selection approach, we want to provide our own best model selection\n", - "* Add early stopping mechanism so that the training could stop instead of waiting to the total numbers of rounds if the criteria is statisfied\n", - "* Instead of using building persiste component PTFileModelPersistor, we like to have our own save and loading functions\n", - "\n", - "\n", - "In this section, we will go over these changes step-by-step. You can find these also in [FedAvg with early stopping](https://github.com/NVIDIA/NVFlare/blob/main/examples/hello-world/hello-fedavg/hello-fedavg.ipynb) example\n", - "\n", - "\n", - "First, let's look at the FedAvg Job, which includes the FedAvg algorithm. \n", - "\n", - "## Customized FedAvg v1\n", - "\n", - "Lets starts with BaseFedAvg class and 1st modify the early stopping logics\n", - "\n", - "\n", - "```class BaseFedAvg``` provided a core based class for the customize FedAvg, it define a run() methods that capture all the running logs\n", - "as well as some utiliies. \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "### Early Stoping FedAvg\n", - "\n", - "```\n", - "class FedAvg(BaseFedAvg):\n", - " \"\"\"FedAvg with Early Stopping\n", - "\n", - " Args:\n", - " num_clients (int, optional): The number of clients. Defaults to 3.\n", - " num_rounds (int, optional): The total number of training rounds. Defaults to 5.\n", - " stop_cond (str, optional): early stopping condition based on metric.\n", - " string literal in the format of \" \" (e.g. 
\"accuracy >= 80\")\n", - " \"\"\"\n", - "\n", - " def __init__(\n", - " self,\n", - " *args,\n", - " stop_cond: str = None,\n", - " initial_model=None,\n", - " **kwargs,\n", - " ):\n", - " super().__init__(*args, **kwargs)\n", - "\n", - " self.stop_cond = stop_cond\n", - " if stop_cond:\n", - " self.stop_condition = parse_compare_criteria(stop_cond)\n", - " else:\n", - " self.stop_condition = None\n", - " \n", - " self.initial_model = initial_model\n", - " \n", - "\n", - "\n", - "\n", - "```\n", - " \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7871717", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa6d4de5", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "9beac00b", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "437f0d14", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "84b319c6", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.4_customize_client_training/customize_client_training.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.4_customize_client_training/customize_client_training.ipynb deleted file mode 
100644 index 981fb7b78f..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.4_customize_client_training/customize_client_training.ipynb +++ /dev/null @@ -1,33 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "38561a0b-a072-41ea-b027-290402cf4582", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/01.1.5.2_job_structure_and_config.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/01.1.5.2_job_structure_and_config.ipynb deleted file mode 100644 index a1f704fe3d..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/01.1.5.2_job_structure_and_config.ipynb +++ /dev/null @@ -1,229 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "b22a78a5-5175-4778-a676-d548d3f5f443", - "metadata": {}, - "source": [ - "\n", - "The job API will create the job for you. The \n", - "\n", - "```\n", - "job.simulator_run(\"/tmp/nvflare/jobs/workdir\") \n", - "\n", - "```\n", - "\n", - "is actually create a job, then use simulator run the job. 
\n", - "\n", - "Let's use \n", - "\n", - "```\n", - " job.export_job(\"/tmp/nvflare/jobs/job_config\")\n", - "```\n", - "to generate job configuration without running the job. This code will be located at [fl_job_config.py](code/fl_job_config.py)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30bd9037", - "metadata": {}, - "outputs": [], - "source": [ - "%cd code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56b7a0f9", - "metadata": {}, - "outputs": [], - "source": [ - "! python3 fl_job_config.py\n" - ] - }, - { - "cell_type": "markdown", - "id": "813c5a70", - "metadata": {}, - "source": [ - "Now we have create job configuration, let's take a closer look. " - ] - }, - { - "cell_type": "markdown", - "id": "0d77bb78", - "metadata": {}, - "source": [ - "## Job structure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63dc5ddf", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "! tree /tmp/nvflare/jobs/job_config/fedavg" - ] - }, - { - "cell_type": "markdown", - "id": "3e01e0c3", - "metadata": {}, - "source": [ - "The job name \"FedAvg\" is folder structure, with each folder representing one app at one site. \n", - "\n", - "* **\"app_server\"**: is the name for the server app\n", - "\n", - "* **\"app_site-n\"**: is the name for the client app\n", - "\n", - "* for each site: it consits of \n", - " * **config**: directory which contains side specific configuration\n", - "\n", - " * **custom**: store the custom code for the specifc site\n", - "\n", - "These names can be changed if you manually edit these configurations. By default Job API uses above conventions. \n", - "\n", - "\n", - "* meta.json gives additional information related to the each app's deployment. 
\n", - "\n", - "```\n", - "{\n", - " \"name\": \"fedavg\",\n", - " \"resource_spec\": {},\n", - " \"min_clients\": 1,\n", - " \"deploy_map\": {\n", - " \"app_server\": [\n", - " \"server\"\n", - " ],\n", - " \"app_site-1\": [\n", - " \"site-1\"\n", - " ],\n", - " \"app_site-2\": [\n", - " \"site-2\"\n", - " ],\n", - " \"app_site-3\": [\n", - " \"site-3\"\n", - " ],\n", - " \"app_site-4\": [\n", - " \"site-4\"\n", - " ],\n", - " \"app_site-5\": [\n", - " \"site-5\"\n", - " ]\n", - " }\n", - "}\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "98a7a126", - "metadata": {}, - "source": [ - "A simplifed format of job structure can also be used when the client code and configuration is the same for all sites\n", - "\n", - "```\n", - "/tmp/nvflare/jobs/job_config/fedavg\n", - "├── app_server\n", - "│ ├── config\n", - "│ │ └── config_fed_server.json\n", - "│ └── custom\n", - "│ └── src\n", - "│ └── network.py\n", - "├── app_client\n", - "│ ├── config\n", - "│ │ └── config_fed_client.json\n", - "│ └── custom\n", - "│ ├── network.py\n", - "│ └── src\n", - "│ └── client.py\n", - "└── meta.json\n", - "\n", - "\n", - "```\n", - "\n", - "meta.json needs to be \n", - "\n", - "\n", - "```\n", - "{\n", - " \"name\": \"fedavg\",\n", - " \"resource_spec\": {},\n", - " \"min_clients\": 1,\n", - " \"deploy_map\": {\n", - " \"app_server\": [\n", - " \"server\"\n", - " ],\n", - " \"app_client\": [\n", - " \"site-1\", \"site-2\", \"site-3\", \"site-4\", \"site-5\" \n", - " ]\n", - " }\n", - "}\n", - "```\n", - "\n", - "\n", - "If we don't mind deploy all code to all sites, we can change the job config into the followings\n", - "\n", - "A simplifed format of job structure can also be used when the client code and configuration is the same for all sites\n", - "\n", - "```\n", - "/tmp/nvflare/jobs/job_config/fedavg\n", - "├── app\n", - "│ ├── config\n", - " | └── config_fed_client.json\n", - "│ │ └── config_fed_server.json\n", - "│ └── custom\n", - "│ └── src\n", - "│ └── 
network.py\n", - "| └── client.py\n", - "└── meta.json\n", - "\n", - "\n", - "```\n", - "\n", - "meta.json needs to be \n", - "\n", - "\n", - "```\n", - "{\n", - " \"name\": \"fedavg\",\n", - " \"resource_spec\": {},\n", - " \"min_clients\": 1,\n", - " \"deploy_map\": {\n", - " app = [\"@ALL\"]\n", - " }\n", - "}\n", - "```\n", - "\n", - "The default Job configuration is json, but one can also use pyhocon or YAML, please refer to [config file documentation](https://nvflare.readthedocs.io/en/2.4/user_guide/configurations.html) for details\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/fl_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/fl_job.py deleted file mode 100644 index 8805ce05ed..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/fl_job.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from src.network import SimpleNetwork - -from nvflare.app_opt.pt.job_config.fed_avg import FedAvgJob -from nvflare.job_config.script_runner import ScriptRunner - -if __name__ == "__main__": - n_clients = 5 - num_rounds = 2 - - train_script = "src/client.py" - - job = FedAvgJob(name="fedavg", n_clients=n_clients, num_rounds=num_rounds, initial_model=SimpleNetwork()) - - # Add clients - for i in range(n_clients): - executor = ScriptRunner( - script=train_script, script_args="" # f"--batch_size 32 --data_path /tmp/data/site-{i}" - ) - job.to(executor, f"site-{i + 1}") - - job.simulator_run("/tmp/nvflare/jobs/workdir") diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.7_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.7_recap/recap.ipynb deleted file mode 100644 index 0aecd6b8ac..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.7_recap/recap.ipynb +++ /dev/null @@ -1,68 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "7b152728-3366-4432-adb1-29aa3051dc22", - "metadata": {}, - "source": [ - "# Recap\n", - "\n", - "What we have learnt in Chapter-1" - ] - }, - { - "cell_type": "markdown", - "id": "4f2e3cb3-e61f-45e9-8dad-ad55ebb3641a", - "metadata": {}, - "source": [ - "## Chapter 1 Federated Learning Introduction\n", - "\n", - "### Running 
Federated Learning Applications\n", - "\n", - "* Running federated hello-pytorch-lightning with simulator\n", - "\n", - "* Understanding FL Job structure: client, server and Job\n", - "\n", - "* Understanding FL Job Concepts: controller, executor and configuration\n", - "\n", - "* How to convert deep learning pytorch-lighting to federated learning\n", - "\n", - "* Customize server aggregation logic\n", - "\n", - "* Custom client apps at different sites with different training parameters\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "53cbc090-833e-4be1-9f3e-a2e977026281", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.3_convert_machine_learning_to_federated_learning/01.2.3.1_convert_dl_to_fl_contd/convert_dl_to_fl_contd.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.3_convert_machine_learning_to_federated_learning/01.2.3.1_convert_dl_to_fl_contd/convert_dl_to_fl_contd.ipynb deleted file mode 100644 index 0c2f60e059..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.3_convert_machine_learning_to_federated_learning/01.2.3.1_convert_dl_to_fl_contd/convert_dl_to_fl_contd.ipynb +++ /dev/null @@ -1,31 +0,0 @@ -{ - "cells": [ - { - "cell_type": 
"markdown", - "id": "53d2d18a-38bb-40a2-b5ba-8476970faad2", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.0_introduction/introduction.ipynb new file mode 100644 index 0000000000..96d870aff9 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.0_introduction/introduction.ipynb @@ -0,0 +1,59 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Recap: Runing Federated Learning Applications\n", + "\n", + "\n", + "In this chapter, we will explore the process of running federated learning applications. We will start by setting up the environment and preparing the data, followed by training a classifier using PyTorch. We will then convert deep learning models to federated learning, customize server and client logic, and setup track experiments. Finally, we will delve into the job structure and configurations, including running a simulator, and conclude with a recap of the covered topics.\n", + "\n", + "\n", + "1. 
**Running federated learning job**\n", + " * [Installation, prepare data](../01.1_running_federated_learning_job/setup.ipynb)\n", + " * [training a classifier with PyTorch](../01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb)\n", + "\n", + "2. **From stand-alone deep learning to Federated Learning**\n", + "\n", + " * [Convert deep learning with PyTorch to federated learning](../01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb)\n", + "\n", + "\n", + "3. **How to Customize the Federated Algorithms**\n", + "\n", + " * [customize server logics](../01.3_customize_server_logics/customize_server_logics.ipynb)\n", + "\n", + "4. **How to make adjustments to different training parameters** \n", + "\n", + " * [customize client logics](../01.4_customize_client_training/customize_client_training.ipynb)\n", + "\n", + "5. **Tracking the training metrics** \n", + "\n", + " * [experiment tracking](../01.5_experiment_tracking/experiment_tracking.ipynb)\n", + "\n", + "6. **Job structure and configurations**\n", + "\n", + " * [job structure & configuration](../01.6_job_structure_and_configuration/01.1.6.1_understanding_fl_job.ipynb)\n", + "\n", + " \n", + "7. 
[Recap of the covered topics](../01.7_recap/recap.ipynb)\n", + "\n", + "\n", + "\n", + "Let's get started with [Installation & data preparation](.././01.1_running_federated_learning_job/setup.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/data/download.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/data/download.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/data/download.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/data/download.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/fl_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/fl_job.py similarity index 92% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/fl_job.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/fl_job.py index 
8805ce05ed..5cd4c6de70 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/fl_job.py +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/fl_job.py @@ -32,4 +32,4 @@ ) job.to(executor, f"site-{i + 1}") - job.simulator_run("/tmp/nvflare/jobs/workdir") + job.simulator_run(workspace="/tmp/nvflare/jobs/workdir", log_config="./log_config.json") diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/img/cifar10.png b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/img/cifar10.png similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/img/cifar10.png rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/img/cifar10.png diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/log_config.json b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/log_config.json new file mode 100644 index 0000000000..e5732b4950 --- /dev/null +++ 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/log_config.json @@ -0,0 +1,87 @@ +{ + "version": 1, + "disable_existing_loggers": false, + "formatters": { + "baseFormatter": { + "()": "nvflare.fuel.utils.log_utils.BaseFormatter", + "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" + }, + "colorFormatter": { + "()": "nvflare.fuel.utils.log_utils.ColorFormatter", + "fmt": "%(asctime)s - %(levelname)s - %(message)s", + "datefmt": "%Y-%m-%d %H:%M:%S" + }, + "jsonFormatter": { + "()": "nvflare.fuel.utils.log_utils.JsonFormatter", + "fmt": "%(asctime)s - %(identity)s - %(name)s - %(fullName)s - %(levelname)s - %(fl_ctx)s - %(message)s" + } + }, + "filters": { + "FLFilter": { + "()": "nvflare.fuel.utils.log_utils.LoggerNameFilter", + "logger_names": ["custom", "nvflare.app_common", "nvflare.app_opt"] + } + }, + "handlers": { + "consoleHandler": { + "class": "logging.StreamHandler", + "level": "INFO", + "formatter": "colorFormatter", + "filters": ["FLFilter"], + "stream": "ext://sys.stdout" + }, + "logFileHandler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEBUG", + "formatter": "baseFormatter", + "filename": "log.txt", + "mode": "a", + "maxBytes": 20971520, + "backupCount": 10 + }, + "errorFileHandler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "ERROR", + "formatter": "baseFormatter", + "filename": "log_error.txt", + "mode": "a", + "maxBytes": 20971520, + "backupCount": 10 + }, + "jsonFileHandler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEBUG", + "formatter": "jsonFormatter", + "filename": "log.json", + "mode": "a", + "maxBytes": 20971520, + "backupCount": 10 + }, + "FLFileHandler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEBUG", + "formatter": "baseFormatter", + "filters": ["FLFilter"], + "filename": "log_fl.txt", + "mode": "a", + 
"maxBytes": 20971520, + "backupCount": 10, + "delay": true + } + }, + "loggers": { + "root": { + "level": "INFO", + "handlers": ["consoleHandler", "logFileHandler", "errorFileHandler", "jsonFileHandler", "FLFileHandler"] + } + } +} + + + + + + + + + diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/requirements.txt b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/requirements.txt similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/requirements.txt rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/requirements.txt diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/src/client.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/src/client.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/src/client.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/src/client.py diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/src/network.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/src/network.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/code/src/network.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/src/network.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb similarity index 70% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb index c5a1913a9e..e641367c31 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb +++ 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb @@ -39,20 +39,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "951d0fe6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/chester/projects/NVFlare/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/chester/.local/lib/python3.10/site-packages/IPython/core/magics/osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n", + " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n" + ] + } + ], "source": [ "%cd code " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "ecc3a0cc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/home/chester/projects/NVFlare/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/fl_job.py\", line 17, in \n", + " from nvflare.app_opt.pt.job_config.fed_avg import FedAvgJob\n", + "ModuleNotFoundError: No module named 'nvflare'\n" + ] + } + ], "source": [ "! 
python3 fl_job.py" ] @@ -98,7 +125,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/setup.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/setup.ipynb similarity index 81% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/setup.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/setup.ipynb index aae74efbdb..770844b12a 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.1_running_federated_learning_job/setup.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/setup.ipynb @@ -104,10 +104,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "87a13909", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100.0%\n", + "Extracting /tmp/nvflare/data/cifar10/cifar-10-python.tar.gz to /tmp/nvflare/data/cifar10\n", + "Files already downloaded and verified\n" + ] + } + ], "source": [ "!python3 code/data/download.py" ] @@ -122,10 +132,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "08bbe572", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "\u001b[01;34m/tmp/nvflare/data/cifar10/cifar-10-batches-py/\u001b[0m\n", + "├── \u001b[00mbatches.meta\u001b[0m\n", + "├── \u001b[00mdata_batch_1\u001b[0m\n", + "├── \u001b[00mdata_batch_2\u001b[0m\n", + "├── \u001b[00mdata_batch_3\u001b[0m\n", + "├── \u001b[00mdata_batch_4\u001b[0m\n", + "├── \u001b[00mdata_batch_5\u001b[0m\n", + "├── \u001b[00mreadme.html\u001b[0m\n", + "└── \u001b[00mtest_batch\u001b[0m\n", + "\n", + "0 directories, 8 files\n" + ] + } + ], "source": [ "!tree /tmp/nvflare/data/cifar10/cifar-10-batches-py/" ] @@ -156,7 +184,7 @@ "id": "316bae55", "metadata": {}, "source": [ - "Next Step, we will start to run training using simulation: [run pytorch federated learning job](..///01.1.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb)\n" + "Next Step, we will start to run training using simulation: [run pytorch federated learning job](../01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb)\n" ] }, { @@ -168,7 +196,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/code/src/client.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/code/src/client.py new file mode 100644 index 0000000000..2b218919da --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/code/src/client.py @@ -0,0 +1,193 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import torch +import torch.nn as nn +import torch.optim as optim +import torchvision +import torchvision.transforms as transforms +from network import SimpleNetwork + +# (1) import nvflare client API +import nvflare.client as flare +from nvflare.app_common.app_constant import ModelName + +# (optional) set a fix place so we don't need to download everytime +CIFAR10_ROOT = "/tmp/nvflare/data/cifar10" + +# (optional) We change to use GPU to speed things up. 
+# if you want to use CPU, change DEVICE="cpu" +DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +def define_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--dataset_path", type=str, default=CIFAR10_ROOT, nargs="?") + parser.add_argument("--batch_size", type=int, default=4, nargs="?") + parser.add_argument("--num_workers", type=int, default=1, nargs="?") + parser.add_argument("--local_epochs", type=int, default=2, nargs="?") + parser.add_argument("--model_path", type=str, default=f"{CIFAR10_ROOT}/cifar_net.pth", nargs="?") + return parser.parse_args() + + +def main(): + # define local parameters + args = define_parser() + + dataset_path = args.dataset_path + batch_size = args.batch_size + num_workers = args.num_workers + local_epochs = args.local_epochs + model_path = args.model_path + + transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + trainset = torchvision.datasets.CIFAR10(root=dataset_path, train=True, download=True, transform=transform) + trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers) + testset = torchvision.datasets.CIFAR10(root=dataset_path, train=False, download=True, transform=transform) + testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=num_workers) + + net = SimpleNetwork() + best_accuracy = 0.0 + + # wraps evaluation logic into a method to re-use for + # evaluation on both trained and received model + def evaluate(input_weights): + net = SimpleNetwork() + + net.load_state_dict(input_weights) + # (optional) use GPU to speed things up + net.to(DEVICE) + + correct = 0 + total = 0 + # since we're not training, we don't need to calculate the gradients for our outputs + with torch.no_grad(): + for data in testloader: + # (optional) use GPU to speed things up + images, labels = data[0].to(DEVICE), data[1].to(DEVICE) + # calculate 
outputs by running images through the network + outputs = net(images) + # the class with the highest energy is what we choose as prediction + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + return 100 * correct // total + + # (2) initialize NVFlare client API + flare.init() + + # (3) run continously when launch_once=true + while flare.is_running(): + + # (4) receive FLModel from NVFlare + input_model = flare.receive() + client_id = flare.get_site_name() + + # Based on different "task" we will do different things + # for "train" task (flare.is_train()) we use the received model to do training and/or evaluation + # and send back updated model and/or evaluation metrics, if the "train_with_evaluation" is specified as True + # in the config_fed_client we will need to do evaluation and include the evaluation metrics + # for "evaluate" task (flare.is_evaluate()) we use the received model to do evaluation + # and send back the evaluation metrics + # for "submit_model" task (flare.is_submit_model()) we just need to send back the local model + # (5) performing train task on received model + if flare.is_train(): + print(f"({client_id}) current_round={input_model.current_round}, total_rounds={input_model.total_rounds}") + + # (5.1) loads model from NVFlare + net.load_state_dict(input_model.params) + + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) + + # (optional) use GPU to speed things up + net.to(DEVICE) + # (optional) calculate total steps + steps = local_epochs * len(trainloader) + for epoch in range(local_epochs): # loop over the dataset multiple times + + running_loss = 0.0 + for i, data in enumerate(trainloader, 0): + # get the inputs; data is a list of [inputs, labels] + # (optional) use GPU to speed things up + inputs, labels = data[0].to(DEVICE), data[1].to(DEVICE) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + 
backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # print statistics + running_loss += loss.item() + if i % 2000 == 1999: # print every 2000 mini-batches + print(f"({client_id}) [{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}") + running_loss = 0.0 + break + + print(f"({client_id}) Finished Training") + + # (5.2) evaluation on local trained model to save best model + local_accuracy = evaluate(net.state_dict()) + print(f"({client_id}) Evaluating local trained model. Accuracy on the 10000 test images: {local_accuracy}") + if local_accuracy > best_accuracy: + best_accuracy = local_accuracy + torch.save(net.state_dict(), model_path) + + # (5.3) evaluate on received model for model selection + accuracy = evaluate(input_model.params) + print( + f"({client_id}) Evaluating received model for model selection. Accuracy on the 10000 test images: {accuracy}" + ) + + # (5.4) construct trained FL model + output_model = flare.FLModel( + params=net.cpu().state_dict(), + metrics={"accuracy": accuracy}, + meta={"NUM_STEPS_CURRENT_ROUND": steps}, + ) + + # (5.5) send model back to NVFlare + flare.send(output_model) + + # (6) performing evaluate task on received model + elif flare.is_evaluate(): + accuracy = evaluate(input_model.params) + print(f"({client_id}) accuracy: {accuracy}") + flare.send(flare.FLModel(metrics={"accuracy": accuracy})) + + # (7) performing submit_model task to obtain best local model + elif flare.is_submit_model(): + model_name = input_model.meta["submit_model_name"] + if model_name == ModelName.BEST_MODEL: + try: + weights = torch.load(model_path) + net = SimpleNetwork() + + net.load_state_dict(weights) + flare.send(flare.FLModel(params=net.cpu().state_dict())) + except Exception as e: + raise ValueError("Unable to load best model") from e + else: + raise ValueError(f"Unknown model_type: {model_name}") + + +if __name__ == "__main__": + main() diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.2_convert_deep_learning_to_federated_learning/code/src/client_origin.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/code/src/client_origin.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.2_convert_deep_learning_to_federated_learning/code/src/client_origin.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/code/src/client_origin.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.2_convert_deep_learning_to_federated_learning/code/src/client.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/code/src/client_v1.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.2_convert_deep_learning_to_federated_learning/code/src/client.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/code/src/client_v1.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.2_convert_deep_learning_to_federated_learning/code/src/network.py 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/code/src/network.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.2_convert_deep_learning_to_federated_learning/code/src/network.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/code/src/network.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb new file mode 100644 index 0000000000..208177e0be --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb @@ -0,0 +1,348 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b75b2253-cba8-4579-907b-09311e0da587", + "metadata": {}, + "source": [ + "# PyTorch Deep Learning to Federated Learning Conversion\n", + "\n", + "One common question frequently heard from data scientists is how do I wrote a federated learning ? If I already have training code already for deep learning? 
how do I write federated learning training code for the same problem?\n", + "\n", + "In this section, we will look at the classification training code we ran earlier and see how to convert the existing PyTorch training script to federated learning client training code\n", + "\n", + "\n", + "## Original Deep learning Training Script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78422d7e", + "metadata": {}, + "outputs": [], + "source": [ + "%cd code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d86225e", + "metadata": {}, + "outputs": [], + "source": [ + "! python3 src/client_origin.py" + ] + }, + { + "cell_type": "markdown", + "id": "4d9db032", + "metadata": {}, + "source": [ + "## Convert the Deep learning Training Script\n", + "\n", + "Now let's convert it to federated learning training code with NVIDIA FLARE's Client API\n" + ] + }, + { + "cell_type": "markdown", + "id": "62a0ce53", + "metadata": {}, + "source": [ + "**Step-1** import\n", + "\n", + "```\n", + "import nvflare.client as flare\n", + "\n", + "```\n", + "\n", + "**Step-2** init\n", + "\n", + "we call \n", + "\n", + "```\n", + "flare.init()\n", + "```\n", + "\n", + "Once flare is initialized, we will receive some system metadata, for example\n", + "```\n", + " sys_info = flare.system_info()\n", + " client_name = sys_info[\"site_name\"]\n", + "\n", + "```\n", + "We can get the current client's \"identity\". \n", + "\n", + "Next we need to extend the training beyond local iterations. Imagine that Federated Learning is like the following for-loop: \n", + "\n", + "```\n", + "rounds = 5\n", + "for current_round in range(rounds):\n", + " \n", + " \n", + "\n", + "```\n", + "\n", + "Therefore we need an additional loop for the Federated Learning training.
This can be expressed \n", + "\n", + "**Step-3** global round loop \n", + "\n", + " while flare.is_running():\n", + " \n", + "\n", + "\n", + "For each round: we need to receive and evaluate the global model. \n", + "\n", + "\n", + "**Step-4** Receive global model \n", + "\n", + "```\n", + " input_model = flare.receive()\n", + " round=input_model.current_round\n", + "\n", + " # update model based on global model\n", + " model.load_state_dict(input_model.params)\n", + "```\n", + "\n", + "**Step-5** Evaluate Global Model\n", + "\n", + " Since the local model is being updated with the global model, the training procedure calculates the loss, which evaluates the model \n", + "\n", + "**Step-6** Send the local trained model back to aggregator\n", + "\n", + " We take the newly trained local model parameters as well as metadata, and send them back to the aggregator. \n", + "\n", + "```\n", + "\n", + " output_model = flare.FLModel( params=model.cpu().state_dict(), meta={\"NUM_STEPS_CURRENT_ROUND\": steps},)\n", + "\n", + " flare.send(output_model)\n", + "```\n", + "\n", + "\n", + "With the above steps, just a few lines of code changes and no structural code changes, we converted the PyTorch deep learning code to federated learning with NVIDIA FLARE\n", + "\n", + "The complete code can be found at client_v1.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7249afc0", + "metadata": {}, + "outputs": [], + "source": [ + "!cat code/src/client_v1.py" + ] + }, + { + "cell_type": "markdown", + "id": "7f1824bf", + "metadata": {}, + "source": [ + "Now, we converted the client PyTorch training script to federated learning code. Let's look further at how to handle multi-task client code\n", + "\n", + "\n", + "## Multi-Task Client Scripts\n", + "\n", + "So far, the client only handles training, regardless of what tasks the server issues to the clients. What if there are many tasks? The client should take different actions based on the different tasks.
Also, in the previous version, we did not evaluate the global model. We will also handle all of these in this section. \n", + "\n", + "\n", + "In Flare's Client API, by default, we will issue three different tasks: \"train\", \"evaluate\" and \"submit_model\"\n", + "\n", + "These three tasks can be checked by \n", + "\n", + "```\n", + "\n", + "flare.is_train()\n", + "\n", + "flare.is_evaluate()\n", + "\n", + "flare.is_submit_model()\n", + "\n", + "```\n", + "\n", + "So we need to modify our existing training code to have both training and evaluation logic\n", + "\n", + "### Training logic changes\n", + "\n", + "Besides the training logic we have seen before, we also need to evaluate and obtain the accuracy of the training. \n", + "Here we perform two evaluations \n", + "\n", + "evaluate the local model: \n", + "\n", + "```\n", + " # (5.2) evaluation on local trained model to save best model\n", + " local_accuracy = evaluate(net.state_dict())\n", + "\n", + "\n", + "```\n", + "\n", + "evaluate the global model received \n", + "\n", + "```\n", + " # (5.3) evaluate on received model for model selection\n", + " accuracy = evaluate(input_model.params)\n", + "```\n", + "\n", + "Then add the global model accuracy into the metrics parameter of the FLModel before sending it back to the server. \n", + "\n", + "```\n", + " output_model = flare.FLModel(\n", + " params=net.cpu().state_dict(),\n", + " metrics={\"accuracy\": accuracy},\n", + " meta={\"NUM_STEPS_CURRENT_ROUND\": steps},\n", + " )\n", + "```\n", + "\n", + "\n", + "The newly added training logic is like this. \n", + "\n", + ">Note: the evaluate() function will be discussed next\n", + "\n", + "\n", + "```\n", + " \n", + "\n", + " # (5.2) evaluation on local trained model to save best model\n", + " local_accuracy = evaluate(net.state_dict())\n", + " print(f\"({client_id}) Evaluating local trained model.
Accuracy on the 10000 test images: {local_accuracy}\")\n", + " if local_accuracy > best_accuracy:\n", + " best_accuracy = local_accuracy\n", + " torch.save(net.state_dict(), model_path)\n", + "\n", + " # (5.3) evaluate on received model for model selection\n", + " accuracy = evaluate(input_model.params)\n", + " print(\n", + " f\"({client_id}) Evaluating received model for model selection. Accuracy on the 10000 test images: {accuracy}\"\n", + " )\n", + "\n", + " # (5.4) construct trained FL model\n", + " output_model = flare.FLModel(\n", + " params=net.cpu().state_dict(),\n", + " metrics={\"accuracy\": accuracy},\n", + " meta={\"NUM_STEPS_CURRENT_ROUND\": steps},\n", + " )\n", + "\n", + " # (5.5) send model back to NVFlare\n", + " flare.send(output_model)\n", + "\n", + "```\n", + "\n", + "### Evaluate functions\n", + "\n", + "The evaluate() function requires test data; it is a nested inner function that can directly use the testloader. \n", + "The return value is the accuracy percentage. \n", + "\n", + "\n", + "```\n", + "\n", + " # wraps evaluation logic into a method to re-use for\n", + " # evaluation on both trained and received model\n", + " def evaluate(input_weights):\n", + " net = SimpleNetwork()\n", + " net.load_state_dict(input_weights)\n", + " # (optional) use GPU to speed things up\n", + " net.to(DEVICE)\n", + "\n", + " correct = 0\n", + " total = 0\n", + " # since we're not training, we don't need to calculate the gradients for our outputs\n", + " with torch.no_grad():\n", + " for data in testloader:\n", + "\n", + " # (optional) use GPU to speed things up\n", + " images, labels = data[0].to(DEVICE), data[1].to(DEVICE)\n", + "\n", + " # calculate outputs by running images through the network\n", + " outputs = net(images)\n", + "\n", + " # the class with the highest energy is what we choose as prediction\n", + "\n", + " _, predicted = torch.max(outputs.data, 1)\n", + " total += labels.size(0)\n", + " correct += (predicted == labels).sum().item()\n", + "\n", + "
return 100 * correct // total\n", + "\n", + "```\n", + "\n", + "\n", + "The overall logic becomes\n", + "\n", + "```\n", + "if flare.is_train(): \n", + " train and evaluate metrics\n", + " send model and metrics back\n", + "\n", + "elif flare.is_evaluate():\n", + " # evaluate only, this can be used for cross-site evaluation\n", + " evaluate()\n", + " send the metrics back \n", + "\n", + "elif flare.is_submit_model():\n", + " \n", + " # expecting the client to submit the best model \n", + " load and send the best model \n", + "\n", + "```\n", + "\n", + "Please take a look at the [client.py](./code/src/client.py)\n" + ] + }, + { + "cell_type": "markdown", + "id": "7b024c86", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdd7a99d", + "metadata": {}, + "outputs": [], + "source": [ + "!cat code/src/client.py" + ] + }, + { + "cell_type": "markdown", + "id": "bdd0eb76", + "metadata": {}, + "source": [ + "Now, we know how to convert existing Deep Learning code to a Federated Learning training script. We can now explore how to customize the training logic.
\n", + "\n", + "Please check out [customize server logics](../01.3_customize_server_logics/customize_server_logics.ipynb)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "d3876f78", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/data/download.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/data/download.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/data/download.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/data/download.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/fl_job_config.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/fl_job.py similarity index 52% rename from
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/fl_job_config.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/fl_job.py index 01a4999a45..e3b018d079 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/fl_job_config.py +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/fl_job.py @@ -12,28 +12,41 @@ # See the License for the specific language governing permissions and # limitations under the License. + +import os + +from src.fedavg_v2 import FedAvgV2 from src.network import SimpleNetwork -from nvflare.app_opt.pt.job_config.fed_avg import FedAvgJob +from nvflare.job_config.api import FedJob from nvflare.job_config.script_runner import ScriptRunner if __name__ == "__main__": - n_clients = 5 - num_rounds = 2 - + num_clients = 5 + num_rounds = 5 + job_name = "fedavg_v2" train_script = "src/client.py" - job = FedAvgJob(name="fedavg", n_clients=n_clients, num_rounds=num_rounds, initial_model=SimpleNetwork()) + job = FedJob(name=job_name, min_clients=num_clients) + + controller = FedAvgV2( + stop_cond="accuracy > 25", + save_filename="global_model.pt", + initial_model=SimpleNetwork(), + num_clients=num_clients, + num_rounds=num_rounds, + ) + + job.to_server(controller) # Add clients - for i in range(n_clients): - executor = ScriptRunner( - script=train_script, script_args="" # f"--batch_size 32 --data_path /tmp/data/site-{i}" - ) + for i in range(num_clients): + executor = ScriptRunner(script=train_script, script_args="") job.to(executor, f"site-{i + 1}") - job_config_dir = 
"/tmp/nvflare/jobs/job_config" + job_config_dir = "/tmp/nvflare/jobs/workdir" - print(f"create job config at {job_config_dir}/fedavg") + print("job-config is at ", os.path.join(job_config_dir, job_name)) - job.export_job(job_config_dir) + # job.export_job(job_config_dir) + job.simulator_run(job_config_dir) diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/requirements.txt b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/requirements.txt similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/requirements.txt rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/requirements.txt diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/client.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/client.py new file mode 100644 index 0000000000..2b218919da --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/client.py @@ -0,0 +1,193 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import torch +import torch.nn as nn +import torch.optim as optim +import torchvision +import torchvision.transforms as transforms +from network import SimpleNetwork + +# (1) import nvflare client API +import nvflare.client as flare +from nvflare.app_common.app_constant import ModelName + +# (optional) set a fix place so we don't need to download everytime +CIFAR10_ROOT = "/tmp/nvflare/data/cifar10" + +# (optional) We change to use GPU to speed things up. +# if you want to use CPU, change DEVICE="cpu" +DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +def define_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--dataset_path", type=str, default=CIFAR10_ROOT, nargs="?") + parser.add_argument("--batch_size", type=int, default=4, nargs="?") + parser.add_argument("--num_workers", type=int, default=1, nargs="?") + parser.add_argument("--local_epochs", type=int, default=2, nargs="?") + parser.add_argument("--model_path", type=str, default=f"{CIFAR10_ROOT}/cifar_net.pth", nargs="?") + return parser.parse_args() + + +def main(): + # define local parameters + args = define_parser() + + dataset_path = args.dataset_path + batch_size = args.batch_size + num_workers = args.num_workers + local_epochs = args.local_epochs + model_path = args.model_path + + transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + trainset = torchvision.datasets.CIFAR10(root=dataset_path, train=True, download=True, transform=transform) + trainloader = 
torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers) + testset = torchvision.datasets.CIFAR10(root=dataset_path, train=False, download=True, transform=transform) + testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=num_workers) + + net = SimpleNetwork() + best_accuracy = 0.0 + + # wraps evaluation logic into a method to re-use for + # evaluation on both trained and received model + def evaluate(input_weights): + net = SimpleNetwork() + + net.load_state_dict(input_weights) + # (optional) use GPU to speed things up + net.to(DEVICE) + + correct = 0 + total = 0 + # since we're not training, we don't need to calculate the gradients for our outputs + with torch.no_grad(): + for data in testloader: + # (optional) use GPU to speed things up + images, labels = data[0].to(DEVICE), data[1].to(DEVICE) + # calculate outputs by running images through the network + outputs = net(images) + # the class with the highest energy is what we choose as prediction + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + return 100 * correct // total + + # (2) initialize NVFlare client API + flare.init() + + # (3) run continously when launch_once=true + while flare.is_running(): + + # (4) receive FLModel from NVFlare + input_model = flare.receive() + client_id = flare.get_site_name() + + # Based on different "task" we will do different things + # for "train" task (flare.is_train()) we use the received model to do training and/or evaluation + # and send back updated model and/or evaluation metrics, if the "train_with_evaluation" is specified as True + # in the config_fed_client we will need to do evaluation and include the evaluation metrics + # for "evaluate" task (flare.is_evaluate()) we use the received model to do evaluation + # and send back the evaluation metrics + # for "submit_model" task (flare.is_submit_model()) we just 
need to send back the local model + # (5) performing train task on received model + if flare.is_train(): + print(f"({client_id}) current_round={input_model.current_round}, total_rounds={input_model.total_rounds}") + + # (5.1) loads model from NVFlare + net.load_state_dict(input_model.params) + + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) + + # (optional) use GPU to speed things up + net.to(DEVICE) + # (optional) calculate total steps + steps = local_epochs * len(trainloader) + for epoch in range(local_epochs): # loop over the dataset multiple times + + running_loss = 0.0 + for i, data in enumerate(trainloader, 0): + # get the inputs; data is a list of [inputs, labels] + # (optional) use GPU to speed things up + inputs, labels = data[0].to(DEVICE), data[1].to(DEVICE) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # print statistics + running_loss += loss.item() + if i % 2000 == 1999: # print every 2000 mini-batches + print(f"({client_id}) [{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}") + running_loss = 0.0 + break + + print(f"({client_id}) Finished Training") + + # (5.2) evaluation on local trained model to save best model + local_accuracy = evaluate(net.state_dict()) + print(f"({client_id}) Evaluating local trained model. Accuracy on the 10000 test images: {local_accuracy}") + if local_accuracy > best_accuracy: + best_accuracy = local_accuracy + torch.save(net.state_dict(), model_path) + + # (5.3) evaluate on received model for model selection + accuracy = evaluate(input_model.params) + print( + f"({client_id}) Evaluating received model for model selection. 
Accuracy on the 10000 test images: {accuracy}" + ) + + # (5.4) construct trained FL model + output_model = flare.FLModel( + params=net.cpu().state_dict(), + metrics={"accuracy": accuracy}, + meta={"NUM_STEPS_CURRENT_ROUND": steps}, + ) + + # (5.5) send model back to NVFlare + flare.send(output_model) + + # (6) performing evaluate task on received model + elif flare.is_evaluate(): + accuracy = evaluate(input_model.params) + print(f"({client_id}) accuracy: {accuracy}") + flare.send(flare.FLModel(metrics={"accuracy": accuracy})) + + # (7) performing submit_model task to obtain best local model + elif flare.is_submit_model(): + model_name = input_model.meta["submit_model_name"] + if model_name == ModelName.BEST_MODEL: + try: + weights = torch.load(model_path) + net = SimpleNetwork() + + net.load_state_dict(weights) + flare.send(flare.FLModel(params=net.cpu().state_dict())) + except Exception as e: + raise ValueError("Unable to load best model") from e + else: + raise ValueError(f"Unknown model_type: {model_name}") + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v0.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v0.py new file mode 100644 index 0000000000..abed9a0681 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v0.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from nvflare.app_common.abstract.fl_model import FLModel +from nvflare.app_common.workflows.base_fedavg import BaseFedAvg +from nvflare.app_opt.pt.decomposers import TensorDecomposer +from nvflare.fuel.utils import fobs + + +class FedAvgV0(BaseFedAvg): + def __init__( + self, + *args, + initial_model=None, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.initial_model = initial_model + fobs.register(TensorDecomposer) + + def run(self) -> None: + + if self.initial_model: + initial_weights = self.initial_model.state_dict() + else: + initial_weights = {} + + model = FLModel(params=initial_weights) + + model.start_round = self.start_round + model.total_rounds = self.num_rounds + + for self.current_round in range(self.start_round, self.start_round + self.num_rounds): + self.info(f"Round {self.current_round} started.") + model.current_round = self.current_round + + clients = self.sample_clients(self.num_clients) + + results = self.send_model_and_wait(targets=clients, data=model) + + # using default aggregate_fn with `WeightedAggregationHelper`. 
+ # Can overwrite self.aggregate_fn with signature Callable[List[FLModel], FLModel] + aggregate_results = self.aggregate(results, aggregate_fn=self.aggregate_fn) + + model = self.update_model(model, aggregate_results) + + self.info(f"Round {self.current_round} global metrics: {model.metrics}") + + self.info("Finished FedAvg.") diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v1.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v1.py new file mode 100644 index 0000000000..b0508d33eb --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v1.py @@ -0,0 +1,91 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Dict, Optional + +from nvflare.app_common.abstract.fl_model import FLModel +from nvflare.app_common.utils.math_utils import parse_compare_criteria +from nvflare.app_common.workflows.base_fedavg import BaseFedAvg +from nvflare.app_opt.pt.decomposers import TensorDecomposer +from nvflare.fuel.utils import fobs + + +class FedAvgV1(BaseFedAvg): + + def __init__( + self, + *args, + stop_cond: str = None, + initial_model=None, + **kwargs, + ): + super().__init__(*args, **kwargs) + + self.stop_cond = stop_cond + if stop_cond: + self.stop_condition = parse_compare_criteria(stop_cond) + else: + self.stop_condition = None + + self.initial_model = initial_model + fobs.register(TensorDecomposer) + + def run(self) -> None: + if self.initial_model: + initial_weights = self.initial_model.state_dict() + else: + initial_weights = {} + + model = FLModel(params=initial_weights) + + model.start_round = self.start_round + model.total_rounds = self.num_rounds + + for self.current_round in range(self.start_round, self.start_round + self.num_rounds): + + self.info(f"Round {self.current_round} started.") + + model.current_round = self.current_round + + clients = self.sample_clients(self.num_clients) + + results = self.send_model_and_wait(targets=clients, data=model) + + # using default aggregate_fn with `WeightedAggregationHelper`. + # Can overwrite self.aggregate_fn with signature Callable[List[FLModel], FLModel] + aggregate_results = self.aggregate(results, aggregate_fn=self.aggregate_fn) + model = self.update_model(model, aggregate_results) + + self.info(f"Round {self.current_round} global metrics: {model.metrics}") + + if self.should_stop(model.metrics, self.stop_condition): + self.info( + f"Stopping at round={self.current_round} out of total_rounds={self.num_rounds}. 
Early stop condition satisfied: {self.stop_condition}" + ) + break + + self.info("Finished FedAvg.") + + def should_stop(self, metrics: Optional[Dict] = None, stop_condition: Optional[str] = None): + if stop_condition is None or metrics is None: + return False + + key, target, op_fn = stop_condition + value = metrics.get(key, None) + + if value is None: + raise RuntimeError(f"stop criteria key '{key}' doesn't exists in metrics") + + return op_fn(value, target) diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v2.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v2.py new file mode 100644 index 0000000000..5b8f4f9019 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v2.py @@ -0,0 +1,149 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from typing import Callable, Dict, List, Optional + +import torch + +from nvflare.app_common.abstract.fl_model import FLModel +from nvflare.app_common.utils.math_utils import parse_compare_criteria +from nvflare.app_common.workflows.base_fedavg import BaseFedAvg +from nvflare.app_opt.pt.decomposers import TensorDecomposer +from nvflare.fuel.utils import fobs + + +class FedAvgV2(BaseFedAvg): + """Controller for FedAvg Workflow with Early Stopping and Model Selection. + + Args: + num_clients (int, optional): The number of clients. Defaults to 3. + num_rounds (int, optional): The total number of training rounds. Defaults to 5. + stop_cond (str, optional): early stopping condition based on metric. + string literal in the format of "<key> <op> <value>" (e.g. "accuracy >= 80"). Defaults to None (no early stopping). + save_filename (str, optional): filename for saving model + initial_model (nn.Module, optional): initial PyTorch model + """ + + def __init__(self, *args, stop_cond: Optional[str] = None, save_filename: str = "FL_global_model.pt", initial_model=None, **kwargs): + super().__init__(*args, **kwargs) + + self.stop_cond = stop_cond + + if stop_cond: + self.stop_condition = parse_compare_criteria(stop_cond) + else: + self.stop_condition = None + self.save_filename = save_filename + self.initial_model = initial_model + self.best_model: Optional[FLModel] = None + + def run(self) -> None: + self.info("Start FedAvg.") + + if self.initial_model: + # Use FOBS for serializing/deserializing PyTorch tensors (self.initial_model) + fobs.register(TensorDecomposer) + # PyTorch weights + initial_weights = self.initial_model.state_dict() + else: + initial_weights = {} + + model = FLModel(params=initial_weights) + + model.start_round = self.start_round + model.total_rounds = self.num_rounds + + for self.current_round in range(self.start_round, self.start_round + self.num_rounds): + self.info(f"Round {self.current_round} started.") + model.current_round = self.current_round + + clients = self.sample_clients(self.num_clients) + + results: 
List[FLModel] = self.send_model_and_wait(targets=clients, data=model) + aggregate_results = self.aggregate( + results, aggregate_fn=self.aggregate_fn + ) # using default aggregate_fn with `WeightedAggregationHelper`. Can overwrite self.aggregate_fn with signature Callable[List[FLModel], FLModel] + + model = self.update_model(model, aggregate_results) + + self.info(f"Round {self.current_round} global metrics: {model.metrics}") + + self.select_best_model(model) + + self.save_model(self.best_model, os.path.join(os.getcwd(), self.save_filename)) + + if self.should_stop(model.metrics, self.stop_condition): + self.info( + f"Stopping at round={self.current_round} out of total_rounds={self.num_rounds}. Early stop condition satisfied: {self.stop_condition}" + ) + break + + self.info("Finished FedAvg.") + + def should_stop(self, metrics: Optional[Dict] = None, stop_condition: Optional[str] = None): + if stop_condition is None or metrics is None: + return False + + key, target, op_fn = stop_condition + value = metrics.get(key, None) + + if value is None: + raise RuntimeError(f"stop criteria key '{key}' doesn't exists in metrics") + + return op_fn(value, target) + + def select_best_model(self, curr_model: FLModel): + if self.best_model is None: + self.best_model = curr_model + return + + if self.stop_condition: + metric, _, op_fn = self.stop_condition + if self.is_curr_model_better(self.best_model, curr_model, metric, op_fn): + self.info("Current model is new best model.") + self.best_model = curr_model + else: + self.best_model = curr_model + + def is_curr_model_better( + self, best_model: FLModel, curr_model: FLModel, target_metric: str, op_fn: Callable + ) -> bool: + """Return True when curr_model beats best_model on target_metric according to op_fn.""" + curr_metrics = curr_model.metrics + if curr_metrics is None: + return False + if target_metric not in curr_metrics: + return False + + best_metrics = best_model.metrics + if best_metrics is None or target_metric not in best_metrics: + # the incumbent has no value for this metric, so any scored model replaces it + return True + return op_fn(curr_metrics.get(target_metric), best_metrics.get(target_metric)) + + def save_model(self, model, filepath=""): + params = 
model.params + # PyTorch save + torch.save(params, filepath) + + # save FLModel metadata + model.params = {} + fobs.dumpf(model, filepath + ".metadata") + model.params = params + + def load_model(self, filepath=""): + # PyTorch load + params = torch.load(filepath) + + # load FLModel metadata + model = fobs.loadf(filepath + ".metadata") + model.params = params + return model diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/src/network.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/network.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/src/network.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/network.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/customize_server_logics.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/customize_server_logics.ipynb new file mode 100644 index 0000000000..ed22affd71 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.3_customize_server_logics/customize_server_logics.ipynb @@ -0,0 +1,431 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f64188d2", + "metadata": {}, + "source": [ + "\n", + "# Customizing Federated Learning Server logics\n", + 
"\n", + "\n", + "In previous sections, we are able to run federated pytorch image classification code with NVIDIA FLARE builtin FedAvg algorithm. \n", + "What if we want to build my own algorithms or modify the existing algorithm ? \n", + "\n", + "In the following, using FedAvg as starting point, we like to make a few changes to FedAvg to fit our needs: \n", + "\n", + "* Add early stopping mechanism so that the training could stop instead of waiting to the total numbers of rounds if the criteria is statisfied\n", + "* Instead of rely on the internal best model selection approach, we want to provide our own best model selection\n", + "* Instead of using building persiste component PTFileModelPersistor, we like to have our own save and loading functions\n", + "\n", + "\n", + "In this section, we will go over these changes step-by-step. \n", + "\n", + "> Reference:\n", + "> _[FedAvg with early stopping](https://github.com/NVIDIA/NVFlare/blob/main/examples/hello-world/hello-fedavg/hello-fedavg.ipynb) example_\n", + "\n", + "\n", + "## Customized FedAvg v1\n", + "\n", + "There are several factors to consider:\n", + "\n", + "* **How to write a Federated Avg Algorithms** \n", + "\n", + "* **How to express and apply the early stop condition** \n", + "\n", + "\n", + "### Write a FedAvg Algorithm\n", + "\n", + "FedAvg can be written as very simple for-loop. There are several other factors to consider \n", + "\n", + "* How to send the model to clients?\n", + "* How to receive the response \n", + "* for the model and response, what's the format ? \n", + "* The model and responses and corresponding objects must be serialized, how to series them ? 
\n", + "\n", + "Let's dive into these questions.\n", + "\n", + "\n", + "#### Transfer Structure: FLModel\n", + "\n", + "FLARE defines a high-level data structure \"FLModel\" that holds the model parameters, metrics and metadata\n", + "\n", + "```\n", + "\n", + "class ParamsType(str, Enum):\n", + " FULL = \"FULL\"\n", + " DIFF = \"DIFF\"\n", + "\n", + "\n", + "class FLModel:\n", + " def __init__(\n", + " self,\n", + " params_type: Union[None, str, ParamsType] = None,\n", + " params: Any = None,\n", + " optimizer_params: Any = None,\n", + " metrics: Optional[Dict] = None,\n", + " start_round: Optional[int] = 0,\n", + " current_round: Optional[int] = None,\n", + " total_rounds: Optional[int] = None,\n", + " meta: Optional[Dict] = None,\n", + " ):\n", + "\n", + "```\n", + "The data can be packaged into an FLModel and transferred between clients and server, as well as among clients. \n", + "\n", + "\n", + "#### Serialization \n", + "\n", + "Many deep learning frameworks use Python pickle as their default serialization mechanism. Pickle raises enough security concerns that FLARE does not use it. Instead, the NVIDIA FLARE Object Serializer (FOBS) uses a [MessagePack](https://msgpack.org/index.html)-based serialization approach. \n", + "Users need to register a component (a \"Decomposer\") with FOBS to serialize/deserialize objects of a given type. \n", + "\n", + "For PyTorch tensors, we need to register the [TensorDecomposer](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_opt/pt/decomposers.py) component with FOBS. \n", + "\n", + "```\n", + " # Use FOBS for serializing/deserializing PyTorch tensors\n", + " fobs.register(TensorDecomposer)\n", + "```\n", + "\n", + "#### Send and Receive Objects\n", + "\n", + "With the high-level API, we can use the following:\n", + "\n", + "```\n", + " results = self.send_model_and_wait(targets=clients, data=model)\n", + "```\n", + "This function sends the FLModel to the targeted clients and receives their results. It is a synchronous method, like scatter and gather. 
We broadcast the model to all targeted clients and receive results when required clients send back the results. \n", + "\n", + "BaseFedAvg is derived from ModelController, which provides the communication component that lets the controller send the model and wait for the results. \n", + "\n", + "\n", + "Now that we have covered these factors, let's write a class and see how it looks.\n", + "\n", + "We will start with the BaseFedAvg class. ```class BaseFedAvg``` provides a core base class for customizing FedAvg; it defines a run() method that captures all the running logic\n", + "as well as some utilities. We can look at the initial version of the code\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "832de87f", + "metadata": {}, + "outputs": [], + "source": [ + "! cat code/src/fedavg_v0.py\n" + ] + }, + { + "cell_type": "markdown", + "id": "ab85e1f1", + "metadata": {}, + "source": [ + "Now that we have our own FedAvg version, let's look into how to stop the training early.\n", + "\n", + "### Express and apply the early stop condition\n", + "\n", + "#### Stop Condition\n", + "\n", + "```stop_cond``` is a string that represents the stop condition; it is a string literal in the format of `\"<key> <op> <value>\"` (e.g. \"accuracy >= 80\")\n", + "\n", + "We need to parse this condition so that we can evaluate it. 
To parse this, we leverage FLARE's math_utils\n", + "```\n", + "\n", + "math_utils.parse_compare_criteria(compare_expr: Optional[str] = None) -> Tuple[str, float, Callable]\n", + "\n", + "```\n", + "The return value is a tuple of:\n", + "* key,\n", + "* target_value,\n", + "* callable op_fn\n", + "\n", + "For example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7871717", + "metadata": {}, + "outputs": [], + "source": [ + "from nvflare.app_common.utils.math_utils import parse_compare_criteria\n", + "key, target_value, fn= parse_compare_criteria(\"accuracy > 80\")\n", + "print (key, target_value, fn)\n", + "accuracy = 90\n", + "fn (accuracy, target_value)" + ] + }, + { + "cell_type": "markdown", + "id": "aa6d4de5", + "metadata": {}, + "source": [ + "#### Integrate the early stop condition\n", + "\n", + "This is simple: if the condition is satisfied, we break out of the for-loop.\n", + "\n", + "```\n", + " if self.should_stop(model.metrics, self.stop_condition):\n", + " break\n", + "```\n", + "\n", + "and the ```should_stop``` function is defined as follows\n", + "\n", + "```\n", + "def should_stop(self, metrics: Optional[Dict] = None, stop_condition: Optional[str] = None):\n", + " key, target, op_fn = stop_condition\n", + " value = metrics.get(key, None)\n", + " return op_fn(value, target)\n", + "```\n", + "\n", + "The full code can be found in [fedavg_v1.py](code/src/fedavg_v1.py)\n" + ] + }, + { + "cell_type": "markdown", + "id": "9beac00b", + "metadata": {}, + "source": [ + "## Customized FedAvg v2\n", + "\n", + "We have successfully modified the FedAvg logic to allow users to specify early stop conditions. \n", + "Now, we want to make additional changes\n", + "\n", + "* We would like to implement our own best model selection\n", + "* We would like to have our own model saving and loading instead of using FLARE's persistor. 
\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "437f0d14", + "metadata": {}, + "source": [ + "### Select best model \n", + "\n", + "we simply write the following two functions and put into previus code\n", + "\n", + "```\n", + " def select_best_model(self, curr_model: FLModel):\n", + " if self.best_model is None:\n", + " self.best_model = curr_model\n", + " return\n", + "\n", + " if self.stop_condition:\n", + " metric, _, op_fn = self.stop_condition\n", + " if self.is_curr_model_better(self.best_model, curr_model, metric, op_fn):\n", + " self.info(\"Current model is new best model.\")\n", + " self.best_model = curr_model\n", + " else:\n", + " self.best_model = curr_model\n", + "\n", + " def is_curr_model_better(\n", + " self, best_model: FLModel, curr_model: FLModel, target_metric: str, op_fn: Callable\n", + " ) -> bool:\n", + " curr_metrics = curr_model.metrics\n", + " if curr_metrics is None:\n", + " return False\n", + " if target_metric not in curr_metrics:\n", + " return False\n", + "\n", + " best_metrics = best_model.metrics\n", + " return op_fn(curr_metrics.get(target_metric), best_metrics.get(target_metric))\n", + "\n", + "```\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "84b319c6", + "metadata": {}, + "source": [ + "### Customized save and load model functions\n", + " \n", + "The ```BaseFedAvg``` class defined ```save_model()``` and ```load_model()``` functions for user to overwrite. 
\n", + "We use torch save and load functions, and save the FLModel metadata separately with the fobs.dumpf and fobs.loadf serialization utilities.\n", + "\n", + "\n", + "\n", + " def save_model(self, model, filepath=\"\"):\n", + " params = model.params\n", + " # PyTorch save\n", + " torch.save(params, filepath)\n", + "\n", + " # save FLModel metadata\n", + " model.params = {}\n", + " fobs.dumpf(model, filepath + \".metadata\")\n", + " model.params = params\n", + "\n", + " def load_model(self, filepath=\"\"):\n", + " # PyTorch load\n", + " params = torch.load(filepath)\n", + "\n", + " # load FLModel metadata\n", + " model = fobs.loadf(filepath + \".metadata\")\n", + " model.params = params\n", + " return model\n" + ] + }, + { + "cell_type": "markdown", + "id": "1d40eb84", + "metadata": {}, + "source": [ + "## Add Evaluation to the training code\n", + "\n", + "We need to add the evaluation code to the training code to compute accuracy. We then use accuracy to select best model\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "a0a83a06", + "metadata": {}, + "source": [ + "## Running Customized FedAvg\n", + "\n", + "Now, put everything together in [fedavg_v2](code/src/fedavg_v2.py), we can take a look at the server code\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7b0cd39", + "metadata": {}, + "outputs": [], + "source": [ + "!cat code/src/fedavg_v2.py" + ] + }, + { + "cell_type": "markdown", + "id": "e3b3b848", + "metadata": {}, + "source": [ + "Lets create Job with our newly modified FedAvgV2. 
\n", + "\n", + "### Create Fed Job\n", + "\n", + "```\n", + " n_clients = 5\n", + " num_rounds = 2\n", + "\n", + " train_script = \"src/client.py\"\n", + "\n", + " job = FedJob(name=\"fedavg_v2\", n_clients=n_clients)\n", + "\n", + " controller = FedAvgV2(\n", + " num_clients=n_clients,\n", + " num_rounds=num_rounds,\n", + " stop_cond = None,\n", + " save_filename = \"global_model.pt\",\n", + " initial_model=SimpleNetwork())\n", + " \n", + " job.to_server(controller)\n", + "\n", + " # Add clients\n", + " for i in range(n_clients):\n", + " executor = ScriptRunner(\n", + " script=train_script, script_args=\"\"\n", + " )\n", + " job.to(executor, f\"site-{i + 1}\")\n", + "\n", + " job.simulator_run(\"/tmp/nvflare/jobs/workdir\")\n", + "\n", + "\n", + "```\n", + "\n", + "\n", + "\n", + "\n", + "### Run job with simulator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca7ae30a", + "metadata": {}, + "outputs": [], + "source": [ + "! pip install nvflare" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eead6dcc", + "metadata": {}, + "outputs": [], + "source": [ + "! pip install -r code/requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2575cdcf", + "metadata": {}, + "outputs": [], + "source": [ + "!python3 code/data/download.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "465add19", + "metadata": {}, + "outputs": [], + "source": [ + "%cd code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c620fe19", + "metadata": {}, + "outputs": [], + "source": [ + "! 
python3 fl_job.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e096501e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/data/download.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/data/download.py new file mode 100644 index 0000000000..ebd8cfdc41 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/data/download.py @@ -0,0 +1,60 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This Dirichlet sampling strategy for creating a heterogeneous partition is adopted +# from FedMA (https://github.com/IBM/FedMA). 
+ +# MIT License + +# Copyright (c) 2020 International Business Machines + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+import argparse + +import torchvision.datasets as datasets + +# default dataset path +CIFAR10_ROOT = "/tmp/nvflare/data/cifar10" + + +def define_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--dataset_path", type=str, default=CIFAR10_ROOT, nargs="?") + args = parser.parse_args() + return args + + +def main(args): + datasets.CIFAR10(root=args.dataset_path, train=True, download=True) + datasets.CIFAR10(root=args.dataset_path, train=False, download=True) + + +if __name__ == "__main__": + main(define_parser()) diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/fl_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/fl_job.py new file mode 100644 index 0000000000..8fc6f73846 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/fl_job.py @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import os + +from src.fedavg import FedAvg +from src.network import SimpleNetwork + +from nvflare.job_config.api import FedJob +from nvflare.job_config.script_runner import ScriptRunner + +if __name__ == "__main__": + num_clients = 5 + num_rounds = 5 + job_name = "fedavg" + train_script = "src/client.py" + + job = FedJob(name=job_name, min_clients=num_clients) + + controller = FedAvg( + stop_cond="accuracy > 25", + save_filename="global_model.pt", + initial_model=SimpleNetwork(), + num_clients=num_clients, + num_rounds=num_rounds, + ) + + job.to_server(controller) + + # Add clients: each site is bound to its own executor so its learning rate / batch size take effect + + executor_1 = ScriptRunner(script=train_script, script_args="--learning_rate 0.01 --batch_size 12") + job.to(executor_1, "site-1") + + executor_2 = ScriptRunner(script=train_script, script_args="--learning_rate 0.01 --batch_size 10") + job.to(executor_2, "site-2") + + executor_3 = ScriptRunner(script=train_script, script_args="--learning_rate 0.001 --batch_size 8") + job.to(executor_3, "site-3") + + executor_4 = ScriptRunner(script=train_script, script_args="--learning_rate 0.001 --batch_size 6") + job.to(executor_4, "site-4") + + executor_5 = ScriptRunner(script=train_script, script_args="--learning_rate 0.0001 --batch_size 4") + job.to(executor_5, "site-5") + + job_config_dir = "/tmp/nvflare/jobs/workdir" + + print("job-config is at ", os.path.join(job_config_dir, job_name)) + + # job.export_job(job_config_dir) + job.simulator_run(job_config_dir) diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/requirements.txt b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/requirements.txt new file mode 100644 index 0000000000..57b4df2ed4 --- /dev/null +++ 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +tensorboard \ No newline at end of file diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/client.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/client.py new file mode 100644 index 0000000000..220559b3cf --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/client.py @@ -0,0 +1,193 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import torch +import torch.nn as nn +import torch.optim as optim +import torchvision +import torchvision.transforms as transforms +from network import SimpleNetwork + +# (1) import nvflare client API +import nvflare.client as flare +from nvflare.app_common.app_constant import ModelName + +# (optional) set a fix place so we don't need to download everytime +CIFAR10_ROOT = "/tmp/nvflare/data/cifar10" + +# (optional) We change to use GPU to speed things up. 
+# if you want to use CPU, change DEVICE="cpu" +DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +def define_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--dataset_path", type=str, default=CIFAR10_ROOT, nargs="?") + parser.add_argument("--batch_size", type=int, default=4, nargs="?") + parser.add_argument("--learning_rate", type=float, default=0.001, nargs="?") + parser.add_argument("--num_workers", type=int, default=1, nargs="?") + parser.add_argument("--local_epochs", type=int, default=2, nargs="?") + parser.add_argument("--model_path", type=str, default=f"{CIFAR10_ROOT}/cifar_net.pth", nargs="?") + return parser.parse_args() + + +def main(): + # define local parameters + args = define_parser() + + dataset_path = args.dataset_path + batch_size = args.batch_size + num_workers = args.num_workers + local_epochs = args.local_epochs + model_path = args.model_path + lr = args.learning_rate + + transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + trainset = torchvision.datasets.CIFAR10(root=dataset_path, train=True, download=True, transform=transform) + trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers) + testset = torchvision.datasets.CIFAR10(root=dataset_path, train=False, download=True, transform=transform) + testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=num_workers) + + net = SimpleNetwork() + best_accuracy = 0.0 + + # wraps evaluation logic into a method to re-use for + # evaluation on both trained and received model + def evaluate(input_weights): + net = SimpleNetwork() + net.load_state_dict(input_weights) + # (optional) use GPU to speed things up + net.to(DEVICE) + + correct = 0 + total = 0 + # since we're not training, we don't need to calculate the gradients for our outputs + with torch.no_grad(): + for data in testloader: + # 
(optional) use GPU to speed things up + images, labels = data[0].to(DEVICE), data[1].to(DEVICE) + # calculate outputs by running images through the network + outputs = net(images) + # the class with the highest energy is what we choose as prediction + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + return 100 * correct // total + + # (2) initialize NVFlare client API + flare.init() + + # (3) run continously when launch_once=true + while flare.is_running(): + + # (4) receive FLModel from NVFlare + input_model = flare.receive() + client_id = flare.get_site_name() + + # Based on different "task" we will do different things + # for "train" task (flare.is_train()) we use the received model to do training and/or evaluation + # and send back updated model and/or evaluation metrics, if the "train_with_evaluation" is specified as True + # in the config_fed_client we will need to do evaluation and include the evaluation metrics + # for "evaluate" task (flare.is_evaluate()) we use the received model to do evaluation + # and send back the evaluation metrics + # for "submit_model" task (flare.is_submit_model()) we just need to send back the local model + # (5) performing train task on received model + if flare.is_train(): + print(f"({client_id}) current_round={input_model.current_round}, total_rounds={input_model.total_rounds}") + + # (5.1) loads model from NVFlare + net.load_state_dict(input_model.params) + + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9) + + # (optional) use GPU to speed things up + net.to(DEVICE) + # (optional) calculate total steps + steps = local_epochs * len(trainloader) + for epoch in range(local_epochs): # loop over the dataset multiple times + + running_loss = 0.0 + for i, data in enumerate(trainloader, 0): + # get the inputs; data is a list of [inputs, labels] + # (optional) use GPU to speed things up + inputs, labels = 
data[0].to(DEVICE), data[1].to(DEVICE) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # print statistics + running_loss += loss.item() + if i % 2000 == 1999: # print every 2000 mini-batches + print(f"({client_id}) [{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}") + running_loss = 0.0 + break + + print(f"({client_id}) Finished Training") + + # (5.2) evaluation on local trained model to save best model + local_accuracy = evaluate(net.state_dict()) + print(f"({client_id}) Evaluating local trained model. Accuracy on the 10000 test images: {local_accuracy}") + if local_accuracy > best_accuracy: + best_accuracy = local_accuracy + torch.save(net.state_dict(), model_path) + + # (5.3) evaluate on received model for model selection + accuracy = evaluate(input_model.params) + print( + f"({client_id}) Evaluating received model for model selection. 
Accuracy on the 10000 test images: {accuracy}" + ) + + # (5.4) construct trained FL model + output_model = flare.FLModel( + params=net.cpu().state_dict(), + metrics={"accuracy": accuracy}, + meta={"NUM_STEPS_CURRENT_ROUND": steps}, + ) + + # (5.5) send model back to NVFlare + flare.send(output_model) + + # (6) performing evaluate task on received model + elif flare.is_evaluate(): + accuracy = evaluate(input_model.params) + print(f"({client_id}) accuracy: {accuracy}") + flare.send(flare.FLModel(metrics={"accuracy": accuracy})) + + # (7) performing submit_model task to obtain best local model + elif flare.is_submit_model(): + model_name = input_model.meta["submit_model_name"] + if model_name == ModelName.BEST_MODEL: + try: + weights = torch.load(model_path) + net = SimpleNetwork() + net.load_state_dict(weights) + flare.send(flare.FLModel(params=net.cpu().state_dict())) + except Exception as e: + raise ValueError("Unable to load best model") from e + else: + raise ValueError(f"Unknown model_type: {model_name}") + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/fedavg.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/fedavg.py new file mode 100644 index 0000000000..2792cbfee5 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/fedavg.py @@ -0,0 +1,149 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Callable, Dict, List, Optional + +import torch + +from nvflare.app_common.abstract.fl_model import FLModel +from nvflare.app_common.utils.math_utils import parse_compare_criteria +from nvflare.app_common.workflows.base_fedavg import BaseFedAvg +from nvflare.app_opt.pt.decomposers import TensorDecomposer +from nvflare.fuel.utils import fobs + + +class FedAvg(BaseFedAvg): + """Controller for FedAvg Workflow with Early Stopping and Model Selection. + + Args: + num_clients (int, optional): The number of clients. Defaults to 3. + num_rounds (int, optional): The total number of training rounds. Defaults to 5. + stop_cond (str, optional): early stopping condition based on metric. + string literal in the format of "<key> <op> <value>" (e.g.
"accuracy >= 80") + save_filename (str, optional): filename for saving model + initial_model (nn.Module, optional): initial PyTorch model + """ + + def __init__(self, *args, stop_cond: str, save_filename: str = "FL_global_model.pt", initial_model=None, **kwargs): + super().__init__(*args, **kwargs) + + self.stop_cond = stop_cond + + if stop_cond: + self.stop_condition = parse_compare_criteria(stop_cond) + else: + self.stop_condition = None + self.save_filename = save_filename + self.initial_model = initial_model + self.best_model: Optional[FLModel] = None + + def run(self) -> None: + self.info("Start FedAvg.") + + if self.initial_model: + # Use FOBS for serializing/deserializing PyTorch tensors (self.initial_model) + fobs.register(TensorDecomposer) + # PyTorch weights + initial_weights = self.initial_model.state_dict() + else: + initial_weights = {} + + model = FLModel(params=initial_weights) + + model.start_round = self.start_round + model.total_rounds = self.num_rounds + + for self.current_round in range(self.start_round, self.start_round + self.num_rounds): + self.info(f"Round {self.current_round} started.") + model.current_round = self.current_round + + clients = self.sample_clients(self.num_clients) + + results: List[FLModel] = self.send_model_and_wait(targets=clients, data=model) + aggregate_results = self.aggregate( + results, aggregate_fn=self.aggregate_fn + ) # using default aggregate_fn with `WeightedAggregationHelper`. Can overwrite self.aggregate_fn with signature Callable[List[FLModel], FLModel] + + model = self.update_model(model, aggregate_results) + + self.info(f"Round {self.current_round} global metrics: {model.metrics}") + + self.select_best_model(model) + + self.save_model(self.best_model, os.path.join(os.getcwd(), self.save_filename)) + + if self.should_stop(model.metrics, self.stop_condition): + self.info( + f"Stopping at round={self.current_round} out of total_rounds={self.num_rounds}. 
Early stop condition satisfied: {self.stop_condition}" + ) + break + + self.info("Finished FedAvg.") + + def should_stop(self, metrics: Optional[Dict] = None, stop_condition: Optional[str] = None): + if stop_condition is None or metrics is None: + return False + + key, target, op_fn = stop_condition + value = metrics.get(key, None) + + if value is None: + raise RuntimeError(f"stop criteria key '{key}' doesn't exists in metrics") + + return op_fn(value, target) + + def select_best_model(self, curr_model: FLModel): + if self.best_model is None: + self.best_model = curr_model + return + + if self.stop_condition: + metric, _, op_fn = self.stop_condition + if self.is_curr_model_better(self.best_model, curr_model, metric, op_fn): + self.info("Current model is new best model.") + self.best_model = curr_model + else: + self.best_model = curr_model + + def is_curr_model_better( + self, best_model: FLModel, curr_model: FLModel, target_metric: str, op_fn: Callable + ) -> bool: + curr_metrics = curr_model.metrics + if curr_metrics is None: + return False + if target_metric not in curr_metrics: + return False + + best_metrics = best_model.metrics + return op_fn(curr_metrics.get(target_metric), best_metrics.get(target_metric)) + + def save_model(self, model, filepath=""): + params = model.params + # PyTorch save + torch.save(params, filepath) + + # save FLModel metadata + model.params = {} + fobs.dumpf(model, filepath + ".metadata") + model.params = params + + def load_model(self, filepath=""): + # PyTorch load + params = torch.load(filepath) + + # load FLModel metadata + model = fobs.loadf(filepath + ".metadata") + model.params = params + return model diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/fl_job.py 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/fl_job.py new file mode 100644 index 0000000000..8fc6f73846 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/fl_job.py @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import os + +from src.fedavg import FedAvg +from src.network import SimpleNetwork + +from nvflare.job_config.api import FedJob +from nvflare.job_config.script_runner import ScriptRunner + +if __name__ == "__main__": + num_clients = 5 + num_rounds = 5 + job_name = "fedavg" + train_script = "src/client.py" + + job = FedJob(name=job_name, min_clients=num_clients) + + controller = FedAvg( + stop_cond="accuracy > 25", + save_filename="global_model.pt", + initial_model=SimpleNetwork(), + num_clients=num_clients, + num_rounds=num_rounds, + ) + + job.to_server(controller) + + # Add clients + + executor_1 = ScriptRunner(script=train_script, script_args="--learning_rate 0.01 --batch_size 12") + job.to(executor_1, "site-1") + + executor_2 = ScriptRunner(script=train_script, script_args="--learning_rate 0.01 --batch_size 10") + job.to(executor_2, "site-2") + + executor_3 = ScriptRunner(script=train_script, script_args="--learning_rate 0.001 --batch_size 8") + job.to(executor_3, "site-3") + + executor_4 = ScriptRunner(script=train_script, script_args="--learning_rate 0.001 --batch_size 6") + job.to(executor_4, "site-4") + + executor_5 = ScriptRunner(script=train_script, script_args="--learning_rate 0.0001 --batch_size 4") + job.to(executor_5, "site-5") + + job_config_dir = "/tmp/nvflare/jobs/workdir" + + print("job-config is at ", os.path.join(job_config_dir, job_name)) + + # job.export_job(job_config_dir) + job.simulator_run(job_config_dir) diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/network.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/network.py new file mode 100644 index 0000000000..609b0b1581 --- /dev/null +++
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/network.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SimpleNetwork(nn.Module): + def __init__(self): + super(SimpleNetwork, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = torch.flatten(x, 1) # flatten all dimensions except batch + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/customize_client_training.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/customize_client_training.ipynb new file mode 100644 index 0000000000..56f1a33a89 --- /dev/null +++ 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.4_customize_client_training/customize_client_training.ipynb @@ -0,0 +1,125 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "38561a0b-a072-41ea-b027-290402cf4582", + "metadata": {}, + "source": [ + "# Customize client training scripts for different sites\n" + ] + }, + { + "cell_type": "markdown", + "id": "3f020f8b", + "metadata": {}, + "source": [ + "The client training scripts so far assume that all sites use the same training parameters. In real-world applications, each site's data will be different, so training parameters such as batch size and learning rate may differ from site to site.\n", + "\n", + "\n", + "In this section, we will show how to set different parameters for each site.\n", + "\n", + "```\n", + " # Add clients\n", + "\n", + " executor_1 = ScriptRunner(script=train_script, script_args=\"--learning_rate 0.01 --batch_size 12\")\n", + " job.to(executor_1, \"site-1\")\n", + "\n", + " executor_2 = ScriptRunner(script=train_script, script_args=\"--learning_rate 0.01 --batch_size 10\")\n", + " job.to(executor_2, \"site-2\")\n", + " \n", + " executor_3 = ScriptRunner(script=train_script, script_args=\"--learning_rate 0.001 --batch_size 8\")\n", + " job.to(executor_3, \"site-3\")\n", + "\n", + " executor_4 = ScriptRunner(script=train_script, script_args=\"--learning_rate 0.001 --batch_size 6\")\n", + " job.to(executor_4, \"site-4\")\n", + " \n", + " executor_5 = ScriptRunner(script=train_script, script_args=\"--learning_rate 0.0001 --batch_size 4\")\n", + " job.to(executor_5, \"site-5\")\n", + "\n", + "```\n", + "\n", + "Let's see what effect this has on the training accuracy.\n" + ] + }, + { + "cell_type": "markdown", + "id": "933b0778", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!
pip install nvflare" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install -r code/requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python3 code/data/download.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! python3 fl_job.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7966ab3a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.5_experiment_tracking/experiment_tracking.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.5_experiment_tracking/experiment_tracking.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.5_experiment_tracking/experiment_tracking.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.5_experiment_tracking/experiment_tracking.ipynb diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/01.1.5.1_understanding_fl_job.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/01.1.6.1_understanding_fl_job.ipynb similarity index 68% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/01.1.5.1_understanding_fl_job.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/01.1.6.1_understanding_fl_job.ipynb index 73f7952d43..ae93a347f5 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/01.1.5.1_understanding_fl_job.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/01.1.6.1_understanding_fl_job.ipynb @@ -43,10 +43,19 @@ " \n", " train_script = \"src/client.py\"\n", "\n", - " job = FedAvgJob(\n", - " name=\"fedavg\", n_clients=n_clients, num_rounds=num_rounds, initial_model=SimpleNetwork()\n", + "\n", + " job = FedJob(name=job_name, min_clients=num_clients)\n", + "\n", + " controller = FedAvg(\n", + " stop_cond=\"accuracy > 25\",\n", + " save_filename=\"global_model.pt\",\n", + " initial_model=SimpleNetwork(),\n", + " num_clients=num_clients,\n", + " num_rounds=num_rounds,\n", " )\n", "\n", + " job.to_server(controller)\n", + "\n", " # Add clients\n", " for i in range(n_clients):\n", " executor = ScriptRunner(\n", @@ -61,41 +70,7 @@ "\n", "### Server\n", "\n", - "In this 
code, we used FedAvgJob class. If we look at the inside of the FedAvgJob class, its just a wrapper class for pytorch FedJob adding a server FedAvg algorithm\n", - "\n", - "\n", - "```\n", - " job = FedAvgJob(\n", - " name=\"fedavg\", n_clients=n_clients, num_rounds=num_rounds, initial_model=SimpleNetwork()\n", - " )\n", - "```\n", - "\n", - "```\n", - "\n", - "class FedAvgJob(BaseFedJob):\n", - " def __init__(\n", - " self,\n", - " initial_model: nn.Module,\n", - " n_clients: int,\n", - " num_rounds: int,\n", - " name: str = \"fed_job\",\n", - " min_clients: int = 1,\n", - " mandatory_clients: Optional[List[str]] = None,\n", - " key_metric: str = \"accuracy\",\n", - " ):\n", - " \n", - " if not isinstance(initial_model, nn.Module):\n", - " raise ValueError(f\"Expected initial model to be nn.Module, but got type f{type(initial_model)}.\")\n", - "\n", - " super().__init__(initial_model, name, min_clients, mandatory_clients, key_metric)\n", - "\n", - " controller = FedAvg(\n", - " num_clients=n_clients,\n", - " num_rounds=num_rounds,\n", - " persistor_id=self.comp_ids[\"persistor_id\"],\n", - " )\n", - " self.to_server(controller)\n", - "```\n", + "We create a FedJob, then create a FedAvg algorithm (called a Controller; details later) and add it to the server of the FedJob.\n", "\n", "\n", "#### Client Side\n", @@ -332,22 +307,178 @@ " }\n", "}\n", "```\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "b0d07393", + "metadata": {}, + "source": [ + "## Job Configuration\n", + "\n", + "We have covered a lot of ground so far. You could stop here and move to the next chapter of the training materials. \n", + "\n", + "But if you would like to further understand how NVIDIA FLARE works, you might want to go through this section: Job Configuration. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b61e8ac", + "metadata": {}, + "outputs": [], + "source": [ + "%cd code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0892c07e", + "metadata": {}, + "outputs": [], + "source": [ + "! tree /tmp/nvflare/jobs/workdir/fedavg/\n" + ] + }, + { + "cell_type": "markdown", + "id": "6edc997b", + "metadata": {}, + "source": [ + "At each site, there is a job configuration file: \n", + "\n", + "\n", + "* ```config_fed_client.json``` or\n", + "* ```config_fed_server.json```\n", + "\n", + "These are the job configurations." + ] + }, + { + "cell_type": "markdown", + "id": "885aeb7a", + "metadata": {}, + "source": [ + "### Server Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed26c66b", + "metadata": {}, + "outputs": [], + "source": [ + "! cat /tmp/nvflare/jobs/workdir/fedavg/app_server/config/config_fed_server.json" + ] + }, + { + "cell_type": "markdown", + "id": "a75c80c4", + "metadata": {}, + "source": [ + "The server configuration is a json file that describes the workflows. In our case, we defined one workflow, which has a controller using our defined FedAvg class. \n", + "\n", + "\n", + ">Note: The configuration pattern is as follows\n", + "```\n", + " id: <component id>,\n", + " path: <class path>,\n", + " args: {\n", + " class constructor arguments\n", + " }\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "c56170bf", + "metadata": {}, + "source": [ + "### Client Configurations\n", + "\n", + "We look at the site-1 client's configuration " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64f5fd1a", + "metadata": {}, + "outputs": [], + "source": [ + "!
cat /tmp/nvflare/jobs/workdir/fedavg/app_site-1//config/config_fed_client.json" + ] + }, + { + "cell_type": "markdown", + "id": "d9753aeb", + "metadata": {}, + "source": [ + "The configuration is similar: it defines an array of \"executors\"; a built-in ```PTInProcessClientAPIExecutor``` is used, \n", + "which takes the training script client.py and its corresponding arguments as input. \n", + "\n", + "\n", + "```\n", + " \"executor\": {\n", + " \"path\": \"nvflare.app_opt.pt.in_process_client_api_executor.PTInProcessClientAPIExecutor\",\n", + " \"args\": {\n", + " \"task_script_path\": \"src/client.py\",\n", + " \"task_script_args\": \"--learning_rate 0.01 --batch_size 12\"\n", + " }\n", + " }\n", + "\n", + "```\n", + "\n", "\n", "The default Job configuration is json, but one can also use pyhocon or YAML, please refer to [config file documentation](https://nvflare.readthedocs.io/en/2.4/user_guide/configurations.html) for details\n" ] }, + { + "cell_type": "markdown", + "id": "3dc17db6", + "metadata": {}, + "source": [ + "## Simulator CLI\n", + "\n", + "With these job configurations, one can run the simulator directly from the command line. Here is the syntax, and we will use it to run our previous job \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24055f32", + "metadata": {}, + "outputs": [], + "source": [ + "! nvflare simulator --help" + ] + }, { "cell_type": "code", "execution_count": null, "id": "a99ebc6c", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "!nvflare simulator /tmp/nvflare/jobs/workdir/fedavg/ -w /tmp/nvflare/jobs/workdir/fedavg/workspace -n 5 -t 5 " + ] + }, + { + "cell_type": "markdown", + "id": "e8914e76", + "metadata": {}, + "source": [ + "Hopefully you now have a good understanding of working with NVIDIA FLARE jobs. Let's move on to other chapters. 
" + ] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/code/data/download.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/code/data/download.py new file mode 100644 index 0000000000..ebd8cfdc41 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/code/data/download.py @@ -0,0 +1,60 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This Dirichlet sampling strategy for creating a heterogeneous partition is adopted +# from FedMA (https://github.com/IBM/FedMA). 
+ +# MIT License + +# Copyright (c) 2020 International Business Machines + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
# Default directory the CIFAR-10 archives are downloaded into.
CIFAR10_ROOT = "/tmp/nvflare/data/cifar10"


def define_parser(argv=None):
    """Parse the command-line options of the CIFAR-10 download script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) ``argparse`` reads ``sys.argv[1:]``, which is what the
            script entry point relies on.

    Returns:
        argparse.Namespace: namespace whose ``dataset_path`` attribute is the
        directory the dataset should be stored in.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default=CIFAR10_ROOT,
        nargs="?",
        # nargs="?" allows the flag to appear without a value; without ``const``
        # that case would produce None instead of a usable directory.
        const=CIFAR10_ROOT,
    )
    return parser.parse_args(argv)


def main(args):
    """Download the CIFAR-10 train and test splits into ``args.dataset_path``."""
    datasets.CIFAR10(root=args.dataset_path, train=True, download=True)
    datasets.CIFAR10(root=args.dataset_path, train=False, download=True)


if __name__ == "__main__":
    main(define_parser())
if __name__ == "__main__":
    # Build a FedAvg job in which every site trains with its own hyper-parameters,
    # then run it in the simulator.
    num_clients = 5
    num_rounds = 5
    job_name = "fedavg"
    train_script = "src/client.py"

    # Per-site command-line arguments handed to the training script.
    # Keeping them in one table guarantees each site receives its own executor;
    # previously site-4 and site-5 were accidentally given site-3's executor.
    site_script_args = {
        "site-1": "--learning_rate 0.01 --batch_size 12",
        "site-2": "--learning_rate 0.01 --batch_size 10",
        "site-3": "--learning_rate 0.001 --batch_size 8",
        "site-4": "--learning_rate 0.001 --batch_size 6",
        "site-5": "--learning_rate 0.0001 --batch_size 4",
    }

    job = FedJob(name=job_name, min_clients=num_clients)

    controller = FedAvg(
        stop_cond="accuracy > 25",
        save_filename="global_model.pt",
        initial_model=SimpleNetwork(),
        num_clients=num_clients,
        num_rounds=num_rounds,
    )

    job.to_server(controller)

    # Add clients: one ScriptRunner per site, each with that site's arguments.
    for site_name, script_args in site_script_args.items():
        executor = ScriptRunner(script=train_script, script_args=script_args)
        job.to(executor, site_name)

    job_config_dir = "/tmp/nvflare/jobs/workdir"

    print("job-config is at ", os.path.join(job_config_dir, job_name))

    # Alternative: job.export_job(job_config_dir) can be called here instead of
    # the simulator run.
    job.simulator_run(job_config_dir)
if __name__ == "__main__":
    # Build a FedAvg job in which every site trains with its own hyper-parameters,
    # then export the generated job configuration to disk.
    num_clients = 5
    num_rounds = 5
    job_name = "fedavg"
    train_script = "src/client.py"

    # Per-site command-line arguments handed to the training script.
    # Keeping them in one table guarantees each site receives its own executor;
    # previously site-4 and site-5 were accidentally given site-3's executor.
    site_script_args = {
        "site-1": "--learning_rate 0.01 --batch_size 12",
        "site-2": "--learning_rate 0.01 --batch_size 10",
        "site-3": "--learning_rate 0.001 --batch_size 8",
        "site-4": "--learning_rate 0.001 --batch_size 6",
        "site-5": "--learning_rate 0.0001 --batch_size 4",
    }

    job = FedJob(name=job_name, min_clients=num_clients)

    controller = FedAvg(
        stop_cond="accuracy > 25",
        save_filename="global_model.pt",
        initial_model=SimpleNetwork(),
        num_clients=num_clients,
        num_rounds=num_rounds,
    )

    job.to_server(controller)

    # Add clients: one ScriptRunner per site, each with that site's arguments.
    for site_name, script_args in site_script_args.items():
        executor = ScriptRunner(script=train_script, script_args=script_args)
        job.to(executor, site_name)

    job_config_dir = "/tmp/nvflare/jobs/workdir"

    print("job-config is at ", os.path.join(job_config_dir, job_name))

    job.export_job(job_config_dir)
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/src/client.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/code/src/client.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/code/src/client.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/code/src/client.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/code/src/fedavg.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/code/src/fedavg.py new file mode 100644 index 0000000000..2792cbfee5 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/code/src/fedavg.py @@ -0,0 +1,149 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class FedAvg(BaseFedAvg):
    """Controller for FedAvg Workflow with Early Stopping and Model Selection.

    Args:
        num_clients (int, optional): The number of clients. Defaults to 3.
        num_rounds (int, optional): The total number of training rounds. Defaults to 5.
        stop_cond (str, optional): early stopping condition based on metric.
            string literal in the format of "<key> <op> <value>" (e.g. "accuracy >= 80")
        save_filename (str, optional): filename for saving model
        initial_model (nn.Module, optional): initial PyTorch model
    """

    def __init__(self, *args, stop_cond: str, save_filename: str = "FL_global_model.pt", initial_model=None, **kwargs):
        super().__init__(*args, **kwargs)

        self.stop_cond = stop_cond

        # Parsed form of ``stop_cond``: a (metric key, target value, comparison fn)
        # triple, or None when no condition was given.
        self.stop_condition = parse_compare_criteria(stop_cond) if stop_cond else None
        self.save_filename = save_filename
        self.initial_model = initial_model
        self.best_model: Optional[FLModel] = None

    def run(self) -> None:
        """Run the FedAvg rounds, saving the best model after each round.

        The loop ends after ``num_rounds`` rounds or as soon as the stop
        condition is satisfied by the global metrics of a round.
        """
        self.info("Start FedAvg.")

        if self.initial_model:
            # Use FOBS for serializing/deserializing PyTorch tensors (self.initial_model)
            fobs.register(TensorDecomposer)
            # PyTorch weights
            initial_weights = self.initial_model.state_dict()
        else:
            initial_weights = {}

        model = FLModel(params=initial_weights)

        model.start_round = self.start_round
        model.total_rounds = self.num_rounds

        for self.current_round in range(self.start_round, self.start_round + self.num_rounds):
            self.info(f"Round {self.current_round} started.")
            model.current_round = self.current_round

            clients = self.sample_clients(self.num_clients)

            results: List[FLModel] = self.send_model_and_wait(targets=clients, data=model)
            # using default aggregate_fn with `WeightedAggregationHelper`. Can overwrite
            # self.aggregate_fn with signature Callable[List[FLModel], FLModel]
            aggregate_results = self.aggregate(results, aggregate_fn=self.aggregate_fn)

            model = self.update_model(model, aggregate_results)

            self.info(f"Round {self.current_round} global metrics: {model.metrics}")

            self.select_best_model(model)

            self.save_model(self.best_model, os.path.join(os.getcwd(), self.save_filename))

            if self.should_stop(model.metrics, self.stop_condition):
                self.info(
                    f"Stopping at round={self.current_round} out of total_rounds={self.num_rounds}. Early stop condition satisfied: {self.stop_condition}"
                )
                break

        self.info("Finished FedAvg.")

    def should_stop(self, metrics: Optional[Dict] = None, stop_condition: Optional[tuple] = None):
        """Tell whether ``metrics`` satisfy the early-stopping condition.

        Args:
            metrics: metrics of the current global model, or None.
            stop_condition: parsed (key, target, op_fn) triple, or None.

        Returns:
            False when either argument is None, otherwise ``op_fn(metrics[key], target)``.

        Raises:
            RuntimeError: if the metric named by the condition is absent from ``metrics``.
        """
        if stop_condition is None or metrics is None:
            return False

        key, target, op_fn = stop_condition
        value = metrics.get(key, None)

        if value is None:
            raise RuntimeError(f"stop criteria key '{key}' doesn't exists in metrics")

        return op_fn(value, target)

    def select_best_model(self, curr_model: FLModel):
        """Update ``self.best_model`` with ``curr_model`` when it qualifies.

        Without a stop condition there is no metric to compare on, so the most
        recent model always becomes the best model.
        """
        if self.best_model is None:
            self.best_model = curr_model
            return

        if self.stop_condition:
            metric, _, op_fn = self.stop_condition
            if self.is_curr_model_better(self.best_model, curr_model, metric, op_fn):
                self.info("Current model is new best model.")
                self.best_model = curr_model
        else:
            self.best_model = curr_model

    def is_curr_model_better(
        self, best_model: FLModel, curr_model: FLModel, target_metric: str, op_fn: Callable
    ) -> bool:
        """Compare two models on ``target_metric`` using ``op_fn``.

        Returns:
            False when the current model does not report the metric; True when it
            does but the best model does not; otherwise
            ``op_fn(current value, best value)``.
        """
        curr_metrics = curr_model.metrics
        if curr_metrics is None:
            return False
        curr_value = curr_metrics.get(target_metric)
        if curr_value is None:
            return False

        # The stored best model may have been accepted without metrics (the first
        # model is always accepted). Comparing against a missing value would raise,
        # so any model that actually reports the metric wins.
        best_metrics = best_model.metrics
        best_value = None if best_metrics is None else best_metrics.get(target_metric)
        if best_value is None:
            return True

        return op_fn(curr_value, best_value)

    def save_model(self, model, filepath=""):
        """Save ``model`` as a PyTorch params file plus a FOBS ``.metadata`` file.

        Args:
            model: FLModel to persist.
            filepath: destination of the params file; metadata goes to
                ``filepath + ".metadata"``.
        """
        params = model.params
        # PyTorch save
        torch.save(params, filepath)

        # save FLModel metadata; params are stripped so they are not written twice
        model.params = {}
        try:
            fobs.dumpf(model, filepath + ".metadata")
        finally:
            # Always hand the params back, even if serialization failed, so the
            # in-memory model is never left empty.
            model.params = params

    def load_model(self, filepath=""):
        """Load a model previously written by ``save_model``.

        Args:
            filepath: location of the params file; metadata is read from
                ``filepath + ".metadata"``.

        Returns:
            The deserialized model with its ``params`` restored.
        """
        # PyTorch load
        params = torch.load(filepath)

        # load FLModel metadata
        model = fobs.loadf(filepath + ".metadata")
        model.params = params
        return model
class SimpleNetwork(nn.Module):
    """Small convolutional classifier: two conv + max-pool stages and three linear layers.

    The first linear layer takes 16 * 5 * 5 flattened features and the last one
    produces 10 values per sample.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor (the pooling layer is shared by both stages).
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Fully connected head.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the 10 output scores for each sample in the batch ``x``."""
        features = self.pool(F.relu(self.conv1(x)))
        features = self.pool(F.relu(self.conv2(features)))
        # Flatten every dimension except the batch dimension.
        flat = features.flatten(start_dim=1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
**Running Federated Learning Job**\n", + " - **Installation and Data Preparation**: Instructions for setting up the environment and preparing the data.\n", + " - [setup.ipynb](../01.1_running_federated_learning_job/setup.ipynb)\n", + " - **Training Classifier with PyTorch**: Steps to train a classifier using PyTorch in a federated learning setup.\n", + " - [runing_pytorch_fl_job.ipynb](../01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb)\n", + "\n", + "2. **From Stand-Alone Deep Learning to Federated Learning**\n", + " - **Conversion to Federated Learning**: Guide on converting deep learning models with PyTorch to federated learning.\n", + " - [convert_dl_to_fl.ipynb](../01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb)\n", + "\n", + "3. **Customizing the Federated Algorithms**\n", + " - **Server Logic Customization**: Techniques to customize server logic for specific federated learning needs. Here we build our own FedAvg algorithm with best model selection, model saving and loading, as well as early stopping. \n", + " - [customize_server_logics.ipynb](../01.3_customize_server_logics/customize_server_logics.ipynb)\n", + "\n", + "4. **Adjusting Training Parameters**\n", + " - **Client Logic Customization**: Methods to customize client logic to optimize training parameters. Here we show how to customize the training for each site. \n", + " - [customize_client_training.ipynb](../01.4_customize_client_training/customize_client_training.ipynb)\n", + "\n", + "5. **Tracking Training Metrics**\n", + " - **Experiment Tracking**: Tools and methods to track experiments and monitor training metrics effectively.\n", + " - [experiment_tracking.ipynb](../01.5_experiment_tracking/experiment_tracking.ipynb)\n", + "\n", + "6.
**Job Structure and Configurations**\n", + " - **Understanding Job Structure and Configuration**: Detailed explanation of the job structure and configurations necessary for running federated learning jobs.\n", + " - [01.1.6.1_understanding_fl_job.ipynb](../01.6_job_structure_and_configuration/01.1.6.1_understanding_fl_job.ipynb)\n", + "\n", + "7. **Recap of Covered Topics**\n", + " - **Summary and Recap**: A recap of the topics covered in the previous sections.\n", + " - [recap.ipynb](../01.7_recap/recap.ipynb)\n", + "\n", + "Each section is designed to provide comprehensive guidance and practical examples to help you implement and customize federated learning in your applications. For detailed instructions and examples, refer to the respective notebooks linked in each section.\n", + "\n", + "\n", + "Now let's move on to the [Chapter 2](../../Chapter-2_develop_federated_learning_applications/02.0_introduction/introduction.ipynb\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4f2e3cb3-e61f-45e9-8dad-ad55ebb3641a", + "metadata": {}, + "source": [ + " \n", + "\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nvflare_example", + "language": "python", + "name": "nvflare_example" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.0_introduction/introduction.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.0_introduction/introduction.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.0_introduction/introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.1_federated_statistics/01.2.1.1_federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.1_federated_statistics/0federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.1_federated_statistics/01.2.1.1_federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.1_federated_statistics/0federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.1_federated_statistics/01.2.1.2_federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.1_federated_statistics/01.2.1.2_federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/01.1.5.3_running_simulator.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_dl.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.1_running_federated_learning_applications/01.1.6_job_structure_and_configuration/01.1.5.3_running_simulator.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_dl.ipynb diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.3_convert_machine_learning_to_federated_learning/01.2.3.2_convert_Logistics_regression_to_federated_learning/convert_lr_to_fl.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_Logistics_regression_to_federated_learning/convert_lr_to_fl.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.3_convert_machine_learning_to_federated_learning/01.2.3.2_convert_Logistics_regression_to_federated_learning/convert_lr_to_fl.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_Logistics_regression_to_federated_learning/convert_lr_to_fl.ipynb diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.3_convert_machine_learning_to_federated_learning/01.2.3.3_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.3_convert_machine_learning_to_federated_learning/01.2.3.3_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.3_convert_machine_learning_to_federated_learning/01.2.3.4_convert_survival_analysis_to_federated_learning/01.2.6_convert_survival_analysis_to_fl.ipynb 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.3_convert_machine_learning_to_federated_learning/01.2.3.4_convert_survival_analysis_to_federated_learning/01.2.6_convert_survival_analysis_to_fl.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.4_flare_low_level_apis.ipynb/04.4.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.4_client_api/Client_api.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.4_flare_low_level_apis.ipynb/04.4.0_introduction/introduction.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.4_client_api/Client_api.ipynb diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.4_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.5_recap/recap.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-1.2_develop_federated_learning_applications/01.2.4_recap/recap.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/Chapter-2_develop_federated_learning_applications/02.5_recap/recap.ipynb diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/part_1_introduction.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/part_1_introduction.ipynb index 6738e51b62..5c44122bf5 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/part_1_introduction.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/part_1_introduction.ipynb @@ -18,27 +18,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "[Chapter 1.1: Running federated learning applications](Chapter-1.1_running_federated_learning_applications/01.1.0_introduction/introduction.ipynb)\n", + "[Chapter 1.1: Running federated learning applications](./Chapter-1_running_federated_learning_applications/01.0_introduction/introduction.ipynb)\n", "\n", - "[Chapter 1.2: Develop federated learning applications](Chapter-1.2_develop_federated_learning_applications/01.2.0_introduction/introduction.ipynb)\n" + "[Chapter 1.2: Develop federated learning applications](./Chapter-2_develop_federated_learning_applications/02.0_introduction/introduction.ipynb)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In Part 1, we explored the fundamentals of federated learning. We covered:\n", + "In Part 1, we explored the fundamentals of federated learning. 
We will cover:\n", "\n", - "#### Chapter 1.1: \n", + "#### Chapter 1: \n", "* How to train an image classification model with PyTorch\n", - "* Understanding the federated job structure\n", "* How to convert a standard PyTorch training code to federated learning code\n", "* How to customize client and server side logics\n", + "* Understanding the federated job structure and configurations\n", "\n", - "#### Chapter 1.2: \n", - "We delved into \n", + "#### Chapter 2: \n", "* federated statistics for both image and tabular data.\n", - "* how to convert traditional ML training code to federated learning code," + "* convert PyTorch Lightning to federated learning\n", + "* convert traditional ML training code to federated learning code,\n", + "* FLARE Client API" ] }, { diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.0_introduction/introduction.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.0_introduction/introduction.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.0_introduction/introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.1_federated_computing_architecture/federated_computing_architecture.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.1_federated_computing_architecture/federated_computing_architecture.ipynb similarity index 100% rename from
examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.1_federated_computing_architecture/federated_computing_architecture.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.1_federated_computing_architecture/federated_computing_architecture.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.2_deployment_simulation/02.1.2_simulate_real_world_deployment.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.2_deployment_simulation/simulate_real_world_deployment.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.2_deployment_simulation/02.1.2_simulate_real_world_deployment.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.2_deployment_simulation/simulate_real_world_deployment.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.3_interact_with_federated_computing_system/02.1.3.0 ways_to_interact_with_fl_system.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/03.3.0 ways_to_interact_with_fl_system.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.3_interact_with_federated_computing_system/02.1.3.0 ways_to_interact_with_fl_system.ipynb rename to 
examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/03.3.0 ways_to_interact_with_fl_system.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.3_interact_with_federated_computing_system/02.1.3.1_interaction_via_admin_console.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/03.3.1_interaction_via_admin_console.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.3_interact_with_federated_computing_system/02.1.3.1_interaction_via_admin_console.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/03.3.1_interaction_via_admin_console.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.3_interact_with_federated_computing_system/02.1.3.2_interaction_system_with_python_api.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/03.3.2_interaction_system_with_python_api.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.3_interact_with_federated_computing_system/02.1.3.2_interaction_system_with_python_api.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/03.3.2_interaction_system_with_python_api.ipynb diff --git 
a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.3_interact_with_federated_computing_system/02.1.3.3_Interaction_with_cli.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/03.3.3_Interaction_with_cli.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.3_interact_with_federated_computing_system/02.1.3.3_Interaction_with_cli.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/03.3.3_Interaction_with_cli.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.4_system_monitoring/system_monitorinig.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.4_system_monitoring/system_monitorinig.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.4_system_monitoring/system_monitorinig.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.4_system_monitoring/system_monitorinig.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.5_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.5_recap/recap.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.1_federated_computing_platform/02.1.5_recap/recap.ipynb rename to 
examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.5_recap/recap.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.0_introduction/introduction.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.0_introduction/introduction.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.0_introduction/introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.1_provision_via_cli/provision_via_cli.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.1_provision_via_cli/provision_via_cli.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.1_provision_via_cli/provision_via_cli.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.1_provision_via_cli/provision_via_cli.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.2_provision_via_dashboard/provision_via_dashboard.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.2_provision_via_dashboard/provision_via_dashboard.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.2_provision_via_dashboard/provision_via_dashboard.ipynb rename 
to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.2_provision_via_dashboard/provision_via_dashboard.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.3_provision_and_run_with_docker/provision_and_run_with_dockers.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.3_provision_and_run_with_docker/provision_and_run_with_dockers.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.3_provision_and_run_with_docker/provision_and_run_with_dockers.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.3_provision_and_run_with_docker/provision_and_run_with_dockers.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.4_provision_and_run_with_k8s/02.2.4_provision_and_run_with_k8s.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.4_provision_and_run_with_k8s/provision_and_run_with_k8s.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.4_provision_and_run_with_k8s/02.2.4_provision_and_run_with_k8s.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.4_provision_and_run_with_k8s/provision_and_run_with_k8s.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.5_deployment_in_aws/deployment_in_aws.ipynb 
b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.5_deployment_in_aws/deployment_in_aws.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.5_deployment_in_aws/deployment_in_aws.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.5_deployment_in_aws/deployment_in_aws.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.6_deployment_in_azure/deployment_in_azure.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.6_deployment_in_azure/deployment_in_azure.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.6_deployment_in_azure/deployment_in_azure.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.6_deployment_in_azure/deployment_in_azure.ipynb diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.7_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.7_recap/recap.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-2.2_setup_federated_system/02.2.7_recap/recap.ipynb rename to examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-4_setup_federated_system/04.7_recap/recap.ipynb diff --git a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.0_introduction/introduction.ipynb 
b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.0_introduction/introduction.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.0_introduction/introduction.ipynb rename to examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.0_introduction/introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.1_privacy_filter/privacy_filtering.ipynb b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.1_privacy_filter/privacy_filtering.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.1_privacy_filter/privacy_filtering.ipynb rename to examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.1_privacy_filter/privacy_filtering.ipynb diff --git a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.2_differency_privacy/privacy_with_differential_privacy.ipynb b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.2_differency_privacy/privacy_with_differential_privacy.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.2_differency_privacy/privacy_with_differential_privacy.ipynb rename to examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.2_differency_privacy/privacy_with_differential_privacy.ipynb diff --git 
a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.3_homormorphic_encryption/03.1.3.1_privacy_with_homormorphic_encryption.ipynb b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.3_homormorphic_encryption/05.3.1_privacy_with_homormorphic_encryption.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.3_homormorphic_encryption/03.1.3.1_privacy_with_homormorphic_encryption.ipynb rename to examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.3_homormorphic_encryption/05.3.1_privacy_with_homormorphic_encryption.ipynb diff --git a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.3_homormorphic_encryption/03.1.3.2_kaplan_meier_survaval_analysis_with_he.ipynb b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.3_homormorphic_encryption/05.3.2_kaplan_meier_survaval_analysis_with_he.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.3_homormorphic_encryption/03.1.3.2_kaplan_meier_survaval_analysis_with_he.ipynb rename to examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.3_homormorphic_encryption/05.3.2_kaplan_meier_survaval_analysis_with_he.ipynb diff --git a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.4_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.4_recap/recap.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.1_Privacy_In_Federated_Learning/03.1.4_recap/recap.ipynb rename to examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-5_Privacy_In_Federated_Learning/05.4_recap/recap.ipynb diff --git a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.2_Security_in_federated_compute_system/03.2.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.0_introduction/introduction.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.2_Security_in_federated_compute_system/03.2.0_introduction/introduction.ipynb rename to examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.0_introduction/introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.2_Security_in_federated_compute_system/03.2.1_security_architecture/Seurity_architecture.ipynb b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.1_security_architecture/Seurity_architecture.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.2_Security_in_federated_compute_system/03.2.1_security_architecture/Seurity_architecture.ipynb rename to examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.1_security_architecture/Seurity_architecture.ipynb diff --git a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.2_Security_in_federated_compute_system/03.2.2_authentication_and_authorization/site_specific_authentication_and_authorization.ipynb 
b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.2_authentication_and_authorization/site_specific_authentication_and_authorization.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.2_Security_in_federated_compute_system/03.2.2_authentication_and_authorization/site_specific_authentication_and_authorization.ipynb rename to examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.2_authentication_and_authorization/site_specific_authentication_and_authorization.ipynb diff --git a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.2_Security_in_federated_compute_system/03.2.3_local_security_policy/local_security_policy.ipynb b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.3_local_security_policy/local_security_policy.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.2_Security_in_federated_compute_system/03.2.3_local_security_policy/local_security_policy.ipynb rename to examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.3_local_security_policy/local_security_policy.ipynb diff --git a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.2_Security_in_federated_compute_system/03.2.4_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.4_recap/recap.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-3.2_Security_in_federated_compute_system/03.2.4_recap/recap.ipynb rename to 
examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.4_recap/recap.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.0_introduction/introduction.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.0_introduction/introduction.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.0_introduction/introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.1_fed_xgboost/Running_fed_xgboost_applications.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/Running_fed_xgboost_applications.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.1_fed_xgboost/Running_fed_xgboost_applications.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/Running_fed_xgboost_applications.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.1_fed_xgboost/fed_xgboost_introduction.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/fed_xgboost_introduction.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.1_fed_xgboost/fed_xgboost_introduction.ipynb rename to 
examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/fed_xgboost_introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.2_secure_fed_xgboost/enhancing_fed_xgboost_security.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/enhancing_fed_xgboost_security.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.2_secure_fed_xgboost/enhancing_fed_xgboost_security.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/enhancing_fed_xgboost_security.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.2_secure_fed_xgboost/horizontal_fed_xgboost_with_he.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/horizontal_fed_xgboost_with_he.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.2_secure_fed_xgboost/horizontal_fed_xgboost_with_he.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/horizontal_fed_xgboost_with_he.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.2_secure_fed_xgboost/vertical_fed_xgboost_with_he.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/vertical_fed_xgboost_with_he.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.2_secure_fed_xgboost/vertical_fed_xgboost_with_he.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/vertical_fed_xgboost_with_he.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.3_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.3_recap/recap.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.3_federated_XGBoost/04.3.3_recap/recap.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.3_recap/recap.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.0_introduction/04.1.0_introduction.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.0_introduction/introduction.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.0_introduction/04.1.0_introduction.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.0_introduction/introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.0_introduction/04.1.2_different_workflow_patterns.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.0_introduction/different_workflow_patterns.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.0_introduction/04.1.2_different_workflow_patterns.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.0_introduction/different_workflow_patterns.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.1_cyclic/cyclic_weight_transfer_example.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.1_cyclic/cyclic_weight_transfer_example.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.1_cyclic/cyclic_weight_transfer_example.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.1_cyclic/cyclic_weight_transfer_example.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.3_split_learning/federated_private_set_interaction.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.3_split_learning/federated_private_set_interaction.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.3_split_learning/federated_private_set_interaction.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.3_split_learning/federated_private_set_interaction.ipynb diff --git 
a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.3_split_learning/split_learning.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.3_split_learning/split_learning.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.3_split_learning/split_learning.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.3_split_learning/split_learning.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.3_split_learning/vertical_federated_learning_introduction.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.3_split_learning/vertical_federated_learning_introduction.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.3_split_learning/vertical_federated_learning_introduction.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.3_split_learning/vertical_federated_learning_introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.4_swarm_learning/swarm_learning.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.4_swarm_learning/swarm_learning.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.2_workflows/04.1.2.4_swarm_learning/swarm_learning.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.2_workflows/07.2.4_swarm_learning/swarm_learning.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.3_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.3_recap/recap.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.1_algorithms_and_workflows/04.1.3_recap/recap.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-7_algorithms_and_workflows/07.3_recap/recap.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.0_introduction/04.2.0_introduction.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.0_introduction/04.2.0_introduction.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.0_introduction/04.2.0_introduction.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.0_introduction/04.2.0_introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.2_llm_p_tuning/04.2.1_LLM_prompt_tuning.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.1_llm_p_tuning/LLM_prompt_tuning.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.2_llm_p_tuning/04.2.1_LLM_prompt_tuning.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.1_llm_p_tuning/LLM_prompt_tuning.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.3_llm_peft/04.2.2_LLM_PEFT.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.2_llm_peft/LLM_PEFT.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.3_llm_peft/04.2.2_LLM_PEFT.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.2_llm_peft/LLM_PEFT.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.3_llm_sft/04.2.3_LLM_SFT.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.3_llm_sft/LLM_SFT.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.3_llm_sft/04.2.3_LLM_SFT.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.3_llm_sft/LLM_SFT.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.4_fed_nlp/04.2.4_federated_nlp_with_bert.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.4_fed_nlp/federated_nlp_with_bert.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.4_fed_nlp/04.2.4_federated_nlp_with_bert.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.4_fed_nlp/federated_nlp_with_bert.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.5_retiever_model_training/04.2.5_federated_retriever_model_training.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.5_retiever_model_training/federated_retriever_model_training.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.5_retiever_model_training/04.2.5_federated_retriever_model_training.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.5_retiever_model_training/federated_retriever_model_training.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.6_llm_quantization/04.2.6_LLM_quantization.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.6_llm_quantization/LLM_quantization.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.6_llm_quantization/04.2.6_LLM_quantization.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.6_llm_quantization/LLM_quantization.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.7_llm_streaming/04.2.7_LLM_streaming.ipynb 
b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.7_llm_streaming/LLM_streaming.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.7_llm_streaming/04.2.7_LLM_streaming.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.7_llm_streaming/LLM_streaming.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.8_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.8_recap/recap.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-4.2_federated_LLM_training/04.2.8_recap/recap.ipynb rename to examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-8_federated_LLM_training/08.8_recap/recap.ipynb diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-9_flare_low_level_apis.ipynb/09_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-9_flare_low_level_apis.ipynb/09_introduction/introduction.ipynb new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.0_introduction/05.1.0_introduction.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.0_introduction/introduction.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.0_introduction/05.1.0_introduction.ipynb rename to examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.0_introduction/introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.1_medical_imaging/medical_imaging.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.1_medical_imaging/medical_imaging.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.1_medical_imaging/medical_imaging.ipynb rename to examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.1_medical_imaging/medical_imaging.ipynb diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.1_medical_imaging/mednist.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.1_medical_imaging/mednist.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.1_medical_imaging/mednist.ipynb rename to 
examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.1_medical_imaging/mednist.ipynb diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.2_drug_discovery/05.1.2.0_introduction/drug_discovery_with_fl.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.2_drug_discovery/11.2.0_introduction/drug_discovery_with_fl.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.2_drug_discovery/05.1.2.0_introduction/drug_discovery_with_fl.ipynb rename to examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.2_drug_discovery/11.2.0_introduction/drug_discovery_with_fl.ipynb diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.2_drug_discovery/05.1.2.1_discovery_with_nemo/chemical_property_prediction_with_BioNemo.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.2_drug_discovery/11.2.1_discovery_with_nemo/chemical_property_prediction_with_BioNemo.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.2_drug_discovery/05.1.2.1_discovery_with_nemo/chemical_property_prediction_with_BioNemo.ipynb rename to 
examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.2_drug_discovery/11.2.1_discovery_with_nemo/chemical_property_prediction_with_BioNemo.ipynb diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.2_drug_discovery/05.1.2.2_drug_discovery_with_boltz-1/training_boltz-1.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.2_drug_discovery/11.2.2_drug_discovery_with_boltz-1/training_boltz-1.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.2_drug_discovery/05.1.2.2_drug_discovery_with_boltz-1/training_boltz-1.ipynb rename to examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.2_drug_discovery/11.2.2_drug_discovery_with_boltz-1/training_boltz-1.ipynb diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.3_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.3_recap/recap.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.1_federated_learning_in_healthcare_lifescience/05.1.3_recap/recap.ipynb rename to examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-11_federated_learning_in_healthcare_lifescience/11.3_recap/recap.ipynb diff --git 
a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.0_introduction.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.0_introduction.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.0_introduction.ipynb rename to examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.0_introduction.ipynb diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.1_fraud_detection_with_decentralized_ai.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.1_fraud_detection_with_decentralized_ai.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.1_fraud_detection_with_decentralized_ai.ipynb rename to examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.1_fraud_detection_with_decentralized_ai.ipynb diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.2_credit_card_frauld_detection_with_fed_xgboost.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.2_credit_card_frauld_detection_with_fed_xgboost.ipynb similarity index 
100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.2_credit_card_frauld_detection_with_fed_xgboost.ipynb rename to examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.2_credit_card_frauld_detection_with_fed_xgboost.ipynb diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.3_end_to_end_federated_fruad_detection_process.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.3_end_to_end_federated_fruad_detection_process.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.3_end_to_end_federated_fruad_detection_process.ipynb rename to examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.3_end_to_end_federated_fruad_detection_process.ipynb diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.4_enrich_feature_with_graph_embedding.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.4_enrich_feature_with_graph_embedding.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.4_enrich_feature_with_graph_embedding.ipynb rename to 
examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.4_enrich_feature_with_graph_embedding.ipynb diff --git a/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.5_recap.ipynb b/examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.5_recap.ipynb similarity index 100% rename from examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-5.2_federated_learning_in_financial_services/05.2.5_recap.ipynb rename to examples/tutorials/self-paced-training/part-5_federated_learning_applications_in_industries/chapter-12_federated_learning_in_financial_services/12.5_recap.ipynb diff --git a/examples/tutorials/self-paced-training/welcome.ipynb b/examples/tutorials/self-paced-training/welcome.ipynb index 39cb0642c7..3385f96578 100644 --- a/examples/tutorials/self-paced-training/welcome.ipynb +++ b/examples/tutorials/self-paced-training/welcome.ipynb @@ -104,6 +104,15 @@ "```\n" ] }, + { + "cell_type": "markdown", + "id": "932d6f73", + "metadata": {}, + "source": [ + "# Log Configuration\n", + "\n" + ] + }, { "cell_type": "markdown", "id": "99636a22-ace2-4988-96bd-33466091b35a", diff --git a/nvflare/app_common/executors/client_api_launcher_executor.py b/nvflare/app_common/executors/client_api_launcher_executor.py index 5c10d7b390..3b470edf2c 100644 --- a/nvflare/app_common/executors/client_api_launcher_executor.py +++ b/nvflare/app_common/executors/client_api_launcher_executor.py @@ -40,7 +40,7 @@ def __init__( heartbeat_interval: float = 5.0, heartbeat_timeout: float = 60.0, workers: int = 4, - train_with_evaluation: bool = True, + train_with_evaluation: bool = False, train_task_name: str = AppConstants.TASK_TRAIN, evaluate_task_name: str = 
AppConstants.TASK_VALIDATION, submit_model_task_name: str = AppConstants.TASK_SUBMIT_MODEL, diff --git a/nvflare/app_common/executors/in_process_client_api_executor.py b/nvflare/app_common/executors/in_process_client_api_executor.py index c89233904b..c920f1b4f9 100644 --- a/nvflare/app_common/executors/in_process_client_api_executor.py +++ b/nvflare/app_common/executors/in_process_client_api_executor.py @@ -56,7 +56,7 @@ def __init__( params_transfer_type: TransferType = TransferType.FULL, from_nvflare_converter_id: Optional[str] = None, to_nvflare_converter_id: Optional[str] = None, - train_with_evaluation: bool = True, + train_with_evaluation: bool = False, train_task_name: str = AppConstants.TASK_TRAIN, evaluate_task_name: str = AppConstants.TASK_VALIDATION, submit_model_task_name: str = AppConstants.TASK_SUBMIT_MODEL, diff --git a/nvflare/app_common/executors/launcher_executor.py b/nvflare/app_common/executors/launcher_executor.py index 89fd2d7872..b7540021f3 100644 --- a/nvflare/app_common/executors/launcher_executor.py +++ b/nvflare/app_common/executors/launcher_executor.py @@ -49,7 +49,7 @@ def __init__( heartbeat_interval: float = 5.0, heartbeat_timeout: float = 60.0, workers: int = 4, - train_with_evaluation: bool = True, + train_with_evaluation: bool = False, train_task_name: str = AppConstants.TASK_TRAIN, evaluate_task_name: str = AppConstants.TASK_VALIDATION, submit_model_task_name: str = AppConstants.TASK_SUBMIT_MODEL, @@ -227,6 +227,7 @@ def _init_converter(self, fl_ctx: FLContext): def _initialize_external_execution( self, task_name: str, shareable: Shareable, fl_ctx: FLContext, abort_signal: Signal ) -> bool: + self.reset_peer_is_up_or_dead() with self._lock: self._abort_signal = abort_signal self._current_task = task_name @@ -242,13 +243,15 @@ def _initialize_external_execution( abort_signal.trigger("launch task failed") return False - self.log_info(fl_ctx, f"External execution for task ({task_name}) is launched.") + self.log_info(fl_ctx, 
f"Launcher successfully launched task ({task_name}).") # wait for external execution to set up their pipe_handler setup_success = self._wait_external_setup(task_name, fl_ctx, abort_signal) if not setup_success: - self.log_error(fl_ctx, "External execution set up failed.") - abort_signal.trigger("External execution set up failed.") + error = f"Failed external setup for task ({task_name})." + self.log_error(fl_ctx, error) + abort_signal.trigger(error) return False + self.log_info(fl_ctx, f"External setup for task ({task_name}) succeeded.") return True def _execute_launcher_method_in_thread_executor(self, method_name: str, **kwargs) -> Any: diff --git a/nvflare/app_common/executors/task_exchanger.py b/nvflare/app_common/executors/task_exchanger.py index be33fbe7b3..66873d2575 100644 --- a/nvflare/app_common/executors/task_exchanger.py +++ b/nvflare/app_common/executors/task_exchanger.py @@ -256,6 +256,9 @@ def ask_peer_to_end(self, fl_ctx: FLContext) -> bool: def peer_is_up_or_dead(self) -> bool: return self.pipe_handler.peer_is_up_or_dead.is_set() + def reset_peer_is_up_or_dead(self): + self.pipe_handler.peer_is_up_or_dead.clear() + def pause_pipe_handler(self): """Stops pipe_handler heartbeat.""" self.pipe_handler.pause() diff --git a/nvflare/app_opt/confidential_computing/aci_authorizer.py b/nvflare/app_opt/confidential_computing/aci_authorizer.py new file mode 100644 index 0000000000..a696eec1d3 --- /dev/null +++ b/nvflare/app_opt/confidential_computing/aci_authorizer.py @@ -0,0 +1,66 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import time + +import jwt +import requests +from jwt import PyJWKClient + +from nvflare.app_opt.confidential_computing.cc_authorizer import CCAuthorizer + +ACI_NAMESPACE = "x-ms" +maa_endpoint = "sharedeus2.eus2.attest.azure.net" + + +class ACIAuthorizer(CCAuthorizer): + def __init__(self, retry_count=5, retry_sleep=2): + self.retry_count = retry_count + self.retry_sleep = retry_sleep + + def generate(self): + count = 0 + token = "" + while True: + count = count + 1 + try: + r = requests.post( + "http://localhost:8284/attest/maa", + data=json.dumps({"maa_endpoint": maa_endpoint, "runtime_data": "ewp9"}), + headers={"Content-Type": "application/json"}, + ) + if r.status_code == requests.codes.ok: + token = r.json().get("token") + break + except: + if count > self.retry_count: + break + time.sleep(self.retry_sleep) + return token + + def verify(self, token): + try: + header = jwt.get_unverified_header(token) + alg = header.get("alg") + jwks_client = PyJWKClient(f"https://{maa_endpoint}/certs") + signing_key = jwks_client.get_signing_key_from_jwt(token) + claims = jwt.decode(token, signing_key.key, algorithms=[alg]) + if claims: + return True + except: + return False + return False + + def get_namespace(self) -> str: + return ACI_NAMESPACE diff --git a/nvflare/app_opt/confidential_computing/cc_helper.py b/nvflare/app_opt/confidential_computing/cc_helper.py deleted file mode 100644 index c01d0146e0..0000000000 --- a/nvflare/app_opt/confidential_computing/cc_helper.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# import os.path - -import os -from typing import Dict - -from nv_attestation_sdk.attestation import Attestation, Devices, Environment - -from nvflare.fuel.utils.log_utils import get_obj_logger - - -class VerifierProp: - - DEVICES = "devices" # GPU, CPU, etc. - ENV = "env" - URL = "url" - APPRAISAL_POLICY_FILE = "appraisal_policy_file" - RESULT_POLICY_FILE = "result_policy_file" - - -class Device: - - GPU = "gpu" - CPU = "cpu" - NIC = "nic" - OS = "os" - DPU = "dpu" - - mapping = {GPU: Devices.GPU, CPU: Devices.CPU, NIC: Devices.NIC, OS: Devices.OS, DPU: Devices.DPU} - - -class Env: - - TEST = "test" - LOCAL = "local" - AZURE = "azure" - GCP = "gcp" - - mapping = {TEST: Environment.TEST, LOCAL: Environment.LOCAL, AZURE: Environment.AZURE, GCP: Environment.GCP} - - -class CCHelper(object): - def __init__(self, site_name: str, verifiers: list): - """Create an AttestationHelper instance - - Args: - site_name: name of the site - verifiers: dict that specifies verifiers to be used - """ - self.site_name = site_name - self.verifiers = verifiers - attestation = Attestation() - attestation.set_name(site_name) - self.attestation = attestation - self.token = None - self.logger = get_obj_logger(self) - for v in verifiers: - assert isinstance(v, dict) - url = None - env = None - devices = 0 - appraisal_policy_file = None - result_policy_file = None - for prop, value in v.items(): - if prop == VerifierProp.URL: - 
url = value - elif prop == VerifierProp.ENV: - env = Env.mapping.get(value) - elif prop == VerifierProp.DEVICES: - dv = Device.mapping.get(value) - if not dv: - raise ValueError(f"invalid device '{value}'") - devices = dv - elif prop == VerifierProp.APPRAISAL_POLICY_FILE: - appraisal_policy_file = value - elif prop == VerifierProp.RESULT_POLICY_FILE: - result_policy_file = value - if not env: - raise ValueError("Environment is not specified for verifier") - if not devices: - raise ValueError("Devices is not specified for verifier") - if url is None: - raise ValueError("Url is not specified for verifier") - if appraisal_policy_file is None: - raise ValueError("Appraisal policy file is not specified for verifier") - if not os.path.exists(appraisal_policy_file): - raise ValueError(f"Appraisal policy file '{appraisal_policy_file}' does not exist") - appraisal_policy = open(appraisal_policy_file, "rt").read().rstrip() - if result_policy_file is None: - raise ValueError("Result policy file is not specified for verifier") - if not os.path.exists(result_policy_file): - raise ValueError(f"Result policy file '{result_policy_file}' does not exist") - self.result_policy = open(result_policy_file, "rt").read().rstrip() - attestation.add_verifier(devices, env, url, appraisal_policy) - - def reset_participant(self, participant_name: str): - pass - - def prepare(self) -> bool: - """Prepare for attestation process - - Returns: error if any - """ - ok = self.attestation.attest() - self.logger.info(f"CC - attest result (is valid?): {ok}") - self.token = self.attestation.get_token(self.site_name) - self.logger.info(f"token {self.token=}") - return True - - def get_token(self): - return self.token - - def validate_participants(self, participants: Dict[str, str]) -> Dict[str, bool]: - """Validate CC policies of specified participants against the requirement policy of the site. 
- - Args: - participants: dict of participant name => token - - Returns: dict of participant name => bool - - """ - if not participants: - return {} - result = {k: self.attestation.validate_token(self.result_policy, v) for k, v in participants.items()} - self.logger.debug(f"CC - results from validating participants' tokens: {result}") - return result diff --git a/nvflare/app_opt/confidential_computing/gpu_authorizer.py b/nvflare/app_opt/confidential_computing/gpu_authorizer.py index bd55e2a463..1090059380 100644 --- a/nvflare/app_opt/confidential_computing/gpu_authorizer.py +++ b/nvflare/app_opt/confidential_computing/gpu_authorizer.py @@ -13,81 +13,85 @@ # limitations under the License. -from nvflare.app_opt.confidential_computing.cc_authorizer import CCAuthorizer - -GPU_NAMESPACE = "x-nv-gpu-" - - -class GPUAuthorizer(CCAuthorizer): - """Note: This is just a fake implementation for GPU authorizer. It will be replaced later - with the real implementation. - - """ - - def __init__(self, verifiers: list) -> None: - """ +import json +import logging +import uuid - Args: - verifiers (list): - each element in this list is a dictionary and the keys of dictionary are - "devices", "env", "url", "appraisal_policy_file" and "result_policy_file." +import jwt +from nv_attestation_sdk import attestation - the values of devices are "gpu" and "cpu" - the values of env are "local" and "test" - currently, valid combination is gpu + local - - url must be an empty string - appraisal_policy_file must point to an existing file - currently supports an empty file only - - result_policy_file must point to an existing file - currently supports the following content only +from nvflare.app_opt.confidential_computing.cc_authorizer import CCAuthorizer - .. 
code-block:: json +GPU_NAMESPACE = "x-nv-gpu" +default_policy = """{ + "version":"1.0", + "authorization-rules":{ + "sub":"NVIDIA-GPU-ATTESTATION", + "secboot":true, + "x-nvidia-gpu-manufacturer":"NVIDIA Corporation", + "x-nvidia-attestation-type":"GPU", + "x-nvidia-attestation-detailed-result":{ + "x-nvidia-gpu-driver-rim-schema-validated":true, + "x-nvidia-gpu-vbios-rim-cert-validated":true, + "x-nvidia-gpu-attestation-report-cert-chain-validated":true, + "x-nvidia-gpu-driver-rim-schema-fetched":true, + "x-nvidia-gpu-attestation-report-parsed":true, + "x-nvidia-gpu-nonce-match":true, + "x-nvidia-gpu-vbios-rim-signature-verified":true, + "x-nvidia-gpu-driver-rim-signature-verified":true, + "x-nvidia-gpu-arch-check":true, + "x-nvidia-gpu-measurements-match":true, + "x-nvidia-gpu-attestation-report-signature-verified":true, + "x-nvidia-gpu-vbios-rim-schema-validated":true, + "x-nvidia-gpu-driver-rim-cert-validated":true, + "x-nvidia-gpu-vbios-rim-schema-fetched":true, + "x-nvidia-gpu-vbios-rim-measurements-available":true + }, + "x-nvidia-gpu-driver-version":"535.104.05", + "hwmodel":"GH100 A01 GSP BROM", + "measres":"comparison-successful", + "x-nvidia-gpu-vbios-version":"96.00.5E.00.02" + } +} +""" - { - "version":"1.0", - "authorization-rules":{ - "x-nv-gpu-available":true, - "x-nv-gpu-attestation-report-available":true, - "x-nv-gpu-info-fetched":true, - "x-nv-gpu-arch-check":true, - "x-nv-gpu-root-cert-available":true, - "x-nv-gpu-cert-chain-verified":true, - "x-nv-gpu-ocsp-cert-chain-verified":true, - "x-nv-gpu-ocsp-signature-verified":true, - "x-nv-gpu-cert-ocsp-nonce-match":true, - "x-nv-gpu-cert-check-complete":true, - "x-nv-gpu-measurement-available":true, - "x-nv-gpu-attestation-report-parsed":true, - "x-nv-gpu-nonce-match":true, - "x-nv-gpu-attestation-report-driver-version-match":true, - "x-nv-gpu-attestation-report-vbios-version-match":true, - "x-nv-gpu-attestation-report-verified":true, - "x-nv-gpu-driver-rim-schema-fetched":true, - 
"x-nv-gpu-driver-rim-schema-validated":true, - "x-nv-gpu-driver-rim-cert-extracted":true, - "x-nv-gpu-driver-rim-signature-verified":true, - "x-nv-gpu-driver-rim-driver-measurements-available":true, - "x-nv-gpu-driver-vbios-rim-fetched":true, - "x-nv-gpu-vbios-rim-schema-validated":true, - "x-nv-gpu-vbios-rim-cert-extracted":true, - "x-nv-gpu-vbios-rim-signature-verified":true, - "x-nv-gpu-vbios-rim-driver-measurements-available":true, - "x-nv-gpu-vbios-index-conflict":true, - "x-nv-gpu-measurements-match":true - } - } - """ - super().__init__() - self.verifiers = verifiers +class GPUAuthorizer(CCAuthorizer): + def __init__(self, verifier_url="https://nras.attestation.nvidia.com/v1/attest/gpu", policy_file=None): + self._can_generate = True + self.client = attestation.Attestation() + self.client.set_name("nvflare_node") + nonce = uuid.uuid4().hex + uuid.uuid1().hex + self.client.set_nonce(nonce) + if policy_file is None: + self.remote_att_result_policy = default_policy + else: + self.remote_att_result_policy = open(policy_file).read() + self.client.add_verifier(attestation.Devices.GPU, attestation.Environment.REMOTE, verifier_url, "") + self.logger = logging.getLogger(self.__class__.__name__) + + def generate(self): + try: + self.client.attest() + token = self.client.get_token() + except BaseException: + self.can_generate = False + token = "[[],{}]" + return token + + def verify(self, eat_token): + try: + jwt_token = json.loads(eat_token)[1] + claims = jwt.decode(jwt_token.get("REMOTE_GPU_CLAIMS"), options={"verify_signature": False}) + # With claims, we will retrieve the nonce + nonce = claims.get("eat_nonce") + self.client.set_nonce(nonce) + self.client.set_token(name="nvflare_node", eat_token=eat_token) + result = self.client.validate_token(self.remote_att_result_policy) + except BaseException as e: + self.logger.info(f"Token verification failed {e=}") + result = False + return result def get_namespace(self) -> str: return GPU_NAMESPACE - - def generate(self) -> 
str: - raise NotImplementedError - - def verify(self, token: str) -> bool: - raise NotImplementedError diff --git a/nvflare/app_opt/confidential_computing/mock_authorizer.py b/nvflare/app_opt/confidential_computing/mock_authorizer.py new file mode 100644 index 0000000000..b9e3e99818 --- /dev/null +++ b/nvflare/app_opt/confidential_computing/mock_authorizer.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nvflare.app_opt.confidential_computing.cc_authorizer import CCAuthorizer + +MOCK_NAMESPACE = "x-mock" + + +class MockAuthorizer(CCAuthorizer): + def generate(self): + return 
"eyJhbGciOiJSUzI1NiIsImprdSI6Imh0dHBzOi8vc2hhcmVkZXVzMi5ldXMyLmF0dGVzdC5henVyZS5uZXQvY2VydHMiLCJraWQiOiJKMHBBUGRmWFhIcVdXaW1nckg4NTN3TUlkaDUvZkxlMXo2dVNYWVBYQ2EwPSIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MDk5NDM0OTYsImlhdCI6MTcwOTkxNDY5NiwiaXNzIjoiaHR0cHM6Ly9zaGFyZWRldXMyLmV1czIuYXR0ZXN0LmF6dXJlLm5ldCIsImp0aSI6IjQ1MGIxNWMxMmRmYzIxMWM5ZWRkOGU4MmFiY2NiZTEzYmMyOTgzZjlhNjAzOTZlMzljZTJmZGIwYjFmNTg1YzEiLCJuYmYiOjE3MDk5MTQ2OTYsInNlY3VyZWJvb3QiOnRydWUsIngtbXMtYXR0ZXN0YXRpb24tdHlwZSI6ImF6dXJldm0iLCJ4LW1zLWF6dXJldm0tYXR0ZXN0YXRpb24tcHJvdG9jb2wtdmVyIjoiMi4wIiwieC1tcy1henVyZXZtLWF0dGVzdGVkLXBjcnMiOlswLDEsMiwzLDQsNSw2LDddLCJ4LW1zLWF6dXJldm0tYm9vdGRlYnVnLWVuYWJsZWQiOmZhbHNlLCJ4LW1zLWF6dXJldm0tZGJ2YWxpZGF0ZWQiOnRydWUsIngtbXMtYXp1cmV2bS1kYnh2YWxpZGF0ZWQiOnRydWUsIngtbXMtYXp1cmV2bS1kZWJ1Z2dlcnNkaXNhYmxlZCI6dHJ1ZSwiwC1tcy1henVyZXZtLWRlZmF1bHQtc2VjdXJlYm9vdGtleXN2YWxpZGF0ZWQiOnRydWUsIngtbXMtYXp1cmV2bS1lbGFtLWVuYWJsZWQiOmZhbHNlLCJ4LW1zLWF6dXJldm0tZmxpZ2h0c2lnbmluZy1lbmFibGVkIjpmYWxzZSwieC1tcy1henVyZXZtLWh2Y2ktcG9saWN5IjowLCJ4LW1zLWF6dXJldm0taHlwZXJ2aXNvcmRlYnVnLWVuYWJsZWQiOmZhbHNlLCJ4LW1zLWF6dXJldm0taXMtd2luZG93cyI6ZmFsc2UsIngtbXMtYXp1cmV2bS1rZXJuZWxkZWJ1Zy1lbmFibGVkIjpmYWxzZSwieC1tcy1henVyZXZtLW9zYnVpbGQiOiJOb3RBcHBsaWNhdGlvbiIsIngtbXMtYXp1cmV2bS1vc2Rpc3RybyI6IlVidW50dSIsIngtbXMtYXp1cmV2bS1vc3R5cGUiOiJMaW51eCIsIngtbXMtYXp1cmV2bS1vc3ZlcnNpb24tbWFqb3IiOjIyLCJ4LW1zLWF6dXJldm0tb3N2ZXJzaW9uLW1pbm9yIjo0LCJ4LW1zLWF6dXJldm0tc2lnbmluZ2Rpc2FibGVkIjp0cnVlLCJ4LW1zLWF6dXJldm0tdGVzdHNpZ25pbmctZW5hYmxlZCI6ZmFsc2UsIngtbXMtYXp1cmV2bS12bWlkIjoiQzRGRkM0QjMtOUFERi00MEQxLThDQ0MtMTAxMUUxQkNDREIwIiwieC1tcy1pc29sYXRpb24tdGVlIjp7IngtbXMtYXR0ZXN0YXRpb24tdHlwZSI6InNldnNucHZtIiwieC1tcy1jb21wbGlhbmNlLXN0YXR1cyI6ImF6dXJlLWNvbXBsaWFudC1jdm0iLCJ4LW1zLXJ1bnRpbWUiOnsia2V5cyI6W3siZSI6IkFRQUIiLCJrZXlfb3BzIjpbInNpZ24iXSwia2lkIjoiSENMQWtQdWIiLCJrdHkiOiJSU0EiLCJuIjoiNEFQakF3QUFwQjE4cnc0bDh4Y0pmQXNpT1pJb1lSdGpYLVdOM0RhdVZ2cWlOSGlNU2RFaFNtWW9CQnVTcUVfa2pHblpZOFRWb2RSRkdJNWtFalR4NmhBZFM1OHIzY2R6OEtYMERmOHZERjF3Y2NjVW52SHJGY3FnRnNGVWs0UH
JZZko2eU9nell2bmhvdWFSQlZ4dmZ5bEMwZWZhTUNUUkdES2pZSzhPVV9RcWxFeGIzY19neEJZWGlSZ3dBYWFaZUd4eFNId3U0a3lwZ3hwMlhjWXlHLVU3a3FHc01VWnlkZmM0eUxiS1BQcl9zMUJYUTNSbkFtdTQtblhkdVRmcWlWX2gxbGN4V29fYVhSWkFNdG9hTnVkclkyMVZVV3AzVW5xeFFtdGVGcXplWWpEQm8ta2Fjel9iNG9Gem5OM29aSlVUZmJnb09sTzJzbDc3U05Xa0x3In0seyJlIjoiQVFBQiIsImtleV9vcHMiOlsiZW5jcnlwdCJdLCJraWQiOiJIQ0xFa1B1YiIsImt0eSI6IlJTQSIsIm4iOiJ3c3lxREFBQloySHlTd08wTDFaWXZwVkhOVFVpdXpPaGcyb3Bfa1VpckNqM1M5bEtpT051YmU4RWFsSFVBcUh6bGdEcF84dHdBdGRONGNOUzZUSWdITG94TXpaZUFpaWU1OGd2VGtYdzVqMThmVUY4UEVvT2NXVERFSmRMVXIxWnBEdTRSdUZXbDdkZHNIdFBJRDVqcmt0R21FajBCZVp5NzZWVGFUYU1iamhGRmphUGNCT01fOWNaVHJYdFduSG80WTF1TG53VUJPRzA3T1hrUmlTSjBHZ3phVFhvaFVzVGE4X0w5NDJfeml5QU16STliUnJmUV9JMXY0SFV0M1YzS3laaUZJS3B4X2hWQnZZZUhwcTZBbXlYRTExS0VXek5HMTJaZVNkbzBzMUhudFNidTQ3dUktYklIdnBLaVVQSzBYSGZWbDQwVnNwenJ2MjlqekppR1EifV0sInVzZXItZGF0YSI6IjAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwIiwidm0tY29uZmlndXJhdGlvbiI6eyJjb25zb2xlLWVuYWJsZWQiOnRydWUsInJvb3QtY2VydC10aHVtYnByaW50IjoiNm5aWm5ZYUpjNEtxVVpfeXZBLW11Y0ZkWU5vdXZsUG5JVG5OTVhzSGwtMCIsInNlY3VyZS1ib290Ijp0cnVlLCJ0cG0tZW5hYmxlZCI6dHJ1ZSwidHBtLXBlcnNpc3RlZCI6dHJ1ZSwidm1VbmlxdWVJZCI6IkM0RkZDNEIzLTlBREYtNDBEMS04Q0NDLTEwMTFFMUJDQ0RCMCJ9fSwieC1tcy1zZXZzbnB2bS1hdXRob3JrZXlkaWdlc3QiOiIwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAiLCJ4LW1zLXNldnNucHZtLWJvb3Rsb2FkZXItc3ZuIjo3LCJ4LW1zLXNldnNucHZtLWZhbWlseUlkIjoiMDEwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAiLCJ4LW1zLXNldnNucHZtLWd1ZXN0c3ZuIjo2NTU0MSwieC1tcy1zZXZzbnB2bS1ob3N0ZGF0YSI6IjAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAiLCJ4LW1zLXNldnNucHZtLWlka2V5ZGlnZXN0IjoiMDM1NjIxNTg4MmE4MjUyNzlhODViMzAwYjBiNzQyOTMxZDExM2JmN2UzMmRkZTJlNTBmZmRlN2VjNzQzY2E0OTFlY2RkN2YzMzZkYzI4YTZlMGIyYmI1N2FmN2E0NGEzIiwieC1tcy1zZXZzbnB2bS1pbWFnZUlkIjoiMD
IwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAiLCJ4LW1zLXNldnNucHZtLWlzLWRlYnVnZ2FibGUiOmZhbHNlLCJ4LW1zLXNldnNucHZtLWxhdW5jaG1lYXN1cmVtZW50IjoiN2M0MjA4NjE0ZDMyNzYzMDI4M2VkOGFhNjUyOTcxZjNkYzI0YzU0NmY2ZWUxMzBkMzJlNGUzYjg0ZjFhYTFmNWVmMmQyMTAxMmQwZmRlMDU2ZDhmOTAwYzM5MmM3NzJjIiwieC1tcy1zZXZzbnB2bS1taWNyb2NvZGUtc3ZuIjo2MiwieC1tcy1zZXZzbnB2bS1taWdyYXRpb24tYWxsb3dlZCI6ZmFsc2UsIngtbXMtc2V2c25wdm0tcmVwb3J0ZGF0YSI6IjU0ODE0ZTlhNjQ0N2JjMWM5MGE5YTExNmYxYjRjYjdlMDU5ZTYzMzQzNDgwY2Q4N2FmMjcxZjc5MjdjOThlMTMwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwIiwieC1tcy1zZXZzbnB2bS1yZXBvcnRpZCI6ImRhNjU0YWY3NGZiODIzZjJiM2E1ODMzMzVmOGYzYmUzMmY2ODkwYTdkYjIzMzUyM2RkOWUwNTRmNDQzMDU2OWYiLCJ4LW1zLXNldnNucHZtLXNtdC1hbGxvd2VkIjp0cnVlLCJ4LW1zLXNldnNucHZtLXNucGZ3LXN2biI6MTUsIngtbXMtc2V2c25wdm0tdGVlLXN2biI6MCwieC1tcy1zZXZzbnB2bS12bXBsIjowfSwieC1tcy1wb2xpY3ktaGFzaCI6IndtOW1IbHZUVTgyZThVcW9PeTFZajFGQlJTTmtmZTk5LTY5SVlEcTllV3MiLCJ4LW1zLXJ1bnRpbWUiOnsiY2xpZW50LXBheWxvYWQiOnsibm9uY2UiOiIifSwia2V5cyI6W3siZSI6IkFRQUIiLCJrZXlfb3BzIjpbImVuY3J5cHQiXSwia2lkIjoiVHBtRXBoZW1lcmFsRW5jcnlwdGlvbktleSIsImt0eSI6IlJTQSIsIm4iOiJ6eHN2bWdBQV9rRlBKMjZzYnRfdFhVUDhqcHowMk50YnhRU2hPd0lxa1h6U1hxUGJmV1Vxb1hpUm9idzJqTC01ZTNiQU1LU3J3cmpMU09DcnNtbjdPMnZxVmNBMW9Ucl9ObFE3NEpMMnlBanZVWGFId0dBZVVzbkNfWm51UXltdzNFMXJ4NnNCUUxZWFcwMTRiQjNKX01feFBDaGZfQk9NTGdNTlRzbTNwbGh0eTVNRWJ5OWJBSXp6a2ZPNjZhblhWUWxUVE1xQk43SkJVeU5QSFBWQU82V3N6bzl6YnI1aWhlUUxOZDZLZXNEMHU1VXBYaVo4UU5zUng2cXJCUS12TkVsVXQ2cXRvbC1xVzh4TkdvckxBWlhhZ0FyRVpGTE9aOEZWa0hQWGlMM0wwZ253eDBvb0l5UG5pbk5WY2dLanFYR3pOdnB1ejJGTmNuQU9uS2pKZFEifV19LCJ4LW1zLXZlciI6IjEuMCJ9.COvydLJUjR5voFyG-AUJlEp9fyJNBtbptkgq9p-KNkMlFCorY87VZPDrmITH4gYM5YpYuDk370P81hvd2Pw9COZB1-t9VSaWMJzcyL-T43Sh8nSGNO13kOqDQiHss1907kBiFy2jWngaoxuJvO4BSNFkxL9bsCsEZVSpMDmO9zZkp1Ja7sp-Cptm9rwf5JTfKuWZ4cazn2hUkWbQHBg51b8AeryNzU-35oEhGCIPYqryXv5SY32PB9s-lwh6l7K3t768P817XKF3Szip0TZpgIMoM0GU4oNOnjnFZ3u8DnvuyEim-pCZgP7qpQmJI4lrgI6Sn-jxqTg8q0FUmAkcnQ" + + def verify(self, token): + return True + + 
def get_namespace(self) -> str: + return MOCK_NAMESPACE diff --git a/nvflare/app_opt/confidential_computing/snp_authorizer.py b/nvflare/app_opt/confidential_computing/snp_authorizer.py new file mode 100644 index 0000000000..ba0650b304 --- /dev/null +++ b/nvflare/app_opt/confidential_computing/snp_authorizer.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import logging +import os +import subprocess +import uuid + +from nvflare.app_opt.confidential_computing.cc_authorizer import CCAuthorizer + +SNP_NAMESPACE = "x-snp" + + +class SNPAuthorizer(CCAuthorizer): + def __init__(self): + super().__init__() + self.logger = logging.getLogger(self.__class__.__name__) + + def generate(self): + cmd = ["sudo", "snpguest", "report", "report.bin", "request.bin"] + with open("request.bin", "wb") as request_file: + request_file.write(b"\x01" * 64) + _ = subprocess.run(cmd, capture_output=True) + with open("report.bin", "rb") as report_file: + token = base64.b64encode(report_file.read()) + return token + + def verify(self, token): + try: + report_bin = base64.b64decode(token) + tmp_bin_file = uuid.uuid4().hex + with open(tmp_bin_file, "wb") as report_file: + report_file.write(report_bin) + cmd = ["snpguest", "verify", "attestation", "./cert", tmp_bin_file] + cp = subprocess.run(cmd, capture_output=True) + if cp.returncode != 0: + return False + return True + except Exception as e: + 
self.logger.info(f"Token verification failed {e=}") + return False + finally: + if os.path.exists(tmp_bin_file): + os.remove(tmp_bin_file) + + def get_namespace(self) -> str: + return SNP_NAMESPACE diff --git a/nvflare/app_opt/confidential_computing/tdx_authorizer.py b/nvflare/app_opt/confidential_computing/tdx_authorizer.py index 21bff9035e..c8a3bb5756 100644 --- a/nvflare/app_opt/confidential_computing/tdx_authorizer.py +++ b/nvflare/app_opt/confidential_computing/tdx_authorizer.py @@ -17,7 +17,7 @@ from nvflare.app_opt.confidential_computing.cc_authorizer import CCAuthorizer -TDX_NAMESPACE = "tdx_" +TDX_NAMESPACE = "x-tdx" TDX_CLI_CONFIG = "config.json" TOKEN_FILE = "token.txt" VERIFY_FILE = "verify.txt" diff --git a/nvflare/app_opt/flower/applet.py b/nvflare/app_opt/flower/applet.py index 998e911523..137f8b451e 100644 --- a/nvflare/app_opt/flower/applet.py +++ b/nvflare/app_opt/flower/applet.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os import time from nvflare.apis.fl_context import FLContext @@ -57,6 +58,8 @@ def get_command(self, ctx: dict) -> CommandDescriptor: job_id = fl_ctx.get_job_id() custom_dir = ws.get_app_custom_dir(job_id) app_dir = ws.get_app_dir(job_id) + if not os.path.isabs(custom_dir): + custom_dir = os.path.relpath(custom_dir, app_dir) cmd = f"flower-supernode --insecure --grpc-adapter --superlink {addr} {custom_dir}" # use app_dir as the cwd for flower's client app. 
diff --git a/nvflare/job_config/script_runner.py b/nvflare/job_config/script_runner.py index 4029b200f7..d39f65db61 100644 --- a/nvflare/job_config/script_runner.py +++ b/nvflare/job_config/script_runner.py @@ -207,7 +207,6 @@ def add_to_fed_job(self, job: FedJob, ctx, **kwargs): params_transfer_type=self._params_transfer_type, from_nvflare_converter_id=self._from_nvflare_converter_id, to_nvflare_converter_id=self._to_nvflare_converter_id, - heartbeat_timeout=0, ) ) job.add_executor(executor, tasks=tasks, ctx=ctx) diff --git a/nvflare/lighter/dummy_project.yml b/nvflare/lighter/dummy_project.yml index e16aef44df..3acb1aff99 100644 --- a/nvflare/lighter/dummy_project.yml +++ b/nvflare/lighter/dummy_project.yml @@ -67,12 +67,12 @@ builders: args: sp_end_point: server1:8002:8003 - docker_image: localhost/nvflare:0.0.1 - - - path: nvflare.lighter.impl.docker.DockerBuilder - args: - docker_image: localhost/nvflare:0.0.1 - base_image: python:3.10 - requirements_file: docker_compose_requirements.txt +# docker_image: localhost/nvflare:0.0.1 +# +# - path: nvflare.lighter.impl.docker.DockerBuilder +# args: +# docker_image: localhost/nvflare:0.0.1 +# base_image: python:3.10 +# requirements_file: docker_compose_requirements.txt - path: nvflare.lighter.impl.cert.CertBuilder - path: nvflare.lighter.impl.signature.SignatureBuilder diff --git a/nvflare/lighter/impl/master_template.yml b/nvflare/lighter/impl/master_template.yml index 206fe6966f..b183dbc28d 100644 --- a/nvflare/lighter/impl/master_template.yml +++ b/nvflare/lighter/impl/master_template.yml @@ -311,7 +311,7 @@ log_config: | "()": "nvflare.fuel.utils.log_utils.BaseFormatter", "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" }, - "colorFormatter": { + "consoleFormatter": { "()": "nvflare.fuel.utils.log_utils.ColorFormatter", "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" }, @@ -330,7 +330,7 @@ log_config: | "consoleHandler": { "class": "logging.StreamHandler", 
"level": "DEBUG", - "formatter": "colorFormatter", + "formatter": "consoleFormatter", "filters": [], "stream": "ext://sys.stdout" }, diff --git a/nvflare/lighter/templates/master_template.yml b/nvflare/lighter/templates/master_template.yml index 8c1a1ab8af..145d0c0632 100644 --- a/nvflare/lighter/templates/master_template.yml +++ b/nvflare/lighter/templates/master_template.yml @@ -332,7 +332,7 @@ log_config: | "()": "nvflare.fuel.utils.log_utils.BaseFormatter", "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" }, - "colorFormatter": { + "consoleFormatter": { "()": "nvflare.fuel.utils.log_utils.ColorFormatter", "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" }, @@ -351,7 +351,7 @@ log_config: | "consoleHandler": { "class": "logging.StreamHandler", "level": "DEBUG", - "formatter": "colorFormatter", + "formatter": "consoleFormatter", "filters": [], "stream": "ext://sys.stdout" }, diff --git a/nvflare/private/fed/app/simulator/log_config.json b/nvflare/private/fed/app/simulator/log_config.json index 92c4f9890a..997d9d0420 100644 --- a/nvflare/private/fed/app/simulator/log_config.json +++ b/nvflare/private/fed/app/simulator/log_config.json @@ -6,7 +6,7 @@ "()": "nvflare.fuel.utils.log_utils.BaseFormatter", "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" }, - "colorFormatter": { + "consoleFormatter": { "()": "nvflare.fuel.utils.log_utils.ColorFormatter", "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" }, @@ -25,7 +25,7 @@ "consoleHandler": { "class": "logging.StreamHandler", "level": "DEBUG", - "formatter": "colorFormatter", + "formatter": "consoleFormatter", "filters": [], "stream": "ext://sys.stdout" }, diff --git a/nvflare/private/fed/simulator/simulator_server.py b/nvflare/private/fed/simulator/simulator_server.py index 69d1f16df1..881eed6039 100644 --- a/nvflare/private/fed/simulator/simulator_server.py +++ b/nvflare/private/fed/simulator/simulator_server.py @@ 
-25,6 +25,7 @@ from ..server.fed_server import FederatedServer from ..server.server_engine import ServerEngine +from ..utils.identity_utils import IdentityAsserter class SimulatorServerEngine(ServerEngine): @@ -77,6 +78,22 @@ def create_job_processing_context_properties(self, workspace, job_id): return {} +class SimulatorIdentityAsserter(IdentityAsserter): + + def __init__(self, private_key_file: str, cert_file: str): + self.private_key_file = private_key_file + self.cert_file = cert_file + + def sign_common_name(self, nonce: str) -> str: + return nonce + + def sign(self, content, return_str: bool) -> str: + return "signature" + + def verify_signature(self, content, signature) -> bool: + return True + + class SimulatorServer(FederatedServer): def __init__( self, @@ -143,6 +160,9 @@ def _create_server_engine(self, args, snapshot_persistor): server=self, args=args, client_manager=self.client_manager, snapshot_persistor=snapshot_persistor ) + def _get_id_asserter(self): + return SimulatorIdentityAsserter("private_key_file", "cert_file") + def deploy(self, args, grpc_args=None, secure_train=False): super(FederatedServer, self).deploy(args, grpc_args, secure_train) os.makedirs(os.path.join(args.workspace, "local"), exist_ok=True) diff --git a/tests/integration_test/data/projects/ha_2_servers_2_clients.yml b/tests/integration_test/data/projects/ha_2_servers_2_clients.yml deleted file mode 100644 index 211e48178d..0000000000 --- a/tests/integration_test/data/projects/ha_2_servers_2_clients.yml +++ /dev/null @@ -1,47 +0,0 @@ -api_version: 3 -name: integration_test -description: NVIDIA FLARE integration_test project yaml file - -participants: - - name: localhost - type: overseer - org: nvidia - protocol: https - api_root: /api/v1 - port: 8443 - - name: localhost0 - type: server - org: nvidia - fed_learn_port: 8002 - admin_port: 8003 - - name: localhost1 - type: server - org: nvidia - fed_learn_port: 8102 - admin_port: 8103 - - name: site-1 - type: client - org: nvidia - - 
name: site-2 - type: client - org: nvidia - - name: super@test.org - type: admin - org: nvidia - role: project_admin - -# The same methods in all builders are called in their order defined in builders section -builders: - - path: nvflare.lighter.impl.workspace.WorkspaceBuilder - args: - template_file: master_template.yml - - path: nvflare.lighter.impl.template.TemplateBuilder - - path: nvflare.lighter.impl.static_file.StaticFileBuilder - args: - config_folder: config - overseer_agent: - path: nvflare.ha.overseer_agent.HttpOverseerAgent - overseer_exists: true - - - path: nvflare.lighter.impl.cert.CertBuilder - - path: nvflare.lighter.impl.signature.SignatureBuilder diff --git a/tests/integration_test/data/test_configs/ha/fladminapi.yml b/tests/integration_test/data/test_configs/ha/fladminapi.yml deleted file mode 100644 index 60272014a0..0000000000 --- a/tests/integration_test/data/test_configs/ha/fladminapi.yml +++ /dev/null @@ -1,30 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_1_servers_2_clients.yml - - -tests: - - test_name: "run fl admin api" - validators: - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "server_job_log" - "data": "sent task assignment to client" - "actions": [ "run_admin_commands" ] - "result": - "type": "run_state" - "data": { } - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "mark_test_done" ] - "result": - "type": "run_state" - "data": { } diff --git a/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_after_first_round.yml b/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_after_first_round.yml deleted file mode 100644 index e15d731ca3..0000000000 --- a/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_after_first_round.yml +++ /dev/null @@ 
-1,60 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_2_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload slow_job, kill one server after we start training and the first round is completed in SAG, - second server should pick up the work and run to completion" - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 1 - } - } - } - "actions": [ - "kill server localhost0", - "sleep 5", - ] - "result": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 1 - } - } - } - - "trigger": - "type": "server_job_log" - "data": "Round 1 started" - "actions": [ "no_op" ] - "result": - "type": "run_state" - "data": { } - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 3, 4, 5 ], [ 6, 7, 8 ], [ 9, 10, 11 ] ] } diff --git a/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_before_first_round.yml b/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_before_first_round.yml deleted file mode 100644 index 599dd0c936..0000000000 --- a/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_before_first_round.yml +++ /dev/null @@ -1,60 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_2_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload slow_job, kill one server after we start training but no round is completed in SAG, - second server 
should pick up the work and run to completion" - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 0 - } - } - } - "actions": [ - "kill server localhost0", - "sleep 5", - ] - "result": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 0 - } - } - } - - "trigger": - "type": "server_job_log" - "data": "Round 0 started" - "actions": [ "no_op" ] - "result": - "type": "run_state" - "data": { } - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 3, 4, 5 ], [ 6, 7, 8 ], [ 9, 10, 11 ] ] } diff --git a/tests/integration_test/data/test_configs/ha/kill_server_after_training_complete.yml b/tests/integration_test/data/test_configs/ha/kill_server_after_training_complete.yml deleted file mode 100644 index 01961760d6..0000000000 --- a/tests/integration_test/data/test_configs/ha/kill_server_after_training_complete.yml +++ /dev/null @@ -1,33 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_1_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload np_sag and let it finish" - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job np_sag" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ - "kill server", - "sleep 10", - "start server", - ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - "trigger": - "type": 
"run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } diff --git a/tests/integration_test/data/test_configs/ha/kill_server_during_training_after_first_round.yml b/tests/integration_test/data/test_configs/ha/kill_server_during_training_after_first_round.yml deleted file mode 100644 index 1677ba762c..0000000000 --- a/tests/integration_test/data/test_configs/ha/kill_server_during_training_after_first_round.yml +++ /dev/null @@ -1,56 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_1_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload slow_job, kill the server during training after SAG first round, - restart it should pick up the work" - - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 1 - } - } - } - "actions": [ - "kill server", - "sleep 10", - "start server", - "sleep 1" - ] - "result": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 1 - } - } - } - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 3, 4, 5 ], [ 6, 7, 8 ], [ 9, 10, 11 ] ] } diff --git a/tests/integration_test/data/test_configs/ha/kill_server_during_training_before_first_round.yml b/tests/integration_test/data/test_configs/ha/kill_server_during_training_before_first_round.yml deleted file mode 100644 index 302c6324d8..0000000000 --- 
a/tests/integration_test/data/test_configs/ha/kill_server_during_training_before_first_round.yml +++ /dev/null @@ -1,56 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_1_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload slow_job, kill the server after we start training but no round is completed in SAG, - restart it should pick up the work" - - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 0 - } - } - } - "actions": [ - "kill server", - "sleep 10", - "start server", - "sleep 1" - ] - "result": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 0 - } - } - } - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 3, 4, 5 ], [ 6, 7, 8 ], [ 9, 10, 11 ] ] } diff --git a/tests/integration_test/data/test_configs/ha/kill_server_during_training_sending_model.yml b/tests/integration_test/data/test_configs/ha/kill_server_during_training_sending_model.yml deleted file mode 100644 index 5fea19adf0..0000000000 --- a/tests/integration_test/data/test_configs/ha/kill_server_during_training_sending_model.yml +++ /dev/null @@ -1,48 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_1_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload slow_job, kill the server during sending models to clients, - restart it should pick up the work" - - event_sequence: - - "trigger": - "type": "server_log" - "data": 
"Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "server_job_log" - "data": "sent task assignment to client" - "actions": [ - "kill server", - "sleep 10", - "start server", - "sleep 1" - ] - "result": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 0 - } - } - } - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 3, 4, 5 ], [ 6, 7, 8 ], [ 9, 10, 11 ] ] } diff --git a/tests/integration_test/data/test_configs/ha/two_servers.yml b/tests/integration_test/data/test_configs/ha/two_servers.yml deleted file mode 100644 index feb0b43dde..0000000000 --- a/tests/integration_test/data/test_configs/ha/two_servers.yml +++ /dev/null @@ -1,26 +0,0 @@ -ha: True -jobs_root_dir: ./data/jobs -cleanup: True -project_yaml: ./data/projects/ha_2_servers_2_clients.yml - - -tests: - - test_name: "upload a job, wait for it to finish" - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job hello-numpy-sag" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 4, 5, 6 ], [ 7, 8, 9 ], [ 10, 11, 12 ] ] } diff --git a/tests/integration_test/run_integration_tests.sh b/tests/integration_test/run_integration_tests.sh index 50d34e6c9d..299308b49f 100755 --- a/tests/integration_test/run_integration_tests.sh +++ b/tests/integration_test/run_integration_tests.sh @@ -3,7 
+3,7 @@ set -e PYTHONPATH="${PWD}/../.." -backends=(numpy tensorflow pytorch overseer ha auth preflight cifar auto stats xgboost client_api client_api_qa model_controller_api) +backends=(numpy tensorflow pytorch overseer auth preflight cifar auto stats xgboost client_api client_api_qa model_controller_api) usage() { diff --git a/tests/integration_test/test_configs.yml b/tests/integration_test/test_configs.yml index 52aa9455de..867bddd327 100644 --- a/tests/integration_test/test_configs.yml +++ b/tests/integration_test/test_configs.yml @@ -6,15 +6,6 @@ test_configs: - ./data/test_configs/authorization/abort_job.yml - ./data/test_configs/authorization/list_job.yml - ./data/test_configs/authorization/shell_commands.yml - ha: - - ./data/test_configs/ha/kill_one_server_during_training_after_first_round.yml - - ./data/test_configs/ha/kill_one_server_during_training_before_first_round.yml - - ./data/test_configs/ha/kill_server_after_training_complete.yml - - ./data/test_configs/ha/kill_server_during_training_after_first_round.yml - - ./data/test_configs/ha/kill_server_during_training_before_first_round.yml - - ./data/test_configs/ha/kill_server_during_training_sending_model.yml - - ./data/test_configs/ha/two_servers.yml - - ./data/test_configs/ha/fladminapi.yml numpy: - ./data/test_configs/standalone_job/np_job.yml - ./data/test_configs/standalone_job/np_app.yml diff --git a/tests/unit_test/app_opt/quantization/quantization_test.py b/tests/unit_test/app_opt/quantization/quantization_test.py index b8b2a6fb35..5f452ca3a5 100644 --- a/tests/unit_test/app_opt/quantization/quantization_test.py +++ b/tests/unit_test/app_opt/quantization/quantization_test.py @@ -27,21 +27,21 @@ "float16", {"a": np.array([1.0, 2.0, 3.0, 65504.0], dtype="float32")}, ), - ( - {"a": np.array([1.0, 2.0, 3.0, 4.0], dtype="float32")}, - "blockwise8", - {"a": np.array([0.99062496, 2.003125, 3.015625, 4.0], dtype="float32")}, - ), + # ( + # {"a": np.array([1.0, 2.0, 3.0, 4.0], dtype="float32")}, + # 
"blockwise8", + # {"a": np.array([0.99062496, 2.003125, 3.015625, 4.0], dtype="float32")}, + # ), ( {"a": torch.tensor([1.0, 2.0, 3.0, 4000.0], dtype=torch.bfloat16)}, "float16", {"a": torch.tensor([1.0, 2.0, 3.0, 4000.0], dtype=torch.bfloat16)}, ), - ( - {"a": torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32)}, - "blockwise8", - {"a": torch.tensor([0.99062496, 2.003125, 3.015625, 4.0], dtype=torch.float32)}, - ), + # ( + # {"a": torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32)}, + # "blockwise8", + # {"a": torch.tensor([0.99062496, 2.003125, 3.015625, 4.0], dtype=torch.float32)}, + # ), ]