diff --git a/.github/workflows/CI_build.yml b/.github/workflows/CI_build.yml new file mode 100644 index 0000000000..b4c3642b09 --- /dev/null +++ b/.github/workflows/CI_build.yml @@ -0,0 +1,47 @@ +# This is a basic workflow to help you get started with Actions + +name: CI-build + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the master branch + schedule: + # Nightly build at 12:12 A.M. + - cron: "0 10 */1 * *" + pull_request: + branches: [ master, dev/v0.7.0 ] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + build: + runs-on: ["${{ matrix.python-version }}","${{ matrix.os }}"] + strategy: + fail-fast: false + matrix: + os: [ Linux, Windows ] + arch: [X64] + python-version: ['python3.8', 'python3.9', 'python3.10', 'python3.11'] + + timeout-minutes: 5 + steps: + - name: Checkout fedml + uses: actions/checkout@v3 + + - name: pip_install + run: | + cd python + pip install -e ./ + + - name: login + run: | + fedml logout + fedml login $API_KEY + + - name: pylint + run: | + cd python + echo "Pylint has been run successfully!" + diff --git a/.github/workflows/CI_deploy.yml b/.github/workflows/CI_deploy.yml new file mode 100644 index 0000000000..982f65b3c5 --- /dev/null +++ b/.github/workflows/CI_deploy.yml @@ -0,0 +1,43 @@ +# This is a basic workflow to help you get started with Actions + +name: CI-deploy + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the master branch + schedule: + # Nightly build at 12:12 A.M. 
+ - cron: "0 10 */1 * *" + pull_request: + branches: [ master, dev/v0.7.0 ] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + deploy: + runs-on: ["${{ matrix.python-version }}","${{ matrix.os }}"] + strategy: + fail-fast: false + matrix: + os: [ Linux, Windows ] + arch: [X64] + python-version: ['python3.8', 'python3.9', 'python3.10', 'python3.11'] + + timeout-minutes: 5 + steps: + - name: Checkout fedml + uses: actions/checkout@v3 + + - name: pip_install + run: | + cd python + pip install -e ./ + + - name: serving_job_in_test_env + run: | + cd python + echo "Serving example has been tested successfully!" + python tests/test_deploy/test_deploy.py + diff --git a/.github/workflows/CI_federate.yml b/.github/workflows/CI_federate.yml new file mode 100644 index 0000000000..1302771b1d --- /dev/null +++ b/.github/workflows/CI_federate.yml @@ -0,0 +1,42 @@ +# This is a basic workflow to help you get started with Actions + +name: CI-federate + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the master branch + schedule: + # Nightly build at 12:12 A.M. 
+ - cron: "0 10 */1 * *" + pull_request: + branches: [ master, dev/v0.7.0 ] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + federate: + strategy: + fail-fast: false + matrix: + os: [ Linux, Windows ] + arch: [X64] + python-version: ['python3.8', 'python3.9', 'python3.10', 'python3.11'] + + runs-on: ["${{ matrix.python-version }}","${{ matrix.os }}"] + timeout-minutes: 5 + steps: + - name: Checkout fedml + uses: actions/checkout@v3 + + - name: pip_install + run: | + cd python + pip install -e ./ + + - name: federate_job_in_test_env + run: | + cd python + bash tests/test_federate/test_federate.sh + echo "Federate example has been tested successfully!" diff --git a/.github/workflows/CI_launch.yml b/.github/workflows/CI_launch.yml new file mode 100644 index 0000000000..13519c41f2 --- /dev/null +++ b/.github/workflows/CI_launch.yml @@ -0,0 +1,43 @@ +# This is a basic workflow to help you get started with Actions + +name: CI-launch + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the master branch + schedule: + # Nightly build at 12:12 A.M. 
+ - cron: "0 10 */1 * *" + pull_request: + branches: [ master, dev/v0.7.0 ] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + launch: + + strategy: + fail-fast: false + matrix: + os: [ Linux, Windows ] + arch: [X64] + python-version: ['python3.8','python3.9','python3.10','python3.11'] + + runs-on: ["${{ matrix.python-version }}","${{ matrix.os }}"] + timeout-minutes: 5 + steps: + - name: Checkout fedml + uses: actions/checkout@v3 + + - name: pip_install + run: | + cd python + pip install -e ./ + + - name: launch_job_in_test_env + run: | + cd python + python tests/test_launch/test_launch.py + echo "Launch example has been tested successfully!" diff --git a/.github/workflows/CI_train.yml b/.github/workflows/CI_train.yml new file mode 100644 index 0000000000..2acbcc12a0 --- /dev/null +++ b/.github/workflows/CI_train.yml @@ -0,0 +1,42 @@ +# This is a basic workflow to help you get started with Actions + +name: CI-train + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the master branch + schedule: + # Nightly build at 12:12 A.M. 
+ - cron: "0 10 */1 * *" + pull_request: + branches: [ master, dev/v0.7.0 ] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + train: + runs-on: ["${{ matrix.python-version }}","${{ matrix.os }}"] + strategy: + fail-fast: false + matrix: + os: [ Linux, Windows ] + arch: [X64] + python-version: ['python3.8', 'python3.9', 'python3.10', 'python3.11'] + timeout-minutes: 5 + steps: + - name: Checkout fedml + uses: actions/checkout@v3 + + - name: pip_install + run: | + cd python + pip install -e ./ + + - name: training_job_in_test_env + run: | + cd python + python tests/test_train/test_train.py + echo "Train example has been tested successfully!" + diff --git a/.github/workflows/README.md b/.github/workflows/README.md new file mode 100644 index 0000000000..668cb9b302 --- /dev/null +++ b/.github/workflows/README.md @@ -0,0 +1,97 @@ +# 1. Design + +![Design](image.png) + +## Design principles + +The CI tests need to be comprehensive, covering typical scenarios only, achievable within 5 minutes. + +# 2. Registry Self-Host Runners + +## 2.1 Linux Runners + +### Step1: Build linux images + +Build all the linux images for Self-Host Runners. +``` +cd registry-runners +bash build_linux_runners.sh +``` + +### Step2: Specify the token and key. +Find your GitHub runner token and your test-account apikey. + +For the argument YourGitHubRunnerToken, Navigate the path `Settings -> Actions -> Runners -> New self-hosted runner` to get. + +In the Configure section, you will find the similar line: +./config.sh --url https://github.com/FedML-AI/FedML --token AXRYPL6G2VHVGDFDQQS5XA3ELYI6M to get YourGitHubRunnerToken to value of --token + +### Step3: Registry all the runners. 
+Register the runners by running the `run_linux_runners.sh` script +``` +bash run_linux_runners.sh [YourGitRepo] [YourGitHubRunnerToken] [YourTestAccountApiKey] +``` +for example +``` +bash run_linux_runners.sh FedML-AI/FedML AXRYPLZLZN6XVJB3BAIXSP3EMFC7U 11215dkevvdkegged +``` +### Step4: Verify Success + +Check that all the runners are registered successfully by navigating to `Settings -> Actions -> Runners` and confirming that all your runners are active. + +## 2.2 Windows Runners + +### Step1: Install Anaconda packages +Install Anaconda or Miniconda on a Windows machine. Anaconda and Miniconda can manage your Python environments. + +### Step2: Create python environments +Create 4 python environments named `python38`, `python39`, `python310` and `python311` for different runners. +Specify the python version to install. +For example +``` +conda create -n python38 python==3.8 +``` +### Step3: Create directories +Create 4 directories named `actions-runner-python38`, `actions-runner-python39`, `actions-runner-python310` and `actions-runner-python311` for different runners. +### Step4: Install the latest runner package. +Follow the instructions at `Settings -> Actions -> Runners -> New self-hosted runner` to add a new Windows runner. Note that you only need to download and extract the files into the directories created in Step 3; configuration and running will be done through a script later. + +### Step5: Register all the runners. +Run the script `./registry-runners/windows.ps1` to register all the runners with your GitHub repository. Replace the variables `$REPO`, `$ACCESS_TOKEN` and `$WORKPLACE` with actual values. Note that you can get your $ACCESS_TOKEN from the following path: `Settings -> Actions -> Runners -> New self-hosted runner`. +In the Configure section, you will find a line similar to `./config.sh --url https://github.com/FedML-AI/FedML --token AXRYPL6G2VHVGDFDQQS5XA3ELYI6M`; the value after `--token` is your `$ACCESS_TOKEN`.
+ +### Step6: Verify Success +Check that the runners are registered successfully by navigating to `Settings -> Actions -> Runners`. Make sure that all your runners are active. + +## 2.3 Mac Runners + +# 3. Bind Test Machines + +Bind the actual machines that will run the test training jobs. Follow this document to bind your test machines: +https://docs.tensoropera.ai/share-and-earn + +Note that we need to bind our machines to the test environment. + +Specify the computing resource type to which you have bound your machines. Your job will be scheduled to that machine. + +# 4. Trigger + +Opening a PR triggers all tests automatically. + +Run a single test on a specific branch from the GitHub Actions tab. + +Schedule daily runs at a specific time by configuring your workflow YAML. You can check the results in the GitHub Actions tab. + +# 5. Add a new CI test + +Creating a new workflow YAML file, such as CI_launch.yaml or CI_train.yaml, allows you to add a CI test that is independent of the current business. + +Adding a new CI test to the current business can be done by placing your test in the path python/tests/test_{business}/test_file.py and ensuring that your workflow YAML can run that Python test script. + +Ensuring your workflow YAML is configured correctly will enable it to run the new test automatically. + +# 6. TODO + +Implement the Mac runners.
+ diff --git a/.github/workflows/build_wheels_and_releases.yml-backup b/.github/workflows/deprecated/build_wheels_and_releases.yml-backup similarity index 100% rename from .github/workflows/build_wheels_and_releases.yml-backup rename to .github/workflows/deprecated/build_wheels_and_releases.yml-backup diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/deprecated/codeql-analysis.yml similarity index 100% rename from .github/workflows/codeql-analysis.yml rename to .github/workflows/deprecated/codeql-analysis.yml diff --git a/.github/workflows/full_e2e_test.yml-bakcup b/.github/workflows/deprecated/full_e2e_test.yml-bakcup similarity index 100% rename from .github/workflows/full_e2e_test.yml-bakcup rename to .github/workflows/deprecated/full_e2e_test.yml-bakcup diff --git a/.github/workflows/pylint.yml b/.github/workflows/deprecated/pylint.yml similarity index 89% rename from .github/workflows/pylint.yml rename to .github/workflows/deprecated/pylint.yml index cdc3800869..402bf72895 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/deprecated/pylint.yml @@ -28,13 +28,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: Analysing the code with pylint diff --git a/.github/workflows/deprecated/python-package-conda.yml b/.github/workflows/deprecated/python-package-conda.yml new file mode 100644 index 0000000000..f3586044ab --- /dev/null +++ b/.github/workflows/deprecated/python-package-conda.yml @@ -0,0 +1,34 @@ +name: Python Package using Conda + +on: [push] + +jobs: + build-linux: + runs-on: ubuntu-latest + 
strategy: + max-parallel: 5 + + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: '3.10' + - name: Add conda to system path + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + echo $CONDA/bin >> $GITHUB_PATH + - name: Install dependencies + run: | + conda env update --file environment.yml --name base + - name: Lint with flake8 + run: | + conda install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + conda install pytest + pytest diff --git a/.github/workflows/runner.md b/.github/workflows/deprecated/runner.md similarity index 100% rename from .github/workflows/runner.md rename to .github/workflows/deprecated/runner.md diff --git a/.github/workflows/smoke_test_cross_device_mnn_server_linux.yml b/.github/workflows/deprecated/smoke_test_cross_device_mnn_server_linux.yml similarity index 88% rename from .github/workflows/smoke_test_cross_device_mnn_server_linux.yml rename to .github/workflows/deprecated/smoke_test_cross_device_mnn_server_linux.yml index c8fff7e4f1..10c9860d0f 100644 --- a/.github/workflows/smoke_test_cross_device_mnn_server_linux.yml +++ b/.github/workflows/deprecated/smoke_test_cross_device_mnn_server_linux.yml @@ -52,13 +52,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ 
steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -67,7 +70,9 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + cd python + pip install -e ./ + # bash ./devops/scripts/sync-fedml-pip.sh - name: Install MNN working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} @@ -79,6 +84,6 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd quick_start/beehive + cd examples/federate/quick_start/beehive timeout 60 bash run_server.sh || code=$?; if [[ $code -ne 124 && $code -ne 0 ]]; then exit $code; fi diff --git a/.github/workflows/smoke_test_cross_silo_fedavg_attack_linux.yml b/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_attack_linux.yml similarity index 83% rename from .github/workflows/smoke_test_cross_silo_fedavg_attack_linux.yml rename to .github/workflows/deprecated/smoke_test_cross_silo_fedavg_attack_linux.yml index b1c29fcfd7..ea0c4ed601 100644 --- a/.github/workflows/smoke_test_cross_silo_fedavg_attack_linux.yml +++ b/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_attack_linux.yml @@ -29,8 +29,8 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-latest] - arch: [X64] + os: [ ubuntu-latest ] + arch: [ X64 ] python-version: ['3.8'] client-index: ['0', '1', '2', '3', '4'] # exclude: @@ -38,7 +38,7 @@ jobs: # python-version: '3.8' # - os: windows-latest # python-version: '3.6' - runs-on: [ self-hosted, Linux ] + runs-on: [ self-hosted ] timeout-minutes: 15 steps: - name: Extract branch name @@ -53,13 +53,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - 
path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -68,13 +71,16 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + cd python + pip install -e ./ + # bash ./devops/srcipts/install-fedml.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - attack working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/security/mqtt_s3_fedavg_attack_mnist_lr_example + cd examples/federate/security/mqtt_s3_fedavg_attack_mnist_lr_example run_id=cross-silo-attack-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -84,7 +90,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/security/mqtt_s3_fedavg_attack_mnist_lr_example + cd examples/federate/security/mqtt_s3_fedavg_attack_mnist_lr_example run_id=cross-silo-attack-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -94,7 +100,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/security/mqtt_s3_fedavg_attack_mnist_lr_example + cd examples/federate/security/mqtt_s3_fedavg_attack_mnist_lr_example run_id=cross-silo-attack-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id @@ -104,7 +110,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/security/mqtt_s3_fedavg_attack_mnist_lr_example + cd examples/federate/security/mqtt_s3_fedavg_attack_mnist_lr_example run_id=cross-silo-attack-${{ 
format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 3 $run_id @@ -114,7 +120,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/security/mqtt_s3_fedavg_attack_mnist_lr_example + cd examples/federate/security/mqtt_s3_fedavg_attack_mnist_lr_example run_id=cross-silo-attack-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 4 $run_id diff --git a/.github/workflows/smoke_test_cross_silo_fedavg_cdp_linux.yml b/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_cdp_linux.yml similarity index 87% rename from .github/workflows/smoke_test_cross_silo_fedavg_cdp_linux.yml rename to .github/workflows/deprecated/smoke_test_cross_silo_fedavg_cdp_linux.yml index 67ee9e4a0f..051c0418d2 100644 --- a/.github/workflows/smoke_test_cross_silo_fedavg_cdp_linux.yml +++ b/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_cdp_linux.yml @@ -53,13 +53,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -68,13 +71,13 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - cdp working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example + cd 
examples/federate/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -84,7 +87,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example + cd examples/federate/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -94,7 +97,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example + cd examples/federate/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/smoke_test_cross_silo_fedavg_defense_linux.yml b/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_defense_linux.yml similarity index 86% rename from .github/workflows/smoke_test_cross_silo_fedavg_defense_linux.yml rename to .github/workflows/deprecated/smoke_test_cross_silo_fedavg_defense_linux.yml index fac19d9552..b9348d7bf2 100644 --- a/.github/workflows/smoke_test_cross_silo_fedavg_defense_linux.yml +++ b/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_defense_linux.yml @@ -53,13 +53,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo 
"dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -68,13 +71,13 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - defense working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/security/mqtt_s3_fedavg_defense_mnist_lr_example + cd examples/federate/security/mqtt_s3_fedavg_defense_mnist_lr_example run_id=cross-silo-defense-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -84,7 +87,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/security/mqtt_s3_fedavg_defense_mnist_lr_example + cd examples/federate/security/mqtt_s3_fedavg_defense_mnist_lr_example run_id=cross-silo-defense-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -94,7 +97,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/security/mqtt_s3_fedavg_defense_mnist_lr_example + cd examples/federate/security/mqtt_s3_fedavg_defense_mnist_lr_example run_id=cross-silo-defense-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id @@ -104,7 +107,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/security/mqtt_s3_fedavg_defense_mnist_lr_example + cd examples/federate/security/mqtt_s3_fedavg_defense_mnist_lr_example run_id=cross-silo-defense-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 3 $run_id @@ -114,7 +117,7 @@ jobs: working-directory: ${{ 
steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/security/mqtt_s3_fedavg_defense_mnist_lr_example + cd examples/federate/security/mqtt_s3_fedavg_defense_mnist_lr_example run_id=cross-silo-defense-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 4 $run_id diff --git a/.github/workflows/smoke_test_cross_silo_fedavg_ldp_linux.yml b/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_ldp_linux.yml similarity index 87% rename from .github/workflows/smoke_test_cross_silo_fedavg_ldp_linux.yml rename to .github/workflows/deprecated/smoke_test_cross_silo_fedavg_ldp_linux.yml index def8aca733..f849c4db71 100644 --- a/.github/workflows/smoke_test_cross_silo_fedavg_ldp_linux.yml +++ b/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_ldp_linux.yml @@ -53,13 +53,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -68,13 +71,13 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - ldp working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example + cd examples/federate/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh 
$run_id @@ -84,7 +87,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example + cd examples/federate/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -94,7 +97,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example + cd examples/federate/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/smoke_test_cross_silo_ho_linux.yml b/.github/workflows/deprecated/smoke_test_cross_silo_ho_linux.yml similarity index 89% rename from .github/workflows/smoke_test_cross_silo_ho_linux.yml rename to .github/workflows/deprecated/smoke_test_cross_silo_ho_linux.yml index e34a22cdbe..7d28a37292 100644 --- a/.github/workflows/smoke_test_cross_silo_ho_linux.yml +++ b/.github/workflows/deprecated/smoke_test_cross_silo_ho_linux.yml @@ -53,13 +53,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -68,13 +71,13 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: 
server - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd quick_start/octopus + cd examples/federate/quick_start/octopus run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -84,7 +87,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd quick_start/octopus + cd examples/federate/quick_start/octopus run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -94,7 +97,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd quick_start/octopus + cd examples/federate/quick_start/octopus run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/smoke_test_cross_silo_ho_win.yml b/.github/workflows/deprecated/smoke_test_cross_silo_ho_win.yml similarity index 88% rename from .github/workflows/smoke_test_cross_silo_ho_win.yml rename to .github/workflows/deprecated/smoke_test_cross_silo_ho_win.yml index b8376438d7..d9239bcb99 100644 --- a/.github/workflows/smoke_test_cross_silo_ho_win.yml +++ b/.github/workflows/deprecated/smoke_test_cross_silo_ho_win.yml @@ -52,13 +52,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -67,25 +70,25 @@ jobs: 
homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd quick_start/octopus + cd examples/federate/quick_start/octopus .\run_server.bat ${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '0' }} - name: client 1 - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd quick_start/octopus + cd examples/federate/quick_start/octopus .\run_client.bat 1 ${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '1' }} - name: client 2 - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd quick_start/octopus + cd examples/federate/quick_start/octopus .\run_client.bat 2 ${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '2' }} \ No newline at end of file diff --git a/.github/workflows/smoke_test_cross_silo_lightsecagg_linux.yml b/.github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_linux.yml similarity index 88% rename from .github/workflows/smoke_test_cross_silo_lightsecagg_linux.yml rename to .github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_linux.yml index d672e2a772..ae06088dc7 100644 --- a/.github/workflows/smoke_test_cross_silo_lightsecagg_linux.yml +++ b/.github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_linux.yml @@ -53,13 +53,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo 
"running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -68,13 +71,13 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - lightsecagg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/cross_silo/light_sec_agg_example + cd examples/federate/cross_silo/light_sec_agg_example run_id=cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -84,7 +87,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/cross_silo/light_sec_agg_example + cd examples/federate/cross_silo/light_sec_agg_example run_id=cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -94,7 +97,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/cross_silo/light_sec_agg_example + cd examples/federate/cross_silo/light_sec_agg_example run_id=cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/smoke_test_cross_silo_lightsecagg_win.yml b/.github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_win.yml similarity index 88% rename from .github/workflows/smoke_test_cross_silo_lightsecagg_win.yml rename to .github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_win.yml index 8deab9acb2..40d15a1f0f 100644 --- 
a/.github/workflows/smoke_test_cross_silo_lightsecagg_win.yml +++ b/.github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_win.yml @@ -52,13 +52,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -67,25 +70,25 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/cross_silo/light_sec_agg_example + cd examples/federate/cross_silo/light_sec_agg_example .\run_server.bat cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '0' }} - name: client 1 - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/cross_silo/light_sec_agg_example + cd examples/federate/cross_silo/light_sec_agg_example .\run_client.bat 1 cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '1' }} - name: client 2 - cross-silo - lightsecagg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/cross_silo/light_sec_agg_example + cd examples/federate/cross_silo/light_sec_agg_example .\run_client.bat 2 cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) 
}} if: ${{ matrix.client-index == '2' }} \ No newline at end of file diff --git a/.github/workflows/smoke_test_flow_linux.yml b/.github/workflows/deprecated/smoke_test_flow_linux.yml similarity index 92% rename from .github/workflows/smoke_test_flow_linux.yml rename to .github/workflows/deprecated/smoke_test_flow_linux.yml index df876a632b..5293787a11 100644 --- a/.github/workflows/smoke_test_flow_linux.yml +++ b/.github/workflows/deprecated/smoke_test_flow_linux.yml @@ -43,13 +43,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -58,7 +61,7 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: server - Flow working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} diff --git a/.github/workflows/smoke_test_ml_engines_linux_jax.yml b/.github/workflows/deprecated/smoke_test_ml_engines_linux_jax.yml similarity index 87% rename from .github/workflows/smoke_test_ml_engines_linux_jax.yml rename to .github/workflows/deprecated/smoke_test_ml_engines_linux_jax.yml index 42a6d25ead..cd4bd8d720 100644 --- a/.github/workflows/smoke_test_ml_engines_linux_jax.yml +++ b/.github/workflows/deprecated/smoke_test_ml_engines_linux_jax.yml @@ -53,13 +53,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML 
cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -68,14 +71,14 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh cd $homepath/python - name: server - jax - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example run_id=jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -85,7 +88,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example run_id=jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -95,7 +98,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example run_id=jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/smoke_test_ml_engines_linux_mxnet.yml b/.github/workflows/deprecated/smoke_test_ml_engines_linux_mxnet.yml similarity index 87% rename from .github/workflows/smoke_test_ml_engines_linux_mxnet.yml rename to 
.github/workflows/deprecated/smoke_test_ml_engines_linux_mxnet.yml index bf30fd1b1a..5ce217ea4b 100644 --- a/.github/workflows/smoke_test_ml_engines_linux_mxnet.yml +++ b/.github/workflows/deprecated/smoke_test_ml_engines_linux_mxnet.yml @@ -53,13 +53,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -68,7 +71,7 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh cd $homepath/python pip install mxnet==2.0.0b1 @@ -76,7 +79,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example run_id=mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -86,7 +89,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example run_id=mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -96,7 +99,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example 
+ cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example run_id=mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/smoke_test_ml_engines_linux_tf.yml b/.github/workflows/deprecated/smoke_test_ml_engines_linux_tf.yml similarity index 87% rename from .github/workflows/smoke_test_ml_engines_linux_tf.yml rename to .github/workflows/deprecated/smoke_test_ml_engines_linux_tf.yml index 9d69ba3774..3b7519dd97 100644 --- a/.github/workflows/smoke_test_ml_engines_linux_tf.yml +++ b/.github/workflows/deprecated/smoke_test_ml_engines_linux_tf.yml @@ -53,13 +53,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -68,14 +71,14 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh cd $homepath/python - name: server - tensorflow - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example run_id=tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -85,7 +88,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd 
examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example run_id=tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -95,7 +98,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example run_id=tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/smoke_test_ml_engines_win.yml b/.github/workflows/deprecated/smoke_test_ml_engines_win.yml similarity index 90% rename from .github/workflows/smoke_test_ml_engines_win.yml rename to .github/workflows/deprecated/smoke_test_ml_engines_win.yml index f1f3bfabd4..8913cc6bec 100644 --- a/.github/workflows/smoke_test_ml_engines_win.yml +++ b/.github/workflows/deprecated/smoke_test_ml_engines_win.yml @@ -46,13 +46,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -61,28 +64,28 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh cd $homepath/python pip install -e '.[tensorflow]' - name: server - tensorflow - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} 
run: | - cd examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example python tf_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '0' }} - name: client 1 - tensorflow - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example python3 tf_client.py --cf config/fedml_config.yaml --rank 1 --role client --run_id tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '1' }} - name: client 2 - tensorflow - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example python3 tf_client.py --cf config/fedml_config.yaml --rank 2 --role client --run_id tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '2' }} @@ -138,21 +141,21 @@ jobs: - name: server - jax - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example python tf_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '0' }} - name: client 1 - jax - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd 
examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example python3 tf_client.py --cf config/fedml_config.yaml --rank 1 --role client --run_id jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '1' }} - name: client 2 - jax - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example python3 tf_client.py --cf config/fedml_config.yaml --rank 2 --role client --run_id jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '2' }} @@ -208,20 +211,20 @@ jobs: - name: server - mxnet - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example python tf_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '0' }} - name: client 1 - mxnet - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example python3 tf_client.py --cf config/fedml_config.yaml --rank 1 --role client --run_id mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '1' }} - name: client 2 - mxnet - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example + cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example python3 
tf_client.py --cf config/fedml_config.yaml --rank 2 --role client --run_id mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '2' }} diff --git a/.github/workflows/smoke_test_pip_cli_sp_linux.yml b/.github/workflows/deprecated/smoke_test_pip_cli_sp_linux.yml similarity index 80% rename from .github/workflows/smoke_test_pip_cli_sp_linux.yml rename to .github/workflows/deprecated/smoke_test_pip_cli_sp_linux.yml index 131d88de9b..006ecfb574 100644 --- a/.github/workflows/smoke_test_pip_cli_sp_linux.yml +++ b/.github/workflows/deprecated/smoke_test_pip_cli_sp_linux.yml @@ -54,13 +54,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -69,20 +72,20 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - - name: test "fedml login" and "fedml build" - working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} - run: | - cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd tests/smoke_test/cli - bash login.sh - bash build.sh + # - name: test "fedml login" and "fedml build" + # working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} + # run: | + # cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python + # cd tests/smoke_test/cli + # bash login.sh + # bash build.sh - name: test simulation-sp working-directory: ${{ 
steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd quick_start/parrot + cd examples/federate/quick_start/parrot python torch_fedavg_mnist_lr_one_line_example.py --cf fedml_config.yaml python torch_fedavg_mnist_lr_custum_data_and_model_example.py --cf fedml_config.yaml @@ -90,40 +93,40 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/simulation/sp_decentralized_mnist_lr_example + cd examples/federate/simulation/sp_decentralized_mnist_lr_example python torch_fedavg_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - name: test sp - sp_fednova_mnist_lr_example working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/simulation/sp_fednova_mnist_lr_example + cd examples/federate/simulation/sp_fednova_mnist_lr_example python torch_fednova_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - name: test sp - sp_fedopt_mnist_lr_example working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/simulation/sp_fedopt_mnist_lr_example + cd examples/federate/simulation/sp_fedopt_mnist_lr_example python torch_fedopt_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - name: test sp - sp_hierarchicalfl_mnist_lr_example working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/simulation/sp_hierarchicalfl_mnist_lr_example + cd examples/federate/simulation/sp_hierarchicalfl_mnist_lr_example python torch_hierarchicalfl_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - name: test sp - sp_turboaggregate_mnist_lr_example working-directory: ${{ 
steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/simulation/sp_turboaggregate_mnist_lr_example + cd examples/federate/simulation/sp_turboaggregate_mnist_lr_example python torch_turboaggregate_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - name: test sp - sp_vertical_mnist_lr_example working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/simulation/sp_vertical_mnist_lr_example + cd examples/federate/simulation/sp_vertical_mnist_lr_example python torch_vertical_mnist_lr_step_by_step_example.py --cf fedml_config.yaml diff --git a/.github/workflows/smoke_test_pip_cli_sp_win.yml b/.github/workflows/deprecated/smoke_test_pip_cli_sp_win.yml similarity index 90% rename from .github/workflows/smoke_test_pip_cli_sp_win.yml rename to .github/workflows/deprecated/smoke_test_pip_cli_sp_win.yml index 69dac083bb..3987f90f74 100644 --- a/.github/workflows/smoke_test_pip_cli_sp_win.yml +++ b/.github/workflows/deprecated/smoke_test_pip_cli_sp_win.yml @@ -51,13 +51,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -66,7 +69,7 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: test "fedml login" and "fedml build" working-directory: ${{ 
steps.fedml_source_code_home.outputs.dir }} @@ -77,6 +80,6 @@ jobs: - name: test simulation-sp working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd quick_start/parrot + cd examples/federate/quick_start/parrot python torch_fedavg_mnist_lr_one_line_example.py --cf fedml_config.yaml python torch_fedavg_mnist_lr_custum_data_and_model_example.py --cf fedml_config.yaml diff --git a/.github/workflows/smoke_test_security.yml b/.github/workflows/deprecated/smoke_test_security.yml similarity index 91% rename from .github/workflows/smoke_test_security.yml rename to .github/workflows/deprecated/smoke_test_security.yml index 6644a4b513..5d5c03ee38 100644 --- a/.github/workflows/smoke_test_security.yml +++ b/.github/workflows/deprecated/smoke_test_security.yml @@ -54,13 +54,16 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/actions-runner/fedml-master + path=/home/fedml/FedML cd $path + git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/actions-runner/fedml-dev + path=/home/fedml/FedML cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -69,7 +72,7 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: attack tests working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} diff --git a/.github/workflows/smoke_test_simulation_mpi_linux.yml b/.github/workflows/deprecated/smoke_test_simulation_mpi_linux.yml similarity index 73% rename from .github/workflows/smoke_test_simulation_mpi_linux.yml rename to .github/workflows/deprecated/smoke_test_simulation_mpi_linux.yml index c48cc43149..b2e9676ae9 100644 --- 
a/.github/workflows/smoke_test_simulation_mpi_linux.yml +++ b/.github/workflows/deprecated/smoke_test_simulation_mpi_linux.yml @@ -40,8 +40,8 @@ jobs: - os: ubuntu-latest mpi: mpich install-mpi: | - sudo apt-get update - sudo apt install -y mpich libmpich-dev + apt-get update + apt install -y mpich libmpich-dev # - os: ubuntu-latest # mpi: openmpi # install-mpi: sudo apt install -y openmpi-bin libopenmpi-dev @@ -50,6 +50,12 @@ jobs: shell: bash run: echo "branch=$(echo ${GITHUB_REF#refs/heads/})" >>$GITHUB_OUTPUT id: extract_branch + - name: Install MPI + if: matrix.mpi == 'mpich' + run: | + apt-get update + apt-get install -y mpich libmpich-dev + - id: fedml_source_code_home name: cd to master or dev branch and git pull shell: bash @@ -57,15 +63,18 @@ jobs: ls echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then - echo "running on master" - path=/home/actions-runner/fedml-master - cd $path - echo "dir=$path" >> $GITHUB_OUTPUT + echo "running on master" + path=/home/fedml/FedML + cd $path + git pull + echo "dir=$path" >> $GITHUB_OUTPUT else - echo "running on dev" - path=/home/actions-runner/fedml-dev - cd $path - echo "dir=$path" >> $GITHUB_OUTPUT + echo "running on dev" + path=/home/fedml/FedML + cd $path + git pull + git checkout ${{ steps.extract_branch.outputs.branch }} + echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} @@ -73,47 +82,47 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - bash ./devops/scripts/sync-fedml-pip.sh + # bash ./devops/scripts/sync-fedml-pip.sh - name: Test package - FedAvg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | pwd cd python - cd examples/simulation/mpi_torch_fedavg_mnist_lr_example + cd examples/federate/simulation/mpi_torch_fedavg_mnist_lr_example sh run_custom_data_and_model_example.sh 
4 - name: Test package - Base working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/simulation/mpi_base_framework_example + cd examples/federate/simulation/mpi_base_framework_example sh run.sh 4 - name: Test package - Decentralized working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/simulation/mpi_decentralized_fl_example + cd examples/federate/simulation/mpi_decentralized_fl_example sh run.sh 4 - name: Test package - FedOPT working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/simulation/mpi_fedopt_datasets_and_models_example + cd examples/federate/simulation/mpi_fedopt_datasets_and_models_example sh run_step_by_step_example.sh 4 config/mnist_lr/fedml_config.yaml - name: Test package - FedProx working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/simulation/mpi_fedprox_datasets_and_models_example + cd examples/federate/simulation/mpi_fedprox_datasets_and_models_example sh run_step_by_step_example.sh 4 config/mnist_lr/fedml_config.yaml - name: Test package - FedGAN working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/simulation/mpi_torch_fedgan_mnist_gan_example + cd examples/federate/simulation/mpi_torch_fedgan_mnist_gan_example sh run_step_by_step_example.sh 4 \ No newline at end of file diff --git a/.github/workflows/image.png b/.github/workflows/image.png new file mode 100644 index 0000000000..330e630c0a Binary files /dev/null and b/.github/workflows/image.png differ diff --git a/devops/dockerfile/github-action-runner/Dockerfile b/.github/workflows/registry-runners/Dockerfile similarity index 70% rename from devops/dockerfile/github-action-runner/Dockerfile rename to .github/workflows/registry-runners/Dockerfile index 4e6648260f..5d3168853a 100644 --- a/devops/dockerfile/github-action-runner/Dockerfile +++ 
b/.github/workflows/registry-runners/Dockerfile @@ -1,9 +1,10 @@ # base -FROM fedml/fedml:latest-torch1.13.1-cuda11.6-cudnn8-devel +ARG BASE_IMAGE=python:3.11 -# set the github runner version -ARG RUNNER_VERSION="2.304.0" +FROM ${BASE_IMAGE} +# set the github runner version +ARG RUNNER_VERSION="2.317.0" # update the base packages and add a non-sudo user #RUN apt-get update -y && apt-get upgrade -y && useradd -m docker @@ -24,18 +25,15 @@ COPY start.sh start.sh # make the script executable RUN chmod +x start.sh - -RUN cp -f /usr/bin/python /usr/bin/python-backup && ln -s /usr/bin/python3 python - -RUN pip install scikit-learn - -RUN pip install tensorflow && pip install tensorflow_datasets && pip install jax[cpu] && pip install dm-haiku && pip install optax && pip install jaxlib - # since the config and run script for actions are not allowed to be run by root, # set the user to "docker" so all subsequent commands are run as the docker user #USER docker -ENV REPO=FedML-AI/FedML ACCESS_TOKEN=1 +RUN git clone https://github.com/Qigemingziba/FedML.git +RUN cd FedML && git pull && git checkout dev/v0.7.0 && cd python && pip3 install -e ./ +ENV REPO=Qigemingziba/FedML ACCESS_TOKEN=1 # set the entrypoint to the start.sh script -CMD ./start.sh ${REPO} ${ACCESS_TOKEN} \ No newline at end of file +CMD ./start.sh ${REPO} ${ACCESS_TOKEN} + + diff --git a/.github/workflows/registry-runners/build_linux_runners.sh b/.github/workflows/registry-runners/build_linux_runners.sh new file mode 100644 index 0000000000..fb4b6e1abc --- /dev/null +++ b/.github/workflows/registry-runners/build_linux_runners.sh @@ -0,0 +1,12 @@ +tag="0.1.0" + +platform="linux/amd64" + +echo "build python:3.11" +docker build --no-cache --platform $platform --build-arg BASE_IMAGE=python:3.11 -t fedml/action_runner_3.11_linux64:$tag -f ./Dockerfile . 
+echo "build python:3.10" +docker build --no-cache --platform $platform --build-arg BASE_IMAGE=python:3.10 -t fedml/action_runner_3.10_linux64:$tag -f ./Dockerfile . +echo "build python:3.9" +docker build --no-cache --platform $platform --build-arg BASE_IMAGE=python:3.9 -t fedml/action_runner_3.9_linux64:$tag -f ./Dockerfile . +echo "build python:3.8" +docker build --no-cache --platform $platform --build-arg BASE_IMAGE=python:3.8 -t fedml/action_runner_3.8_linux64:$tag -f ./Dockerfile . diff --git a/.github/workflows/registry-runners/build_test.sh b/.github/workflows/registry-runners/build_test.sh new file mode 100755 index 0000000000..1e17dc6847 --- /dev/null +++ b/.github/workflows/registry-runners/build_test.sh @@ -0,0 +1 @@ +docker build -t fedml/action_runner_3.11_linux64:0.1 -f ./Dockerfile . diff --git a/.github/workflows/registry-runners/run_linux_runners.sh b/.github/workflows/registry-runners/run_linux_runners.sh new file mode 100644 index 0000000000..fa70388de8 --- /dev/null +++ b/.github/workflows/registry-runners/run_linux_runners.sh @@ -0,0 +1,48 @@ +REPO=$1 +ACCESS_TOKEN=$2 +API_KEY=$3 +DOCKER_PULL=false +ARCH=linux64 +TAG="0.1.0" + +if [ $# != 3 ]; then + echo "Please provide three arguments." 
+ echo "./run_linux_runners.sh [YourGitRepo] [YourGitHubRunnerToken] [API_KEY]" + exit -1 +fi + +# List of Docker container names +# containers=("fedml/action_runner_3.8_$ARCH:0.1.0" "fedml/action_runner_3.9_$ARCH:0.1.0" "fedml/action_runner_3.10_$ARCH:0.1.0" "fedml/action_runner_3.11_$ARCH:0.1.0") +containers=("action_runner_3.8_$ARCH" "action_runner_3.9_$ARCH" "action_runner_3.10_$ARCH" "action_runner_3.11_$ARCH") +python_versions=("python3.8" "python3.9" "python3.10" "python3.11") + + +# Iterate through each container +for container_index in "${!containers[@]}"; do + + container=${containers[$container_index]} + # Find the running container + if [ "$DOCKER_PULL" = "true" ]; then + echo "docker pull fedml/$container:$TAG" + docker pull fedml/$container:$TAG + fi + # docker stop `sudo docker ps |grep ${TAG}- |awk -F' ' '{print $1}'` + + running_container=$(docker ps -a | grep $container | awk -F ' ' '{print $1}') + + if [ -n "$running_container" ]; then + # Stop the running container + echo "Stopping running container: $container, $running_container" + docker stop "$running_container" + else + echo "No running container found for: $container" + fi + sleep 5 + # docker pull $container + ACT_NAME=${containers[$container_index]} + echo "docker run --rm --name $ACT_NAME --env API_KEY=$API_KEY --env REPO=$REPO --env ACCESS_TOKEN=$ACCESS_TOKEN -d fedml/${containers[$container_index]}:$TAG bash ./start.sh ${REPO} ${ACCESS_TOKEN} ${python_versions[$container_index]}" + docker run --rm --name $ACT_NAME --env API_KEY=$API_KEY --env REPO=$REPO --env ACCESS_TOKEN=$ACCESS_TOKEN -d fedml/${containers[$container_index]}:$TAG bash ./start.sh ${REPO} ${ACCESS_TOKEN} ${python_versions[$container_index]} + +done +echo "Script completed." 
+ diff --git a/devops/dockerfile/github-action-runner/start.sh b/.github/workflows/registry-runners/start.sh similarity index 76% rename from devops/dockerfile/github-action-runner/start.sh rename to .github/workflows/registry-runners/start.sh index 917d1cfe16..b65b0f1272 100644 --- a/devops/dockerfile/github-action-runner/start.sh +++ b/.github/workflows/registry-runners/start.sh @@ -2,13 +2,15 @@ ORGANIZATION=$1 ACCESS_TOKEN=$2 +PYTHON_VERSION=$3 echo $ORGANIZATION echo $ACCESS_TOKEN +echo $PYTHON_VERSION cd /home/fedml/actions-runner -RUNNER_ALLOW_RUNASROOT="1" ./config.sh --url https://github.com/${ORGANIZATION} --token ${ACCESS_TOKEN} +RUNNER_ALLOW_RUNASROOT="1" ./config.sh --url https://github.com/${ORGANIZATION} --token ${ACCESS_TOKEN} --labels self-hosted,Linux,X64,$PYTHON_VERSION cleanup() { echo "Removing runner..." diff --git a/.github/workflows/registry-runners/windows.ps1 b/.github/workflows/registry-runners/windows.ps1 new file mode 100644 index 0000000000..40f0f00b8f --- /dev/null +++ b/.github/workflows/registry-runners/windows.ps1 @@ -0,0 +1,32 @@ + +$REPO = "Qigemingziba/FedML" +$ACCESS_TOKEN = "AGMK3PY3QDYUXXXEB5LWI4DGOQIFW" +$WORKPLACE=$PWD + +Set-Location actions-runner-python38 +& conda activate python38 +./config.cmd --url https://github.com/$REPO --name windows-python38 --token $ACCESS_TOKEN --labels self-hosted,Windows,X64,python3.8 +Start-Process run.cmd start -WindowStyle Hidden + +Set-Location $WORKPLACE + +Set-Location actions-runner-python39 +& conda activate python39 +./config.cmd --url https://github.com/$REPO --name windows-python39 --token $ACCESS_TOKEN --labels self-hosted,Windows,X64,python3.9 +Start-Process run.cmd start -WindowStyle Hidden + +Set-Location $WORKPLACE + +Set-Location actions-runner-python310 +& conda activate python310 +./config.cmd --url https://github.com/$REPO --name windows-python310 --token $ACCESS_TOKEN --labels self-hosted,Windows,X64,python3.10 +Start-Process run.cmd start -WindowStyle Hidden + 
+Set-Location $WORKPLACE + +Set-Location actions-runner-python311 +& conda activate python311 +./config.cmd --url https://github.com/$REPO --name windows-python311 --token $ACCESS_TOKEN --labels self-hosted,Windows,X64,python3.11 +Start-Process run.cmd start -WindowStyle Hidden + +Set-Location $WORKPLACE \ No newline at end of file diff --git a/devops/dockerfile/github-action-runner/README.md b/devops/dockerfile/github-action-runner/README.md deleted file mode 100644 index d02e29665b..0000000000 --- a/devops/dockerfile/github-action-runner/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Run self-host runner in your machine - -## Usage - -./runner-start.sh [YourGitRepo] [YourRunnerPrefix] [YourRunnerNum] [YourGitHubRunnerToken] [LocalDevSourceDir] [LocalReleaseSourceDir] [LocalDataDir] - -For the argument YourGitHubRunnerToken, you may navigate based the following path. - -Settings -> Actions -> Runners -> New self-hosted runner. - -In the Configure section, you should find the similar line: -./config.sh --url https://github.com/FedML-AI/FedML --token AXRYPL6G2VHVGDFDQQS5XA3ELYI6M - -set YourGitHubRunnerToken to value of --token - - -## Example - -Use the following commands to run 30 runners in the FedML-AI/FedML repo and run 6 runners in the FedML-AI/Front-End-Auto-Test repo: - -./runner-start.sh FedML-AI/FedML fedml-runner 30 AXRYPLZLZN6XVJB3BAIXSP3EMFC7U /home/fedml/FedML4GitHubAction-Dev /home/fedml/FedML4GitHubAction /home/fedml/fedml_data -./runner-start.sh FedML-AI/Front-End-Auto-Test webtest-runner 6 AXRYPL57ZD35ZGDWZKRKFHLEMGLTK /home/fedml/FedML4GitHubAction-Dev /home/fedml/FedML4GitHubAction /home/fedml/fedml_data - -./runner-start.sh FedML-AI/FedML fedml-runner 30 AXRYPL6CCBH24ZVRSUEAYTTEMKD56 /home/chaoyanghe/sourcecode/FedML4GitHubAction-Dev /home/chaoyanghe/sourcecode/FedML4GitHubAction /home/chaoyanghe/fedml_data -./runner-start.sh FedML-AI/Front-End-Auto-Test webtest-runner 6 AXRYPL57ZD35ZGDWZKRKFHLEMGLTK 
/home/chaoyanghe/sourcecode/FedML4GitHubAction-Dev /home/chaoyanghe/sourcecode/FedML4GitHubAction /home/chaoyanghe/fedml_data diff --git a/devops/dockerfile/github-action-runner/build.sh b/devops/dockerfile/github-action-runner/build.sh deleted file mode 100755 index 5f6dae9615..0000000000 --- a/devops/dockerfile/github-action-runner/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -docker build -t fedml/github-action-runner:latest -f ./Dockerfile . -docker login -docker push fedml/github-action-runner:latest \ No newline at end of file diff --git a/devops/dockerfile/github-action-runner/runner-start.sh b/devops/dockerfile/github-action-runner/runner-start.sh deleted file mode 100644 index 18a0c4f958..0000000000 --- a/devops/dockerfile/github-action-runner/runner-start.sh +++ /dev/null @@ -1,23 +0,0 @@ -REPO=$1 -TAG=$2 -NUM=$3 -ACCESS_TOKEN=$4 -LOCAL_DEV_SOURCE_DIR=$5 -LOCAL_RELEASE_SOURCE_DIR=$6 -LOCAL_DATA_DIR=$7 - -if [ $# != 7 ]; then - echo "Please provide five arguments." - echo "./runner-start.sh [YourGitRepo] [YourRunnerPrefix] [YourRunnerNum] [YourGitHubRunnerToken] [LocalDevSourceDir] [LocalReleaseSourceDir] [LocalDataDir]" - exit -1 -fi - -sudo docker stop `sudo docker ps |grep ${TAG}- |awk -F' ' '{print $1}'` -sudo docker pull fedml/github-action-runner:latest - -for((i=1;i<=$NUM;i++)); -do -ACT_NAME=$TAG-$i -sudo docker rm $ACT_NAME -sudo docker run --name $ACT_NAME --env REPO=$REPO --env ACCESS_TOKEN=$ACCESS_TOKEN -v $LOCAL_DEV_SOURCE_DIR:/home/actions-runner/fedml-dev -v $LOCAL_RELEASE_SOURCE_DIR:/home/actions-runner/fedml-master -v $LOCAL_DATA_DIR:/home/fedml/fedml_data -v $LOCAL_DATA_DIR:/home/actions-runner/fedml_data -d fedml/github-action-runner:latest -done \ No newline at end of file diff --git a/devops/scripts/install-fedml.sh b/devops/scripts/install-fedml.sh new file mode 100644 index 0000000000..cafcfa3ac7 --- /dev/null +++ b/devops/scripts/install-fedml.sh @@ -0,0 +1,2 @@ +cd python +pip install -e ./ \ No newline at end of file diff --git 
a/devops/scripts/sync-fedml-pip.sh b/devops/scripts/sync-fedml-pip.sh index 0d909fff76..6b24ac52e7 100755 --- a/devops/scripts/sync-fedml-pip.sh +++ b/devops/scripts/sync-fedml-pip.sh @@ -24,7 +24,7 @@ else fi fi -mkdir -p /home/fedml/fedml_data -cp -Rf /home/fedml/fedml_data_host/* /home/fedml/fedml_data +mkdir -p ./fedml/fedml_data +cp -Rf ./fedml/fedml_data_host/* ./fedml/fedml_data exit 0 diff --git a/python/examples/federate/cross_silo/cuda_rpc_fedavg_mnist_lr_example/README.md b/python/examples/federate/cross_silo/cuda_rpc_fedavg_mnist_lr_example/README.md index c693d8d863..a1fa30b6f2 100644 --- a/python/examples/federate/cross_silo/cuda_rpc_fedavg_mnist_lr_example/README.md +++ b/python/examples/federate/cross_silo/cuda_rpc_fedavg_mnist_lr_example/README.md @@ -26,7 +26,7 @@ For info on `trpc_master_config_path` refer to `python/examples/cross_silo/cuda_ Example is provided at: -`python/examples/cross_silo/cuda_rpc_fedavg_mnist_lr_example/one_line` +`python/examples/federate/cross_silo/cuda_rpc_fedavg_mnist_lr_example/one_line` ### Training Script At the client side, the client ID (a.k.a rank) starts from 1. 
diff --git a/python/examples/launch/examples/launch/hello_world/launch_config/fedml_config.yaml b/python/examples/launch/examples/launch/hello_world/launch_config/fedml_config.yaml new file mode 100644 index 0000000000..21e1f2e33e --- /dev/null +++ b/python/examples/launch/examples/launch/hello_world/launch_config/fedml_config.yaml @@ -0,0 +1,14 @@ +containerize: false +data_args: + dataset_name: mnist + dataset_path: ./dataset + dataset_type: csv +environment_args: + bootstrap: fedml_bootstrap_generated.sh +model_args: + input_dim: '784' + model_cache_path: /Users/alexliang/fedml_models + model_name: lr + output_dim: '10' +training_params: + learning_rate: 0.004 diff --git a/python/examples/launch/hello_world/hello_world.py b/python/examples/launch/hello_world/hello_world.py index 71ffaf7c16..2f68f99055 100644 --- a/python/examples/launch/hello_world/hello_world.py +++ b/python/examples/launch/hello_world/hello_world.py @@ -1,6 +1,5 @@ import os import time - import fedml if __name__ == "__main__": diff --git a/python/examples/launch/serve_job_mnist.yaml b/python/examples/launch/serve_job_mnist.yaml index 98c1570a4f..bd8b52ca6c 100755 --- a/python/examples/launch/serve_job_mnist.yaml +++ b/python/examples/launch/serve_job_mnist.yaml @@ -35,4 +35,4 @@ computing: maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card #allow_cross_cloud_resources: true # true, false #device_type: CPU # options: GPU, CPU, hybrid - resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type \ No newline at end of file + resource_type: RTX-4090 # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type \ No newline at end of file diff --git a/python/examples/train/mnist_train/examples/train/mnist_train/launch_config/fedml_config.yaml 
b/python/examples/train/mnist_train/examples/train/mnist_train/launch_config/fedml_config.yaml new file mode 100644 index 0000000000..188c19dde6 --- /dev/null +++ b/python/examples/train/mnist_train/examples/train/mnist_train/launch_config/fedml_config.yaml @@ -0,0 +1,3 @@ +containerize: false +environment_args: + bootstrap: fedml_bootstrap_generated.sh diff --git a/python/examples/train/mnist_train/train.py b/python/examples/train/mnist_train/train.py new file mode 100644 index 0000000000..611a15c2b6 --- /dev/null +++ b/python/examples/train/mnist_train/train.py @@ -0,0 +1,98 @@ +import torch +import torch.nn as nn +import torch.optim as optim +import torchvision +import torchvision.transforms as transforms +from torch.utils.data import DataLoader +import fedml +# Set random seed for reproducibility +torch.manual_seed(42) + +# Define hyperparameters +batch_size = 64 +learning_rate = 0.001 +num_epochs = 3 + +# Prepare dataset and data loaders +transform = transforms.Compose([ + transforms.ToTensor(), # Convert image to tensor, normalize to [0, 1] + transforms.Normalize((0.5,), (0.5,)) # Normalize with mean and std deviation of 0.5 +]) + +train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True) +train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + +test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True) +test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) + +# Define a simple convolutional neural network model +class SimpleCNN(nn.Module): + def __init__(self): + super(SimpleCNN, self).__init__() + self.conv1 = nn.Conv2d(1, 16, kernel_size=5, padding=2) + self.conv2 = nn.Conv2d(16, 32, kernel_size=5, padding=2) + self.fc1 = nn.Linear(32 * 7 * 7, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = torch.relu(self.conv1(x)) + x = torch.max_pool2d(x, kernel_size=2, stride=2) + x = 
torch.relu(self.conv2(x)) + x = torch.max_pool2d(x, kernel_size=2, stride=2) + x = x.view(-1, 32 * 7 * 7) + x = torch.relu(self.fc1(x)) + x = self.fc2(x) + return x + +model = SimpleCNN() + +# Define loss function and optimizer +criterion = nn.CrossEntropyLoss() +optimizer = optim.Adam(model.parameters(), lr=learning_rate) + +# Train the model +for epoch in range(num_epochs): + + # Evaluate the model on the test set during training + model.eval() + with torch.no_grad(): + correct = 0 + total = 0 + for images, labels in test_loader: + outputs = model(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + acc = 100 * correct / total + fedml.mlops.log_metric({"epoch":epoch, "acc": acc}) + + model.train() + for images, labels in train_loader: + # Forward pass + outputs = model(images) + loss = criterion(outputs, labels) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + optimizer.step() + +# Final evaluation on the test set +model.eval() +with torch.no_grad(): + correct = 0 + total = 0 + for images, labels in test_loader: + outputs = model(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + acc = 100 * correct / total + print('Final Test Accuracy: {:.2f} %'.format(acc)) + fedml.mlops.log_metric({"epoch":num_epochs, "acc": acc}) + +fedml.mlops.log_model(f"model-file@test", "./simple_cnn.pth") +# # Save the model parameters +# torch.save(model.state_dict(), 'simple_cnn.pth') +# print('Model saved to simple_cnn.pth') diff --git a/python/examples/train/mnist_train/train.yaml b/python/examples/train/mnist_train/train.yaml new file mode 100644 index 0000000000..f9a5cc5ab5 --- /dev/null +++ b/python/examples/train/mnist_train/train.yaml @@ -0,0 +1,50 @@ +# Local directory where your source code resides. +# It should be the relative path to this job yaml file or the absolute path. 
+# If your job doesn't contain any source code, it can be empty. +workspace: . + +# Running entry commands which will be executed as the job entry point. +# If an error occurs, you should exit with a non-zero code, e.g. exit 1. +# Otherwise, you should exit with a zero code, e.g. exit 0. +# Support multiple lines, which can not be empty. +job: | + echo "current job id: $FEDML_CURRENT_RUN_ID" + echo "current edge id: $FEDML_CURRENT_EDGE_ID" + echo "Hello, Here is the launch platform." + echo "Current directory is as follows." + pwd + python3 train.py + echo "training job finished." + +# If you want to use the job created by the MLOps platform, +# just uncomment the following three, then set job_id and config_id to your desired job id and related config. +#job_args: +# job_id: 2070 +# config_id: 111 + +# If you want to create the job with specific name, just uncomment the following line and set job_name to your desired job name +#job_name: cv_job + +job_type: train # options: train, deploy, federate + +# train subtype: general_training, single_machine_training, cluster_distributed_training, cross_cloud_training +# federate subtype: cross_silo, simulation, web, smart_phone +# deploy subtype: none +job_subtype: general_training + +# containerize +containerize: false + +# Bootstrap shell commands which will be executed before running entry commands. +# Support multiple lines, which can be empty. +bootstrap: | + # pip install -r requirements.txt + echo "Bootstrap finished." 
+ +computing: + minimum_num_gpus: 1 # minimum # of GPUs to provision + maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card + #allow_cross_cloud_resources: true # true, false + #device_type: CPU # options: GPU, CPU, hybrid + resource_type: RTX-4090 # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type + diff --git a/python/fedml/__init__.py b/python/fedml/__init__.py index c96d65adc5..c2fc2e3a0f 100644 --- a/python/fedml/__init__.py +++ b/python/fedml/__init__.py @@ -1,6 +1,7 @@ import logging import platform +import multiprocess import multiprocess as multiprocessing import os import random @@ -37,7 +38,7 @@ _global_training_type = None _global_comm_backend = None -__version__ = "0.9.0" +__version__ = "0.8.51b1" # This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release @@ -460,6 +461,26 @@ def _init_multiprocessing(): multiprocessing.set_start_method("fork", force=True) +def get_multiprocessing_context(): + if platform.system() == "Windows": + return multiprocessing.get_context("spawn") + else: + return multiprocessing.get_context("fork") + + +def get_process(target=None, args=None): + if platform.system() == "Windows": + return multiprocessing.Process(target=target, args=args) + else: + #return multiprocessing.Process(target=target, args=args) + #multiprocessing.set_start_method("spawn", force=True) + #return multiprocess.context.SpawnContext.Process(target=target, args=args) + #multiprocessing.Manager().current_process().authkey = str.encode("abc") + new_process = multiprocessing.get_context("fork").Process(target=target, args=args) + #new_process.authkey = str.encode("abc") + return new_process + + def set_env_version(version): set_env_kv("FEDML_ENV_VERSION", version) load_env() diff --git a/python/fedml/api/__init__.py b/python/fedml/api/__init__.py index 
b03c72b675..ff2b0c7307 100755 --- a/python/fedml/api/__init__.py +++ b/python/fedml/api/__init__.py @@ -278,6 +278,9 @@ def model_deploy(name, endpoint_name, endpoint_id, local, master_ids, worker_ids def model_run(endpoint_id, json_string): model_module.run(endpoint_id, json_string) +def get_endpoint(endpoint_id): + return model_module.get_endpoint(endpoint_id) + def endpoint_delete(endpoint_id): model_module.delete_endpoint(endpoint_id) diff --git a/python/fedml/api/modules/model.py b/python/fedml/api/modules/model.py index a02e674f47..93892fc5d1 100644 --- a/python/fedml/api/modules/model.py +++ b/python/fedml/api/modules/model.py @@ -320,6 +320,19 @@ def run(endpoint_id: str, json_string: str) -> bool: click.echo("Failed to run model.") return False +def get_endpoint(endpoint_id: str): + api_key = get_api_key() + if api_key == "": + click.echo(''' + Please use one of the ways below to login first: + (1) CLI: `fedml login $api_key` + (2) API: fedml.api.fedml_login(api_key=$api_key) + ''') + return False + + endpoint_detail_result = FedMLModelCards.get_instance().query_endpoint_detail_api(user_api_key=api_key, + endpoint_id=endpoint_id) + return endpoint_detail_result def delete_endpoint(endpoint_id: str) -> bool: api_key = get_api_key() diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py index b8237d93ba..50ca315a10 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py +++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py @@ -167,7 +167,7 @@ def autoscaler_reconcile_after_interval(self): # Get cached token for authorization of autoscale request cached_token = fedml_model_cache.get_end_point_token(e_id, e_name, model_name) if cached_token is None: - logging.error(f"Failed to get the cached token for endpoint {e_id}.") + # logging.error(f"Failed to get the cached token for endpoint {e_id}.") continue req_header = { @@ -229,7 +229,7 @@ def 
monitor_replicas_number(): cached_token = FedMLModelCache.get_instance().get_end_point_token_with_eid(endpoint_id) if cached_token is None: - logging.error(f"Failed to get the cached token for endpoint {endpoint_id}.") + # logging.error(f"Failed to get the cached token for endpoint {endpoint_id}.") return req_header = { @@ -339,6 +339,10 @@ def monitor_replicas_perf(edge_id, mqtt_mgr=None): def monitor_slave_run_process_status(self): try: count = 0 + try: + client_data_interface.FedMLClientDataInterface.get_instance().create_job_table() + except Exception as e: + pass job_list = client_data_interface.FedMLClientDataInterface.get_instance().get_jobs_from_db() for job in job_list.job_list: count += 1 @@ -448,6 +452,10 @@ def monitor_master_run_process_status(self, server_id, device_info_reporter=None try: ComputeCacheManager.get_instance().set_redis_params() count = 0 + try: + server_data_interface.FedMLServerDataInterface.get_instance().create_job_table() + except Exception as e: + pass job_list = server_data_interface.FedMLServerDataInterface.get_instance().get_jobs_from_db() for job in job_list.job_list: count += 1 diff --git a/python/fedml/computing/scheduler/comm_utils/run_process_utils.py b/python/fedml/computing/scheduler/comm_utils/run_process_utils.py index 05cc342e36..6dd575f307 100644 --- a/python/fedml/computing/scheduler/comm_utils/run_process_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/run_process_utils.py @@ -135,13 +135,15 @@ def save_run_process(run_id, process_id, data_dir, info_dir, pass @staticmethod - def kill_process(process_id): + def kill_process(process_id, exclude_current_pid=False): try: process = psutil.Process(process_id) if process is None: return child_processes = process.children(recursive=True) for sub_process in child_processes: + if exclude_current_pid and sub_process.pid == os.getpid(): + continue if platform.system() == 'Windows': os.system("taskkill /PID {} /T /F".format(sub_process.pid)) else: diff --git 
a/python/fedml/computing/scheduler/comm_utils/sys_utils.py b/python/fedml/computing/scheduler/comm_utils/sys_utils.py index aaa37bc4db..065482c23b 100644 --- a/python/fedml/computing/scheduler/comm_utils/sys_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/sys_utils.py @@ -114,6 +114,8 @@ def get_sys_runner_info(): except: pass + enable_simulation_gpu, simulation_gpu_count = get_simulation_gpu_env() + if enable_simulation_gpu: gpu_count = simulation_gpu_count gpu_total_mem = "80G" @@ -128,9 +130,26 @@ def get_sys_runner_info(): gpu_count, gpu_vendor, cpu_count, gpu_device_name +def get_simulation_gpu_env(): + _enable_simulation_gpu = enable_simulation_gpu + _simulation_gpu_count = simulation_gpu_count + + env_enable_simulation_gpu = os.getenv("FEDML_ENABLE_SIMULATION_GPU", None) + if env_enable_simulation_gpu is not None: + _enable_simulation_gpu = True if env_enable_simulation_gpu == "1" or env_enable_simulation_gpu == 1 else False + + env_simulation_gpu_count = os.getenv("FEDML_SIMULATION_GPU_COUNT", None) + if env_simulation_gpu_count is not None: + _simulation_gpu_count = int(env_simulation_gpu_count) + + return _enable_simulation_gpu, _simulation_gpu_count + + # GPU list: [GPU(ID, uuid, load, memoryTotal, memoryUsed, memoryFree, driver, # gpu_name, serial, display_mode, display_active, temperature)] def get_gpu_list(): + enable_simulation_gpu, simulation_gpu_count = get_simulation_gpu_env() + if enable_simulation_gpu: ret_gpu_list = [ {'ID': 0, 'uuid': 'GPU-dab987f0-be09-294a-96d6-f9afeef49877', 'load': 1.0, @@ -184,6 +203,8 @@ def get_gpu_list(): def get_available_gpu_id_list(limit=1) -> List[int]: + enable_simulation_gpu, simulation_gpu_count = get_simulation_gpu_env() + if enable_simulation_gpu: available_gpu_ids = [0, 1, 2, 3, 4, 5, 6, 7] if simulation_gpu_count > 8: diff --git a/python/fedml/computing/scheduler/master/base_master_agent.py b/python/fedml/computing/scheduler/master/base_master_agent.py index 3aff523c24..30cf5da1c9 100755 --- 
a/python/fedml/computing/scheduler/master/base_master_agent.py +++ b/python/fedml/computing/scheduler/master/base_master_agent.py @@ -23,7 +23,9 @@ def __init__(self): def login( self, user_id, api_key=None, device_id=None, - os_name=None, role=None, runner_cmd=None + os_name=None, role=None, runner_cmd=None, + communication_manager=None, sender_message_queue=None, + status_center_queue=None, sender_message_event=None ): # Login account login_result = FedMLAccountManager.get_instance().login( @@ -48,20 +50,31 @@ def login( # Initialize the protocol manager # noinspection PyBoardException try: - self._initialize_protocol_manager() + self._initialize_protocol_manager( + communication_manager=communication_manager, + sender_message_queue=sender_message_queue, + status_center_queue=status_center_queue, + sender_message_event=sender_message_event) except Exception as e: FedMLAccountManager.write_login_failed_file(is_client=False) self.protocol_mgr.stop() raise e # Start the protocol manager to process the messages from MLOps and slave agents. 
- self.protocol_mgr.start() + if communication_manager is None: + self.protocol_mgr.start() + + return login_result @staticmethod def logout(): GeneralConstants.cleanup_run_process(None, is_master=True) sys_utils.cleanup_all_fedml_server_api_processes() + def stop(self, kill_process=False): + if self.protocol_mgr is not None: + self.protocol_mgr.stop(kill_process=kill_process) + def _create_protocol_manager(self, role, login_result): if self.protocol_mgr is not None: return @@ -69,7 +82,11 @@ def _create_protocol_manager(self, role, login_result): login_result, agent_config=login_result.agent_config) self.protocol_mgr.run_as_edge_server_and_agent = True \ if role == FedMLAccountManager.ROLE_EDGE_SERVER else False - self.protocol_mgr.run_as_cloud_agent = True if role == FedMLAccountManager.ROLE_CLOUD_AGENT else False + self.protocol_mgr.run_as_cloud_agent = True \ + if role == FedMLAccountManager.ROLE_CLOUD_AGENT or role == FedMLAccountManager.ROLE_GPU_MASTER_SERVER \ + else False + self.use_local_process_as_cloud_server = True \ + if role == FedMLAccountManager.ROLE_GPU_MASTER_SERVER else self.use_local_process_as_cloud_server self.protocol_mgr.run_as_cloud_server = True if role == FedMLAccountManager.ROLE_CLOUD_SERVER else False self.protocol_mgr.args = login_result self.protocol_mgr.edge_id = login_result.edge_id @@ -79,12 +96,20 @@ def _create_protocol_manager(self, role, login_result): self.protocol_mgr.enable_simulation_cloud_agent = self.enable_simulation_cloud_agent self.protocol_mgr.use_local_process_as_cloud_server = self.use_local_process_as_cloud_server - def _initialize_protocol_manager(self): + def _initialize_protocol_manager( + self, communication_manager=None, sender_message_queue=None, + status_center_queue=None, sender_message_event=None + ): # Init local database self._init_database() # Initialize the master protocol - self.protocol_mgr.initialize() + self.protocol_mgr.set_parent_agent(self) + self.protocol_mgr.initialize( + 
communication_manager=communication_manager, + sender_message_queue=sender_message_queue, + status_center_queue=status_center_queue, + sender_message_event=sender_message_event) # Report the IDLE status to MLOps self.mlops_metrics.report_server_training_status( @@ -109,6 +134,9 @@ def _init_logs(self, agent_args, edge_id): in_args.server_agent_id = edge_id MLOpsRuntimeLog.get_instance(in_args).init_logs() + def get_protocol_manager(self): + return self.protocol_mgr + @abstractmethod def _get_log_file_dir(self): pass @@ -124,3 +152,17 @@ def _init_database(self): @abstractmethod def _generate_protocol_manager_instance(self, args, agent_config=None): return None + + def start_master_server_instance(self, payload): + self.protocol_mgr.start_master_server_instance(payload) + + def generate_agent_instance(self): + return FedMLBaseMasterAgent() + + def process_job_complete_status(self, run_id, topic, payload): + if self.protocol_mgr is None: + return + if topic in self.protocol_mgr.get_subscribed_topics(): + message_handler = self.protocol_mgr.get_listener_handler(topic) + if message_handler is not None: + message_handler(topic, payload) diff --git a/python/fedml/computing/scheduler/master/base_master_job_runner.py b/python/fedml/computing/scheduler/master/base_master_job_runner.py index 9ebab258bb..fdfff143aa 100755 --- a/python/fedml/computing/scheduler/master/base_master_job_runner.py +++ b/python/fedml/computing/scheduler/master/base_master_job_runner.py @@ -1,4 +1,3 @@ - import json import logging import multiprocessing @@ -7,6 +6,9 @@ import os import time import traceback + +import setproctitle + from ..scheduler_entry.constants import Constants from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog from ..master.server_constants import ServerConstants @@ -19,7 +21,6 @@ from fedml.utils.debugging import debug from ..scheduler_core.status_center import JobStatus from ..scheduler_core.compute_cache_manager import ComputeCacheManager -from multiprocessing 
import Process, Queue from ..scheduler_core.general_constants import GeneralConstants from ..scheduler_core.scheduler_base_job_runner import FedMLSchedulerBaseJobRunner, RunnerError, RunnerCompletedError from abc import ABC, abstractmethod @@ -43,13 +44,13 @@ def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id is_master_runner=True ) - self.run_edge_id_status_queue = Queue() - self.run_metrics_queue = Queue() - self.run_events_queue = Queue() - self.run_artifacts_queue = Queue() - self.run_logs_queue = Queue() - self.run_edge_device_info_queue = Queue() - self.run_edge_device_info_global_queue = Queue() + self.run_edge_id_status_queue = multiprocessing.Manager().Queue() + self.run_metrics_queue = multiprocessing.Manager().Queue() + self.run_events_queue = multiprocessing.Manager().Queue() + self.run_artifacts_queue = multiprocessing.Manager().Queue() + self.run_logs_queue = multiprocessing.Manager().Queue() + self.run_edge_device_info_queue = multiprocessing.Manager().Queue() + self.run_edge_device_info_global_queue = multiprocessing.Manager().Queue() self.run_extend_queue_list = None self.async_check_timeout = 0 self.enable_async_cluster = False @@ -68,9 +69,12 @@ def run( edge_device_info_queue=None, run_metrics_queue=None, run_event_queue=None, run_artifacts_queue=None, run_logs_queue=None, edge_device_info_global_queue=None, run_extend_queue_list=None, sender_message_center_queue=None, listener_message_queue=None, - status_center_queue=None + status_center_queue=None, process_name=None ): - print(f"Master job runner process id {os.getpid()}, run id {self.run_id}") + if process_name is not None: + setproctitle.setproctitle(process_name) + + print(f"Master job runner process id {os.getpid()}, name {process_name}, run id {self.run_id}") if platform.system() != "Windows": os.setsid() @@ -169,7 +173,8 @@ def run_impl( run_id, self.request_json, edge_id=self.edge_id, is_server_job=True, sender_message_queue=sender_message_queue, 
listener_message_queue=listener_message_queue, - status_center_queue=status_center_queue + status_center_queue=status_center_queue, + process_name=GeneralConstants.get_launch_master_user_process_name(run_id, self.edge_id) ) # Check if the run status is normal @@ -231,9 +236,12 @@ def run_server_job( edge_device_info_queue=None, run_metrics_queue=None, run_event_queue=None, run_artifacts_queue=None, run_logs_queue=None, edge_device_info_global_queue=None, run_extend_queue_list=None, sender_message_center_queue=None, listener_message_queue=None, - status_center_queue=None + status_center_queue=None, process_name=None ): - print(f"Server runner process id {os.getpid()}, run id {self.run_id}") + if process_name is not None: + setproctitle.setproctitle(process_name) + + print(f"Server runner process id {os.getpid()}, name {process_name}. run id {self.run_id}") if platform.system() != "Windows": os.setsid() @@ -405,9 +413,9 @@ def _generate_job_runner_instance(self, args, run_id=None, request_json=None, ag return None def start_runner_process( - self, run_id, request_json, edge_id=None, is_server_job=False, - sender_message_queue=None, listener_message_queue=None, - status_center_queue=None, + self, run_id, request_json, edge_id=None, is_server_job=False, + sender_message_queue=None, listener_message_queue=None, + status_center_queue=None, process_name=None ): server_runner = self._generate_job_runner_instance( self.args, run_id=run_id, request_json=request_json, @@ -425,14 +433,26 @@ def start_runner_process( server_runner.edge_id_status_queue = self.run_edge_id_status_queue server_runner.edge_device_info_queue = self.run_edge_device_info_queue self.run_extend_queue_list = self._generate_extend_queue_list() - self.run_process = Process( - target=server_runner.run if not is_server_job else server_runner.run_server_job, args=( - self.run_process_event, self.run_process_completed_event, self.run_edge_id_status_queue, - self.run_edge_device_info_queue, 
self.run_metrics_queue, self.run_events_queue, - self.run_artifacts_queue, self.run_logs_queue, self.run_edge_device_info_global_queue, - self.run_extend_queue_list, sender_message_queue, listener_message_queue, status_center_queue + if platform.system() == "Windows": + self.run_process = multiprocessing.Process( + target=server_runner.run if not is_server_job else server_runner.run_server_job, args=( + self.run_process_event, self.run_process_completed_event, self.run_edge_id_status_queue, + self.run_edge_device_info_queue, self.run_metrics_queue, self.run_events_queue, + self.run_artifacts_queue, self.run_logs_queue, self.run_edge_device_info_global_queue, + self.run_extend_queue_list, sender_message_queue, listener_message_queue, status_center_queue, + process_name, + ) + ) + else: + self.run_process = fedml.get_process( + target=server_runner.run if not is_server_job else server_runner.run_server_job, args=( + self.run_process_event, self.run_process_completed_event, self.run_edge_id_status_queue, + self.run_edge_device_info_queue, self.run_metrics_queue, self.run_events_queue, + self.run_artifacts_queue, self.run_logs_queue, self.run_edge_device_info_global_queue, + self.run_extend_queue_list, sender_message_queue, listener_message_queue, status_center_queue, + process_name, + ) ) - ) self.run_process.start() ServerConstants.save_run_process(run_id, self.run_process.pid) return self.run_process @@ -444,7 +464,7 @@ def put_run_edge_device_info_to_queue(self, run_id, edge_id, device_info): if int(edge_id) in edge_ids or str(edge_id) in edge_ids: run_id_str = str(run_id) if self.run_edge_device_info_queue is None: - self.run_edge_device_info_queue = Queue() + self.run_edge_device_info_queue = multiprocessing.Manager().Queue() self.run_edge_device_info_queue.put(device_info) def should_continue_run_job(self, run_id): @@ -572,7 +592,7 @@ def callback_run_logs(self, topic, payload): run_id = str(topic).split('/')[-1] run_id_str = str(run_id) if self.run_logs_queue 
is None: - self.run_logs_queue = Queue() + self.run_logs_queue = multiprocessing.Manager().Queue() self.run_logs_queue.put(payload) def callback_run_metrics(self, topic, payload): @@ -580,7 +600,7 @@ def callback_run_metrics(self, topic, payload): run_id = str(topic).split('/')[-1] run_id_str = str(run_id) if self.run_metrics_queue is None: - self.run_metrics_queue = Queue() + self.run_metrics_queue = multiprocessing.Manager().Queue() self.run_metrics_queue.put(payload) # def send_training_request_to_edges(self, active_edge_info_dict): @@ -710,6 +730,3 @@ def should_process_async_cluster(self): def get_client_id_list(self, server_edge_id_list): return server_edge_id_list - - - diff --git a/python/fedml/computing/scheduler/master/base_master_job_runner_manager.py b/python/fedml/computing/scheduler/master/base_master_job_runner_manager.py index 6831c9d034..39f7438696 100755 --- a/python/fedml/computing/scheduler/master/base_master_job_runner_manager.py +++ b/python/fedml/computing/scheduler/master/base_master_job_runner_manager.py @@ -1,27 +1,39 @@ import base64 import json import logging +import multiprocessing +import platform import time from abc import ABC from multiprocessing import Process + +import fedml from .cloud_server_manager import FedMLCloudServerManager +from ..comm_utils.run_process_utils import RunProcessUtils from ..scheduler_core.scheduler_base_job_runner_manager import FedMLSchedulerBaseJobRunnerManager +from ..scheduler_core.account_manager import FedMLAccountManager class FedMLBaseMasterJobRunnerManager(FedMLSchedulerBaseJobRunnerManager, ABC): def __init__(self): FedMLSchedulerBaseJobRunnerManager.__init__(self) + if not hasattr(self, "master_agent_instance_map"): + self.master_agent_instance_map = dict() # Override def start_job_runner( self, run_id, request_json, args=None, edge_id=None, is_server_job=False, sender_message_queue=None, listener_message_queue=None, status_center_queue=None, - should_start_cloud_server=False, 
use_local_process_as_cloud_server=False, - cuda_visible_gpu_ids_str=None + communication_manager=None, master_agent_instance=None, should_start_cloud_server=False, + use_local_process_as_cloud_server=False, cuda_visible_gpu_ids_str=None, process_name=None ): if should_start_cloud_server: - self._start_cloud_server(args, run_id, request_json, edge_id=edge_id, - use_local_process_as_cloud_server=use_local_process_as_cloud_server) + self._start_cloud_server( + args, run_id, request_json, edge_id=edge_id, + use_local_process_as_cloud_server=use_local_process_as_cloud_server, + sender_message_queue=sender_message_queue, listener_message_queue=listener_message_queue, + status_center_queue=status_center_queue, communication_manager=communication_manager, + master_agent_instance=master_agent_instance, process_name=process_name) return run_id_str = str(run_id) @@ -33,34 +45,58 @@ def start_job_runner( run_id, request_json, edge_id=edge_id, is_server_job=is_server_job, sender_message_queue=sender_message_queue, listener_message_queue=listener_message_queue, - status_center_queue=status_center_queue + status_center_queue=status_center_queue, + process_name=process_name ) def stop_job_runner( self, run_id, args=None, server_id=None, request_json=None, - run_as_cloud_agent=False, run_as_cloud_server=False + run_as_cloud_agent=False, run_as_cloud_server=False, + use_local_process_as_cloud_server=False ): super().stop_job_runner(run_id) if run_as_cloud_agent or run_as_cloud_server: - stopping_process = Process( - target=FedMLCloudServerManager.stop_cloud_server, args=(run_id, server_id, args.agent_config)) - stopping_process.start() + if not use_local_process_as_cloud_server: + stopping_process = Process( + target=FedMLCloudServerManager.stop_cloud_server, + args=(run_id, server_id, args.agent_config)) + stopping_process.start() + + run_id_str = str(run_id) + if self.master_agent_instance_map.get(run_id_str, None) is not None: + 
self.master_agent_instance_map.get(run_id_str).stop(kill_process=True) + self.master_agent_instance_map.pop(run_id_str) + + if use_local_process_as_cloud_server: + time.sleep(1) + RunProcessUtils.kill_process(self.cloud_run_process_map[run_id_str].pid) def complete_job_runner( self, run_id, args=None, server_id=None, request_json=None, - run_as_cloud_agent=False, run_as_cloud_server=False + run_as_cloud_agent=False, run_as_cloud_server=False, + use_local_process_as_cloud_server=False ): super().complete_job_runner(run_id) if run_as_cloud_agent or run_as_cloud_server: - stopping_process = Process( - target=FedMLCloudServerManager.stop_cloud_server, args=(run_id, server_id, args.agent_config)) - stopping_process.start() + if not use_local_process_as_cloud_server: + stopping_process = Process( + target=FedMLCloudServerManager.stop_cloud_server, + args=(run_id, server_id, args.agent_config)) + stopping_process.start() + + run_id_str = str(run_id) + if self.master_agent_instance_map.get(run_id_str, None) is not None: + self.master_agent_instance_map.get(run_id_str).stop(kill_process=True) + self.master_agent_instance_map.pop(run_id_str) def _start_cloud_server( self, args, run_id, request_json, edge_id=None, - use_local_process_as_cloud_server=False + use_local_process_as_cloud_server=False, + sender_message_queue=None, listener_message_queue=None, + status_center_queue=None, communication_manager=None, + master_agent_instance=None, process_name=None ): run_id_str = str(run_id) cloud_server_mgr = FedMLCloudServerManager( @@ -71,19 +107,49 @@ def _start_cloud_server( self.cloud_run_process_map[run_id_str] = Process(target=cloud_server_mgr.start_cloud_server_process_entry) self.cloud_run_process_map[run_id_str].start() else: + cloud_device_id = request_json.get("cloudServerDeviceId", "0") + server_id = request_json.get("server_id", 0) message_bytes = json.dumps(request_json).encode("ascii") base64_bytes = base64.b64encode(message_bytes) - runner_cmd_encoded = 
base64_bytes.decode("ascii") - cloud_device_id = request_json.get("cloudServerDeviceId", "0") + payload = base64_bytes.decode("ascii") + self.master_agent_instance_map[str(run_id)] = master_agent_instance - logging.info("runner_cmd_encoded: {}".format(runner_cmd_encoded)) + logging.info("start the master server: {}".format(payload)) + + if platform.system() == "Windows": + self.run_process = multiprocessing.Process( + target=cloud_server_mgr.start_local_master_server, + args=(args.account_id, args.api_key, args.os_name, args.version, + cloud_device_id, run_id, payload, + communication_manager, sender_message_queue, + status_center_queue, master_agent_instance, process_name)) + else: + self.cloud_run_process_map[run_id_str] = fedml.get_process( + target=cloud_server_mgr.start_local_master_server, + args=(args.account_id, args.api_key, args.os_name, args.version, + cloud_device_id, run_id, payload, + communication_manager, sender_message_queue, + status_center_queue, master_agent_instance, process_name)) - self.cloud_run_process_map[run_id_str] = Process( - target=cloud_server_mgr.start_local_cloud_server, - args=(args.account_id, args.version, cloud_device_id, runner_cmd_encoded)) self.cloud_run_process_map[run_id_str].start() time.sleep(1) + def start_local_master_server( + self, user, api_key, os_name, version, cloud_device_id, run_id, payload, + communication_manager=None, sender_message_queue=None, status_center_queue=None, + master_agent_instance=None + ): + if master_agent_instance is None: + return + master_agent_instance.login( + user, api_key=api_key, device_id=cloud_device_id, os_name=os_name, + role=FedMLAccountManager.ROLE_CLOUD_SERVER, + communication_manager=None, + sender_message_queue=None, + status_center_queue=None) + self.master_agent_instance_map[str(run_id)] = master_agent_instance + master_agent_instance.start_master_server_instance(payload) + def callback_run_logs(self, run_id, topic, payload): run_id_str = str(run_id) if 
self.job_runners.get(run_id_str, None) is not None: @@ -93,3 +159,12 @@ def callback_run_metrics(self, run_id, topic, payload): run_id_str = str(run_id) if self.job_runners.get(run_id_str, None) is not None: self.job_runners[run_id_str].callback_run_metrics(topic, payload) + + def callback_proxy_unknown_messages(self, run_id, topic, payload): + run_id_str = str(run_id) + master_agent = self.master_agent_instance_map.get(run_id_str, None) + if master_agent is None: + return + master_agent.process_job_complete_status(run_id, topic, payload) + + diff --git a/python/fedml/computing/scheduler/master/base_master_protocol_manager.py b/python/fedml/computing/scheduler/master/base_master_protocol_manager.py index 1c4cbba4f4..05529f8c8e 100755 --- a/python/fedml/computing/scheduler/master/base_master_protocol_manager.py +++ b/python/fedml/computing/scheduler/master/base_master_protocol_manager.py @@ -2,6 +2,8 @@ import base64 import json import logging +import time + import fedml from ..comm_utils.constants import SchedulerConstants from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog @@ -141,6 +143,7 @@ def on_agent_communication_connected(self, mqtt_client_object): def callback_start_train(self, topic=None, payload=None): # Fetch config from MLOps # noinspection PyBroadException + try: MLOpsConfigs.fetch_all_configs() except Exception: @@ -197,7 +200,7 @@ def callback_start_train(self, topic=None, payload=None): self.run_edge_ids[run_id_str] = edge_id_list # report server running status to master agent - if not self.run_as_cloud_server: + if not self.run_as_cloud_server and not self.run_as_cloud_agent: self.mlops_metrics.report_server_id_status( run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_STARTING, edge_id=self.edge_id, server_id=self.edge_id, server_agent_id=self.edge_id, running_json=payload) @@ -212,7 +215,9 @@ def callback_start_train(self, topic=None, payload=None): run_id, request_json, args=self.args, edge_id=self.edge_id, 
sender_message_queue=self.message_center.get_sender_message_queue(), listener_message_queue=self.get_listener_message_queue(), - status_center_queue=self.get_status_queue() + status_center_queue=self.get_status_queue(), + communication_manager=self.get_listener_communication_manager(), + process_name=GeneralConstants.get_launch_master_job_process_name(run_id, self.edge_id) ) process = self._get_job_runner_manager().get_runner_process(run_id) @@ -223,12 +228,17 @@ def callback_start_train(self, topic=None, payload=None): elif self.run_as_cloud_agent: self.init_job_task(request_json) + server_id = request_json.get("server_id", self.edge_id) self._get_job_runner_manager().start_job_runner( run_id, request_json, args=self.args, edge_id=self.edge_id, sender_message_queue=self.message_center.get_sender_message_queue(), listener_message_queue=self.get_listener_message_queue(), - status_center_queue=self.get_status_queue(), should_start_cloud_server=True, - use_local_process_as_cloud_server=self.use_local_process_as_cloud_server + status_center_queue=self.get_status_queue(), + communication_manager=self.get_listener_communication_manager(), + master_agent_instance=self.generate_agent_instance(), + should_start_cloud_server=True, + use_local_process_as_cloud_server=self.use_local_process_as_cloud_server, + process_name=GeneralConstants.get_launch_master_job_process_name(run_id, server_id) ) process = self._get_job_runner_manager().get_runner_process(run_id, is_cloud_server=True) @@ -237,6 +247,7 @@ def callback_start_train(self, topic=None, payload=None): elif self.run_as_cloud_server: self.server_agent_id = request_json.get("cloud_agent_id", self.edge_id) self.start_request_json = json.dumps(request_json) + server_id = request_json.get("server_id", self.edge_id) run_id = request_json["runId"] run_id_str = str(run_id) @@ -248,10 +259,12 @@ def callback_start_train(self, topic=None, payload=None): run_id, request_json, args=self.args, edge_id=self.edge_id, 
sender_message_queue=self.message_center.get_sender_message_queue(), listener_message_queue=self.get_listener_message_queue(), - status_center_queue=self.get_status_queue() + status_center_queue=self.get_status_queue(), + communication_manager=self.get_listener_communication_manager(), + process_name=GeneralConstants.get_launch_master_job_process_name(run_id, server_id) ) - self.send_status_msg_to_edges(edge_id_list, run_id, self.edge_id) + self.send_status_msg_to_edges(edge_id_list, run_id, server_id) def callback_stop_train(self, topic, payload, use_payload=None): # Print the payload @@ -279,6 +292,16 @@ def callback_stop_train(self, topic, payload, use_payload=None): server_agent_id = self.edge_id topic_stop_train_to_cloud_server = f"mlops/flserver_agent_{server_id}/stop_train" self.message_center.send_message(topic_stop_train_to_cloud_server, payload) + + time.sleep(2) + MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, server_id) + self._get_job_runner_manager().stop_job_runner( + run_id, args=self.args, server_id=server_id, request_json=None, + run_as_cloud_agent=self.run_as_cloud_agent, run_as_cloud_server=self.run_as_cloud_server, + use_local_process_as_cloud_server=self.use_local_process_as_cloud_server) + self.generate_status_report(run_id, server_id, server_agent_id=server_agent_id). 
\ + report_server_id_status(run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_KILLED, + edge_id=server_id, server_id=server_id) return # Reset all edge status and server status @@ -304,7 +327,11 @@ def callback_complete_job(self, topic, payload): self._process_job_complete_status(run_id, server_id, request_json) def _process_job_complete_status(self, run_id, server_id, complete_payload): - pass + # Complete the job runner + self._get_job_runner_manager().complete_job_runner( + run_id, args=self.args, server_id=server_id, request_json=complete_payload, + run_as_cloud_agent=self.run_as_cloud_agent, run_as_cloud_server=self.run_as_cloud_server, + use_local_process_as_cloud_server=self.use_local_process_as_cloud_server) def callback_run_logs(self, topic, payload): run_id = str(topic).split('/')[-1] @@ -390,6 +417,12 @@ def callback_request_job_status(self, topic, payload): def callback_request_device_status_in_job(self, topic, payload): self.response_device_status_in_job(topic, payload) + def callback_proxy_unknown_messages(self, run_id, topic, payload): + self._get_job_runner_manager().callback_proxy_unknown_messages(run_id, topic, payload) + + def process_extra_queues(self, extra_queues): + self.rebuild_status_center(extra_queues[0]) + def generate_protocol_manager(self): message_status_runner = self._generate_protocol_manager_instance( self.args, agent_config=self.agent_config @@ -476,6 +509,8 @@ def init_job_task(self, request_json): self.setup_listener_for_run_logs(run_id) def setup_listeners_for_edge_status(self, run_id, edge_ids, server_id): + if self.run_as_cloud_agent: + return edge_status_topic = "fl_client/flclient_agent_" + str(server_id) + "/status" payload = {"run_id": run_id, "init_all_edge_id_list": edge_ids, "init_server_id": server_id} self.callback_edge_status(edge_status_topic, json.dumps(payload)) @@ -486,6 +521,9 @@ def setup_listeners_for_edge_status(self, run_id, edge_ids, server_id): self.subscribe_msg(edge_status_topic) def 
remove_listeners_for_edge_status(self, edge_ids=None): + if self.run_as_cloud_agent: + return + if edge_ids is None: edge_ids = self.request_json["edgeids"] @@ -542,7 +580,7 @@ def send_status_check_msg(self, run_id, edge_id, server_id, context=None): def send_status_msg_to_edges(self, edge_id_list, run_id, server_id, context=None): # Send status message to all edges for edge_id in edge_id_list: - self.send_status_check_msg(run_id, edge_id, self.edge_id, context=context) + self.send_status_check_msg(run_id, edge_id, server_id, context=context) def report_exception_status(self, run_id): self.mlops_metrics.report_job_status(run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION) @@ -554,3 +592,9 @@ def get_start_train_topic_with_edge_id(edge_id): @abstractmethod def _generate_protocol_manager_instance(self, args, agent_config=None): return None + + def start_master_server_instance(self, payload): + super().on_agent_communication_connected(None) + + self.receive_message_json(self.topic_start_train, payload) + diff --git a/python/fedml/computing/scheduler/master/cloud_server_manager.py b/python/fedml/computing/scheduler/master/cloud_server_manager.py index 040a0f38a3..3669cb32bc 100755 --- a/python/fedml/computing/scheduler/master/cloud_server_manager.py +++ b/python/fedml/computing/scheduler/master/cloud_server_manager.py @@ -2,10 +2,14 @@ import json import logging import os +import platform import traceback +import setproctitle + import fedml from fedml.computing.scheduler.comm_utils.sys_utils import get_python_program +from fedml.computing.scheduler.scheduler_core.account_manager import FedMLAccountManager class FedMLCloudServerManager: @@ -31,14 +35,37 @@ def __init__(self, args, run_id=None, edge_id=None, request_json=None, agent_con self.cloud_server_name = None @staticmethod - def start_local_cloud_server(user, version, cloud_device_id, runner_cmd_encoded): + def start_local_cloud_server(user, api_key, os_name, version, cloud_device_id, runner_cmd_encoded): 
+ if platform.system() != "Windows": + os.setsid() + print(f"start cloud server, device id {cloud_device_id}, runner cmd {runner_cmd_encoded}") pip_source_dir = os.path.dirname(__file__) login_cmd = os.path.join(pip_source_dir, "server_login.py") run_cmd = f"{get_python_program()} -W ignore {login_cmd} -t login -r cloud_server -u {str(user)} " \ - f"-v {version} -id {cloud_device_id} -rc {runner_cmd_encoded}" + f"-k {api_key} -v {version} -id {cloud_device_id} -rc {runner_cmd_encoded}" os.system(run_cmd) + def start_local_master_server( + self, user, api_key, os_name, version, cloud_device_id, run_id, payload, + communication_manager=None, sender_message_queue=None, status_center_queue=None, + master_agent_instance=None, process_name=None + ): + if process_name is not None: + setproctitle.setproctitle(process_name) + + logging.info(f"Local master server pid: {os.getpid()}") + if platform.system() != "Windows": + os.setsid() + + master_agent_instance.login( + user, api_key=api_key, device_id=cloud_device_id, os_name=os_name, + role=FedMLAccountManager.ROLE_CLOUD_SERVER, runner_cmd=payload, + communication_manager=None, sender_message_queue=None, + status_center_queue=None) + + master_agent_instance.stop() + def start_cloud_server_process_entry(self): try: self.start_cloud_server_process() diff --git a/python/fedml/computing/scheduler/master/master_protocol_manager.py b/python/fedml/computing/scheduler/master/master_protocol_manager.py index ca9621e41d..1adda439c6 100755 --- a/python/fedml/computing/scheduler/master/master_protocol_manager.py +++ b/python/fedml/computing/scheduler/master/master_protocol_manager.py @@ -7,8 +7,9 @@ class FedMLLaunchMasterProtocolManager(FedMLBaseMasterProtocolManager, ABC): def __init__(self, args, agent_config=None): FedMLBaseMasterProtocolManager.__init__(self, args, agent_config=agent_config) + self.message_center_name = "launch_master_agent" - # Override + # Override def generate_topics(self): super().generate_topics() @@ -35,9 
+36,6 @@ def _init_extra_items(self): def print_connected_info(self): super().print_connected_info() - # Override - def _process_job_complete_status(self, run_id, server_id, complete_payload): - # Complete the job runner - self._get_job_runner_manager().complete_job_runner( - run_id, args=self.args, server_id=server_id, request_json=complete_payload, - run_as_cloud_agent=self.run_as_cloud_agent, run_as_cloud_server=self.run_as_cloud_server) + def generate_agent_instance(self): + from .master_agent import FedMLLaunchMasterAgent + return FedMLLaunchMasterAgent() diff --git a/python/fedml/computing/scheduler/master/server_login.py b/python/fedml/computing/scheduler/master/server_login.py index 8dd0696bc8..be7b73103f 100755 --- a/python/fedml/computing/scheduler/master/server_login.py +++ b/python/fedml/computing/scheduler/master/server_login.py @@ -41,4 +41,5 @@ def logout(): master_agent.login(args.api_key, api_key=args.api_key, device_id=args.device_id, os_name=args.os_name, role=args.role, runner_cmd=args.runner_cmd) else: + master_agent.stop() master_agent.logout() diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index ab6bc4c895..32f5ebdeab 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -1,12 +1,12 @@ import copy import json import logging +import multiprocessing import os import time import queue import traceback from abc import ABC -from multiprocessing import Queue import fedml from fedml.core.mlops import MLOpsRuntimeLog, MLOpsConfigs @@ -50,7 +50,7 @@ def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id self.replica_controller = None self.deployed_replica_payload = None self.slave_deployment_results_map = dict() - self.deployment_result_queue = Queue() + self.deployment_result_queue = 
multiprocessing.Manager().Queue() self.is_fresh_endpoint = True # Override diff --git a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py index 9e0d51b588..efa56f4db5 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py @@ -264,7 +264,8 @@ def callback_start_deployment(self, topic, payload): run_id, request_json, args=self.args, edge_id=self.edge_id, sender_message_queue=self.message_center.get_sender_message_queue(), listener_message_queue=self.get_listener_message_queue(), - status_center_queue=self.get_status_queue() + status_center_queue=self.get_status_queue(), + process_name=GeneralConstants.get_deploy_master_job_process_name(run_id, self.edge_id) ) process = self._get_job_runner_manager().get_runner_process(run_id) if process is not None: diff --git a/python/fedml/computing/scheduler/model_scheduler/model_device_client.py b/python/fedml/computing/scheduler/model_scheduler/model_device_client.py deleted file mode 100755 index 05f43afc5f..0000000000 --- a/python/fedml/computing/scheduler/model_scheduler/model_device_client.py +++ /dev/null @@ -1,98 +0,0 @@ - -import copy -import logging -import multiprocessing -import time -import traceback -from multiprocessing import Process -from ..scheduler_core.account_manager import FedMLAccountManager -from .worker_agent import FedMLDeployWorkerAgent - - -class FedMLModelDeviceClientRunner: - def __init__(self, args, current_device_id, os_name, is_from_docker, service_config, infer_host="127.0.0.1"): - self.agent_process = None - self.agent_runner = None - self.agent_process_event = None - self.args = copy.deepcopy(args) - self.service_config = service_config - self.unique_device_id = None - self.current_device_id = current_device_id - self.os_name = os_name - self.is_from_docker = is_from_docker - 
self.edge_id = None - self.infer_host = infer_host - self.redis_addr = "local" - self.redis_port = "6379" - self.redis_password = "fedml_default" - - def get_edge_id(self): - return self.edge_id - - def start(self): - self.agent_runner = FedMLModelDeviceClientRunner(self.args, self.current_device_id, self.os_name, - self.is_from_docker, self.service_config) - self.agent_runner.infer_host = self.infer_host - self.agent_runner.redis_addr = self.redis_addr - self.agent_runner.redis_port = self.redis_port - self.agent_runner.redis_password = self.redis_password - if self.agent_process_event is None: - self.agent_process_event = multiprocessing.Event() - self.agent_process = Process(target=self.agent_runner.run_entry, args=(self.agent_process_event, self.args,)) - self.edge_id = self.bind_device() - self.agent_process.start() - - def run_entry(self, process_event, in_args): - # print(f"Model worker process id {os.getpid()}") - - self.agent_process_event = process_event - - worker_agent = FedMLDeployWorkerAgent() - - while not self.agent_process_event.is_set(): - try: - try: - worker_agent.logout() - except Exception as e: - pass - - worker_agent.login( - in_args.account_id, api_key=in_args.api_key, device_id=in_args.device_id, - os_name=in_args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_WORKER_ON_PREM - ) - except Exception as e: - logging.info("Restart model device client: {}".format(traceback.format_exc())) - pass - finally: - try: - worker_agent.logout() - except Exception as e: - pass - time.sleep(15) - - try: - self.stop() - except Exception as e: - pass - - def check_runner_stop_event(self): - if self.agent_process_event is not None and self.agent_process_event.is_set(): - logging.info("Received stopping event.") - raise Exception("Runner stopped") - - def stop(self): - FedMLDeployWorkerAgent.logout() - - if self.agent_process_event is not None: - self.agent_process_event.set() - - def bind_device(self): - # Login account - login_result = 
FedMLAccountManager.get_instance().login( - self.args.account_id, api_key=self.args.api_key, device_id=self.args.device_id, - os_name=self.args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_WORKER_ON_PREM - ) - if login_result is not None: - return login_result.edge_id - else: - return None diff --git a/python/fedml/computing/scheduler/model_scheduler/model_device_server.py b/python/fedml/computing/scheduler/model_scheduler/model_device_server.py deleted file mode 100755 index b2ecd144b1..0000000000 --- a/python/fedml/computing/scheduler/model_scheduler/model_device_server.py +++ /dev/null @@ -1,97 +0,0 @@ - -import copy -import logging -import multiprocessing -import time -import traceback -from multiprocessing import Process -from ..scheduler_core.account_manager import FedMLAccountManager -from .master_agent import FedMLDeployMasterAgent - - -class FedMLModelDeviceServerRunner: - def __init__(self, args, current_device_id, os_name, is_from_docker, service_config, infer_host="127.0.0.1"): - self.agent_process = None - self.agent_runner = None - self.agent_process_event = None - self.args = copy.deepcopy(args) - self.service_config = service_config - self.unique_device_id = None - self.current_device_id = current_device_id - self.os_name = os_name - self.is_from_docker = is_from_docker - self.edge_id = None - self.infer_host = infer_host - self.redis_addr = "local" - self.redis_port = "6379" - self.redis_password = "fedml_default" - - def get_edge_id(self): - return self.edge_id - - def start(self): - self.agent_runner = FedMLModelDeviceServerRunner(self.args, self.current_device_id, self.os_name, - self.is_from_docker, self.service_config) - self.agent_runner.infer_host = self.infer_host - self.agent_runner.redis_addr = self.redis_addr - self.agent_runner.redis_port = self.redis_port - self.agent_runner.redis_password = self.redis_password - if self.agent_process_event is None: - self.agent_process_event = multiprocessing.Event() - self.agent_process = 
Process(target=self.agent_runner.run_entry, args=(self.agent_process_event, self.args)) - self.edge_id = self.bind_device() - self.agent_process.start() - - def run_entry(self, process_event, in_args): - # print(f"Model master process id {os.getpid()}") - - self.agent_process_event = process_event - master_agent = FedMLDeployMasterAgent() - - while not self.agent_process_event.is_set(): - try: - try: - master_agent.logout() - except Exception as e: - pass - - master_agent.login( - in_args.account_id, api_key=in_args.api_key, device_id=in_args.device_id, - os_name=in_args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_MASTER_ON_PREM - ) - except Exception as e: - logging.info("Restart model device server: {}".format(traceback.format_exc())) - pass - finally: - try: - master_agent.logout() - except Exception as e: - pass - time.sleep(15) - - try: - self.stop() - except Exception as e: - pass - - def check_runner_stop_event(self): - if self.agent_process_event is not None and self.agent_process_event.is_set(): - logging.info("Received stopping event.") - raise Exception("Runner stopped") - - def stop(self): - FedMLDeployMasterAgent.logout() - - if self.agent_process_event is not None: - self.agent_process_event.set() - - def bind_device(self): - # Login account - login_result = FedMLAccountManager.get_instance().login( - self.args.account_id, api_key=self.args.api_key, device_id=self.args.device_id, - os_name=self.args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_MASTER_ON_PREM - ) - if login_result is not None: - return login_result.edge_id - else: - return None diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py index b1d0bebc47..9204291c48 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py @@ -12,6 +12,7 @@ from 
.device_model_msg_object import FedMLModelMsgObject from .device_client_constants import ClientConstants from .device_client_data_interface import FedMLClientDataInterface +from ..scheduler_core.general_constants import GeneralConstants from ..slave.base_slave_protocol_manager import FedMLBaseSlaveProtocolManager from .worker_job_runner_manager import FedMLDeployJobRunnerManager from .device_mqtt_inference_protocol import FedMLMqttInference @@ -163,7 +164,8 @@ def callback_start_deployment(self, topic, payload): run_id, request_json, args=self.args, edge_id=self.edge_id, sender_message_queue=self.message_center.get_sender_message_queue(), listener_message_queue=self.get_listener_message_queue(), - status_center_queue=self.get_status_queue() + status_center_queue=self.get_status_queue(), + process_name=GeneralConstants.get_deploy_slave_job_process_name(run_id, self.edge_id) ) process = self._get_job_runner_manager().get_runner_process(run_id) if process is not None: diff --git a/python/fedml/computing/scheduler/scheduler_core/account_manager.py b/python/fedml/computing/scheduler/scheduler_core/account_manager.py index 3b80511d12..8d73a42679 100755 --- a/python/fedml/computing/scheduler/scheduler_core/account_manager.py +++ b/python/fedml/computing/scheduler/scheduler_core/account_manager.py @@ -25,6 +25,7 @@ class FedMLAccountManager(Singleton): ROLE_CLOUD_SERVER = "cloud_server" ROLE_EDGE_DEVICE = "client" ROLE_GPU_PROVIDER = "gpu_supplier" + ROLE_GPU_MASTER_SERVER = "gpu_master_server" ROLE_DEPLOY_MASTER_ON_PREM = "md.on_premise_device.master" ROLE_DEPLOY_WORKER_ON_PREM = "md.on_premise_device" @@ -33,6 +34,7 @@ class FedMLAccountManager(Singleton): DEVICE_ID_SUFFIX_CLOUD_SERVER = ".Public.Server" DEVICE_ID_SUFFIX_EDGE_DEVICE = ".Edge.Device" DEVICE_ID_SUFFIX_GPU_PROVIDER = ".Edge.GPU.Supplier" + DEVICE_ID_SUFFIX_GPU_MASTER_SERVER = ".Edge.GPU.MasterServer" DEVICE_ID_SUFFIX_DEPLOY = "MDA" DEVICE_ID_SUFFIX_DEPLOY_MASTER_ON_PREM = ".OnPremise.Master.Device" 
DEVICE_ID_SUFFIX_DEPLOY_WORKER_ON_PREM = ".OnPremise.Device" @@ -41,8 +43,7 @@ class FedMLAccountManager(Singleton): DEVICE_ID_DOCKER_HUB_TAG = ".DockerHub" def __init__(self): - if not hasattr(self, "agent_args"): - self.agent_args = None + pass @staticmethod def get_instance(): @@ -50,7 +51,7 @@ def get_instance(): def login(self, user_id, api_key="", device_id=None, os_name=None, role=None, runner_cmd=None): # Build the agent args - self.build_agent_args( + agent_args = self.build_agent_args( user_id, api_key=api_key, device_id=device_id, os_name=os_name, role=role, runner_cmd=runner_cmd ) @@ -93,8 +94,8 @@ def login(self, user_id, api_key="", device_id=None, os_name=None, role=None, ru # noinspection PyBroadException try: edge_id, user_name, extra_url, general_edge_id = FedMLAccountManager.bind_account_and_device_id( - service_config["ml_ops_config"]["EDGE_BINDING_URL"], self.agent_args.account_id, - self.agent_args.unique_device_id, self.agent_args.os_name, + service_config["ml_ops_config"]["EDGE_BINDING_URL"], agent_args.account_id, + agent_args.unique_device_id, agent_args.os_name, api_key=api_key, role=role ) if edge_id > 0: @@ -118,13 +119,13 @@ def login(self, user_id, api_key="", device_id=None, os_name=None, role=None, ru return None # Fill the bound result to agent args. 
- self.fill_argent_args( - log_server_url=log_server_url, server_id=edge_id, + agent_args = self.fill_argent_args( + agent_args, log_server_url=log_server_url, server_id=edge_id, edge_id=edge_id, general_edge_id=general_edge_id, user_name=user_name, extra_url=extra_url, agent_config=service_config) - return self.agent_args + return agent_args def build_agent_args(self, user_id, api_key=None, device_id=None, os_name=None, role=None, runner_cmd=None): # Generate the suffix for device based on the role @@ -144,6 +145,9 @@ def build_agent_args(self, user_id, api_key=None, device_id=None, os_name=None, device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_EDGE_DEVICE elif role == FedMLAccountManager.ROLE_GPU_PROVIDER: device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_GPU_PROVIDER + elif role == FedMLAccountManager.ROLE_GPU_MASTER_SERVER: + device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_GPU_MASTER_SERVER + is_master = True elif role == FedMLAccountManager.ROLE_DEPLOY_MASTER_ON_PREM: device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_DEPLOY_MASTER_ON_PREM is_master = True @@ -154,32 +158,31 @@ def build_agent_args(self, user_id, api_key=None, device_id=None, os_name=None, # Build the agent args version = fedml.get_env_version() - if self.agent_args is None: - self.agent_args = AgentArgs() - self.agent_args.role = role - self.agent_args.account_id = user_id - self.agent_args.api_key = api_key - self.agent_args.current_running_dir = GeneralConstants.get_deploy_fedml_home_dir(is_master=is_master) \ + agent_args = AgentArgs() + agent_args.role = role + agent_args.account_id = user_id + agent_args.api_key = api_key + agent_args.current_running_dir = GeneralConstants.get_deploy_fedml_home_dir(is_master=is_master) \ if is_deploy else GeneralConstants.get_launch_fedml_home_dir(is_master=is_master) sys_name = platform.system() if sys_name == "Darwin": sys_name = "MacOS" - self.agent_args.os_name = sys_name if os_name is None or os_name == "" else os_name - 
self.agent_args.version = version - self.agent_args.log_file_dir = GeneralConstants.get_deploy_log_file_dir(is_master=is_master) \ + agent_args.os_name = sys_name if os_name is None or os_name == "" else os_name + agent_args.version = version + agent_args.log_file_dir = GeneralConstants.get_deploy_log_file_dir(is_master=is_master) \ if is_deploy else GeneralConstants.get_launch_log_file_dir(is_master=is_master) is_from_docker = False if device_id is not None and device_id != "0": - self.agent_args.current_device_id = device_id + agent_args.current_device_id = device_id else: data_dir = GeneralConstants.get_deploy_data_dir(is_master=is_master) \ if is_deploy else GeneralConstants.get_launch_data_dir(is_master=is_master) is_gpu_provider = True if role == FedMLAccountManager.ROLE_GPU_PROVIDER else False - self.agent_args.current_device_id = FedMLAccountManager.get_device_id( + agent_args.current_device_id = FedMLAccountManager.get_device_id( data_dir=data_dir, use_machine_id=is_gpu_provider) - self.agent_args.device_id = self.agent_args.current_device_id - self.agent_args.config_version = version - self.agent_args.cloud_region = "" + agent_args.device_id = agent_args.current_device_id + agent_args.config_version = version + agent_args.cloud_region = "" # Check if it is running in the fedml docker hub is_from_fedml_docker_hub = False @@ -191,26 +194,29 @@ def build_agent_args(self, user_id, api_key=None, device_id=None, os_name=None, # Build unique device id docker_tag = FedMLAccountManager.DEVICE_ID_DOCKER_TAG if is_from_docker else "" docker_tag = FedMLAccountManager.DEVICE_ID_DOCKER_HUB_TAG if is_from_fedml_docker_hub else docker_tag - unique_device_id = f"{self.agent_args.current_device_id}@{self.agent_args.os_name}" \ + unique_device_id = f"{agent_args.current_device_id}@{agent_args.os_name}" \ f"{docker_tag}{device_id_suffix}" if role == FedMLAccountManager.ROLE_CLOUD_SERVER: - unique_device_id = self.agent_args.current_device_id + unique_device_id = 
agent_args.current_device_id # Set the unique device id - self.agent_args.is_from_docker = is_from_docker or is_from_fedml_docker_hub - self.agent_args.unique_device_id = unique_device_id - self.agent_args.runner_cmd = runner_cmd + agent_args.is_from_docker = is_from_docker or is_from_fedml_docker_hub + agent_args.unique_device_id = unique_device_id + agent_args.runner_cmd = runner_cmd + + return agent_args def fill_argent_args( - self, log_server_url=None, server_id=None, edge_id=None, + self, agent_args, log_server_url=None, server_id=None, edge_id=None, user_name=None, extra_url=None, general_edge_id=None, agent_config=None): - self.agent_args.log_server_url = log_server_url - self.agent_args.server_id = server_id - self.agent_args.edge_id = edge_id - self.agent_args.user_name = user_name - self.agent_args.extra_url = extra_url - self.agent_args.general_edge_id = general_edge_id - self.agent_args.agent_config = agent_config + agent_args.log_server_url = log_server_url + agent_args.server_id = server_id + agent_args.edge_id = edge_id + agent_args.user_name = user_name + agent_args.extra_url = extra_url + agent_args.general_edge_id = general_edge_id + agent_args.agent_config = agent_config + return agent_args @staticmethod def write_login_failed_file(is_client=True): diff --git a/python/fedml/computing/scheduler/scheduler_core/general_constants.py b/python/fedml/computing/scheduler/scheduler_core/general_constants.py index 8c60b17bdf..0ab6f79577 100755 --- a/python/fedml/computing/scheduler/scheduler_core/general_constants.py +++ b/python/fedml/computing/scheduler/scheduler_core/general_constants.py @@ -65,6 +65,20 @@ class GeneralConstants: FEDML_OTA_CMD_RESTART = "restart" FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT = "MODEL_END_POINT" + FEDML_PROCESS_NAME_PREFIX = "fedml-process-" + FEDML_LAUNCH_MASTER_JOB_RUNNER_TAG = "launch-master-job-runner" + FEDML_LAUNCH_SLAVE_JOB_RUNNER_TAG = "launch-slave-job-runner" + FEDML_LAUNCH_MASTER_USER_JOB_TAG = 
"launch-master-user-job" + FEDML_DEPLOY_MASTER_JOB_RUNNER_TAG = "deploy-master-job-runner" + FEDML_DEPLOY_SLAVE_JOB_RUNNER_TAG = "deploy-slave-job-runner" + FEDML_DEPLOY_MASTER_USER_JOB_TAG = "deploy-master-user-job" + FEDML_MESSAGE_CENTER_LISTENER_TAG = "message-center-listener" + FEDML_MESSAGE_CENTER_SENDER_TAG = "message-center-sender" + FEDML_STATUS_CENTER_TAG = "status-center" + FEDML_LOG_PROCESS_TAG = "log" + FEDML_MONITOR_PROCESS_TAG = "monitor" + + FEDML_TOPIC_STATUS_CENTER_STOP = "anywhere/status_center/stop" @staticmethod def get_package_unzip_dir(package_download_dir): @@ -216,3 +230,65 @@ def get_topic_complete_job(server_id): def get_payload_complete_job(run_id, server_id): payload_complete_job = {"runId": run_id, "serverId": server_id} return payload_complete_job + + @staticmethod + def get_process_name(process_tag, run_id=None, edge_id=None): + return f'{GeneralConstants.FEDML_PROCESS_NAME_PREFIX}{process_tag}'\ + f'{"-run-" + str(run_id) if run_id is not None and int(run_id) != 0 else ""}'\ + f'{"-edge-" + str(edge_id) if edge_id is not None else ""}' + + @staticmethod + def get_process_name_with_prefix(process_prefix, run_id=None, edge_id=None): + return f"{process_prefix}-run-{run_id}-edge-{edge_id}" + + @staticmethod + def get_launch_master_job_process_name(run_id, edge_id): + return GeneralConstants.get_process_name( + GeneralConstants.FEDML_LAUNCH_MASTER_JOB_RUNNER_TAG, run_id, edge_id) + + @staticmethod + def get_launch_slave_job_process_name(run_id, edge_id): + return GeneralConstants.get_process_name( + GeneralConstants.FEDML_LAUNCH_SLAVE_JOB_RUNNER_TAG, run_id, edge_id) + + @staticmethod + def get_launch_master_user_process_name(run_id, edge_id): + return GeneralConstants.get_process_name( + GeneralConstants.FEDML_LAUNCH_MASTER_USER_JOB_TAG, run_id, edge_id) + + @staticmethod + def get_deploy_master_job_process_name(run_id, edge_id): + return GeneralConstants.get_process_name( + GeneralConstants.FEDML_DEPLOY_MASTER_JOB_RUNNER_TAG, run_id, 
edge_id) + + @staticmethod + def get_deploy_slave_job_process_name(run_id, edge_id): + return GeneralConstants.get_process_name( + GeneralConstants.FEDML_DEPLOY_SLAVE_JOB_RUNNER_TAG, run_id, edge_id) + + @staticmethod + def get_deploy_master_user_process_name(run_id, edge_id): + return GeneralConstants.get_process_name( + GeneralConstants.FEDML_DEPLOY_MASTER_USER_JOB_TAG, run_id, edge_id) + + @staticmethod + def get_log_process_name(run_id, edge_id): + return GeneralConstants.get_process_name( + GeneralConstants.FEDML_LOG_PROCESS_TAG, run_id, edge_id) + + @staticmethod + def get_message_center_listener_process_name(message_center_name): + return f"{GeneralConstants.FEDML_PROCESS_NAME_PREFIX}{GeneralConstants.FEDML_MESSAGE_CENTER_LISTENER_TAG}-{message_center_name}" + + @staticmethod + def get_message_center_sender_process_name(message_center_name): + return f"{GeneralConstants.FEDML_PROCESS_NAME_PREFIX}{GeneralConstants.FEDML_MESSAGE_CENTER_SENDER_TAG}-{message_center_name}" + + @staticmethod + def get_status_center_process_name(status_center_tag): + return f"{GeneralConstants.FEDML_PROCESS_NAME_PREFIX}{GeneralConstants.FEDML_STATUS_CENTER_TAG}-{status_center_tag}" + + @staticmethod + def get_monitor_process_name(monitor_tag, run_id, edge_id): + return GeneralConstants.get_process_name( + f"{GeneralConstants.FEDML_MONITOR_PROCESS_TAG}-{monitor_tag}", run_id, edge_id) diff --git a/python/fedml/computing/scheduler/scheduler_core/message_center.py b/python/fedml/computing/scheduler/scheduler_core/message_center.py index dbe11700a0..5f414d1873 100755 --- a/python/fedml/computing/scheduler/scheduler_core/message_center.py +++ b/python/fedml/computing/scheduler/scheduler_core/message_center.py @@ -1,16 +1,20 @@ import json import logging import os +import platform import threading import time import traceback import uuid import multiprocessing -from multiprocessing import Process, Queue import queue from os.path import expanduser +import setproctitle + +import fedml from 
fedml.core.distributed.communication.mqtt.mqtt_manager import MqttManager +from .general_constants import GeneralConstants from ..slave.client_constants import ClientConstants from ....core.mlops.mlops_metrics import MLOpsMetrics from operator import methodcaller @@ -20,6 +24,7 @@ class FedMLMessageCenter(object): FUNC_SETUP_MESSAGE_CENTER = "setup_message_center" FUNC_REBUILD_MESSAGE_CENTER = "rebuild_message_center" + FUNC_PROCESS_EXTRA_QUEUES = "process_extra_queues" ENABLE_SAVE_MESSAGE_TO_FILE = True PUBLISH_MESSAGE_RETRY_TIMEOUT = 60 * 1000.0 PUBLISH_MESSAGE_RETRY_COUNT = 3 @@ -27,11 +32,12 @@ class FedMLMessageCenter(object): MESSAGE_SENT_SUCCESS_RECORDS_FILE = "message-sent-success-records.log" MESSAGE_RECEIVED_RECORDS_FILE = "message-received-records.log" - def __init__(self, agent_config=None, sender_message_queue=None, listener_message_queue=None): + def __init__(self, agent_config=None, sender_message_queue=None, + listener_message_queue=None, sender_message_event=None): self.sender_agent_config = agent_config self.listener_agent_config = agent_config self.sender_message_queue = sender_message_queue - self.message_event = None + self.message_event = sender_message_event self.message_center_process = None self.sender_mqtt_mgr = None self.sender_mlops_metrics = None @@ -130,21 +136,33 @@ def release_sender_mqtt_mgr(self): def get_sender_message_queue(self): return self.sender_message_queue + def get_sender_message_event(self): + return self.message_event + def start_sender(self, message_center_name=None): - self.sender_message_queue = Queue() + self.sender_message_queue = multiprocessing.Manager().Queue() self.message_event = multiprocessing.Event() self.message_event.clear() + process_name = GeneralConstants.get_message_center_sender_process_name(message_center_name) message_center = FedMLMessageCenter(agent_config=self.sender_agent_config, sender_message_queue=self.sender_message_queue) - self.message_center_process = Process( - 
target=message_center.run_sender, args=( - self.message_event, self.sender_message_queue, - message_center_name + if platform.system() == "Windows": + self.message_center_process = multiprocessing.Process( + target=message_center.run_sender, args=( + self.message_event, self.sender_message_queue, + message_center_name, process_name + ) + ) + else: + self.message_center_process = fedml.get_process( + target=message_center.run_sender, args=( + self.message_event, self.sender_message_queue, + message_center_name, process_name + ) ) - ) self.message_center_process.start() - def stop(self): + def stop_message_center(self): if self.message_event is not None: self.message_event.set() @@ -156,6 +174,10 @@ def check_message_stop_event(self): logging.info("Received message center stopping event.") raise MessageCenterStoppedException("Message center stopped (for sender)") + if self.listener_message_event is not None and self.listener_message_event.is_set(): + logging.info("Received message center stopping event.") + raise MessageCenterStoppedException("Message center stopped (for listener)") + def send_message(self, topic, payload, run_id=None): message_entity = FedMLMessageEntity(topic=topic, payload=payload, run_id=run_id) self.sender_message_queue.put(message_entity.get_message_body()) @@ -193,7 +215,13 @@ def retry_sending_undelivered_message(self): # Save the message self.save_message_record(message_entity.run_id, message_entity.device_id, sent_message_record) - def run_sender(self, message_event, message_queue, message_center_name): + def run_sender(self, message_event, message_queue, message_center_name, process_name=None): + if process_name is not None: + setproctitle.setproctitle(process_name) + + if platform.system() != "Windows": + os.setsid() + self.message_event = message_event self.sender_message_queue = message_queue self.message_center_name = message_center_name @@ -248,10 +276,16 @@ def run_sender(self, message_event, message_queue, message_center_name): 
self.release_sender_mqtt_mgr() + def get_protocol_communication_manager(self): + return None + def setup_listener_mqtt_mgr(self): if self.listener_mqtt_mgr is not None: return + # self.listener_mqtt_mgr = self.get_protocol_communication_manager() + # return + self.listener_mqtt_mgr = MqttManager( self.listener_agent_config["mqtt_config"]["BROKER_HOST"], self.listener_agent_config["mqtt_config"]["BROKER_PORT"], @@ -264,7 +298,11 @@ def setup_listener_mqtt_mgr(self): self.listener_mqtt_mgr.connect() self.listener_mqtt_mgr.loop_start() + def get_listener_communication_manager(self): + return self.listener_mqtt_mgr + def release_listener_mqtt_mgr(self): + #return try: if self.listener_mqtt_mgr is not None: self.listener_mqtt_mgr.loop_stop() @@ -287,6 +325,9 @@ def remove_message_listener(self, topic): self.listener_topics.remove(topic) self.listener_handler_funcs.pop(topic) + def get_listener_handler(self, topic): + return self.listener_handler_funcs.get(topic) + def get_message_runner(self): return None @@ -294,29 +335,42 @@ def get_listener_message_queue(self): return self.listener_message_queue def setup_listener_message_queue(self): - self.listener_message_queue = Queue() + self.listener_message_queue = multiprocessing.Manager().Queue() - def start_listener(self, sender_message_queue=None, listener_message_queue=None, agent_config=None, message_center_name=None): + def start_listener( + self, sender_message_queue=None, listener_message_queue=None, + sender_message_event=None, agent_config=None, message_center_name=None, extra_queues=None + ): if self.listener_message_center_process is not None: return if listener_message_queue is None: if self.listener_message_queue is None: - self.listener_message_queue = Queue() + self.listener_message_queue = multiprocessing.Manager().Queue() else: self.listener_message_queue = listener_message_queue self.listener_message_event = multiprocessing.Event() self.listener_message_event.clear() self.listener_agent_config = 
agent_config - message_runner = self.get_message_runner() + message_runner = self message_runner.listener_agent_config = agent_config - self.listener_message_center_process = Process( - target=message_runner.run_listener_dispatcher, args=( - self.listener_message_event, self.listener_message_queue, - self.listener_handler_funcs, sender_message_queue, - message_center_name + process_name = GeneralConstants.get_message_center_listener_process_name(message_center_name) + if platform.system() == "Windows": + self.listener_message_center_process = multiprocessing.Process( + target=message_runner.run_listener_dispatcher, args=( + self.listener_message_event, self.listener_message_queue, + self.listener_handler_funcs, sender_message_queue, + sender_message_event, message_center_name, extra_queues, process_name + ) + ) + else: + self.listener_message_center_process = fedml.get_process( + target=message_runner.run_listener_dispatcher, args=( + self.listener_message_event, self.listener_message_queue, + self.listener_handler_funcs, sender_message_queue, + sender_message_event, message_center_name, extra_queues, process_name + ) ) - ) self.listener_message_center_process.start() def check_listener_message_stop_event(self): @@ -349,13 +403,22 @@ def unsubscribe_msg(self, topic): self.listener_mqtt_mgr.unsubscribe_msg(topic) def run_listener_dispatcher( - self, message_event, message_queue, listener_funcs, sender_message_queue, - message_center_name + self, listener_message_event, listener_message_queue, + listener_funcs, sender_message_queue, sender_message_event, + message_center_name, extra_queues, process_name=None ): - self.listener_message_event = message_event - self.listener_message_queue = message_queue + if process_name is not None: + setproctitle.setproctitle(process_name) + + if platform.system() != "Windows": + os.setsid() + + self.listener_message_event = listener_message_event + self.listener_message_queue = listener_message_queue self.listener_handler_funcs = 
listener_funcs self.message_center_name = message_center_name + self.sender_message_queue = sender_message_queue + self.message_event = sender_message_event self.setup_listener_mqtt_mgr() @@ -364,6 +427,9 @@ def run_listener_dispatcher( else: methodcaller(FedMLMessageCenter.FUNC_REBUILD_MESSAGE_CENTER, sender_message_queue)(self) + if extra_queues is not None: + methodcaller(FedMLMessageCenter.FUNC_PROCESS_EXTRA_QUEUES, extra_queues)(self) + while True: message_entity = None try: @@ -378,7 +444,7 @@ def run_listener_dispatcher( # Get the message from the queue try: - message_body = message_queue.get(block=False, timeout=0.1) + message_body = listener_message_queue.get(block=False, timeout=0.1) except queue.Empty as e: # If queue is empty, then break loop message_body = None if message_body is None: @@ -402,6 +468,11 @@ def run_listener_dispatcher( message_handler_func_name = self.listener_handler_funcs.get(message_entity.topic, None) if message_handler_func_name is not None: methodcaller(message_handler_func_name, message_entity.topic, message_entity.payload)(self) + else: + if hasattr(self, "callback_proxy_unknown_messages") and \ + self.callback_proxy_unknown_messages is not None: + self.callback_proxy_unknown_messages( + message_entity.run_id, message_entity.topic, message_entity.payload) except Exception as e: if message_entity is not None: logging.info( diff --git a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py index 6e0010f556..7175032375 100755 --- a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py +++ b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py @@ -9,6 +9,8 @@ import traceback import zipfile import queue + +import fedml from ..comm_utils.constants import SchedulerConstants from ..comm_utils.job_utils import JobRunnerUtils, DockerArgs from ..scheduler_entry.constants import Constants @@ 
-73,6 +75,7 @@ def __init__(self, args, edge_id=0, request_json=None, agent_config=None, run_id self.user_name = None self.general_edge_id = None self.message_center = None + self.status_center = None self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = { "${FEDSYS.RUN_ID}": "", "${FEDSYS.PRIVATE_LOCAL_DATA}": "", @@ -208,9 +211,15 @@ def retrieve_and_unzip_package(self, package_name, package_url): # Open a process to download the package so that we can avoid the request is blocked and check the timeout. from multiprocessing import Process completed_event = multiprocessing.Event() - info_queue = multiprocessing.Queue() - download_process = Process(target=self.download_package_proc, - args=(package_url, local_package_file, completed_event, info_queue)) + info_queue = multiprocessing.Manager().Queue() + if platform.system() == "Windows": + download_process = multiprocessing.Process( + target=self.download_package_proc, + args=(package_url, local_package_file, completed_event, info_queue)) + else: + download_process = fedml.get_process( + target=self.download_package_proc, + args=(package_url, local_package_file, completed_event, info_queue)) download_process.start() allowed_block_download_time = 60 download_finished = False @@ -606,7 +615,8 @@ def job_error_processor(self, error_list): def start_runner_process( self, run_id, edge_id, request_json, cuda_visible_gpu_ids_str=None, - sender_message_queue=None, status_center_queue=None + sender_message_queue=None, listener_message_queue=None, + status_center_queue=None, process_name=None ): return None @@ -640,8 +650,8 @@ def rebuild_message_status_center(self, sender_message_queue, listener_message_q self.mlops_metrics.set_messenger(self.message_center) self.mlops_metrics.run_id = self.run_id - status_center = FedMLStatusCenter.rebuild_status_center_from_queue(status_queue) + self.status_center = FedMLStatusCenter.rebuild_status_center_from_queue(status_queue) if self.status_reporter is None: self.status_reporter = MLOpsMetrics() - 
self.status_reporter.set_messenger(status_center) + self.status_reporter.set_messenger(self.status_center) self.status_reporter.run_id = self.run_id diff --git a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py index dcc4045699..ad32f78631 100755 --- a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py +++ b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py @@ -20,7 +20,7 @@ def start_job_runner( self, run_id, request_json, args=None, edge_id=None, is_server_job=False, sender_message_queue=None, listener_message_queue=None, status_center_queue=None, should_start_cloud_server=False, use_local_process_as_cloud_server=False, - cuda_visible_gpu_ids_str=None + cuda_visible_gpu_ids_str=None, process_name=None ): run_id_str = str(run_id) self.job_runners[run_id_str] = self._generate_job_runner_instance( @@ -29,9 +29,11 @@ def start_job_runner( ) self.job_runners[run_id_str].start_runner_process( run_id, request_json, edge_id=edge_id, + cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, sender_message_queue=sender_message_queue, listener_message_queue=listener_message_queue, - status_center_queue=status_center_queue + status_center_queue=status_center_queue, + process_name=process_name ) def stop_job_runner(self, run_id): diff --git a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_protocol_manager.py b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_protocol_manager.py index 19bb7e9882..9970b1d3f6 100755 --- a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_protocol_manager.py +++ b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_protocol_manager.py @@ -2,11 +2,13 @@ import json import logging import multiprocessing +import os import sys import time import traceback import uuid import fedml +from 
..comm_utils.run_process_utils import RunProcessUtils from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager from ....core.mlops.mlops_metrics import MLOpsMetrics @@ -45,6 +47,7 @@ def __init__(self, args, agent_config=None, is_master=False): self.mlops_metrics = None self.status_reporter = None self.user_name = args.user_name + self.parent_agent = None fedml._init_multiprocessing() @@ -58,38 +61,54 @@ def add_protocol_handler(self): # self.add_message_listener(self.topic_start_train, self.callback_start_train) pass - def initialize(self): + def initialize( + self, communication_manager=None, sender_message_queue=None, + status_center_queue=None, sender_message_event=None + ): # Generate the message topics self.generate_topics() # Setup MQTT connection - self.communication_mgr = MqttManager( - self.agent_config["mqtt_config"]["BROKER_HOST"], - self.agent_config["mqtt_config"]["BROKER_PORT"], - self.agent_config["mqtt_config"]["MQTT_USER"], - self.agent_config["mqtt_config"]["MQTT_PWD"], - self.agent_config["mqtt_config"]["MQTT_KEEPALIVE"], - f"FedML_Agent_Daemon_@{self.user_name}@_@{self.current_device_id}@_@{str(uuid.uuid4())}@", - self.topic_last_will, - json.dumps({"ID": self.edge_id, "status": GeneralConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE}) - ) + if communication_manager is None: + self.communication_mgr = MqttManager( + self.agent_config["mqtt_config"]["BROKER_HOST"], + self.agent_config["mqtt_config"]["BROKER_PORT"], + self.agent_config["mqtt_config"]["MQTT_USER"], + self.agent_config["mqtt_config"]["MQTT_PWD"], + self.agent_config["mqtt_config"]["MQTT_KEEPALIVE"], + f"FedML_Agent_Daemon_@{self.user_name}@_@{self.current_device_id}@_@{str(uuid.uuid4())}@", + self.topic_last_will, + json.dumps({"ID": self.edge_id, "status": GeneralConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE}) + ) + else: + self.communication_mgr = communication_manager # Add the message listeners for all topics 
self.add_protocol_handler() # Start the message center to process edge related messages. - self.setup_message_center() + if sender_message_queue is None: + self.setup_message_center() + sender_message_event = self.message_center.get_sender_message_event() + else: + self.rebuild_message_center(sender_message_queue) # Setup the message listener queue self.setup_listener_message_queue() # Start the status center to process edge related status. - self.start_status_listener_center() + if status_center_queue is None: + self.start_status_listener_center(sender_message_event=sender_message_event) + else: + self.set_status_queue(status_center_queue) + self.rebuild_status_center(status_center_queue) # Start the message center for listener self.start_listener(sender_message_queue=self.message_center.get_sender_message_queue(), + sender_message_event=sender_message_event, agent_config=self.agent_config, - message_center_name=self.message_center_name) + message_center_name=self.message_center_name, + extra_queues=[self.get_status_queue()]) # Init extra items, e.g. database, recovery, etc. 
self._init_extra_items() @@ -97,11 +116,11 @@ def initialize(self): # Setup MQTT connected listener self.communication_mgr.add_connected_listener(self.on_agent_communication_connected) self.communication_mgr.add_disconnected_listener(self.on_agent_communication_disconnected) - self.communication_mgr.connect() def start(self): # Start MQTT message loop try: + self.communication_mgr.connect() self.communication_mgr.loop_forever() except Exception as e: if str(e) == "Restarting after upgraded...": @@ -110,6 +129,8 @@ def start(self): logging.info("Server tracing: {}".format(traceback.format_exc())) finally: + logging.info(f"Protocol manager is about to exit, pid: {os.getpid()}") + FedMLAccountManager.write_login_failed_file(is_client=not self.is_master_agent) self.stop() @@ -120,7 +141,7 @@ def start(self): clean_process_group=False) sys.exit(1) - def stop(self): + def stop(self, kill_process=False): if self.communication_mgr is not None: # noinspection PyBroadException try: @@ -132,7 +153,10 @@ def stop(self): self.communication_mgr.loop_stop() self.communication_mgr.disconnect() - self.release_message_center() + if kill_process: + self.post_status_center_stopping_message() + self.release_message_center() + RunProcessUtils.kill_process(os.getppid(), exclude_current_pid=True) @abstractmethod def _init_extra_items(self): @@ -196,20 +220,37 @@ def rebuild_message_center(self, message_center_queue): def release_message_center(self): try: + self.stop_message_center() + if self.message_center is not None: - self.message_center.stop() + self.message_center.stop_message_center() self.message_center = None except Exception as e: logging.error( - f"Failed to release slave communication manager with Exception {e}. " + f"Failed to release the message center with Exception {e}. 
" + f"Traceback: {traceback.format_exc()}") + pass + + def release_status_center(self): + try: + self.stop_status_center() + + if self.status_center is not None: + self.status_center.stop_status_center() + self.status_center = None + + except Exception as e: + logging.error( + f"Failed to release the status center with Exception {e}. " f"Traceback: {traceback.format_exc()}") pass - def start_status_listener_center(self): + def start_status_listener_center(self, sender_message_event=None): self.start_status_center( sender_message_center_queue=self.message_center.get_sender_message_queue(), listener_message_center_queue=self.get_listener_message_queue(), + sender_message_event=sender_message_event, is_slave_agent=not self.is_master_agent ) @@ -231,6 +272,9 @@ def rebuild_status_center(self, status_center_queue): self.status_reporter.edge_id = self.edge_id self.status_reporter.server_agent_id = self.server_agent_id + def process_extra_queues(self, extra_queues): + pass + def generate_status_report(self, run_id, edge_id, server_agent_id=None): status_reporter = MLOpsMetrics() status_reporter.set_messenger(self, send_message_func=self.send_status_message) @@ -266,6 +310,29 @@ def get_status_runner(self): return None + def get_protocol_communication_manager(self): + return self.communication_mgr + + def get_protocol_sender_message_queue(self): + return self.message_center.get_sender_message_queue() + + def get_protocol_sender_message_event(self): + return self.message_center.get_sender_message_event() + + def get_protocol_status_center_queue(self): + return self.get_status_queue() + + def get_subscribed_topics(self): + return self.subscribed_topics + def send_agent_active_msg(self, edge_id): active_msg = {"ID": edge_id, "status": GeneralConstants.MSG_MLOPS_SERVER_STATUS_IDLE} self.message_center.send_message_json(self.topic_active, json.dumps(active_msg)) + + def post_status_center_stopping_message(self, run_id=None): + topic_status_center_stopping = 
GeneralConstants.FEDML_TOPIC_STATUS_CENTER_STOP + payload = {"run_id": run_id} + self.status_reporter.send_message(topic_status_center_stopping, json.dumps(payload)) + + def set_parent_agent(self, parent_agent): + self.parent_agent = parent_agent diff --git a/python/fedml/computing/scheduler/scheduler_core/status_center.py b/python/fedml/computing/scheduler/scheduler_core/status_center.py index 97c2115e76..b1462d7ea9 100755 --- a/python/fedml/computing/scheduler/scheduler_core/status_center.py +++ b/python/fedml/computing/scheduler/scheduler_core/status_center.py @@ -1,10 +1,16 @@ import logging +import os +import platform import time from enum import Enum, unique import multiprocessing -from multiprocessing import Process, Queue import queue + +import setproctitle + +import fedml +from .general_constants import GeneralConstants from .message_common import FedMLMessageEntity, FedMLStatusEntity from .message_center import FedMLMessageCenter import traceback @@ -81,6 +87,7 @@ class FedMLStatusCenter(object): TOPIC_SLAVE_JOB_LAUNCH_SUFFIX = "/start_train" TOPIC_SLAVE_JOB_STOP_PREFIX = "flserver_agent/" TOPIC_SLAVE_JOB_STOP_SUFFIX = "/stop_train" + TOPIC_STATUS_CENTER_STOP_PREFIX = GeneralConstants.FEDML_TOPIC_STATUS_CENTER_STOP ALLOWED_MAX_JOB_STATUS_CACHE_NUM = 1000 def __init__(self, message_queue=None): @@ -105,25 +112,42 @@ def get_status_runner(self): return None def start_status_center(self, sender_message_center_queue=None, - listener_message_center_queue=None, is_slave_agent=False): - self.status_queue = Queue() + listener_message_center_queue=None, + sender_message_event=None, + is_slave_agent=False): + self.status_queue = multiprocessing.Manager().Queue() self.status_event = multiprocessing.Event() self.status_event.clear() self.status_sender_message_center_queue = sender_message_center_queue self.status_listener_message_center_queue = listener_message_center_queue - self.status_runner = self.get_status_runner() + self.status_runner = self + process_name = 
GeneralConstants.get_status_center_process_name( + f'{"deploy" if self.is_deployment_status_center else "launch"}_' + f'{"slave" if is_slave_agent else "master"}_agent') target_func = self.status_runner.run_status_dispatcher if not is_slave_agent else \ self.status_runner.run_status_dispatcher_in_slave - self.status_center_process = Process( - target=target_func, args=( - self.status_event, self.status_queue, self.status_sender_message_center_queue, - self.status_listener_message_center_queue + if platform.system() == "Windows": + self.status_center_process = multiprocessing.Process( + target=target_func, args=( + self.status_event, self.status_queue, self.status_sender_message_center_queue, + self.status_listener_message_center_queue, sender_message_event, process_name + ) + ) + else: + self.status_center_process = fedml.get_process( + target=target_func, args=( + self.status_event, self.status_queue, self.status_sender_message_center_queue, + self.status_listener_message_center_queue, sender_message_event, process_name + ) ) - ) self.status_center_process.start() - def check_message_stop_event(self): + def stop_status_center(self): + if self.status_event is not None: + self.status_event.set() + + def check_status_stop_event(self): if self.status_event is not None and self.status_event.is_set(): logging.info("Received status center stopping event.") raise StatusCenterStoppedException("Status center stopped (for sender)") @@ -142,6 +166,9 @@ def send_status_message(self, topic, payload): def get_status_queue(self): return self.status_queue + def set_status_queue(self, status_queue): + self.status_queue = status_queue + def status_center_process_master_status(self, topic, payload): pass @@ -156,7 +183,14 @@ def rebuild_status_center(self, status_queue): def run_status_dispatcher(self, status_event, status_queue, sender_message_center_queue, - listener_message_center_queue): + listener_message_center_queue, + sender_message_event, process_name=None): + if 
process_name is not None: + setproctitle.setproctitle(process_name) + + if platform.system() != "Windows": + os.setsid() + # Save the parameters self.status_event = status_event self.status_queue = status_queue @@ -169,10 +203,11 @@ def run_status_dispatcher(self, status_event, status_queue, self.rebuild_message_center(sender_message_center_queue) message_center = FedMLMessageCenter( sender_message_queue=sender_message_center_queue, - listener_message_queue=listener_message_center_queue + listener_message_queue=listener_message_center_queue, + sender_message_event=sender_message_event ) - if sender_message_center_queue is not None: + if status_queue is not None: self.rebuild_status_center(status_queue) # Init status manager instances @@ -183,7 +218,7 @@ def run_status_dispatcher(self, status_event, status_queue, # Check if we should stop status dispatcher try: - self.check_message_stop_event() + self.check_status_stop_event() except StatusCenterStoppedException as e: break @@ -203,6 +238,12 @@ def run_status_dispatcher(self, status_event, status_queue, message_entity = FedMLMessageEntity(message_body=message_body) status_entity = FedMLStatusEntity(status_msg_body=message_body) + if message_entity.topic.startswith(FedMLStatusCenter.TOPIC_STATUS_CENTER_STOP_PREFIX): + # Process the stop message for message center and status center + message_center.stop_message_center() + self.stop_status_center() + continue + # Generate status manager instance run_id_str = str(status_entity.run_id) run_id_int = int(status_entity.run_id) @@ -252,7 +293,14 @@ def run_status_dispatcher(self, status_event, status_queue, def run_status_dispatcher_in_slave(self, status_event, status_queue, sender_message_center_queue, - listener_message_center_queue): + listener_message_center_queue, + sender_message_event, process_name=None): + if process_name is not None: + setproctitle.setproctitle(process_name) + + if platform.system() != "Windows": + os.setsid() + # Save the parameters 
self.status_event = status_event self.status_queue = status_queue @@ -265,10 +313,11 @@ def run_status_dispatcher_in_slave(self, status_event, status_queue, self.rebuild_message_center(sender_message_center_queue) message_center = FedMLMessageCenter( sender_message_queue=sender_message_center_queue, - listener_message_queue=listener_message_center_queue + listener_message_queue=listener_message_center_queue, + sender_message_event=sender_message_event ) - if sender_message_center_queue is not None: + if status_queue is not None: self.rebuild_status_center(status_queue) # Init status manager instances @@ -280,7 +329,7 @@ def run_status_dispatcher_in_slave(self, status_event, status_queue, # Check if we should stop status dispatcher try: - self.check_message_stop_event() + self.check_status_stop_event() except StatusCenterStoppedException as e: break diff --git a/python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py b/python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py index e045458db5..ec98cc7906 100755 --- a/python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py +++ b/python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py @@ -136,12 +136,14 @@ def process_job_completed_status(self, master_id, status): # self.stop_cloud_server() # self.remove_listener_for_run_metrics(self.run_id) # self.remove_listener_for_run_logs(self.run_id) + self.message_center.receive_message( GeneralConstants.get_topic_complete_job(master_id), json.dumps(GeneralConstants.get_payload_complete_job(self.run_id, master_id))) - if self.status_center.is_deployment_status_center and status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED: - self.report_deployment_status(self.run_id, GeneralConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) + if self.status_center.is_deployment_status_center: + if status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED: + self.report_deployment_status(self.run_id, 
GeneralConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) def process_job_exception_status(self, master_id, status): # Report exception job status @@ -187,16 +189,17 @@ def process_job_status_consensus(self, run_id, master_id, status): status = self.get_entire_job_status() # Set the device status based on the job status - for edge_id_item, edge_status_item in self.edge_status_dict.items(): - if edge_id_item == "server": - continue - - # Calc the device status based on the job status - consensus_device_status = FedMLStatusManager.get_device_consensus_status_in_job( - status, edge_status_item) - if consensus_device_status is not None: - self.message_reporter.report_client_training_status( - edge_id_item, consensus_device_status, run_id=run_id, update_db=False) + if self.edge_status_dict is not None: + for edge_id_item, edge_status_item in self.edge_status_dict.items(): + if edge_id_item == "server": + continue + + # Calc the device status based on the job status + consensus_device_status = FedMLStatusManager.get_device_consensus_status_in_job( + status, edge_status_item) + if consensus_device_status is not None: + self.message_reporter.report_client_training_status( + edge_id_item, consensus_device_status, run_id=run_id, update_db=False) # Save the job status to local storage FedMLServerDataInterface.get_instance().save_job_status(run_id, master_id, status, status) diff --git a/python/fedml/computing/scheduler/slave/base_slave_agent.py b/python/fedml/computing/scheduler/slave/base_slave_agent.py index 01c0a39195..9876ac9912 100755 --- a/python/fedml/computing/scheduler/slave/base_slave_agent.py +++ b/python/fedml/computing/scheduler/slave/base_slave_agent.py @@ -24,7 +24,9 @@ def __init__(self): def login( self, userid, api_key=None, device_id=None, - os_name=None, need_to_check_gpu=False, role=None + os_name=None, need_to_check_gpu=False, role=None, + communication_manager=None, sender_message_queue=None, + status_center_queue=None, sender_message_event=None ): # 
Preprocess the login args if need_to_check_gpu: @@ -33,7 +35,7 @@ def login( print("We can't find any gpu device on your machine. \n" "With the gpu_supplier(-g) option, you need to check if your machine " "has nvidia GPUs and installs CUDA related drivers.") - return + return None # Login account login_result = FedMLAccountManager.get_instance().login( @@ -57,17 +59,22 @@ def login( # Initialize the protocol manager # noinspection PyBoardException try: - self._initialize_protocol_manager() + self._initialize_protocol_manager( + communication_manager=communication_manager, + sender_message_queue=sender_message_queue, + status_center_queue=status_center_queue, + sender_message_event=sender_message_event) except Exception as e: FedMLAccountManager.write_login_failed_file(is_client=True) self.protocol_mgr.stop() raise e + return login_result + + def start(self): # Start the protocol manager to process the messages from MLOps and slave agents. self.protocol_mgr.start() - return login_result - @staticmethod def logout(): GeneralConstants.cleanup_run_process(None) @@ -84,12 +91,20 @@ def _create_protocol_manager(self, login_result): self.protocol_mgr.user_name = login_result.user_name self.protocol_mgr.agent_config = login_result.agent_config - def _initialize_protocol_manager(self): + def _initialize_protocol_manager( + self, communication_manager=None, sender_message_queue=None, + status_center_queue=None, sender_message_event=None + ): # Init local database self._init_database() # Initialize the master protocol - self.protocol_mgr.initialize() + self.protocol_mgr.set_parent_agent(self) + self.protocol_mgr.initialize( + communication_manager=communication_manager, + sender_message_queue=sender_message_queue, + status_center_queue=status_center_queue, + sender_message_event=sender_message_event) # Start the client API process self._start_slave_api() @@ -122,6 +137,9 @@ def _start_slave_api(self): should_capture_stderr=False ) + def get_protocol_manager(self): + return 
self.protocol_mgr + @abstractmethod def _get_log_file_dir(self): pass @@ -137,3 +155,8 @@ def _init_database(self): @abstractmethod def _generate_protocol_manager_instance(self, args, agent_config=None): return None + + def save_deploy_ids(self, deploy_master_edge_id=None, deploy_slave_edge_id=None): + self.protocol_mgr.save_deploy_ids( + deploy_master_edge_id=deploy_master_edge_id, deploy_slave_edge_id=deploy_slave_edge_id) + diff --git a/python/fedml/computing/scheduler/slave/base_slave_job_runner.py b/python/fedml/computing/scheduler/slave/base_slave_job_runner.py index 5e530dbba7..0486b131a6 100755 --- a/python/fedml/computing/scheduler/slave/base_slave_job_runner.py +++ b/python/fedml/computing/scheduler/slave/base_slave_job_runner.py @@ -7,6 +7,9 @@ import traceback from abc import ABC, abstractmethod +import setproctitle + +import fedml from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon from .client_data_interface import FedMLClientDataInterface @@ -47,8 +50,12 @@ def __repr__(self): ) def run(self, process_event, completed_event, run_extend_queue_list, - sender_message_center, listener_message_queue, status_center_queue): - print(f"Client runner process id {os.getpid()}, run id {self.run_id}") + sender_message_center, listener_message_queue, status_center_queue, + process_name=None): + if process_name is not None: + setproctitle.setproctitle(process_name) + + print(f"Client runner process id {os.getpid()}, name {process_name}, run id {self.run_id}") if platform.system() != "Windows": os.setsid() @@ -244,7 +251,7 @@ def reset_devices_status(self, edge_id, status): def start_runner_process( self, run_id, request_json, edge_id=None, sender_message_queue=None, listener_message_queue=None, - status_center_queue=None, cuda_visible_gpu_ids_str=None + status_center_queue=None, cuda_visible_gpu_ids_str=None, process_name=None ): client_runner = self._generate_job_runner_instance( 
self.args, run_id=run_id, request_json=request_json, @@ -259,9 +266,17 @@ def start_runner_process( client_runner.server_id = request_json.get("server_id", "0") self.run_extend_queue_list = self._generate_extend_queue_list() logging.info("start the runner process.") - self.run_process = Process(target=client_runner.run, args=( - self.run_process_event, self.run_process_completed_event, self.run_extend_queue_list, - sender_message_queue, listener_message_queue, status_center_queue - )) + + if platform.system() == "Windows": + self.run_process = multiprocessing.Process( + target=client_runner.run, args=( + self.run_process_event, self.run_process_completed_event, self.run_extend_queue_list, + sender_message_queue, listener_message_queue, status_center_queue, process_name + )) + else: + self.run_process = fedml.get_process(target=client_runner.run, args=( + self.run_process_event, self.run_process_completed_event, self.run_extend_queue_list, + sender_message_queue, listener_message_queue, status_center_queue, process_name + )) self.run_process.start() return self.run_process diff --git a/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py b/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py index 447bd05cd9..534ee2f7d0 100755 --- a/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py +++ b/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py @@ -62,8 +62,6 @@ def __init__(self, args, agent_config=None): self.server_id = args.server_id self.model_device_server_id = None self.model_device_client_edge_id_list = None - self.model_device_server = None - self.model_device_client_list = None @abstractmethod def generate_topics(self): @@ -147,15 +145,9 @@ def add_subscribe_topic(self, topic): self.subscribed_topics.append(topic) def stop(self): - if self.model_device_server is not None: - self.model_device_server.stop() - self.model_device_server = None - - if self.model_device_client_list is not None: - 
for model_client in self.model_device_client_list: - model_client.stop() - self.model_device_client_list.clear() - self.model_device_client_list = None + if self.model_device_client_edge_id_list is not None: + self.model_device_client_edge_id_list.clear() + self.model_device_client_edge_id_list = None super().stop() @@ -265,6 +257,8 @@ def callback_start_train(self, topic, payload): # Report the run status with finished status and return self.generate_status_report(run_id, edge_id, server_agent_id=server_agent_id).report_client_id_status( edge_id, GeneralConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, run_id=run_id) + + MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, edge_id) return logging.info( f"Run started, available gpu ids: {JobRunnerUtils.get_instance().get_available_gpu_id_list(edge_id)}") @@ -282,6 +276,7 @@ def callback_start_train(self, topic, payload): listener_message_queue=self.get_listener_message_queue(), status_center_queue=self.get_status_queue(), cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, + process_name=GeneralConstants.get_launch_slave_job_process_name(run_id, edge_id) ) run_process = self._get_job_runner_manager().get_runner_process(run_id) if run_process is not None: diff --git a/python/fedml/computing/scheduler/slave/client_data_interface.py b/python/fedml/computing/scheduler/slave/client_data_interface.py index 0e9e84381a..74bf7a64a3 100755 --- a/python/fedml/computing/scheduler/slave/client_data_interface.py +++ b/python/fedml/computing/scheduler/slave/client_data_interface.py @@ -343,6 +343,15 @@ def handle_database_compatibility(self): self.close_job_db() + def check_if_table_exist(self, current_db_cursor): + results = current_db_cursor.execute("select * from sqlite_master where type='table' and name='jobs';") + if results is None: + return False + result_len = 0 + for row in results: + result_len += 1 + return False if result_len == 0 else True + def get_agent_status(self, edge_id=0): self.open_job_db() 
enabled = 1 diff --git a/python/fedml/computing/scheduler/slave/client_login.py b/python/fedml/computing/scheduler/slave/client_login.py index 95c772a225..7a1c759410 100755 --- a/python/fedml/computing/scheduler/slave/client_login.py +++ b/python/fedml/computing/scheduler/slave/client_login.py @@ -1,11 +1,11 @@ import argparse import os import fedml -from fedml.computing.scheduler.slave.slave_agent import FedMLLaunchSlaveAgent +from fedml.computing.scheduler.slave.united_agents import FedMLUnitedAgent def logout(): - FedMLLaunchSlaveAgent.logout() + FedMLUnitedAgent.get_instance().logout() if __name__ == "__main__": @@ -18,6 +18,7 @@ def logout(): parser.add_argument("--version", "-v", type=str, default="release") parser.add_argument("--local_server", "-ls", type=str, default="127.0.0.1") parser.add_argument("--role", "-r", type=str, default="client") + parser.add_argument("--runner_cmd", "-rc", type=str, default="{}") parser.add_argument("--device_id", "-id", type=str, default="0") parser.add_argument("--os_name", "-os", type=str, default="") parser.add_argument("--api_key", "-k", type=str, default="") @@ -36,9 +37,10 @@ def logout(): fedml.set_local_on_premise_platform_port(args.local_on_premise_platform_port) fedml.set_env_version(args.version) - slave_agent = FedMLLaunchSlaveAgent() + united_agents = FedMLUnitedAgent.get_instance() if args.type == 'login': - slave_agent.login(args.api_key, api_key=args.api_key, device_id=args.device_id, - os_name=args.os_name, role=args.role) + united_agents.login( + args.api_key, api_key=args.api_key, device_id=args.device_id, + os_name=args.os_name, role=args.role, runner_cmd=args.runner_cmd) else: - FedMLLaunchSlaveAgent.logout() + united_agents.logout() diff --git a/python/fedml/computing/scheduler/slave/slave_protocol_manager.py b/python/fedml/computing/scheduler/slave/slave_protocol_manager.py index a1067a0d96..449cd7c29c 100755 --- a/python/fedml/computing/scheduler/slave/slave_protocol_manager.py +++ 
b/python/fedml/computing/scheduler/slave/slave_protocol_manager.py @@ -1,18 +1,15 @@ -import copy -import json + import os -import fedml from ..comm_utils.job_cleanup import JobCleanup from .base_slave_protocol_manager import FedMLBaseSlaveProtocolManager from .launch_job_runner_manager import FedMLLaunchJobRunnerManager -from ..model_scheduler.model_device_server import FedMLModelDeviceServerRunner -from ..model_scheduler.model_device_client import FedMLModelDeviceClientRunner class FedMLLaunchSlaveProtocolManager(FedMLBaseSlaveProtocolManager): def __init__(self, args, agent_config=None): FedMLBaseSlaveProtocolManager.__init__(self, args, agent_config=agent_config) + self.message_center_name = "launch_slave_agent" # Override def generate_topics(self): @@ -34,7 +31,8 @@ def _get_job_runner_manager(self): def _process_connection_ready(self): from fedml.core.mlops import sync_deploy_id sync_deploy_id( - self.edge_id, self.model_device_server.edge_id, self.model_device_client_edge_id_list) + self.edge_id, self.model_device_server_id, self.model_device_client_edge_id_list, + message_center=self.message_center) # Override def _process_connection_lost(self): @@ -47,59 +45,19 @@ def _init_extra_items(self): # Sync the data when startup JobCleanup.get_instance().sync_data_on_startup(self.args.edge_id) - # Get the environment variables - infer_host = os.getenv("FEDML_INFER_HOST", None) - infer_redis_addr = os.getenv("FEDML_INFER_REDIS_ADDR", None) - infer_redis_port = os.getenv("FEDML_INFER_REDIS_PORT", None) - infer_redis_password = os.getenv("FEDML_INFER_REDIS_PASSWORD", None) - model_client_num = os.getenv("FEDML_MODEL_WORKER_NUM", None) - - # Start deploy master agent and slave agent - in_args = copy.deepcopy(self.args) - if self.model_device_client_edge_id_list is None: - self.model_device_client_edge_id_list = list() - if self.model_device_client_list is None: - model_client_num = 1 if model_client_num is None else int(model_client_num) - 
self.model_device_client_list = list() - for client_index in range(model_client_num): - model_device_client = FedMLModelDeviceClientRunner( - in_args, f"{in_args.current_device_id}_{client_index + 1}", in_args.os_name, - in_args.is_from_docker, self.agent_config) - if infer_host is not None: - model_device_client.infer_host = infer_host - if infer_redis_addr is not None: - model_device_client.redis_addr = infer_redis_addr - if infer_redis_port is not None: - model_device_client.redis_port = infer_redis_port - if infer_redis_password is not None: - model_device_client.redis_password = infer_redis_password - model_device_client.start() - self.model_device_client_list.append(model_device_client) - self.model_device_client_edge_id_list.append(model_device_client.get_edge_id()) - - self.args = copy.deepcopy(in_args) - if self.model_device_server is None: - self.model_device_server = FedMLModelDeviceServerRunner(in_args, in_args.current_device_id, - in_args.os_name, in_args.is_from_docker, - self.agent_config) - if infer_host is not None: - self.model_device_server.infer_host = infer_host - if infer_redis_addr is not None: - self.model_device_server.redis_addr = infer_redis_addr - if infer_redis_port is not None: - self.model_device_server.redis_port = infer_redis_port - if infer_redis_password is not None: - self.model_device_server.redis_password = infer_redis_password - - self.model_device_server.start() - self.model_device_server_id = self.model_device_server.get_edge_id() + # Start the monitor process + self.mlops_metrics.stop_device_realtime_perf() + self.mlops_metrics.report_device_realtime_perf(self.args, self.args.agent_config["mqtt_config"]) + + def save_deploy_ids(self, deploy_master_edge_id=None, deploy_slave_edge_id=None): + if deploy_master_edge_id is not None: + self.model_device_server_id = deploy_master_edge_id + + if deploy_slave_edge_id is not None: + if self.model_device_client_edge_id_list is None: + self.model_device_client_edge_id_list = list() + 
self.model_device_client_edge_id_list.append(deploy_slave_edge_id) # Save the deployed master and worker id list to the environment variable. os.environ["FEDML_DEPLOY_MASTER_ID"] = str(self.model_device_server_id) os.environ["FEDML_DEPLOY_WORKER_IDS"] = str(self.model_device_client_edge_id_list) - - # Start the monitor process - self.args = copy.deepcopy(in_args) - self.mlops_metrics.stop_device_realtime_perf() - self.mlops_metrics.report_device_realtime_perf(self.args, self.args.agent_config["mqtt_config"]) - pass \ No newline at end of file diff --git a/python/fedml/computing/scheduler/slave/united_agents.py b/python/fedml/computing/scheduler/slave/united_agents.py new file mode 100755 index 0000000000..3c8549c06a --- /dev/null +++ b/python/fedml/computing/scheduler/slave/united_agents.py @@ -0,0 +1,75 @@ +from fedml.computing.scheduler.model_scheduler.master_agent import FedMLDeployMasterAgent +from fedml.computing.scheduler.model_scheduler.worker_agent import FedMLDeployWorkerAgent +from fedml.computing.scheduler.scheduler_core.account_manager import FedMLAccountManager +from fedml.computing.scheduler.slave.slave_agent import FedMLLaunchSlaveAgent +from fedml.computing.scheduler.master.master_agent import FedMLLaunchMasterAgent +from fedml.core.common.singleton import Singleton + + +class FedMLUnitedAgent(Singleton): + + @staticmethod + def get_instance(): + return FedMLUnitedAgent() + + def logout(self): + FedMLLaunchSlaveAgent.logout() + + def login(self, userid, api_key=None, device_id=None, + os_name=None, need_to_check_gpu=False, role=None, runner_cmd=None): + # Create the launch master/slave and deploy master/slave agents. 
+ launch_slave_agent = FedMLLaunchSlaveAgent() + launch_master_agent = FedMLLaunchMasterAgent() + deploy_slave_agent = FedMLDeployWorkerAgent() + deploy_master_agent = FedMLDeployMasterAgent() + + # Login with the launch slave role + login_result = launch_slave_agent.login( + api_key, api_key=api_key, device_id=device_id, + os_name=os_name, role=role + ) + + # Get the communication manager, sender message queue + shared_communication_mgr = launch_slave_agent.get_protocol_manager().get_protocol_communication_manager() + shared_slave_sender_message_queue = launch_slave_agent.get_protocol_manager().get_protocol_sender_message_queue() + shared_slave_sender_message_event = launch_slave_agent.get_protocol_manager().get_protocol_sender_message_event() + + # Login with the launch master role based on + # the shared communication manager, sender message center + launch_master_agent.login( + api_key, api_key=api_key, device_id=login_result.device_id, + os_name=os_name, runner_cmd=runner_cmd, + role=FedMLAccountManager.ROLE_GPU_MASTER_SERVER, + communication_manager=shared_communication_mgr, + sender_message_queue=None + ) + + # Get the status center queue + shared_slave_status_center_queue = launch_slave_agent.get_protocol_manager().get_protocol_status_center_queue() + shared_master_status_center_queue = launch_master_agent.get_protocol_manager().get_protocol_status_center_queue() + shared_master_sender_message_queue = launch_master_agent.get_protocol_manager().get_protocol_sender_message_queue() + shared_master_sender_message_event = launch_master_agent.get_protocol_manager().get_protocol_sender_message_event() + + # Login with the deployment master role based on + # the shared communication manager, sender message center, status center + deploy_master_login_result = deploy_master_agent.login( + userid, api_key=api_key, device_id=login_result.device_id, + os_name=os_name, role=FedMLAccountManager.ROLE_DEPLOY_MASTER_ON_PREM, + communication_manager=shared_communication_mgr + 
) + + # Login with the deployment slave role based on + # the shared communication manager, sender message center, status center + deploy_slave_login_result = deploy_slave_agent.login( + userid, api_key=api_key, device_id=login_result.device_id, + os_name=os_name, role=FedMLAccountManager.ROLE_DEPLOY_WORKER_ON_PREM, + communication_manager=shared_communication_mgr + ) + + # Set the deployment ids to launch agent so that we can report the related device info to MLOps. + launch_slave_agent.save_deploy_ids( + deploy_master_edge_id=deploy_master_login_result.edge_id, + deploy_slave_edge_id=deploy_slave_login_result.edge_id) + + # Start the slave agent to connect to servers and loop forever. + launch_slave_agent.start() diff --git a/python/fedml/core/mlops/__init__.py b/python/fedml/core/mlops/__init__.py index 148427fe1f..121c8e26bb 100644 --- a/python/fedml/core/mlops/__init__.py +++ b/python/fedml/core/mlops/__init__.py @@ -1453,12 +1453,14 @@ def release_resources(run_id, device_id): MLOpsConstants.MSG_TOPIC_LAUNCH_RELEASE_GPU_IDS, json.dumps(payload)) -def sync_deploy_id(device_id, master_deploy_id, worker_deploy_id_list): - fedml_args = get_fedml_args() - - setup_log_mqtt_mgr() - +def sync_deploy_id(device_id, master_deploy_id, worker_deploy_id_list, message_center=None): payload = {"device_id": device_id, "master_deploy_id": master_deploy_id, "worker_deploy_ids": worker_deploy_id_list} - MLOpsStore.mlops_log_mqtt_mgr.send_message_json( - MLOpsConstants.MSG_TOPIC_LAUNCH_SYNC_DEPLOY_IDS, json.dumps(payload)) + if message_center is None: + fedml_args = get_fedml_args() + setup_log_mqtt_mgr() + MLOpsStore.mlops_log_mqtt_mgr.send_message_json( + MLOpsConstants.MSG_TOPIC_LAUNCH_SYNC_DEPLOY_IDS, json.dumps(payload)) + else: + message_center.send_message( MLOpsConstants.MSG_TOPIC_LAUNCH_SYNC_DEPLOY_IDS, json.dumps(payload)) + diff --git a/python/fedml/core/mlops/mlops_device_perfs.py b/python/fedml/core/mlops/mlops_device_perfs.py index 4bb41df73f..61da372d97 100644 --- 
a/python/fedml/core/mlops/mlops_device_perfs.py +++ b/python/fedml/core/mlops/mlops_device_perfs.py @@ -1,6 +1,7 @@ import json import logging import os +import platform import time import traceback import uuid @@ -8,12 +9,15 @@ import multiprocessing import psutil +import setproctitle +import fedml from fedml.computing.scheduler.comm_utils import sys_utils from .device_info_report_protocol import FedMLDeviceInfoReportProtocol from .mlops_utils import MLOpsUtils from .system_stats import SysStats from ...computing.scheduler.comm_utils.job_monitor import JobMonitor +from ...computing.scheduler.scheduler_core.general_constants import GeneralConstants from ...core.distributed.communication.mqtt.mqtt_manager import MqttManager @@ -28,6 +32,17 @@ ROLE_ENDPOINT_REPLICA_NUM = 8 ROLE_ENDPOINT_REPLICA_PERF = 9 +ROLE_DEVICE_JOB_TOTAL_MONITOR_STR = "device_job_total" +ROLE_DEVICE_INFO_REPORTER_STR = "device_info" +ROLE_ENDPOINT_MASTER_STR = "endpoint_master" +ROLE_ENDPOINT_SLAVE_STR = "endpoint_slave" +ROLE_RUN_MASTER_STR = "run_master" +ROLE_RUN_SLAVE_STR = "run_slave" +ROLE_ENDPOINT_LOGS_STR = "endpoint_logs" +ROLE_AUTO_SCALER_STR = "autoscaler" +ROLE_ENDPOINT_REPLICA_NUM_STR = "endpoint_replica_num" +ROLE_ENDPOINT_REPLICA_PERF_STR = "endpoint_replica_perf" + class MLOpsDevicePerfStats(object): def __init__(self): @@ -76,58 +91,161 @@ def setup_realtime_stats_process(self, sys_args): self.device_realtime_stats_event.clear() perf_stats.device_realtime_stats_event = self.device_realtime_stats_event - self.device_realtime_stats_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_DEVICE_INFO_REPORTER, self.is_client)) + if platform.system() == "Windows": + self.device_realtime_stats_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_DEVICE_INFO_REPORTER, self.is_client, + 
GeneralConstants.get_monitor_process_name( + ROLE_DEVICE_INFO_REPORTER_STR, perf_stats.run_id, perf_stats.edge_id))) + else: + self.device_realtime_stats_process = fedml.get_process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_DEVICE_INFO_REPORTER, self.is_client, + GeneralConstants.get_monitor_process_name( + ROLE_DEVICE_INFO_REPORTER_STR, perf_stats.run_id, perf_stats.edge_id))) self.device_realtime_stats_process.start() if self.enable_job_total_monitor: - self.job_total_monitor_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_DEVICE_JOB_TOTAL_MONITOR, self.is_client)) + if platform.system() == "Windows": + self.job_total_monitor_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_DEVICE_JOB_TOTAL_MONITOR, self.is_client, + GeneralConstants.get_monitor_process_name( + ROLE_DEVICE_JOB_TOTAL_MONITOR_STR, perf_stats.run_id, perf_stats.edge_id))) + else: + self.job_total_monitor_process = fedml.get_process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_DEVICE_JOB_TOTAL_MONITOR, self.is_client, + GeneralConstants.get_monitor_process_name( + ROLE_DEVICE_JOB_TOTAL_MONITOR_STR, perf_stats.run_id, perf_stats.edge_id))) self.job_total_monitor_process.start() else: if self.is_client: - self.monitor_endpoint_master_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_MASTER)) + # Register endpoint master process + if platform.system() == "Windows": + self.monitor_endpoint_master_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_MASTER, True, + GeneralConstants.get_monitor_process_name( + 
ROLE_ENDPOINT_MASTER_STR, perf_stats.run_id, perf_stats.edge_id))) + else: + self.monitor_endpoint_master_process = fedml.get_process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_MASTER, True, + GeneralConstants.get_monitor_process_name( + ROLE_ENDPOINT_MASTER_STR, perf_stats.run_id, perf_stats.edge_id))) self.monitor_endpoint_master_process.start() - self.monitor_run_slave_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_RUN_SLAVE)) + # Register endpoint slave process + if platform.system() == "Windows": + self.monitor_endpoint_slave_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_SLAVE, True, + GeneralConstants.get_monitor_process_name( + ROLE_ENDPOINT_SLAVE_STR, perf_stats.run_id, perf_stats.edge_id))) + else: + self.monitor_endpoint_slave_process = fedml.get_process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_SLAVE, True, + GeneralConstants.get_monitor_process_name( + ROLE_ENDPOINT_SLAVE_STR, perf_stats.run_id, perf_stats.edge_id))) + self.monitor_endpoint_slave_process.start() + + # Register run slave process + if platform.system() == "Windows": + self.monitor_run_slave_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_RUN_SLAVE, True, + GeneralConstants.get_monitor_process_name( + ROLE_RUN_SLAVE_STR, perf_stats.run_id, perf_stats.edge_id))) + else: + self.monitor_run_slave_process = fedml.get_process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_RUN_SLAVE, True, + GeneralConstants.get_monitor_process_name( + ROLE_RUN_SLAVE_STR, perf_stats.run_id, perf_stats.edge_id))) 
self.monitor_run_slave_process.start() - self.monitor_endpoint_logs_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_LOGS)) + # Register endpoint logs process + if platform.system() == "Windows": + self.monitor_endpoint_logs_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_LOGS, True, + GeneralConstants.get_monitor_process_name( + ROLE_ENDPOINT_LOGS_STR, perf_stats.run_id, perf_stats.edge_id))) + else: + self.monitor_endpoint_logs_process = fedml.get_process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_LOGS, True, + GeneralConstants.get_monitor_process_name( + ROLE_ENDPOINT_LOGS_STR, perf_stats.run_id, perf_stats.edge_id))) self.monitor_endpoint_logs_process.start() # Register auto-scaler process - self.monitor_auto_scaler_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_AUTO_SCALER)) + if platform.system() == "Windows": + self.monitor_auto_scaler_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_AUTO_SCALER, True, + GeneralConstants.get_monitor_process_name( + ROLE_AUTO_SCALER_STR, perf_stats.run_id, perf_stats.edge_id))) + else: + self.monitor_auto_scaler_process = fedml.get_process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_AUTO_SCALER, True, + GeneralConstants.get_monitor_process_name( + ROLE_AUTO_SCALER_STR, perf_stats.run_id, perf_stats.edge_id))) self.monitor_auto_scaler_process.start() # Register replica number report channel - self.monitor_replica_num_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - 
args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_NUM)) + if platform.system() == "Windows": + self.monitor_replica_num_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_NUM, True, + GeneralConstants.get_monitor_process_name( + ROLE_ENDPOINT_REPLICA_NUM_STR, perf_stats.run_id, perf_stats.edge_id))) + else: + self.monitor_replica_num_process = fedml.get_process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_NUM, True, + GeneralConstants.get_monitor_process_name( + ROLE_ENDPOINT_REPLICA_NUM_STR, perf_stats.run_id, perf_stats.edge_id))) self.monitor_replica_num_process.start() # Register replica performance report channel - self.monitor_replica_perf_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_PERF)) + if platform.system() == "Windows": + self.monitor_replica_perf_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_PERF, True, + GeneralConstants.get_monitor_process_name( + ROLE_ENDPOINT_REPLICA_PERF_STR, perf_stats.run_id, perf_stats.edge_id))) + + else: + self.monitor_replica_perf_process = fedml.get_process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_PERF, True, + GeneralConstants.get_monitor_process_name( + ROLE_ENDPOINT_REPLICA_PERF_STR, perf_stats.run_id, perf_stats.edge_id))) self.monitor_replica_perf_process.start() else: - self.monitor_run_master_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_RUN_MASTER)) + if platform.system() == "Windows": + self.monitor_run_master_process = 
multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_RUN_MASTER, False, + GeneralConstants.get_monitor_process_name( + ROLE_RUN_MASTER_STR, perf_stats.run_id, perf_stats.edge_id))) + else: + self.monitor_run_master_process = fedml.get_process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_RUN_MASTER, False, + GeneralConstants.get_monitor_process_name( + ROLE_RUN_MASTER_STR, perf_stats.run_id, perf_stats.edge_id))) self.monitor_run_master_process.start() - def report_device_realtime_stats_entry(self, sys_event, role, is_client=False): - # print(f"Report device realtime stats, process id {os.getpid()}") + def report_device_realtime_stats_entry(self, sys_event, role, is_client=False, process_name=None): + if process_name is not None: + setproctitle.setproctitle(process_name) + + # print(f"Report device realtime stats, process id {os.getpid()}, name {process_name}") self.device_realtime_stats_event = sys_event mqtt_mgr = MqttManager( diff --git a/python/fedml/core/mlops/mlops_job_perfs.py b/python/fedml/core/mlops/mlops_job_perfs.py index fe3d921558..429e32ff1d 100644 --- a/python/fedml/core/mlops/mlops_job_perfs.py +++ b/python/fedml/core/mlops/mlops_job_perfs.py @@ -1,19 +1,25 @@ import json import logging import os +import platform import time import traceback import uuid import multiprocess as multiprocessing import psutil +import setproctitle +import fedml from .mlops_utils import MLOpsUtils from .system_stats import SysStats +from ...computing.scheduler.scheduler_core.general_constants import GeneralConstants from ...core.distributed.communication.mqtt.mqtt_manager import MqttManager class MLOpsJobPerfStats(object): + JOB_PERF_PROCESS_TAG = "job_perf" + def __init__(self): self.job_stats_process = None self.job_stats_event = None @@ -138,16 +144,26 @@ def setup_job_stats_process(self, sys_args): self.job_stats_event.clear() 
perf_stats.job_stats_event = self.job_stats_event perf_stats.job_process_id_map = self.job_process_id_map - - self.job_stats_process = multiprocessing.Process(target=perf_stats.report_job_stats_entry, - args=(self.job_stats_event,)) + if platform.system() == "Windows": + self.job_stats_process = multiprocessing.Process( + target=perf_stats.report_job_stats_entry, + args=(self.job_stats_event, GeneralConstants.get_monitor_process_name( + MLOpsJobPerfStats.JOB_PERF_PROCESS_TAG, perf_stats.run_id, perf_stats.edge_id))) + else: + self.job_stats_process = fedml.get_process( + target=perf_stats.report_job_stats_entry, + args=(self.job_stats_event, GeneralConstants.get_monitor_process_name( + MLOpsJobPerfStats.JOB_PERF_PROCESS_TAG, perf_stats.run_id, perf_stats.edge_id))) self.job_stats_process.start() def report_job_stats(self, sys_args): self.setup_job_stats_process(sys_args) - def report_job_stats_entry(self, sys_event): - # print(f"Report job realtime stats, process id {os.getpid()}") + def report_job_stats_entry(self, sys_event, process_name): + if process_name is not None: + setproctitle.setproctitle(process_name) + + # print(f"Report job realtime stats, process id {os.getpid()}, name {process_name}") self.job_stats_event = sys_event mqtt_mgr = MqttManager( diff --git a/python/fedml/core/mlops/mlops_runtime_log_daemon.py b/python/fedml/core/mlops/mlops_runtime_log_daemon.py index ff06dc91b3..bf136a36c9 100644 --- a/python/fedml/core/mlops/mlops_runtime_log_daemon.py +++ b/python/fedml/core/mlops/mlops_runtime_log_daemon.py @@ -1,16 +1,19 @@ import argparse import logging import os +import platform import shutil import threading import time import multiprocess as multiprocessing import requests +import setproctitle import yaml import fedml from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils +from fedml.computing.scheduler.scheduler_core.general_constants import GeneralConstants from fedml.core.mlops.mlops_utils import MLOpsLoggingUtils 
from ...core.mlops.mlops_configs import MLOpsConfigs @@ -255,8 +258,11 @@ def should_ignore_log_line(log_line): return False - def log_process(self, process_event): - logging.info(f"Log uploading process id {os.getpid()}, run id {self.run_id}, edge id {self.device_id}") + def log_process(self, process_event, process_name=None): + if process_name is not None: + setproctitle.setproctitle(process_name) + + logging.info(f"Log uploading process id {os.getpid()}, run id {self.run_id}, name {process_name}, edge id {self.device_id}") self.log_process_event = process_event only_push_artifact = False @@ -418,6 +424,8 @@ def set_log_source(self, source): self.log_source = source def start_log_processor(self, log_run_id, log_device_id, log_source=None, log_file_prefix=None): + if log_run_id == "-1" or int(log_run_id) <= 0: + return log_processor = MLOpsRuntimeLogProcessor(self.args.using_mlops, log_run_id, log_device_id, self.log_file_dir, self.log_server_url, @@ -431,8 +439,13 @@ def start_log_processor(self, log_run_id, log_device_id, log_source=None, log_fi self.log_process_event_map[event_map_id] = multiprocessing.Event() self.log_process_event_map[event_map_id].clear() log_processor.log_process_event = self.log_process_event_map[event_map_id] - log_child_process = multiprocessing.Process(target=log_processor.log_process, - args=(self.log_process_event_map[event_map_id],)) + process_name = GeneralConstants.get_log_process_name(log_run_id, log_device_id) + if platform.system() == "Windows": + log_child_process = multiprocessing.Process( + target=log_processor.log_process, args=(self.log_process_event_map[event_map_id], process_name)) + else: + log_child_process = fedml.get_process( + target=log_processor.log_process, args=(self.log_process_event_map[event_map_id], process_name)) # process = threading.Thread(target=log_processor.log_process) # process.start() if log_child_process is not None: diff --git a/python/setup.py b/python/setup.py index 4757c10a17..262fc060c4 100644 
--- a/python/setup.py +++ b/python/setup.py @@ -20,7 +20,7 @@ def finalize_options(self): requirements = [ 'GPUtil', - 'PyYAML', + 'PyYAML==5.3.1', 'aiohttp>=3.8.1', 'attrdict', 'attrs', @@ -69,7 +69,8 @@ def finalize_options(self): 'python-dotenv', 'protobuf>=3.20.2,<4.0dev', 'typer<0.10.0,>=0.3.0', - 'fastapi-cli==0.0.1' + 'fastapi-cli==0.0.1', + 'setproctitle' ] requirements_extra_mpi = [ @@ -126,7 +127,7 @@ def finalize_options(self): setup( name="fedml", - version="0.9.0", + version="0.8.51b1", author="FedML Team", author_email="ch@fedml.ai", description="A research and production integrated edge-cloud library for " diff --git a/python/tests/cross-silo/run_cross_silo.sh b/python/tests/cross-silo/run_cross_silo.sh index 2ccdbff15b..0beaaffc52 100644 --- a/python/tests/cross-silo/run_cross_silo.sh +++ b/python/tests/cross-silo/run_cross_silo.sh @@ -1,10 +1,10 @@ #!/bin/bash set -e WORKSPACE=$(pwd) -PROJECT_HOME=$WORKSPACE/../../ -cd $PROJECT_HOME +# PROJECT_HOME=$WORKSPACE/../../ +# cd $PROJECT_HOME -cd examples/cross_silo/mqtt_s3_fedavg_mnist_lr_example/custom_data_and_model +cd examples/federate/cross_silo/mqtt_s3_fedavg_mnist_lr_example/custom_data_and_model # run client(s) RUN_ID="$(python -c "import uuid; print(uuid.uuid4().hex)")" diff --git a/python/tests/smoke_test/cli/build.sh b/python/tests/smoke_test/cli/build.sh index 98fdb05244..de956692f1 100644 --- a/python/tests/smoke_test/cli/build.sh +++ b/python/tests/smoke_test/cli/build.sh @@ -16,7 +16,7 @@ # --help Show this message and exit. 
# build client package -cd ../../../examples/cross_silo/mqtt_s3_fedavg_mnist_lr_example/one_line +cd ../../../examples/federate/cross_silo/mqtt_s3_fedavg_mnist_lr_example/one_line echo "$PWD" SOURCE=client @@ -30,4 +30,4 @@ SOURCE=server ENTRY=torch_server.py CONFIG=config DEST=./mlops -fedml build -t server -sf $SOURCE -ep $ENTRY -cf $CONFIG -df $DEST \ No newline at end of file +fedml build -t server -sf $SOURCE -ep $ENTRY -cf $CONFIG -df $DEST diff --git a/python/tests/test_deploy/test_deploy.py b/python/tests/test_deploy/test_deploy.py new file mode 100644 index 0000000000..d7243c68de --- /dev/null +++ b/python/tests/test_deploy/test_deploy.py @@ -0,0 +1,39 @@ +import os.path +import time +import fedml +# Login +API_KEY = os.getenv("API_KEY") +fedml.set_env_version("test") +fedml.set_local_on_premise_platform_port(18080) +error_code, error_msg = fedml.api.fedml_login(api_key=API_KEY) +if error_code != 0: + raise Exception("API Key is invalid!") + +# Yaml file +cur_dir = os.path.dirname(__file__) +fedml_dir = os.path.dirname(cur_dir) +python_dir = os.path.dirname(fedml_dir) +yaml_file = os.path.join(python_dir, "examples", "launch", "serve_job_mnist.yaml") + +# Launch job +launch_result_dict = {} +launch_result_status = {} + +launch_result = fedml.api.launch_job(yaml_file) +print("Endpoint id is", launch_result.inner_id) + +cnt = 0 +while 1: + try: + r = fedml.api.get_endpoint(endpoint_id=launch_result.inner_id) + except Exception as e: + raise Exception(f"FAILED to get endpoint:{launch_result.inner_id}. 
{e}") + if r.status == "DEPLOYED": + print("Deployment has been successfully!") + break + elif r.status == "FAILED": + raise Exception("FAILED to deploy.") + time.sleep(1) + cnt += 1 + if cnt %3 ==0: + print('Deployment status is', r.status) \ No newline at end of file diff --git a/python/tests/test_federate/test_federate.sh b/python/tests/test_federate/test_federate.sh new file mode 100644 index 0000000000..ebfcb60330 --- /dev/null +++ b/python/tests/test_federate/test_federate.sh @@ -0,0 +1,26 @@ + +WORKSPACE=`pwd` +echo $WORKSPACE +cd $WORKSPACE/examples/federate/quick_start/parrot +python torch_fedavg_mnist_lr_one_line_example.py --cf fedml_config.yaml +python torch_fedavg_mnist_lr_custum_data_and_model_example.py --cf fedml_config.yaml + +cd $WORKSPACE/examples/federate/simulation/sp_decentralized_mnist_lr_example +python torch_fedavg_mnist_lr_step_by_step_example.py --cf fedml_config.yaml + +cd $WORKSPACE/examples/federate/simulation/sp_fednova_mnist_lr_example +python torch_fednova_mnist_lr_step_by_step_example.py --cf fedml_config.yaml + +cd $WORKSPACE/examples/federate/simulation/sp_fedopt_mnist_lr_example +python torch_fedopt_mnist_lr_step_by_step_example.py --cf fedml_config.yaml + +cd $WORKSPACE/examples/federate/simulation/sp_hierarchicalfl_mnist_lr_example +python torch_hierarchicalfl_mnist_lr_step_by_step_example.py --cf fedml_config.yaml + + +cd $WORKSPACE/examples/federate/simulation/sp_turboaggregate_mnist_lr_example +python torch_turboaggregate_mnist_lr_step_by_step_example.py --cf fedml_config.yaml + + +cd $WORKSPACE/examples/federate/simulation/sp_vertical_mnist_lr_example +python torch_vertical_mnist_lr_step_by_step_example.py --cf fedml_config.yaml diff --git a/python/tests/test_launch/test_launch.py b/python/tests/test_launch/test_launch.py new file mode 100644 index 0000000000..a6b6ffb9cf --- /dev/null +++ b/python/tests/test_launch/test_launch.py @@ -0,0 +1,50 @@ +import os.path +import time +import fedml +from fedml.api.constants import 
RunStatus + +API_KEY = os.getenv("API_KEY") +# Login +fedml.set_env_version("test") +fedml.set_local_on_premise_platform_port(18080) +error_code, error_msg = fedml.api.fedml_login(api_key=API_KEY) +if error_code != 0: + raise Exception("API Key is invalid!") + +# Yaml file +cur_dir = os.path.dirname(__file__) +fedml_dir = os.path.dirname(cur_dir) +python_dir = os.path.dirname(fedml_dir) +yaml_file = os.path.join(python_dir, "examples", "launch", "hello_job.yaml") + +# Launch job + +launch_result = fedml.api.launch_job(yaml_file) + +# launch_result = fedml.api.launch_job_on_cluster(yaml_file, "alex-cluster") +if launch_result.result_code != 0: + raise Exception(f"Failed to launch job. Reason: {launch_result.result_message}") + +# check job status +while 1: + time.sleep(1) + # if + # if launch_result_status[run_id] == RunStatus.FINISHED: + # continue + log_result = fedml.api.run_logs(launch_result.run_id, 1, 5) + if log_result is None or log_result.run_status is None: + raise Exception(f"Failed to get job status.") + + print(f"run_id: {launch_result.run_id} run_status: {log_result.run_status}") + + if log_result.run_status in [RunStatus.ERROR, RunStatus.FAILED]: + log_result = fedml.api.run_logs(launch_result.run_id, 1, 100) + if log_result is None or log_result.run_status is None: + raise Exception(f"run_id:{launch_result.run_id} run_status:{log_result.run_status} and failed to get run logs.") + + raise Exception(f"run_id:{launch_result.run_id} run_status:{log_result.run_status} run logs: {log_result.log_line_list}") + if log_result.run_status == RunStatus.FINISHED: + print(f"Job finished successfully.") + break + + diff --git a/python/tests/test_train/test_train.py b/python/tests/test_train/test_train.py new file mode 100644 index 0000000000..039d3b81d2 --- /dev/null +++ b/python/tests/test_train/test_train.py @@ -0,0 +1,49 @@ +import os.path +import time +import fedml +from fedml.api.constants import RunStatus + +API_KEY = os.getenv("API_KEY") +# Login 
+fedml.set_env_version("test") +fedml.set_local_on_premise_platform_port(18080) +error_code, error_msg = fedml.api.fedml_login(api_key=API_KEY) +if error_code != 0: + raise Exception("API Key is invalid!") + +# Yaml file +cur_dir = os.path.dirname(__file__) +fedml_dir = os.path.dirname(cur_dir) +python_dir = os.path.dirname(fedml_dir) +yaml_file = os.path.join(python_dir, "examples", "train", "mnist_train", "train.yaml") + +# Launch job + +launch_result = fedml.api.launch_job(yaml_file) + +# launch_result = fedml.api.launch_job_on_cluster(yaml_file, "alex-cluster") +if launch_result.result_code != 0: + raise Exception(f"Failed to launch job. Reason: {launch_result.result_message}") + +# check job status +while 1: + time.sleep(1) + # if + # if launch_result_status[run_id] == RunStatus.FINISHED: + # continue + log_result = fedml.api.run_logs(launch_result.run_id, 1, 5) + if log_result is None or log_result.run_status is None: + raise Exception(f"Failed to get job status.") + + print(f"run_id: {launch_result.run_id} run_status: {log_result.run_status}") + + if log_result.run_status in [RunStatus.ERROR, RunStatus.FAILED]: + log_result = fedml.api.run_logs(launch_result.run_id, 1, 100) + if log_result is None or log_result.run_status is None: + raise Exception(f"run_id:{launch_result.run_id} run_status:{log_result.run_status} and failed to get run logs.") + + raise Exception(f"run_id:{launch_result.run_id} run_status:{log_result.run_status} run logs: {log_result.log_line_list}") + if log_result.run_status == RunStatus.FINISHED: + print(f"Job finished successfully.") + break +