From f030996ca90bbcfac1cef6c71da179ed9a9332a6 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Fri, 3 May 2024 05:01:56 +0000 Subject: [PATCH 01/52] base workflow --- .../score_external_tuning_submission.yml | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/score_external_tuning_submission.yml diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml new file mode 100644 index 000000000..c3e843a3d --- /dev/null +++ b/.github/workflows/score_external_tuning_submission.yml @@ -0,0 +1,40 @@ +name: Containerized Regression Tests + +on: + pull_request: + branches: + - 'scoring' +env: + SUBMISSION_PATH: /home/kasimbeg/algorithmic-efficiency/some_path + TUNING_SEARCH_SPACE_FLAG: /home/kasimbeg/algorithmic-efficiency/some_path + EXPERIMENT_NAME: Team_blub/some_submission_path + FRAMEWORK: jax + DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev + HELDOUT_WORKLOADS_PATH: /home/kasimbeg/algorithmic-efficiency/some_path +jobs: + run_workloads: + strategy: + matrix: + study: [0, 1, 2, 3, 4] + trial: [0, 1, 2, 3, 4] + runs-on: self-hosted + steps: + - uses: actions/checkout@v2 + - name: Run containerized workload + run: | + docker pull $DOCKER_IMAGE + python run_workloads.py --framework $FRAMEWORK \ + --local False \ + --experiment_name $EXPERIMENT_NAME \ + --docker_image_url us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev \ + --run_percentage 100 \ + --submission_path $SUBMISSION_PATH \ + --tuning_search_space $TUNING_SEARCH_SPACE \ + --held_out_workloads_config_path $HELDOUT_WORKLOADS_PATH \ + --study_start_index {{ matrix.study }} \ + --study_end_index {{ matrix.study }} \ + --hparam_start_index {{ matrix.trial }} \ + --hparam_end_index $(( {{ matrix.trial }} + 1 )) \ + --workload_metadata_path workload_metadata_external_tuning.json \ + --seed 0 \ + --dry_run True \ No newline at end of file From 5eabed838aa9fd1d511aecba853ec623955882c2 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Fri, 3 May 2024 05:13:37 +0000 Subject: [PATCH 02/52] fix name --- .github/workflows/score_external_tuning_submission.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index c3e843a3d..a847985b4 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -1,4 +1,4 @@ -name: Containerized Regression Tests +name: Run submission on: pull_request: From e694c2edd9ed52a78048e2657c8aa2a78a474712 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Fri, 3 May 2024 05:35:00 +0000 Subject: [PATCH 03/52] fix --- .github/workflows/score_external_tuning_submission.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index a847985b4..d1c0b7cce 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -23,6 +23,7 @@ jobs: - name: Run containerized workload run: | docker pull $DOCKER_IMAGE + export TRIAL_INDEX={{ matrix.trial }} python run_workloads.py --framework $FRAMEWORK \ --local False \ --experiment_name $EXPERIMENT_NAME \ @@ -34,7 +35,7 @@ jobs: --study_start_index {{ matrix.study }} \ --study_end_index {{ matrix.study }} \ --hparam_start_index {{ matrix.trial }} \ - --hparam_end_index $(( {{ matrix.trial }} + 1 )) \ + --hparam_end_index $(( TRIAL_INDEX + 1 )) \ --workload_metadata_path workload_metadata_external_tuning.json \ --seed 0 \ --dry_run True \ No newline at end of file From 3cd70c2450f57641022ae9c51d2bdc187638390b Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Fri, 3 May 2024 05:40:52 +0000 Subject: [PATCH 04/52] fix --- .../workflows/score_external_tuning_submission.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index d1c0b7cce..0d108068e 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -23,7 +23,8 @@ jobs: - name: Run containerized workload run: | docker pull $DOCKER_IMAGE - export TRIAL_INDEX={{ matrix.trial }} + export TRIAL_INDEX=${{ matrix.trial }} + echo $TRIAL_INDEX python run_workloads.py --framework $FRAMEWORK \ --local False \ --experiment_name $EXPERIMENT_NAME \ @@ -32,10 +33,10 @@ jobs: --submission_path $SUBMISSION_PATH \ --tuning_search_space $TUNING_SEARCH_SPACE \ --held_out_workloads_config_path $HELDOUT_WORKLOADS_PATH \ - --study_start_index {{ matrix.study }} \ - --study_end_index {{ matrix.study }} \ - --hparam_start_index {{ matrix.trial }} \ - --hparam_end_index $(( TRIAL_INDEX + 1 )) \ + --study_start_index ${{ matrix.study }} \ + --study_end_index ${{ matrix.study }} \ + --hparam_start_index ${{ matrix.trial }} \ + --hparam_end_index $(( ${{ matrix.trial }} + 1 )) \ --workload_metadata_path workload_metadata_external_tuning.json \ --seed 0 \ --dry_run True \ No newline at end of file From 0ae2861680f5a50ea52b55a0e195515c01415848 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Fri, 3 May 2024 05:45:21 +0000 Subject: [PATCH 05/52] fix --- .github/workflows/score_external_tuning_submission.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index 0d108068e..20f350e10 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -25,7 +25,7 @@ jobs: docker pull $DOCKER_IMAGE export TRIAL_INDEX=${{ matrix.trial }} echo $TRIAL_INDEX - python run_workloads.py --framework $FRAMEWORK \ + python /home/kasimbeg/algorithmic-efficiency/scoring/run_workloads.py --framework $FRAMEWORK \ --local False \ --experiment_name $EXPERIMENT_NAME \ --docker_image_url us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev \ From a965758b00df3e073fe91b38728763e48b016c2a Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Sat, 4 May 2024 16:38:42 +0000 Subject: [PATCH 06/52] modify docker to inlcude submissions --- docker/Dockerfile | 3 +++ docker/build_docker_images.sh | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 9b72aea86..ab0f5717a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -67,4 +67,7 @@ RUN cd /algorithmic-efficiency && git pull COPY scripts/startup.sh /algorithmic-efficiency/docker/scripts/startup.sh RUN chmod a+x /algorithmic-efficiency/docker/scripts/startup.sh +# Move submissions into algorithmic-efficiency-repo +COPY /home/kasimbeg/algorithmic-efficiency/submissions/submissions_algorithms_v0_5 /algorithmic-efficiency/submissions/ + ENTRYPOINT ["bash", "/algorithmic-efficiency/docker/scripts/startup.sh"] diff --git a/docker/build_docker_images.sh b/docker/build_docker_images.sh index 9e0e68ca9..39bc1a6f0 100644 --- a/docker/build_docker_images.sh +++ b/docker/build_docker_images.sh @@ -20,7 +20,7 @@ fi for FRAMEWORK in "jax" "pytorch" "both" do - IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}" + IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}_scoring" DOCKER_BUILD_COMMAND="docker build --no-cache -t $IMAGE_NAME . --build-arg framework=$FRAMEWORK --build-arg branch=$GIT_BRANCH" DOCKER_TAG_COMMAND="docker tag $IMAGE_NAME us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME" DOCKER_PUSH_COMMAND="docker push us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME" From 5610928f7c429c4bc97741f2768292814597b654 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Sat, 4 May 2024 17:22:15 +0000 Subject: [PATCH 07/52] copy submissions to docker container --- .../score_external_tuning_submission.yml | 15 ++++++++------- docker/Dockerfile | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index 20f350e10..ae5a77faa 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -5,12 +5,12 @@ on: branches: - 'scoring' env: - SUBMISSION_PATH: /home/kasimbeg/algorithmic-efficiency/some_path - TUNING_SEARCH_SPACE_FLAG: /home/kasimbeg/algorithmic-efficiency/some_path - EXPERIMENT_NAME: Team_blub/some_submission_path + SUBMISSION_PATH: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/submission.py + TUNING_SEARCH_SPACE_FLAG: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json + EXPERIMENT_NAME: Team_11/external_tuning/nadamp FRAMEWORK: jax - DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev - HELDOUT_WORKLOADS_PATH: /home/kasimbeg/algorithmic-efficiency/some_path + DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring + HELDOUT_WORKLOADS_PATH: /algorithmic-efficiency/scoring/held_out_workloads.json jobs: run_workloads: strategy: @@ -25,10 +25,11 @@ jobs: docker pull $DOCKER_IMAGE export TRIAL_INDEX=${{ matrix.trial }} echo $TRIAL_INDEX - python /home/kasimbeg/algorithmic-efficiency/scoring/run_workloads.py --framework $FRAMEWORK \ + python /home/kasimbeg/algorithmic-efficiency/scoring/run_workloads.py \ + --framework $FRAMEWORK \ --local False \ --experiment_name $EXPERIMENT_NAME \ - --docker_image_url us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev \ + --docker_image_url $DOCKER_IMAGE \ --run_percentage 100 \ --submission_path $SUBMISSION_PATH \ --tuning_search_space $TUNING_SEARCH_SPACE \ diff --git a/docker/Dockerfile b/docker/Dockerfile index ab0f5717a..0ced3457f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -68,6 +68,6 @@ COPY scripts/startup.sh /algorithmic-efficiency/docker/scripts/startup.sh RUN chmod a+x /algorithmic-efficiency/docker/scripts/startup.sh # Move submissions into algorithmic-efficiency-repo -COPY /home/kasimbeg/algorithmic-efficiency/submissions/submissions_algorithms_v0_5 /algorithmic-efficiency/submissions/ +COPY submissions_algorithms_v0_5 /algorithmic-efficiency/submissions/submissions_algorithms_v0_5 ENTRYPOINT ["bash", "/algorithmic-efficiency/docker/scripts/startup.sh"] From a8abbc41cae171a016872b6421d02df5978cee20 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Sat, 4 May 2024 17:32:08 +0000 Subject: [PATCH 08/52] fix --- .github/workflows/score_external_tuning_submission.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index ae5a77faa..3ed6c7e96 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -7,7 +7,7 @@ on: env: SUBMISSION_PATH: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/submission.py TUNING_SEARCH_SPACE_FLAG: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json - EXPERIMENT_NAME: Team_11/external_tuning/nadamp + EXPERIMENT_NAME: algoperf_scoring_05/Team_11/external_tuning/nadamp FRAMEWORK: jax DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring HELDOUT_WORKLOADS_PATH: /algorithmic-efficiency/scoring/held_out_workloads.json @@ -39,5 +39,4 @@ jobs: --hparam_start_index ${{ matrix.trial }} \ --hparam_end_index $(( ${{ matrix.trial }} + 1 )) \ --workload_metadata_path workload_metadata_external_tuning.json \ - --seed 0 \ - --dry_run True \ No newline at end of file + --seed 0 \ No newline at end of file From 73630b2a0d5120336ef4ea9d3c162b692af9cdfc Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Sat, 4 May 2024 17:33:11 +0000 Subject: [PATCH 09/52] add project name --- .github/workflows/score_external_tuning_submission.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index 3ed6c7e96..728ad7398 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -7,7 +7,7 @@ on: env: SUBMISSION_PATH: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/submission.py TUNING_SEARCH_SPACE_FLAG: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json - EXPERIMENT_NAME: algoperf_scoring_05/Team_11/external_tuning/nadamp + EXPERIMENT_NAME: ch/Team_11/external_tuning/nadamp FRAMEWORK: jax DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring HELDOUT_WORKLOADS_PATH: /algorithmic-efficiency/scoring/held_out_workloads.json From 175f78e7a39f47d92859bc842a71a6f6457b5ab1 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Sat, 4 May 2024 17:33:53 +0000 Subject: [PATCH 10/52] add project name --- .github/workflows/score_external_tuning_submission.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index ae5a77faa..f30cd8c16 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -7,7 +7,7 @@ on: env: SUBMISSION_PATH: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/submission.py TUNING_SEARCH_SPACE_FLAG: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json - EXPERIMENT_NAME: Team_11/external_tuning/nadamp + EXPERIMENT_NAME: algoperf_scoring_05/Team_11/external_tuning/nadamp FRAMEWORK: jax DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring HELDOUT_WORKLOADS_PATH: /algorithmic-efficiency/scoring/held_out_workloads.json From fb599103c3844c01516c0b119a5ed236f075ab53 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 7 May 2024 19:41:19 +0000 Subject: [PATCH 11/52] add dryrun --- .github/workflows/score_external_tuning_submission.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index 728ad7398..c38a5938d 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -39,4 +39,5 @@ jobs: --hparam_start_index ${{ matrix.trial }} \ --hparam_end_index $(( ${{ matrix.trial }} + 1 )) \ --workload_metadata_path workload_metadata_external_tuning.json \ - --seed 0 \ No newline at end of file + --seed 0 \ + --dry_run True \ No newline at end of file From 6757da297c31dd3961e9cb8ef4a155021ffec68b Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 7 May 2024 19:43:24 +0000 Subject: [PATCH 12/52] fix workload metadata path --- .github/workflows/score_external_tuning_submission.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index f30cd8c16..dd10db699 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -7,10 +7,11 @@ on: env: SUBMISSION_PATH: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/submission.py TUNING_SEARCH_SPACE_FLAG: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json - EXPERIMENT_NAME: algoperf_scoring_05/Team_11/external_tuning/nadamp + EXPERIMENT_NAME: ch/Team_11/external_tuning/nadamp FRAMEWORK: jax DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring HELDOUT_WORKLOADS_PATH: /algorithmic-efficiency/scoring/held_out_workloads.json + WORKLOAD_METADATA_PATH: /algorithmic-efficiency/workload_metadata_external_tuning.json jobs: run_workloads: strategy: @@ -38,6 +39,6 @@ jobs: --study_end_index ${{ matrix.study }} \ --hparam_start_index ${{ matrix.trial }} \ --hparam_end_index $(( ${{ matrix.trial }} + 1 )) \ - --workload_metadata_path workload_metadata_external_tuning.json \ + --workload_metadata_path $WORKLOAD_METADATA \ --seed 0 \ --dry_run True \ No newline at end of file From 02966d2a631255fa90fd06bfc95417d94f355b66 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 7 May 2024 20:05:05 +0000 Subject: [PATCH 13/52] remove dryrun --- .github/workflows/score_external_tuning_submission.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index dd10db699..d6207ad49 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -40,5 +40,4 @@ jobs: --hparam_start_index ${{ matrix.trial }} \ --hparam_end_index $(( ${{ matrix.trial }} + 1 )) \ --workload_metadata_path $WORKLOAD_METADATA \ - --seed 0 \ - --dry_run True \ No newline at end of file + --seed 0 \ No newline at end of file From 6e8742984708edb60135cd6e5d059eb58c5ae2e0 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 May 2024 02:08:43 +0000 Subject: [PATCH 14/52] fix variable names and experiment name --- .github/workflows/score_external_tuning_submission.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index d6207ad49..92a9c6c5d 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -6,8 +6,8 @@ on: - 'scoring' env: SUBMISSION_PATH: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/submission.py - TUNING_SEARCH_SPACE_FLAG: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json - EXPERIMENT_NAME: ch/Team_11/external_tuning/nadamp + TUNING_SEARCH_SPACE: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json + EXPERIMENT_NAME: algoperf_scoring_v05/external_tuning/Team_11/nadamp FRAMEWORK: jax DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring HELDOUT_WORKLOADS_PATH: /algorithmic-efficiency/scoring/held_out_workloads.json @@ -39,5 +39,5 @@ jobs: --study_end_index ${{ matrix.study }} \ --hparam_start_index ${{ matrix.trial }} \ --hparam_end_index $(( ${{ matrix.trial }} + 1 )) \ - --workload_metadata_path $WORKLOAD_METADATA \ + --workload_metadata_path $WORKLOAD_METADATA_PATH \ --seed 0 \ No newline at end of file From 4deca9a36e2dcdca13b7f21a8332554377bb59a5 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 May 2024 02:10:04 +0000 Subject: [PATCH 15/52] fix workload metadata path --- .github/workflows/score_external_tuning_submission.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index 92a9c6c5d..689f28465 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -11,7 +11,7 @@ env: FRAMEWORK: jax DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring HELDOUT_WORKLOADS_PATH: /algorithmic-efficiency/scoring/held_out_workloads.json - WORKLOAD_METADATA_PATH: /algorithmic-efficiency/workload_metadata_external_tuning.json + WORKLOAD_METADATA_PATH: /algorithmic-efficiency/scoring/workload_metadata_external_tuning.json jobs: run_workloads: strategy: From befc0c6517610b235c7bd04a87f9e0a00ea9e051 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 May 2024 02:13:59 +0000 Subject: [PATCH 16/52] fix paths --- .github/workflows/score_external_tuning_submission.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index 689f28465..cb8720c9e 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -10,8 +10,8 @@ env: EXPERIMENT_NAME: algoperf_scoring_v05/external_tuning/Team_11/nadamp FRAMEWORK: jax DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring - HELDOUT_WORKLOADS_PATH: /algorithmic-efficiency/scoring/held_out_workloads.json - WORKLOAD_METADATA_PATH: /algorithmic-efficiency/scoring/workload_metadata_external_tuning.json + HELDOUT_WORKLOADS_PATH: /home/kasimbeg/algorithmic-efficiency/scoring/held_out_workloads.json + WORKLOAD_METADATA_PATH: /home/kasimbeg/algorithmic-efficiency/scoring/workload_metadata_external_tuning.json jobs: run_workloads: strategy: From 4fd4d307febaafb9e4bd477653d4c3d206f71fa5 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 May 2024 02:37:52 +0000 Subject: [PATCH 17/52] fix heldout workload path --- .github/workflows/score_external_tuning_submission.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index cb8720c9e..9082814e7 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -10,8 +10,9 @@ env: EXPERIMENT_NAME: algoperf_scoring_v05/external_tuning/Team_11/nadamp FRAMEWORK: jax DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring - HELDOUT_WORKLOADS_PATH: /home/kasimbeg/algorithmic-efficiency/scoring/held_out_workloads.json + HELDOUT_WORKLOADS_PATH: /home/kasimbeg/algorithmic-efficiency/scoring/held_out_workloads_algoperf_v05.json WORKLOAD_METADATA_PATH: /home/kasimbeg/algorithmic-efficiency/scoring/workload_metadata_external_tuning.json + SUBMISSION_ID: 17 jobs: run_workloads: strategy: @@ -40,4 +41,5 @@ jobs: --hparam_start_index ${{ matrix.trial }} \ --hparam_end_index $(( ${{ matrix.trial }} + 1 )) \ --workload_metadata_path $WORKLOAD_METADATA_PATH \ + --submission_id $SUBMISSION_ID --seed 0 \ No newline at end of file From 9933e771cfdcea064199395c6c41b7647bb86027 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 May 2024 07:02:05 +0000 Subject: [PATCH 18/52] fix seed flag --- .github/workflows/score_external_tuning_submission.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index 9082814e7..992e835c8 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -41,5 +41,5 @@ jobs: --hparam_start_index ${{ matrix.trial }} \ --hparam_end_index $(( ${{ matrix.trial }} + 1 )) \ --workload_metadata_path $WORKLOAD_METADATA_PATH \ - --submission_id $SUBMISSION_ID + --submission_id $SUBMISSION_ID \ --seed 0 \ No newline at end of file From 0139919c94bd0c67dbce01e47a20e8fbdc65930d Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 May 2024 07:29:38 +0000 Subject: [PATCH 19/52] remove local --- .../score_external_tuning_submission.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index dd10db699..9642cd20e 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -6,12 +6,13 @@ on: - 'scoring' env: SUBMISSION_PATH: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/submission.py - TUNING_SEARCH_SPACE_FLAG: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json - EXPERIMENT_NAME: ch/Team_11/external_tuning/nadamp + TUNING_SEARCH_SPACE: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json + EXPERIMENT_NAME: algoperf_scoring_v05/external_tuning/Team_11/nadamp FRAMEWORK: jax DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring - HELDOUT_WORKLOADS_PATH: /algorithmic-efficiency/scoring/held_out_workloads.json - WORKLOAD_METADATA_PATH: /algorithmic-efficiency/workload_metadata_external_tuning.json + HELDOUT_WORKLOADS_PATH: /home/kasimbeg/algorithmic-efficiency/scoring/held_out_workloads_algoperf_v05.json + WORKLOAD_METADATA_PATH: /home/kasimbeg/algorithmic-efficiency/scoring/workload_metadata_external_tuning.json + SUBMISSION_ID: 17 jobs: run_workloads: strategy: @@ -28,7 +29,6 @@ jobs: echo $TRIAL_INDEX python /home/kasimbeg/algorithmic-efficiency/scoring/run_workloads.py \ --framework $FRAMEWORK \ - --local False \ --experiment_name $EXPERIMENT_NAME \ --docker_image_url $DOCKER_IMAGE \ --run_percentage 100 \ @@ -39,6 +39,6 @@ jobs: --study_end_index ${{ matrix.study }} \ --hparam_start_index ${{ matrix.trial }} \ --hparam_end_index $(( ${{ matrix.trial }} + 1 )) \ - --workload_metadata_path $WORKLOAD_METADATA \ - --seed 0 \ - --dry_run True \ No newline at end of file + --workload_metadata_path $WORKLOAD_METADATA_PATH \ + --submission_id $SUBMISSION_ID \ + --seed 0 \ No newline at end of file From 5abac0acf844ef468f13a19c7169453a78f83786 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 May 2024 08:00:13 +0000 Subject: [PATCH 20/52] fix submission paths --- .github/workflows/score_external_tuning_submission.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index 9642cd20e..ee8e2adcf 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -5,8 +5,8 @@ on: branches: - 'scoring' env: - SUBMISSION_PATH: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/submission.py - TUNING_SEARCH_SPACE: /algorithmic-efficiency/submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json + SUBMISSION_PATH: submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/submission.py + TUNING_SEARCH_SPACE: submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json EXPERIMENT_NAME: algoperf_scoring_v05/external_tuning/Team_11/nadamp FRAMEWORK: jax DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring From b525c44478ac153447f445fe631902444f56b9f1 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 14 May 2024 03:55:42 +0000 Subject: [PATCH 21/52] fix paths --- scoring/run_workloads.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index f99b81106..d44901a41 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -175,10 +175,10 @@ def main(_): run_fraction) mount_repo_flag = '' if FLAGS.local: - mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency ' - command = ('docker run -t -d -v $HOME/data/:/data/ ' - '-v $HOME/experiment_runs/:/experiment_runs ' - '-v $HOME/experiment_runs/logs:/logs ' + mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency ' + command = ('docker run -t -d -v /home/kasimbeg/data/:/data/ ' + '-v /home/kasimbeg/experiment_runs/:/experiment_runs ' + '-v /home/kasimbeg/experiment_runs/logs:/logs ' f'{mount_repo_flag}' '--gpus all --ipc=host ' f'{docker_image_url} ' From 6c580623e0fe250e268c26509e0c7391cdb7870f Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 14 May 2024 04:20:54 +0000 Subject: [PATCH 22/52] remove heldout workloads from run workloads --- .github/workflows/score_external_tuning_submission.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index ee8e2adcf..fd655746b 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -34,7 +34,7 @@ jobs: --run_percentage 100 \ --submission_path $SUBMISSION_PATH \ --tuning_search_space $TUNING_SEARCH_SPACE \ - --held_out_workloads_config_path $HELDOUT_WORKLOADS_PATH \ + --held_out_workloads_config_path None \ --study_start_index ${{ matrix.study }} \ --study_end_index ${{ matrix.study }} \ --hparam_start_index ${{ matrix.trial }} \ From bb3b30be18a75aa8865e957b525c66725d13d218 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 14 May 2024 20:07:01 +0000 Subject: [PATCH 23/52] increase timeout --- .github/workflows/score_external_tuning_submission.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index fd655746b..603948514 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -20,6 +20,7 @@ jobs: study: [0, 1, 2, 3, 4] trial: [0, 1, 2, 3, 4] runs-on: self-hosted + timeout-minutes: 10080 steps: - uses: actions/checkout@v2 - name: Run containerized workload From 3297023c008c0191036d3ef282d45aff5ae7a123 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 14 May 2024 20:31:55 +0000 Subject: [PATCH 24/52] change command to python3.8 --- .github/workflows/score_external_tuning_submission.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index ee8e2adcf..b6ea54b7d 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -27,7 +27,7 @@ jobs: docker pull $DOCKER_IMAGE export TRIAL_INDEX=${{ matrix.trial }} echo $TRIAL_INDEX - python /home/kasimbeg/algorithmic-efficiency/scoring/run_workloads.py \ + python3.8 /home/kasimbeg/algorithmic-efficiency/scoring/run_workloads.py \ --framework $FRAMEWORK \ --experiment_name $EXPERIMENT_NAME \ --docker_image_url $DOCKER_IMAGE \ From 9174267af64e00baa5f704b1d9f9aaa642bac707 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 14 May 2024 20:52:09 +0000 Subject: [PATCH 25/52] add env --- .github/workflows/score_external_tuning_submission.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index 40f5d064b..abd4cc4a2 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -28,7 +28,8 @@ jobs: docker pull $DOCKER_IMAGE export TRIAL_INDEX=${{ matrix.trial }} echo $TRIAL_INDEX - python3.8 /home/kasimbeg/algorithmic-efficiency/scoring/run_workloads.py \ + source /home/kasimbeg/env/bin/activate + python /home/kasimbeg/algorithmic-efficiency/scoring/run_workloads.py \ --framework $FRAMEWORK \ --experiment_name $EXPERIMENT_NAME \ --docker_image_url $DOCKER_IMAGE \ From 4bc51d685d81f1a1f8e9535d2c1e7972b60dae39 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 14 May 2024 20:53:26 +0000 Subject: [PATCH 26/52] add env --- .github/workflows/score_external_tuning_submission.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index b6ea54b7d..8b0d74f63 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -27,7 +27,8 @@ jobs: docker pull $DOCKER_IMAGE export TRIAL_INDEX=${{ matrix.trial }} echo $TRIAL_INDEX - python3.8 /home/kasimbeg/algorithmic-efficiency/scoring/run_workloads.py \ + source /home/kasimbeg/env/bin + python /home/kasimbeg/algorithmic-efficiency/scoring/run_workloads.py \ --framework $FRAMEWORK \ --experiment_name $EXPERIMENT_NAME \ --docker_image_url $DOCKER_IMAGE \ From cc2f764ca04dcf8998e577b31aeec17675a11d30 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 14 May 2024 20:55:29 +0000 Subject: [PATCH 27/52] remove heldout workload path --- .github/workflows/score_external_tuning_submission.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index abd4cc4a2..7e221333e 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -36,7 +36,6 @@ jobs: --run_percentage 100 \ --submission_path $SUBMISSION_PATH \ --tuning_search_space $TUNING_SEARCH_SPACE \ - --held_out_workloads_config_path None \ --study_start_index ${{ matrix.study }} \ --study_end_index ${{ matrix.study }} \ --hparam_start_index ${{ matrix.trial }} \ From c69092a0d96ef2fb6d7f44757fe78687eae1631d Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 14 May 2024 20:58:31 +0000 Subject: [PATCH 28/52] remove heldoutworkloads flag --- .github/workflows/score_external_tuning_submission.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml index 8b0d74f63..7b166ef3e 100644 --- a/.github/workflows/score_external_tuning_submission.yml +++ b/.github/workflows/score_external_tuning_submission.yml @@ -35,7 +35,6 @@ jobs: --run_percentage 100 \ --submission_path $SUBMISSION_PATH \ --tuning_search_space $TUNING_SEARCH_SPACE \ - --held_out_workloads_config_path $HELDOUT_WORKLOADS_PATH \ --study_start_index ${{ matrix.study }} \ --study_end_index ${{ matrix.study }} \ --hparam_start_index ${{ matrix.trial }} \ From 64669a49340561316c0da9fb7640f5b01658b517 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 23 May 2024 00:14:28 +0000 Subject: [PATCH 29/52] add check to kill container --- scoring/run_workloads.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index cc5ac4f08..f3ed11918 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -17,6 +17,8 @@ from absl import app from absl import flags from absl import logging +import datetime +import subprocess from algorithmic_efficiency import random_utils as prng from algorithmic_efficiency.workloads.workloads import get_base_workload_name @@ -105,13 +107,31 @@ def container_running(): else: return True +def kill_containers(): + docker_client = docker.from_env() + containers = docker_client.containers.list() + for container in containers: + container.kill() + +def gpu_is_active(): + output = subprocess.check_output(['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader']) + return any(int(x) > 0 for x in output.decode().splitlines()) + def wait_until_container_not_running(sleep_interval=5 * 60): + # check gpu util + # if the gpu has not been utilized for 30 minutes kill the + gpu_last_active = datetime.datetime.now().timestamp() + while container_running(): + # check if gpus have been inactive > 45 min and if so terminate container + if gpu_is_active(): + gpu_last_active = datetime.datetime.now().timestamp() + if (datetime.datetime.now().timestamp() - gpu_last_active) > 45 * 60: + kill_containers("Killing container: GPUs have been inactive > 45 minutes...") time.sleep(sleep_interval) return - def main(_): framework = FLAGS.framework run_fraction = FLAGS.run_percentage / 100. From ce3d5025c08fad4783c700ba9a96bdbceeb13315 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 23 May 2024 01:25:34 +0000 Subject: [PATCH 30/52] fix --- scoring/run_workloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index f3ed11918..065614553 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -114,7 +114,7 @@ def kill_containers(): container.kill() def gpu_is_active(): - output = subprocess.check_output(['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader']) + output = subprocess.check_output(['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits']) return any(int(x) > 0 for x in output.decode().splitlines()) From 9b79774b6b753669e83cb3f8221d90658468eaa0 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Fri, 24 May 2024 00:14:44 +0000 Subject: [PATCH 31/52] add functionality to install additional requirements --- docker/scripts/startup.sh | 14 ++++++++++++++ scoring/run_workloads.py | 12 ++++++++++++ 2 files changed, 26 insertions(+) diff --git a/docker/scripts/startup.sh b/docker/scripts/startup.sh index 5c5a6aa49..527e8306a 100644 --- a/docker/scripts/startup.sh +++ b/docker/scripts/startup.sh @@ -132,6 +132,10 @@ while [ "$1" != "" ]; do shift TEST=$1 ;; + --additional_requirements_path) + shift + ADDITIONAL_REQUIREMENTS_PATH=$1 + ;; *) usage exit 1 @@ -140,6 +144,16 @@ while [ "$1" != "" ]; do shift done + +# Optionally install addtional dependencies +if [[ -n ${ADDITIONAL_REQUIREMENTS_PATH+x} ]]; then + echo "Installing addtional requirements..." + COMMAND="cd algorithmic-efficiency && pip install -r ${ADDITIONAL_REQUIREMENTS_PATH}" + echo $COMMAND + eval $COMMAND +fi + + if [[ ${TEST} == "true" ]]; then cd algorithmic-efficiency COMMAND="python3 tests/test_traindiffs.py" diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 065614553..cf11cf7e1 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -89,6 +89,11 @@ None, 'If not None, only run this workload, else run all workloads in workload_metadata_path.' ) +flags.DEFINE_string( + 'additional_requirements_path', + None, + 'Path to requirements.txt if any.' +) FLAGS = flags.FLAGS @@ -152,7 +157,13 @@ def main(_): study_end_index = FLAGS.study_end_index else: study_end_index = num_studies - 1 + + additional_requirements_path_flag = '' + if FLAGS.additional_requirements_path: + additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path}' + submission_id = FLAGS.submission_id + rng_seed = FLAGS.seed if not rng_seed: @@ -213,6 +224,7 @@ def main(_): f'-m {max_steps} ' f'--num_tuning_trials {num_tuning_trials} ' f'--rng_seed {run_seed} ' + f'{additional_requirements_path_flag}' '-c false ' '-o true ' '-i true ') From 30852b57e1a254028e232b4b4ce5293691b3b20d Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Fri, 24 May 2024 01:47:46 +0000 Subject: [PATCH 32/52] fix --- scoring/run_workloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index cf11cf7e1..f8769f7a6 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -160,7 +160,7 @@ def main(_): additional_requirements_path_flag = '' if FLAGS.additional_requirements_path: - additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path}' + additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path} ' submission_id = FLAGS.submission_id From 0932d24e64077833b131041b5597c99080e7435f Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 13 Jun 2024 00:52:26 +0000 Subject: [PATCH 33/52] add flag for max steps --- utils/run_workloads.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/utils/run_workloads.py b/utils/run_workloads.py index 39f6a7b6f..92eb4d9b0 100644 --- a/utils/run_workloads.py +++ b/utils/run_workloads.py @@ -64,6 +64,13 @@ 'If your algorithm has a smaller per step time than our baselines ' 'you may want to increase the number of steps per workload.') +flags.DEFINE_integer( + 'max_steps' + None, + 'Maximum number of steps to run. If the run_percentage results into a larger' + 'number of steps, the maximum number of steps will be run.' +) + FLAGS = flags.FLAGS @@ -143,7 +150,10 @@ def main(_): # Get workload dataset, max step, algorithm path and tuning search space dataset = workload_config[workload]['dataset'] - max_steps = int(workload_config[workload]['max_steps'] * run_fraction) + if FLAGS.max_steps is None: + max_steps = int(workload_config[workload]['max_steps'] * run_fraction) + else: + max_steps = FLAGS.max_steps submission_path = workload_config[workload]['submission_path'] tuning_search_space = workload_config[workload]['tuning_search_space'] From 760c53ec0e8ef8c1442d26d760ba6f2289eed03a Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 13 Jun 2024 01:23:51 +0000 Subject: [PATCH 34/52] add max steps flag --- scoring/run_workloads.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index f8769f7a6..7ae02c98d 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -94,6 +94,12 @@ None, 'Path to requirements.txt if any.' ) +FLAGS.DEFINE_integer( + 'max_steps', + None, + 'Maximum number of steps to run. If run_fraction results in greater number of steps ' + 'than the max_steps, the run will be cut to max_steps.' +) FLAGS = flags.FLAGS @@ -205,8 +211,11 @@ def main(_): "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches print('=' * 100) dataset = workload_metadata[base_workload_name]['dataset'] - max_steps = int(workload_metadata[base_workload_name]['max_steps'] * - run_fraction) + if FLAGS.max_steps is None: + max_steps = int(workload_metadata[base_workload_name]['max_steps'] * + run_fraction) + else: + max_steps = FLAGS.max_steps mount_repo_flag = '' if FLAGS.local: mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency ' From 6983721fd6a4ae7800d3dcb0470fb781587993fa Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 13 Jun 2024 01:25:11 +0000 Subject: [PATCH 35/52] fix --- scoring/run_workloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 7ae02c98d..0b814c829 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -94,7 +94,7 @@ None, 'Path to requirements.txt if any.' ) -FLAGS.DEFINE_integer( +flags.DEFINE_integer( 'max_steps', None, 'Maximum number of steps to run. If run_fraction results in greater number of steps ' From d6b57f97a5d8f5745dba8143a0296552b1060e64 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 4 Jul 2024 01:29:31 +0000 Subject: [PATCH 36/52] add workloads flag --- scoring/run_workloads.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 0b814c829..02c247311 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -85,8 +85,9 @@ 'If your algorithm has a smaller per step time than our baselines ' 'you may want to increase the number of steps per workload.') flags.DEFINE_string( - 'workload', + 'workloads', None, + 'String representing a comma separated list of workload names.' 'If not None, only run this workload, else run all workloads in workload_metadata_path.' ) flags.DEFINE_string( @@ -181,17 +182,20 @@ def main(_): with open(FLAGS.workload_metadata_path) as f: workload_metadata = json.load(f) + # Get list of all possible workloads workloads = [w for w in workload_metadata.keys()] - - # Read held-out workloads + # Read heldout workloads if FLAGS.held_out_workloads_config_path: held_out_workloads = read_held_out_workloads( FLAGS.held_out_workloads_config_path) workloads = workloads + held_out_workloads - # Filter for single workload - if FLAGS.workload and (FLAGS.workload in workloads): - workloads = [FLAGS.workload] + # Filter workloads if explicit workloads specified + if FLAGS.workloads is not None: + workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads)) + if len(workloads_filtered) != len(FLAGS.workloads.split(',')): + unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads_filtered) + raise ValueError(f'Invalid workload name {unmatched_workloads}') rng_subkeys = prng.split(rng_key, num_studies) From 58289d89ed2906dbf905dcc90def9ba1773343ae Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 4 Jul 2024 02:13:49 +0000 Subject: [PATCH 37/52] fix --- scoring/run_workloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 02c247311..1b284ebb7 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -194,7 +194,7 @@ def main(_): if FLAGS.workloads is not None: workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads)) if len(workloads_filtered) != len(FLAGS.workloads.split(',')): - unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads_filtered) + unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads) raise ValueError(f'Invalid workload name {unmatched_workloads}') rng_subkeys = prng.split(rng_key, num_studies) From ca8c52ff8177306a05c2e6e786f64a5d5284a942 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 4 Jul 2024 02:14:46 +0000 Subject: [PATCH 38/52] fix --- scoring/run_workloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 1b284ebb7..ff9ee50ec 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -193,7 +193,7 @@ def main(_): # Filter workloads if explicit workloads specified if FLAGS.workloads is not None: workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads)) - if len(workloads_filtered) != len(FLAGS.workloads.split(',')): + if len(workloads) != len(FLAGS.workloads.split(',')): unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads) raise ValueError(f'Invalid workload name {unmatched_workloads}') From 01a7b682e4218c6120bce6f392c4a052c1d557a3 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 4 Jul 2024 02:17:19 +0000 Subject: [PATCH 39/52] debugging --- scoring/run_workloads.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index ff9ee50ec..1200691be 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -193,6 +193,8 @@ def main(_): # Filter workloads if explicit workloads specified if FLAGS.workloads is not None: workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads)) + print(workloads) + print(FLAGS.workloads.split(',')) if len(workloads) != len(FLAGS.workloads.split(',')): unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads) raise ValueError(f'Invalid workload name {unmatched_workloads}') From 84253df665d50c227e38177ad171d62a5eede3d3 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 4 Jul 2024 02:18:57 +0000 Subject: [PATCH 40/52] debugging --- scoring/run_workloads.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 1200691be..8727e7abf 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -192,6 +192,7 @@ def main(_): # Filter workloads if explicit workloads specified if FLAGS.workloads is not None: + print(workloads) workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads)) print(workloads) print(FLAGS.workloads.split(',')) From f0bc1ee11ea4578aeca30d542d0c71b9fa910425 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 4 Jul 2024 02:21:49 +0000 Subject: [PATCH 41/52] debugging --- scoring/run_workloads.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 8727e7abf..4a8078c14 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -184,8 +184,10 @@ def main(_): # Get list of all possible workloads workloads = [w for w in workload_metadata.keys()] + # Read heldout workloads if FLAGS.held_out_workloads_config_path: + print('appending heldout workloads') held_out_workloads = read_held_out_workloads( FLAGS.held_out_workloads_config_path) workloads = workloads + held_out_workloads From 897822173e81c3dbbe460afd61d87402b5ea08ff Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 4 Jul 2024 02:25:52 +0000 Subject: [PATCH 42/52] debugging --- scoring/run_workloads.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 4a8078c14..a2dbfce7d 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -190,6 +190,8 @@ def main(_): print('appending heldout workloads') held_out_workloads = read_held_out_workloads( FLAGS.held_out_workloads_config_path) + print(FLAGS.held_out_workloads_config_path) + print(held_out_workloads) workloads = workloads + held_out_workloads # Filter workloads if explicit workloads specified From f30ce4f0883ac4be9b4f843063440a89027b1988 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 4 Jul 2024 02:33:01 +0000 Subject: [PATCH 43/52] remove debugging --- scoring/run_workloads.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index a2dbfce7d..da702388f 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -187,19 +187,13 @@ def main(_): # Read heldout workloads if FLAGS.held_out_workloads_config_path: - print('appending heldout workloads') held_out_workloads = read_held_out_workloads( FLAGS.held_out_workloads_config_path) - print(FLAGS.held_out_workloads_config_path) - print(held_out_workloads) workloads = workloads + held_out_workloads # Filter workloads if explicit workloads specified if FLAGS.workloads is not None: - print(workloads) workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads)) - print(workloads) - print(FLAGS.workloads.split(',')) if len(workloads) != len(FLAGS.workloads.split(',')): unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads) raise ValueError(f'Invalid workload name {unmatched_workloads}') From 11ea68f166ef68105bc427a71d026870da249b47 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 19 Sep 2024 16:42:27 +0000 Subject: [PATCH 44/52] add safety flag to enforce explicitly enabling step budgets --- scoring/run_workloads.py | 82 +++++++++++----------------------------- 1 file changed, 23 insertions(+), 59 deletions(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index da702388f..0d708990f 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -17,8 +17,6 @@ from absl import app from absl import flags from absl import logging -import datetime -import subprocess from algorithmic_efficiency import random_utils as prng from algorithmic_efficiency.workloads.workloads import get_base_workload_name @@ -30,7 +28,7 @@ 'URL to docker image') flags.DEFINE_integer('run_percentage', 100, - 'Percentage of max num steps to run for.') + 'Percentage of max num steps to run for. Must set enable_step_percentage to true for this to take effect.') flags.DEFINE_string('experiment_name', 'my_experiment', 'Name of top sub directory in experiment dir.') @@ -85,21 +83,14 @@ 'If your algorithm has a smaller per step time than our baselines ' 'you may want to increase the number of steps per workload.') flags.DEFINE_string( - 'workloads', + 'workload', None, - 'String representing a comma separated list of workload names.' 'If not None, only run this workload, else run all workloads in workload_metadata_path.' ) flags.DEFINE_string( - 'additional_requirements_path', - None, - 'Path to requirements.txt if any.' -) -flags.DEFINE_integer( - 'max_steps', - None, - 'Maximum number of steps to run. If run_fraction results in greater number of steps ' - 'than the max_steps, the run will be cut to max_steps.' + 'enable_step_percentage', + False, + 'By default ignore step_fraction such that scoring is bounded by time budget.' ) FLAGS = flags.FLAGS @@ -119,34 +110,15 @@ def container_running(): else: return True -def kill_containers(): - docker_client = docker.from_env() - containers = docker_client.containers.list() - for container in containers: - container.kill() - -def gpu_is_active(): - output = subprocess.check_output(['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits']) - return any(int(x) > 0 for x in output.decode().splitlines()) - def wait_until_container_not_running(sleep_interval=5 * 60): - # check gpu util - # if the gpu has not been utilized for 30 minutes kill the - gpu_last_active = datetime.datetime.now().timestamp() - while container_running(): - # check if gpus have been inactive > 45 min and if so terminate container - if gpu_is_active(): - gpu_last_active = datetime.datetime.now().timestamp() - if (datetime.datetime.now().timestamp() - gpu_last_active) > 45 * 60: - kill_containers("Killing container: GPUs have been inactive > 45 minutes...") time.sleep(sleep_interval) return + def main(_): framework = FLAGS.framework - run_fraction = FLAGS.run_percentage / 100. experiment_name = FLAGS.experiment_name docker_image_url = FLAGS.docker_image_url submission_path = FLAGS.submission_path @@ -164,13 +136,7 @@ def main(_): study_end_index = FLAGS.study_end_index else: study_end_index = num_studies - 1 - - additional_requirements_path_flag = '' - if FLAGS.additional_requirements_path: - additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path} ' - submission_id = FLAGS.submission_id - rng_seed = FLAGS.seed if not rng_seed: @@ -182,21 +148,17 @@ def main(_): with open(FLAGS.workload_metadata_path) as f: workload_metadata = json.load(f) - # Get list of all possible workloads workloads = [w for w in workload_metadata.keys()] - # Read heldout workloads + # Read held-out workloads if FLAGS.held_out_workloads_config_path: held_out_workloads = read_held_out_workloads( FLAGS.held_out_workloads_config_path) workloads = workloads + held_out_workloads - # Filter workloads if explicit workloads specified - if FLAGS.workloads is not None: - workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads)) - if len(workloads) != len(FLAGS.workloads.split(',')): - unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads) - raise ValueError(f'Invalid workload name {unmatched_workloads}') + # Filter for single workload + if FLAGS.workload and (FLAGS.workload in workloads): + workloads = [FLAGS.workload] rng_subkeys = prng.split(rng_key, num_studies) @@ -216,17 +178,20 @@ def main(_): "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches print('=' * 100) dataset = workload_metadata[base_workload_name]['dataset'] - if FLAGS.max_steps is None: - max_steps = int(workload_metadata[base_workload_name]['max_steps'] * - run_fraction) - else: - max_steps = FLAGS.max_steps + + max_steps_flag = '' + if FLAGS.enable_step_percentage: + run_fraction = FLAGS.run_percentage / 100. + max_steps = int(workload_metadata[base_workload_name]['max_steps'] * + run_fraction) + max_steps_flag = f'-m {max_steps}' + mount_repo_flag = '' if FLAGS.local: - mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency ' - command = ('docker run -t -d -v /home/kasimbeg/data/:/data/ ' - '-v /home/kasimbeg/experiment_runs/:/experiment_runs ' - '-v /home/kasimbeg/experiment_runs/logs:/logs ' + mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency ' + command = ('docker run -t -d -v $HOME/data/:/data/ ' + '-v $HOME/experiment_runs/:/experiment_runs ' + '-v $HOME/experiment_runs/logs:/logs ' f'{mount_repo_flag}' '--gpus all --ipc=host ' f'{docker_image_url} ' @@ -235,10 +200,9 @@ def main(_): f'-s {submission_path} ' f'-w {workload} ' f'-e {study_dir} ' - f'-m {max_steps} ' + f'{max_steps_flag} ' f'--num_tuning_trials {num_tuning_trials} ' f'--rng_seed {run_seed} ' - f'{additional_requirements_path_flag}' '-c false ' '-o true ' '-i true ') From 078b5faf937574790ece25b8ca96412fc65bc852 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 19 Sep 2024 16:55:26 +0000 Subject: [PATCH 45/52] fix to enable_step_percentage flag --- scoring/run_workloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 0d708990f..ca199fcac 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -87,7 +87,7 @@ None, 'If not None, only run this workload, else run all workloads in workload_metadata_path.' ) -flags.DEFINE_string( +flags.DEFINE_bool( 'enable_step_percentage', False, 'By default ignore step_fraction such that scoring is bounded by time budget.' From 4956a31372c82e4baf524b3c16a49b8f9eed209a Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 19 Sep 2024 17:31:01 +0000 Subject: [PATCH 46/52] add flag for step budget --- scoring/run_workloads.py | 88 ++++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 21 deletions(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index ca199fcac..435d83305 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -1,4 +1,4 @@ -""" +]""" Example Usage: python run_workloads.py --framework jax \ --experiment_name my_first_experiment \ @@ -17,6 +17,8 @@ from absl import app from absl import flags from absl import logging +import datetime +import subprocess from algorithmic_efficiency import random_utils as prng from algorithmic_efficiency.workloads.workloads import get_base_workload_name @@ -28,7 +30,8 @@ 'URL to docker image') flags.DEFINE_integer('run_percentage', 100, - 'Percentage of max num steps to run for. Must set enable_step_percentage to true for this to take effect.') + 'Percentage of max num steps to run for.' + 'Must set the flag enable_step_budget to True for this to take effect.') flags.DEFINE_string('experiment_name', 'my_experiment', 'Name of top sub directory in experiment dir.') @@ -83,14 +86,26 @@ 'If your algorithm has a smaller per step time than our baselines ' 'you may want to increase the number of steps per workload.') flags.DEFINE_string( - 'workload', + 'workloads', None, + 'String representing a comma separated list of workload names.' 'If not None, only run this workload, else run all workloads in workload_metadata_path.' ) +flags.DEFINE_string( + 'additional_requirements_path', + None, + 'Path to requirements.txt if any.' +) +flags.DEFINE_integer( + 'max_steps', + None, + 'Maximum number of steps to run. Must set flag enable_step_budget.' + 'This flag takes precedence over the run_percentage flag.' +) flags.DEFINE_bool( - 'enable_step_percentage', - False, - 'By default ignore step_fraction such that scoring is bounded by time budget.' + 'enable_step_budget', + False, + 'Flag that has to be explicitly set to override time budgets to step budget percentage.' ) FLAGS = flags.FLAGS @@ -110,13 +125,31 @@ def container_running(): else: return True +def kill_containers(): + docker_client = docker.from_env() + containers = docker_client.containers.list() + for container in containers: + container.kill() + +def gpu_is_active(): + output = subprocess.check_output(['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits']) + return any(int(x) > 0 for x in output.decode().splitlines()) + def wait_until_container_not_running(sleep_interval=5 * 60): + # check gpu util + # if the gpu has not been utilized for 30 minutes kill the + gpu_last_active = datetime.datetime.now().timestamp() + while container_running(): + # check if gpus have been inactive > 45 min and if so terminate container + if gpu_is_active(): + gpu_last_active = datetime.datetime.now().timestamp() + if (datetime.datetime.now().timestamp() - gpu_last_active) > 45 * 60: + kill_containers("Killing container: GPUs have been inactive > 45 minutes...") time.sleep(sleep_interval) return - def main(_): framework = FLAGS.framework experiment_name = FLAGS.experiment_name @@ -136,7 +169,13 @@ def main(_): study_end_index = FLAGS.study_end_index else: study_end_index = num_studies - 1 + + additional_requirements_path_flag = '' + if FLAGS.additional_requirements_path: + additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path} ' + submission_id = FLAGS.submission_id + rng_seed = FLAGS.seed if not rng_seed: @@ -148,17 +187,21 @@ def main(_): with open(FLAGS.workload_metadata_path) as f: workload_metadata = json.load(f) + # Get list of all possible workloads workloads = [w for w in workload_metadata.keys()] - # Read held-out workloads + # Read heldout workloads if FLAGS.held_out_workloads_config_path: held_out_workloads = read_held_out_workloads( FLAGS.held_out_workloads_config_path) workloads = workloads + held_out_workloads - # Filter for single workload - if FLAGS.workload and (FLAGS.workload in workloads): - workloads = [FLAGS.workload] + # Filter workloads if explicit workloads specified + if FLAGS.workloads is not None: + workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads)) + if len(workloads) != len(FLAGS.workloads.split(',')): + unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads) + raise ValueError(f'Invalid workload name {unmatched_workloads}') rng_subkeys = prng.split(rng_key, num_studies) @@ -178,20 +221,22 @@ def main(_): "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches print('=' * 100) dataset = workload_metadata[base_workload_name]['dataset'] - max_steps_flag = '' - if FLAGS.enable_step_percentage: - run_fraction = FLAGS.run_percentage / 100. + if FLAGS.enable_step_budget: + run_fraction = FLAGS.run_percentage / 100. + if FLAGS.max_steps is None: max_steps = int(workload_metadata[base_workload_name]['max_steps'] * - run_fraction) - max_steps_flag = f'-m {max_steps}' - + run_fraction) + else: + max_steps = FLAGS.max_steps + max_steps_flag = f'-m {max_steps}' + mount_repo_flag = '' if FLAGS.local: - mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency ' - command = ('docker run -t -d -v $HOME/data/:/data/ ' - '-v $HOME/experiment_runs/:/experiment_runs ' - '-v $HOME/experiment_runs/logs:/logs ' + mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency ' + command = ('docker run -t -d -v /home/kasimbeg/data/:/data/ ' + '-v /home/kasimbeg/experiment_runs/:/experiment_runs ' + '-v /home/kasimbeg/experiment_runs/logs:/logs ' f'{mount_repo_flag}' '--gpus all --ipc=host ' f'{docker_image_url} ' @@ -203,6 +248,7 @@ def main(_): f'{max_steps_flag} ' f'--num_tuning_trials {num_tuning_trials} ' f'--rng_seed {run_seed} ' + f'{additional_requirements_path_flag}' '-c false ' '-o true ' '-i true ') From a0e45024539c9f238bd641acdc9d6624c604670a Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 19 Sep 2024 17:52:37 +0000 Subject: [PATCH 47/52] fix syntax error --- scoring/run_workloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 435d83305..3749ca214 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -1,4 +1,4 @@ -]""" +""" Example Usage: python run_workloads.py --framework jax \ --experiment_name my_first_experiment \ From 3c267236e180ad9be1d6c5f5fecff7ebbe88e99a Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 16 Oct 2024 00:40:08 +0000 Subject: [PATCH 48/52] remove unwanted changes --- .../score_external_tuning_submission.yml | 45 ------------------- docker/Dockerfile | 3 -- docker/build_docker_images.sh | 2 +- 3 files changed, 1 insertion(+), 49 deletions(-) delete mode 100644 .github/workflows/score_external_tuning_submission.yml diff --git a/.github/workflows/score_external_tuning_submission.yml b/.github/workflows/score_external_tuning_submission.yml deleted file mode 100644 index 7e221333e..000000000 --- a/.github/workflows/score_external_tuning_submission.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: Run submission - -on: - pull_request: - branches: - - 'scoring' -env: - SUBMISSION_PATH: submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/submission.py - TUNING_SEARCH_SPACE: submissions/submissions_algorithms_v0_5/AlgoPerf_Team_11/external_tuning/nadamp/tuning_search_space.json - EXPERIMENT_NAME: algoperf_scoring_v05/external_tuning/Team_11/nadamp - FRAMEWORK: jax - DOCKER_IMAGE: us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main_scoring - HELDOUT_WORKLOADS_PATH: /home/kasimbeg/algorithmic-efficiency/scoring/held_out_workloads_algoperf_v05.json - WORKLOAD_METADATA_PATH: /home/kasimbeg/algorithmic-efficiency/scoring/workload_metadata_external_tuning.json - SUBMISSION_ID: 17 -jobs: - run_workloads: - strategy: - matrix: - study: [0, 1, 2, 3, 4] - trial: [0, 1, 2, 3, 4] - runs-on: self-hosted - timeout-minutes: 10080 - steps: - - uses: actions/checkout@v2 - - name: Run containerized workload - run: | - docker pull $DOCKER_IMAGE - export TRIAL_INDEX=${{ matrix.trial }} - echo $TRIAL_INDEX - source /home/kasimbeg/env/bin/activate - python /home/kasimbeg/algorithmic-efficiency/scoring/run_workloads.py \ - --framework $FRAMEWORK \ - --experiment_name $EXPERIMENT_NAME \ - --docker_image_url $DOCKER_IMAGE \ - --run_percentage 100 \ - --submission_path $SUBMISSION_PATH \ - --tuning_search_space $TUNING_SEARCH_SPACE \ - --study_start_index ${{ matrix.study }} \ - --study_end_index ${{ matrix.study }} \ - --hparam_start_index ${{ matrix.trial }} \ - --hparam_end_index $(( ${{ matrix.trial }} + 1 )) \ - --workload_metadata_path $WORKLOAD_METADATA_PATH \ - --submission_id $SUBMISSION_ID \ - --seed 0 \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 0ced3457f..9b72aea86 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -67,7 +67,4 @@ RUN cd /algorithmic-efficiency && git pull COPY scripts/startup.sh /algorithmic-efficiency/docker/scripts/startup.sh RUN chmod a+x /algorithmic-efficiency/docker/scripts/startup.sh -# Move submissions into algorithmic-efficiency-repo -COPY submissions_algorithms_v0_5 /algorithmic-efficiency/submissions/submissions_algorithms_v0_5 - ENTRYPOINT ["bash", "/algorithmic-efficiency/docker/scripts/startup.sh"] diff --git a/docker/build_docker_images.sh b/docker/build_docker_images.sh index 39bc1a6f0..9e0e68ca9 100644 --- a/docker/build_docker_images.sh +++ b/docker/build_docker_images.sh @@ -20,7 +20,7 @@ fi for FRAMEWORK in "jax" "pytorch" "both" do - IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}_scoring" + IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}" DOCKER_BUILD_COMMAND="docker build --no-cache -t $IMAGE_NAME . --build-arg framework=$FRAMEWORK --build-arg branch=$GIT_BRANCH" DOCKER_TAG_COMMAND="docker tag $IMAGE_NAME us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME" DOCKER_PUSH_COMMAND="docker push us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME" From a43836cedb85ed4120ae3101fb61e62f68a21a1e Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 16 Oct 2024 00:44:26 +0000 Subject: [PATCH 49/52] reformat --- scoring/run_workloads.py | 65 ++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 3749ca214..e474b6910 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -9,16 +9,16 @@ --tuning_search_space """ +import datetime import json import os import struct +import subprocess import time from absl import app from absl import flags from absl import logging -import datetime -import subprocess from algorithmic_efficiency import random_utils as prng from algorithmic_efficiency.workloads.workloads import get_base_workload_name @@ -28,10 +28,11 @@ 'docker_image_url', 'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev', 'URL to docker image') -flags.DEFINE_integer('run_percentage', - 100, - 'Percentage of max num steps to run for.' - 'Must set the flag enable_step_budget to True for this to take effect.') +flags.DEFINE_integer( + 'run_percentage', + 100, + 'Percentage of max num steps to run for.' + 'Must set the flag enable_step_budget to True for this to take effect.') flags.DEFINE_string('experiment_name', 'my_experiment', 'Name of top sub directory in experiment dir.') @@ -91,21 +92,18 @@ 'String representing a comma separated list of workload names.' 'If not None, only run this workload, else run all workloads in workload_metadata_path.' ) -flags.DEFINE_string( - 'additional_requirements_path', - None, - 'Path to requirements.txt if any.' -) +flags.DEFINE_string('additional_requirements_path', + None, + 'Path to requirements.txt if any.') flags.DEFINE_integer( - 'max_steps', - None, - 'Maximum number of steps to run. Must set flag enable_step_budget.' - 'This flag takes precedence over the run_percentage flag.' -) + 'max_steps', + None, + 'Maximum number of steps to run. Must set flag enable_step_budget.' + 'This flag takes precedence over the run_percentage flag.') flags.DEFINE_bool( - 'enable_step_budget', - False, - 'Flag that has to be explicitly set to override time budgets to step budget percentage.' + 'enable_step_budget', + False, + 'Flag that has to be explicitly set to override time budgets to step budget percentage.' ) FLAGS = flags.FLAGS @@ -125,20 +123,26 @@ def container_running(): else: return True + def kill_containers(): docker_client = docker.from_env() containers = docker_client.containers.list() for container in containers: container.kill() + def gpu_is_active(): - output = subprocess.check_output(['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits']) - return any(int(x) > 0 for x in output.decode().splitlines()) - + output = subprocess.check_output([ + 'nvidia-smi', + '--query-gpu=utilization.gpu', + '--format=csv,noheader,nounits' + ]) + return any(int(x) > 0 for x in output.decode().splitlines()) + def wait_until_container_not_running(sleep_interval=5 * 60): - # check gpu util - # if the gpu has not been utilized for 30 minutes kill the + # check gpu util + # if the gpu has not been utilized for 30 minutes kill the gpu_last_active = datetime.datetime.now().timestamp() while container_running(): @@ -146,10 +150,12 @@ def wait_until_container_not_running(sleep_interval=5 * 60): if gpu_is_active(): gpu_last_active = datetime.datetime.now().timestamp() if (datetime.datetime.now().timestamp() - gpu_last_active) > 45 * 60: - kill_containers("Killing container: GPUs have been inactive > 45 minutes...") + kill_containers( + "Killing container: GPUs have been inactive > 45 minutes...") time.sleep(sleep_interval) return + def main(_): framework = FLAGS.framework experiment_name = FLAGS.experiment_name @@ -196,9 +202,10 @@ def main(_): FLAGS.held_out_workloads_config_path) workloads = workloads + held_out_workloads - # Filter workloads if explicit workloads specified + # Filter workloads if explicit workloads specified if FLAGS.workloads is not None: - workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads)) + workloads = list( + filter(lambda x: x in FLAGS.workloads.split(','), workloads)) if len(workloads) != len(FLAGS.workloads.split(',')): unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads) raise ValueError(f'Invalid workload name {unmatched_workloads}') @@ -230,7 +237,7 @@ def main(_): else: max_steps = FLAGS.max_steps max_steps_flag = f'-m {max_steps}' - + mount_repo_flag = '' if FLAGS.local: mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency ' @@ -291,4 +298,4 @@ def main(_): if __name__ == '__main__': flags.mark_flag_as_required('workload_metadata_path') - app.run(main) \ No newline at end of file + app.run(main) From ce4fc77460c6a2d7641b5f5a3b0c6ef7600cb7e5 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 16 Oct 2024 00:49:07 +0000 Subject: [PATCH 50/52] remove duplicate run_workloads script --- utils/run_workloads.py | 210 ---------------------- utils/target_setting_workload_config.json | 195 -------------------- 2 files changed, 405 deletions(-) delete mode 100644 utils/run_workloads.py delete mode 100644 utils/target_setting_workload_config.json diff --git a/utils/run_workloads.py b/utils/run_workloads.py deleted file mode 100644 index 92eb4d9b0..000000000 --- a/utils/run_workloads.py +++ /dev/null @@ -1,210 +0,0 @@ -""" -Example Usage: -python run_workloads.py \ ---workload_config_path workload_config.json \ ---framework jax \ ---experiment_name my_first_experiment \ ---docker_image_url us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev \ ---run_percentage 10 \ ---workload_config_path workload_config.json \ ---dry_run -""" - -import json -import os -import struct -import time - -from absl import app -from absl import flags -from absl import logging - -import docker - -flags.DEFINE_string( - 'docker_image_url', - 'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev', - 'URL to docker image') -flags.DEFINE_integer('run_percentage', - 100, - 'Percentage of max num steps to run for.') -flags.DEFINE_string('experiment_name', - 'my_experiment', - 'Name of top sub directory in experiment dir.') -flags.DEFINE_boolean('rsync_data', - True, - 'Whether or not to transfer the data from GCP w rsync.') -flags.DEFINE_boolean('local', False, 'Mount local algorithmic-efficiency repo.') -flags.DEFINE_string('framework', 'jax', 'Can be either PyTorch or JAX.') -flags.DEFINE_boolean( - 'dry_run', - False, - 'Whether or not to actually run the docker containers. ' - 'If False, simply print the docker run commands. ') -flags.DEFINE_integer('num_studies', 1, 'Number of studies to run') -flags.DEFINE_integer('study_start_index', None, 'Start index for studies.') -flags.DEFINE_integer('study_end_index', None, 'End index for studies.') -flags.DEFINE_integer('num_tuning_trials', 1, 'Number of tuning trials.') -flags.DEFINE_integer('hparam_start_index', - None, - 'Start index for tuning trials.') -flags.DEFINE_integer('hparam_end_index', None, 'End index for tuning trials.') -flags.DEFINE_integer('seed', None, 'Random seed for evaluating a submission.') -flags.DEFINE_integer('submission_id', - 0, - 'Submission ID to generate study and hparam seeds.') -flags.DEFINE_string( - 'workload_config_path', - 'workload_confing.json', - 'Path to config containing dataset and maximum number of steps per workload.' - 'The default values of these are set to the full budgets as determined ' - 'via the target-setting procedure. ' - 'Note that training will be interrupted at either the set maximum number ' - 'of steps or the fixed workload maximum run time, whichever comes first. ' - 'If your algorithm has a smaller per step time than our baselines ' - 'you may want to increase the number of steps per workload.') - -flags.DEFINE_integer( - 'max_steps' - None, - 'Maximum number of steps to run. If the run_percentage results into a larger' - 'number of steps, the maximum number of steps will be run.' -) - -FLAGS = flags.FLAGS - - -def read_workloads(filename): - with open(filename, "r") as f: - held_out_workloads = json.load(f) - return held_out_workloads - - -def container_running(): - docker_client = docker.from_env() - containers = docker_client.containers.list() - if len(containers) == 0: - return False - else: - return True - - -def wait_until_container_not_running(sleep_interval=5 * 60): - while container_running(): - time.sleep(sleep_interval) - return - - -def main(_): - # What Docker image to run the container with - docker_image_url = FLAGS.docker_image_url - - # Framework - framework = FLAGS.framework - - # - run_fraction = FLAGS.run_percentage / 100. - experiment_name = FLAGS.experiment_name - - # Get study and trial interval arguments - num_studies = FLAGS.num_studies - study_start_index = FLAGS.study_start_index if FLAGS.study_start_index else 0 - study_end_index = FLAGS.study_end_index if FLAGS.study_end_index else num_studies - 1 - - # Get trial arguments - num_tuning_trials = FLAGS.num_tuning_trials - hparam_start_index_flag = '' - hparam_end_index_flag = '' - if FLAGS.hparam_start_index: - hparam_start_index_flag = f'--hparam_start_index {FLAGS.hparam_start_index} ' - if FLAGS.hparam_end_index: - hparam_end_index_flag = f'--hparam_end_index {FLAGS.hparam_end_index} ' - - # Generate rng keys from submission_id and seed - submission_id = FLAGS.submission_id - rng_seed = FLAGS.seed - - if not rng_seed: - rng_seed = struct.unpack('I', os.urandom(4))[0] - - logging.info('Using RNG seed %d', rng_seed) - - # Read workload specifications to run - with open(FLAGS.workload_config_path) as f: - workload_config = json.load(f) - workloads = [w for w in workload_config.keys()] - - for study_index in range(study_start_index, study_end_index + 1): - print('-' * 100) - print('*' * 40, f'Starting study {study_index + 1}/{num_studies}', '*' * 40) - print('-' * 100) - study_dir = os.path.join(experiment_name, f'study_{study_index}') - - for workload in workloads: - # For each runnable workload check if there are any containers running - wait_until_container_not_running() - - # Clear caches - os.system("sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") - print('=' * 100) - - # Get workload dataset, max step, algorithm path and tuning search space - dataset = workload_config[workload]['dataset'] - if FLAGS.max_steps is None: - max_steps = int(workload_config[workload]['max_steps'] * run_fraction) - else: - max_steps = FLAGS.max_steps - submission_path = workload_config[workload]['submission_path'] - tuning_search_space = workload_config[workload]['tuning_search_space'] - - # Optionally, define flag to mount local algorithmic-efficiency repo - mount_repo_flag = '' - if FLAGS.local: - mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency ' - - command = ('docker run -t -d -v $HOME/data/:/data/ ' - '-v $HOME/experiment_runs/:/experiment_runs ' - '-v $HOME/experiment_runs/logs:/logs ' - f'{mount_repo_flag}' - '--gpus all --ipc=host ' - f'{docker_image_url} ' - f'-d {dataset} ' - f'-f {framework} ' - f'-s {submission_path} ' - f'-w {workload} ' - f'-t {tuning_search_space} ' - f'-e {study_dir} ' - f'-m {max_steps} ' - f'--num_tuning_trials {num_tuning_trials} ' - f'{hparam_start_index_flag} ' - f'{hparam_end_index_flag} ' - f'--rng_seed {rng_seed} ' - '-c false ' - '-o true ' - '-i true ') - if not FLAGS.dry_run: - print('Running docker container command') - print('Container ID: ') - return_code = os.system(command) - else: - return_code = 0 - if return_code == 0: - print( - f'SUCCESS: container for {framework} {workload} launched successfully' - ) - print(f'Command: {command}') - print(f'Results will be logged to {experiment_name}') - else: - print( - f'Failed: container for {framework} {workload} failed with exit code {return_code}.' - ) - print(f'Command: {command}') - wait_until_container_not_running() - os.system( - "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches - - print('=' * 100) - - -if __name__ == '__main__': - app.run(main) diff --git a/utils/target_setting_workload_config.json b/utils/target_setting_workload_config.json deleted file mode 100644 index a8c050422..000000000 --- a/utils/target_setting_workload_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "imagenet_resnet": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet/tuning_search_space.json" - }, - "imagenet_resnet_gelu": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_momentum.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet_gelu/tuning_search_space.json" - }, - "imagenet_resnet_large_bn_init": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_momentum.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet_large_bn_init/tuning_search_space.json" - }, - "imagenet_resnet_silu": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet_silu/tuning_search_space.json" - }, - "imagenet_vit": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit/tuning_search_space.json" - }, - "imagenet_vit_glu": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit_glu/tuning_search_space.json" - }, - "imagenet_vit_map": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit_map/tuning_search_space.json" - }, - "imagenet_vit_post_ln": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit_post_ln/tuning_search_space.json" - }, - "fastmri": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nesterov.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri/tuning_search_space.json" - }, - "fastmri_layernorm": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri_layernorm/tuning_search_space.json" - }, - "fastmri_model_size": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri_model_size/tuning_search_space.json" - }, - "fastmri_tanh": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri_tanh/tuning_search_space.json" - }, - "ogbg": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nesterov.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg/tuning_search_space.json" - }, - "ogbg_gelu": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg_gelu/tuning_search_space.json" - }, - "ogbg_model_size": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg_model_size/tuning_search_space.json" - }, - "ogbg_silu": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg_silu/tuning_search_space.json" - }, - "wmt": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt/tuning_search_space.json" - }, - "wmt_attention_temp": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt_attention_temp/tuning_search_space.json" - }, - "wmt_glu_tanh": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt_glu_tanh/tuning_search_space.json" - }, - "wmt_post_ln": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt_post_ln/tuning_search_space.json" - }, - "librispeech_deepspeech": { - "max_steps": 48000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech/tuning_search_space.json" - }, - "librispeech_deepspeech_no_resnet": { - "max_steps": 48000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech_no_resnet/tuning_search_space.json" - }, - "librispeech_deepspeech_norm_and_spec_aug": { - "max_steps": 48000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech_norm_and_spec_aug/tuning_search_space.json" - }, - "librispeech_deepspeech_tanh": { - "max_steps": 48000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech_tanh/tuning_search_space.json" - }, - "criteo1tb": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/criteo1tb/tuning_search_space.json" - }, - "criteo1tb_embed_init": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/criteo1tb_embed_init/tuning_search_space.json" - }, - "criteo1tb_layernorm": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/criteo1tb_layernorm/tuning_search_space.json" - }, - "criteo1tb_resnet": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/criteo1tb_resnet/tuning_search_space.json" - }, - "librispeech_conformer": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer/tuning_search_space.json" - }, - "librispeech_conformer_attention_temperature": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer_attention_temperature/tuning_search_space.json" - }, - "librispeech_conformer_gelu": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer_gelu/tuning_search_space.json" - }, - "librispeech_conformer_layernorm": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer_layernorm/tuning_search_space.json" - } - -} \ No newline at end of file From c27eee49236212dca2b0032bfde157eacf1d4523 Mon Sep 17 00:00:00 2001 From: priyakasimbeg Date: Mon, 21 Oct 2024 13:17:24 -0700 Subject: [PATCH 51/52] Update setup.cfg Pin scipy version, without this some jax imports may break --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 321020ad9..a5997e745 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,6 +49,7 @@ install_requires = clu==0.0.7 matplotlib>=3.7.2 tabulate==0.9.0 + scipy==1.11.4 python_requires = >=3.8 From cda6e46935ddbb6ddc475d40c231b9db8592f602 Mon Sep 17 00:00:00 2001 From: priyakasimbeg Date: Mon, 21 Oct 2024 23:49:27 +0000 Subject: [PATCH 52/52] move scipy dep --- setup.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index a5997e745..eb570dafb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,7 +49,6 @@ install_requires = clu==0.0.7 matplotlib>=3.7.2 tabulate==0.9.0 - scipy==1.11.4 python_requires = >=3.8 @@ -122,6 +121,8 @@ jax_core_deps = chex==0.1.7 ml_dtypes==0.2.0 protobuf==4.25.3 + scipy==1.11.4 + # JAX CPU jax_cpu =