From e02f038b563cbdff0bd2cde465768a047cdf76c7 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Mon, 27 Jan 2025 11:23:04 -0500 Subject: [PATCH 1/6] Use dmon for monitor_proc --- userbenchmark/release-test/monitor_proc.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/userbenchmark/release-test/monitor_proc.sh b/userbenchmark/release-test/monitor_proc.sh index 3594f8caf4..d6e1c04f05 100644 --- a/userbenchmark/release-test/monitor_proc.sh +++ b/userbenchmark/release-test/monitor_proc.sh @@ -17,11 +17,11 @@ get_gpu_max_memory_usage_cuda() { local max=$2 local curr # Some processes might not use the GPU - if ! nvidia-smi pmon -s m -c 1 -o T | grep "${my_pid}" >/dev/null 2>/dev/null; then + if ! nvidia-smi dmon -s m -c 1 -o T -i 0 | grep "${my_pid}" >/dev/null 2>/dev/null; then echo "${max}" return fi - curr=$(nvidia-smi pmon -s m -c 1 -o T | grep "${my_pid}" | awk '{print $5}' | sort | tail -1 | grep -o "[0-9.]*") + curr=$(nvidia-smi dmon -s m -c 1 -o T -i 0 | grep "${my_pid}" | awk '{print $4}' | sort | tail -1 | grep -o "[0-9.]*") max "${curr}" "${max}" } From 38b379cffde1e7872198936086bc07f29f190e2a Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Mon, 27 Jan 2025 16:42:33 -0500 Subject: [PATCH 2/6] Fix monitor_proc --- userbenchmark/release-test/monitor_proc.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/userbenchmark/release-test/monitor_proc.sh b/userbenchmark/release-test/monitor_proc.sh index d6e1c04f05..3b95250737 100644 --- a/userbenchmark/release-test/monitor_proc.sh +++ b/userbenchmark/release-test/monitor_proc.sh @@ -16,12 +16,13 @@ get_gpu_max_memory_usage_cuda() { local my_pid=$1 local max=$2 local curr - # Some processes might not use the GPU - if ! nvidia-smi dmon -s m -c 1 -o T -i 0 | grep "${my_pid}" >/dev/null 2>/dev/null; then + # pick the process that uses the most GPU memory + curr=$(nvidia-smi dmon -s m -c 1 -o T -i 0 | tail -n +3 | awk '{print $3}' | sort -n | tail -1 | grep -o "[0-9.]*") + # Some processes might not use the GPU, then memory usage should be 0 + if [ "${curr}" -eq 0 ] ; then echo "${max}" return fi - curr=$(nvidia-smi dmon -s m -c 1 -o T -i 0 | grep "${my_pid}" | awk '{print $4}' | sort | tail -1 | grep -o "[0-9.]*") max "${curr}" "${max}" } From 5f60e008036f78ebd47e80b486ff621105be9775 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Mon, 27 Jan 2025 18:28:04 -0500 Subject: [PATCH 3/6] Try another field --- userbenchmark/release-test/monitor_proc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/userbenchmark/release-test/monitor_proc.sh b/userbenchmark/release-test/monitor_proc.sh index 3b95250737..e6f8edb74a 100644 --- a/userbenchmark/release-test/monitor_proc.sh +++ b/userbenchmark/release-test/monitor_proc.sh @@ -17,7 +17,7 @@ get_gpu_max_memory_usage_cuda() { local max=$2 local curr # pick the process that uses the most GPU memory - curr=$(nvidia-smi dmon -s m -c 1 -o T -i 0 | tail -n +3 | awk '{print $3}' | sort -n | tail -1 | grep -o "[0-9.]*") + curr=$(nvidia-smi dmon -s m -c 1 -o T -i 0 | tail -n +3 | awk '{print $2}' | sort -n | tail -1 | grep -o "[0-9.]*") # Some processes might not use the GPU, then memory usage should be 0 if [ "${curr}" -eq 0 ] ; then echo "${max}" From 79265eaba992bec092bae2fe766f954788e71f3e Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Tue, 28 Jan 2025 10:34:16 -0500 Subject: [PATCH 4/6] Print dmon output --- userbenchmark/release-test/monitor_proc.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/userbenchmark/release-test/monitor_proc.sh b/userbenchmark/release-test/monitor_proc.sh index e6f8edb74a..bacb9a3fd8 100644 --- a/userbenchmark/release-test/monitor_proc.sh +++ b/userbenchmark/release-test/monitor_proc.sh @@ -17,7 +17,8 @@ get_gpu_max_memory_usage_cuda() { local max=$2 local curr # pick the process that uses the most GPU memory - curr=$(nvidia-smi dmon -s m -c 1 -o T -i 0 | tail -n +3 | awk '{print $2}' | sort -n | tail -1 | grep -o "[0-9.]*") + nvidia-smi dmon -s m -c 1 -o T -i 0 | tee mem-log.txt + curr=$(cat mem-log.txt | tail -n +3 | awk '{print $3}' | sort -n | tail -1 | grep -o "[0-9.]*") # Some processes might not use the GPU, then memory usage should be 0 if [ "${curr}" -eq 0 ] ; then echo "${max}" From c981cce2b10f5fb7fc19c6277ec1170079014875 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Tue, 28 Jan 2025 10:34:48 -0500 Subject: [PATCH 5/6] Fix a100 --- .github/workflows/userbenchmark-a100.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/userbenchmark-a100.yml b/.github/workflows/userbenchmark-a100.yml index 9b7ac76e91..bc6c56beef 100644 --- a/.github/workflows/userbenchmark-a100.yml +++ b/.github/workflows/userbenchmark-a100.yml @@ -27,9 +27,9 @@ jobs: - name: Install Conda run: | bash ./.ci/torchbench/install-conda.sh - - name: Install TorchBench - run: | - bash ./.ci/torchbench/install.sh +# - name: Install TorchBench +# run: | +# bash ./.ci/torchbench/install.sh - name: Run user benchmark run: | set -x From b196360d90abf88713084080d6bedb0892da46e1 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Tue, 28 Jan 2025 10:42:01 -0500 Subject: [PATCH 6/6] Revert --- userbenchmark/release-test/monitor_proc.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/userbenchmark/release-test/monitor_proc.sh b/userbenchmark/release-test/monitor_proc.sh index bacb9a3fd8..fd4ca94d65 100644 --- a/userbenchmark/release-test/monitor_proc.sh +++ b/userbenchmark/release-test/monitor_proc.sh @@ -16,9 +16,7 @@ get_gpu_max_memory_usage_cuda() { local my_pid=$1 local max=$2 local curr - # pick the process that uses the most GPU memory - nvidia-smi dmon -s m -c 1 -o T -i 0 | tee mem-log.txt - curr=$(cat mem-log.txt | tail -n +3 | awk '{print $3}' | sort -n | tail -1 | grep -o "[0-9.]*") + curr=$(nvidia-smi dmon -s m -c 1 -o T -i 0 | tail -n +3 | awk '{print $3}' | sort -n | tail -1 | grep -o "[0-9.]*") # Some processes might not use the GPU, then memory usage should be 0 if [ "${curr}" -eq 0 ] ; then echo "${max}"