Skip to content

Commit

Permalink
Benchmarks: model benchmarks - change torch.distributed.launch to torchrun (#556)
Browse files Browse the repository at this point in the history

This PR includes the following changes:
- torch.distributed.launch changed to torchrun. torch.distributed.launch
is deprecated in recent PyTorch releases, and torchrun is the recommended
replacement - https://pytorch.org/docs/stable/elastic/run.html
- Changes to the AMD GPU detection logic. The AMD GPU detection logic throws
a warning when containers have only renderD nodes in /dev/dri; this change
resolves those warnings.

---------

Co-authored-by: Yuting Jiang <[email protected]>
  • Loading branch information
pnunna93 and yukirora authored Aug 8, 2023
1 parent e1df877 commit 67f2aa7
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 7 deletions.
2 changes: 1 addition & 1 deletion superbench/common/devices/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_vendor(self):
logger.warning('Cannot find NVIDIA GPU device.')
return 'nvidia'
if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir():
if not list(Path('/dev/dri').glob('card*')):
if not list(Path('/dev/dri').glob('renderD*')):
logger.warning('Cannot find AMD GPU device.')
return 'amd'
if list(Path(r'C:\Windows\System32').glob('*DriverStore/FileRepository/nv*.inf_amd64_*/nvapi64.dll')):
Expand Down
4 changes: 2 additions & 2 deletions superbench/runner/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
torch_dist_params = '' if mode.node_num == 1 else \
'--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
mode_command = (
f'python3 -m torch.distributed.launch'
f' --use_env --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
f'torchrun'
f' --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp'
f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl'
)
Expand Down
8 changes: 4 additions & 4 deletions tests/runner/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ def test_get_mode_command(self):
'node_num': 'all',
},
'expected_command': (
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node=1 '
'torchrun '
'--no_python --nproc_per_node=1 '
'--nnodes=$NNODES --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
Expand All @@ -123,8 +123,8 @@ def test_get_mode_command(self):
'node_num': 1,
},
'expected_command': (
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node=8 '
'torchrun '
'--no_python --nproc_per_node=8 '
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
'superbench.benchmarks.foo.parameters.distributed_impl=ddp '
'superbench.benchmarks.foo.parameters.distributed_backend=nccl'
Expand Down

0 comments on commit 67f2aa7

Please sign in to comment.