From 0ac21a50da109c13ac9051047b6f730d1f61871f Mon Sep 17 00:00:00 2001
From: Hongtao Zhang
Date: Wed, 25 Sep 2024 08:08:08 -0700
Subject: [PATCH 1/2] Docs - Fix metrics name in user tutorial (#651)

**Description**

In result-summary.md lines 73-74, the kernel-launch example uses the metrics
'kernel-launch/event_overhead' and 'kernel-launch/wall_overhead', but in
micro-benchmarks.md lines 20-21 the metric names are described as
'kernel-launch/event_time' and 'kernel-launch/wall_time'.

Solution: use the same metric names as described in micro-benchmarks.md.

https://github.com/microsoft/superbenchmark/issues/649

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: hongtaozhang
Co-authored-by: Yifan Xiong
---
 docs/user-tutorial/data-diagnosis.md | 4 ++--
 docs/user-tutorial/result-summary.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/user-tutorial/data-diagnosis.md b/docs/user-tutorial/data-diagnosis.md
index c2f0e3369..46eaeda18 100644
--- a/docs/user-tutorial/data-diagnosis.md
+++ b/docs/user-tutorial/data-diagnosis.md
@@ -83,8 +83,8 @@ superbench:
       criteria: lambda x:x>0.05
       categories: KernelLaunch
       metrics:
-        - kernel-launch/event_overhead:\d+
-        - kernel-launch/wall_overhead:\d+
+        - kernel-launch/event_time:\d+
+        - kernel-launch/wall_time:\d+
     rule1:
       # Rule 1: If H2D_Mem_BW or D2H_Mem_BW test suffers > 5% downgrade, label it as defective
       function: variance
diff --git a/docs/user-tutorial/result-summary.md b/docs/user-tutorial/result-summary.md
index dffee2514..a85ec7863 100644
--- a/docs/user-tutorial/result-summary.md
+++ b/docs/user-tutorial/result-summary.md
@@ -70,8 +70,8 @@ superbench:
       aggregate: True
       categories: KernelLaunch
       metrics:
-        - kernel-launch/event_overhead
-        - kernel-launch/wall_overhead
+        - kernel-launch/event_time
+        - kernel-launch/wall_time
     nccl:
       statistics: mean
       categories: NCCL

From e39489c404c16ab2689f4822e7c751d44665a088 Mon Sep 17 00:00:00 2001
From: Yuting Jiang
Date: Thu, 26 Sep 2024 22:40:00 +0800
Subject: [PATCH 2/2] Benchmarks - Add configurations for NDv5 and AMD MI300 (#652)

**Description**

Add configurations for NDv5 and AMD MI300.

**Major Revision**

- Add cublaslt, FP8 transformers training, and dist-inference cpp to the NDv5 config example
- Add hipblaslt, FP8 transformers training, and dist-inference cpp to the AMD MI300 config example
---
 .github/workflows/build-image.yml |   2 +
 dockerfile/rocm6.0.x.dockerfile   |   5 +
 setup.py                          |   2 +-
 superbench/config/amd_mi300.yaml  | 232 +++++++++++++++++++++++
 superbench/config/azure_ndv5.yaml | 305 ++++++++++++++++++++++++++++++
 5 files changed, 545 insertions(+), 1 deletion(-)
 create mode 100644 superbench/config/amd_mi300.yaml
 create mode 100644 superbench/config/azure_ndv5.yaml

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index 322d34002..05e4dd447 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -68,6 +68,8 @@ jobs:
           else
             echo "No Docker images found with the specified references."
           fi
+          sudo docker ps -q | grep build | xargs -r sudo docker stop
+          echo y | sudo docker system prune -a --volumes
           df -h
       - name: Prepare metadata
         id: metadata
diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile
index bd33e289a..ce5736e29 100644
--- a/dockerfile/rocm6.0.x.dockerfile
+++ b/dockerfile/rocm6.0.x.dockerfile
@@ -173,6 +173,11 @@ RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASL
 RUN cd third_party/Megatron/Megatron-DeepSpeed && \
     git apply ../megatron_deepspeed_rocm6.patch
 
+# Install AMD SMI Python Library
+RUN apt install amd-smi-lib -y && \
+    cd /opt/rocm/share/amd_smi && \
+    python3 -m pip install .
+
 ADD . .
 ENV USE_HIP_DATATYPE=1
 ENV USE_HIPBLAS_COMPUTETYPE=1
diff --git a/setup.py b/setup.py
index c85455593..686bef0b9 100644
--- a/setup.py
+++ b/setup.py
@@ -219,7 +219,7 @@ def run(self):
                 'onnxruntime-gpu; python_version>="3.10"',
             ],
             'nvidia': ['py3nvml>=0.2.6'],
-            'amd': ['pyrsmi>=1.0.1'],
+            'amd': ['amdsmi'],
         }
     ),
     include_package_data=True,
diff --git a/superbench/config/amd_mi300.yaml b/superbench/config/amd_mi300.yaml
new file mode 100644
index 000000000..b7aefba63
--- /dev/null
+++ b/superbench/config/amd_mi300.yaml
@@ -0,0 +1,232 @@
+# SuperBench Config
+version: v0.11
+superbench:
+  enable: null
+  var:
+    default_local_mode: &default_local_mode
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: HIP_VISIBLE_DEVICES={proc_rank}
+          parallel: yes
+    default_pytorch_mode: &default_pytorch_mode
+      enable: true
+      modes:
+        - name: torch.distributed
+          proc_num: 8
+          node_num: 1
+      frameworks:
+        - pytorch
+    common_model_config: &common_model_config
+      model_ddp_parameter: &model_ddp_param
+        duration: 0
+        num_warmup: 128
+        num_steps: 512
+        sample_count: 8192
+        batch_size: 128
+        precision: [float32, float16]
+        model_action: [train]
+        pin_memory: yes
+        num_workers: 0
+  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+      parameters:
+        m: 7680
+        n: 8192
+        k: 8192
+    hipblaslt-gemm:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: HIP_VISIBLE_DEVICES={proc_rank}
+          parallel: yes
+      parameters:
+        in_types: ["fp32", "fp16", "bf16", 'fp8']
+        tolerant_fail: yes
+        num_warmup: 100
+        num_steps: 1000
+        shapes:
+          - 4096,4096,4096
+          - 8192,8192,8192
+          - 16384,16384,16384
+    rccl-bw:
+      enable: true
+      modes:
+        - name: mpi
+          proc_num: 8
+          node_num: 1
+          mca:
+            pml: ob1
+            btl: ^openib
+            btl_tcp_if_exclude: lo,docker0
+            coll_hcoll_enable: 0
+      parameters:
+        maxbytes: 16G
+        ngpus: 1
+        operation: allreduce
+    cpu-memory-bw-latency:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        tests:
+          - bandwidth_matrix
+          - latency_matrix
+          - max_bandwidth
+    mem-bw:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4))
+          parallel: no
+    ib-loopback:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 16
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
+          parallel: no
+      parameters:
+        msg_size: 8388608
+    disk-benchmark:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        block_devices: []
+    gpu-copy-bw:correctness:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
+        copy_type: [sm, dma]
+        size: 4096
+        num_warm_up: 0
+        num_loops: 1
+        check_data: true
+    gpu-copy-bw:perf:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
+        copy_type: [sm, dma]
+    ib-traffic:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 1
+          mca:
+            btl: tcp,self
+            pml: ob1
+            btl_tcp_if_include: ens17f0
+    gpcnet-network-test:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 1
+          mca:
+            pml: ucx
+            btl: ^uct
+            btl_tcp_if_include: ens17f0
+    tcp-connectivity:
+      enable: false
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        port: 22
+    dist-inference:
+      modes:
+        - name: mpi
+          proc_num: 8
+          node_num: 1
+          mca:
+            pml: ob1
+            btl: ^openib
+            btl_tcp_if_exclude: lo,docker0
+            coll_hcoll_enable: 0
+      frameworks:
+        - pytorch
+      parameters:
+        num_layers: 50
+        num_warmup: 20
+        num_steps: 100
+        use_cuda_graph: true
+        precision: float16
+        hidden_size: 128
+        input_size: 128
+        batch_size: 1024
+    model-benchmarks:gpt:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - gpt2-small
+        - gpt2-large
+      parameters:
+        <<: *model_ddp_param
+        precision: [float32, float16, fp8_hybrid]
+        batch_size: 32
+        seq_len: 224
+    model-benchmarks:bert:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - bert-base
+        - bert-large
+      parameters:
+        <<: *model_ddp_param
+        precision: [float32, float16, fp8_hybrid]
+        seq_len: 224
+    model-benchmarks:lstm:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - lstm
+      parameters:
+        <<: *model_ddp_param
+        batch_size: 1024
+        input_size: 224
+        hidden_size: 1000
+        seq_len: 32
+    model-benchmarks:resnet:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - resnet50
+        - resnet101
+        - resnet152
+      parameters:
+        <<: *model_ddp_param
+        batch_size: 384
+    model-benchmarks:densenet:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - densenet169
+        - densenet201
+      parameters:
+        <<: *model_ddp_param
+    model-benchmarks:vgg:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - vgg11
+        - vgg13
+        - vgg16
+        - vgg19
+      parameters:
+        <<: *model_ddp_param
diff --git a/superbench/config/azure_ndv5.yaml b/superbench/config/azure_ndv5.yaml
new file mode 100644
index 000000000..d4e030b4c
--- /dev/null
+++ b/superbench/config/azure_ndv5.yaml
@@ -0,0 +1,305 @@
+# SuperBench Config
+version: v0.11
+superbench:
+  enable:
+  monitor:
+    enable: true
+    sample_duration: 1
+    sample_interval: 10
+  var:
+    default_local_mode: &default_local_mode
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
+          parallel: yes
+    default_pytorch_mode: &default_pytorch_mode
+      enable: true
+      modes:
+        - name: torch.distributed
+          proc_num: 8
+          node_num: 1
+      frameworks:
+        - pytorch
+    common_model_config: &common_model_config
+      duration: 0
+      num_warmup: 128
+      num_steps: 512
+      sample_count: 8192
+      batch_size: 128
+      precision: [float32, float16]
+      model_action: [train]
+      pin_memory: yes
+      num_workers: 0
+  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+      parameters:
+        precision: ["fp64", "fp32", "fp16", "fp64_tc","tf32_tc", "bf16_tc", "fp16_tc", "int8_tc"]
+    cublaslt-gemm:
+      <<: *default_local_mode
+      parameters:
+        in_types: ['fp8e4m3', 'fp8e5m2', 'fp64', 'fp32', 'fp16', 'bf16', 'int8']
+        shapes:
+          - 4096,4096,4096
+          - 8192,8192,8192
+          - 16384,16384,16384
+    gpu-burn:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        time: 900
+        doubles: true
+        tensor_core: true
+    nccl-bw:default:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        ngpus: 8
+    nccl-bw:gdr-only:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+          env:
+            NCCL_IB_PCI_RELAXED_ORDERING: '1'
+            NCCL_NET_GDR_LEVEL: '5'
+            NCCL_P2P_DISABLE: '1'
+            NCCL_SHM_DISABLE: '1'
+            NCCL_MIN_NCHANNELS: '16'
+            NCCL_IB_DISABLE: '0'
+      parameters:
+        ngpus: 8
+    nccl-lat:default:
+      enable: true
+      modes:
+        - name: mpi
+          proc_num: 8
+          node_num: 1
+      parameters:
+        maxbytes: 16M
+        warmup_iters: 20
+        iters: 1000
+        graph_iters: 1
+    ib-loopback:
+      timeout: *default_timeout
+      modes:
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=0,0,1,1
+          parallel: yes
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=0,0,1,1
+          parallel: yes
+    cpu-memory-bw-latency:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        tests:
+          - bandwidth_matrix
+          - latency_matrix
+          - max_bandwidth
+    mem-bw:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
+          parallel: no
+    disk-benchmark:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        block_devices:
+          - /dev/nvme0n1
+          - /dev/nvme1n1
+          - /dev/nvme2n1
+          - /dev/nvme3n1
+          - /dev/nvme4n1
+          - /dev/nvme5n1
+          - /dev/nvme6n1
+          - /dev/nvme7n1
+        seq_read_runtime: 60
+        seq_write_runtime: 60
+        seq_readwrite_runtime: 60
+        rand_read_runtime: 60
+        rand_write_runtime: 60
+        rand_readwrite_runtime: 60
+    gpu-copy-bw:correctness:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
+        copy_type: [sm, dma]
+        size: 4096
+        num_warm_up: 0
+        num_loops: 1
+        check_data: true
+    gpu-copy-bw:perf:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
+        copy_type: [sm, dma]
+    cudnn-function:
+      <<: *default_local_mode
+    cublas-function:
+      <<: *default_local_mode
+    matmul:
+      <<: *default_local_mode
+      frameworks:
+        - pytorch
+    sharding-matmul:
+      <<: *default_pytorch_mode
+    computation-communication-overlap:
+      <<: *default_pytorch_mode
+    dist-inference:
+      enable: true
+      timeout: 600
+      modes:
+        - name: mpi
+          proc_num: 8
+          node_num: 1
+          env:
+            NCCL_TOPO_FILE: '/opt/microsoft/ndv5-topo.xml'
+      frameworks:
+        - pytorch
+      parameters:
+        num_layers: 50
+        num_warmup: 20
+        num_steps: 100
+        use_cuda_graph: true
+        precision: float16
+        hidden_size: 128
+        input_size: 128
+        batch_size: 1024
+    ib-traffic:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 8
+      parameters:
+        msg_size: 8388608
+        ib_dev: mlx5_$LOCAL_RANK
+        gpu_dev: $LOCAL_RANK
+        numa_dev: $((LOCAL_RANK/2))
+    gpcnet-network-test:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 1
+          mca:
+            pml: ucx
+            btl: ^uct
+            btl_tcp_if_include: eth0
+    gpcnet-network-load-test:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 1
+          mca:
+            pml: ucx
+            btl: ^uct
+            btl_tcp_if_include: eth0
+    tcp-connectivity:
+      enable: false
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        port: 22
+    ort-inference:
+      <<: *default_local_mode
+    tensorrt-inference:
+      <<: *default_local_mode
+      parameters:
+        pytorch_models:
+          - resnet50
+          - resnet101
+          - resnet152
+          - densenet169
+          - densenet201
+          - bert-base
+          - bert-large
+        seq_length: 224
+        batch_size: 32
+        precision: int8
+    model-benchmarks:gpt:
+      <<: *default_pytorch_mode
+      models:
+        - gpt2-small
+        - gpt2-large
+      parameters:
+        <<: *common_model_config
+        precision: [float32, float16, fp8_hybrid]
+        batch_size: 32
+        seq_len: 224
+    model-benchmarks:bert:
+      <<: *default_pytorch_mode
+      models:
+        - bert-base
+        - bert-large
+      parameters:
+        <<: *common_model_config
+        precision: [float32, float16, fp8_hybrid]
+        seq_len: 224
+    model-benchmarks:lstm:
+      <<: *default_pytorch_mode
+      models:
+        - lstm
+      parameters:
+        <<: *common_model_config
+        batch_size: 1024
+        input_size: 224
+        hidden_size: 1000
+        seq_len: 32
+        pin_memory: no
+    model-benchmarks:resnet:
+      <<: *default_pytorch_mode
+      models:
+        - resnet50
+        - resnet101
+        - resnet152
+      parameters:
+        <<: *common_model_config
+        batch_size: 384
+        num_steps: 512
+    model-benchmarks:densenet:
+      <<: *default_pytorch_mode
+      models:
+        - densenet169
+        - densenet201
+      parameters:
+        <<: *common_model_config
+        pin_memory: no
+    model-benchmarks:vgg:
+      <<: *default_pytorch_mode
+      models:
+        - vgg11
+        - vgg13
+        - vgg16
+        - vgg19
+      parameters:
+        <<: *common_model_config
+        pin_memory: no
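
For quick reference, a minimal sketch of how the two new config files might be exercised on a single node, assuming the standard `sb deploy`/`sb run` CLI options; the host inventory `local.ini` and the image placeholder are illustrative and not part of this patch:

# Hypothetical smoke test; replace <superbench-docker-image> and local.ini
# with the image tag and host file for your environment.
sb deploy -f local.ini -i <superbench-docker-image>

# AMD MI300 node: run with the ROCm config added in this patch.
sb run -f local.ini -c superbench/config/amd_mi300.yaml

# Azure NDv5 node: run with the CUDA config added in this patch.
sb run -f local.ini -c superbench/config/azure_ndv5.yaml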