From 204cc7f96bcee89ea8a72692c5e433d0fa0be17d Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Thu, 31 Oct 2024 14:46:08 -0700 Subject: [PATCH 1/8] add aarch64 docker build --- dockerfile/cuda12.4-arm.dockerfile | 139 +++++++++++ setup.py | 4 +- .../cpu_stream_performance.py | 4 +- .../cuda_decode_performance/CMakeLists.txt | 217 +++++++++--------- third_party/Makefile | 12 +- third_party/stream-tests/Makefile | 29 ++- 6 files changed, 290 insertions(+), 115 deletions(-) create mode 100644 dockerfile/cuda12.4-arm.dockerfile diff --git a/dockerfile/cuda12.4-arm.dockerfile b/dockerfile/cuda12.4-arm.dockerfile new file mode 100644 index 000000000..03c8834f8 --- /dev/null +++ b/dockerfile/cuda12.4-arm.dockerfile @@ -0,0 +1,139 @@ +FROM nvcr.io/nvidia/pytorch:24.05-py3 + +# OS: +# - Ubuntu: 22.04 +# - OpenMPI: 4.1.5rc2 +# - Docker Client: 20.10.8 +# NVIDIA: +# - CUDA: 12.4.1 +# - CUDA Driver: 550.54.15 +# - cuBLAS: 12.4.5.8 +# - cuDNN: 9.1.0.70 +# - NCCL: 2.21.5 +# Mellanox: +# - OFED: 24.04-0.6.6.0 +# - HPC-X: v2.19 +# Intel: +# - mlc: v3.11a + +LABEL maintainer="SuperBench" + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + autoconf \ + automake \ + bc \ + build-essential \ + curl \ + dmidecode \ + ffmpeg \ + git \ + iproute2 \ + jq \ + libaio-dev \ + libavcodec-dev \ + libavformat-dev \ + libavutil-dev \ + libboost-program-options-dev \ + libcap2 \ + libcurl4-openssl-dev \ + libnuma-dev \ + libpci-dev \ + libswresample-dev \ + libtinfo5 \ + libtool \ + lshw \ + python3-mpi4py \ + net-tools \ + nlohmann-json3-dev \ + openssh-client \ + openssh-server \ + pciutils \ + sudo \ + util-linux \ + vim \ + wget \ + && \ + apt-get autoremove && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* + +ARG NUM_MAKE_JOBS= + +# Install Docker +ENV DOCKER_VERSION=20.10.8 +RUN cd /tmp && \ + wget -q https://download.docker.com/linux/static/stable/aarch64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ + tar 
--extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ + rm docker.tgz + +# Update system config +RUN mkdir -p /root/.ssh && \ + touch /root/.ssh/authorized_keys && \ + mkdir -p /var/run/sshd && \ + sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \ + echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \ + echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf + +# Install OFED +ENV OFED_VERSION=24.04-0.6.6.0 +RUN cd /tmp && \ + wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64.tgz && \ + MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ + rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* + +# Install HPC-X +ENV HPCX_VERSION=v2.19 +RUN cd /opt && \ + rm -rf hpcx && \ + wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-aarch64.tbz -O hpcx.tbz && \ + tar xf hpcx.tbz && \ + mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-aarch64 hpcx && \ + rm hpcx.tbz + +# Install Intel MLC +RUN cd /tmp && \ + wget -q https://downloadmirror.intel.com/822971/mlc_v3.11a.tgz -O mlc.tgz && \ + tar xzf mlc.tgz Linux/mlc && \ + cp ./Linux/mlc /usr/local/bin/ && \ + rm -rf ./Linux mlc.tgz + +# Install NCCL 2.21.5 +RUN cd /tmp && \ + git clone -b v2.21.5-1 https://github.com/NVIDIA/nccl.git && \ + cd nccl && \ + make -j src.build && \ + make install && \ + rm -rf /tmp/nccl + +ENV PATH="${PATH}" \ + LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \ + SB_HOME=/opt/superbench_dev \ + 
SB_MICRO_PATH=/opt/superbench_dev \ + ANSIBLE_DEPRECATION_WARNINGS=FALSE \ + ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections + +RUN echo PATH="$PATH" > /etc/environment && \ + echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ + echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment && \ + echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" >> /etc/bash.bashrc + +# Add config files +ADD dockerfile/etc /opt/microsoft/ + +WORKDIR ${SB_HOME} + +ADD third_party third_party +RUN make -C third_party cuda_with_msccl + +ADD . . +RUN python3 -m pip install --upgrade setuptools==65.7 && \ + python3 -m pip install --no-cache-dir .[nvworker] && \ + make cppbuild && \ + make postinstall && \ + rm -rf .git + diff --git a/setup.py b/setup.py index 686bef0b9..13bf9d044 100644 --- a/setup.py +++ b/setup.py @@ -215,8 +215,8 @@ def run(self): ], 'ort': [ 'onnx>=1.10.2', - 'onnxruntime-gpu==1.10.0; python_version<"3.10"', - 'onnxruntime-gpu; python_version>="3.10"', + 'onnxruntime-gpu==1.10.0; python_version<"3.10" and platform_machine != "aarch64"', + 'onnxruntime-gpu; python_version>="3.10" and platform_machine != "aarch64"', ], 'nvidia': ['py3nvml>=0.2.6'], 'amd': ['amdsmi'], diff --git a/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py b/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py index 6045e8868..57b4eb7db 100644 --- a/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py +++ b/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py @@ -23,7 +23,7 @@ def __init__(self, name, parameters=''): super().__init__(name, parameters) self._bin_name = 'streamZen3.exe' - self.__cpu_arch = ['other', 'zen3', 'zen4'] + self.__cpu_arch = ['other', 'zen3', 'zen4', 'neo2'] def add_parser_arguments(self): """Add the specified arguments.""" @@ -80,6 +80,8 @@ def _preprocess(self): exe = 'streamZen3.exe' elif self._args.cpu_arch == 'zen4': exe = 'streamZen4.exe' + elif self._args.cpu_arch == 'neo2': + exe = 
'streamNeo2.exe' else: exe = 'streamx86.exe' diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt index 83cb15067..907f616fa 100644 --- a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt @@ -4,114 +4,121 @@ cmake_minimum_required(VERSION 3.18) project(cuda_decode_performance) -find_package(CUDA QUIET) -if(CUDA_FOUND) - set(CMAKE_CXX_STANDARD 17) - set(CMAKE_CXX_STANDARD_REQUIRED ON) - - set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples) - set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface) - set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils) - set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec) - set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder) - - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - find_package(PkgConfig REQUIRED) - pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec) - pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat) - pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil) - pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample) - - set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS}) - find_library(AVCODEC_LIBRARY NAMES avcodec - HINTS - ${PC_AVCODEC_LIBDIR} - ${PC_AVCODEC_LIBRARY_DIRS} + + # Check architecture + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") + message(WARNING "Skipping Cuda decode Performance build. 
This build only supports x86_64 arch.") + else() + find_package(CUDA QUIET) + if(CUDA_FOUND) + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + + set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples) + set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface) + set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils) + set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec) + set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + find_package(PkgConfig REQUIRED) + pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec) + pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat) + pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil) + pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample) + + set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS}) + find_library(AVCODEC_LIBRARY NAMES avcodec + HINTS + ${PC_AVCODEC_LIBDIR} + ${PC_AVCODEC_LIBRARY_DIRS} + ) + find_library(AVFORMAT_LIBRARY NAMES avformat + HINTS + ${PC_AVFORMAT_LIBDIR} + ${PC_AVFORMAT_LIBRARY_DIRS} + ) + find_library(AVUTIL_LIBRARY NAMES avutil + HINTS + ${PC_AVUTIL_LIBDIR} + ${PC_AVUTIL_LIBRARY_DIRS} + ) + find_library(SWRESAMPLE_LIBRARY NAMES swresample + HINTS + ${PC_SWRESAMPLE_LIBDIR} + ${PC_SWRESAMPLE_LIBRARY_DIRS} + ) + set(AVCODEC_LIB ${AVCODEC_LIBRARY}) + set(AVFORMAT_LIB ${AVFORMAT_LIBRARY}) + set(AVUTIL_LIB ${AVUTIL_LIBRARY}) + set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY}) + endif() + + set(APP_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp ) - find_library(AVFORMAT_LIBRARY NAMES avformat - HINTS - ${PC_AVFORMAT_LIBDIR} - ${PC_AVFORMAT_LIBRARY_DIRS} + + set(NV_DEC_SOURCES + ${NV_DEC_DIR}/NvDecoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp ) - find_library(AVUTIL_LIBRARY NAMES avutil - HINTS - ${PC_AVUTIL_LIBDIR} - ${PC_AVUTIL_LIBRARY_DIRS} + + set(NV_DEC_HDRS + ${NV_DEC_DIR}/NvDecoder.h + 
${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h + ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h + ${NVCODEC_UTILS_DIR}/NvCodecUtils.h + ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h + ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h ) - find_library(SWRESAMPLE_LIBRARY NAMES swresample + + source_group( "headers" FILES ${NV_DEC_HDRS} ) + source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES}) + set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}") + find_package(CUDA) + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\") + if ( CMAKE_COMPILER_IS_GNUCC ) + if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) + list(APPEND CUDA_NVCC_FLAGS -std=c++11) + endif() + endif() + + # Check if the file exists + if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" ) + execute_process( + COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so + RESULT_VARIABLE result + ) + if(result) + message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}") + endif() + endif () + + find_library(CUVID_LIB nvcuvid HINTS - ${PC_SWRESAMPLE_LIBDIR} - ${PC_SWRESAMPLE_LIBRARY_DIRS} + "/usr/local/lib/" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/" ) - set(AVCODEC_LIB ${AVCODEC_LIBRARY}) - set(AVFORMAT_LIB ${AVFORMAT_LIBRARY}) - set(AVUTIL_LIB ${AVUTIL_LIBRARY}) - set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY}) - endif() - - set(APP_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp - ) - - set(NV_DEC_SOURCES - ${NV_DEC_DIR}/NvDecoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp - ) - - set(NV_DEC_HDRS - ${NV_DEC_DIR}/NvDecoder.h - ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h - ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h - 
${NVCODEC_UTILS_DIR}/NvCodecUtils.h - ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h - ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h - ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h - ) - - source_group( "headers" FILES ${NV_DEC_HDRS} ) - source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES}) - set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}") - find_package(CUDA) - set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\") - if ( CMAKE_COMPILER_IS_GNUCC ) - if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) - list(APPEND CUDA_NVCC_FLAGS -std=c++11) + + cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS}) + + set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS} + ${NVCODEC_PUBLIC_INTERFACE_DIR} + ${NVCODEC_UTILS_DIR} + ${NV_CODEC_DIR} + ${NV_APPDEC_COMMON_DIR} + ${NV_FFMPEG_HDRS} + ${THIRD_PARTY_SAMPLE_DIR} + ) + + target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB} + ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB}) + + install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib) endif() - endif() - - # Check if the file exists - if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" ) - execute_process( - COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so - RESULT_VARIABLE result - ) - if(result) - message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}") - endif() - endif () - - find_library(CUVID_LIB nvcuvid - HINTS - "/usr/local/lib/" - "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/" - ) - - cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} 
${NV_DEC_SOURCES} ${NV_DEC_HDRS}) - - set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS} - ${NVCODEC_PUBLIC_INTERFACE_DIR} - ${NVCODEC_UTILS_DIR} - ${NV_CODEC_DIR} - ${NV_APPDEC_COMMON_DIR} - ${NV_FFMPEG_HDRS} - ${THIRD_PARTY_SAMPLE_DIR} - ) - - target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB} - ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB}) - - install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib) -endif() + + endif() diff --git a/third_party/Makefile b/third_party/Makefile index 7abac4fb4..383a206dd 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -18,7 +18,14 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2) .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm -# Build all targets. +# Build targets. +ifeq ($(shell uname -m), aarch64) +all: cuda +cuda_with_msccl: cuda cuda_msccl +cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_gpuburn megatron_lm megatron_deepspeed +cpu: common cpu_perftest +common: cpu_stream fio +else all: cuda rocm cuda_with_msccl: cuda cuda_msccl cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed @@ -26,6 +33,7 @@ rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_ cpu: common cpu_perftest common: cpu_hpl cpu_stream fio directx_amd: directx_amf_encoding_latency +endif # Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed. 
sb_micro_path: @@ -59,7 +67,7 @@ else endif if [ -d cuda-samples ]; then rm -rf cuda-samples; fi git clone -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git - cd ./$(TEST_PATH) && make clean && make TARGET_ARCH=x86_64 SMS=$(ARCHS) + cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS) cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/ # Build nccl-tests from commit 8274cb4 of default branch. diff --git a/third_party/stream-tests/Makefile b/third_party/stream-tests/Makefile index a5ed5ff35..8a86c0c59 100644 --- a/third_party/stream-tests/Makefile +++ b/third_party/stream-tests/Makefile @@ -1,16 +1,28 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -CC= /opt/AMD/aocc-compiler-4.0.0/bin/clang -CFLAGS= -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10 -GENFLAGS= -DSTREAM_ARRAY_SIZE=400000000 -ZEN3FLAGS= -DSTREAM_ARRAY_SIZE=400000000 -march=znver3 -ZEN4FLAGS= -DSTREAM_ARRAY_SIZE=800000000 -march=znver4 +GENFLAGS= -DSTREAM_ARRAY_SIZE=400000000 +ZEN3FLAGS= -DSTREAM_ARRAY_SIZE=400000000 -march=znver3 +ZEN4FLAGS= -DSTREAM_ARRAY_SIZE=800000000 -march=znver4 +NEO2FLAGS= -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2 + GEN_OUTPUT= streamx86.exe ZEN3_OUTPUT= streamZen3.exe ZEN4_OUTPUT= streamZen4.exe +NEO2_OUTPUT= streamNeo2.exe + +ARCH := $(shell uname -m) +ifeq ($(ARCH), aarch64) +CFLAGS = -Ofast -fopenmp -DNTIMES=200 +CC=gcc +all: NEO2 +else +CC= /opt/AMD/aocc-compiler-4.0.0/bin/clang +CFLAGS= -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10 all: ZEN3 ZEN4 X86 +endif + ZEN3: stream.c $(CC) $(CFLAGS) $(ZEN3FLAGS) stream.c -o $(ZEN3_OUTPUT) @@ -18,6 +30,13 @@ ZEN4: $(CC) $(CFLAGS) $(ZEN4FLAGS) stream.c -o $(ZEN4_OUTPUT) X86: $(CC) $(CFLAGS) $(GENFLAGS) stream.c -o $(GEN_OUTPUT) +NEO2: + $(CC) $(CFLAGS) $(NEO2FLAGS) stream.c -o $(NEO2_OUTPUT) +ifeq ($(ARCH), aarch64) +clean: + rm $(NEO2_OUTPUT) +else clean: rm $(GEN_OUTPUT) $(ZEN3_OUTPUT) 
$(ZEN4_OUTPUT) +endif From 967213363bfd708dad10b24e9e161ad0d61eb2ed Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Thu, 31 Oct 2024 15:03:42 -0700 Subject: [PATCH 2/8] add docker build template. commented out --- .github/workflows/build-image.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 05e4dd447..13eead127 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -30,6 +30,12 @@ jobs: tags: superbench/main:cuda12.4 runner: [self-hosted, rocm-build] build_args: "NUM_MAKE_JOBS=8" + # # TODO: Enable ARM build, check for hosted aarch64 runner availability + # - name: cuda12.4-arm + # dockerfile: cuda12.4 + # tags: superbench/main:cuda12.4 + # runner: [self-hosted, aarch64-build] + # build_args: "NUM_MAKE_JOBS=8" - name: cuda12.2 dockerfile: cuda12.2 tags: superbench/main:cuda12.2 From 3e7136f8f94832aae9b719e6cbbba1dd9f9864e6 Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Fri, 1 Nov 2024 12:42:50 -0700 Subject: [PATCH 3/8] fix PR comments cleanup thirdparty Makefile and stream tests makefile. fix mising gpcnet. fix lint in cuda decode perf Makefile. 
--- setup.py | 4 +- .../cuda_decode_performance/CMakeLists.txt | 235 +++++++++--------- third_party/Makefile | 15 +- third_party/stream-tests/Makefile | 25 +- 4 files changed, 137 insertions(+), 142 deletions(-) diff --git a/setup.py b/setup.py index 13bf9d044..93d53639c 100644 --- a/setup.py +++ b/setup.py @@ -215,8 +215,8 @@ def run(self): ], 'ort': [ 'onnx>=1.10.2', - 'onnxruntime-gpu==1.10.0; python_version<"3.10" and platform_machine != "aarch64"', - 'onnxruntime-gpu; python_version>="3.10" and platform_machine != "aarch64"', + 'onnxruntime-gpu==1.10.0; python_version<"3.10" and platform_machine == "x86_64"', + 'onnxruntime-gpu; python_version>="3.10" and platform_machine == "x86_64"', ], 'nvidia': ['py3nvml>=0.2.6'], 'amd': ['amdsmi'], diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt index 907f616fa..1022aed3d 100644 --- a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt @@ -4,121 +4,120 @@ cmake_minimum_required(VERSION 3.18) project(cuda_decode_performance) - - # Check architecture - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") - message(WARNING "Skipping Cuda decode Performance build. 
This build only supports x86_64 arch.") - else() - find_package(CUDA QUIET) - if(CUDA_FOUND) - set(CMAKE_CXX_STANDARD 17) - set(CMAKE_CXX_STANDARD_REQUIRED ON) - - set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples) - set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface) - set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils) - set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec) - set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder) - - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - find_package(PkgConfig REQUIRED) - pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec) - pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat) - pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil) - pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample) - - set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS}) - find_library(AVCODEC_LIBRARY NAMES avcodec - HINTS - ${PC_AVCODEC_LIBDIR} - ${PC_AVCODEC_LIBRARY_DIRS} - ) - find_library(AVFORMAT_LIBRARY NAMES avformat - HINTS - ${PC_AVFORMAT_LIBDIR} - ${PC_AVFORMAT_LIBRARY_DIRS} - ) - find_library(AVUTIL_LIBRARY NAMES avutil - HINTS - ${PC_AVUTIL_LIBDIR} - ${PC_AVUTIL_LIBRARY_DIRS} - ) - find_library(SWRESAMPLE_LIBRARY NAMES swresample - HINTS - ${PC_SWRESAMPLE_LIBDIR} - ${PC_SWRESAMPLE_LIBRARY_DIRS} - ) - set(AVCODEC_LIB ${AVCODEC_LIBRARY}) - set(AVFORMAT_LIB ${AVFORMAT_LIBRARY}) - set(AVUTIL_LIB ${AVUTIL_LIBRARY}) - set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY}) - endif() - - set(APP_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp - ) - - set(NV_DEC_SOURCES - ${NV_DEC_DIR}/NvDecoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp - ) - - set(NV_DEC_HDRS - ${NV_DEC_DIR}/NvDecoder.h - ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h - ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h - ${NVCODEC_UTILS_DIR}/NvCodecUtils.h - ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h - ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h - 
${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h - ) - - source_group( "headers" FILES ${NV_DEC_HDRS} ) - source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES}) - set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}") - find_package(CUDA) - set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\") - if ( CMAKE_COMPILER_IS_GNUCC ) - if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) - list(APPEND CUDA_NVCC_FLAGS -std=c++11) - endif() - endif() - - # Check if the file exists - if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" ) - execute_process( - COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so - RESULT_VARIABLE result - ) - if(result) - message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}") - endif() - endif () - - find_library(CUVID_LIB nvcuvid - HINTS - "/usr/local/lib/" - "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/" - ) - - cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS}) - - set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS} - ${NVCODEC_PUBLIC_INTERFACE_DIR} - ${NVCODEC_UTILS_DIR} - ${NV_CODEC_DIR} - ${NV_APPDEC_COMMON_DIR} - ${NV_FFMPEG_HDRS} - ${THIRD_PARTY_SAMPLE_DIR} - ) - - target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB} - ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB}) - - install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib) - endif() - - endif() +# Check architecture +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") + message(WARNING "Skipping Cuda decode Performance build. 
This build only supports x86_64 arch.") +else() + find_package(CUDA QUIET) + if(CUDA_FOUND) + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + + set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples) + set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface) + set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils) + set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec) + set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + find_package(PkgConfig REQUIRED) + pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec) + pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat) + pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil) + pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample) + + set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS}) + find_library(AVCODEC_LIBRARY NAMES avcodec + HINTS + ${PC_AVCODEC_LIBDIR} + ${PC_AVCODEC_LIBRARY_DIRS} + ) + find_library(AVFORMAT_LIBRARY NAMES avformat + HINTS + ${PC_AVFORMAT_LIBDIR} + ${PC_AVFORMAT_LIBRARY_DIRS} + ) + find_library(AVUTIL_LIBRARY NAMES avutil + HINTS + ${PC_AVUTIL_LIBDIR} + ${PC_AVUTIL_LIBRARY_DIRS} + ) + find_library(SWRESAMPLE_LIBRARY NAMES swresample + HINTS + ${PC_SWRESAMPLE_LIBDIR} + ${PC_SWRESAMPLE_LIBRARY_DIRS} + ) + set(AVCODEC_LIB ${AVCODEC_LIBRARY}) + set(AVFORMAT_LIB ${AVFORMAT_LIBRARY}) + set(AVUTIL_LIB ${AVUTIL_LIBRARY}) + set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY}) + endif() + + set(APP_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp + ) + + set(NV_DEC_SOURCES + ${NV_DEC_DIR}/NvDecoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp + ) + + set(NV_DEC_HDRS + ${NV_DEC_DIR}/NvDecoder.h + ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h + ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h + ${NVCODEC_UTILS_DIR}/NvCodecUtils.h + ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h + 
${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h + ) + + source_group( "headers" FILES ${NV_DEC_HDRS} ) + source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES}) + set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}") + find_package(CUDA) + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\") + if ( CMAKE_COMPILER_IS_GNUCC ) + if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) + list(APPEND CUDA_NVCC_FLAGS -std=c++11) + endif() + endif() + + # Check if the file exists + if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" ) + execute_process( + COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so + RESULT_VARIABLE result + ) + if(result) + message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}") + endif() + endif () + + find_library(CUVID_LIB nvcuvid + HINTS + "/usr/local/lib/" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/" + ) + + cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS}) + + set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS} + ${NVCODEC_PUBLIC_INTERFACE_DIR} + ${NVCODEC_UTILS_DIR} + ${NV_CODEC_DIR} + ${NV_APPDEC_COMMON_DIR} + ${NV_FFMPEG_HDRS} + ${THIRD_PARTY_SAMPLE_DIR} + ) + + target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB} + ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB}) + + install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib) + endif() + +endif() diff --git a/third_party/Makefile b/third_party/Makefile index 383a206dd..63ca48f36 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -19,19 +19,16 @@ 
NUM_MAKE_JOBS ?= $(shell nproc --ignore=2) .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm # Build targets. -ifeq ($(shell uname -m), aarch64) -all: cuda -cuda_with_msccl: cuda cuda_msccl -cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_gpuburn megatron_lm megatron_deepspeed -cpu: common cpu_perftest -common: cpu_stream fio -else all: cuda rocm cuda_with_msccl: cuda cuda_msccl cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm -cpu: common cpu_perftest -common: cpu_hpl cpu_stream fio +cpu: common cpu_perftest cpu_stream +common: fio + +# non aarch64 specific targets +ifneq ($(shell uname -m), aarch64) +common: fio cpu_hpl directx_amd: directx_amf_encoding_latency endif diff --git a/third_party/stream-tests/Makefile b/third_party/stream-tests/Makefile index 8a86c0c59..a652defd9 100644 --- a/third_party/stream-tests/Makefile +++ b/third_party/stream-tests/Makefile @@ -1,29 +1,28 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-GENFLAGS= -DSTREAM_ARRAY_SIZE=400000000 -ZEN3FLAGS= -DSTREAM_ARRAY_SIZE=400000000 -march=znver3 -ZEN4FLAGS= -DSTREAM_ARRAY_SIZE=800000000 -march=znver4 -NEO2FLAGS= -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2 +GENFLAGS := -DSTREAM_ARRAY_SIZE=400000000 +ZEN3FLAGS := -DSTREAM_ARRAY_SIZE=400000000 -march=znver3 +ZEN4FLAGS := -DSTREAM_ARRAY_SIZE=800000000 -march=znver4 +NEO2FLAGS := -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2 -GEN_OUTPUT= streamx86.exe -ZEN3_OUTPUT= streamZen3.exe -ZEN4_OUTPUT= streamZen4.exe -NEO2_OUTPUT= streamNeo2.exe +GEN_OUTPUT := streamx86.exe +ZEN3_OUTPUT := streamZen3.exe +ZEN4_OUTPUT := streamZen4.exe +NEO2_OUTPUT := streamNeo2.exe ARCH := $(shell uname -m) ifeq ($(ARCH), aarch64) -CFLAGS = -Ofast -fopenmp -DNTIMES=200 -CC=gcc +CFLAGS := -Ofast -fopenmp -DNTIMES=200 +CC := gcc all: NEO2 else -CC= /opt/AMD/aocc-compiler-4.0.0/bin/clang -CFLAGS= -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10 +CC := /opt/AMD/aocc-compiler-4.0.0/bin/clang +CFLAGS := -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10 all: ZEN3 ZEN4 X86 endif - ZEN3: stream.c $(CC) $(CFLAGS) $(ZEN3FLAGS) stream.c -o $(ZEN3_OUTPUT) ZEN4: From 41a003fcd73e48bf0b578faac0c2dc8fa5858dd7 Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Fri, 1 Nov 2024 16:00:00 -0700 Subject: [PATCH 4/8] unify docker file for arm64 and amd64 --- dockerfile/cuda12.4-arm.dockerfile | 139 ----------------------------- dockerfile/cuda12.4.dockerfile | 47 +++++----- 2 files changed, 27 insertions(+), 159 deletions(-) delete mode 100644 dockerfile/cuda12.4-arm.dockerfile diff --git a/dockerfile/cuda12.4-arm.dockerfile b/dockerfile/cuda12.4-arm.dockerfile deleted file mode 100644 index 03c8834f8..000000000 --- a/dockerfile/cuda12.4-arm.dockerfile +++ /dev/null @@ -1,139 +0,0 @@ -FROM nvcr.io/nvidia/pytorch:24.05-py3 - -# OS: -# - Ubuntu: 22.04 -# - OpenMPI: 4.1.5rc2 -# - Docker Client: 20.10.8 -# NVIDIA: -# 
- CUDA: 12.4.1 -# - CUDA Driver: 550.54.15 -# - cuBLAS: 12.4.5.8 -# - cuDNN: 9.1.0.70 -# - NCCL: 2.21.5 -# Mellanox: -# - OFED: 24.04-0.6.6.0 -# - HPC-X: v2.19 -# Intel: -# - mlc: v3.11a - -LABEL maintainer="SuperBench" - -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - autoconf \ - automake \ - bc \ - build-essential \ - curl \ - dmidecode \ - ffmpeg \ - git \ - iproute2 \ - jq \ - libaio-dev \ - libavcodec-dev \ - libavformat-dev \ - libavutil-dev \ - libboost-program-options-dev \ - libcap2 \ - libcurl4-openssl-dev \ - libnuma-dev \ - libpci-dev \ - libswresample-dev \ - libtinfo5 \ - libtool \ - lshw \ - python3-mpi4py \ - net-tools \ - nlohmann-json3-dev \ - openssh-client \ - openssh-server \ - pciutils \ - sudo \ - util-linux \ - vim \ - wget \ - && \ - apt-get autoremove && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* /tmp/* - -ARG NUM_MAKE_JOBS= - -# Install Docker -ENV DOCKER_VERSION=20.10.8 -RUN cd /tmp && \ - wget -q https://download.docker.com/linux/static/stable/aarch64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ - tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ - rm docker.tgz - -# Update system config -RUN mkdir -p /root/.ssh && \ - touch /root/.ssh/authorized_keys && \ - mkdir -p /var/run/sshd && \ - sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ - sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \ - sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \ - echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \ - echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf - -# Install OFED -ENV OFED_VERSION=24.04-0.6.6.0 -RUN cd /tmp && \ - wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64.tgz && \ - tar xzf 
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64.tgz && \ - MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ - rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* - -# Install HPC-X -ENV HPCX_VERSION=v2.19 -RUN cd /opt && \ - rm -rf hpcx && \ - wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-aarch64.tbz -O hpcx.tbz && \ - tar xf hpcx.tbz && \ - mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-aarch64 hpcx && \ - rm hpcx.tbz - -# Install Intel MLC -RUN cd /tmp && \ - wget -q https://downloadmirror.intel.com/822971/mlc_v3.11a.tgz -O mlc.tgz && \ - tar xzf mlc.tgz Linux/mlc && \ - cp ./Linux/mlc /usr/local/bin/ && \ - rm -rf ./Linux mlc.tgz - -# Install NCCL 2.21.5 -RUN cd /tmp && \ - git clone -b v2.21.5-1 https://github.com/NVIDIA/nccl.git && \ - cd nccl && \ - make -j src.build && \ - make install && \ - rm -rf /tmp/nccl - -ENV PATH="${PATH}" \ - LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \ - SB_HOME=/opt/superbench_dev \ - SB_MICRO_PATH=/opt/superbench_dev \ - ANSIBLE_DEPRECATION_WARNINGS=FALSE \ - ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections - -RUN echo PATH="$PATH" > /etc/environment && \ - echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ - echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment && \ - echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" >> /etc/bash.bashrc - -# Add config files -ADD dockerfile/etc /opt/microsoft/ - -WORKDIR ${SB_HOME} - -ADD third_party third_party -RUN make -C third_party cuda_with_msccl - -ADD . . 
-RUN python3 -m pip install --upgrade setuptools==65.7 && \ - python3 -m pip install --no-cache-dir .[nvworker] && \ - make cppbuild && \ - make postinstall && \ - rm -rf .git - diff --git a/dockerfile/cuda12.4.dockerfile b/dockerfile/cuda12.4.dockerfile index 3ec82f39d..259ece773 100644 --- a/dockerfile/cuda12.4.dockerfile +++ b/dockerfile/cuda12.4.dockerfile @@ -19,6 +19,10 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3 LABEL maintainer="SuperBench" ENV DEBIAN_FRONTEND=noninteractive + +ARG TARGETPLATFORM +ARG TARGETARCH + RUN apt-get update && \ apt-get install -y --no-install-recommends \ autoconf \ @@ -63,8 +67,8 @@ ARG NUM_MAKE_JOBS= # Install Docker ENV DOCKER_VERSION=20.10.8 -RUN cd /tmp && \ - wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ +RUN TARGETARCH_HW=$(uname -m) && \ + wget -q https://download.docker.com/linux/static/stable/${TARGETARCH_HW}/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ rm docker.tgz @@ -80,40 +84,43 @@ RUN mkdir -p /root/.ssh && \ # Install OFED ENV OFED_VERSION=23.07-0.5.1.2 -RUN cd /tmp && \ - wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \ - tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \ - MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ +RUN TARGETARCH_HW=$(uname -m) && \ + cd /tmp && \ + wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}.tgz && \ + MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ rm -rf 
/tmp/MLNX_OFED_LINUX-${OFED_VERSION}* # Install HPC-X ENV HPCX_VERSION=v2.18 -RUN cd /opt && \ +RUN TARGETARCH_HW=$(uname -m) && \ + cd /opt && \ rm -rf hpcx && \ - wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64.tbz -O hpcx.tbz && \ + wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-${TARGETARCH_HW}.tbz -O hpcx.tbz && \ tar xf hpcx.tbz && \ - mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64 hpcx && \ + mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-${TARGETARCH_HW} hpcx && \ rm hpcx.tbz -# Install Intel MLC -RUN cd /tmp && \ +# Installs specific to amd64 platform +RUN if [ "$TARGETARCH" = "amd64" ]; then \ + # Install Intel MLC + cd /tmp && \ wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \ tar xzf mlc.tgz Linux/mlc && \ cp ./Linux/mlc /usr/local/bin/ && \ - rm -rf ./Linux mlc.tgz - -# Install AOCC compiler -RUN cd /tmp && \ + rm -rf ./Linux mlc.tgz && \ + # Install AOCC compiler wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \ apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \ - rm -rf aocc-compiler-4.0.0_1_amd64.deb - -# Install AMD BLIS -RUN cd /tmp && \ + rm -rf aocc-compiler-4.0.0_1_amd64.deb && \ + # Install AMD BLIS wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \ tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \ mv amd-blis /opt/AMD && \ - rm -rf aocl-blis-linux-aocc-4.0.tar.gz + rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \ + else \ + echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \ + fi # Install NCCL 2.23.4 RUN cd /tmp && \ From 42c12eccfe2c1869dda5e48040e25457e5aeeea5 Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Mon, 4 Nov 2024 10:10:29 -0800 Subject: [PATCH 5/8] add arm64 for multiplatform builds --- 
.github/workflows/build-image.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 13eead127..944fa2ec8 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -142,7 +142,7 @@ jobs: id: docker_build uses: docker/build-push-action@v2 with: - platforms: linux/amd64 + platforms: linux/amd64, linux/arm64 context: . file: ${{ steps.metadata.outputs.dockerfile }} push: ${{ github.event_name != 'pull_request' }} From 0da1d89285f673665cfedcce3995432cb7a0edc5 Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Mon, 4 Nov 2024 10:33:34 -0800 Subject: [PATCH 6/8] specify platforms per docker build --- .github/workflows/build-image.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 3e389410b..8e877de67 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -28,21 +28,25 @@ jobs: - name: cuda12.4 dockerfile: cuda12.4 tags: superbench/main:cuda12.4 + platforms: linux/amd64, linux/arm64 runner: [self-hosted] build_args: "NUM_MAKE_JOBS=16" - name: cuda12.2 dockerfile: cuda12.2 tags: superbench/main:cuda12.2 + platforms: linux/amd64 runner: [self-hosted] build_args: "NUM_MAKE_JOBS=16" - name: cuda11.1.1 dockerfile: cuda11.1.1 tags: superbench/main:cuda11.1.1,superbench/superbench:latest + platforms: linux/amd64 runner: ubuntu-latest build_args: "NUM_MAKE_JOBS=8" - name: rocm6.2 dockerfile: rocm6.2.x tags: superbench/main:rocm6.2 + platforms: linux/amd64 runner: [self-hosted] build_args: "NUM_MAKE_JOBS=16" steps: @@ -125,7 +129,7 @@ jobs: id: docker_build uses: docker/build-push-action@v2 with: - platforms: linux/amd64, linux/arm64 + platforms: ${{ matrix.platforms }} context: . 
file: ${{ steps.metadata.outputs.dockerfile }} push: ${{ github.event_name != 'pull_request' }} From 6df745bd23eadfeed634f835e2c7c8504955730a Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Tue, 5 Nov 2024 18:14:07 -0800 Subject: [PATCH 7/8] disable arm64 build. fix lint in dockerfile --- .github/workflows/build-image.yml | 2 +- dockerfile/cuda12.4.dockerfile | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 8e877de67..fdfe9114c 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -28,7 +28,7 @@ jobs: - name: cuda12.4 dockerfile: cuda12.4 tags: superbench/main:cuda12.4 - platforms: linux/amd64, linux/arm64 + platforms: linux/amd64 # TODO: linux/arm64 runner: [self-hosted] build_args: "NUM_MAKE_JOBS=16" - name: cuda12.2 diff --git a/dockerfile/cuda12.4.dockerfile b/dockerfile/cuda12.4.dockerfile index 259ece773..e9feb2a3e 100644 --- a/dockerfile/cuda12.4.dockerfile +++ b/dockerfile/cuda12.4.dockerfile @@ -20,9 +20,6 @@ LABEL maintainer="SuperBench" ENV DEBIAN_FRONTEND=noninteractive -ARG TARGETPLATFORM -ARG TARGETARCH - RUN apt-get update && \ apt-get install -y --no-install-recommends \ autoconf \ @@ -64,6 +61,8 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* /tmp/* ARG NUM_MAKE_JOBS= +ARG TARGETPLATFORM +ARG TARGETARCH # Install Docker ENV DOCKER_VERSION=20.10.8 @@ -117,9 +116,9 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then \ wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \ tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \ mv amd-blis /opt/AMD && \ - rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \ + rm -rf aocl-blis-linux-aocc-4.0.tar.gz \ else \ - echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \ + echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH" \ fi # Install NCCL 2.23.4 From 
2252a9633a9c1bdfc4ba97c0419844bc33e2a163 Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Wed, 6 Nov 2024 09:39:10 -0800 Subject: [PATCH 8/8] add required delimiters for shell command parsing --- dockerfile/cuda12.4.dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dockerfile/cuda12.4.dockerfile b/dockerfile/cuda12.4.dockerfile index e9feb2a3e..560f0908a 100644 --- a/dockerfile/cuda12.4.dockerfile +++ b/dockerfile/cuda12.4.dockerfile @@ -116,9 +116,9 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then \ wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \ tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \ mv amd-blis /opt/AMD && \ - rm -rf aocl-blis-linux-aocc-4.0.tar.gz \ + rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \ else \ - echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH" \ + echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \ fi # Install NCCL 2.23.4