From 204cc7f96bcee89ea8a72692c5e433d0fa0be17d Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Thu, 31 Oct 2024 14:46:08 -0700
Subject: [PATCH 1/8] add aarch64 docker build

---
 dockerfile/cuda12.4-arm.dockerfile            | 139 +++++++++++
 setup.py                                      |   4 +-
 .../cpu_stream_performance.py                 |   4 +-
 .../cuda_decode_performance/CMakeLists.txt    | 217 +++++++++---------
 third_party/Makefile                          |  12 +-
 third_party/stream-tests/Makefile             |  29 ++-
 6 files changed, 290 insertions(+), 115 deletions(-)
 create mode 100644 dockerfile/cuda12.4-arm.dockerfile

diff --git a/dockerfile/cuda12.4-arm.dockerfile b/dockerfile/cuda12.4-arm.dockerfile
new file mode 100644
index 000000000..03c8834f8
--- /dev/null
+++ b/dockerfile/cuda12.4-arm.dockerfile
@@ -0,0 +1,139 @@
+FROM nvcr.io/nvidia/pytorch:24.05-py3
+
+# OS:
+#   - Ubuntu: 22.04
+#   - OpenMPI: 4.1.5rc2
+#   - Docker Client: 20.10.8
+# NVIDIA:
+#   - CUDA: 12.4.1
+#   - CUDA Driver: 550.54.15
+#   - cuBLAS: 12.4.5.8
+#   - cuDNN: 9.1.0.70
+#   - NCCL: 2.21.5
+# Mellanox:
+#   - OFED: 24.04-0.6.6.0
+#   - HPC-X: v2.19
+# Intel:
+#   - mlc: v3.11a
+
+LABEL maintainer="SuperBench"
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    autoconf \
+    automake \
+    bc \
+    build-essential \
+    curl \
+    dmidecode \
+    ffmpeg \
+    git \
+    iproute2 \
+    jq \
+    libaio-dev \
+    libavcodec-dev \
+    libavformat-dev \
+    libavutil-dev \
+    libboost-program-options-dev \
+    libcap2 \
+    libcurl4-openssl-dev \
+    libnuma-dev \
+    libpci-dev \
+    libswresample-dev \
+    libtinfo5 \
+    libtool \
+    lshw \
+    python3-mpi4py \
+    net-tools \
+    nlohmann-json3-dev \
+    openssh-client \
+    openssh-server \
+    pciutils \
+    sudo \
+    util-linux \
+    vim \
+    wget \
+    && \
+    apt-get autoremove && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/*
+
+ARG NUM_MAKE_JOBS=
+
+# Install Docker
+ENV DOCKER_VERSION=20.10.8
+RUN cd /tmp && \
+    wget -q https://download.docker.com/linux/static/stable/aarch64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
+    tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
+    rm docker.tgz
+
+# Update system config
+RUN mkdir -p /root/.ssh && \
+    touch /root/.ssh/authorized_keys && \
+    mkdir -p /var/run/sshd && \
+    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
+    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
+    sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
+    echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \
+    echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf
+
+# Install OFED
+ENV OFED_VERSION=24.04-0.6.6.0
+RUN cd /tmp && \
+    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64.tgz && \
+    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64.tgz && \
+    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
+    rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
+
+# Install HPC-X
+ENV HPCX_VERSION=v2.19
+RUN cd /opt && \
+    rm -rf hpcx && \
+    wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-aarch64.tbz -O hpcx.tbz && \
+    tar xf hpcx.tbz && \
+    mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-aarch64 hpcx && \
+    rm hpcx.tbz
+
+# Install Intel MLC
+RUN cd /tmp && \
+    wget -q https://downloadmirror.intel.com/822971/mlc_v3.11a.tgz -O mlc.tgz && \
+    tar xzf mlc.tgz Linux/mlc && \
+    cp ./Linux/mlc /usr/local/bin/ && \
+    rm -rf ./Linux mlc.tgz
+
+# Install NCCL 2.21.5
+RUN cd /tmp && \
+    git clone -b v2.21.5-1 https://github.com/NVIDIA/nccl.git && \
+    cd nccl && \
+    make -j src.build && \
+    make install && \
+    rm -rf /tmp/nccl
+
+ENV PATH="${PATH}" \
+    LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \
+    SB_HOME=/opt/superbench_dev \
+    SB_MICRO_PATH=/opt/superbench_dev \
+    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
+    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
+
+RUN echo PATH="$PATH" > /etc/environment && \
+    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
+    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment && \
+    echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" >> /etc/bash.bashrc
+
+# Add config files
+ADD dockerfile/etc /opt/microsoft/
+
+WORKDIR ${SB_HOME}
+
+ADD third_party third_party
+RUN make -C third_party cuda_with_msccl
+
+ADD . .
+RUN python3 -m pip install --upgrade setuptools==65.7 && \
+    python3 -m pip install --no-cache-dir .[nvworker] && \
+    make cppbuild && \
+    make postinstall && \
+    rm -rf .git
+
diff --git a/setup.py b/setup.py
index 686bef0b9..13bf9d044 100644
--- a/setup.py
+++ b/setup.py
@@ -215,8 +215,8 @@ def run(self):
             ],
             'ort': [
                 'onnx>=1.10.2',
-                'onnxruntime-gpu==1.10.0; python_version<"3.10"',
-                'onnxruntime-gpu; python_version>="3.10"',
+                'onnxruntime-gpu==1.10.0; python_version<"3.10" and platform_machine != "aarch64"',
+                'onnxruntime-gpu; python_version>="3.10" and platform_machine != "aarch64"',
             ],
             'nvidia': ['py3nvml>=0.2.6'],
             'amd': ['amdsmi'],
diff --git a/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py b/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py
index 6045e8868..57b4eb7db 100644
--- a/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py
@@ -23,7 +23,7 @@ def __init__(self, name, parameters=''):
         super().__init__(name, parameters)
 
         self._bin_name = 'streamZen3.exe'
-        self.__cpu_arch = ['other', 'zen3', 'zen4']
+        self.__cpu_arch = ['other', 'zen3', 'zen4', 'neo2']
 
     def add_parser_arguments(self):
         """Add the specified arguments."""
@@ -80,6 +80,8 @@ def _preprocess(self):
             exe = 'streamZen3.exe'
         elif self._args.cpu_arch == 'zen4':
             exe = 'streamZen4.exe'
+        elif self._args.cpu_arch == 'neo2':
+            exe = 'streamNeo2.exe'
         else:
             exe = 'streamx86.exe'
 
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt
index 83cb15067..907f616fa 100644
--- a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt
@@ -4,114 +4,121 @@
 cmake_minimum_required(VERSION 3.18)
 project(cuda_decode_performance)
 
-find_package(CUDA QUIET)
-if(CUDA_FOUND)
-  set(CMAKE_CXX_STANDARD 17)
-  set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-  set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples)
-  set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface)
-  set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils)
-  set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec)
-  set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder)
-
-  if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-    find_package(PkgConfig REQUIRED)
-    pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec)
-    pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat)
-    pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil)
-    pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample)
-
-    set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS})
-    find_library(AVCODEC_LIBRARY NAMES avcodec
-    HINTS
-    ${PC_AVCODEC_LIBDIR}
-    ${PC_AVCODEC_LIBRARY_DIRS}
+
+ # Check architecture
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+    message(WARNING "Skipping Cuda decode Performance build. This build only supports x86_64 arch.")
+ else()
+    find_package(CUDA QUIET)
+    if(CUDA_FOUND)
+    set(CMAKE_CXX_STANDARD 17)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+    set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples)
+    set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface)
+    set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils)
+    set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec)
+    set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder)
+
+    if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+        find_package(PkgConfig REQUIRED)
+        pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec)
+        pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat)
+        pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil)
+        pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample)
+
+        set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS})
+        find_library(AVCODEC_LIBRARY NAMES avcodec
+        HINTS
+        ${PC_AVCODEC_LIBDIR}
+        ${PC_AVCODEC_LIBRARY_DIRS}
+        )
+        find_library(AVFORMAT_LIBRARY NAMES avformat
+        HINTS
+        ${PC_AVFORMAT_LIBDIR}
+        ${PC_AVFORMAT_LIBRARY_DIRS}
+        )
+        find_library(AVUTIL_LIBRARY NAMES avutil
+        HINTS
+        ${PC_AVUTIL_LIBDIR}
+        ${PC_AVUTIL_LIBRARY_DIRS}
+        )
+        find_library(SWRESAMPLE_LIBRARY NAMES swresample
+        HINTS
+        ${PC_SWRESAMPLE_LIBDIR}
+        ${PC_SWRESAMPLE_LIBRARY_DIRS}
+        )
+        set(AVCODEC_LIB ${AVCODEC_LIBRARY})
+        set(AVFORMAT_LIB ${AVFORMAT_LIBRARY})
+        set(AVUTIL_LIB ${AVUTIL_LIBRARY})
+        set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY})
+    endif()
+
+    set(APP_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp
     )
-    find_library(AVFORMAT_LIBRARY NAMES avformat
-    HINTS
-    ${PC_AVFORMAT_LIBDIR}
-    ${PC_AVFORMAT_LIBRARY_DIRS}
+
+    set(NV_DEC_SOURCES
+    ${NV_DEC_DIR}/NvDecoder.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp
     )
-    find_library(AVUTIL_LIBRARY NAMES avutil
-    HINTS
-    ${PC_AVUTIL_LIBDIR}
-    ${PC_AVUTIL_LIBRARY_DIRS}
+
+    set(NV_DEC_HDRS
+    ${NV_DEC_DIR}/NvDecoder.h
+    ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h
+    ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h
+    ${NVCODEC_UTILS_DIR}/NvCodecUtils.h
+    ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h
     )
-    find_library(SWRESAMPLE_LIBRARY NAMES swresample
+
+    source_group( "headers" FILES ${NV_DEC_HDRS} )
+    source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES})
+    set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}")
+    find_package(CUDA)
+    set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\")
+    if ( CMAKE_COMPILER_IS_GNUCC )
+        if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" )
+        list(APPEND CUDA_NVCC_FLAGS -std=c++11)
+        endif()
+    endif()
+
+    # Check if the file exists
+    if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" )
+        execute_process(
+            COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so
+            RESULT_VARIABLE result
+        )
+        if(result)
+            message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}")
+        endif()
+    endif ()
+
+    find_library(CUVID_LIB nvcuvid
     HINTS
-    ${PC_SWRESAMPLE_LIBDIR}
-    ${PC_SWRESAMPLE_LIBRARY_DIRS}
+    "/usr/local/lib/"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/"
     )
-    set(AVCODEC_LIB ${AVCODEC_LIBRARY})
-    set(AVFORMAT_LIB ${AVFORMAT_LIBRARY})
-    set(AVUTIL_LIB ${AVUTIL_LIBRARY})
-    set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY})
-  endif()
-
-  set(APP_SOURCES
-  ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp
-  )
-
-  set(NV_DEC_SOURCES
-  ${NV_DEC_DIR}/NvDecoder.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp
-  )
-
-  set(NV_DEC_HDRS
-  ${NV_DEC_DIR}/NvDecoder.h
-  ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h
-  ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h
-  ${NVCODEC_UTILS_DIR}/NvCodecUtils.h
-  ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h
-  )
-
-  source_group( "headers" FILES ${NV_DEC_HDRS} )
-  source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES})
-  set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}")
-  find_package(CUDA)
-  set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\")
-  if ( CMAKE_COMPILER_IS_GNUCC )
-    if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" )
-      list(APPEND CUDA_NVCC_FLAGS -std=c++11)
+
+    cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS})
+
+    set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+    target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS}
+    ${NVCODEC_PUBLIC_INTERFACE_DIR}
+    ${NVCODEC_UTILS_DIR}
+    ${NV_CODEC_DIR}
+    ${NV_APPDEC_COMMON_DIR}
+    ${NV_FFMPEG_HDRS}
+    ${THIRD_PARTY_SAMPLE_DIR}
+    )
+
+    target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB}
+    ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB})
+
+    install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
     endif()
-  endif()
-
-  # Check if the file exists
-  if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" )
-      execute_process(
-        COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so
-        RESULT_VARIABLE result
-      )  
-      if(result)
-        message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}")
-      endif()
-  endif ()
-
-  find_library(CUVID_LIB nvcuvid
-  HINTS
-  "/usr/local/lib/"
-  "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/"
-  )
-
-  cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS})
-
-  set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
-  target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS}
-  ${NVCODEC_PUBLIC_INTERFACE_DIR}
-  ${NVCODEC_UTILS_DIR}
-  ${NV_CODEC_DIR}
-  ${NV_APPDEC_COMMON_DIR}
-  ${NV_FFMPEG_HDRS}
-  ${THIRD_PARTY_SAMPLE_DIR}
-  )
-
-  target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB}
-  ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB})
-
-  install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
-endif()
+
+ endif()
diff --git a/third_party/Makefile b/third_party/Makefile
index 7abac4fb4..383a206dd 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -18,7 +18,14 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
 
 .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm
 
-# Build all targets.
+# Build targets.
+ifeq ($(shell uname -m), aarch64)
+all: cuda
+cuda_with_msccl: cuda cuda_msccl
+cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_gpuburn megatron_lm megatron_deepspeed
+cpu: common cpu_perftest
+common: cpu_stream fio
+else
 all: cuda rocm
 cuda_with_msccl: cuda cuda_msccl
 cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
@@ -26,6 +33,7 @@ rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_
 cpu: common cpu_perftest
 common: cpu_hpl cpu_stream fio
 directx_amd: directx_amf_encoding_latency
+endif
 
 # Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed.
 sb_micro_path:
@@ -59,7 +67,7 @@ else
 endif
 	if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
 	git clone -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
-	cd ./$(TEST_PATH) && make clean && make TARGET_ARCH=x86_64 SMS=$(ARCHS)
+	cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS)
 	cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/
 
 # Build nccl-tests from commit 8274cb4 of default branch.
diff --git a/third_party/stream-tests/Makefile b/third_party/stream-tests/Makefile
index a5ed5ff35..8a86c0c59 100644
--- a/third_party/stream-tests/Makefile
+++ b/third_party/stream-tests/Makefile
@@ -1,16 +1,28 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-CC= /opt/AMD/aocc-compiler-4.0.0/bin/clang
-CFLAGS= -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10
-GENFLAGS= -DSTREAM_ARRAY_SIZE=400000000 
-ZEN3FLAGS= -DSTREAM_ARRAY_SIZE=400000000 -march=znver3 
-ZEN4FLAGS= -DSTREAM_ARRAY_SIZE=800000000 -march=znver4 
+GENFLAGS= -DSTREAM_ARRAY_SIZE=400000000
+ZEN3FLAGS= -DSTREAM_ARRAY_SIZE=400000000 -march=znver3
+ZEN4FLAGS= -DSTREAM_ARRAY_SIZE=800000000 -march=znver4
+NEO2FLAGS= -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2
+
 GEN_OUTPUT= streamx86.exe
 ZEN3_OUTPUT= streamZen3.exe
 ZEN4_OUTPUT= streamZen4.exe
+NEO2_OUTPUT= streamNeo2.exe
+
+ARCH := $(shell uname -m)
 
+ifeq ($(ARCH), aarch64)
+CFLAGS = -Ofast -fopenmp -DNTIMES=200
+CC=gcc
+all: NEO2
+else
+CC= /opt/AMD/aocc-compiler-4.0.0/bin/clang
+CFLAGS= -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10
 all: ZEN3 ZEN4 X86
+endif
+
 
 ZEN3: stream.c
 	$(CC) $(CFLAGS) $(ZEN3FLAGS) stream.c -o $(ZEN3_OUTPUT)
@@ -18,6 +30,13 @@ ZEN4:
 	$(CC) $(CFLAGS) $(ZEN4FLAGS) stream.c -o $(ZEN4_OUTPUT)
 X86:
 	$(CC) $(CFLAGS) $(GENFLAGS) stream.c -o $(GEN_OUTPUT)
+NEO2:
+	$(CC) $(CFLAGS) $(NEO2FLAGS) stream.c -o $(NEO2_OUTPUT)
 
+ifeq ($(ARCH), aarch64)
+clean:
+	rm $(NEO2_OUTPUT)
+else
 clean:
 	rm $(GEN_OUTPUT) $(ZEN3_OUTPUT) $(ZEN4_OUTPUT)
+endif

From 967213363bfd708dad10b24e9e161ad0d61eb2ed Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Thu, 31 Oct 2024 15:03:42 -0700
Subject: [PATCH 2/8] add docker build template (commented out)

---
 .github/workflows/build-image.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index 05e4dd447..13eead127 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -30,6 +30,12 @@ jobs:
           tags: superbench/main:cuda12.4
           runner: [self-hosted, rocm-build]
           build_args: "NUM_MAKE_JOBS=8"
+        # # TODO: Enable ARM build, check for hosted aarch64 runner availability
+        # - name: cuda12.4-arm
+        #   dockerfile: cuda12.4
+        #   tags: superbench/main:cuda12.4
+        #   runner: [self-hosted, aarch64-build]
+        #   build_args: "NUM_MAKE_JOBS=8"
         - name: cuda12.2
           dockerfile: cuda12.2
           tags: superbench/main:cuda12.2

From 3e7136f8f94832aae9b719e6cbbba1dd9f9864e6 Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Fri, 1 Nov 2024 12:42:50 -0700
Subject: [PATCH 3/8] fix PR comments

Clean up third_party Makefile and stream-tests Makefile. Fix missing gpcnet. Fix lint in cuda decode perf CMakeLists.txt.
---
 setup.py                                      |   4 +-
 .../cuda_decode_performance/CMakeLists.txt    | 235 +++++++++---------
 third_party/Makefile                          |  15 +-
 third_party/stream-tests/Makefile             |  25 +-
 4 files changed, 137 insertions(+), 142 deletions(-)

diff --git a/setup.py b/setup.py
index 13bf9d044..93d53639c 100644
--- a/setup.py
+++ b/setup.py
@@ -215,8 +215,8 @@ def run(self):
             ],
             'ort': [
                 'onnx>=1.10.2',
-                'onnxruntime-gpu==1.10.0; python_version<"3.10" and platform_machine != "aarch64"',
-                'onnxruntime-gpu; python_version>="3.10" and platform_machine != "aarch64"',
+                'onnxruntime-gpu==1.10.0; python_version<"3.10" and platform_machine == "x86_64"',
+                'onnxruntime-gpu; python_version>="3.10" and platform_machine == "x86_64"',
             ],
             'nvidia': ['py3nvml>=0.2.6'],
             'amd': ['amdsmi'],
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt
index 907f616fa..1022aed3d 100644
--- a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt
@@ -4,121 +4,120 @@
 cmake_minimum_required(VERSION 3.18)
 project(cuda_decode_performance)
 
-
- # Check architecture
- if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
-    message(WARNING "Skipping Cuda decode Performance build. This build only supports x86_64 arch.")
- else()
-    find_package(CUDA QUIET)
-    if(CUDA_FOUND)
-    set(CMAKE_CXX_STANDARD 17)
-    set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-    set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples)
-    set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface)
-    set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils)
-    set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec)
-    set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder)
-
-    if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-        find_package(PkgConfig REQUIRED)
-        pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec)
-        pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat)
-        pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil)
-        pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample)
-
-        set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS})
-        find_library(AVCODEC_LIBRARY NAMES avcodec
-        HINTS
-        ${PC_AVCODEC_LIBDIR}
-        ${PC_AVCODEC_LIBRARY_DIRS}
-        )
-        find_library(AVFORMAT_LIBRARY NAMES avformat
-        HINTS
-        ${PC_AVFORMAT_LIBDIR}
-        ${PC_AVFORMAT_LIBRARY_DIRS}
-        )
-        find_library(AVUTIL_LIBRARY NAMES avutil
-        HINTS
-        ${PC_AVUTIL_LIBDIR}
-        ${PC_AVUTIL_LIBRARY_DIRS}
-        )
-        find_library(SWRESAMPLE_LIBRARY NAMES swresample
-        HINTS
-        ${PC_SWRESAMPLE_LIBDIR}
-        ${PC_SWRESAMPLE_LIBRARY_DIRS}
-        )
-        set(AVCODEC_LIB ${AVCODEC_LIBRARY})
-        set(AVFORMAT_LIB ${AVFORMAT_LIBRARY})
-        set(AVUTIL_LIB ${AVUTIL_LIBRARY})
-        set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY})
-    endif()
-
-    set(APP_SOURCES
-    ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp
-    )
-
-    set(NV_DEC_SOURCES
-    ${NV_DEC_DIR}/NvDecoder.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp
-    )
-
-    set(NV_DEC_HDRS
-    ${NV_DEC_DIR}/NvDecoder.h
-    ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h
-    ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h
-    ${NVCODEC_UTILS_DIR}/NvCodecUtils.h
-    ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h
-    )
-
-    source_group( "headers" FILES ${NV_DEC_HDRS} )
-    source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES})
-    set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}")
-    find_package(CUDA)
-    set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\")
-    if ( CMAKE_COMPILER_IS_GNUCC )
-        if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" )
-        list(APPEND CUDA_NVCC_FLAGS -std=c++11)
-        endif()
-    endif()
-
-    # Check if the file exists
-    if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" )
-        execute_process(
-            COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so
-            RESULT_VARIABLE result
-        )
-        if(result)
-            message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}")
-        endif()
-    endif ()
-
-    find_library(CUVID_LIB nvcuvid
-    HINTS
-    "/usr/local/lib/"
-    "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/"
-    )
-
-    cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS})
-
-    set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
-    target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS}
-    ${NVCODEC_PUBLIC_INTERFACE_DIR}
-    ${NVCODEC_UTILS_DIR}
-    ${NV_CODEC_DIR}
-    ${NV_APPDEC_COMMON_DIR}
-    ${NV_FFMPEG_HDRS}
-    ${THIRD_PARTY_SAMPLE_DIR}
-    )
-
-    target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB}
-    ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB})
-
-    install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
-    endif()
-
- endif()
+# Check architecture
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+   message(WARNING "Skipping Cuda decode Performance build. This build only supports x86_64 arch.")
+else()
+   find_package(CUDA QUIET)
+   if(CUDA_FOUND)
+   set(CMAKE_CXX_STANDARD 17)
+   set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+   set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples)
+   set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface)
+   set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils)
+   set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec)
+   set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder)
+
+   if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+       find_package(PkgConfig REQUIRED)
+       pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec)
+       pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat)
+       pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil)
+       pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample)
+
+       set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS})
+       find_library(AVCODEC_LIBRARY NAMES avcodec
+       HINTS
+       ${PC_AVCODEC_LIBDIR}
+       ${PC_AVCODEC_LIBRARY_DIRS}
+       )
+       find_library(AVFORMAT_LIBRARY NAMES avformat
+       HINTS
+       ${PC_AVFORMAT_LIBDIR}
+       ${PC_AVFORMAT_LIBRARY_DIRS}
+       )
+       find_library(AVUTIL_LIBRARY NAMES avutil
+       HINTS
+       ${PC_AVUTIL_LIBDIR}
+       ${PC_AVUTIL_LIBRARY_DIRS}
+       )
+       find_library(SWRESAMPLE_LIBRARY NAMES swresample
+       HINTS
+       ${PC_SWRESAMPLE_LIBDIR}
+       ${PC_SWRESAMPLE_LIBRARY_DIRS}
+       )
+       set(AVCODEC_LIB ${AVCODEC_LIBRARY})
+       set(AVFORMAT_LIB ${AVFORMAT_LIBRARY})
+       set(AVUTIL_LIB ${AVUTIL_LIBRARY})
+       set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY})
+   endif()
+
+   set(APP_SOURCES
+   ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp
+   )
+
+   set(NV_DEC_SOURCES
+   ${NV_DEC_DIR}/NvDecoder.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp
+   )
+
+   set(NV_DEC_HDRS
+   ${NV_DEC_DIR}/NvDecoder.h
+   ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h
+   ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h
+   ${NVCODEC_UTILS_DIR}/NvCodecUtils.h
+   ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h
+   ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h
+   ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h
+   )
+
+   source_group( "headers" FILES ${NV_DEC_HDRS} )
+   source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES})
+   set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}")
+   find_package(CUDA)
+   set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+   set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\")
+   if ( CMAKE_COMPILER_IS_GNUCC )
+       if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" )
+       list(APPEND CUDA_NVCC_FLAGS -std=c++11)
+       endif()
+   endif()
+
+   # Check if the file exists
+   if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" )
+       execute_process(
+           COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so
+           RESULT_VARIABLE result
+       )
+       if(result)
+           message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}")
+       endif()
+   endif ()
+
+   find_library(CUVID_LIB nvcuvid
+   HINTS
+   "/usr/local/lib/"
+   "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/"
+   )
+
+   cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS})
+
+   set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+   target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS}
+   ${NVCODEC_PUBLIC_INTERFACE_DIR}
+   ${NVCODEC_UTILS_DIR}
+   ${NV_CODEC_DIR}
+   ${NV_APPDEC_COMMON_DIR}
+   ${NV_FFMPEG_HDRS}
+   ${THIRD_PARTY_SAMPLE_DIR}
+   )
+
+   target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB}
+   ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB})
+
+   install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
+   endif()
+
+endif()
diff --git a/third_party/Makefile b/third_party/Makefile
index 383a206dd..63ca48f36 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -19,19 +19,16 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
 .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm
 
 # Build targets.
-ifeq ($(shell uname -m), aarch64)
-all: cuda
-cuda_with_msccl: cuda cuda_msccl
-cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_gpuburn megatron_lm megatron_deepspeed
-cpu: common cpu_perftest
-common: cpu_stream fio
-else
 all: cuda rocm
 cuda_with_msccl: cuda cuda_msccl
 cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
-cpu: common cpu_perftest
-common: cpu_hpl cpu_stream fio
+cpu: common cpu_perftest cpu_stream
+common: fio
+
+# non aarch64 specific targets
+ifneq ($(shell uname -m), aarch64)
+common: fio cpu_hpl
 directx_amd: directx_amf_encoding_latency
 endif
 
diff --git a/third_party/stream-tests/Makefile b/third_party/stream-tests/Makefile
index 8a86c0c59..a652defd9 100644
--- a/third_party/stream-tests/Makefile
+++ b/third_party/stream-tests/Makefile
@@ -1,29 +1,28 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-GENFLAGS= -DSTREAM_ARRAY_SIZE=400000000
-ZEN3FLAGS= -DSTREAM_ARRAY_SIZE=400000000 -march=znver3
-ZEN4FLAGS= -DSTREAM_ARRAY_SIZE=800000000 -march=znver4
-NEO2FLAGS= -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2
+GENFLAGS := -DSTREAM_ARRAY_SIZE=400000000
+ZEN3FLAGS := -DSTREAM_ARRAY_SIZE=400000000 -march=znver3
+ZEN4FLAGS := -DSTREAM_ARRAY_SIZE=800000000 -march=znver4
+NEO2FLAGS := -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2
 
-GEN_OUTPUT= streamx86.exe
-ZEN3_OUTPUT= streamZen3.exe
-ZEN4_OUTPUT= streamZen4.exe
-NEO2_OUTPUT= streamNeo2.exe
+GEN_OUTPUT := streamx86.exe
+ZEN3_OUTPUT := streamZen3.exe
+ZEN4_OUTPUT := streamZen4.exe
+NEO2_OUTPUT := streamNeo2.exe
 
 ARCH := $(shell uname -m)
 
 ifeq ($(ARCH), aarch64)
-CFLAGS = -Ofast -fopenmp -DNTIMES=200
-CC=gcc
+CFLAGS := -Ofast -fopenmp -DNTIMES=200
+CC := gcc
 all: NEO2
 else
-CC= /opt/AMD/aocc-compiler-4.0.0/bin/clang
-CFLAGS= -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10
+CC := /opt/AMD/aocc-compiler-4.0.0/bin/clang
+CFLAGS := -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10
 all: ZEN3 ZEN4 X86
 endif
 
-
 ZEN3: stream.c
 	$(CC) $(CFLAGS) $(ZEN3FLAGS) stream.c -o $(ZEN3_OUTPUT)
 ZEN4:

From 41a003fcd73e48bf0b578faac0c2dc8fa5858dd7 Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Fri, 1 Nov 2024 16:00:00 -0700
Subject: [PATCH 4/8] unify docker file for arm64 and amd64

---
 dockerfile/cuda12.4-arm.dockerfile | 139 -----------------------------
 dockerfile/cuda12.4.dockerfile     |  47 +++++-----
 2 files changed, 27 insertions(+), 159 deletions(-)
 delete mode 100644 dockerfile/cuda12.4-arm.dockerfile

diff --git a/dockerfile/cuda12.4-arm.dockerfile b/dockerfile/cuda12.4-arm.dockerfile
deleted file mode 100644
index 03c8834f8..000000000
--- a/dockerfile/cuda12.4-arm.dockerfile
+++ /dev/null
@@ -1,139 +0,0 @@
-FROM nvcr.io/nvidia/pytorch:24.05-py3
-
-# OS:
-#   - Ubuntu: 22.04
-#   - OpenMPI: 4.1.5rc2
-#   - Docker Client: 20.10.8
-# NVIDIA:
-#   - CUDA: 12.4.1
-#   - CUDA Driver: 550.54.15
-#   - cuBLAS: 12.4.5.8
-#   - cuDNN: 9.1.0.70
-#   - NCCL: 2.21.5
-# Mellanox:
-#   - OFED: 24.04-0.6.6.0
-#   - HPC-X: v2.19
-# Intel:
-#   - mlc: v3.11a
-
-LABEL maintainer="SuperBench"
-
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    autoconf \
-    automake \
-    bc \
-    build-essential \
-    curl \
-    dmidecode \
-    ffmpeg \
-    git \
-    iproute2 \
-    jq \
-    libaio-dev \
-    libavcodec-dev \
-    libavformat-dev \
-    libavutil-dev \
-    libboost-program-options-dev \
-    libcap2 \
-    libcurl4-openssl-dev \
-    libnuma-dev \
-    libpci-dev \
-    libswresample-dev \
-    libtinfo5 \
-    libtool \
-    lshw \
-    python3-mpi4py \
-    net-tools \
-    nlohmann-json3-dev \
-    openssh-client \
-    openssh-server \
-    pciutils \
-    sudo \
-    util-linux \
-    vim \
-    wget \
-    && \
-    apt-get autoremove && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/* /tmp/*
-
-ARG NUM_MAKE_JOBS=
-
-# Install Docker
-ENV DOCKER_VERSION=20.10.8
-RUN cd /tmp && \
-    wget -q https://download.docker.com/linux/static/stable/aarch64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
-    tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
-    rm docker.tgz
-
-# Update system config
-RUN mkdir -p /root/.ssh && \
-    touch /root/.ssh/authorized_keys && \
-    mkdir -p /var/run/sshd && \
-    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
-    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
-    sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
-    echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \
-    echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf
-
-# Install OFED
-ENV OFED_VERSION=24.04-0.6.6.0
-RUN cd /tmp && \
-    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64.tgz && \
-    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64.tgz && \
-    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-aarch64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
-    rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
-
-# Install HPC-X
-ENV HPCX_VERSION=v2.19
-RUN cd /opt && \
-    rm -rf hpcx && \
-    wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-aarch64.tbz -O hpcx.tbz && \
-    tar xf hpcx.tbz && \
-    mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-aarch64 hpcx && \
-    rm hpcx.tbz
-
-# Install Intel MLC
-RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/822971/mlc_v3.11a.tgz -O mlc.tgz && \
-    tar xzf mlc.tgz Linux/mlc && \
-    cp ./Linux/mlc /usr/local/bin/ && \
-    rm -rf ./Linux mlc.tgz
-
-# Install NCCL 2.21.5
-RUN cd /tmp && \
-    git clone -b v2.21.5-1 https://github.com/NVIDIA/nccl.git && \
-    cd nccl && \
-    make -j src.build && \
-    make install && \
-    rm -rf /tmp/nccl
-
-ENV PATH="${PATH}" \
-    LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \
-    SB_HOME=/opt/superbench_dev \
-    SB_MICRO_PATH=/opt/superbench_dev \
-    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
-    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
-
-RUN echo PATH="$PATH" > /etc/environment && \
-    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
-    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment && \
-    echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" >> /etc/bash.bashrc
-
-# Add config files
-ADD dockerfile/etc /opt/microsoft/
-
-WORKDIR ${SB_HOME}
-
-ADD third_party third_party
-RUN make -C third_party cuda_with_msccl
-
-ADD . .
-RUN python3 -m pip install --upgrade setuptools==65.7 && \
-    python3 -m pip install --no-cache-dir .[nvworker] && \
-    make cppbuild && \
-    make postinstall && \
-    rm -rf .git
-
diff --git a/dockerfile/cuda12.4.dockerfile b/dockerfile/cuda12.4.dockerfile
index 3ec82f39d..259ece773 100644
--- a/dockerfile/cuda12.4.dockerfile
+++ b/dockerfile/cuda12.4.dockerfile
@@ -19,6 +19,10 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3
 LABEL maintainer="SuperBench"
 
 ENV DEBIAN_FRONTEND=noninteractive
+
+ARG TARGETPLATFORM
+ARG TARGETARCH
+
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     autoconf \
@@ -63,8 +67,8 @@ ARG NUM_MAKE_JOBS=
 
 # Install Docker
 ENV DOCKER_VERSION=20.10.8
-RUN cd /tmp && \
-    wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
+RUN TARGETARCH_HW=$(uname -m) && cd /tmp && \
+    wget -q https://download.docker.com/linux/static/stable/${TARGETARCH_HW}/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
     tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
     rm docker.tgz
 
@@ -80,40 +84,43 @@ RUN mkdir -p /root/.ssh && \
 
 # Install OFED
 ENV OFED_VERSION=23.07-0.5.1.2
-RUN cd /tmp && \
-    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
-    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
-    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
+RUN TARGETARCH_HW=$(uname -m) && \
+    cd /tmp && \
+    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}.tgz && \
+    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}.tgz && \
+    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
     rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
 
 # Install HPC-X
 ENV HPCX_VERSION=v2.18
-RUN cd /opt && \
+RUN TARGETARCH_HW=$(uname -m) && \
+    cd /opt && \
     rm -rf hpcx && \
-    wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64.tbz -O hpcx.tbz && \
+    wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-${TARGETARCH_HW}.tbz -O hpcx.tbz && \
     tar xf hpcx.tbz && \
-    mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64 hpcx && \
+    mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-${TARGETARCH_HW} hpcx && \
     rm hpcx.tbz
 
-# Install Intel MLC
-RUN cd /tmp && \
+# Installs specific to amd64 platform
+RUN if [ "$TARGETARCH" = "amd64" ]; then \
+    # Install Intel MLC
+    cd /tmp && \
     wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
-    rm -rf ./Linux mlc.tgz
-
-# Install AOCC compiler
-RUN cd /tmp && \
+    rm -rf ./Linux mlc.tgz && \
+    # Install AOCC compiler
     wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
     apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
-    rm -rf aocc-compiler-4.0.0_1_amd64.deb
-
-# Install AMD BLIS
-RUN cd /tmp && \
+    rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
+    # Install AMD BLIS
     wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
     tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
     mv amd-blis /opt/AMD && \
-    rm -rf aocl-blis-linux-aocc-4.0.tar.gz
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
+    else \
+    echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \
+    fi
 
 # Install NCCL 2.23.4
 RUN cd /tmp && \

From 42c12eccfe2c1869dda5e48040e25457e5aeeea5 Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Mon, 4 Nov 2024 10:10:29 -0800
Subject: [PATCH 5/8] add arm64 for multiplatform builds

---
 .github/workflows/build-image.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index 13eead127..944fa2ec8 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -142,7 +142,7 @@ jobs:
         id: docker_build
         uses: docker/build-push-action@v2
         with:
-          platforms: linux/amd64
+          platforms: linux/amd64, linux/arm64
           context: .
           file: ${{ steps.metadata.outputs.dockerfile }}
           push: ${{ github.event_name != 'pull_request' }}

From 0da1d89285f673665cfedcce3995432cb7a0edc5 Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Mon, 4 Nov 2024 10:33:34 -0800
Subject: [PATCH 6/8] specify platforms per docker build

---
 .github/workflows/build-image.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index 3e389410b..8e877de67 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -28,21 +28,25 @@ jobs:
         - name: cuda12.4
           dockerfile: cuda12.4
           tags: superbench/main:cuda12.4
+          platforms: linux/amd64, linux/arm64
           runner: [self-hosted]
           build_args: "NUM_MAKE_JOBS=16"
         - name: cuda12.2
           dockerfile: cuda12.2
           tags: superbench/main:cuda12.2
+          platforms: linux/amd64
           runner: [self-hosted]
           build_args: "NUM_MAKE_JOBS=16"
         - name: cuda11.1.1
           dockerfile: cuda11.1.1
           tags: superbench/main:cuda11.1.1,superbench/superbench:latest
+          platforms: linux/amd64
           runner: ubuntu-latest
           build_args: "NUM_MAKE_JOBS=8"
         - name: rocm6.2
           dockerfile: rocm6.2.x
           tags: superbench/main:rocm6.2
+          platforms: linux/amd64
           runner: [self-hosted]
           build_args: "NUM_MAKE_JOBS=16"
     steps:
@@ -125,7 +129,7 @@ jobs:
         id: docker_build
         uses: docker/build-push-action@v2
         with:
-          platforms: linux/amd64, linux/arm64
+          platforms: ${{ matrix.platforms }}
           context: .
           file: ${{ steps.metadata.outputs.dockerfile }}
           push: ${{ github.event_name != 'pull_request' }}

From 6df745bd23eadfeed634f835e2c7c8504955730a Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Tue, 5 Nov 2024 18:14:07 -0800
Subject: [PATCH 7/8] disable arm64 build. fix lint in dockerfile

---
 .github/workflows/build-image.yml | 2 +-
 dockerfile/cuda12.4.dockerfile    | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index 8e877de67..fdfe9114c 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -28,7 +28,7 @@ jobs:
         - name: cuda12.4
           dockerfile: cuda12.4
           tags: superbench/main:cuda12.4
-          platforms: linux/amd64, linux/arm64
+          platforms: linux/amd64 # TODO: linux/arm64
           runner: [self-hosted]
           build_args: "NUM_MAKE_JOBS=16"
         - name: cuda12.2
diff --git a/dockerfile/cuda12.4.dockerfile b/dockerfile/cuda12.4.dockerfile
index 259ece773..e9feb2a3e 100644
--- a/dockerfile/cuda12.4.dockerfile
+++ b/dockerfile/cuda12.4.dockerfile
@@ -20,9 +20,6 @@ LABEL maintainer="SuperBench"
 
 ENV DEBIAN_FRONTEND=noninteractive
 
-ARG TARGETPLATFORM
-ARG TARGETARCH
-
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     autoconf \
@@ -64,6 +61,8 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
 ARG NUM_MAKE_JOBS=
+ARG TARGETPLATFORM
+ARG TARGETARCH
 
 # Install Docker
 ENV DOCKER_VERSION=20.10.8
@@ -117,9 +116,9 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then \
     wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
     tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
     mv amd-blis /opt/AMD && \
-    rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz \
     else \
-    echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \
+    echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH" \
     fi
 
 # Install NCCL 2.23.4

From 2252a9633a9c1bdfc4ba97c0419844bc33e2a163 Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Wed, 6 Nov 2024 09:39:10 -0800
Subject: [PATCH 8/8] add required delimiters for shell command parsing

---
 dockerfile/cuda12.4.dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dockerfile/cuda12.4.dockerfile b/dockerfile/cuda12.4.dockerfile
index e9feb2a3e..560f0908a 100644
--- a/dockerfile/cuda12.4.dockerfile
+++ b/dockerfile/cuda12.4.dockerfile
@@ -116,9 +116,9 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then \
     wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
     tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
     mv amd-blis /opt/AMD && \
-    rm -rf aocl-blis-linux-aocc-4.0.tar.gz \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
     else \
-    echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH" \
+    echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \
     fi
 
 # Install NCCL 2.23.4