Skip to content

Commit

Permalink
Merge pull request #11 from BKP/ipc_07-30-24
Browse files Browse the repository at this point in the history
MI300X RO Support
  • Loading branch information
edgargabriel authored Aug 1, 2024
2 parents a3c338c + bb6e48d commit efa075f
Show file tree
Hide file tree
Showing 13 changed files with 305 additions and 124 deletions.
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ option(USE_IPC "Enable IPC support (using HIP)" OFF)
option(USE_THREADS "Enable workgroup threads to share network queues" OFF)
option(USE_WF_COAL "Enable wavefront message coalescing" OFF)
option(USE_COHERENT_HEAP "Enable support for coherent systems" OFF)
option(USE_CACHED_HEAP "Enable support for cached systems" OFF)
option(USE_MANAGED_HEAP "Enable managed memory" OFF)
option(USE_HOST_HEAP "Enable host memory using malloc/free" OFF)
option(USE_HIP_HOST_HEAP "Enable host memory using hip api" OFF)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ ROC_SHMEM base requirements:
* May work with other versions, but not tested
* AMD GFX9 GPUs (e.g.: MI25, Vega 56, Vega 64, MI50, MI60, MI100, Radeon VII)
* AMD MI200 GPUs: To enable the support on MI200, please configure the library
with USE_CACHED_HEAP
with USE_COHERENT_HEAP
* ROCm-aware MPI as described in
[Building the Dependencies](#building-the-dependencies)
* InfiniBand adaptor compatable with ROCm RDMA technology
Expand Down
3 changes: 1 addition & 2 deletions cmake/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
#cmakedefine USE_SHARED_CTX
#cmakedefine USE_WF_COAL
#cmakedefine USE_COHERENT_HEAP
#cmakedefine USE_CACHED_HEAP
#cmakedefine USE_MANAGED_HEAP
#cmakedefine USE_HOST_HEAP
#cmakedefine USE_HIP_HOST_HEAP
#cmakedefine USE_FUNC_CALL
#cmakedefine USE_SINGLE_NODE
#cmakedefine USE_HOST_SIDE_HDP_FLUSH
#cmakedefine USE_HOST_SIDE_HDP_FLUSH
1 change: 0 additions & 1 deletion scripts/build_configs/rc_single
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ cmake \
-DUSE_DC=OFF \
-DUSE_IPC=OFF \
-DUSE_COHERENT_HEAP=OFF \
-DUSE_CACHED_HEAP=OFF \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
$src_path
Expand Down
2 changes: 0 additions & 2 deletions scripts/build_configs/rc_single_single_node
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,10 @@ cmake \
-DUSE_DC=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=OFF \
-DUSE_CACHED_HEAP=OFF \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_SINGLE_NODE=ON \
-DUSE_HOST_SIDE_HDP_FLUSH=ON\
-DROCM_PATH="/opt/rocm-5.4.2/"\
$src_path
cmake --build . --parallel 8
cmake --install .
1 change: 0 additions & 1 deletion scripts/build_configs/rc_single_single_node_debug
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ cmake \
-DUSE_DC=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=OFF \
-DUSE_CACHED_HEAP=OFF \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_SINGLE_NODE=ON \
Expand Down
29 changes: 29 additions & 0 deletions scripts/build_configs/ro_ipc
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.

if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi

src_path=$(dirname "$(realpath $0)")/../../

cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=OFF \
-DUSE_DC=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_SINGLE_NODE=ON \
-DUSE_HOST_SIDE_HDP_FLUSH=OFF\
$src_path
cmake --build . --parallel 8
cmake --install .
271 changes: 271 additions & 0 deletions src/assembly.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/

#ifndef LIBRARY_SRC_ASSEMBLY_HPP_
#define LIBRARY_SRC_ASSEMBLY_HPP_

#include <hip/hip_runtime.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

namespace rocshmem {

#define DO_PRAGMA(x) _Pragma(#x)
#define NOWARN(warnoption, ...) \
DO_PRAGMA(GCC diagnostic push) \
DO_PRAGMA(GCC diagnostic ignored #warnoption) \
__VA_ARGS__ \
DO_PRAGMA(GCC diagnostic pop)

#define SFENCE() asm volatile("sfence" ::: "memory")

__device__ __forceinline__ int uncached_load_ubyte(uint8_t* src) {
int ret;
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
asm volatile(
"global_load_ubyte %0 %1 off glc slc \n"
"s_waitcnt vmcnt(0)"
: "=v"(ret)
: "v"(src));
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile(
"global_load_ubyte %0 %1 off sc0 sc1 \n"
"s_waitcnt vmcnt(0)"
: "=v"(ret)
: "v"(src));
#endif
return ret;
}

__device__ __forceinline__ void refresh_volatile_sbyte(volatile int *assigned_value,
volatile char *read_value) {
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
asm volatile(
"global_load_sbyte %0 %1 off glc slc\n "
"s_waitcnt vmcnt(0)"
: "=v"(*assigned_value)
: "v"(read_value));
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile(
"global_load_sbyte %0 %1 off sc0 sc1\n "
"s_waitcnt vmcnt(0)"
: "=v"(*assigned_value)
: "v"(read_value));
#endif
}

__device__ __forceinline__ void refresh_volatile_dwordx2(volatile uint64_t *assigned_value,
volatile uint64_t *read_value) {
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
asm volatile(
"global_load_dwordx2 %0 %1 off glc slc\n "
"s_waitcnt vmcnt(0)"
: "=v"(*assigned_value)
: "v"(read_value));
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile(
"global_load_dwordx2 %0 %1 off sc0 sc1\n "
"s_waitcnt vmcnt(0)"
: "=v"(*assigned_value)
: "v"(read_value));
#endif
}

/* Ignore the warning about deprecated volatile.
* The only usage of volatile is to force the compiler to generate
* the assembly instruction. If volatile is omitted, the compiler
* will NOT generate the non-temporal load or the waitcnt.
*/
// clang-format off
NOWARN(-Wdeprecated-volatile,
template <typename T> __device__ __forceinline__ T uncached_load(T* src) {
T ret;
switch (sizeof(T)) {
case 4:
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
asm volatile(
"global_load_dword %0 %1 off glc slc \n"
"s_waitcnt vmcnt(0)"
: "=v"(ret)
: "v"(src));
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile(
"global_load_dword %0 %1 off sc0 sc1 \n"
"s_waitcnt vmcnt(0)"
: "=v"(ret)
: "v"(src));
#endif
break;
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
case 8:
asm volatile(
"global_load_dwordx2 %0 %1 off glc slc \n"
"s_waitcnt vmcnt(0)"
: "=v"(ret)
: "v"(src));
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile(
"global_load_dwordx2 %0 %1 off sc0 sc1 \n"
"s_waitcnt vmcnt(0)"
: "=v"(ret)
: "v"(src));
#endif
break;
default:
break;
}
return ret;
}
)
// clang-format on

__device__ __forceinline__ void __roc_inv() {
#if defined USE_COHERENT_HEAP
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
// asm volatile("buffer_wbinvl1;");
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
// asm volatile("buffer_inv sc0 sc1;");
#endif
#endif
}

__device__ __forceinline__ void __roc_flush() {
#if defined USE_COHERENT_HEAP
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
// asm volatile("s_dcache_wb;");
// asm volatile("buffer_wbl2;");
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
// asm volatile("s_dcache_wb;");
// asm volatile("buffer_wbl2;");
#endif
#endif
}

__device__ __forceinline__ void store_asm(uint8_t* val, uint8_t* dst,
int size) {
switch (size) {
case 2: {
int16_t val16{*(reinterpret_cast<int16_t*>(val))};
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
asm volatile("flat_store_short %0 %1 glc slc" : : "v"(dst), "v"(val16));
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile("flat_store_short %0 %1 sc0 sc1" : : "v"(dst), "v"(val16));
#endif
break;
}
case 4: {
int32_t val32{*(reinterpret_cast<int32_t*>(val))};
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
asm volatile("flat_store_dword %0 %1 glc slc" : : "v"(dst), "v"(val32));
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile("flat_store_dword %0 %1 sc0 sc1" : : "v"(dst), "v"(val32));
#endif
break;
}
case 8: {
int64_t val64{*(reinterpret_cast<int64_t*>(val))};
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
asm volatile("flat_store_dwordx2 %0 %1 glc slc" : : "v"(dst), "v"(val64));
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile("flat_store_dwordx2 %0 %1 sc0 sc1" : : "v"(dst), "v"(val64));
#endif
break;
}
default:
break;
}
}

__device__ __forceinline__ uint64_t __read_clock() {
uint64_t clock{};
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
asm volatile(
"s_memrealtime %0\n"
"s_waitcnt lgkmcnt(0)\n"
: "=s"(clock));
#endif
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile(
"s_memrealtime %0\n"
"s_waitcnt lgkmcnt(0)\n"
: "=s"(clock));
#endif
return clock;
}

} // namespace rocshmem

#endif // LIBRARY_SRC_ASSEMBLY_HPP_
2 changes: 1 addition & 1 deletion src/gpu_ib/connection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ void* Connection::buf_alloc([[maybe_unused]] struct ibv_pd* pd,
if (use_gpu_mem) {
void* dev_ptr;
if (coherent_cq == 1) {
#if defined USE_COHERENT_HEAP || defined USE_CACHED_HEAP
#if defined USE_COHERENT_HEAP
CHECK_HIP(hipMalloc(reinterpret_cast<void**>(&dev_ptr), size));
#else
#ifdef HIP_SUPPORTS_MALLOC_UNCACHED
Expand Down
2 changes: 1 addition & 1 deletion src/memory/heap_type.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ namespace rocshmem {

#if defined USE_MANAGED_HEAP
using HEAP_T = HeapMemory<HIPAllocatorManaged>;
#elif defined USE_COHERENT_HEAP || defined USE_CACHED_HEAP
#elif defined USE_COHERENT_HEAP
using HEAP_T = HeapMemory<HIPAllocator>;
#elif defined USE_HOST_HEAP
using HEAP_T = HeapMemory<HostAllocator>;
Expand Down
Loading

0 comments on commit efa075f

Please sign in to comment.