From 234d1a055d21a28ec14608d3accb18b963f1953c Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Thu, 1 Aug 2024 04:05:28 +0000 Subject: [PATCH] Disable Libfabric shared memory when possible For a bunch of reasons (detailed in the code), there's no upside to leaving shared memory enabled, and we really need it disabled for correctness when flush() is needed, so that the flush operation flows through the NIC. Therefore try to disable shared memory whenever possible. Signed-off-by: Brian Barrett --- m4/check_pkg_libfabric.m4 | 3 ++- src/nccl_ofi_ofiutils.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/m4/check_pkg_libfabric.m4 b/m4/check_pkg_libfabric.m4 index 42f7a4883..fc0047d77 100644 --- a/m4/check_pkg_libfabric.m4 +++ b/m4/check_pkg_libfabric.m4 @@ -55,7 +55,8 @@ AC_DEFUN([CHECK_PKG_LIBFABRIC], [ FI_OPT_EFA_EMULATED_WRITE, FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES, FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES, - FI_OPT_MAX_MSG_SIZE], + FI_OPT_MAX_MSG_SIZE, + FI_OPT_SHARED_MEMORY_PERMITTED], [], [], [AC_INCLUDES_DEFAULT [#include #ifdef HAVE_RDMA_FI_EXT_H diff --git a/src/nccl_ofi_ofiutils.c b/src/nccl_ofi_ofiutils.c index d8ad5c460..159257f4a 100644 --- a/src/nccl_ofi_ofiutils.c +++ b/src/nccl_ofi_ofiutils.c @@ -302,6 +302,42 @@ int nccl_ofi_ofiutils_init_connection(int api_version, struct fi_info *info, str goto error; } + /* + * Disable shared memory. There are really only three cases + * in which we're going to be using network operations inside a shared + * memory domain: + * + * 1. disabling NCCL P2P (NVLink / PCIe) operations to test + * networking without lots of nodes. + * 2. flush operations + * 3. cleanup copies for the rdma protocol's eager messages + * + * In none of these do you want to use Libfabric's shared + * memory as opposed to a real network device. For (2), using + * shared memory is actually a correctness issue. So we + * disable shared memory transport when available. 
+ */ +#if HAVE_DECL_FI_OPT_SHARED_MEMORY_PERMITTED + { + bool optval = false; + ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT, + FI_OPT_SHARED_MEMORY_PERMITTED, &optval, + sizeof(optval)); + if (ret == -FI_EOPNOTSUPP) { + /* One way we get here is running against + * older libfabric builds that don't have + * FI_OPT_SHARED_MEMORY_PERMITTED. This isn't + * awesome, but there isn't really a better + * choice. + */ + NCCL_OFI_TRACE(NCCL_INIT, "Disabling shared memory not supported"); + } else if (ret != 0) { + NCCL_OFI_WARN("Disabling shared memory failed: %s", + fi_strerror(-ret)); + goto error; + } + } +#endif /* Set Libfabric endpoint option FI_OPT_CUDA_API_PERMITTED to false if * using the Libfabric 1.18 API with HMEM support.