diff --git a/m4/check_pkg_libfabric.m4 b/m4/check_pkg_libfabric.m4 index 42f7a4883..fc0047d77 100644 --- a/m4/check_pkg_libfabric.m4 +++ b/m4/check_pkg_libfabric.m4 @@ -55,7 +55,8 @@ AC_DEFUN([CHECK_PKG_LIBFABRIC], [ FI_OPT_EFA_EMULATED_WRITE, FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES, FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES, - FI_OPT_MAX_MSG_SIZE], + FI_OPT_MAX_MSG_SIZE, + FI_OPT_SHARED_MEMORY_PERMITTED], [], [], [AC_INCLUDES_DEFAULT [#include #ifdef HAVE_RDMA_FI_EXT_H diff --git a/src/nccl_ofi_ofiutils.c b/src/nccl_ofi_ofiutils.c index d8ad5c460..159257f4a 100644 --- a/src/nccl_ofi_ofiutils.c +++ b/src/nccl_ofi_ofiutils.c @@ -302,6 +302,42 @@ int nccl_ofi_ofiutils_init_connection(int api_version, struct fi_info *info, str goto error; } + /* + * Disable shared memory. There are really only three cases + * we're going to be using network operations inside a shared + * memory domain: + * + * 1. disabling NCCL P2P (NVLink / PCIe) operations to test + * networking without lots of nodes. + * 2. flush operations + * 3. cleanup copies for the rdma protocol's eager messages + * + * In none of these do you want to use Libfabric's shared + * memory as opposed to a real network device. For (2), + * using shared memory is actually a correctness issue. So we + * disable shared memory transport when available. + */ +#if HAVE_DECL_FI_OPT_SHARED_MEMORY_PERMITTED + { + bool optval = false; + ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT, + FI_OPT_SHARED_MEMORY_PERMITTED, &optval, + sizeof(optval)); + if (ret == -FI_EOPNOTSUPP) { + /* One way we get here is running against + * older libfabric builds that don't have + * FI_OPT_SHARED_MEMORY_PERMITTED. This isn't + * awesome, but there isn't really a better + * choice. 
+ */ + NCCL_OFI_TRACE(NCCL_INIT, "Disabling shared memory not supported"); + } else if (ret != 0) { + NCCL_OFI_WARN("Disabling shared memory failed: %s", + fi_strerror(-ret)); + goto error; + } + } +#endif /* Set Libfabric endpoint option FI_OPT_CUDA_API_PERMITTED to false if * using the Libfabric 1.18 API with HMEM support.