hmem: Define ofi_hmem_put_dmabuf_fd #10716

Merged · 7 commits · Jan 23, 2025
9 changes: 9 additions & 0 deletions include/ofi_hmem.h
@@ -131,6 +131,7 @@ struct ofi_hmem_ops {
const void *src, size_t size);
int (*get_dmabuf_fd)(const void *addr, uint64_t size, int *fd,
uint64_t *offset);
int (*put_dmabuf_fd)(int fd);
Contributor:
Why not just name it close_dmabuf_fd?

Contributor (Author):
I chose put_dmabuf_fd to align semantically with get_dmabuf_fd.

Contributor:
Is there any hmem type where get_dmabuf_fd does not implicitly increase the open count on the file? If not, I'd recommend going the other way and renaming get to open, so it's clear that the call is not simply resolving an already-open file descriptor but is actually opening a resource that must eventually be closed.

j-xiong (Contributor), Jan 23, 2025:
Yes, the ZE hmem type always returns the same fd.
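
A minimal sketch of the intended get/put pairing, assuming iface, addr, and size are already in scope (the registration step between the two calls is elided):

    int fd;
    uint64_t offset;
    int ret;

    ret = ofi_hmem_get_dmabuf_fd(iface, addr, size, &fd, &offset);
    if (ret != FI_SUCCESS)
            return ret;

    /* ... register the dma-buf fd with the NIC ... */

    ofi_hmem_put_dmabuf_fd(iface, fd);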

};

extern struct ofi_hmem_ops hmem_ops[];
@@ -167,6 +168,7 @@ int rocr_dev_reg_copy_from_hmem(uint64_t handle, void *dest, const void *src,
size_t size);
int rocr_hmem_get_dmabuf_fd(const void *addr, uint64_t size, int *dmabuf_fd,
uint64_t *offset);
int rocr_hmem_put_dmabuf_fd(int fd);

int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size);
int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size);
@@ -193,6 +195,7 @@ bool cuda_is_gdrcopy_enabled(void);
bool cuda_is_dmabuf_supported(void);
int cuda_get_dmabuf_fd(const void *addr, uint64_t size, int *fd,
uint64_t *offset);
int cuda_put_dmabuf_fd(int fd);

void cuda_gdrcopy_to_dev(uint64_t handle, void *dev,
const void *host, size_t size);
@@ -357,6 +360,11 @@ static inline int ofi_hmem_no_get_dmabuf_fd(const void *addr, uint64_t size,
return -FI_ENOSYS;
}

static inline int ofi_hmem_no_put_dmabuf_fd(int fd)
{
return -FI_ENOSYS;
}

static inline bool ofi_hmem_p2p_disabled(void)
{
return ofi_hmem_disable_p2p;
@@ -450,5 +458,6 @@ int ofi_hmem_dev_reg_copy_from_hmem(enum fi_hmem_iface iface, uint64_t handle,
void *dest, const void *src, size_t size);
int ofi_hmem_get_dmabuf_fd(enum fi_hmem_iface, const void *addr, uint64_t size,
int *fd, uint64_t *offset);
int ofi_hmem_put_dmabuf_fd(enum fi_hmem_iface iface, int fd);

#endif /* _OFI_HMEM_H_ */
2 changes: 2 additions & 0 deletions prov/cxi/include/cxip.h
@@ -821,8 +821,10 @@ struct cxip_md {
struct cxi_md *md;
struct ofi_mr_info info;
uint64_t handle;
int dmabuf_fd;
bool handle_valid;
bool cached;
bool dmabuf_fd_valid;
};

#define CXIP_MR_DOMAIN_HT_BUCKETS 16
21 changes: 19 additions & 2 deletions prov/cxi/src/cxip_iomm.c
@@ -39,6 +39,10 @@ static int cxip_dmabuf_hints(enum fi_hmem_iface iface, void *iov_base,
hints->dmabuf_offset = offset;
hints->dmabuf_valid = true;

/* Need to cache DMA buf FD to release later. */
md->dmabuf_fd = dmabuf_fd;
md->dmabuf_fd_valid = true;

return FI_SUCCESS;
}

@@ -106,7 +110,7 @@ static int cxip_do_map(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry)
CXIP_WARN(MAP_FAIL_MSG, dom->lni->lni->id,
entry->info.iov.iov_base, entry->info.iov.iov_len,
map_flags, ret, fi_strerror(-ret));
goto err;
goto err_free_dmabuf;
}

/* If the md len is larger than the iov_len, the VA and len have
@@ -161,6 +165,9 @@ static int cxip_do_map(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry)

err_unmap:
cxil_unmap(md->md);
err_free_dmabuf:
if (md->dmabuf_fd_valid)
ofi_hmem_put_dmabuf_fd(entry->info.iface, md->dmabuf_fd);
err:
md->dom = NULL;
return ret;
@@ -181,6 +188,9 @@ static void cxip_do_unmap(struct ofi_mr_cache *cache,
if (md->handle_valid)
ofi_hmem_dev_unregister(entry->info.iface, md->handle);

if (md->dmabuf_fd_valid)
ofi_hmem_put_dmabuf_fd(entry->info.iface, md->dmabuf_fd);

ret = cxil_unmap(md->md);
if (ret)
CXIP_WARN("cxil_unmap failed: %d\n", ret);
@@ -426,7 +436,7 @@ static int cxip_map_nocache(struct cxip_domain *dom, struct fi_mr_attr *attr,
&uncached_md->md);
if (ret) {
CXIP_WARN("cxil_map failed: %d:%s\n", ret, fi_strerror(-ret));
goto err_free_uncached_md;
goto err_free_dmabuf;
}

/* zeHostMalloc() returns FI_HMEM_ZE but this cannot currently be
Expand Down Expand Up @@ -466,8 +476,12 @@ static int cxip_map_nocache(struct cxip_domain *dom, struct fi_mr_attr *attr,

return FI_SUCCESS;


err_unmap:
cxil_unmap(uncached_md->md);
err_free_dmabuf:
if (uncached_md->dmabuf_fd_valid)
ofi_hmem_put_dmabuf_fd(attr->iface, uncached_md->dmabuf_fd);
err_free_uncached_md:
free(uncached_md);

@@ -575,6 +589,9 @@ static void cxip_unmap_nocache(struct cxip_md *md)
{
int ret;

if (md->dmabuf_fd_valid)
ofi_hmem_put_dmabuf_fd(md->info.iface, md->dmabuf_fd);
Contributor:
I'd be curious to know why cxi requires the close of the dmabuf fd to be deferred until the buffer is unmapped. Why not close it immediately after the map? I thought this was a property of the dmabuf kernel interface, not something that would differ between providers, but I don't know this code well and might be missing something.

Contributor:
I assume the kernel driver reference counts properly, so closing right after mapping should work just fine. I don't see a drawback to doing it either way.
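
A minimal sketch of the "close immediately" alternative discussed here, assuming the kernel's dma-buf reference counting keeps the attachment alive once the mapping exists (names taken from this diff):

    /* after cxil_map() has succeeded, the mapping holds its own
     * reference to the dma-buf, so the fd could be released now
     * instead of being cached until cxip_do_unmap() */
    if (md->dmabuf_fd_valid) {
            ofi_hmem_put_dmabuf_fd(entry->info.iface, md->dmabuf_fd);
            md->dmabuf_fd_valid = false;
    }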


if (md->handle_valid)
ofi_hmem_dev_unregister(md->info.iface, md->handle);

35 changes: 35 additions & 0 deletions prov/cxi/test/cuda.c
@@ -580,3 +580,38 @@ Test(cuda, verify_force_dev_reg_local)
cxit_destroy_cqs();
cxit_teardown_ep();
}

Test(cuda, dmabuf_stress)
{
int ret;
int i;
void *buf;
size_t size = 1024 * 1024;
struct fid_mr *mr;
cudaError_t cuda_ret;

ret = setenv("FI_HMEM_CUDA_USE_DMABUF", "1", 1);
cr_assert_eq(ret, 0, "setenv failed: %d", -errno);

ret = setenv("FI_MR_CUDA_CACHE_MONITOR_ENABLED", "0", 1);
cr_assert_eq(ret, 0, "setenv failed: %d", -errno);

cuda_ret = cudaMalloc(&buf, size);
cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret);

cxit_setup_msg();

for (i = 0; i < 2048; i++) {
ret = fi_mr_reg(cxit_domain, buf, size, FI_READ | FI_WRITE,
0, 0, 0, &mr, NULL);
cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret);

ret = fi_close(&mr->fid);
cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret);
}

cxit_teardown_msg();

cuda_ret = cudaFree(buf);
cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret);
}
82 changes: 82 additions & 0 deletions prov/cxi/test/rocr.c
@@ -761,3 +761,85 @@ Test(hsa, verify_hmemDevReg_fine)

verify_dev_reg_handle(true, FINE);
}

Test(hsa, dmabuf_offset)
{
hsa_status_t hsa_ret;
void *bufs[2];
int ret;
int i;
struct fid_mr *mrs[2];
size_t size = 1024 * 1024;

ret = setenv("FI_HMEM_ROCR_USE_DMABUF", "1", 1);
cr_assert_eq(ret, 0, "setenv failed: %d", -errno);

ret = setenv("FI_MR_ROCR_CACHE_MONITOR_ENABLED", "0", 1);
cr_assert_eq(ret, 0, "setenv failed: %d", -errno);

cxit_setup_msg();

hsa_ret = hsa_memory_allocate(coarse_grain, size, &bufs[0]);
cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d",
hsa_ret);

ret = fi_mr_reg(cxit_domain, bufs[0], size, FI_READ | FI_WRITE, 0, 0, 0,
&mrs[0], NULL);
cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret);

hsa_ret = hsa_memory_allocate(coarse_grain, size, &bufs[1]);
cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d",
hsa_ret);

ret = fi_mr_reg(cxit_domain, bufs[1], size, FI_READ | FI_WRITE, 0, 0, 0,
&mrs[1], NULL);
cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret);

for (i = 0; i < 2; i++) {
ret = fi_close(&(mrs[i]->fid));
cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret);

hsa_ret = hsa_memory_free(bufs[i]);
cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d",
hsa_ret);
}

cxit_teardown_msg();
}

Test(hsa, dmabuf_stress)
{
hsa_status_t hsa_ret;
int ret;
int i;
void *buf;
size_t size = 1024 * 1024;
struct fid_mr *mr;

ret = setenv("FI_HMEM_ROCR_USE_DMABUF", "1", 1);
cr_assert_eq(ret, 0, "setenv failed: %d", -errno);

ret = setenv("FI_MR_ROCR_CACHE_MONITOR_ENABLED", "0", 1);
cr_assert_eq(ret, 0, "setenv failed: %d", -errno);

hsa_ret = hsa_memory_allocate(coarse_grain, size, &buf);
cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d",
hsa_ret);

cxit_setup_msg();

for (i = 0; i < 2048; i++) {
ret = fi_mr_reg(cxit_domain, buf, size, FI_READ | FI_WRITE,
0, 0, 0, &mr, NULL);
cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret);

ret = fi_close(&mr->fid);
cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret);
}

cxit_teardown_msg();

hsa_ret = hsa_memory_free(buf);
cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d",
hsa_ret);
}
11 changes: 11 additions & 0 deletions src/hmem.c
@@ -141,6 +141,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_reg_copy_to_hmem = ofi_hmem_system_dev_reg_copy,
.dev_reg_copy_from_hmem = ofi_hmem_system_dev_reg_copy,
.get_dmabuf_fd = ofi_hmem_no_get_dmabuf_fd,
.put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd,
},
[FI_HMEM_CUDA] = {
.initialized = false,
@@ -167,6 +168,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_reg_copy_to_hmem = cuda_dev_reg_copy_to_hmem,
.dev_reg_copy_from_hmem = cuda_dev_reg_copy_from_hmem,
.get_dmabuf_fd = cuda_get_dmabuf_fd,
.put_dmabuf_fd = cuda_put_dmabuf_fd,
},
[FI_HMEM_ROCR] = {
.initialized = false,
@@ -193,6 +195,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_reg_copy_to_hmem = rocr_dev_reg_copy_to_hmem,
.dev_reg_copy_from_hmem = rocr_dev_reg_copy_from_hmem,
.get_dmabuf_fd = rocr_hmem_get_dmabuf_fd,
.put_dmabuf_fd = rocr_hmem_put_dmabuf_fd,
},
[FI_HMEM_ZE] = {
.initialized = false,
@@ -219,6 +222,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_reg_copy_to_hmem = ze_dev_reg_copy_to_hmem,
.dev_reg_copy_from_hmem = ze_dev_reg_copy_from_hmem,
.get_dmabuf_fd = ze_hmem_get_dmabuf_fd,
.put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd,
},
[FI_HMEM_NEURON] = {
.initialized = false,
@@ -244,6 +248,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_reg_copy_to_hmem = ofi_hmem_no_dev_reg_copy_to_hmem,
.dev_reg_copy_from_hmem = ofi_hmem_no_dev_reg_copy_from_hmem,
.get_dmabuf_fd = neuron_get_dmabuf_fd,
.put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd,
},
[FI_HMEM_SYNAPSEAI] = {
.initialized = false,
@@ -269,6 +274,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_reg_copy_to_hmem = ofi_hmem_no_dev_reg_copy_to_hmem,
.dev_reg_copy_from_hmem = ofi_hmem_no_dev_reg_copy_from_hmem,
.get_dmabuf_fd = synapseai_get_dmabuf_fd,
.put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd,
},
};

@@ -820,3 +826,8 @@ int ofi_hmem_get_dmabuf_fd(enum fi_hmem_iface iface, const void *addr,
{
return hmem_ops[iface].get_dmabuf_fd(addr, size, fd, offset);
}

int ofi_hmem_put_dmabuf_fd(enum fi_hmem_iface iface, int fd)
{
return hmem_ops[iface].put_dmabuf_fd(fd);
}
15 changes: 15 additions & 0 deletions src/hmem_cuda.c
@@ -748,6 +748,16 @@ int cuda_get_dmabuf_fd(const void *addr, uint64_t size, int *fd,
#endif /* HAVE_CUDA_DMABUF */
}

int cuda_put_dmabuf_fd(int fd)
{
#if HAVE_CUDA_DMABUF
close(fd);
return FI_SUCCESS;
#else
return -FI_ENOSYS;
#endif /* HAVE_CUDA_DMABUF */
}

int cuda_hmem_init(void)
{
int ret;
@@ -1047,6 +1057,11 @@ int cuda_get_dmabuf_fd(const void *addr, uint64_t size, int *fd,
return -FI_ENOSYS;
}

int cuda_put_dmabuf_fd(int fd)
{
return -FI_ENOSYS;
}

int cuda_set_sync_memops(void *ptr)
{
return FI_SUCCESS;