Skip to content

Commit

Permalink
Merge pull request #1265 from cppchedy/chedy/device-to-device-copy
Browse files Browse the repository at this point in the history
[Bindless][Exp] Add support for device to device copies between CuArrays
  • Loading branch information
kbenzie authored Apr 24, 2024
2 parents b37fa2c + f9fb116 commit 7fcfe3a
Showing 1 changed file with 76 additions and 7 deletions.
83 changes: 76 additions & 7 deletions source/adapters/cuda/image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -690,12 +690,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
}
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
CUDA_MEMCPY2D cpy_desc = {};
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.srcHost = pSrc;
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
cpy_desc.srcY = srcOffset.y;
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
cpy_desc.dstY = dstOffset.y;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.srcHost = pSrc;
cpy_desc.srcPitch = hostExtent.width * PixelSizeBytes;
if (pImageDesc->rowPitch == 0) {
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
Expand Down Expand Up @@ -781,8 +781,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.srcY = srcOffset.y;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstY = dstOffset.y;
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.dstHost = pDst;
if (pImageDesc->rowPitch == 0) {
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.srcArray = (CUarray)pSrc;
Expand All @@ -792,6 +790,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.srcPitch = pImageDesc->rowPitch;
cpy_desc.srcDevice = (CUdeviceptr)pSrc;
}
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.dstHost = pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
cpy_desc.Height = copyExtent.height;
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
Expand Down Expand Up @@ -831,10 +831,79 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
}
} else {
/// imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE
/// TODO: implement device to device copy
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
// imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE

// All the following async copy function calls should be treated as
// synchronous because of the explicit call to cuStreamSynchronize at
// the end
if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
CUDA_MEMCPY2D cpy_desc = {};
cpy_desc.srcXInBytes = srcOffset.x;
cpy_desc.srcY = 0;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstY = 0;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.srcArray = (CUarray)pSrc;
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
cpy_desc.Height = 1;
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
CUDA_MEMCPY2D cpy_desc = {};
cpy_desc.srcXInBytes = srcOffset.x;
cpy_desc.srcY = srcOffset.y;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstY = dstOffset.y;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.srcArray = (CUarray)pSrc;
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
cpy_desc.Height = copyExtent.height;
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = srcOffset.x;
cpy_desc.srcY = srcOffset.y;
cpy_desc.srcZ = srcOffset.z;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstY = dstOffset.y;
cpy_desc.dstZ = dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.srcArray = (CUarray)pSrc;
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
cpy_desc.Height = copyExtent.height;
cpy_desc.Depth = copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D_ARRAY ||
pImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
pImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = srcOffset.x;
cpy_desc.srcY = srcOffset.y;
cpy_desc.srcZ = srcOffset.z;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstY = dstOffset.y;
cpy_desc.dstZ = dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.srcArray = (CUarray)pSrc;
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
cpy_desc.Height = std::max(uint64_t{1}, copyExtent.height);
cpy_desc.Depth = pImageDesc->arraySize;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
}
// Synchronization is required here to handle the case of copying data
// from host to device, then device to device and finally device to host.
// Without it, there is a risk of the copies not being executed in the
// intended order.
cuStreamSynchronize(Stream);
}

if (phEvent) {
auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_COPY,
hQueue, Stream);
Expand Down

0 comments on commit 7fcfe3a

Please sign in to comment.