Skip to content

Commit

Permalink
OpenCL Backend - Update OpenCL LUT type to uchar buffer (#1442)
Browse files Browse the repository at this point in the history
* opencl image1d update

* code cleanup

* update data type

---------

Co-authored-by: Kiriti Gowda <[email protected]>
  • Loading branch information
hansely and kiritigowda authored Nov 14, 2024
1 parent 4298a00 commit 32b515c
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 72 deletions.
4 changes: 0 additions & 4 deletions amd_openvx/openvx/ago/ago_drama_alloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,15 +165,11 @@ int agoGpuAllocBuffers(AgoGraph * graph)
// get data groups (Gd)
// Returns the OpenCL memory-object type for `data`.
// The result starts as CL_MEM_OBJECT_BUFFER; as displayed here, a LUT whose
// element type is VX_TYPE_UINT8 is switched to CL_MEM_OBJECT_IMAGE1D.
// NOTE(review): this text is a diff rendering without +/- markers. The file
// header reports "0 additions & 4 deletions" and the commit moves U8 LUTs to
// uchar buffers, so the two LUT lines below are presumably the removed side of
// the diff -- confirm against the repository before relying on them.
auto getMemObjectType = [=](AgoData * data) -> cl_mem_object_type {
cl_mem_object_type obj_type = CL_MEM_OBJECT_BUFFER;
if (data->ref.type == VX_TYPE_LUT && data->u.lut.type == VX_TYPE_UINT8)
obj_type = CL_MEM_OBJECT_IMAGE1D;
// The value is cast to vx_uint32 even though the declared return type is
// cl_mem_object_type.
return (vx_uint32)obj_type;
};
#elif ENABLE_HIP
// Variant compiled under ENABLE_HIP: returns the HIP memory kind for `data`
// as a vx_uint32.
// The result starts as HIP_MEM_KIND_BUFFER; as displayed here, a LUT whose
// element type is VX_TYPE_UINT8 is switched to HIP_MEM_KIND_IMAGE1D.
// NOTE(review): this text is a diff rendering without +/- markers. The file
// header reports "0 additions & 4 deletions", so the two LUT lines below are
// presumably the removed side of the diff -- confirm against the repository.
auto getMemObjectType = [=](AgoData * data) -> vx_uint32 {
vx_uint32 obj_type = HIP_MEM_KIND_BUFFER;
if (data->ref.type == VX_TYPE_LUT && data->u.lut.type == VX_TYPE_UINT8)
obj_type = HIP_MEM_KIND_IMAGE1D;
return (vx_uint32)obj_type;
};
#endif
Expand Down
33 changes: 10 additions & 23 deletions amd_openvx/openvx/ago/ago_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1931,12 +1931,11 @@ static int agoDataSyncFromGpuToCpu(AgoGraph * graph, AgoNode * node, AgoData * d
if (node->flags & AGO_KERNEL_FLAG_DEVICE_GPU) {
if (dataToSync->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE | AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT)) {
int64_t stime = agoGetClockCounter();
vx_size size = dataToSync->size;
if (dataToSync->ref.type == VX_TYPE_LUT) {
size_t origin[3] = { 0, 0, 0 };
size_t region[3] = { 256, 1, 1 };
cl_int err = clEnqueueWriteImage(opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, origin, region, 256, 0, dataToSync->buffer, 0, NULL, NULL);
cl_int err = clEnqueueWriteBuffer(opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->gpu_buffer_offset, size, dataToSync->buffer, 0, NULL, NULL);
if (err) {
agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: clEnqueueWriteImage(lut) => %d\n", err);
agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: clEnqueueWriteBuffer(lut) => %d\n", err);
return -1;
}
}
Expand All @@ -1963,11 +1962,10 @@ static int agoDataSyncFromGpuToCpu(AgoGraph * graph, AgoNode * node, AgoData * d
if (dataToSync->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE_CL)) {
if (dataToSync->ref.type == VX_TYPE_LUT) {
int64_t stime = agoGetClockCounter();
size_t origin[3] = { 0, 0, 0 };
size_t region[3] = { 256, 1, 1 };
cl_int err = clEnqueueReadImage(opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, origin, region, 256, 0, dataToSync->buffer, 0, NULL, NULL);
vx_size size = dataToSync->size;
cl_int err = clEnqueueReadBuffer(opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->gpu_buffer_offset, size, dataToSync->buffer, 0, NULL, NULL);
if (err) {
agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: clEnqueueReadImage(lut) => %d\n", err);
agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: clEnqueueReadBuffer(lut) => %d\n", err);
return -1;
}
dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED;
Expand Down Expand Up @@ -2589,21 +2587,10 @@ vx_status agoDirective(vx_reference reference, vx_enum directive)
auto dataToSync = (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI) ? data->u.img.roiMasterImage : data;
if (dataToSync->ref.type == VX_TYPE_LUT) {
if (dataToSync->opencl_buffer) {
if (dataToSync->u.lut.type == VX_TYPE_UINT8) {
size_t origin[3] = { 0, 0, 0 };
size_t region[3] = { 256, 1, 1 };
cl_int err = clEnqueueWriteImage(dataToSync->ref.context->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, origin, region, 256, 0, dataToSync->buffer, 0, NULL, NULL);
if (err) {
agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueWriteImage(lut) => %d\n", err);
return VX_FAILURE;
}
}
else if (dataToSync->u.lut.type == VX_TYPE_INT16) {
cl_int err = clEnqueueWriteBuffer(dataToSync->ref.context->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->gpu_buffer_offset, dataToSync->size, dataToSync->buffer, 0, NULL, NULL);
if (err) {
agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueWriteImage(lut) => %d\n", err);
return VX_FAILURE;
}
cl_int err = clEnqueueWriteBuffer(dataToSync->ref.context->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->gpu_buffer_offset, dataToSync->size, dataToSync->buffer, 0, NULL, NULL);
if (err) {
agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueWriteImage(lut) => %d\n", err);
return VX_FAILURE;
}
dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED;
status = VX_SUCCESS;
Expand Down
18 changes: 9 additions & 9 deletions amd_openvx/openvx/ago/ago_kernel_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3309,19 +3309,19 @@ int agoKernel_Lut_U8_U8(AgoNode * node, AgoKernelCommand cmd)
node->opencl_type = NODE_OPENCL_TYPE_REG2REG;
char textBuffer[2048];
snprintf(textBuffer, sizeof(textBuffer), OPENCL_FORMAT(
"void %s (U8x8 * p0, U8x8 p1, __read_only image1d_t lut)\n"
"void %s (U8x8 * p0, U8x8 p1, __global uchar * lut, uint lut_count, uint lut_offset)\n"
"{\n"
" U8x8 r;\n"
" float4 f;\n"
" f.s0 = read_imagef(lut, (int)( p1.s0 & 255)).s0 * 255.0f;\n"
" f.s1 = read_imagef(lut, (int)((p1.s0 >> 8) & 255)).s0 * 255.0f;\n"
" f.s2 = read_imagef(lut, (int)((p1.s0 >> 16) & 255)).s0 * 255.0f;\n"
" f.s3 = read_imagef(lut, (int)( p1.s0 >> 24 )).s0 * 255.0f;\n"
" f.s0 = (float)(lut[p1.s0 & 255]);\n"
" f.s1 = (float)(lut[(p1.s0 >> 8) & 255]);\n"
" f.s2 = (float)(lut[(p1.s0 >> 16) & 255]);\n"
" f.s3 = (float)(lut[p1.s0 >> 24]);\n"
" r.s0 = amd_pack(f);\n"
" f.s0 = read_imagef(lut, (int)( p1.s1 & 255)).s0 * 255.0f;\n"
" f.s1 = read_imagef(lut, (int)((p1.s1 >> 8) & 255)).s0 * 255.0f;\n"
" f.s2 = read_imagef(lut, (int)((p1.s1 >> 16) & 255)).s0 * 255.0f;\n"
" f.s3 = read_imagef(lut, (int)( p1.s1 >> 24 )).s0 * 255.0f;\n"
" f.s0 = (float)(lut[p1.s1 & 255]);\n"
" f.s1 = (float)(lut[(p1.s1 >> 8) & 255]);\n"
" f.s2 = (float)(lut[(p1.s1 >> 16) & 255]);\n"
" f.s3 = (float)(lut[p1.s1 >> 24]);\n"
" r.s1 = amd_pack(f);\n"
" *p0 = r;\n"
"}\n"
Expand Down
54 changes: 19 additions & 35 deletions amd_openvx/openvx/ago/ago_util_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -485,16 +485,13 @@ int agoGpuOclAllocBuffer(AgoData * data)
else if (data->ref.type == VX_TYPE_LUT) {
if (!data->opencl_buffer) {
if (data->u.lut.type == VX_TYPE_UINT8) {
// allocate OpenCL image
cl_int err = -1;
cl_image_format format = { CL_INTENSITY, CL_UNORM_INT8 };
cl_image_desc desc = { CL_MEM_OBJECT_IMAGE1D, 256, 0, 0, 1, 0, 0, 0, 0, NULL };
data->opencl_buffer = data->opencl_buffer_allocated = agoGpuOclCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
data->gpu_buffer_offset = 0;
data->opencl_buffer = data->opencl_buffer_allocated = agoGpuOclCreateBuffer(context, CL_MEM_READ_WRITE, data->size + data->gpu_buffer_offset, NULL, &err);
if (err) {
agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: agoGpuOclCreateImage(%p,CL_MEM_READ_WRITE,1D/U8,256,0,*) => %d (for LUT)\n", context->opencl_context, err);
return -1;
}
data->gpu_buffer_offset = 0;
}
else {
// normal opencl_buffer allocation
Expand Down Expand Up @@ -832,21 +829,20 @@ static int agoGpuOclSetKernelArgs(cl_kernel opencl_kernel, vx_uint32& kernelArgI
if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0)
return -1;
kernelArgIndex++;
if (data->u.lut.type != VX_TYPE_UINT8) {
// count and offset parameters
err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(vx_uint32), &data->u.lut.count);
if (err) {
agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,lut:count) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group);
return -1;
}
kernelArgIndex++;
err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(vx_uint32), &data->u.lut.offset);
if (err) {
agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,lut:offset) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group);
return -1;
}
kernelArgIndex++;

// count and offset parameters
err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(vx_uint32), &data->u.lut.count);
if (err) {
agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,lut:count) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group);
return -1;
}
kernelArgIndex++;
err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(vx_uint32), &data->u.lut.offset);
if (err) {
agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,lut:offset) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group);
return -1;
}
kernelArgIndex++;
}
else if (data->ref.type == VX_TYPE_REMAP) {
if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0)
Expand Down Expand Up @@ -1119,24 +1115,12 @@ static int agoGpuOclDataInputSync(AgoGraph * graph, cl_kernel opencl_kernel, vx_
if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0)
return -1;
}
kernelArgIndex += 1;
if (data->u.lut.type != VX_TYPE_UINT8) {
kernelArgIndex += 2;
}
kernelArgIndex += 3;
if (need_read_access) {
if (!(data->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) {
if (data->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE | AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT)) {
int64_t stime = agoGetClockCounter();
if (data->u.lut.type == VX_TYPE_UINT8) {
size_t origin[3] = { 0, 0, 0 };
size_t region[3] = { 256, 1, 1 };
err = clEnqueueWriteImage(opencl_cmdq, data->opencl_buffer, CL_TRUE, origin, region, 256, 0, data->buffer, 0, NULL, NULL);
if (err) {
agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clEnqueueWriteImage(lut) => %d\n", err);
return -1;
}
}
else if (data->u.lut.type == VX_TYPE_INT16) {
if (data->u.lut.type == VX_TYPE_UINT8 || data->u.lut.type == VX_TYPE_INT16) {
cl_int err = clEnqueueWriteBuffer(opencl_cmdq, data->opencl_buffer, CL_TRUE, data->gpu_buffer_offset, data->size, data->buffer, 0, NULL, NULL);
if (err) {
agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGpuOclDataInputSync: clEnqueueWriteBuffer() => %d (for LUT)\n", err);
Expand Down Expand Up @@ -1390,7 +1374,7 @@ static std::string agoGpuOclData2Decl(AgoData * data, vx_uint32 index, vx_uint32
}
else if (data->ref.type == VX_TYPE_LUT) {
if (data->u.lut.type == VX_TYPE_UINT8) {
snprintf(item, sizeof(item), "__read_only image1d_t p%d", index);
snprintf(item, sizeof(item), "__global uchar * p%d_buf, uint p%d_count, uint p%d_offset", index, index, index);
code += item;
}
else if (data->u.lut.type == VX_TYPE_INT16) {
Expand Down Expand Up @@ -2069,7 +2053,7 @@ int agoGpuOclSuperNodeFinalize(AgoGraph * graph, AgoSuperNode * supernode)
snprintf(item, sizeof(item), ", p%d_buf, p%d_stride", (int)data_index, (int)data_index);
code += item;
}
else if (data->ref.type == VX_TYPE_LUT && data->u.lut.type == VX_TYPE_INT16) {
else if (data->ref.type == VX_TYPE_LUT) {
snprintf(item, sizeof(item), ", p%d_buf, p%d_count, p%d_offset", (int)data_index, (int)data_index, (int)data_index);
code += item;
}
Expand Down
2 changes: 1 addition & 1 deletion amd_openvx/openvx/include/vx_ext_amd.h
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ typedef vx_status(VX_CALLBACK *amd_kernel_query_target_support_f)(vx_graph graph
* vx_convolution: float convolution[<ROWS>*<COLS>]
* vx_threshold: int value or int2 value
* vx_remap: __global short2 * buf, uint stride_in_bytes
* vx_lut: __read_only image1d_t lut
* vx_lut: __global uchar * lut, uint count, uint offset
*/
typedef vx_status(VX_CALLBACK *amd_kernel_opencl_codegen_callback_f)(
vx_node node, // [input] node
Expand Down

0 comments on commit 32b515c

Please sign in to comment.