[pull] master from ggml-org:master #252

Merged: 1 commit, merged on Feb 24, 2025
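This pull mirrors a single upstream commit reworking the Adreno-specific OpenCL backend: the host now assumes a wave (subgroup) size of 64 for all Adreno generations, the quantized-weight and scale buffers are bound as CL_HALF_FLOAT images instead of CL_FLOAT, the 16-bit transpose kernel works on 4x4 tiles instead of 8x8, the Qualcomm required-subgroup-size attribute is applied only when the driver advertises cl_qcom_reqd_sub_group_size, and the matrix-vector path gains guards for small N.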
54 changes: 25 additions & 29 deletions ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -444,19 +444,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);

-        // Default wave size is 128, A8x uses 64.
-        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
-            backend_ctx->adreno_wave_size = 64;
-        } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
-                   backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
-            backend_ctx->adreno_wave_size = 128;
-        } else {
-            backend_ctx->adreno_wave_size = 128;
-            GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
-                "using wave size %d, "
-                "may not work as expected\n",
-                backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
-        }
+        // Use wave size of 64 for all Adreno GPUs.
+        backend_ctx->adreno_wave_size = 64;
} else if (strstr(default_device->name, "Intel")) {
backend_ctx->gpu_family = GPU_FAMILY::INTEL;
} else {
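The per-generation wave-size table is gone: every Adreno device now gets 64, matching the qcom_reqd_sub_group_size("half") attribute the kernels below adopt. If one wanted to verify at runtime that the compiler honored this, a minimal sketch could query the preferred work-group size multiple of a compiled kernel (hypothetical check, not in this PR; `kernel` and `device` stand for whatever objects are in scope):

    // Hypothetical sanity check: on Adreno the preferred work-group size
    // multiple reported for a compiled kernel reflects the subgroup (wave)
    // size the compiler actually chose.
    size_t wave_multiple = 0;
    CL_CHECK(clGetKernelWorkGroupInfo(kernel, device,
        CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        sizeof(wave_multiple), &wave_multiple, NULL));
    if (wave_multiple != (size_t) backend_ctx->adreno_wave_size) {
        GGML_LOG_WARN("ggml_opencl: expected wave size %d, compiler chose %zu\n",
            backend_ctx->adreno_wave_size, wave_multiple);
    }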
@@ -1376,6 +1365,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
int M = tensor->ne[1]; // ne01
int K = tensor->ne[0]; // ne00

+    // For the matrix-vector multiplication kernel, K must be a multiple of 32.
+    GGML_ASSERT(K % 32 == 0);
+    // For the transpose kernels, K must be a multiple of 4 (already implied by
+    // the assert above) and M must be a multiple of 4.
+    GGML_ASSERT(M % 4 == 0);

// transpose is out of place, so we need to allocate transposed buffers
// <----------------------------------------------------------------------------------> //
// use sub_buffer of max buffer size instead
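The two asserts encode why this path is safe: Q4_0 packs 32 weights per block (16 bytes of nibbles plus one fp16 scale), so K % 32 == 0 guarantees whole blocks per row, and the 4x4-tile transpose kernels below additionally need M % 4 == 0 (K % 4 is already implied). A minimal eligibility check under those assumptions might read (sketch only, not PR code):

    // When the Adreno image/transpose path can accept a Q4_0 tensor of
    // shape K x M (ne00 x ne01):
    static bool adreno_q4_0_path_ok(int M, int K) {
        return K % 32 == 0   // whole Q4_0 blocks per row (GEMV kernel)
            && M % 4 == 0;   // whole 4x4 tiles for the transpose kernels
    }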
Expand Down Expand Up @@ -1416,36 +1410,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
cl_mem qT_d_image1D;
cl_mem dT_d_image1D;

-    cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
cl_image_desc img_desc_1d;

memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 8 / 4;
+    img_desc_1d.image_width = M * K / 4 / 4;
img_desc_1d.buffer = extra->q;
q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
CL_CHECK(err);

-    img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 8 / 4;
+    img_desc_1d.image_width = M * K / 4 / 4;
img_desc_1d.buffer = qT_d;
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
CL_CHECK(err);

-    img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 32 / 4 / 2;
+    img_desc_1d.image_width = M * K / 32 / 4;
img_desc_1d.buffer = extra->d;
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
CL_CHECK(err);

-    img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 32 / 4 / 2;
+    img_desc_1d.image_width = M * K / 32 / 4;
img_desc_1d.buffer = dT_d;
dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
CL_CHECK(err);
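The image widths follow from the texel format: with CL_RGBA + CL_HALF_FLOAT one texel holds four halfs, i.e. 8 bytes. The q buffer stores M*K four-bit weights (M*K/2 bytes), hence M*K/4/4 texels; the d buffer stores one fp16 scale per 32 weights (M*K/32 halfs), hence M*K/32/4 texels. The old CL_FLOAT format made a texel 16 bytes, which is where the previous /8/4 and the extra /2 came from. A worked example (sizes chosen for illustration, not from the PR):

    // For M = K = 4096:
    //   quants: 4096*4096/2 bytes = 8 MiB  -> image_width = 4096*4096/16  = 1048576 texels
    //   scales: 4096*4096/32 halfs = 1 MiB -> image_width = 4096*4096/128 =  131072 texels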
@@ -1454,8 +1448,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
// set up and call the transpose kernels
// <----------------------------------------------------------------------------------> //
// weights
-    int height_q = M / 8;
-    int width_q = K / 8 / 4;
+    int height_q = M / 4;
+    int width_q = K / 4 / 4;
kernel = backend_ctx->kernel_transpose_16;

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
@@ -1469,8 +1463,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
CL_CHECK(clWaitForEvents(1, &evt));

// scales
-    int height_s = M / 8;
-    int width_s = K / 32 / 8;
+    int height_s = M / 4;
+    int width_s = K / 32 / 4;

kernel = backend_ctx->kernel_transpose_16;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
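The enqueue itself is cut off in this view. Under the 4x4-tile kernel each work item produces one output texel, so a dispatch consistent with the sizes above would look like the following (a sketch, not taken verbatim from this PR; the remaining clSetKernelArg calls and any explicit local size are not shown in the diff):

    // Transpose the scales: one work item per half4 texel, letting the
    // driver pick the local size.
    size_t global_wt[2] = { (size_t) width_s, (size_t) height_s };
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_wt, NULL, 0, NULL, &evt));
    CL_CHECK(clWaitForEvents(1, &evt));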
@@ -1864,7 +1858,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
void * buf_d;
#endif

-#ifdef GGML_USE_OPENCL
// Make sure everything is done.
CL_CHECK(clFinish(queue));

@@ -1900,7 +1893,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
CL_CHECK(clFinish(queue));
#endif // GGML_OPENCL_SOA_Q
-#endif // GGML_USE_OPENCL

// Open file and dump.
char fname[512];
@@ -2865,6 +2857,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
CL_CHECK(status);

int height_B = N/4;
+    if (height_B == 0) {
+        height_B = 1;
+    }
int width_B = K/4;
int padded_height_B = (N + padding)/4;
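N/4 truncates to zero whenever N < 4, which is the normal case for single-token decoding (N == 1); a zero height would produce an empty image or NDRange. The new guard is equivalent to a clamp (a sketch assuming <algorithm> is available, not how the PR spells it):

    int height_B = std::max(N / 4, 1); // at least one row even when N < 4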

@@ -3013,11 +3008,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
}

if (N == 1) {
-        local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
+        size_t wavesize = backend_ctx->adreno_wave_size;
+        local_work_size[0] = wavesize; // localsize
local_work_size[1] = 4; // reduce factor
local_work_size[2] = 1;

-        global_work_size[0] = M / 2;
+        global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
global_work_size[1] = 4; // reduce factor
global_work_size[2] = 1;
}
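With a required subgroup size the NDRange must cover a whole number of waves, so M/2 is rounded up to the next multiple of wavesize rather than relying on the driver to pad. The arithmetic, worked through (values illustrative, not from the PR):

    // ((x + w - 1) / w) * w rounds x up to a multiple of w:
    //   M = 4096 -> M/2 = 2048 -> (2048 + 63)/64 * 64 = 2048 (already aligned)
    //   M = 1000 -> M/2 =  500 -> ( 500 + 63)/64 * 64 =  512 (one extra partial wave)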
3 changes: 3 additions & 0 deletions ggml/src/ggml-opencl/kernels/ggml-opencl.cl
@@ -1797,6 +1797,9 @@ kernel void kernel_mul_mat_f16_f16(
//------------------------------------------------------------------------------
// mul_mat_f16_f32_1row
//------------------------------------------------------------------------------
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
kernel void kernel_mul_mat_f16_f32_1row(
global char * src0,
ulong offset0,
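kernel_mul_mat_f16_f32_1row now carries the subgroup-size attribute only when ADRENO_GPU is defined. The expected expansion, pieced together from the kernel-file headers later in this diff (a sketch; the actual guard lives at the top of each .cl file):

    #ifdef cl_qcom_reqd_sub_group_size              // defined by Qualcomm OpenCL drivers
    #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
    #define ADRENO_GPU 1
    #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
    #endif
    // On other devices ADRENO_GPU stays undefined and the kernel is declared
    // without any required subgroup size.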
13 changes: 8 additions & 5 deletions ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
@@ -1,9 +1,11 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable

+#ifdef cl_qcom_reqd_sub_group_size
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#endif

// assume
#define QK4_0 32
@@ -186,8 +188,9 @@
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \


__attribute__((qcom_reqd_sub_group_size("full")))
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
__kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half2 * src0_d, // A scales
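On Adreno, "half" selects the 64-lane wave that the host now assumes, whereas the previous unconditional attribute requested "full" (128 lanes). Combined with the N == 1 launch parameters from the ggml-opencl.cpp hunk above, the work-group layout comes out as follows (a worked note, not PR code):

    // local  = {64, 4, 1}              -> 256 work items = 4 subgroups of 64 lanes
    // global = {round_up(M/2, 64), 4, 1} -> dimension 1 is the reduce factor,
    //                                       dimension 0 a whole number of waves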
ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
@@ -1,9 +1,11 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable

+#ifdef cl_qcom_reqd_sub_group_size
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#endif

// assume
#define QK4_0 32
@@ -186,8 +188,9 @@
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \


__attribute__((qcom_reqd_sub_group_size("full")))
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
__kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half2 * src0_d, // A scales
11 changes: 10 additions & 1 deletion ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
@@ -7,7 +7,16 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable

__attribute__((qcom_reqd_sub_group_size("full")))
#ifdef cl_qcom_reqd_sub_group_size
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_128
+#endif

kernel void kernel_mul_mat_Ab_Bi_8x4(
global const ushort * src0_q, // quantized A
global const half * src0_d, // A scales
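Note the asymmetry with the GEMV kernels above: this GEMM kernel keeps the full 128-lane wave via REQD_SUBGROUP_SIZE_128, i.e. qcom_reqd_sub_group_size("full"), while the GEMV kernels drop to the 64-lane "half" wave. Only the unconditional attribute is replaced by the extension-guarded macro; the 8x4 tiling of kernel_mul_mat_Ab_Bi_8x4 itself is unchanged.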
32 changes: 13 additions & 19 deletions ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
@@ -1,4 +1,6 @@
-// 16-bit transpose, loading/storing an 8x8 tile of elements
+// 16-bit transpose, loading/storing a 4x4 tile of elements
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable

kernel void kernel_transpose_16(
__read_only image1d_buffer_t input,
@@ -9,24 +11,16 @@ kernel void kernel_transpose_16(

const int i = get_global_id(0);
const int j = get_global_id(1);
-    const int i_3 = i<<3;
-    const int j_3 = j<<3;
+    const int i_2 = i<<2;
+    const int j_2 = j<<2;

-    ushort8 temp0 = as_ushort8(read_imagef(input, (j_3+0)*cols+i));
-    ushort8 temp1 = as_ushort8(read_imagef(input, (j_3+1)*cols+i));
-    ushort8 temp2 = as_ushort8(read_imagef(input, (j_3+2)*cols+i));
-    ushort8 temp3 = as_ushort8(read_imagef(input, (j_3+3)*cols+i));
-    ushort8 temp4 = as_ushort8(read_imagef(input, (j_3+4)*cols+i));
-    ushort8 temp5 = as_ushort8(read_imagef(input, (j_3+5)*cols+i));
-    ushort8 temp6 = as_ushort8(read_imagef(input, (j_3+6)*cols+i));
-    ushort8 temp7 = as_ushort8(read_imagef(input, (j_3+7)*cols+i));
+    half4 temp0 = read_imageh(input, (j_2+0)*cols+i);
+    half4 temp1 = read_imageh(input, (j_2+1)*cols+i);
+    half4 temp2 = read_imageh(input, (j_2+2)*cols+i);
+    half4 temp3 = read_imageh(input, (j_2+3)*cols+i);

-    write_imagef(output, (i_3+0)*rows+j, as_float4((ushort8)(temp0.s0, temp1.s0, temp2.s0, temp3.s0, temp4.s0, temp5.s0, temp6.s0, temp7.s0)));
-    write_imagef(output, (i_3+1)*rows+j, as_float4((ushort8)(temp0.s1, temp1.s1, temp2.s1, temp3.s1, temp4.s1, temp5.s1, temp6.s1, temp7.s1)));
-    write_imagef(output, (i_3+2)*rows+j, as_float4((ushort8)(temp0.s2, temp1.s2, temp2.s2, temp3.s2, temp4.s2, temp5.s2, temp6.s2, temp7.s2)));
-    write_imagef(output, (i_3+3)*rows+j, as_float4((ushort8)(temp0.s3, temp1.s3, temp2.s3, temp3.s3, temp4.s3, temp5.s3, temp6.s3, temp7.s3)));
-    write_imagef(output, (i_3+4)*rows+j, as_float4((ushort8)(temp0.s4, temp1.s4, temp2.s4, temp3.s4, temp4.s4, temp5.s4, temp6.s4, temp7.s4)));
-    write_imagef(output, (i_3+5)*rows+j, as_float4((ushort8)(temp0.s5, temp1.s5, temp2.s5, temp3.s5, temp4.s5, temp5.s5, temp6.s5, temp7.s5)));
-    write_imagef(output, (i_3+6)*rows+j, as_float4((ushort8)(temp0.s6, temp1.s6, temp2.s6, temp3.s6, temp4.s6, temp5.s6, temp6.s6, temp7.s6)));
-    write_imagef(output, (i_3+7)*rows+j, as_float4((ushort8)(temp0.s7, temp1.s7, temp2.s7, temp3.s7, temp4.s7, temp5.s7, temp6.s7, temp7.s7)));
+    write_imageh(output, (i_2+0)*rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
+    write_imageh(output, (i_2+1)*rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
+    write_imageh(output, (i_2+2)*rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
+    write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}
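Read against the new body: cols is the input width and rows the output width, both measured in half4 texels. One work item's traffic, traced through (a sketch of the indexing, not additional code in the PR):

    // Work item (i, j):
    //   reads  input[(4j + r)*cols + i], r = 0..3   -- a 4x4 half block at
    //                                                  texel column i, rows 4j..4j+3
    //   writes output[(4i + c)*rows + j], c = 0..3  -- its transpose at texel
    //                                                  column j, rows 4i..4i+3
    // so source element (row 4j + r, col 4i + c) lands at (row 4i + c, col 4j + r).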