Okay, so here's my bilinear interpolation kernel: each work-item produces WI_SIZE_X * WI_SIZE_Y output elements, and the work-group dimensions are WG_SIZE_X x WG_SIZE_Y. The input image is H x W and the output is (H - 1) x (W - 1). Is this how bilinear interpolation is typically used?
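To make the decomposition concrete with my sizes (these numbers just follow from the defines in the host code below):

output image: (W - 1) x (H - 1) = 895 wide x 111 tall
global work size: (895 / WI_SIZE_X) x (111 / WI_SIZE_Y) = 895 x 37 work-items
output tile per work-group: (WG_SIZE_X * WI_SIZE_X) x (WG_SIZE_Y * WI_SIZE_Y) = 8 wide x 24 tall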
The good news is that I achieve peak memory bandwidth for CL_UNORM_INT8 when I set WI_SIZE_X=1 and WI_SIZE_Y=3.
Why is that the only setting that gets peak memory bandwidth?
My guess is memory coalescing plus the L1 cache. Each work-item computes a 3x1 output column, which needs a 4x2 input patch (4 rows by 2 columns, since the three 2x2 windows overlap by one row). Neighboring work-items' patches sit right next to each other in x, so I'd expect roughly one cache miss per cache line of input; once the data is in L1, the hardware performs the interpolations at whatever rate the texture units sustain (texel fill rate). I also think a 3x1 output column coalesces better than a 1x3 output row, because consecutive work-items then write consecutive pixels of the same output row.
This is just my intuition, so please let me know if I'm wrong about something.
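To put rough numbers on that footprint (my own arithmetic; it assumes 4 bytes/texel for CL_RGBA + CL_UNORM_INT8 and a 64-byte cache line, and ignores whatever tiled/swizzled image layout the GPU actually uses):

// Input footprint of one 8x8 work-group with WI_SIZE_X = 1, WI_SIZE_Y = 3.
const int out_tile_w = WG_SIZE_X * WI_SIZE_X; // 8 output pixels wide
const int out_tile_h = WG_SIZE_Y * WI_SIZE_Y; // 24 output pixels tall
const int in_patch_w = out_tile_w + 1;        // 2x2 windows overlap by 1 texel -> 9
const int in_patch_h = out_tile_h + 1;        // -> 25
const int row_bytes  = in_patch_w * 4;        // 36 B: each touched input row spans 1-2 cache lines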
My questions:
Each interpolation must involve fetching a 2x2 window into the L1 cache, correct?
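For reference, this is my reading of what CLK_FILTER_LINEAR does with unnormalized coordinates per the OpenCL spec, written out as a single-channel CPU sketch (the names here are mine, not from the kernel):

#include <cmath>

// Bilinear fetch as the spec describes it: subtract the 0.5 texel-center
// offset, floor to get the top-left texel, then blend a 2x2 window.
float sample_linear(const float *in, int W, float x, float y) {
    float u = x - 0.5f, v = y - 0.5f;
    int i0 = (int)std::floor(u), j0 = (int)std::floor(v);
    float a = u - (float)i0, b = v - (float)j0; // fractional weights
    // CLK_ADDRESS_NONE leaves out-of-range reads undefined, so the kernel
    // relies on coords staying at least one texel inside the image.
    float t00 = in[j0 * W + i0],       t10 = in[j0 * W + i0 + 1];
    float t01 = in[(j0 + 1) * W + i0], t11 = in[(j0 + 1) * W + i0 + 1];
    return (1 - a) * (1 - b) * t00 + a * (1 - b) * t10
         + (1 - a) * b * t01       + a * b * t11;
}

With the kernel's coords = output pixel + 1.0, a and b come out to exactly 0.5, so every output pixel is the plain average of a 2x2 input block, which is why I believe every sample has to touch a 2x2 window.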
Kernel code:
#define REP(i, n) for (int i = 0; i < (n); ++i)

kernel void interpolate2(read_only image2d_t input, write_only image2d_t output) {
    // Unnormalized coords + LINEAR filter: the sampler itself does the 2x2 bilinear fetch.
    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_LINEAR;
    // Top-left output pixel of this work-item's WI_SIZE_X x WI_SIZE_Y tile.
    int2 ids = (int2)(WI_SIZE_X * (int)get_global_id(0), WI_SIZE_Y * (int)get_global_id(1));
    // Sampling at (output pixel + 1.0) averages a 2x2 input block after the
    // sampler's 0.5 texel-center offset. Note the base must come from ids, not
    // the raw global id, or reads and writes disagree whenever WI_SIZE_* > 1.
    float2 coords = convert_float2(ids) + (float2)(1.0f, 1.0f);
#pragma unroll WI_SIZE_X
    REP(i, WI_SIZE_X) {
#pragma unroll WI_SIZE_Y
        REP(j, WI_SIZE_Y) {
            // half-based image reads/writes require cl_khr_fp16
            half4 pixel = read_imageh(input, sampler, coords + (float2)((float)i, (float)j));
            write_imageh(output, ids + (int2)(i, j), pixel);
        }
    }
}
Host code (not complete):
#define W (112 * 8)
#define H (112)
#define WG_SIZE_X 8
#define WG_SIZE_Y 8
#define WI_SIZE_X 1
#define WI_SIZE_Y 3

typedef struct _IMG {
    cl_image_format image_format;
    cl_image_desc image_desc;
    cl_mem mem;
} IMG;
...
// Input image: W x H RGBA8, 4 bytes per pixel.
IMG A = {{CL_RGBA, CL_UNORM_INT8}, {CL_MEM_OBJECT_IMAGE2D, W, H, 1}};
A.mem = clCreateImage(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, &A.image_format, &A.image_desc, NULL, &ret);
mem_objects.push_back(A.mem);
ASSERT(ret == CL_SUCCESS);

// Output is one pixel smaller per dimension: each output pixel averages a 2x2 input block.
IMG A_interp = {{CL_RGBA, CL_UNORM_INT8}, {CL_MEM_OBJECT_IMAGE2D, W - 1, H - 1, 1}};
A_interp.mem = clCreateImage(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, &A_interp.image_format, &A_interp.image_desc, NULL, &ret);
mem_objects.push_back(A_interp.mem);
ASSERT(ret == CL_SUCCESS);

// (The kernel also needs WI_SIZE_X / WI_SIZE_Y at build time, e.g. as -D
// options to clBuildProgram; the build step is not shown here.)
cl_kernel interpolate_kernel = clCreateKernel(program, "interpolate2", &ret);
ASSERT(ret == CL_SUCCESS);
ret = clSetKernelArg(interpolate_kernel, 0, sizeof(cl_mem), (void *)&A.mem);
ASSERT(ret == CL_SUCCESS);
ret = clSetKernelArg(interpolate_kernel, 1, sizeof(cl_mem), (void *)&A_interp.mem);
ASSERT(ret == CL_SUCCESS);

// 895 x 37 work-items. Neither is a multiple of the 8 x 8 local size, so
// OpenCL 1.x would reject this enqueue with CL_INVALID_WORK_GROUP_SIZE;
// either round the global size up and bounds-check in the kernel, or rely
// on OpenCL 2.0+ non-uniform work-groups.
globalws = {(A_interp.image_desc.image_width) / WI_SIZE_X, (A_interp.image_desc.image_height) / WI_SIZE_Y};
localws = {WG_SIZE_X, WG_SIZE_Y};
ret = clEnqueueNDRangeKernel(queue, interpolate_kernel, 2, NULL, globalws.data(), localws.data(), 0, NULL, my_event);
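Since the whole question is about bandwidth, here is roughly how I get an achieved-bandwidth number out of my_event (a sketch; it assumes the queue was created with CL_QUEUE_PROFILING_ENABLE, that my_event is a cl_event *, and it counts only useful bytes, 4 per pixel read plus 4 per pixel written, not raw DRAM traffic):

cl_ulong t_start = 0, t_end = 0; // nanoseconds
clWaitForEvents(1, my_event);
clGetEventProfilingInfo(*my_event, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, NULL);
clGetEventProfilingInfo(*my_event, CL_PROFILING_COMMAND_END, sizeof(t_end), &t_end, NULL);
double bytes = 4.0 * ((double)W * H + (double)(W - 1) * (H - 1));
double gbps = bytes / (double)(t_end - t_start); // bytes per nanosecond == GB/s
printf("%.2f GB/s\n", gbps);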