I am doing a discrete convolution and I am confused as to why, when I change my local_work_size in clEnqueueNDRangeKernel, the effect is negligible. It is further confusing because my OpenCL function runs 6 times faster than the CPU version and outputs the correct data no matter what I set local_work_size to. Shouldn't I see a huge performance drop if local_work_size is set to 1?
Here is the code I have calling my kernel:
// Enqueue the 1-D convolution kernel, rounding the global size up to a
// multiple of the local size (the kernel's `n < *length` guard makes the
// padding work-items harmless).
//
// NOTE(review): a hard-coded local size of 512 exceeds
// CL_DEVICE_MAX_WORK_GROUP_SIZE / CL_KERNEL_WORK_GROUP_SIZE on many devices;
// query clGetKernelWorkGroupInfo (or pass NULL and let the runtime choose)
// before shipping. Also, the negligible perf difference you observe is
// expected: this kernel uses no __local memory and no barriers, so the
// work-group size barely matters to the scheduler.
size_t localTestSize[1] = {512};
// Integer ceiling division. The previous ceil((float)SIZE/(float)local)
// loses precision once SIZE exceeds 2^24 (float has a 24-bit mantissa) and
// could under-launch work-items; integer arithmetic is exact for all sizes.
size_t amountForDivEqually = (SIZE + localTestSize[0] - 1) / localTestSize[0];
size_t globalTestSize[1] = {amountForDivEqually * localTestSize[0]};
std::cout << "CCC: " << globalTestSize[0] << "\n";
resultCL = clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLConvolution, 1, NULL,
                                  globalTestSize, localTestSize, 0, NULL, NULL);
if (resultCL != CL_SUCCESS)
{
throw(std::string("CallKernel()::Error: Enqueue kernel onto command queue. (clEnqueueNDRangeKernel)"));
}
Also here is my kernel:
// OpenCL kernel source: naive O(N^2) circular (discrete) convolution.
// Each work-item n computes one output sample:
//   output[n] = sum over j of input[j] * response[(j + n) mod length]
// The string contents below are compiled by the OpenCL runtime and must not
// be edited casually.
const char* OpenCLSource[] =
{
// `length` arrives as a one-element __global buffer rather than a scalar
// kernel argument — presumably to match the host's clSetKernelArg calls
// (not visible here); verify before changing the signature.
"__kernel void ConvolutionGPU(__global float* outputSignalArray, __global float* inputSignalArray,__global float* responseSignalArray, __global int* length)",
"{",
"    unsigned int n = get_global_id(0);",
// The bounds check makes the padded global size safe: work-items with
// n >= length (launched only to round up to the work-group size) do nothing.
"    if(n < *length)",
"    { ",
"        float accumulator = 0.0f;",
// Every work-item reads the full input and response arrays; no __local
// memory or barriers are used, which is why local_work_size has little
// effect on throughput here.
"        for(int j = 0; j < *length; j++)",
"        {",
"            accumulator += inputSignalArray[j] * responseSignalArray[(j+n) % (*length)];",
"        }",
"        outputSignalArray[n] = accumulator;",
"    }",
"}"
};