This kernel runs OK on the CPU with the AMD platform.
But when I run it on a GT9400 it produces incorrect data.
The kernel takes a 32k array of complex elements, transforms it 32 times using another 16*32k array of complex floats, and writes 32 arrays of 32k complex elements.
Then I read one of these 32k complex arrays back into host memory.
On the CPU it works OK: all 32k elements are computed correctly.
On the GPU the first 8k elements of the array are correct, but the next quarter of the array is identical to the first quarter, and so are the quarters after that.
That is, instead of computing 32k complex values it computes only 8k and repeats them 4 times.
Is something wrong with the offsets? But what, and why does the CPU handle it correctly?
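To make the symptom concrete, a host-side check like the sketch below can verify whether the array read back from the GPU really is four copies of its first quarter (the buffer name host_result and the layout of 32768 interleaved re/im floats are my assumptions, not from the original code):

#include <stdio.h>
#include <string.h>

/* host_result: one 32k-complex result array read back from the GPU,
   stored as 32768 * 2 = 65536 interleaved floats (re, im, re, im, ...) */
void check_quarters(const float *host_result)
{
    const size_t quarter = (32768 * 2) / 4;   /* floats per quarter of the array */
    for (int q = 1; q < 4; q++) {
        int same = memcmp(host_result,
                          host_result + q * quarter,
                          quarter * sizeof(float)) == 0;
        printf("quarter %d %s quarter 0\n", q, same ? "is identical to" : "differs from");
    }
}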
kernel:
__kernel void dechirp_range_kernel(__constant float* gpu_data,
                                   __constant float* gpu_chirps,
                                   __global float* gpu_dechirped)
{
    uint tid = get_global_id(0);
    //R: each work item will process 2 complex data elements and write 2*2*16 new complex elements into dechirped array
    float4 data = vload4(tid, gpu_data);
    float4 cur_chirp;
    float4 cur_dechirp;
    for(uint i = 0; i < 16; i++){//R: can be optimized via mad instruction probably
        cur_chirp = vload4(i*(32768/2) + tid, gpu_chirps);
        //negative sign
        cur_dechirp.x = data.x*cur_chirp.x - data.y*cur_chirp.y;
        cur_dechirp.y = data.y*cur_chirp.x + data.x*cur_chirp.y;
        cur_dechirp.z = data.z*cur_chirp.z - data.w*cur_chirp.w;
        cur_dechirp.w = data.w*cur_chirp.z + data.z*cur_chirp.w;
        vstore4(cur_dechirp, (2*i + 0)*(32768/2) + tid, gpu_dechirped);
        //positive sign
        cur_dechirp.x = data.x*cur_chirp.x + data.y*cur_chirp.y;
        cur_dechirp.y = data.y*cur_chirp.x - data.x*cur_chirp.y;
        cur_dechirp.z = data.z*cur_chirp.z + data.w*cur_chirp.w;
        cur_dechirp.w = data.w*cur_chirp.z - data.z*cur_chirp.w;
        vstore4(cur_dechirp, (2*i + 1)*(32768/2) + tid, gpu_dechirped);
    }
}
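For reference, this is the arithmetic the kernel performs, written as a plain host-side C sketch that can be run on the same input to compare results element by element (the pointer names and the assumption of interleaved re/im complex floats are mine, not from the original code):

/* Reference version of dechirp_range_kernel for validating the GPU output.
   data:      32768 complex floats (interleaved re/im)
   chirps:    16 * 32768 complex floats
   dechirped: 32 * 32768 complex floats; chirp i fills output arrays 2*i and 2*i+1 */
void dechirp_range_cpu(const float *data, const float *chirps, float *dechirped)
{
    for (unsigned i = 0; i < 16; i++) {
        for (unsigned j = 0; j < 32768; j++) {
            float dre = data[2*j],               dim = data[2*j + 1];
            float cre = chirps[2*(i*32768 + j)], cim = chirps[2*(i*32768 + j) + 1];
            /* negative sign: data * chirp */
            dechirped[2*((2*i + 0)*32768 + j)]     = dre*cre - dim*cim;
            dechirped[2*((2*i + 0)*32768 + j) + 1] = dim*cre + dre*cim;
            /* positive sign: data * conjugate(chirp) */
            dechirped[2*((2*i + 1)*32768 + j)]     = dre*cre + dim*cim;
            dechirped[2*((2*i + 1)*32768 + j) + 1] = dim*cre - dre*cim;
        }
    }
}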
How it is called:
cl_event events[2];
size_t globalThreads[1];
globalThreads[0] = 32768/2;//R: each work item works with 2 data elements
err = clEnqueueNDRangeKernel(
          cq,
          dechirp_range_kernel,
          1,//R: 1D execution domain used, each work item works with 2 data elements
          NULL,
          globalThreads,
          NULL,//R: no workgroups required
          0,
          NULL,
          &events[0]);
if(err != CL_SUCCESS)
{
    fprintf(stderr,
            "ERROR: Enqueueing kernel onto command queue. "
            "(dechirp_range_kernel)\n");
    switch(err){
        case CL_INVALID_PROGRAM_EXECUTABLE:fprintf(stderr,"ERROR code: CL_INVALID_PROGRAM_EXECUTABLE \n");break;
        case CL_INVALID_COMMAND_QUEUE:fprintf(stderr,"ERROR code: CL_INVALID_COMMAND_QUEUE \n");break;
        case CL_INVALID_KERNEL:fprintf(stderr,"ERROR code: CL_INVALID_KERNEL \n");break;
        case CL_INVALID_CONTEXT:fprintf(stderr,"ERROR code: CL_INVALID_CONTEXT \n");break;
        case CL_INVALID_KERNEL_ARGS:fprintf(stderr,"ERROR code: CL_INVALID_KERNEL_ARGS \n");break;
        case CL_INVALID_WORK_DIMENSION:fprintf(stderr,"ERROR code: CL_INVALID_WORK_DIMENSION \n");break;
        case CL_INVALID_WORK_GROUP_SIZE:fprintf(stderr,"ERROR code: CL_INVALID_WORK_GROUP_SIZE \n");break;
        case CL_INVALID_WORK_ITEM_SIZE:fprintf(stderr,"ERROR code: CL_INVALID_WORK_ITEM_SIZE \n");break;
        case CL_INVALID_GLOBAL_OFFSET:fprintf(stderr,"ERROR code: CL_INVALID_GLOBAL_OFFSET \n");break;
        case CL_OUT_OF_RESOURCES:fprintf(stderr,"ERROR code: CL_OUT_OF_RESOURCES \n");break;
        case CL_INVALID_EVENT_WAIT_LIST:fprintf(stderr,"ERROR code: CL_INVALID_EVENT_WAIT_LIST \n");break;
        case CL_MEM_OBJECT_ALLOCATION_FAILURE:fprintf(stderr,"ERROR code: CL_MEM_OBJECT_ALLOCATION_FAILURE \n");break;
        case CL_OUT_OF_HOST_MEMORY:fprintf(stderr,"ERROR code: CL_OUT_OF_HOST_MEMORY \n");break;
        default:fprintf(stderr,"ERROR code: unknown, %d \n",err);break;
    }
}
err |= clFinish(cq);
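For completeness, the read-back of one of the 32 result arrays looks roughly like the sketch below (the cl_mem name gpu_dechirped_buf, the index n of the array being read, and the use of a blocking read are my assumptions; the real code may differ):

/* Hypothetical read-back of result array n (0..31) from the dechirped buffer */
static cl_float result[32768 * 2];   /* one 32k complex array, interleaved re/im */
size_t n = 0;                        /* which of the 32 output arrays to read */
err = clEnqueueReadBuffer(cq,
                          gpu_dechirped_buf,                  /* cl_mem bound to gpu_dechirped */
                          CL_TRUE,                            /* blocking read */
                          n * 32768 * 2 * sizeof(cl_float),   /* byte offset of array n */
                          sizeof(result),                     /* 32768 complex floats */
                          result,
                          0,
                          NULL,
                          &events[1]);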