What could be wrong with this kernel? code provided

this kernel runs OK on CPU on AMD platform.
But when I try to run on GT9400 it brings incorrect data.

kernel takes 32k array of complex elements, transforms it 32 time using another 16*32k arrays of complex float and writes 32 arrays of 32k complex elements.
then I read one of such 32k complex arrays into host memory.
on CPU it works OK, all 32k elements computed correctly.
On GPU i get array with correct first 8k elements. Next 1/4 of array just identical with first 1/4, next too.
That is, instead of computing 32k of complexes it computes only 8k and repeats it 4 times.
Something wrong with offsets? But what and why CPU handles it correctly?..

kernel:

__kernel void dechirp_range_kernel(__constant float* gpu_data ,
__constant float* gpu_chirps,
__global float* gpu_dechirped)
{
uint tid = get_global_id(0);
//R: each work item will process 2 complex data elements and write 2216 new complex elements into dechirped array
float4 data=vload4(tid,gpu_data);
float4 cur_chirp;
float4 cur_dechirp;
for(uint i=0;i<16;i++){//R: can be optimized via mad instruction probably
cur_chirp=vload4(i*(32768/2)+tid,gpu_chirps);
//negative sign
cur_dechirp.x = data.xcur_chirp.x - data.ycur_chirp.y;
cur_dechirp.y = data.ycur_chirp.x + data.xcur_chirp.y;
cur_dechirp.z = data.zcur_chirp.z - data.wcur_chirp.w;
cur_dechirp.w = data.wcur_chirp.z + data.zcur_chirp.w;
vstore4(cur_dechirp,(2i+0)(32768/2)+tid,gpu_dechirped);
//positive sign
cur_dechirp.x = data.xcur_chirp.x + data.ycur_chirp.y;
cur_dechirp.y = data.ycur_chirp.x - data.xcur_chirp.y;
cur_dechirp.z = data.zcur_chirp.z + data.wcur_chirp.w;
cur_dechirp.w = data.wcur_chirp.z - data.zcur_chirp.w;
vstore4(cur_dechirp,(2i+1)(32768/2)+tid,gpu_dechirped);
}
}

how it called:

cl_event events[2];
size_t globalThreads[1];
globalThreads[0] = 32768/2;//R: each work item works with 2 data elements
err = clEnqueueNDRangeKernel(
		     cq,
             dechirp_range_kernel,
             1,//R: 1D execution domain used, each work item works with 2 data elements
             NULL,
             globalThreads,
             NULL,//R: no workgroups requred
             0,
             NULL,
             &events[0]);
if(err != CL_SUCCESS) 
{ 
	fprintf(stderr,
		"ERROR: Enqueueing kernel onto command queue. \
		(dechirp_range_kernel)\n");
	switch(err){
		case CL_INVALID_PROGRAM_EXECUTABLE:fprintf(stderr,"ERROR code: CL_INVALID_PROGRAM_EXECUTABLE \n");break;
		case CL_INVALID_COMMAND_QUEUE:fprintf(stderr,"ERROR code: CL_INVALID_COMMAND_QUEUE \n");break;
		case CL_INVALID_KERNEL:fprintf(stderr,"ERROR code: CL_INVALID_KERNEL \n");break;
		case CL_INVALID_CONTEXT:fprintf(stderr,"ERROR code: CL_INVALID_CONTEXT \n");break;
		case CL_INVALID_KERNEL_ARGS:fprintf(stderr,"ERROR code: CL_INVALID_KERNEL_ARGS \n");break;
		case CL_INVALID_WORK_DIMENSION:fprintf(stderr,"ERROR code: CL_INVALID_WORK_DIMENSION \n");break;
		case CL_INVALID_WORK_GROUP_SIZE:fprintf(stderr,"ERROR code: CL_INVALID_WORK_GROUP_SIZE \n");break;
		case CL_INVALID_WORK_ITEM_SIZE:fprintf(stderr,"ERROR code: CL_INVALID_WORK_ITEM_SIZE \n");break;
		case CL_INVALID_GLOBAL_OFFSET:fprintf(stderr,"ERROR code: CL_INVALID_GLOBAL_OFFSET \n");break;
		case CL_OUT_OF_RESOURCES:fprintf(stderr,"ERROR code: CL_OUT_OF_RESOURCES \n");break;
		case CL_DEVICE_MAX_READ_IMAGE_ARGS:fprintf(stderr,"ERROR code: CL_DEVICE_MAX_READ_IMAGE_ARGS \n");break;
		case CL_MEM_OBJECT_ALLOCATION_FAILURE:fprintf(stderr,"ERROR code: CL_MEM_OBJECT_ALLOCATION_FAILURE \n");break;
		case CL_OUT_OF_HOST_MEMORY:fprintf(stderr,"ERROR code: CL_OUT_OF_HOST_MEMORY \n");break;
		default:fprintf(stderr,"ERROR code: unknown, %d \n",err);break;
	}
}
err |= clFinish(cq);

No ideas? Should it be reported as bug?
CPU build works just fine with this kernel…

I rewrote same kernel differently, now 2D NDrange used, but no loop inside kernel itself… but absolute the same result. Firts 8192 elements computed correctly, next just repeat of first 8k…

What am I missing??? External Image

__kernel void dechirp_range_kernel_NV(__constant float* gpu_data ,

							 __constant float* gpu_chirps,

							 __global   float* gpu_dechirped)

{

	uint tid = get_global_id(0);

	uint chirp_id=get_global_id(1);

	float4 data=vload4(tid,gpu_data);

	float4 cur_chirp;

	float4 cur_dechirp;

		cur_chirp=vload4(chirp_id*(32768/2)+tid,gpu_chirps);

		//negative sign

		cur_dechirp.x = data.x*cur_chirp.x - data.y*cur_chirp.y;

		cur_dechirp.y = data.y*cur_chirp.x + data.x*cur_chirp.y;

		cur_dechirp.z = data.z*cur_chirp.z - data.w*cur_chirp.w;

		cur_dechirp.w = data.w*cur_chirp.z + data.z*cur_chirp.w;

		vstore4(cur_dechirp,(2*chirp_id+0)*(32768/2)+tid,gpu_dechirped);	

		//positive sign

		cur_dechirp.x = data.x*cur_chirp.x + data.y*cur_chirp.y;

		cur_dechirp.y = data.y*cur_chirp.x - data.x*cur_chirp.y;

		cur_dechirp.z = data.z*cur_chirp.z + data.w*cur_chirp.w;

		cur_dechirp.w = data.w*cur_chirp.z - data.z*cur_chirp.w;

		vstore4(cur_dechirp,(2*chirp_id+1)*(32768/2)+tid,gpu_dechirped);	

}

Do you put your data in __constant memory? Remember on NVIDIA’s GPU, __constant memory is limited to 32KB. You can get the size limit of __constant memory by querying the OpenCL implementation (IIRC ATI has the same limitation on GPU).

AFAIK currently __constant modifier for pointers implemented just as __global one, but I will try to change them to global, thanks for suggestion!

Apparently not.

The change from __constant to __global solved this problem completely, thanks a lot again!!!

But why kernel didn’t just fail with out of resources or smth like this ?

It’s easy to detect that __constant is too big if it’s declared directly in the kernel, i.e.

__constant float data[100000] = { … };

This can be easily spotted by the compiler.

However, in the use of

void func(__constant float* data, …)

It’s much harder because there is no indication of size in the source code. Also it’s not possible to detect from the memory allocation call or memory transfer call because OpenCL does not differentiate between normal memory allocation/copy and constant memory allocation/copy. So the only way to detect that is to check whether a memory object is passed as an argument with __constant specifier. But I’m not sure how it’s defined in OpenCL.