I am trying to implement a motion estimation algorithm using OpenCL.
I have written code to calculate the SAD values in OpenCL and tried running it on an NVIDIA GeForce 9400 GT GPU.
I am trying to encode video with a resolution of 320x240. I have allocated two buffers of 320x240x2 bytes each: one for the current frame and the other for the reference frame.
I copied the data to the allocated buffers using clEnqueueWriteBuffer.

The problem I am facing is that the code runs fine for 10 MBs; after that the SAD values are not correct. Sometimes I get the CL_INVALID_COMMAND_QUEUE error from the clEnqueueTask function, and at other times I get the same error from clEnqueueReadBuffer. I am calling clFinish after each operation.

1) Can I allocate memory for the complete frame?
2) Why does the code fail after running properly 10 times?
3) Is memory the only issue, or is there some other issue?
4) Is the problem with the device?

Show the source code.


	// Host-side sequence that launches the SAD kernel once and reads its results back:
	//   1. upload the 8-entry host `sad` array to the device buffer,
	//   2. bind the 12 kernel arguments,
	//   3. enqueue the kernel as a single task,
	//   4. read back the `sad` and `idx` buffers.
	//
	// NOTE(review): error handling in this snippet hides failures. Several return
	// codes are discarded or overwritten before being inspected, and the
	// `while (err != 0)` loops below only retry clFinish: they never report the
	// error, and they do not terminate if clFinish keeps returning non-zero.

	// Blocking write (CL_TRUE) of sizeof(int) * 8 bytes from host `sad` into the
	// device buffer. The return code of this call is not captured.
	clEnqueueWriteBuffer(ctx->gpu.command_queue, ctx->gpu_frame.sad, CL_TRUE, 0, sizeof(int)* 8, sad, 0, NULL, NULL);

	// Kernel arguments 0..11. `err` is overwritten by every call and is not
	// checked between calls, so a failing clSetKernelArg goes unnoticed.
	err = clSetKernelArg(ctx->gpu.kernel,0,sizeof(cl_mem), &ctx->gpu_frame.ref_pad);

	// NOTE(review): the argument expression on the next line is truncated in this
	// listing (`&ctx->;`); the intended buffer cannot be determined from here.
	err = clSetKernelArg(ctx->gpu.kernel,1,sizeof(cl_mem), &ctx->;

	err = clSetKernelArg(ctx->gpu.kernel,2,sizeof(cl_int), &x);

	err = clSetKernelArg(ctx->gpu.kernel,3,sizeof(cl_int), &y);

	err = clSetKernelArg(ctx->gpu.kernel,4,sizeof(cl_int), &mv1);

	err = clSetKernelArg(ctx->gpu.kernel,5,sizeof(cl_int), &min_cost);

	err = clSetKernelArg(ctx->gpu.kernel,6,sizeof(cl_int), &w);

	err = clSetKernelArg(ctx->gpu.kernel,7,sizeof(cl_int), &h);

	err = clSetKernelArg(ctx->gpu.kernel,8,sizeof(cl_int), &ctx->picbuf[0].s_l);

	err = clSetKernelArg(ctx->gpu.kernel,9,sizeof(cl_int), &ctx->picbuf[0].w_l);

	err = clSetKernelArg(ctx->gpu.kernel,10,sizeof(cl_mem), &ctx->gpu_frame.sad);

	err = clSetKernelArg(ctx->gpu.kernel,11,sizeof(cl_mem), &ctx->gpu_frame.idx);

	// Wait for everything queued so far. This assignment discards the result of
	// the last clSetKernelArg call.
	err = clFinish(ctx->gpu.command_queue);

	// Retries clFinish for as long as it returns non-zero; the error code itself
	// is never reported.
			while (err != 0)

		err = clFinish(ctx->gpu.command_queue);

	// Disabled alternative: a 1-D NDRange launch using `global` as the global size.
	//err = clEnqueueNDRangeKernel(ctx->gpu.command_queue,ctx->gpu.kernel,

	//									1,NULL,&global,0,0,NULL,NULL);

	// Enqueue the kernel as a task (executed by a single work-item).
	err = clEnqueueTask(ctx->gpu.command_queue,ctx->gpu.kernel,NULL, NULL, NULL);


	// NOTE(review): if clEnqueueTask failed, its error code is replaced here by
	// the result of clFinish, so the original failure is lost.
			while (err != 0)

		err = clFinish(ctx->gpu.command_queue);

	// Blocking read (CL_TRUE) of sizeof(int) * 8 bytes from the device `sad`
	// buffer back into host `sad`.
	err = clEnqueueReadBuffer(ctx->gpu.command_queue, ctx->gpu_frame.sad, CL_TRUE, 0, sizeof(int) * 8, sad, 0, NULL, NULL);

	// Same pattern as above: a read failure is overwritten by clFinish's result.
	while (err != 0)

		err = clFinish(ctx->gpu.command_queue);

	// Blocking read of sizeof(unsigned char) * 8 bytes from the device `idx`
	// buffer into host `idx`. The return code of this call is not captured.
	clEnqueueReadBuffer(ctx->gpu.command_queue, ctx->gpu_frame.idx, CL_TRUE, 0, sizeof(unsigned char) * 8, idx, 0, NULL, NULL);

	err = clFinish(ctx->gpu.command_queue);

	while (err != 0)

		err = clFinish(ctx->gpu.command_queue);

The kernel code is:

/*
 * Integer-pel SAD kernel: for each of 4 candidate positions around the motion
 * vector `mv`, accumulates the sum of absolute differences between a w x h
 * block of `org` and the displaced block of `ref1`, stores it in sad[k], and
 * then orders the indices 0..4 in `idx` by their SAD value.
 *
 * Parameters, as used by the visible code:
 *   ref1, org  - source pixel buffers (unsigned char).
 *   x, y       - block position; used as y * org_w + x into `org`.
 *   mv         - packed vector: low 16 bits -> mv1, high 16 bits -> mv2.
 *   min_cost   - running SAD is compared against this after each row.
 *   w, h       - block width and height (inner and outer loop bounds).
 *   s_ref      - overwritten with 384 below, so the passed value is ignored.
 *   s_org      - not referenced in the visible code.
 *   sad        - in/out; entries 0..3 are recomputed only when negative.
 *   idx        - out; entries 0..4 receive the sorted candidate indices.
 *
 * NOTE(review): this listing contains no braces at all (function body, loops
 * and if-bodies), so the intended nesting is not shown and is only implied by
 * the indentation. tbl_diapos_small2 and SCMN_ABS16 are referenced but not
 * defined in this listing.
 */
__kernel void s264e_me_ipel_sad( __global unsigned char *ref1, __global unsigned char *org, int x, int y, int mv, int min_cost,

							int w, int h, int s_ref, int s_org, __global int *sad, __global unsigned char *idx)


int     i, j, k, t0,org_w;

int   mv1, mv2, mv3, mv4;

/* id0 is assigned here but not used anywhere in the visible code. */
int id0 = get_global_id(0);

int lsad = 0;

int org_off;

int ref_off, off;

/* Hard-coded strides: these replace the s_ref argument and fix the row
   width of `org` at 320. */
s_ref = 384;

org_w = 320;

/* Base offset into ref1: 32 rows down and 32 columns in.
   Presumably skips a padding border around the reference frame -- TODO confirm. */
off = 32 * s_ref + 32;

/* Unpack the two 16-bit halves of mv. Both results lie in 0..0xffff; no sign
   extension is performed here.
   NOTE(review): negative vector components would need handling elsewhere -- confirm. */
mv1 = mv & 0xffff;

mv2 = (mv & 0xffff0000) >> 16;

/* get SAD of out-side positions */

for(k=0; k<4; k++)


	/* Only entries that are currently negative are recomputed.
	   Presumably the host marks "not yet computed" with a negative value -- confirm. */
	if(sad[k] < 0)


		/* Candidate = centre vector plus the k-th table offset. */
		mv3 = mv1 + tbl_diapos_small2[k][0];

		mv4 = mv2 + tbl_diapos_small2[k][1];

		org_off  = y * org_w + x;

		ref_off  = off + (mv4 * s_ref) + mv3;

		lsad = 0;

		/* Row loop: accumulate |org - ref1| over w pixels, then step both
		   offsets down one row. */
		for(i=0; i<h; i++)


			for(j=0; j<w; j++)


				lsad += SCMN_ABS16((short)org[org_off+j] - (short)ref1[ref_off+j]);


			/* NOTE(review): the statement guarded by this test is not visible
			   in the listing (presumably an early exit from the row loop --
			   confirm against the original source). */
			if(lsad > min_cost)


			org_off += org_w;

			ref_off += s_ref;


		sad[k] = lsad;



/* sorting SAD based on bubble sorting */
/* Covers 5 entries (indices 0..4), while the loop above only writes sad[0..3];
   sad[4] is read here as provided by the caller. */

for(i=0; i<5; i++) 

	idx[i] = i;

/* Pairwise compare-and-swap of index entries so that sad[idx[...]] ends up in
   non-decreasing order. */
for(i=0; i<4; i++)


	for(j=i+1; j<5; j++)


		if(sad[idx[i]] >= sad[idx[j]])


			t0 = idx[j];

			idx[j] = idx[i];

			idx[i] = t0;




Why do you have these “while (err != 0)” loops? Drop them. Instead, you should ALWAYS check for errors and not hide them. Otherwise you will keep getting mysterious errors and wrong results. Make sure you catch the first error and then try to find its cause.

Thanks, Maxim. It was a memory issue and I have solved it.

I have one more doubt.

My kernel function is

__kernel void s264e_me_ipel_sad( __global unsigned char *ref, __global unsigned char *org, int x, int y, int mv, int min_cost)


char ref1;

ref1 = ref + 320 * y + x;

org = ref1[0];

The above statement is not working.

But if I use org = ref[320 * y+x]; it works.


May I know the reason?

Can you please suggest a method to calculate the offset address?

You declared ref1 incorrectly; you should declare it this way:

__global unsigned char *ref1

But I personally never use explicit pointer arithmetic in OpenCL source code.