clFlush(ctx->gpu.command_queue);
clEnqueueWriteBuffer(ctx->gpu.command_queue, ctx->gpu_frame.sad, CL_TRUE, 0, sizeof(int)* 8, sad, 0, NULL, NULL);
err = clSetKernelArg(ctx->gpu.kernel,0,sizeof(cl_mem), &ctx->gpu_frame.ref_pad);
err = clSetKernelArg(ctx->gpu.kernel,1,sizeof(cl_mem), &ctx->gpu_frame.org);
err = clSetKernelArg(ctx->gpu.kernel,2,sizeof(cl_int), &x);
err = clSetKernelArg(ctx->gpu.kernel,3,sizeof(cl_int), &y);
err = clSetKernelArg(ctx->gpu.kernel,4,sizeof(cl_int), &mv1);
err = clSetKernelArg(ctx->gpu.kernel,5,sizeof(cl_int), &min_cost);
err = clSetKernelArg(ctx->gpu.kernel,6,sizeof(cl_int), &w);
err = clSetKernelArg(ctx->gpu.kernel,7,sizeof(cl_int), &h);
err = clSetKernelArg(ctx->gpu.kernel,8,sizeof(cl_int), &ctx->picbuf[0].s_l);
err = clSetKernelArg(ctx->gpu.kernel,9,sizeof(cl_int), &ctx->picbuf[0].w_l);
err = clSetKernelArg(ctx->gpu.kernel,10,sizeof(cl_mem), &ctx->gpu_frame.sad);
err = clSetKernelArg(ctx->gpu.kernel,11,sizeof(cl_mem), &ctx->gpu_frame.idx);
err = clFinish(ctx->gpu.command_queue);
while (err != 0)
err = clFinish(ctx->gpu.command_queue);
//err = clEnqueueNDRangeKernel(ctx->gpu.command_queue,ctx->gpu.kernel,
// 1,NULL,&global,0,0,NULL,NULL);
err = clEnqueueTask(ctx->gpu.command_queue,ctx->gpu.kernel,NULL, NULL, NULL);
clFinish(ctx->gpu.command_queue);
while (err != 0)
err = clFinish(ctx->gpu.command_queue);
// copy the results from out of the output buffer
err = clEnqueueReadBuffer(ctx->gpu.command_queue, ctx->gpu_frame.sad, CL_TRUE, 0, sizeof(int) * 8, sad, 0, NULL, NULL);
while (err != 0)
err = clFinish(ctx->gpu.command_queue);
clEnqueueReadBuffer(ctx->gpu.command_queue, ctx->gpu_frame.idx, CL_TRUE, 0, sizeof(unsigned char) * 8, idx, 0, NULL, NULL);
err = clFinish(ctx->gpu.command_queue);
while (err != 0)
err = clFinish(ctx->gpu.command_queue);
kernel code is
__kernel void s264e_me_ipel_sad( __global unsigned char *ref1, __global unsigned char *org, int x, int y, int mv, int min_cost,
int w, int h, int s_ref, int s_org, __global int *sad, __global unsigned char *idx)
{
int i, j, k, t0,org_w;
int mv1, mv2, mv3, mv4;
int id0 = get_global_id(0);
int lsad = 0;
int org_off;
int ref_off, off;
s_ref = 384;
org_w = 320;
off = 32 * s_ref + 32;
mv1 = mv & 0xffff;
mv2 = (mv & 0xffff0000) >> 16;
/* get SAD of out-side positions */
for(k=0; k<4; k++)
{
if(sad[k] < 0)
{
mv3 = mv1 + tbl_diapos_small2[k][0];
mv4 = mv2 + tbl_diapos_small2[k][1];
org_off = y * org_w + x;
ref_off = off + (mv4 * s_ref) + mv3;
lsad = 0;
for(i=0; i<h; i++)
{
for(j=0; j<w; j++)
{
lsad += SCMN_ABS16((short)org[org_off+j] - (short)ref1[ref_off+j]);
}
if(lsad > min_cost)
break;
org_off += org_w;
ref_off += s_ref;
}
sad[k] = lsad;
}
}
/* sorting SAD based on bubble sorting */
for(i=0; i<5; i++)
idx[i] = i;
for(i=0; i<4; i++)
{
for(j=i+1; j<5; j++)
{
if(sad[idx[i]] >= sad[idx[j]])
{
t0 = idx[j];
idx[j] = idx[i];
idx[i] = t0;
}
}
}
}