Hi guys :D

I am trying to use CUDA to make my computation as fast as possible,

but the result is that my CUDA program is too slow.

Someone on the forum discussed coalescing, but I still don't know how to use it. :'(

Please give me your ideas.

This is how I call the kernel function:

// Grid of 38 x 1 thread blocks.
dimGrid=(38,1)

// 352 x 1 x 1 threads per block (352 = 11 * 32).
dimBlock=(352,1,1)

// Launch the kernel. The arguments map, in order, to the kernel parameters
// (templateX, templateY, sourceX, sourceF, templateF, positionData, targetF).
// NOTE(review): the *Gpu arguments are presumably device pointers - confirm.
Subtraction<<<dimGrid,dimBlock>>>(templateCol,templateRow,sourceCol,sourceGpu,templateGpu,positionGpu,targetGpu);

And this is the kernel function:

/*
 * Subtraction: per-pixel inverted absolute difference between a template
 * image and a window of a source image.
 *
 * For every active block, the window origin (x, y) is read from
 * positionData[2*bx] and positionData[2*bx + 1]. Each active thread handles
 * one column (tx) of the window and walks down templateY rows, writing
 *     targetF[p] = 255 - |templateF[tx + i*templateX] - sourceF[p]|
 * where p = i*sourceX + start + tx and start = x + (y + 1)*sourceX.
 *
 * Parameters:
 *   templateX    - row stride used to index templateF
 *   templateY    - number of rows processed per thread
 *   sourceX      - row stride used to index both sourceF and targetF
 *   sourceF      - source image bytes (read only)
 *   templateF    - template image bytes (read only)
 *   positionData - interleaved (x, y) pairs, one pair per block index
 *   targetF      - output bytes, indexed like sourceF
 *
 * Launch layout: 1-D grid and 1-D block. Blocks 0-7, 11, 15, 21, 31 and 35
 * and threads with threadIdx.x > 348 do no work.
 *
 * No __syncthreads() is used: the kernel has no shared memory and no
 * inter-thread dependency, and a barrier inside the guard below would not be
 * reached by every thread of the block.
 *
 * NOTE(review): all loads and stores are single bytes starting at an
 * arbitrary offset (start), so they are not guaranteed to be aligned. That
 * is the likely reason the profiler reports only incoherent accesses.
 * Consider aligned rows (cudaMallocPitch) and wider per-thread words, or
 * staging through shared memory - verify against the target device.
 */
__global__ void Subtraction(int templateX, int templateY, long sourceX,
                            unsigned char *sourceF, unsigned char *templateF,
                            int *positionData, unsigned char *targetF)
{
    int bx = blockIdx.x;
    int tx = threadIdx.x;

    if (bx > 7 && bx != 11 && bx != 15 && bx != 21 && bx != 31 && bx != 35 && tx <= 348)
    {
        int x = positionData[bx * 2];
        int y = positionData[bx * 2 + 1];

        /* Linear offset of the first processed row; kept in long to match sourceX. */
        long start = x + (y + 1) * sourceX;

        for (int i = 0; i < templateY; i++)
        {
            /* Same element is read from the source and written to the target. */
            long p = i * sourceX + start + tx;
            targetF[p] = 255 - abs(templateF[tx + i * templateX] - sourceF[p]);
        }
    }
}

When I use the CUDA Visual Profiler to optimize, I get these counters:

gld_coherent =0;

gld_incoherent =2884136;

gst_coherent=0;

gst_incoherent=2881344;