Here’s the full source code for the kernel.

/*
 * Convolution
 *
 * Applies a 9-tap weighted filter to a frame of short2 elements. Each thread
 * produces one output element: the centre sample is weighted 13, four
 * neighbouring samples are weighted 10 and four more are weighted 1; the sum
 * is divided by 57 (= 13 + 4*10 + 4*1) to normalise it.
 *
 * Parameters:
 *   inFrame  - input frame, read with a row stride of DIFFROWWIDTH elements.
 *   outFrame - output frame, written with a row stride of `pitch` bytes.
 *   pitch    - row pitch of outFrame in BYTES (it is divided by sizeof(short2)).
 *
 * Launch requirements (derived from the indexing below):
 *   - The block must be BLOCKSIZE x BLOCKSIZE threads: the halo loads are keyed
 *     on threadIdx == BLOCKSIZE - 1 and the shared tile is (BLOCKSIZE+2)^2.
 *   - Static shared memory: (BLOCKSIZE+2)*(BLOCKSIZE+2)*sizeof(short2) bytes.
 *   - There is no bounds check; the grid must cover the frame exactly and
 *     inFrame must have at least one valid element around every block.
 *     NOTE(review): the leading offsets (16 columns, 1 row) presumably skip
 *     padding in the input allocation - confirm against the host-side code.
 */
__global__ void
Convolution(short2* inFrame, short2* outFrame, int pitch)
{
    // Elements per row of the shared tile: the core block plus a one-element
    // halo on the left and on the right.
    const int tileWidth = BLOCKSIZE + 2;

    __shared__ short2 shared[(BLOCKSIZE + 2) * (BLOCKSIZE + 2)];

    // Position of this thread's element inside the shared tile (offset by the
    // one-element halo in both dimensions).
    int sharedpos = 1 + threadIdx.x + (threadIdx.y + 1) * tileWidth;

    // Position of this thread's element inside the input frame.
    int globalpos = 16 + blockIdx.x * BLOCKSIZE + threadIdx.x;
    globalpos += (1 + threadIdx.y + blockIdx.y * BLOCKSIZE) * DIFFROWWIDTH;

    // read in the core block
    shared[sharedpos] = inFrame[globalpos];

    // read in the top row of the halo
    if (threadIdx.y == 0)
    {
        shared[1 + threadIdx.x] = inFrame[globalpos - DIFFROWWIDTH];
    }

    // read in the bottom row of the halo
    if (threadIdx.y == BLOCKSIZE - 1)
    {
        shared[(BLOCKSIZE + 1) * tileWidth + 1 + threadIdx.x] = inFrame[globalpos + DIFFROWWIDTH];
    }

    // read in the left column of the halo
    if (threadIdx.x == 0)
    {
        shared[(1 + threadIdx.y) * tileWidth] = inFrame[globalpos - 1];
    }

    // read in the right column of the halo
    if (threadIdx.x == BLOCKSIZE - 1)
    {
        shared[(1 + threadIdx.y) * tileWidth + (tileWidth - 1)] = inFrame[globalpos + 1];
    }

    // read in the four corners of the halo
    if (threadIdx.x == 0 && threadIdx.y == 0)
    {
        shared[0] = inFrame[globalpos - (DIFFROWWIDTH + 1)];
    }

    if (threadIdx.x == BLOCKSIZE - 1 && threadIdx.y == 0)
    {
        shared[tileWidth - 1] = inFrame[globalpos - (DIFFROWWIDTH - 1)];
    }

    if (threadIdx.x == 0 && threadIdx.y == BLOCKSIZE - 1)
    {
        shared[(BLOCKSIZE + 1) * tileWidth] = inFrame[globalpos + (DIFFROWWIDTH - 1)];
    }

    if (threadIdx.x == BLOCKSIZE - 1 && threadIdx.y == BLOCKSIZE - 1)
    {
        shared[(BLOCKSIZE + 2) * (BLOCKSIZE + 2) - 1] = inFrame[globalpos + (DIFFROWWIDTH + 1)];
    }

    // Every thread reads neighbours written by other threads, so the whole
    // tile must be populated before any thread starts filtering. The barrier
    // is outside all conditionals so every thread in the block reaches it.
    __syncthreads();

    int result;
    short2 tmp;

    // calculate tmp.x: centre .x (13), the .y to its left and its own .y plus
    // the .x above and below (10 each), and four .y samples from the rows
    // above and below (1 each).
    result  = __mul24(13, shared[sharedpos].x);
    result += __mul24(10, shared[sharedpos - 1].y);
    result += __mul24(10, shared[sharedpos].y);
    result += __mul24(10, shared[sharedpos - tileWidth].x);
    result += __mul24(10, shared[sharedpos + tileWidth].x);
    result += shared[sharedpos - tileWidth].y;
    result += shared[sharedpos - tileWidth - 1].y;
    result += shared[sharedpos + tileWidth - 1].y;
    result += shared[sharedpos + tileWidth].y;
    result /= 57;
    tmp.x = (short)result;

    // calculate tmp.y: centre .y (13), its own .x and the .x to its right plus
    // the .y above and below (10 each), and four .x samples from the rows
    // above and below (1 each).
    result  = __mul24(13, shared[sharedpos].y);
    result += __mul24(10, shared[sharedpos].x);
    result += __mul24(10, shared[sharedpos + 1].x);
    result += __mul24(10, shared[sharedpos - tileWidth].y);
    result += __mul24(10, shared[sharedpos + tileWidth].y);
    result += shared[sharedpos - tileWidth].x;
    result += shared[sharedpos - tileWidth + 1].x;
    result += shared[sharedpos + tileWidth].x;
    result += shared[sharedpos + tileWidth + 1].x;
    result /= 57;
    tmp.y = (short)result;

    // assign result to outFrame; `pitch` is in bytes, so convert the row
    // offset to short2 elements before adding the column.
    globalpos = (threadIdx.y + blockDim.y * blockIdx.y) * pitch / sizeof(short2);
    globalpos += (threadIdx.x + blockIdx.x * BLOCKSIZE);

    outFrame[globalpos] = tmp;
}