need Help with Filter example

hey guys,

i need some help with this.

im trying to get this filter kernel running, but it is not working…




#define TILE_W 16

#define TILE_H 16

#define R 1 // Filter radius

#define D (R*2 + 1) // Filter diameter

#define S (D*D) // Filter size

#define BLOCK_W (TILE_W+(2*R))

#define BLOCK_H (TILE_H+(2*R))

global void filter(float *odata, float *idata, int width, int height)


__shared__ float smem[BLOCK_W*BLOCK_H];

int x = blockIdx.x*TILE_W + threadIdx.x - R;

int y = blockIdx.y*TILE_H + threadIdx.y - R;

//clamp to edge of image

x = max(0,x);

x = min(x,width-1);

y = max(y,0);

y = min(y,height-1);

unsigned int index = y*width + x;

unsigned int bindex = threadIdx.y * blockDim.y + threadIdx.x;

//each thread copies its pixel of the block to shared memory

smem[bindex] = idata[index];


//only threads inside apron will write results

if ( (threadIdx.x >= R) && (threadIdx.x < BLOCK_W-R) && (threadIdx.y >= R) && (threadIdx.y < BLOCK_W-R) )


	float sum=0;

	for (int dy=-R; dy<=R; dy++)


		for (int dx=-R; dx<=R; dx++)


			float i = smem[bindex + (dy*blockDim.x) + dx];

			sum += i;








my questions:

whats my block dimension? Is it supposed to be (BLOCK_W,BLOCK_H)?

whats my grid dimension?

lets assume my input picture is the size 640x480, whats the size of my output picture? 638x478 cause of the apron (R 1)?

what i want to do is, take a 640x480 grayscale picture add the apron ( picture is now the size 642x482 ) filter it with this kernel

and have an output picture, which is 640x480 again.

any help or explanation is greatly appreciated.



here’s the rest of my code.

[codebox]// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cutil_inline.h>

// includes, kernels

#include <>


void runTest( int argc, char** argv);


// Program main



main( int argc, char** argv)


runTest( argc, argv);

cutilExit(argc, argv);




runTest( int argc, char** argv)


// size of the matrix

const unsigned int size_x = 72;

const unsigned int size_y = 36;

// size of memory required to store the matrix

const unsigned int mem_size = sizeof(float) * size_x * size_y;

unsigned int timer;


// use command-line specified CUDA device, otherwise use device with highest Gflops/s

if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

	cutilDeviceInit(argc, argv);


	cudaSetDevice( cutGetMaxGflopsDeviceId() );

// allocate host memory

float* h_idata = (float*) malloc(mem_size);

// initalize the memory

for( unsigned int i = 0; i < (size_x * size_y); ++i)


    h_idata[i] = 9.;    // rand(); 


//printing input matrix

for (int j=0; j<((size_x)*(size_y)); j++)


	if (j%(size_x)==0) printf("\n");

	printf(" %d ", (int) h_idata[j]);



// allocate device memory

float* d_idata;

float* d_odata;

cutilSafeCall( cudaMalloc( (void**) &d_idata, mem_size));

cutilSafeCall( cudaMalloc( (void**) &d_odata, mem_size));

// copy host memory to device

cutilSafeCall( cudaMemcpy( d_idata, h_idata, mem_size,

                            cudaMemcpyHostToDevice) );

// setup execution parameters

dim3 grid(size_x / BLOCK_W, size_y / BLOCK_H, 1);

dim3 threads(BLOCK_W, BLOCK_H, 1);

// warmup so we don’t time CUDA startup

filter<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);

// synchronize here, so we make sure that we don't count any time from the asynchronize kernel launches.



filter<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);



float averageTime = cutGetTimerValue(timer);

printf(“average time: %0.3f ms\n\n”, averageTime);

// check if kernel execution generated and error

cutilCheckMsg("Kernel execution failed");

// copy result from device to host

float* h_odata = (float*) malloc(mem_size);

cutilSafeCall( cudaMemcpy( h_odata, d_odata, mem_size,

                            cudaMemcpyDeviceToHost) );

//printing output matrix

for (int j=0; j<((size_x-2)*(size_y-2)); j++)


	if (j%(size_x-2)==0) printf("\n");

	printf(" %0.2f ", h_odata[j]);


// cleanup memory





cutilCheckError( cutDeleteTimer(timer));




ahh i guess i figured it out all by myself…


//setup execution parameters

dim3 grid(size_x/TILE_W, size_y/TILE_H,1);

dim3 threads(BLOCK_W, BLOCK_H,1);


where size_x and size_y need to be multiples of TILE…