Kernel crashes when called multiple times (inside a loop)

Hi,

I have just started programming using CUDA.
My host program calls kernel inside a loop. For small number of iterations everything is OK but for large number of iterations the kernel crashes.

My Kernel is :


device float gKernelValue[5][5] =

{
{ 1.0, 1.0, 1.0, 1.0, 1.0 },
{ 1.0, 1.0, 1.0, 1.0, 1.0 },
{ 1.0, 1.0, 1.0, 1.0, 1.0 },
{ 1.0, 1.0, 1.0, 1.0, 1.0 },
{ 1.0, 1.0, 1.0, 1.0, 1.0 }
};

device float convol(unsigned char *buf, int width, int height, int row, int col)
{
float ret = 0.0f;

for (int check = 0; check < 3; ++check)
{
	ret = 0.0f;
	for (int i = 0; i < 5; ++i)
	{
		for (int j = 0; j < 5; ++j)
		{
			int data = 0;
			int tmprow = row + i;
			int tmpcol = col + j;
			if (tmprow >= 0 && tmpcol >= 0 && tmprow < height && tmpcol < width)
			{
				int index = width *  tmprow + tmpcol;
				if (index >= 0 && index < (width * height))
					data = (int)buf[index];
			}

			ret += gKernelValue[i][j] * (float)data;
		}
	}
	if (ret > 255.0f)
		ret = 255.0f;
}

return ret;

}

global
void My_kernel(unsigned char *buf, unsigned long long width, unsigned long long height , float *data)
{
int tmprow = blockDim.y * blockIdx.y + threadIdx.y;
int tmpcol = blockDim.x * blockIdx.x + threadIdx.x;

int index = width *  tmprow +  tmpcol;

if (tmprow < height && tmpcol < width)
{
	float org[5][5];

	for (int i = 0; i <  5; ++i)
	{
		for (int j = 0; j < 5; ++j)
		{
			org[i][j] = 0.f;

			int r = blockDim.y * blockIdx.y + threadIdx.y + i;
			int c = blockDim.x * blockIdx.x + threadIdx.x + j;

			if (r >= 0 && c >= 0 && r < height && c < width)
			{
				org[i][j] = convol(buf, width, height, r, c);
			}
		}
	}

	data[index] = org[2][2];
}

}

and My host program is

void myfunc(unsigned char *buf, unsigned long long width, unsigned long long height)
{
dim3 threads, grid;
threads = dim3(16, 16);
grid = dim3((1 + width / threads.x), (1 + height / threads.y));

thrust::device_vector<float> d_ret(width * height, 0.f);

for (int iter = 0; iter < 100; ++iter)
	My_kernel << < grid, threads >> >(buf, width, height, thrust::raw_pointer_cast(&d_ret[0]));

}

Please let me know what I am missing here.