Hi,
I have just started programming using CUDA.
My host program calls kernel inside a loop. For small number of iterations everything is OK but for large number of iterations the kernel crashes.
My Kernel is :
device float gKernelValue[5][5] =
{
{ 1.0, 1.0, 1.0, 1.0, 1.0 },
{ 1.0, 1.0, 1.0, 1.0, 1.0 },
{ 1.0, 1.0, 1.0, 1.0, 1.0 },
{ 1.0, 1.0, 1.0, 1.0, 1.0 },
{ 1.0, 1.0, 1.0, 1.0, 1.0 }
};
device float convol(unsigned char *buf, int width, int height, int row, int col)
{
float ret = 0.0f;
for (int check = 0; check < 3; ++check)
{
ret = 0.0f;
for (int i = 0; i < 5; ++i)
{
for (int j = 0; j < 5; ++j)
{
int data = 0;
int tmprow = row + i;
int tmpcol = col + j;
if (tmprow >= 0 && tmpcol >= 0 && tmprow < height && tmpcol < width)
{
int index = width * tmprow + tmpcol;
if (index >= 0 && index < (width * height))
data = (int)buf[index];
}
ret += gKernelValue[i][j] * (float)data;
}
}
if (ret > 255.0f)
ret = 255.0f;
}
return ret;
}
global
void My_kernel(unsigned char *buf, unsigned long long width, unsigned long long height , float *data)
{
int tmprow = blockDim.y * blockIdx.y + threadIdx.y;
int tmpcol = blockDim.x * blockIdx.x + threadIdx.x;
int index = width * tmprow + tmpcol;
if (tmprow < height && tmpcol < width)
{
float org[5][5];
for (int i = 0; i < 5; ++i)
{
for (int j = 0; j < 5; ++j)
{
org[i][j] = 0.f;
int r = blockDim.y * blockIdx.y + threadIdx.y + i;
int c = blockDim.x * blockIdx.x + threadIdx.x + j;
if (r >= 0 && c >= 0 && r < height && c < width)
{
org[i][j] = convol(buf, width, height, r, c);
}
}
}
data[index] = org[2][2];
}
}
and My host program is
void myfunc(unsigned char *buf, unsigned long long width, unsigned long long height)
{
dim3 threads, grid;
threads = dim3(16, 16);
grid = dim3((1 + width / threads.x), (1 + height / threads.y));
thrust::device_vector<float> d_ret(width * height, 0.f);
for (int iter = 0; iter < 100; ++iter)
My_kernel << < grid, threads >> >(buf, width, height, thrust::raw_pointer_cast(&d_ret[0]));
}
Please let me know what I am missing here.