Hi,
I have a problem with the following kernel code:
// Visit every (x, y) with kernelradius <= x, y <= nbNiveaux - kernelradius - 1,
// i.e. every position that keeps a margin of kernelradius inside 0..nbNiveaux-1.
for(int x = kernelradius; x <= nbNiveaux - kernelradius - 1; x++)
{
for(int y = kernelradius; y <= nbNiveaux - kernelradius - 1; y++)
{
sommediv = 0; // reset for each (x, y); not read again within this fragment
// Flat offset of element (x, y): a base term built from i, j, d and nbNiveaux,
// plus x + y * d.
unsigned int index_cmat = __umul24(i, nbNiveaux) + __umul24(__umul24(j, d), nbNiveaux) + x + __umul24(y , d); // (x,y)
float cmatxy = Cmats[index_cmat]; // loaded but not used within this fragment
// Walk the (2 * kernelradius + 1) x (2 * kernelradius + 1) neighbourhood of (x, y).
for(int kx = -kernelradius; kx <= kernelradius; kx++)
{
for(int ky = -kernelradius; ky <= kernelradius; ky++)
{
dx = x + kx;
dy = y + ky;
// Same offset formula as index_cmat, evaluated at the neighbour (dx, dy).
unsigned int index_cmatf = __umul24(i, nbNiveaux) + __umul24(__umul24(j, d), nbNiveaux) + dx + __umul24(dy , d); // (dx,dy)
// Store the filter coefficient for offset (kx, ky) at the neighbour's position.
// h is indexed with (kernelradius + kx) as the column and (kernelradius + ky)
// as the row, using largeurFiltre[0] as the row stride.
// NOTE(review): when kernelradius > 0 the neighbourhoods of adjacent (x, y)
// overlap, so the same CmatsF element is overwritten several times with
// different coefficients -- confirm this is intended.
// NOTE(review): the h index reaches 2 * kernelradius * (1 + largeurFiltre[0]);
// presumably largeurFiltre[0] == 2 * kernelradius + 1 and h holds that many
// elements squared -- verify, since an out-of-range read here would be one
// possible explanation for the reported crash.
CmatsF[index_cmatf] = h[kernelradius + kx + (kernelradius + ky) * largeurFiltre[0]];
}
}
}
}
CmatsF and h are arrays in global memory. When I read elements of h outside the inner loops ("for(int kx …)" and "for(int ky …)") it works perfectly, but when I read them inside those loops the kernel (or CUDA itself) seems to crash and global memory is cleared. :unsure:
Can someone explain to me why I can't access h inside these loops?
Thanks