I have traced the problem down to one specific line of code.
Here is the full kernel. In the brackets following the if(false) statement is a a first version of the kernel which runs fine. I want to coalesce the atomicAdds now (access conflicts are rare since there are about 20times more array elements than threads). The second part runs fine as well,except if I reintroduce the atomicAdd (which is commented out right now). Directly above I also check if I have an access outside of the array dimension, but that is not the case.
If anyone got an idea please let me know.
Ceearem
__global__ void make_rho_kernel(int* flag,int read_threads_at_same_time)
{
int i,l,m,n,nx,ny,nz,mx,my,mz,a,b;
// clear 3d density array
// loop over my charges, add their contribution to nearby grid points
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
// (dx,dy,dz) = distance to "lower left" grid pt
// (mx,my,mz) = global coords of moving stencil pt
// int nzxy=blockIdx.x*gridDim.y+blockIdx.y;
// i=pppm_grid_ids[nzyx*blockDim.x+threadIdx.x];
int nelements=nupper-nlower+1;
int* idx=(int*) sharedmem;
int* sdensity_brick_int=&idx[blockDim.x];
PPPM_FLOAT* srho_coeff=(PPPM_FLOAT*) &sdensity_brick_int[nelements*blockDim.x];
if(threadIdx.x<order*(order/2-(1-order)/2+1))
srho_coeff[threadIdx.x]=rho_coeff[threadIdx.x];
__syncthreads();
i=blockIdx.x*blockDim.x+threadIdx.x;
if(false)
{
if(i < nlocal) {
PPPM_FLOAT dx,dy,dz,x0,y0,z0;
nx = part2grid[i];
ny = part2grid[i+nmax];
nz = part2grid[i+2*nmax];
dx = nx+shiftone - (_x[i]-_boxlo.x)*delxinv;
dy = ny+shiftone - (_x[i+nmax]-_boxlo.y)*delyinv;
dz = nz+shiftone - (_x[i+2*nmax]-_boxlo.z)*delzinv;
z0 = delxinv*delyinv*delzinv * _q[i];
for (n = nlower; n <= nupper; n++)
{
mz = n+nz;
y0 = z0*rho1d(n,dz,srho_coeff);
for (m = nlower; m <= nupper; m++)
{
my = m+ny;
x0 = y0*rho1d(m,dy,srho_coeff);
for (l = nlower; l <= nupper; l++)
{
mx = l+nx;
int mzyx=((mz-nzlo_out)*(nyhi_out-nylo_out+1)+my-nylo_out)*(nxhi_out-nxlo_out+1)+mx-nxlo_out;
a=int(x0*rho1d(l,dx,srho_coeff)*density_intScale);
b=(atomicAdd(&density_brick_int[mzyx],a)|a);
if(((b)&(0x7c000000))&&(not((b)&(0x80000000))))
{
flag[1]++;
if((b)&(0x60000000)) flag[0]++;
}
__syncthreads();
}
}
}
}
return;
}
i=blockIdx.x*blockDim.x+threadIdx.x;
{
PPPM_FLOAT dx,dy,dz,x0,y0,z0;
if(i < nlocal)
{
nx = part2grid[i];
ny = part2grid[i+nmax];
nz = part2grid[i+2*nmax];
dx = nx+shiftone - (_x[i]-_boxlo.x)*delxinv;
dy = ny+shiftone - (_x[i+nmax]-_boxlo.y)*delyinv;
dz = nz+shiftone - (_x[i+2*nmax]-_boxlo.z)*delzinv;
z0 = delxinv*delyinv*delzinv * _q[i];
}
else
{
nx=ny=nz=1; dx=dy=dz=0.1;
}
__syncthreads();
for (n = nlower; n <= nupper; n++)
{
mz = n+nz;
y0 = z0*rho1d(n,dz,srho_coeff);
for (m = nlower; m <= nupper; m++)
{
//my = m+ny;
x0 = y0*rho1d(m,dy,srho_coeff);
if(i<nlocal)
{
idx[threadIdx.x]=((mz-nzlo_out)*(nyhi_out-nylo_out+1)+my-nylo_out)*(nxhi_out-nxlo_out+1)+nx+nlower-nxlo_out;
for (l = nlower; l <= nupper; l++)
{
sdensity_brick_int[threadIdx.x*nelements+l-nlower]=int(x0*rho1d(l,dx,srho_coeff)*density_intScale);
}
}
else idx[threadIdx.x]=-1;
__syncthreads();
// if((idx[threadIdx.x]>=(nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1))||
// (idx[threadIdx.x]<0)) flag[2]++;
for(int ii=0;ii<blockDim.x;ii+=read_threads_at_same_time)
{
int kk=threadIdx.x/nelements;
if((threadIdx.x<nelements*read_threads_at_same_time)&&(kk+ii<blockDim.x)&&(idx[ii+kk]>-1))
{
a=sdensity_brick_int[ii*nelements+threadIdx.x];
if((idx[ii+kk]+threadIdx.x-kk*nelements>=(nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1))||
(idx[ii+kk]+threadIdx.x-kk*nelements<0)) flag[2]++;
//b=(atomicAdd(&density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements],a)|a);
b=a;
if(((b)&(0x7c000000))&&(not((b)&(0x80000000))))
{
flag[1]++;
if((b)&(0x60000000)) flag[0]++;
}
}
}
__syncthreads(); //*/
}
}
}
}