I have a (slightly) shared-memory-optimized n-body kernel (compiled only with the NVRTC compiler):
R"(
// N-body pairwise acceleration accumulation, ~20 flops per interaction pair.
//
// NVRTC compile-time text substitutions (replaced by the host before compile):
//   constant_hblockN - shared-memory tile width (bodies staged per iteration)
//   constant_n       - total number of bodies
//   constant_dt      - integration time step
//
// Launch assumptions (not checked in-kernel):
//   - gridDim.x * blockDim.x covers all bodies exactly (there is no
//     `i < constant_n` guard — TODO confirm the grid matches the data)
//   - constant_n is a multiple of constant_hblockN, otherwise the
//     j + k staging loads read past the end of the input arrays
//
// NOTE(review): the tile loop starts at j = constant_hblockN, so bodies
// 0..constant_hblockN-1 are never accumulated as sources by this kernel —
// confirm that tile is handled elsewhere (e.g. by the second device or a
// separate launch), or change the loop to start at 0.
extern "C"
__global__
void kernelAcceleration(const float * __restrict__ x, const float * __restrict__ y,
const float * __restrict__ z, float * vx, float * vy, float * vz,
const float * __restrict__ m)
{
    const int i = threadIdx.x + blockDim.x * blockIdx.x; // this thread's body
    const int k = threadIdx.x;                           // staging lane inside the tile

    // Two independent accumulator sets give the compiler ILP across the
    // 2-way unrolled inner loop; they are summed once at the end.
    float fx  = 0.0f, fy  = 0.0f, fz  = 0.0f;
    float fx2 = 0.0f, fy2 = 0.0f, fz2 = 0.0f;

    const float x0 = x[i];
    const float y0 = y[i];
    const float z0 = z[i];

    // Single shared-memory tile of source-body positions and masses.
    // (The original carried an unused `doubleBuffer` toggle; there is only
    // one buffer, which is why BOTH barriers below are required.)
    __shared__ float sx[constant_hblockN];
    __shared__ float sy[constant_hblockN];
    __shared__ float sz[constant_hblockN];
    __shared__ float sm[constant_hblockN];

    for (int j = constant_hblockN; j < constant_n; j += constant_hblockN)
    {
        // Wait until every thread has finished reading the previous tile
        // before it is overwritten.
        __syncthreads();
        if (k < constant_hblockN)
        {
            sx[k] = x[j + k];
            sy[k] = y[j + k];
            sz[k] = z[j + k];
            sm[k] = m[j + k];
        }
        // Wait until the whole tile is staged before anyone consumes it.
        __syncthreads();

        #pragma unroll
        // `t` indexes the tile; renamed from `m`, which shadowed the mass
        // pointer parameter of the same name.
        for (int t = 0; t < constant_hblockN; t += 2)
        {
            const int t1 = t + 1;
            {
                const float dx = sx[t] - x0;
                const float dy = sy[t] - y0;
                const float dz = sz[t] - z0;
                // +0.01f softening keeps r finite for self/near-coincident pairs.
                float r = rsqrtf(dx*dx + dy*dy + dz*dz + 0.01f);
                r *= (r * r) * sm[t];      // m_j / |d|^3
                fx += dx * r;
                fy += dy * r;
                fz += dz * r;
            }
            {
                const float dx = sx[t1] - x0;
                const float dy = sy[t1] - y0;
                const float dz = sz[t1] - z0;
                float r = rsqrtf(dx*dx + dy*dy + dz*dz + 0.01f);
                r *= (r * r) * sm[t1];
                fx2 += dx * r;
                fy2 += dy * r;
                fz2 += dz * r;
            }
        }
    }

    // v += dt * F / m_i  (F was accumulated as m_j-weighted direction terms).
    vx[i] += constant_dt * (fx + fx2) / m[i];
    vy[i] += constant_dt * (fy + fy2) / m[i];
    vz[i] += constant_dt * (fz + fz2) / m[i];
}
)"
This results in 375 GFLOPS on two Quadro K420 devices, each with a single SM of compute capability (CC) 3.0.
When looking at Nsight output for memory bandwidth:
I see that L1 is not used. The documentation also says that some Kepler versions use L1 only for local-memory accesses (e.g. register spills), not for global loads.
Question: is the "L1/shared 150 GB/s" bandwidth here the total of a common unified cache, or are these values summed separately? Do I need to add local-memory reads on top of this kernel to raise the 150 GB/s up to the 200 GB/s level?
What happens when atomic operations exist? Do they have additional (but low) bandwidth of their own, or do they add to all the other memory requests that use the same cache hardware?