I posted a similar question on Stack Overflow (http://stackoverflow.com/questions/28374796/bank-conflict-cuda-shared-memory), so far without useful replies.
I have a problem with shared memory usage on a Quadro K6000. I managed to reproduce the problem with the minimal example attached below, which is a simple copy from global → shared → global memory using a shared-memory array which might be padded on the right side (variable ng). The use of shared memory is not very useful here, but it illustrates the problem.
If I compile the code (-arch=sm_35, cuda 6.5.14) with ng=0 and study the shared memory access pattern with NVVP, it tells me that there are “no issues”. The same test with (for example) ng=2 returns “Shared Store Transactions/Access = 2, Ideal Transactions/Access = 1” at the lines marked with “NVVP warning”.
What exactly is NVVP indicating with “Shared Store Transactions/Access = 2, Ideal Transactions/Access = 1”? And why does it occur with ng>0, but not with ng=0?
// Copy `in` -> shared memory -> `out` for an ni x nj field handled by a single
// thread block (each thread moves exactly one element).
//
// Launch requirements:
//   - blockDim.x * blockDim.y threads cover the field; one element per thread.
//   - Dynamic shared memory: (blockDim.x + ng) * blockDim.y * sizeof(double).
// `ng` pads each shared-memory row on the right by `ng` doubles; the padded
// row stride (blockDim.x + ng) is what the NVVP experiment in the text probes.
__global__ void copy(double * const __restrict__ out, const double * const __restrict__ in, const int ni, const int nj, const int ng)
{
    extern __shared__ double as[];

    // Flat index into the global arrays.
    const int ij = threadIdx.x + threadIdx.y*blockDim.x;
    // Index into shared memory, with `ng` unused padding elements per row.
    const int ijs = threadIdx.x + threadIdx.y*(blockDim.x+ng);

    // Guard against block shapes with more threads than cells; the original
    // accepted ni/nj but never used them and assumed an exact fit.
    const bool valid = (ij < ni*nj);

    if (valid)
        as[ijs] = in[ij]; // NVVP warning
    // Barrier must be outside the divergent branch: all threads of the block
    // have to reach it.
    __syncthreads();
    if (valid)
        out[ij] = as[ijs]; // NVVP warning
}
// Host driver: fill a 16x16 field with pseudo-random values, round-trip it
// through the kernel's shared-memory staging, and copy the result back.
// Returns 0 on success, 1 on any CUDA error.
int main()
{
    const int itot = 16;
    const int jtot = 16;
    const int ng = 2;                 // right-side padding of each shared row
    const int ncells = itot * jtot;

    double *in  = new double[ncells];
    double *out = new double[ncells];
    double *tmp = new double[ncells];

    // Values in [-0.5, 0.499].
    for(int n=0; n<ncells; ++n)
        in[n] = 0.001 * (std::rand() % 1000) - 0.5;

    double *ind = 0, *outd = 0;
    // Check every CUDA call: a failed call otherwise poisons all later ones
    // silently (the original checked nothing).
    if (cudaMalloc((void **)&ind,  ncells*sizeof(double)) != cudaSuccess) return 1;
    if (cudaMalloc((void **)&outd, ncells*sizeof(double)) != cudaSuccess) return 1;

    if (cudaMemcpy(ind, in, ncells*sizeof(double), cudaMemcpyHostToDevice) != cudaSuccess)
        return 1;
    // NOTE: the original also uploaded the *uninitialized* `out` buffer into
    // `outd`. The kernel overwrites every element of `outd`, so that copy read
    // uninitialized host memory to no effect; it is dropped here.

    dim3 gridGPU (1, 1, 1);
    dim3 blockGPU(itot, jtot, 1);     // tied to itot/jtot (was hard-coded 16,16)

    // One padded row per threadIdx.y => (itot+ng)*jtot doubles of dynamic smem.
    copy<<<gridGPU, blockGPU, (itot+ng)*jtot*sizeof(double)>>>(outd, ind, itot, jtot, ng);
    // Kernel launches do not return errors directly; query the launch status.
    if (cudaGetLastError() != cudaSuccess) return 1;

    // This blocking copy synchronizes with the kernel and surfaces any
    // asynchronous execution error as its return code.
    if (cudaMemcpy(tmp, outd, ncells*sizeof(double), cudaMemcpyDeviceToHost) != cudaSuccess)
        return 1;

    // Release all resources (the original leaked all five allocations).
    cudaFree(ind);
    cudaFree(outd);
    delete[] in;
    delete[] out;
    delete[] tmp;
    return 0;
}