I expected the following simple test code to generate shared-memory bank conflicts on a V100, but Nsight Compute reports that there are none.
In the code, I allocate shared memory of size `sizeof(double) * 32 * 32` bytes for a matrix `L`, and its column values are copied from `d_L` in the `for` loop.
Since each column has 32 double values (256 bytes) and the V100 has 32 banks, each 4 bytes wide, each column's values will be stored across two rows of banks. If this is correct, I thought that `L[n*j + tx] = d_L[n*j + tx]` should generate a two-way bank conflict for each `j = 0, ..., n-1`, but Nsight Compute says there are no bank conflicts, as you can see in the captured image below.
Could anyone shed some light on this?
// Copies an n x n matrix of doubles from global memory (d_L) into dynamic
// shared memory, one column of n elements per loop iteration, then writes the
// first n shared elements back out to d_out.
//
// Parameters:
//   n     - matrix dimension; elements n*j + tx for j, tx in [0, n) are accessed.
//   d_L   - device pointer to at least n*n doubles (read only by this kernel).
//   d_out - device pointer to at least n doubles (element tx is written).
//
// Launch requirements:
//   - Only threadIdx.x is used for indexing; blockIdx is not consulted, so
//     every block performs the same work.
//   - Dynamic shared memory must be at least n*n*sizeof(double) bytes.
//   - Threads with threadIdx.x >= n do nothing, so blockDim.x may exceed n.
__global__
void bank_conflict(int n, double *d_L, double *d_out)
{
    extern __shared__ double s[];
    const int tx = threadIdx.x;

    // Guard: a thread with tx >= n would index past the n*n shared buffer
    // (and past d_L / d_out) on the last column, so it must not participate.
    if (tx >= n) {
        return;
    }

    double *L = s;
    for (int j = 0; j < n; j++) {
        // Consecutive threads touch consecutive 8-byte elements of column j.
        L[n*j + tx] = d_L[n*j + tx];
    }

    // Each thread reads back only L[tx], which it wrote itself at j == 0,
    // so no __syncthreads() is required before this load.
    d_out[tx] = L[tx];
}
// Runs `call`, stores its result in the local `err`, and on failure reports
// the error and jumps to the local `cleanup` label. Only meant for use inside
// main() below.
#define BANK_CONFLICT_CHECK(call)                                             \
    do {                                                                      \
        err = (call);                                                         \
        if (err != cudaSuccess) {                                             \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__,           \
                   cudaGetErrorString(err));                                  \
            goto cleanup;                                                     \
        }                                                                     \
    } while (0)

// Host driver: fills a 32 x 32 matrix with 5.0, copies it to the device,
// launches bank_conflict with one block of n threads and n*n doubles of
// dynamic shared memory, then prints the n values copied back.
//
// Returns 0 on success and 1 if any CUDA runtime call or the launch fails.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // All locals are declared before the first checked call so that the
    // `goto cleanup` in the macro never jumps over an initialization.
    const int n = 32;
    const size_t matrixBytes = sizeof(double) * n * n;
    const size_t vectorBytes = sizeof(double) * n;
    double *h_out = NULL, *h_L = NULL, *d_L = NULL, *d_out = NULL;
    cudaError_t err = cudaSuccess;
    int status = 1;

    BANK_CONFLICT_CHECK(cudaHostAlloc((void **)&h_L, matrixBytes, cudaHostAllocPortable));
    BANK_CONFLICT_CHECK(cudaHostAlloc((void **)&h_out, vectorBytes, cudaHostAllocPortable));
    BANK_CONFLICT_CHECK(cudaMalloc((void **)&d_L, matrixBytes));
    BANK_CONFLICT_CHECK(cudaMalloc((void **)&d_out, vectorBytes));

    for (int j = 0; j < n; j++) {
        for (int i = 0; i < n; i++) {
            h_L[n*j + i] = 5.0;
        }
    }

    BANK_CONFLICT_CHECK(cudaMemcpy(d_L, h_L, matrixBytes, cudaMemcpyHostToDevice));

    bank_conflict<<<1, n, matrixBytes>>>(n, d_L, d_out);
    // A kernel launch returns no status; launch-configuration errors are
    // only visible through cudaGetLastError().
    BANK_CONFLICT_CHECK(cudaGetLastError());

    // Blocking copy: also surfaces any error raised while the kernel ran.
    BANK_CONFLICT_CHECK(cudaMemcpy(h_out, d_out, vectorBytes, cudaMemcpyDeviceToHost));

    for (int j = 0; j < n; j++) {
        printf("h_out[%d] = %e\n", j, h_out[j]);
    }
    status = 0;

cleanup:
    // Release whatever was successfully allocated, on both success and
    // failure paths.
    if (h_L) {
        cudaFreeHost(h_L);
    }
    if (h_out) {
        cudaFreeHost(h_out);
    }
    if (d_L) {
        cudaFree(d_L);
    }
    if (d_out) {
        cudaFree(d_out);
    }
    return status;
}

#undef BANK_CONFLICT_CHECK