I can't understand the result that the Nsight Compute report gives.
This is my code:
#include <cstdio>
#include <cstdlib>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Shape of the shared-memory tile: BDIMY rows of BDIMX ints.
// The kernel indexes the tile with threadIdx, so it expects a block of
// (BDIMX, BDIMY) threads.
const int BDIMX = 32;
const int BDIMY = 16;

/*
 * Transposes one BDIMY x BDIMX tile of ints through shared memory.
 *
 *   m      - input, read as a row-major BDIMY x BDIMX matrix
 *   result - output, written as a row-major BDIMX x BDIMY matrix
 *
 * Only threadIdx is used for addressing (blockIdx is ignored), so the kernel
 * handles a single tile per launch. There is no bounds check: the block must
 * be (BDIMX, BDIMY) threads and both buffers must hold BDIMX * BDIMY ints.
 */
__global__ void matrixTranspose1(int *result, int *m) {
    __shared__ int smem[BDIMY][BDIMX];

    // Linear id of this thread inside the block (x fastest, then y).
    const size_t tid = threadIdx.y * blockDim.x + threadIdx.x;

    // Stage the input row-wise. With blockDim.x == 32 a warp holds one value
    // of threadIdx.y and 32 consecutive threadIdx.x, so this store touches 32
    // consecutive ints of one tile row.
    const int value = m[tid];
    smem[threadIdx.y][threadIdx.x] = value;

    // Every element must be staged before any thread reads another thread's slot.
    __syncthreads();

    // Re-interpret the linear id in the transposed (blockDim.y-wide) layout.
    const size_t outRow = tid / blockDim.y;
    const size_t outCol = tid % blockDim.y;

    // Column-wise read of the tile. For a (32, 16) block, the 32 threads of a
    // warp see only two values of outRow while outCol runs 0..15 twice, so the
    // warp reads two tile columns of 16 rows each. Rows are 32 ints apart, so
    // (assuming 32 four-byte banks) each column maps to a single bank: a
    // 16-way bank conflict, i.e. 16 passes per warp instead of 1.
    // NOTE(review): 16 warps * 15 extra passes = 240, which matches a profiler
    // that reports only the additional passes rather than the total of 256 -
    // confirm against the Nsight Compute metric definition.
    result[tid] = smem[outCol][outRow];
}
// Prints the CUDA error string for a failed runtime call and terminates.
// `what` names the call that produced `status`, for the diagnostic message.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

/*
 * Runs matrixTranspose1 on a single BDIMY x BDIMX tile and checks the output
 * against a host-side transpose.
 *
 * Returns 0 when every element matches, 1 otherwise. Any failing CUDA runtime
 * call terminates the program with EXIT_FAILURE.
 */
int main() {
    const int count = BDIMX * BDIMY;
    const size_t mat_byte = count * sizeof(int);

    int *mat = new int[count];
    int *h_result = new int[count];

    // Give the input defined contents; the buffer was previously copied to the
    // device uninitialised. Element i holds its own linear index, which makes
    // the transposed output easy to verify.
    for (int i = 0; i < count; ++i) {
        mat[i] = i;
    }

    int *d_mat = NULL;
    int *d_result = NULL;
    checkCuda(cudaMalloc(&d_mat, mat_byte), "cudaMalloc(d_mat)");
    checkCuda(cudaMalloc(&d_result, mat_byte), "cudaMalloc(d_result)");
    checkCuda(cudaMemcpy(d_mat, mat, mat_byte, cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    // One block whose shape matches the kernel's shared-memory tile.
    dim3 block(BDIMX, BDIMY);
    matrixTranspose1<<<1, block>>>(d_result, d_mat);
    // A kernel launch returns no status; launch failures are picked up here.
    checkCuda(cudaGetLastError(), "matrixTranspose1 launch");

    // The blocking copy also reports errors raised while the kernel executed.
    checkCuda(cudaMemcpy(h_result, d_result, mat_byte, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");

    checkCuda(cudaFree(d_mat), "cudaFree(d_mat)");
    checkCuda(cudaFree(d_result), "cudaFree(d_result)");

    // Output is BDIMX rows by BDIMY columns; element (r, c) must equal input (c, r).
    int mismatches = 0;
    for (int r = 0; r < BDIMX; ++r) {
        for (int c = 0; c < BDIMY; ++c) {
            if (h_result[r * BDIMY + c] != mat[c * BDIMX + r]) {
                ++mismatches;
            }
        }
    }
    if (mismatches != 0) {
        std::fprintf(stderr, "transpose check failed: %d mismatching elements\n", mismatches);
    }

    delete[] mat;
    delete[] h_result;
    return mismatches == 0 ? 0 : 1;
}
In my code, I expected 256 bank conflicts, but the Nsight Compute report shows 240 bank conflicts for the shared memory load.
Please help me understand why the Nsight Compute report shows 240 conflicts.
https://github.com/ys9617/bin/blob/master/shared%20memory%20bank%20conflict.PNG