Unexpected bank conflict count in a shared-memory matrix transpose

I can't understand the result that the Nsight Compute report shows.

This is my code:

#include <cstdio>
#include <cstdlib>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Thread-block extent along x; also the inner (column) dimension of the shared tile.
constexpr int BDIMX = 32;
// Thread-block extent along y; also the outer (row) dimension of the shared tile.
constexpr int BDIMY = 16;

// Transposes one BDIMY x BDIMX tile of ints by staging it in shared memory.
//
// result: output buffer, written at the thread's linear index.
// m:      input buffer, read at the thread's linear index.
//
// The linear index uses only threadIdx/blockDim (no blockIdx term) and the
// tile is sized [BDIMY][BDIMX], so the kernel addresses a single block whose
// dimensions must not exceed (BDIMX, BDIMY).
__global__ void matrixTranspose1(int *result, int *m) {
	// Staging tile: row selected by threadIdx.y, column by threadIdx.x.
	__shared__ int smem[BDIMY][BDIMX];

	// Row-major linear id of this thread within the block.
	const size_t tid = threadIdx.y * blockDim.x + threadIdx.x;

	// Store phase: each thread places its input element at [y][x].
	smem[threadIdx.y][threadIdx.x] = m[tid];

	// All stores must be visible before any thread reads another thread's element.
	__syncthreads();

	// Re-split the same linear id using blockDim.y as the row width; these
	// are the coordinates of this thread's element in the transposed layout.
	const size_t outRow = tid / blockDim.y;
	const size_t outCol = tid % blockDim.y;

	// Load phase: swapped indices read the tile column-wise, so consecutive
	// outCol values are BDIMX ints apart in shared memory. This strided read
	// is the access that produces shared-memory bank conflicts on load.
	result[tid] = smem[outCol][outRow];
}

// Prints a diagnostic and terminates the process if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what) {
	if (status != cudaSuccess) {
		std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
		std::exit(EXIT_FAILURE);
	}
}

// Host driver: fills a BDIMY x BDIMX matrix, transposes it on the device with
// a single (BDIMX, BDIMY) block, and checks the result on the host.
// Returns 0 when the transposed data matches, 1 otherwise.
int main() {
	const int count = BDIMX * BDIMY;
	const size_t mat_byte = count * sizeof(int);

	int *mat = new int[count];
	int *h_result = new int[count];

	// Give every element a distinct, known value; the original copied
	// uninitialised host memory to the device.
	for (int i = 0; i < count; ++i) {
		mat[i] = i;
	}

	int *d_mat = NULL;
	int *d_result = NULL;

	checkCuda(cudaMalloc(&d_mat, mat_byte), "cudaMalloc(d_mat)");
	checkCuda(cudaMalloc(&d_result, mat_byte), "cudaMalloc(d_result)");

	checkCuda(cudaMemcpy(d_mat, mat, mat_byte, cudaMemcpyHostToDevice),
	          "cudaMemcpy(host to device)");

	dim3 block(BDIMX, BDIMY);

	matrixTranspose1<<<1, block>>>(d_result, d_mat);
	// A kernel launch returns nothing; launch errors are reported here.
	checkCuda(cudaGetLastError(), "matrixTranspose1 launch");

	// Blocking copy: waits for the kernel and reports any execution error.
	checkCuda(cudaMemcpy(h_result, d_result, mat_byte, cudaMemcpyDeviceToHost),
	          "cudaMemcpy(device to host)");

	checkCuda(cudaFree(d_mat), "cudaFree(d_mat)");
	checkCuda(cudaFree(d_result), "cudaFree(d_result)");

	// Input is BDIMY rows x BDIMX columns; output is BDIMX rows x BDIMY columns.
	int mismatches = 0;
	for (int row = 0; row < BDIMX; ++row) {
		for (int col = 0; col < BDIMY; ++col) {
			if (h_result[row * BDIMY + col] != mat[col * BDIMX + row]) {
				++mismatches;
			}
		}
	}
	if (mismatches != 0) {
		std::fprintf(stderr, "transpose mismatch in %d element(s)\n", mismatches);
	}

	delete[] mat;
	delete[] h_result;

	return mismatches == 0 ? 0 : 1;
}

In my code I expected 256 bank conflicts, but the Nsight Compute report shows 240 bank conflicts for the shared-memory load.

Please help me understand why the Nsight Compute report shows 240 conflicts.

https://github.com/ys9617/bin/blob/master/shared%20memory%20bank%20conflict.PNG

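Each instruction has 16 requests. The first 15 requests have a bank conflict. The last request does not have any more conflicts. The block has 16 warps (32 × 16 = 512 threads) and each warp issues the shared-memory load once, so the report counts 16 × 15 = 240 conflicts rather than 16 × 16 = 256.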