Unexpected bank conflict count when transposing a matrix in shared memory

I can't understand the result that the Nsight Compute report shows.

This is my code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Thread-block width (x dimension of the launch); also the inner extent of the
// kernel's shared-memory tile.
const int BDIMX = 32;
// Thread-block height (y dimension of the launch); also the outer extent of the
// kernel's shared-memory tile.
const int BDIMY = 16;

// Transposes one BDIMY x BDIMX tile of ints through shared memory.
//
// Launch layout: a single block of dim3(BDIMX, BDIMY) threads; the indexing
// uses only threadIdx/blockDim, so additional blocks would repeat the same work.
// Shared memory: BDIMY * BDIMX ints, statically allocated.
//
//   m      - input, read as BDIMY rows of BDIMX ints (row-major).
//   result - output, written as BDIMX rows of BDIMY ints (row-major), so that
//            result[r * BDIMY + c] == m[c * BDIMX + r].
__global__ void matrixTranspose1(int *result, int *m) {
	__shared__ int smem[BDIMY][BDIMX];

	// Flat thread index inside the block; consecutive threads of a warp get
	// consecutive values, so the global load and store below are contiguous.
	size_t gidx = threadIdx.y * blockDim.x + threadIdx.x;

	// Coordinates of this thread's element in the transposed
	// (blockDim.x rows by blockDim.y columns) output.
	size_t irow = gidx / blockDim.y;
	size_t icol = gidx % blockDim.y;

	// Row-wise store into the tile: threads of a warp write adjacent ints.
	smem[threadIdx.y][threadIdx.x] = m[gidx];

	// Every thread reads an element that a different thread stored, so the
	// whole block must finish writing the tile before anyone reads it.
	__syncthreads();

	// Column-wise load from the tile: icol changes fastest across consecutive
	// threads and selects the tile row, so neighbouring threads are BDIMX ints
	// apart. The tile is intentionally left unpadded; this strided load is the
	// access whose bank conflicts are being measured.
	result[gidx] = smem[icol][irow];
}

// Host driver: uploads a BDIMY x BDIMX int matrix, runs matrixTranspose1 on a
// single BDIMX x BDIMY thread block, and copies the result back.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
int main() {
	const int count = BDIMX * BDIMY;
	const size_t mat_byte = count * sizeof(int);

	int *mat = new int[count];
	int *h_result = new int[count];

	// Give the input defined contents; the buffer would otherwise be copied
	// to the device uninitialised.
	for (int i = 0; i < count; ++i) {
		mat[i] = i;
	}

	int *d_mat = nullptr;
	int *d_result = nullptr;

	// Each step runs only if everything before it succeeded, so the first
	// failure is the one reported and cleanup below always executes.
	cudaError_t status = cudaMalloc(&d_mat, mat_byte);
	if (status == cudaSuccess) {
		status = cudaMalloc(&d_result, mat_byte);
	}
	if (status == cudaSuccess) {
		status = cudaMemcpy(d_mat, mat, mat_byte, cudaMemcpyHostToDevice);
	}
	if (status == cudaSuccess) {
		dim3 block(BDIMX, BDIMY);

		matrixTranspose1<<<1, block>>>(d_result, d_mat);

		// A kernel launch returns nothing; configuration errors are only
		// visible through cudaGetLastError().
		status = cudaGetLastError();
	}
	if (status == cudaSuccess) {
		// Blocking copy: also waits for the kernel and reports any error it
		// raised while executing.
		status = cudaMemcpy(h_result, d_result, mat_byte, cudaMemcpyDeviceToHost);
	}

	// cudaFree accepts a null pointer, so this is safe on every path.
	cudaFree(d_mat);
	cudaFree(d_result);

	delete[] mat;
	delete[] h_result;

	return status == cudaSuccess ? 0 : 1;
}

In my code, I expected 256 bank conflicts, but the Nsight Compute report shows 240 bank conflicts for the shared memory load.

Please help me understand why the Nsight Compute report shows 240 conflicts.

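Each shared-memory load instruction is split into 16 requests, because the 32 threads of a warp read 16 different addresses in each of the two banks they touch (a 16-way conflict). The first 15 requests have a bank conflict. The last request does not have any more conflicts, so only 15 conflicts are counted per instruction. The block has 32 × 16 = 512 threads, i.e. 16 warps, and each warp executes the load once: 16 × 15 = 240, not 16 × 16 = 256.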