Cp.async introduces bank conflict than naive ldg and sts ptx?

I have an access pattern that load data from global to shared which is:

  #pragma unroll
  for (int ax0_ax1_ax2_ax3_0_fused_2 = 0; ax0_ax1_ax2_ax3_0_fused_2 < 2; ++ax0_ax1_ax2_ax3_0_fused_2) {
    *(uint4*)(((half*)buf_dyn_shmem) + ((((((int)threadIdx.y) * 1024) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_ax3_0_fused_2 * 256)) + (((int)threadIdx.x) * 8))) = *(uint4*)(A + ((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 524288)) + (((int)threadIdx.z) * 262144)) + (ax0_ax1_ax2_ax3_0_fused_2 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + ((((int)threadIdx.x) >> 4) * 8)));
  }
  #pragma unroll
  for (int ax0_ax1_ax2_ax3_0_fused_2_1 = 0; ax0_ax1_ax2_ax3_0_fused_2_1 < 4; ++ax0_ax1_ax2_ax3_0_fused_2_1) {
    *(uint4*)(((half*)buf_dyn_shmem) + (((((((int)threadIdx.y) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_ax3_0_fused_2_1 * 256)) + (((int)threadIdx.x) * 8)) + 4096)) = *(uint4*)(B + ((((((((((((int)threadIdx.y) * 256) + (((int)threadIdx.z) * 128)) + (ax0_ax1_ax2_ax3_0_fused_2_1 * 32)) + ((int)threadIdx.x)) >> 9) * 262144) + (((int)blockIdx.x) * 4096)) + (((((((int)threadIdx.y) * 8) + (((int)threadIdx.z) * 4)) + ax0_ax1_ax2_ax3_0_fused_2_1) & 15) * 256)) + ((((int)threadIdx.x) & 15) * 16)) + ((((int)threadIdx.x) >> 4) * 8)));
  }

and we can see that this pattern is conflict free because we access shared memory with consective threadIdx.x * 8 (4 banks aligned), the nsight compute profiler show similar results:

However, I wanna to change the copy pattern to cp.async in my gtx 3090 because I wanna save some registers and save bandwidth to achieve a bigger tile? but the performance is not good, and I observed some bank conflict happens in this part, the code is:

  extern __shared__ uchar buf_dyn_shmem[];
  
  for (int ax0_ax1_ax2_ax3_0_fused_2 = 0; ax0_ax1_ax2_ax3_0_fused_2 < 2; ++ax0_ax1_ax2_ax3_0_fused_2) {

  {
    unsigned int addr;
    __asm__ __volatile__(
      "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n"
      : "=r"(addr)
      : "l"((void *)(buf_dyn_shmem + (((((int)threadIdx.y) * 2048) + (ax0_ax1_ax2_ax3_0_fused_2 * 512)) + (((int)threadIdx.x) * 16))))
    );
    __asm__ __volatile__(
        "cp.async.ca.shared.global.L2::128B [%0], [%1], %2;"
        :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 524288)) + ((ax0_ax1_ax2_ax3_0_fused_2 >> 1) * 262144)) + ((ax0_ax1_ax2_ax3_0_fused_2 & 1) * 256)) + ((((int)threadIdx.x) & 15) * 16)) + ((((int)threadIdx.x) >> 4) * 8)))), "n"(16)
    );
  }
  }
  for (int ax0_ax1_ax2_ax3_0_fused_2_1 = 0; ax0_ax1_ax2_ax3_0_fused_2_1 < 8; ++ax0_ax1_ax2_ax3_0_fused_2_1) {

  {
    unsigned int addr;
    __asm__ __volatile__(
      "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n"
      : "=r"(addr)
      : "l"((void *)(buf_dyn_shmem + ((((((int)threadIdx.y) * 4096) + (ax0_ax1_ax2_ax3_0_fused_2_1 * 512)) + (((int)threadIdx.x) * 16)) + 16384)))
    );
    __asm__ __volatile__(
        "cp.async.ca.shared.global.L2::128B [%0], [%1], %2;"
        :: "r"(addr), "l"((void*)(B + (((((((((int)threadIdx.y) >> 1) * 262144) + (((int)blockIdx.x) * 4096)) + ((((int)threadIdx.y) & 1) * 2048)) + (ax0_ax1_ax2_ax3_0_fused_2_1 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + ((((int)threadIdx.x) >> 4) * 8)))), "n"(16)
    );
  }
  }

and we see bank conflicts happens:

however I can observe that sota library like cutlass which also leveraged cp.async ptx but it has no bank conflict, did it make sense?