Cp.async introduces more bank conflicts than naive ldg and sts ptx?

wanglei21c · March 6, 2023, 2:02pm

I have an access pattern that loads data from global to shared memory, which is:

  // Baseline global -> shared staging copy using plain vector loads/stores.
  // Each assignment moves one uint4 (16 bytes) per thread per iteration.
  // NOTE(review): A, B and buf_dyn_shmem are declared outside this excerpt;
  // the offsets below are presumably in half elements for A and B as well
  // as for the shared buffer (which is explicitly cast to half*) -- confirm.

  // --- A tile: 2 iterations per thread ---
  #pragma unroll
  for (int ax0_ax1_ax2_ax3_0_fused_2 = 0; ax0_ax1_ax2_ax3_0_fused_2 < 2; ++ax0_ax1_ax2_ax3_0_fused_2) {
    // Destination (half index): y*1024 + z*512 + iter*256 + x*8, i.e.
    // consecutive threadIdx.x values write adjacent 8-half (16-byte) slots.
    // Source: threadIdx.x is split into (x & 15) * 16 + (x >> 4) * 8, so the
    // order in which threads read A differs from the order in which they
    // write shared memory.
    *(uint4*)(((half*)buf_dyn_shmem) + ((((((int)threadIdx.y) * 1024) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_ax3_0_fused_2 * 256)) + (((int)threadIdx.x) * 8))) = *(uint4*)(A + ((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 524288)) + (((int)threadIdx.z) * 262144)) + (ax0_ax1_ax2_ax3_0_fused_2 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + ((((int)threadIdx.x) >> 4) * 8)));
  }
  // --- B tile: 4 iterations per thread ---
  #pragma unroll
  for (int ax0_ax1_ax2_ax3_0_fused_2_1 = 0; ax0_ax1_ax2_ax3_0_fused_2_1 < 4; ++ax0_ax1_ax2_ax3_0_fused_2_1) {
    // Destination (half index): y*2048 + z*1024 + iter*256 + x*8 + 4096; the
    // constant 4096 places this tile after the region written by the A loop.
    // Source: the fused index (y*256 + z*128 + iter*32 + x) >> 9 selects a
    // 262144-element stride, (y*8 + z*4 + iter) & 15 selects a 256-element
    // row, and threadIdx.x is split the same way as for A.
    *(uint4*)(((half*)buf_dyn_shmem) + (((((((int)threadIdx.y) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_ax3_0_fused_2_1 * 256)) + (((int)threadIdx.x) * 8)) + 4096)) = *(uint4*)(B + ((((((((((((int)threadIdx.y) * 256) + (((int)threadIdx.z) * 128)) + (ax0_ax1_ax2_ax3_0_fused_2_1 * 32)) + ((int)threadIdx.x)) >> 9) * 262144) + (((int)blockIdx.x) * 4096)) + (((((((int)threadIdx.y) * 8) + (((int)threadIdx.z) * 4)) + ax0_ax1_ax2_ax3_0_fused_2_1) & 15) * 256)) + ((((int)threadIdx.x) & 15) * 16)) + ((((int)threadIdx.x) >> 4) * 8)));
  }

and we can see that this pattern is conflict-free because we access shared memory with consecutive threadIdx.x * 8 (4 banks aligned); the Nsight Compute profiler shows similar results:

However, I want to change the copy pattern to cp.async on my RTX 3090, because I want to save some registers and bandwidth in order to use a bigger tile. But the performance is not good, and I observed that some bank conflicts happen in this part. The code is:

  // Same global -> shared staging copy, rewritten with the PTX cp.async
  // instruction. The shared buffer is indexed in bytes here (it is declared
  // as a uchar array and used without a cast), so threadIdx.x * 16 addresses
  // consecutive 16-byte slots.
  // NOTE(review): threadIdx.z does not appear in this version, so it
  // presumably runs with a different block shape than the plain-copy
  // version -- confirm against the launch configuration.
  // NOTE(review): no cp.async.commit_group / cp.async.wait_group (or other
  // completion wait) is visible in this excerpt; one is needed before the
  // copied data is read from shared memory.
  extern __shared__ uchar buf_dyn_shmem[];
  
  // --- A tile: 2 async 16-byte copies per thread ---
  for (int ax0_ax1_ax2_ax3_0_fused_2 = 0; ax0_ax1_ax2_ax3_0_fused_2 < 2; ++ax0_ax1_ax2_ax3_0_fused_2) {

  {
    // 32-bit shared-state-space address of the destination slot.
    unsigned int addr;
    // cvta.to.shared converts the generic pointer into a shared-window
    // address; cvt.u32.u64 then narrows it to the 32-bit form that is passed
    // to cp.async below.
    // Destination byte offset: y*2048 + iter*512 + x*16.
    __asm__ __volatile__(
      "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n"
      : "=r"(addr)
      : "l"((void *)(buf_dyn_shmem + (((((int)threadIdx.y) * 2048) + (ax0_ax1_ax2_ax3_0_fused_2 * 512)) + (((int)threadIdx.x) * 16))))
    );
    // Asynchronous 16-byte copy global -> shared (.ca variant, with a 128-byte
    // L2 prefetch hint). With the loop bound of 2, (iter >> 1) is always 0,
    // so the 262144 term never contributes and (iter & 1) * 256 equals
    // iter * 256. threadIdx.x is split into (x & 15) * 16 + (x >> 4) * 8 on
    // the source side, as in the plain-copy version.
    __asm__ __volatile__(
        "cp.async.ca.shared.global.L2::128B [%0], [%1], %2;"
        :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 524288)) + ((ax0_ax1_ax2_ax3_0_fused_2 >> 1) * 262144)) + ((ax0_ax1_ax2_ax3_0_fused_2 & 1) * 256)) + ((((int)threadIdx.x) & 15) * 16)) + ((((int)threadIdx.x) >> 4) * 8)))), "n"(16)
    );
  }
  }
  // --- B tile: 8 async 16-byte copies per thread ---
  for (int ax0_ax1_ax2_ax3_0_fused_2_1 = 0; ax0_ax1_ax2_ax3_0_fused_2_1 < 8; ++ax0_ax1_ax2_ax3_0_fused_2_1) {

  {
    // 32-bit shared-state-space address of the destination slot.
    unsigned int addr;
    // Destination byte offset: y*4096 + iter*512 + x*16 + 16384; the constant
    // 16384 places this tile after the region written by the A loop.
    __asm__ __volatile__(
      "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n"
      : "=r"(addr)
      : "l"((void *)(buf_dyn_shmem + ((((((int)threadIdx.y) * 4096) + (ax0_ax1_ax2_ax3_0_fused_2_1 * 512)) + (((int)threadIdx.x) * 16)) + 16384)))
    );
    // Source: (y >> 1) selects a 262144-element stride, (y & 1) * 2048 +
    // iter * 256 selects the row, and threadIdx.x is split the same way as
    // for A.
    __asm__ __volatile__(
        "cp.async.ca.shared.global.L2::128B [%0], [%1], %2;"
        :: "r"(addr), "l"((void*)(B + (((((((((int)threadIdx.y) >> 1) * 262144) + (((int)blockIdx.x) * 4096)) + ((((int)threadIdx.y) & 1) * 2048)) + (ax0_ax1_ax2_ax3_0_fused_2_1 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + ((((int)threadIdx.x) >> 4) * 8)))), "n"(16)
    );
  }
  }

and we see that bank conflicts happen:

However, I observe that state-of-the-art libraries like CUTLASS also use cp.async PTX but have no bank conflicts. Does this make sense?

Topic		Replies	Views
Bank Conflicts CUDA Programming and Performance	2	1956	December 6, 2009
Shared memory bank conflict CUDA Programming and Performance	1	249	May 19, 2024
Very strange share memory bank conflicts CUDA Programming and Performance cuda	4	489	November 2, 2021
Very strange share memory bank conflicts CUDA-MEMCHECK cuda	1	923	October 15, 2021
Resolve 1D shared memory bank conflict with paddling CUDA Programming and Performance cuda , kernel	9	51	September 1, 2024
unexpected unroll issue "unroll" changes output for the better CUDA Programming and Performance	3	2973	May 9, 2008
Shared memory access patterns CUDA Programming and Performance	2	1094	March 4, 2010
float4 Shared memory doesn't yield bank conflict according to nvprof when it should CUDA Programming and Performance	4	1893	January 13, 2024
is there any way to make this code run faster? 1D qpsk lookup CUDA Programming and Performance	3	3504	July 27, 2009
Bank conflicts with 2D shared mem array Resolving bank conflicts CUDA Programming and Performance	1	2008	July 18, 2008

Cp.async introduces more bank conflicts than naive ldg and sts ptx?

Related topics