About async copy

In my code below, I use the async copy instruction and find bank conflicts in this snippet.

// assume only one warp per threadblock
__global__ void f(int* src, ...) {
  int lane_id = threadIdx.x;
  __shared__ int dst[512];
  for (int offset = 0; offset < 512; offset += 128) {
    void *ptr = (void *)(dst + offset + lane_id * 4);
    uint32_t smem_ptr;
    asm(
      "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
      : "=r"(smem_ptr)
      : "l"(ptr)); 
    asm volatile("cp.async.cg.shared.global [%0], [%1], %2;\n" ::"r"(smem_ptr),
    "l"(&src[offset + lane_id * 4]),
    "n"(16));
  }
  asm volatile("cp.async.commit_group;\n" ::);
  ...
}

I don’t understand why there are bank conflicts.

Can you try copying 4 bytes instead of 16 bytes, just for testing?

I searched some websites and found that for async copy there is no hardware combining mechanism like there is for loads through registers. So does that mean that, to avoid bank conflicts and get the best efficiency, we need to load 4 bytes per thread per copy?

Does that mean 4 copies of 4 bytes will be faster than 1 copy of 16 bytes?

According to the documentation (IIRC), async copy can do 4, 8 and 16 bytes. But perhaps you can try 4 bytes first, and then we see whether the copy size is related to the bank conflicts.
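An untested sketch of what the 4-byte version of your copy loop could look like (dst, src and lane_id are the names from your kernel; if I remember the PTX spec correctly, the .cg variant only supports 16-byte copies, so the 4-byte test would use .ca):

  // 4-byte-per-thread variant of the copy loop (sketch, not profiled)
  for (int offset = 0; offset < 512; offset += 32) {   // 32 ints per warp per iteration
    void *ptr = (void *)(dst + offset + lane_id);      // one int (4 bytes) per lane
    uint32_t smem_ptr;
    asm(
      "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
      : "=r"(smem_ptr)
      : "l"(ptr));
    asm volatile("cp.async.ca.shared.global [%0], [%1], %2;\n" ::"r"(smem_ptr),
      "l"(&src[offset + lane_id]),
      "n"(4));                                         // copy 4 bytes per thread
  }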

4 bytes have no conflict. So if 16 bytes will definitely cause bank conflicts, why does NVIDIA offer this choice?

Will loading 4 times * 4 bytes show a noticeable difference compared with 1 time * 16 bytes?

Make sure src and dst are aligned to 16 bytes.

E.g. you have defined

__shared__ int dst[512];

Insert __align__(16) or use int4 as the data type.
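For example (just a sketch of the two options, using the array from your kernel):

  __shared__ __align__(16) int dst[512];   // option 1: force 16-byte alignment of the int array

  __shared__ int4 dst4[128];               // option 2: int4 is 16-byte aligned by definition (same 2 KiB)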

When I profile your code on A100, I don’t see bank conflicts.

#include <thrust/device_vector.h>

__global__ 
void f(int* src) {
  int lane_id = threadIdx.x;
  __shared__ int dst[512];
  for (int offset = 0; offset < 512; offset += 128) {
    void *ptr = (void *)(dst + offset + lane_id * 4);
    uint32_t smem_ptr;
    asm(
      "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
      : "=r"(smem_ptr)
      : "l"(ptr)); 
    asm volatile("cp.async.cg.shared.global [%0], [%1], %2;\n" ::"r"(smem_ptr),
    "l"(&src[offset + lane_id * 4]),
    "n"(16));
  }
  asm volatile("cp.async.commit_group;\n" ::);

}


int main(){
	thrust::device_vector<int> input(512 * 1024 * 128, 1);

	f<<<1024*128, 32>>>(input.data().get());
	cudaDeviceSynchronize();
}
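For reference, the metrics can be collected with Nsight Compute roughly like this (invocation from memory, binary name is a placeholder):

ncu --metrics smsp__inst_executed_op_ldgsts.sum,smsp__sass_l1tex_data_pipe_lsu_wavefronts_mem_shared_op_ldgsts.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts.sum ./a.out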
smsp__inst_executed_op_ldgsts.sum [inst]                               524,288
smsp__sass_l1tex_data_pipe_lsu_wavefronts_mem_shared_op_ldgsts.sum   2,097,152
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts.sum                 0