Problem about PTX instruction cp.async.ca.shared.global

Here’s my kernel:

global void test_kernel(int *a) {
shared int smem[32];
asm volatile(“cp.async.ca.shared.global [%0], [%1], 4;\n” :: “l”(smem + threadIdx.x), “l”(a + threadIdx.x));
asm volatile(“cp.async.wait_all;\n” ::);
// …
}

I try to use cp.async.ca.shared.global to load 32 int from global memory to shared memory, but it doesn’t work.

When I use COMPUTE-SANITIZER to check the kernel, it tells me the address pointed by the shared memory pointer is out of bounds, like this:

========= COMPUTE-SANITIZER
========= Invalid shared write of size 4 bytes
========= at 0x3c0 in test_kernel(int*,int*)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x92000000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x7f0e9478050a]
========= in /usr/lib/x86_64-linux-gnu/libcuda.so.1
========= Host Frame: [0x55d09007aecb]
========= in /root/sm86/memcpy-async-test/gemm
========= Host Frame: [0x55d0900c7980]
========= in /root/sm86/memcpy-async-test/gemm
========= Host Frame: [0x55d0900769bf]
========= in /root/sm86/memcpy-async-test/gemm
========= Host Frame: [0x55d09007684f]
========= in /root/sm86/memcpy-async-test/gemm
========= Host Frame: [0x55d090076896]
========= in /root/sm86/memcpy-async-test/gemm
========= Host Frame: [0x55d090076633]
========= in /root/sm86/memcpy-async-test/gemm
========= Host Frame:__libc_start_main [0x7f0e9645cbf7]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame: [0x55d0900763da]
========= in /root/sm86/memcpy-async-test/gemm

Meanwhile, the value printed by “printf(”%lx", smem);" is “7f0e92000000”, which matches the COMPUTE-SANITIZER error messages.

Someone has met this problem:

But noone answered. Who can help me.

1 Like

The problem revolves around the address of shared memory you are using.

If you study the PTX code generated from the example given in the programming guide you will see that the calculation of the shared address uses a single 32-bit register, not a 64-bit calculation:

# cuobjdump -ptx t2

Fatbin elf code:
================
arch = sm_80
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

Fatbin elf code:
================
arch = sm_80
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

Fatbin ptx code:
================
arch = sm_80
code version = [7,6]
producer = <unknown>
host = linux
compile_size = 64bit
compressed

.version 7.6
.target sm_80
.address_size 64
...
add.s32 %r26, %r23, %r28;  // 32-bit calculation of address/offset

        cp.async.ca.shared.global [%r26], [%rd27], 4, 4;
                                   ^          ^
                                   |         64-bit PTX register
                                   32-bit PTX register

Therefore we can conclude that the address expected for the shared argument to this PTX instruction is the address relative to the shared state-space address.

The following demonstrates this:

# cat t1.cu
#include <iostream>
__global__ void test_kernel(int *a, int *b) {
  __shared__ int smem[32];
  unsigned long long as = (unsigned long long)(smem+threadIdx.x);
  as &= 0x0FFFFFFULL;
  asm volatile("cp.async.ca.shared.global [%0], [%1], 4;\n" :: "l"(as), "l"(a+threadIdx.x));
  asm volatile("cp.async.wait_all;\n" ::);
  b[threadIdx.x] = smem[threadIdx.x];
}

int main(){

  int *a, *b;
  cudaMallocManaged(&a, 32*sizeof(int));
  cudaMallocManaged(&b, 32*sizeof(int));
  for (int i = 0; i < 32; i++) {a[i] = i; b[i] = 0;}
  test_kernel<<<1,32>>>(a, b);
  cudaDeviceSynchronize();
  for (int i = 0; i < 32; i++) std::cout << b[i] << ",";
  std::cout << std::endl;
}

# nvcc -o t1 t1.cu -arch=sm_80 -lineinfo
# cuda-memcheck ./t1
========= CUDA-MEMCHECK
========= This tool is deprecated and will be removed in a future release of the CUDA toolkit
========= Please use the compute-sanitizer tool as a drop-in replacement
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
========= ERROR SUMMARY: 0 errors
#

The PTX documentation makes reference to the necessary operand formats for this instruction:

Operand src specifies a location in the global state space and dst specifies a location in the shared state space.

and

Supported alignment requirements and addressing modes for operand src and dst are described in Addresses as Operands.

Additionally, the PTX interop documentation states:

However, addresses of the local and shared memory spaces are always 32 bits in size.

Note that using the above code, disassembling to SASS, the relevant instruction for the copy is:

    /*0080*/                   LDGSTS.E [R9], [R2.64] ;                           /* 0x0000000002097fae */

we see that even though the PTX instruction specifies a 64-bit shared address, by the time we get to SASS a 32-bit register is being used for the shared address.

Note that this variant works similarly:

  as &= 0x0FFFFFFULL;
  unsigned asl = as;
  asm volatile("cp.async.ca.shared.global [%0], [%1], 4;\n" :: "r"(asl), "l"(a+threadIdx.x));

Later:
This is a better approach:

__global__ void test_kernel(int *a, int *b) {
  __shared__ int smem[32];
  size_t asl = __cvta_generic_to_shared(smem+threadIdx.x);
  asm volatile("cp.async.ca.shared.global [%0], [%1], 4;\n" :: "l"(asl), "l"(a+threadIdx.x));
  asm volatile("cp.async.wait_all;\n" ::);
  b[threadIdx.x] = smem[threadIdx.x];
}

or using inline PTX for the conversion:

__global__ void test_kernel(int *a, int *b) {
  __shared__ int smem[32];
  asm volatile(".reg .u64 smem_ptr64; cvta.to.shared.u64 smem_ptr64, %0;\n" :: "l"(smem+threadIdx.x));
  asm volatile("cp.async.ca.shared.global [smem_ptr64], [%0], 4;\n" :: "l"(a+threadIdx.x));
  asm volatile("cp.async.wait_all;\n" ::);
  b[threadIdx.x] = smem[threadIdx.x];
}

or:

__global__ void test_kernel(int *a, int *b) {
  __shared__ int smem[32];
  asm volatile(".reg .u64 smem_ptr64; cvta.to.shared.u64 smem_ptr64, %0;\n" :: "l"(smem+threadIdx.x));
  asm volatile(".reg .u32 smem_ptr32; cvt.u32.u64 smem_ptr32, smem_ptr64;\n" ::);
  asm volatile("cp.async.ca.shared.global [smem_ptr32], [%0], 4;\n" :: "l"(a+threadIdx.x));
  asm volatile("cp.async.wait_all;\n" ::);
  b[threadIdx.x] = smem[threadIdx.x];
}
2 Likes

It works! Thanks A LOT!

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.