Sure, it might be a bit long. It is basically an FP16 matrix multiplication where each block computes a 128*128 tile of C using half2 arithmetic.
The sts128 function is used in the global-to-shared-memory load stage (the LDG2S macro); the listing below has the two macros, the PTX load/store helpers, the kernel, and the host wrapper.
#include "matrix_mul.cuh"
#include "cuda_utils.h"
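// LDG2S: one K=16 load step. Each of the 256 threads pulls 4 half2 from A and 4 half2 from B
// out of global memory with __ldg (4 consecutive rows at one half2 column), writes each group
// to shared memory as a single 128-bit sts128, and then advances from_a/from_b by 8 half2
// (= 16 halfs) to the next K step.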
#define LDG2S(a_share, b_share) \
{ \
__half2 tmp_a[4] = { \
__ldg(from_a), \
__ldg(from_a + 1*K/2), \
__ldg(from_a + 2*K/2), \
__ldg(from_a + 3*K/2) \
}; \
__half2 tmp_b[4] = { \
__ldg(from_b), \
__ldg(from_b + 1*K/2), \
__ldg(from_b + 2*K/2), \
__ldg(from_b + 3*K/2), \
}; \
sts128(a_share+to_As, tmp_a[0], tmp_a[1], tmp_a[2], tmp_a[3]); \
sts128(b_share+to_Bs, tmp_b[0], tmp_b[1], tmp_b[2], tmp_b[3]); \
from_a += 8; from_b += 8; \
}
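// MATMUL_THREAD: per-thread 8x8 register-tile accumulation (stored as acc[8][4] half2) for one
// K=16 step. Each of the 8 inner iterations consumes one half2 K-slice: it loads 8 half2 of A
// (8 rows) and 8 half2 of B (8 columns) from shared memory, and the .x/.y swizzles pair the two
// K values so every acc[r][c] half2 (two adjacent C columns) is updated with two __hfma2 calls.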
#define MATMUL_THREAD(a_share, b_share) \
{ \
unsigned int from_As = warp_row*32 + thread_row*8; \
unsigned int from_Bs = warp_col*64 + thread_col*8; \
_Pragma("unroll") \
for (int i_inner_step=0; i_inner_step<8; ++i_inner_step) { \
__half2 pA[8], pB[8]; \
_Pragma("unroll") \
for (int i=0; i<8; ++i){ \
pA[i] = (a_share+from_As+i)[0]; \
pB[i] = (b_share+from_Bs+i)[0]; \
} \
_Pragma("unroll") \
for (int i=0; i<4; ++i) { \
_Pragma("unroll") \
for (int j=0; j<4; ++j) { \
__half2 tmp[4] = {__half2{pB[2*j].x, pB[2*j+1].y}, \
__half2{pA[2*i].y, pA[2*i].x}, \
__half2{pB[2*j].y, pB[2*j+1].x}, \
__half2{pA[2*i+1].y, pA[2*i+1].x}}; \
acc[2*i][j] = __hfma2(tmp[1], tmp[2], acc[2*i][j]); \
acc[2*i][j] = __hfma2(pA[2*i], tmp[0], acc[2*i][j]); \
acc[2*i+1][j] = __hfma2(tmp[3], tmp[2], acc[2*i+1][j]); \
acc[2*i+1][j] = __hfma2(pA[2*i+1], tmp[0], acc[2*i+1][j]); \
} \
} \
from_As += (128+LD_buffer); \
from_Bs += (128+LD_buffer); \
} \
}
// #define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var))) from cuda_fp16.hpp
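// ldg128: 128-bit vectorized read-only global load (ld.global.nc.v4.b32) into four half2 registers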
__device__ __forceinline__ void ldg128(const __half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){
unsigned int reg0_ui, reg1_ui, reg2_ui, reg3_ui;
asm volatile(
"ld.global.nc.v4.b32 {%0, %1, %2, %3}, [%4];\n"
: "=r"(reg0_ui),
"=r"(reg1_ui),
"=r"(reg2_ui),
"=r"(reg3_ui)
: "l"(addr)
);
reg0 = *(reinterpret_cast<__half2 *>(&reg0_ui));
reg1 = *(reinterpret_cast<__half2 *>(&reg1_ui));
reg2 = *(reinterpret_cast<__half2 *>(&reg2_ui));
reg3 = *(reinterpret_cast<__half2 *>(&reg3_ui));
}
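// stg128: 128-bit vectorized global store (st.global.v4.b32) of four half2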
__device__ __forceinline__ void stg128(__half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3) {
asm volatile(
"st.global.v4.b32 [%0], {%1, %2, %3, %4};\n"
:
: "l"(addr),
"r"(*(reinterpret_cast<unsigned int *>(®0))),
"r"(*(reinterpret_cast<unsigned int *>(®1))),
"r"(*(reinterpret_cast<unsigned int *>(®2))),
"r"(*(reinterpret_cast<unsigned int *>(®3)))
);
}
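// sts128: 128-bit vectorized shared-memory store (st.shared.v4.b32) of four half2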
__device__ __forceinline__ void sts128(__half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){
asm volatile(
"st.shared.v4.b32 [%0], {%1, %2, %3, %4};\n"
:
: "l"(addr),
"r"(*(reinterpret_cast<unsigned int *>(®0))),
"r"(*(reinterpret_cast<unsigned int *>(®1))),
"r"(*(reinterpret_cast<unsigned int *>(®2))),
"r"(*(reinterpret_cast<unsigned int *>(®3)))
);
}
// __device__ __forceinline__ void ldgsts32(const uint32_t &smem_addr,
// const void *gmem_ptr,
// const uint32_t &src_size, bool guard) {
// asm volatile (
// "{.reg .pred p;\n"
// " setp.ne.b32 p, %3, 0;\n"
// #if __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4
// " @p cp.async.ca.shared.global.L2::128B [%0], [%1], 4, %2;}\n"
// #else
// " @p cp.async.ca.shared.global [%0], [%1], 4, %2;}\n"
// #endif
// :
// : "r"(smem_addr), "l"(gmem_ptr), "r"(src_size), "r"((int)guard)
// );
// }
/*
This implementation is the SIMT-core version.
Each block has 16*16 = 256 threads.
Each thread computes an 8*8 tile of C.
Each warp computes a 32*64 tile of C.
Each block computes a 128*128 tile of C.
Each outer step advances k by 16.
*/
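// Example: thread_id = 77 -> warp_id = 2, warp_row = 1, warp_col = 0,
// thread_row = (77 % 32) / 8 = 1, thread_col = (77 % 32) % 8 = 5, so this thread owns the 8x8 C tile
// starting at row block_row*128 + 32*1 + 8*1 and column block_col*128 + 64*0 + 8*5.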
__global__ void matrix_mul_smit_pipeline_kernel_128x128(__half2* matA, __half2* matBT, __half2* matC, int M, int N, int K) {
const unsigned int block_id = blockIdx.x;
const unsigned int thread_id = threadIdx.x;
const unsigned int block_row = block_id / (N/128);
const unsigned int block_col = block_id % (N/128);
const unsigned int warp_id = thread_id / 32;
const unsigned int warp_row = warp_id / 2;
const unsigned int warp_col = warp_id % 2;
const unsigned int thread_row = (thread_id % 32) / 8;
const unsigned int thread_col = (thread_id % 32) % 8;
const unsigned int LD_buffer = 8;
// shared memory
__shared__ __align__(16 * 1024) char smem[18 * 1024];
// As/Bs each need 128 * 16 halfs = 128 * 16 * 2 bytes = 4096 bytes
// with the LD_buffer padding: 8 rows * (128 + 8) half2 * 4 bytes = 4352 bytes, rounded up to 4608 bytes (4.5 KB)
__half2* As[2] = {reinterpret_cast<__half2 *>(smem),
reinterpret_cast<__half2 *>(smem + 4608)};
__half2* Bs[2] = {reinterpret_cast<__half2 *>(smem + 4608*2),
reinterpret_cast<__half2 *>(smem + 4608*3)};
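// 4 buffers (double-buffered As and Bs) * 4608 bytes = 18432 bytes, matching smem[18 * 1024]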
// TODO: what is the __align__ used for and why we add some buffer into the share memory?
__half2 acc[8][4];
// load C into acc, so the kernel computes C += A * B^T
__half2* from_c = matC + (block_row*128 + warp_row*32 + thread_row*8) * (N/2) + block_col*128/2 + warp_col*64/2 + thread_col*8/2;
#pragma unroll
for (int i=0; i<8; ++i) {
ldg128(from_c+i*N/2, acc[i][0], acc[i][1], acc[i][2], acc[i][3]);
}
// set the outer for loop initial value
__half2* from_a = matA + (block_row*128 + 4*(thread_id/8)) * (K/2) + thread_id%8;
unsigned int to_As = (thread_id%8) * (128+LD_buffer) + 4*(thread_id/8);
__half2* from_b = matBT + (block_col*128 + 4*(thread_id/8)) * (K/2) + thread_id%8;
unsigned int to_Bs = (thread_id%8) * (128+LD_buffer) + 4*(thread_id/8);
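// Layout: global A is row-major [M][K/2] in half2; in shared memory As is stored transposed as
// [k2][m] with a (128 + LD_buffer) half2 row stride, i.e. each thread reads column k2 = thread_id%8
// of 4 consecutive rows and writes them as 4 consecutive half2 in row k2 of As.
// Bs is filled the same way from matBT (B stored transposed, so rows of matBT are columns of B).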
// outer loop
LDG2S(As[0], Bs[0])
__syncthreads();
unsigned int pipeline_indicator = 0;
#pragma unroll
for (int i_step=0; i_step<K/16-1; ++i_step) {
// load sub A, B matrix
LDG2S(As[1-pipeline_indicator], Bs[1-pipeline_indicator])
MATMUL_THREAD(As[pipeline_indicator], Bs[pipeline_indicator])
__syncthreads();
pipeline_indicator = 1 - pipeline_indicator;
}
MATMUL_THREAD(As[pipeline_indicator], Bs[pipeline_indicator])
__syncthreads();
// write back to C
__half2* to_c = matC + (block_row*128 + warp_row*32 + thread_row*8) * (N/2) + block_col*128/2 + warp_col*64/2 + thread_col*8/2;
#pragma unroll
for (int i=0; i<8; ++i) {
stg128(to_c+i*N/2, acc[i][0], acc[i][1], acc[i][2], acc[i][3]);
}
__syncthreads();
return;
}
matrix_template matrix_mul_smit_pipeline_host(const matrix_template& matA, const matrix_template& matBT, matrix_template& matC, int M, int N, int K) {
event_pair timer;
// cudaMalloc device arrays
float* device_matA = 0;
float* device_matBT = 0;
float* device_matC = 0;
cudaMalloc((void**)&device_matA, M * K * sizeof(float));
cudaMalloc((void**)&device_matBT, N * K * sizeof(float));
cudaMalloc((void**)&device_matC, M * N * sizeof(float));
if(device_matA == 0 || device_matBT == 0 || device_matC == 0) {
printf("couldn't allocate memory\n");
return matC;
}
// __half_copy
__half* device_matA_h = 0;
__half* device_matBT_h = 0;
__half* device_matC_h = 0;
cudaMalloc((void**)&device_matA_h, M * K * sizeof(__half));
cudaMalloc((void**)&device_matBT_h, N * K * sizeof(__half));
cudaMalloc((void**)&device_matC_h, M * N * sizeof(__half));
if(device_matA_h == 0 || device_matBT_h == 0 || device_matC_h == 0) {
printf("couldn't allocate memory\n");
return matC;
}
// cuda mem copy
cudaMemcpy(device_matA, matA.data(), M * K * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(device_matBT, matBT.data(), N * K * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(device_matC, matC.data(), M * N * sizeof(float), cudaMemcpyHostToDevice);
cast_kernel_float2half<<<128, 256>>>(device_matA_h, device_matA, M * K);
cast_kernel_float2half<<<128, 256>>>(device_matBT_h, device_matBT, N * K);
cast_kernel_float2half<<<128, 256>>>(device_matC_h, device_matC, M * N);
__half2* device_matA_h2 = reinterpret_cast<__half2 *>(device_matA_h);
__half2* device_matBT_h2 = reinterpret_cast<__half2 *>(device_matBT_h);
__half2* device_matC_h2 = reinterpret_cast<__half2 *>(device_matC_h);
// kernel call
int block_size = 16 * 16;
int grid_size = (M * N) / (128 * 128);
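// note: the kernel has no boundary handling, so this assumes M and N are multiples of 128
// and K is a multiple of 16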
start_timer(&timer);
matrix_mul_smit_pipeline_kernel_128x128<<<grid_size, block_size>>>(device_matA_h2, device_matBT_h2, device_matC_h2, M, N, K);
float kernel_time_ms = stop_timer(&timer);
device_matC_h = reinterpret_cast<__half *>(device_matC_h2);
cast_kernel_half2float<<<128, 256>>>(device_matC, device_matC_h, M * N);
cudaMemcpy(matC.data(), device_matC, M * N * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(device_matA);
cudaFree(device_matBT);
cudaFree(device_matC);
cudaFree(device_matA_h);
cudaFree(device_matBT_h);
cudaFree(device_matC_h);
printf("cuda kernel <matrix_mul_smit_pipeline_kernel_128x128> runtime %f ms.\n", kernel_time_ms);
return matC;
}
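For reference, a minimal way to drive the host wrapper would be something like the sketch below (my assumption here: matrix_template is a std::vector<float>, as the .data()/sizeof(float) usage suggests; A is M*K row-major, matBT is B stored transposed as N*K, and C is M*N):

// minimal usage sketch (assumption: matrix_template is std::vector<float>)
int M = 256, N = 256, K = 256;                        // M, N multiples of 128; K multiple of 16
matrix_template A(M * K, 1.0f), BT(N * K, 1.0f), C(M * N, 0.0f);
C = matrix_mul_smit_pipeline_host(A, BT, C, M, N, K);
// since C starts at zero and A, BT are all ones, every element of C should come back as K
// (exact in FP16, since integers up to 2048 are exactly representable)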