How to write inline asm function sts128 (store 128 bits to shared memory)?

Hi Forum,
I am trying to replace the part of the code that writes data from registers to shared memory with an inline asm function, but my code crashed without any error message after I made that modification… would you please help me with the correct format of the inline function?
My original code is:

__half2 tmp_a[4], tmp_b[4];
// load data into the two tmp registers, a_share and b_share are __half2 pointers to shared memory space
.......
#pragma unroll
for (int i=0; i<4; ++i) (a_share+to_As+i)[0] = tmp_a[i];
#pragma unroll
for (int i=0; i<4; ++i) (b_share+to_Bs+i)[0] = tmp_b[i];

and that code works well. My asm version is:

__half2 tmp_a[4], tmp_b[4];
// load data into the two tmp registers
.......
sts128(a_share+to_As, tmp_a[0], tmp_a[1], tmp_a[2], tmp_a[3]);
sts128(b_share+to_Bs, tmp_b[0], tmp_b[1], tmp_b[2], tmp_b[3]);

where:

__device__ __forceinline__ void sts128(__half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){
    asm volatile(
        "st.shared.v4.b32 [%0], {%1, %2, %3, %4};\n"
        :
        : "l"(addr),
          "r"(*(reinterpret_cast<unsigned int *>(&reg0))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg1))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg2))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg3)))
    );
}

The code above just crashes the kernel… Do you know where I went wrong?
Thank you so much for your help!

Is there a specific reason why you would like to use inline PTX? You could just cast the array and the pointer to int4 to get 128-bit memory operations in C++.
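Something along these lines (an untested sketch rather than your exact code; store128 is just a made-up helper name, and it assumes both a_share + to_As and tmp_a are 16-byte aligned, which a vectorized access requires):

#include <cuda_fp16.h>

// 128-bit store expressed in plain CUDA C++: reinterpret both sides as int4
// (a built-in 16-byte-aligned vector type) so the compiler can emit one
// vectorized store instead of four scalar stores.
__device__ __forceinline__ void store128(__half2* dst, const __half2* src) {
    *reinterpret_cast<int4*>(dst) = *reinterpret_cast<const int4*>(src);
}

// usage, matching the loop in the first post:
//   alignas(16) __half2 tmp_a[4];
//   store128(a_share + to_As, tmp_a);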

Well, I am learning PTX and inline asm lol, so it’s more like homework for myself ;-)

An update: the kernel containing the code above returned the error “an illegal memory access was encountered”.

The strange thing is that I have been using a very similar asm function, stg128, to store data into global memory, and that function works smoothly, but sts128 triggers the illegal access just by changing st.global to st.shared:

__device__ __forceinline__ void stg128(__half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3) {
    asm volatile(
        "st.global.v4.b32 [%0], {%1, %2, %3, %4};\n"
        :
        : "l"(addr),
          "r"(*(reinterpret_cast<unsigned int *>(&reg0))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg1))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg2))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg3)))
    );
}

Can you show the complete code for your non-working example?

Sure, though it might be a little bit long. It’s basically a matrix multiplication (128*128 per block).
The sts128 function is in the load-global-to-shared-memory part (LDG2S):

#include "matrix_mul.cuh"
#include "cuda_utils.h"

#define LDG2S(a_share, b_share) \
{ \
    __half2 tmp_a[4] = { \
        __ldg(from_a), \
        __ldg(from_a + 1*K/2), \
        __ldg(from_a + 2*K/2), \
        __ldg(from_a + 3*K/2) \
    }; \
    __half2 tmp_b[4] = { \
        __ldg(from_b), \
        __ldg(from_b + 1*K/2), \
        __ldg(from_b + 2*K/2), \
        __ldg(from_b + 3*K/2), \
    }; \
    sts128(a_share+to_As, tmp_a[0], tmp_a[1], tmp_a[2], tmp_a[3]); \
    sts128(b_share+to_Bs, tmp_b[0], tmp_b[1], tmp_b[2], tmp_b[3]); \
    from_a += 8; from_b += 8; \
} \

#define MATMUL_THREAD(a_share, b_share) \
{ \
    unsigned int from_As = warp_row*32 + thread_row*8; \
    unsigned int from_Bs = warp_col*64 + thread_col*8; \
    _Pragma("unroll") \
    for (int i_inner_step=0; i_inner_step<8; ++i_inner_step) { \
        __half2 pA[8], pB[8]; \
        _Pragma("unroll") \
        for (int i=0; i<8; ++i){ \
            pA[i] = (a_share+from_As+i)[0]; \
            pB[i] = (b_share+from_Bs+i)[0]; \
        } \
        _Pragma("unroll") \
        for (int i=0; i<4; ++i) { \
            _Pragma("unroll") \
            for (int j=0; j<4; ++j) { \
                __half2 tmp[4] = {__half2{pB[2*j].x, pB[2*j+1].y}, \
                                __half2{pA[2*i].y, pA[2*i].x}, \
                                __half2{pB[2*j].y, pB[2*j+1].x}, \
                                __half2{pA[2*i+1].y, pA[2*i+1].x}}; \
                acc[2*i][j] = __hfma2(tmp[1], tmp[2], acc[2*i][j]); \
                acc[2*i][j] = __hfma2(pA[2*i], tmp[0], acc[2*i][j]); \
                acc[2*i+1][j] = __hfma2(tmp[3], tmp[2], acc[2*i+1][j]); \
                acc[2*i+1][j] = __hfma2(pA[2*i+1], tmp[0], acc[2*i+1][j]); \
            } \
        } \
        from_As += (128+LD_buffer); \
        from_Bs += (128+LD_buffer); \
    } \
} \

// #define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var))) from cuda_fp16.hpp
__device__ __forceinline__ void ldg128(const __half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){
    unsigned int reg0_ui, reg1_ui, reg2_ui, reg3_ui;
    asm volatile(
        "ld.global.nc.v4.b32 {%0, %1, %2, %3}, [%4];\n"
        : "=r"(reg0_ui),
          "=r"(reg1_ui),
          "=r"(reg2_ui),
          "=r"(reg3_ui)
        : "l"(addr)
    );
    reg0 = *(reinterpret_cast<__half2 *>(&reg0_ui));
    reg1 = *(reinterpret_cast<__half2 *>(&reg1_ui));
    reg2 = *(reinterpret_cast<__half2 *>(&reg2_ui));
    reg3 = *(reinterpret_cast<__half2 *>(&reg3_ui));
}

__device__ __forceinline__ void stg128(__half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3) {
    asm volatile(
        "st.global.v4.b32 [%0], {%1, %2, %3, %4};\n"
        :
        : "l"(addr),
          "r"(*(reinterpret_cast<unsigned int *>(&reg0))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg1))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg2))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg3)))
    );
}

__device__ __forceinline__ void sts128(__half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){
    asm volatile(
        "st.shared.v4.b32 [%0], {%1, %2, %3, %4};\n"
        :
        : "l"(addr),
          "r"(*(reinterpret_cast<unsigned int *>(&reg0))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg1))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg2))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg3)))
    );
}

// __device__ __forceinline__ void ldgsts32(const uint32_t &smem_addr,
//                                          const void *gmem_ptr,
//                                          const uint32_t &src_size, bool guard) {
//     asm volatile (
//         "{.reg .pred p;\n"
//         " setp.ne.b32 p, %3, 0;\n"
// #if __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4
//         " @p cp.async.ca.shared.global.L2::128B [%0], [%1], 4, %2;}\n"
// #else
//         " @p cp.async.ca.shared.global [%0], [%1], 4, %2;}\n"
// #endif
//         :
//         : "r"(smem_addr), "l"(gmem_ptr), "r"(src_size), "r"((int)guard)
//     );
// }

/*
This implementation is the SIMT core version.
For each block, we assign 16*16 threads,
For each thread, we assign 8*8 C matrix
For each block, we assign 128*128 C matrix,
For each warp, we assign 32*64 C matrix
For each step, we set k = 16
*/


__global__ void matrix_mul_smit_pipeline_kernel_128x128(__half2* matA, __half2* matBT, __half2* matC, int M, int N, int K) {
    const unsigned int block_id = blockIdx.x;
    const unsigned int thread_id = threadIdx.x;
    const unsigned int block_row = block_id / (N/128);
    const unsigned int block_col = block_id % (N/128);
    const unsigned int warp_id = thread_id / 32;
    const unsigned int warp_row = warp_id / 2;
    const unsigned int warp_col = warp_id % 2;
    const unsigned int thread_row = (thread_id % 32) / 8;
    const unsigned int thread_col = (thread_id % 32) % 8;

    const unsigned int LD_buffer = 8;

    // shared memory
    __shared__ __align__(16 * 1024) char smem[18 * 1024];
    // As/Bs needs 128 * 16 * half = 128 * 16 * 16 bits = 32768 bits = 32768 / 8 char = 4096 char
    // add the LD_buffer: need 4352 char = 4.25 k ==> 4.5 k
    __half2* As[2] = {reinterpret_cast<__half2 *>(smem),
                    reinterpret_cast<__half2 *>(smem + 4608)};
    __half2* Bs[2] = {reinterpret_cast<__half2 *>(smem + 4608*2),
                    reinterpret_cast<__half2 *>(smem + 4608*3)};
    // TODO: what is the __align__ used for and why we add some buffer into the share memory?

    __half2 acc[8][4];
    // load C into the acc
    __half2* from_c = matC + (block_row*128 + warp_row*32 + thread_row*8) * (N/2) + block_col*128/2 + warp_col*64/2 + thread_col*8/2;
    #pragma unroll
    for (int i=0; i<8; ++i) {
        ldg128(from_c+i*N/2, acc[i][0], acc[i][1], acc[i][2], acc[i][3]);
    }

    // set the outer for loop initial value
    __half2* from_a = matA + (block_row*128 + 4*(thread_id/8)) * (K/2) + thread_id%8;
    unsigned int to_As = (thread_id%8) * (128+LD_buffer) + 4*(thread_id/8);
    __half2* from_b = matBT + (block_col*128 + 4*(thread_id/8)) * (K/2) + thread_id%8; 
    unsigned int to_Bs = (thread_id%8) * (128+LD_buffer) + 4*(thread_id/8);
    // outer loop
    LDG2S(As[0], Bs[0])
    __syncthreads();
    unsigned int pipeline_indicator = 0;
    #pragma unroll
    for (int i_step=0; i_step<K/16-1; ++i_step) {
        // load sub A, B matrix
        LDG2S(As[1-pipeline_indicator], Bs[1-pipeline_indicator])
        MATMUL_THREAD(As[pipeline_indicator], Bs[pipeline_indicator])
        __syncthreads();
        pipeline_indicator = 1 - pipeline_indicator;
    }
    MATMUL_THREAD(As[pipeline_indicator], Bs[pipeline_indicator])
    __syncthreads();
    // write back to C
    __half2* to_c = matC + (block_row*128 + warp_row*32 + thread_row*8) * (N/2) + block_col*128/2 + warp_col*64/2 + thread_col*8/2;
    #pragma unroll
    for (int i=0; i<8; ++i) {
        stg128(to_c+i*N/2, acc[i][0], acc[i][1], acc[i][2], acc[i][3]);
    }
    __syncthreads();
    return;
}

matrix_template matrix_mul_smit_pipeline_host(const matrix_template& matA, const matrix_template& matBT, matrix_template& matC, int M, int N, int K) {
    event_pair timer;
    // cudaMalloc device arrays
    float* device_matA = 0;
    float* device_matBT = 0;
    float* device_matC = 0;
    cudaMalloc((void**)&device_matA, M * K * sizeof(float));
    cudaMalloc((void**)&device_matBT, N * K * sizeof(float));
    cudaMalloc((void**)&device_matC, M * N * sizeof(float));
    if(device_matA == 0 || device_matBT == 0 || device_matC == 0) {
        printf("couldn't allocate memory\n");
        return matC;
    }
    // __half_copy
    __half* device_matA_h = 0;
    __half* device_matBT_h = 0;
    __half* device_matC_h = 0;
    cudaMalloc((void**)&device_matA_h, M * K * sizeof(__half));
    cudaMalloc((void**)&device_matBT_h, N * K * sizeof(__half));
    cudaMalloc((void**)&device_matC_h, M * N * sizeof(__half));
    if(device_matA_h == 0 || device_matBT_h == 0 || device_matC_h == 0) {
        printf("couldn't allocate memory\n");
        return matC;
    }
    // cuda mem copy
    cudaMemcpy(device_matA, matA.data(), M * K * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(device_matBT, matBT.data(), N * K * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(device_matC, matC.data(), M * N * sizeof(float), cudaMemcpyHostToDevice);

    cast_kernel_float2half<<<128, 256>>>(device_matA_h, device_matA, M * K);
    cast_kernel_float2half<<<128, 256>>>(device_matBT_h, device_matBT, N * K);
    cast_kernel_float2half<<<128, 256>>>(device_matC_h, device_matC, M * N);

    __half2* device_matA_h2 = reinterpret_cast<__half2 *>(device_matA_h); 
    __half2* device_matBT_h2 = reinterpret_cast<__half2 *>(device_matBT_h);
    __half2* device_matC_h2 = reinterpret_cast<__half2 *>(device_matC_h);

    // kernel call
    int block_size = 16 * 16;
    int grid_size = (M * N) / (128 * 128);
    start_timer(&timer);
    matrix_mul_smit_pipeline_kernel_128x128<<<grid_size, block_size>>>(device_matA_h2, device_matBT_h2, device_matC_h2, M, N, K);
    float kernel_time_ms = stop_timer(&timer);
    device_matC_h = reinterpret_cast<__half *>(device_matC_h2);
    cast_kernel_half2float<<<128, 256>>>(device_matC, device_matC_h, M * N);
    cudaMemcpy(matC.data(), device_matC, M * N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(device_matA);
    cudaFree(device_matBT);
    cudaFree(device_matC);
    cudaFree(device_matA_h);
    cudaFree(device_matBT_h);
    cudaFree(device_matC_h);
    printf("cuda kernel <matrix_mul_smit_pipeline_kernel_128x128> runtime %f ms.\n", kernel_time_ms);
    return matC;
}

Or on GitHub: [2023/10/26] try using vectorized read/write everywhere, but got asm … · chengzhe-xu/project-gravity@19809e0 · GitHub (simt_pipeline.cu)

Well, I expected a minimal example which can be executed.

I am not a PTX expert. As far as I know, the vector operand in load and store instructions must be aligned to the vector size, and the unpacked registers must be contiguous.
I would suggest specifying proper alignment for the source and target registers in both the global and shared vectorized loads, either by using alignas or by using proper vector types:

ldg128(const __half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){
   // unsigned int reg0_ui, reg1_ui, reg2_ui, reg3_ui;
   uint4 reg_ui;
...
//__half2 tmp_a[4]
alignas(16) __half2 tmp_a[4];

I might have missed it, but I don’t think CUDA provides vector types with __half2 elements. You could easily build your own 4-vector type with alignas(), however. And if you access such a 4-vector, the compiler will happily use 128-bit loads and stores, no need to use PTX inline assembly.

The fact that the ldg128() code above does nothing to ensure the required 16-byte alignment suggests that it has been “working fine” by sheer luck so far.

[Later:] I see why you may have started using PTX inline assembly. For some reason that I do not understand, the compiler insists on generating 4 loads for my 16-byte aligned vector type. It does produce a vector store into shared memory, though. Might be worth checking whether the compiler from CUDA 12.3 generates a vector load from global memory.

[Later still:] I replaced alignas(16) with CUDA’s own __align__(16) and that does not help with vectorizing the load from global memory. Not sure what is going on.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_fp16.h>

struct alignas(16) my_half2_4 {__half2 arr[4]; };
typedef struct my_half2_4 my_half2_4;

__global__ void kernel (my_half2_4 *data)
{
    __shared__ my_half2_4 foo;
    
    if (threadIdx.x == 0) {
        foo = *data;
    }
    __syncthreads();
    printf ("thread %d: %12.5f %12.5f\n", 
            threadIdx.x, 
            __low2float  (foo.arr[threadIdx.x]),
            __high2float (foo.arr[threadIdx.x]));
}

int main (void)
{
    my_half2_4 *data = 0;
    unsigned short d[8] = {0x3c00, 0xc000, 0x4200, 0xc400, 
                           0x4500, 0xc600, 0x4700, 0xc800};
    cudaMalloc ((void**)&data, sizeof (*data));
    printf ("data=%p\n", data);
    cudaMemcpy (data, d, sizeof (*data), cudaMemcpyHostToDevice);
    kernel<<<1,4>>>(data);
    cudaDeviceSynchronize();
    return EXIT_SUCCESS;
}

CUDA 12.3 also generates 4 scalar global loads and a vectorized shared store:

ld.global.u32 %r2, [%rd2+12];
ld.global.u32 %r3, [%rd2+8];
ld.global.u32 %r4, [%rd2+4];
ld.global.u32 %r5, [%rd2];
st.shared.v4.u32 [_ZZ6kernelP10my_half2_4E3foo], {%r5, %r4, %r3, %r2};



It seems evident you are using the same a_share address for both CUDA C++ and (inline) PTX.

When passing a shared address from CUDA C++ to (inline) PTX, it is often necessary to convert that address into the form the PTX instruction expects in the shared logical space. My guess would be that this is the issue.

Here is a complete example:


# cat t67.cu
#include <cuda_fp16.h>

__device__ __forceinline__ void sts128(__half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){
    asm volatile(
        "st.shared.v4.b32 [%0], {%1, %2, %3, %4};\n"
        :
        : "l"(addr),
          "r"(*(reinterpret_cast<unsigned int *>(&reg0))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg1))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg2))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg3)))
    );
}

__global__ void k(__half2 *a, __half2 *b){

  __shared__ __half2 a_share[256];
  __half2 tmp_a[4];
  tmp_a[0] = a[0];
  tmp_a[1] = a[1];
  tmp_a[2] = a[2];
  tmp_a[3] = a[3];
#ifndef USE_FIX
  sts128(a_share, tmp_a[0], tmp_a[1], tmp_a[2], tmp_a[3]);
#else
  size_t as = __cvta_generic_to_shared(a_share);
  __half2 *my_as = reinterpret_cast<__half2 *>(as);
  sts128(my_as, tmp_a[0], tmp_a[1], tmp_a[2], tmp_a[3]);
#endif
  b[0] = a_share[3];
  b[1] = a_share[2];
  b[2] = a_share[1];
  b[3] = a_share[0];
}

int main(){

  __half2 *a;
  cudaMallocManaged(&a, 256*sizeof(__half2));
  k<<<1,1>>>(a, a+128);
  cudaDeviceSynchronize();
}
# nvcc -o t67 t67.cu -arch=sm_89
# compute-sanitizer ./t67
========= COMPUTE-SANITIZER
========= Invalid __shared__ write of size 16 bytes
=========     at 0xb0 in k(__half2 *, __half2 *)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x4e000000 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x32e950]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:libcudart_static_4d8b33a106dceb3c07a56e26de61f2d53bb62a68 [0x1060e]
=========                in /root/bobc/./t67
=========     Host Frame:cudaLaunchKernel [0x7081e]
=========                in /root/bobc/./t67
=========     Host Frame:cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*) [0xaf8d]
=========                in /root/bobc/./t67
=========     Host Frame:__device_stub__Z1kP7__half2S0_(__half2*, __half2*) [0xae0c]
=========                in /root/bobc/./t67
=========     Host Frame:k(__half2*, __half2*) [0xae4d]
=========                in /root/bobc/./t67
=========     Host Frame:main [0xac6a]
=========                in /root/bobc/./t67
=========     Host Frame: [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xaac5]
=========                in /root/bobc/./t67
=========
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x47e786]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:cudaDeviceSynchronize [0x48734]
=========                in /root/bobc/./t67
=========     Host Frame:main [0xac6f]
=========                in /root/bobc/./t67
=========     Host Frame: [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xaac5]
=========                in /root/bobc/./t67
=========
========= ERROR SUMMARY: 2 errors
# nvcc -o t67 t67.cu -arch=sm_89 -DUSE_FIX
# compute-sanitizer ./t67
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
#

I’m not suggesting this is exactly how I would write the code. I did this to leave your sts128 function unchanged. If I were doing this, I would probably rewrite the sts128 function in some fashion, either to handle the shared conversion, or to accept the size_t variable without requiring a reinterpretation.
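One possible shape for that rewrite (an untested sketch; it folds the __cvta_generic_to_shared() conversion into the helper, so call sites can keep passing the generic __half2* unchanged):

__device__ __forceinline__ void sts128(__half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){
    // Convert the generic address to a shared-state-space address before
    // handing it to st.shared; passing the generic pointer directly is what
    // produced the out-of-bounds shared write above.
    size_t saddr = __cvta_generic_to_shared(addr);
    asm volatile(
        "st.shared.v4.b32 [%0], {%1, %2, %3, %4};\n"
        :
        : "l"(saddr),
          "r"(*(reinterpret_cast<unsigned int *>(&reg0))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg1))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg2))),
          "r"(*(reinterpret_cast<unsigned int *>(&reg3)))
    );
}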

For completeness, here is the sass for the -DUSE_FIX compiled case:

# cuobjdump -sass ./t67

Fatbin elf code:
================
arch = sm_89
code version = [1,7]
host = linux
compile_size = 64bit

        code for sm_89

Fatbin elf code:
================
arch = sm_89
code version = [1,7]
host = linux
compile_size = 64bit

        code for sm_89
                Function : _Z1kP7__half2S0_
        .headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM89 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM89)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                    /* 0x00000a0000017a02 */
                                                                             /* 0x000fc40000000f00 */
        /*0010*/                   MOV R2, c[0x0][0x160] ;                   /* 0x0000580000027a02 */
                                                                             /* 0x000fe20000000f00 */
        /*0020*/                   IMAD.MOV.U32 R3, RZ, RZ, c[0x0][0x164] ;  /* 0x00005900ff037624 */
                                                                             /* 0x000fe200078e00ff */
        /*0030*/                   ULDC.64 UR4, c[0x0][0x118] ;              /* 0x0000460000047ab9 */
                                                                             /* 0x000fc80000000a00 */
        /*0040*/                   LDG.E R8, [R2.64] ;                       /* 0x0000000402087981 */
                                                                             /* 0x000ea8000c1e1900 */
        /*0050*/                   LDG.E R9, [R2.64+0x4] ;                   /* 0x0000040402097981 */
                                                                             /* 0x000ea8000c1e1900 */
        /*0060*/                   LDG.E R10, [R2.64+0x8] ;                  /* 0x00000804020a7981 */
                                                                             /* 0x000ea8000c1e1900 */
        /*0070*/                   LDG.E R11, [R2.64+0xc] ;                  /* 0x00000c04020b7981 */
                                                                             /* 0x000ea2000c1e1900 */
        /*0080*/                   MOV R4, c[0x0][0x168] ;                   /* 0x00005a0000047a02 */
                                                                             /* 0x000fe20000000f00 */
        /*0090*/                   IMAD.MOV.U32 R5, RZ, RZ, c[0x0][0x16c] ;  /* 0x00005b00ff057624 */
                                                                             /* 0x000fc400078e00ff */
        /*00a0*/                   STS.128 [RZ], R8 ;                        /* 0x00000008ff007388 */
                                                                             /* 0x004fe80000000c00 */
        /*00b0*/                   LDS.128 R12, [RZ] ;                       /* 0x00000000ff0c7984 */
                                                                             /* 0x000e280000000c00 */
        /*00c0*/                   STG.E [R4.64], R11 ;                      /* 0x0000000b04007986 */
                                                                             /* 0x000fe8000c101904 */
        /*00d0*/                   STG.E [R4.64+0x4], R14 ;                  /* 0x0000040e04007986 */
                                                                             /* 0x001fe8000c101904 */
        /*00e0*/                   STG.E [R4.64+0x8], R13 ;                  /* 0x0000080d04007986 */
                                                                             /* 0x000fe8000c101904 */
        /*00f0*/                   STG.E [R4.64+0xc], R12 ;                  /* 0x00000c0c04007986 */
                                                                             /* 0x000fe2000c101904 */
        /*0100*/                   EXIT ;                                    /* 0x000000000000794d */
                                                                             /* 0x000fea0003800000 */
        /*0110*/                   BRA 0x110;                                /* 0xfffffff000007947 */
                                                                             /* 0x000fc0000383ffff */
        /*0120*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*0130*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*0140*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*0150*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*0160*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*0170*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*0180*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*0190*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*01a0*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*01b0*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*01c0*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*01d0*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*01e0*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
        /*01f0*/                   NOP;                                      /* 0x0000000000007918 */
                                                                             /* 0x000fc00000000000 */
                ..........



Fatbin ptx code:
================
arch = sm_89
code version = [8,2]
host = linux
compile_size = 64bit
compressed
#

confirming that a 128-bit STS instruction is being used.

If you’re still having trouble, I suggest providing an example that is as short as I have provided, and as complete as I have provided, that demonstrates the issue.


@striker159 Thanks for checking with CUDA 12.3. Best I can tell, the failure to vectorize the load is somehow related to how the opaque type __half2 is implemented. If I replace it with my own struct-based equivalent my_half2, I get the desired and expected 128-bit load from global memory (see modified code below). I am both puzzled and annoyed.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_fp16.h>

struct alignas(4) my_half2 {__half x, y;};
typedef struct my_half2 my_half2;
struct alignas(16) my_half2_4 {my_half2 arr[4]; };
typedef struct my_half2_4 my_half2_4;

__global__ void kernel (my_half2_4 *data)
{
    __shared__ my_half2_4 foo;
    
    if (threadIdx.x == 0) {
        foo = *data;
    }
    __syncthreads();
    printf ("thread %d: %13.5e %13.5e\n", threadIdx.x, 
            __half2float (foo.arr[threadIdx.x].x),
            __half2float (foo.arr[threadIdx.x].y));
}

int main (void)
{
    my_half2_4 *data = 0;
    unsigned short d[8] = {0x3c00, 0xc000, 0x4200, 0xc400, 
                           0x4500, 0xc600, 0x4700, 0xc800};
    cudaMalloc ((void**)&data, sizeof (*data));
    printf ("data=%p\n", data);
    cudaMemcpy (data, d, sizeof (*data), cudaMemcpyHostToDevice);
    kernel<<<1,4>>>(data);
    cudaDeviceSynchronize();
    return EXIT_SUCCESS;
}

Thank you so much @striker159, @Robert_Crovella, and @njuffa! You are right: the issue is that I passed a generic C++ __half2 address directly to PTX, and @Robert_Crovella’s solution works well! You guys are awesome!
