Identifier "__HALF2_TO_UI" is undefined when using asm for cuda

Hi Forum,
I am trying to use inline asm to implement ldg128 and stg128 for CUDA global memory access, and here is my main code for this part:

__device__ __forceinline__ void ldg128(const __half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){
    asm volatile(
        "ld.global.nc.v4.b32 {%1, %2, %3, %4}, [%0];\n"
        : "=r"(__HALF2_TO_UI(reg0)),
          "=r"(__HALF2_TO_UI(reg1)),
          "=r"(__HALF2_TO_UI(reg2)),
          "=r"(__HALF2_TO_UI(reg3))
        : "l"(addr)
    );
}

__device__ __forceinline__ void stg128(__half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3) {
    asm volatile(
        "st.global.v4.b32 [%0], {%1, %2, %3, %4};\n"
        :
        : "l"(addr),
          "r"(__HALF2_TO_UI(reg0)),
          "r"(__HALF2_TO_UI(reg1)),
          "r"(__HALF2_TO_UI(reg2)),
          "r"(__HALF2_TO_UI(reg3))
    );
}

To compile, I have included

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_fp16.hpp>

and added my CUDA arch in the CMake file (so functions like __hfma and __hfma2 work fine). When I try to compile the code above, an error occurs saying:

common.cu(41): error: identifier "__HALF2_TO_UI" is undefined

common.cu(41): error: an asm operand must have scalar type

common.cu(42): error: an asm operand must have scalar type

common.cu(43): error: an asm operand must have scalar type

common.cu(44): error: an asm operand must have scalar type

May I know where I went wrong? I cat'd /usr/local/cuda/include/cuda_fp16.hpp and found the definition of __HALF2_TO_UI, but it seems the code cannot find it…

Thank you for your help!
Chengzhe

That macro is not usable in PTX assembly. The header file you reference is for CUDA C++, not PTX.

Why not just do this operation in CUDA C++?


Oh I see… how could I include PTX?
I am learning how to use those macros lol. The code is from a former working project, and I am trying to learn how to utilize the vector ldg/stg instructions.

The first version of my code used __ldg(), and I am trying to play around with vectorized ldg.

These are 32-bit registers. How about just using the variable itself?
instead of:

"=r"(__HALF2_TO_UI(reg0)),   

do:

 "=r"(reg0),

according to my testing, that compiles cleanly for me

Thank you!
I changed my code to

__device__ __forceinline__ void ldg128(const __half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){
    asm volatile(
        "ld.global.nc.v4.b32 {%1, %2, %3, %4}, [%0];\n"
        : "=r"(reg0),
          "=r"(reg1),
          "=r"(reg2),
          "=r"(reg3)
        : "l"(addr)
    );
}

__device__ __forceinline__ void stg128(__half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3) {
    asm volatile(
        "st.global.v4.b32 [%0], {%1, %2, %3, %4};\n"
        :
        : "l"(addr),
          "r"(reg0),
          "r"(reg1),
          "r"(reg2),
          "r"(reg3)
    );
}

but it still says error: an asm operand must have scalar type. Do you know what that means?

Which CUDA version are you using? Please switch to CUDA 12.2 or newer.


Oh, it’s cuda_11.4.r11.4… Got it, I will install CUDA 12.2! Thank you!!
// btw, do we have some CUDA 12.2 based Docker images for the Orin developer kit lol?

cancel that, I made a mistake.

You mean about the CUDA version 12.2? lol

This seems to work:

# cat t63.cu
#include <cuda_fp16.h>

__device__ __forceinline__ void ldg128(const __half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){
    unsigned reg0u, reg1u, reg2u, reg3u;
    asm volatile(
        "ld.global.nc.v4.b32 {%0, %1, %2, %3}, [%4];\n"
        : "=r"(reg0u),
          "=r"(reg1u),
          "=r"(reg2u),
          "=r"(reg3u)
        : "l"(addr)
    );
    reg0 = *(reinterpret_cast<__half2 *>(&reg0u));
    reg1 = *(reinterpret_cast<__half2 *>(&reg1u));
    reg2 = *(reinterpret_cast<__half2 *>(&reg2u));
    reg3 = *(reinterpret_cast<__half2 *>(&reg3u));
}

__global__ void k(__half2* a, __half2 *b){
        __half2 reg0, reg1, reg2, reg3;
        ldg128(a, reg0, reg1, reg2, reg3);
        b[0] = reg0;
        b[1] = reg1;
        b[2] = reg2;
        b[3] = reg3;
}
int main(){

  __half2 *a = NULL;
  k<<<1,1>>>(a, a);
  cudaDeviceSynchronize();
}
# nvcc t63.cu -o t63
# cuobjdump -sass ./t63

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

        code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

        code for sm_52
                Function : _Z1kP7__half2S0_
        .headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM52 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM52)"
                                                              /* 0x001fc800fe2007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20] ;     /* 0x4c98078000870001 */
        /*0010*/                   MOV R2, c[0x0][0x140] ;    /* 0x4c98078005070002 */
        /*0018*/                   MOV R3, c[0x0][0x144] ;    /* 0x4c98078005170003 */
                                                              /* 0x001fc800fe2007b1 */
        /*0028*/                   LDG.E.CI.U.128 R8, [R2] ;  /* 0xeed7a00000070208 */
        /*0030*/                   MOV R4, c[0x0][0x148] ;    /* 0x4c98078005270004 */
        /*0038*/                   MOV R5, c[0x0][0x14c] ;    /* 0x4c98078005370005 */
                                                              /* 0x001fc400fe2107f1 */
        /*0048*/                   STG.E [R4], R8 ;           /* 0xeedc200000070408 */
        /*0050*/                   STG.E [R4+0x4], R9 ;       /* 0xeedc200000470409 */
        /*0058*/                   STG.E [R4+0x8], R10 ;      /* 0xeedc20000087040a */
                                                              /* 0x001fbc00fde007f1 */
        /*0068*/                   STG.E [R4+0xc], R11 ;      /* 0xeedc200000c7040b */
        /*0070*/                   NOP ;                      /* 0x50b0000000070f00 */
        /*0078*/                   NOP ;                      /* 0x50b0000000070f00 */
                                                              /* 0x001ffc00ffe007e2 */
        /*0088*/                   NOP ;                      /* 0x50b0000000070f00 */
        /*0090*/                   EXIT ;                     /* 0xe30000000007000f */
        /*0098*/                   BRA 0x98 ;                 /* 0xe2400fffff87000f */
                                                              /* 0x001f8000fc0007e0 */
        /*00a8*/                   NOP;                       /* 0x50b0000000070f00 */
        /*00b0*/                   NOP;                       /* 0x50b0000000070f00 */
        /*00b8*/                   NOP;                       /* 0x50b0000000070f00 */
                ..........



Fatbin ptx code:
================
arch = sm_52
code version = [8,2]
host = linux
compile_size = 64bit
compressed
#

You can see in the compiled code there aren’t any extra instructions.
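
For the store side, a matching stg128 could follow the same pattern, copying the __half2 values into local unsigned variables before handing them to the asm statement. A minimal sketch of that idea (untested here):

__device__ __forceinline__ void stg128(__half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3) {
    // copy the 32-bit payloads into scalar registers for the "r" constraint
    unsigned reg0u = *(reinterpret_cast<unsigned *>(&reg0));
    unsigned reg1u = *(reinterpret_cast<unsigned *>(&reg1));
    unsigned reg2u = *(reinterpret_cast<unsigned *>(&reg2));
    unsigned reg3u = *(reinterpret_cast<unsigned *>(&reg3));
    asm volatile(
        "st.global.v4.b32 [%0], {%1, %2, %3, %4};\n"
        :
        : "l"(addr),
          "r"(reg0u),
          "r"(reg1u),
          "r"(reg2u),
          "r"(reg3u)
        : "memory"
    );
}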


Thank you! I will try it!
So it seems that we need to pass the registers as unsigned ints into the asm statement, either using __HALF2_TO_UI or using reinterpret_cast to convert the __half2, right?

I haven’t studied __HALF2_TO_UI

btw, is cuobjdump converting the C code to SASS? Can it do PTX? Super cool tools!

__HALF2_TO_UI is not in cuda_fp16.h, it is in cuda_fp16.hpp

I took a look in that file, and it seems to be doing what I did:

#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
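
If the macro is not visible in your CUDA version, one workaround (just a sketch with a hypothetical name, not an official API) is to define an equivalent helper yourself and use it in the operand list:

// Hypothetical equivalent of the __HALF2_TO_UI macro quoted above:
// reinterpret the 4-byte __half2 storage as an unsigned int lvalue.
#define MY_HALF2_TO_UI(var) (*(reinterpret_cast<unsigned int *>(&(var))))

// usage in an operand list, e.g.  "=r"(MY_HALF2_TO_UI(reg0))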

The CUDA documentation is here. cuobjdump is part of the binary utilities, look for “CUDA Binary Utilities” on the left, and click on it.
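
Regarding the PTX question: cuobjdump can also dump the embedded PTX from the fatbinary (it is embedded in the builds above), for example:

# cuobjdump -ptx ./t63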


That’s really awesome. Thank you so much!

This is how I would do the same thing using CUDA C++. I don’t see a need to use inline PTX for this.

# cat t63.cu
#include <cuda_fp16.h>
__device__ __forceinline__ void ldg128(const __half2* addr, __half2 &reg0, __half2 &reg1, __half2 &reg2, __half2 &reg3){

    const int4 val = __ldg(reinterpret_cast<const int4 *>(addr));
    reg0 = *(reinterpret_cast<const __half2 *>(&val.x));
    reg1 = *(reinterpret_cast<const __half2 *>(&val.y));
    reg2 = *(reinterpret_cast<const __half2 *>(&val.z));
    reg3 = *(reinterpret_cast<const __half2 *>(&val.w));
}

__global__ void k(__half2* a, __half2 *b){
        __half2 reg0, reg1, reg2, reg3;
        ldg128(a, reg0, reg1, reg2, reg3);
        b[0] = reg0;
        b[1] = reg1;
        b[2] = reg2;
        b[3] = reg3;
}
int main(){

  __half2 *a = NULL;
  k<<<1,1>>>(a, a);
  cudaDeviceSynchronize();
}
# nvcc t63.cu -o t63
# cuobjdump -sass ./t63

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

        code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

        code for sm_52
                Function : _Z1kP7__half2S0_
        .headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM52 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM52)"
                                                              /* 0x001fc800fe2007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20] ;     /* 0x4c98078000870001 */
        /*0010*/                   MOV R2, c[0x0][0x140] ;    /* 0x4c98078005070002 */
        /*0018*/                   MOV R3, c[0x0][0x144] ;    /* 0x4c98078005170003 */
                                                              /* 0x001fc800fe2007b1 */
        /*0028*/                   LDG.E.CI.U.128 R8, [R2] ;  /* 0xeed7a00000070208 */
        /*0030*/                   MOV R4, c[0x0][0x148] ;    /* 0x4c98078005270004 */
        /*0038*/                   MOV R5, c[0x0][0x14c] ;    /* 0x4c98078005370005 */
                                                              /* 0x001fc400fe2107f1 */
        /*0048*/                   STG.E [R4], R8 ;           /* 0xeedc200000070408 */
        /*0050*/                   STG.E [R4+0x4], R9 ;       /* 0xeedc200000470409 */
        /*0058*/                   STG.E [R4+0x8], R10 ;      /* 0xeedc20000087040a */
                                                              /* 0x001fbc00fde007f1 */
        /*0068*/                   STG.E [R4+0xc], R11 ;      /* 0xeedc200000c7040b */
        /*0070*/                   NOP ;                      /* 0x50b0000000070f00 */
        /*0078*/                   NOP ;                      /* 0x50b0000000070f00 */
                                                              /* 0x001ffc00ffe007e2 */
        /*0088*/                   NOP ;                      /* 0x50b0000000070f00 */
        /*0090*/                   EXIT ;                     /* 0xe30000000007000f */
        /*0098*/                   BRA 0x98 ;                 /* 0xe2400fffff87000f */
                                                              /* 0x001f8000fc0007e0 */
        /*00a8*/                   NOP;                       /* 0x50b0000000070f00 */
        /*00b0*/                   NOP;                       /* 0x50b0000000070f00 */
        /*00b8*/                   NOP;                       /* 0x50b0000000070f00 */
                ..........



Fatbin ptx code:
================
arch = sm_52
code version = [8,2]
host = linux
compile_size = 64bit
compressed
#

This of course requires the __half pointer to be properly aligned for a 128 bit load, but the other variants require that also.
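
If you want to catch a misaligned pointer during development, a minimal sketch (assuming a device-side assert is acceptable in your build) could be:

#include <assert.h>
#include <stdint.h>

// A 128-bit (v4.b32 / int4) access requires a 16-byte aligned address.
__device__ __forceinline__ void assert_aligned_16(const void *addr) {
    assert((reinterpret_cast<uintptr_t>(addr) & 0xF) == 0);
}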


Side-remark: My understanding is that bit_cast (new in C++20) is generally preferable to reinterpret_cast, as discussed here, for example:

The CUDA Programming Guide seems to list excluded C++20 features rather than providing a positive list of included features, and I have not yet tried to use it myself.
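
For what it's worth, a minimal sketch of that idea (untested; it assumes nvcc is invoked with -std=c++20 and accepts std::bit_cast in device code) might look like:

#include <bit>           // std::bit_cast (C++20)
#include <cuda_fp16.h>

// Untested sketch: convert between the 32-bit payload and __half2 without reinterpret_cast.
__device__ __forceinline__ __half2 bits_to_half2(unsigned int bits) {
    return std::bit_cast<__half2>(bits);
}

__device__ __forceinline__ unsigned int half2_to_bits(__half2 h) {
    return std::bit_cast<unsigned int>(h);
}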

