How to use the `ex2.approx.f16x2` instruction?

I am computing 2^x (aka ex2). For the rest of my CUDA kernel, I am using half2 vectorized instructions, so I would like to use the half2 ex2 instruction. I am using an NVIDIA A10 GPU.

In the ptx manual, it says the ex2.approx.f16x2 can run ex2 on a half2 data type.

Using CUDA

I was thinking maybe the h2exp2() cuda function would call the ex2.approx.f16x2 instruction.

See the below code example:

#include <cuda_fp16.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void myKernel_half2(__half2* x){

    auto block = cg::this_thread_block();
    int32_t gid = block.group_index().x;
    int32_t tid = block.thread_index().x;
    int32_t idx = gid*256 + tid;

    half2 input = x[idx];
    half2 output;
 
    output = h2exp2(x[idx]);

    x[idx] = output;
}

I compile the above with -gencode=arch=compute_86,code=sm_86 and this is the SASS that I see:

 MOV R1, c[0x0][0x28] 
 S2R R2, SR_CTAID.X 
 MOV R5, 0x4 
 ULDC.64 UR4, c[0x0][0x118] 
 S2R R3, SR_TID.X 
 LEA R2, R2, R3, 0x8 
 IMAD.WIDE R2, R2, R5, c[0x0][0x160] 
 LDG.E R0, [R2.64] 
 HADD2.F32 R4, -RZ, R0.reuse.H0_H0 
 HADD2.F32 R0, -RZ, R0.H1_H1 
 MUFU.EX2 R4, R4 
 MUFU.EX2 R0, R0 
 FFMA R5, R4, 5.9604644775390625e-08, R4 
 FFMA R6, R0, 5.9604644775390625e-08, R0 
 F2FP.PACK_AB R5, R6, R5 
 STG.E [R2.64], R5 
 EXIT 

Unfortunately, this SASS is not calling ex2.approx.f16x2, and it is doing something more complicated. It converts the numbers to fp32, runs two calls to EX2 in fp32, and converts back to fp16.

Using PTX

I also tried it using inline PTX assembly.

#include <cuda_fp16.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void myKernel_half2(__half2* x){

    auto block = cg::this_thread_block();
    int32_t gid = block.group_index().x;
    int32_t tid = block.thread_index().x;
    int32_t idx = gid*256 + tid;

    half2 input = x[idx];
    half2 output;
 
    asm ("ex2.approx.f16x2 %0, %1;" : "=f"(output) : "f"(input));

    x[idx] = output;
}

However, this fails with a compile error:

<source>(19): error: an asm operand must have scalar type
   asm ("ex2.approx.f16x2 %0, %1;" : "=f"(output) : "f"(input));
                                     ^

<source>(19): error: an asm operand must have scalar type
   asm ("ex2.approx.f16x2 %0, %1;" : "=f"(output) : "f"(input));
                                                    ^

2 errors detected in the compilation of "<source>".
Compiler returned: 1

Is there a way that I can get my CUDA kernel to use the ex2.approx.f16x2 instruction?

ex2.approx.f16x2 is a PTX instruction for which there may well be no corresponding hardware instruction in the GPU target architecture you specified, in which case it is mapped to an emulation sequence comprising multiple SASS instructions.

Elsewhere the documentation indicates that .f16x2 is considered equivalent to .b32, so you might want to try the "r" binding of the inline assembly facility.

[Later:] The following builds fine for me:

#include "cuda_fp16.h"

__device__ half2 raw_ex2 (half2 arg)
{
    half2 ret;
    half hi, lo;
    unsigned short ilo, ihi;
    unsigned int in, out;

    lo = __low2half (arg);
    hi = __high2half (arg);
    in = ((((unsigned int) __half_as_ushort (hi)) << 16) |
          (((unsigned int) __half_as_ushort (lo)) <<  0));
    asm ("ex2.approx.f16x2 %0, %1;\n\t" : "=r"(out) : "r"(in));
    ilo = (unsigned short)(out);
    ihi = (unsigned short)(out >> 16);
    ret = __halves2half2 (__ushort_as_half (ilo), __ushort_as_half (ihi));
    return ret;
}

Compiled with nvcc -c -rdc=true -arch=sm_90 -o test_ex2_fp16.obj test_ex2_fp16.cu, this results in

        /*0000*/                   ULDC.64 UR4, c[0x0][0x208] ;     /* 0x0000820000047ab9 */
        /*0010*/                   LD.E R6, desc[UR4][R6.64] ;      /* 0x0000000406067980 */
        /*0020*/                   MUFU.EX2.F16 R0, R6 ;            /* 0x0000000600007308 */
        /*0030*/                   MUFU.EX2.F16 R3, R6.H1 ;         /* 0x1000000600037308 */
        /*0040*/                   PRMT R3, R0, 0x5410, R3 ;        /* 0x0000541000037816 */
        /*0050*/                   SHF.R.U32.HI R3, RZ, 0x10, R3 ;  /* 0x00000010ff037819 */
        /*0060*/                   PRMT R3, R0, 0x5410, R3 ;        /* 0x0000541000037816 */
        /*0070*/                   ST.E desc[UR4][R4.64], R3 ;      /* 0x0000000304007985 */
        /*0080*/                   RET.ABS.NODEC R20 0x0 ;          /* 0x0000000014007950 */

Not sure why multiple PRMT instructions are being generated when re-interpreting the output. My code may have a bug. I don’t have an sm_90 device available on which I can run and test this.

[Even later:]

Interestingly, the use of h2exp2() does not result in MUFU.EX2.F16 instructions being emitted, even with -use_fast_math. I tried, using CUDA 12.3:

__device__ half2 intrin_ex2 (half2 arg)
{
    return h2exp2 (arg);
}

Compiled with nvcc -c -rdc=true -use_fast_math -arch=sm_90 -o test_ex2_fp16.obj test_ex2_fp16.cu

        /*0000*/                   ULDC.64 UR4, c[0x0][0x208] ;               /* 0x0000820000047ab9 */
        /*0010*/                   LD.E R6, desc[UR4][R6.64] ;                /* 0x0000000406067980 */
        /*0020*/                   HADD2.F32 R0, -RZ, R6.H0_H0 ;              /* 0x20000006ff007230 */
        /*0030*/                   HADD2.F32 R3, -RZ, R6.H1_H1 ;              /* 0x30000006ff037230 */
        /*0040*/                   MUFU.EX2 R0, R0 ;                          /* 0x0000000000007308 */
        /*0050*/                   MUFU.EX2 R3, R3 ;                          /* 0x0000000300037308 */
        /*0060*/                   FFMA R8, R0, 5.9604644775390625e-08, R0 ;  /* 0x3380000000087823 */
        /*0070*/                   FFMA R9, R3, 5.9604644775390625e-08, R3 ;  /* 0x3380000003097823 */
        /*0080*/                   F2FP.F16.F32.PACK_AB R9, R9, R8 ;          /* 0x000000080909723e */
        /*0090*/                   ST.E desc[UR4][R4.64], R9 ;                /* 0x0000000904007985 */
        /*00a0*/                   RET.ABS.NODEC R20 0x0 ;                    /* 0x0000000014007950 */

Given that the “expensive” part of the respective SASS sequences here are the MUFU.EX2 instructions, there is likely not much difference in performance between using the built-in function versus home-brew code implemented via the SASS instruction.

I would not expect vectorized MUFU instructions to materialize anytime soon: Their implementation involves ROM-lookup, and the number of read ports on the ROM would have to be doubled, among other changes. I recall from my time working on AMD’s 3DNow! 2-way SIMD floating-point that we also punted on properly vectorizing the instructions PFRCP and PFRSQRT which were likewise based on ROM-lookup.

The dual PRMT instructions when using PXT inline assembly for ex2.approx.f16x2 appear to be the result of a missing peephole optimization in ptxas.

ex2.approx.f16x2 resolves into three instructions, two MUFU.EX2.F16 that return their 16-bit results in two separate registers, followed by a PRMT to pack this data into a single 32-bit register. In order to do type re-interpretation, programmers have to split the content of that register into two 16-bit halves, then combine them again, which generates a second PRMT. The compiler fails to detect that these data moves are redundant.

I tried replacing __halves2half2() with make_half2(), but the resulting SASS is identical. The following workaround using more inline PTX assembly seems to work:

/* extract least significant 16 bits of an unsigned int into an unsigned short*/
__forceinline__ __device__ unsigned short uint2loushort (unsigned int arg)
{
    unsigned short res;
    asm ("{\n\t"
         ".reg .b16 lo, hi;\n\t"
         "mov.b32 {lo, hi}, %1;\n\t"
         "mov.b16 %0, lo;\n\t"
         "}\n\t"
         : "=h"(res) : "r"(arg));
    return res;
}

/* extract most significant 16 bits of an unsigned int into an unsigned short */
__forceinline__ __device__ unsigned short uint2hiushort (unsigned int arg)
{
    unsigned short res;
    asm ("{\n\t"
         ".reg .b16 lo, hi;\n\t"
         "mov.b32 {lo, hi}, %1;\n\t"
         "mov.b16 %0, hi;\n\t"
         "}\n\t"
         : "=h"(res) : "r"(arg));
    return res;
}

__device__ half2 raw_ex2 (half2 arg)
{
    half2 res;
    half hi, lo;
    unsigned short ilo, ihi;
    unsigned int in, out;

    lo = __low2half (arg);
    hi = __high2half (arg);
    ilo = __half_as_ushort (lo);
    ihi = __half_as_ushort (hi);
    in = ((unsigned int)ihi << 16) | ((unsigned int)ilo);
    asm ("ex2.approx.f16x2 %0, %1;\n\t" : "=r"(out) : "r"(in));
    ilo = uint2loushort (out);
    ihi = uint2hiushort (out);
    lo = __ushort_as_half (ilo);
    hi = __ushort_as_half (ihi);
    res = __halves2half2 (lo, hi);
    return res;
}

With nvcc -c -rdc=true -arch=sm_90 -o test_ex2_fp16.obj test_ex2_fp16.cu`, this compiles to

        /*0000*/                   ULDC.64 UR4, c[0x0][0x208] ;  /* 0x0000820000047ab9 */
        /*0010*/                   LD.E R6, desc[UR4][R6.64] ;   /* 0x0000000406067980 */
        /*0020*/                   MUFU.EX2.F16 R0, R6 ;         /* 0x0000000600007308 */
        /*0030*/                   MUFU.EX2.F16 R3, R6.H1 ;      /* 0x1000000600037308 */
        /*0040*/                   PRMT R3, R0, 0x5410, R3 ;     /* 0x0000541000037816 */
        /*0050*/                   ST.E desc[UR4][R4.64], R3 ;   /* 0x0000000304007985 */
        /*0060*/                   RET.ABS.NODEC R20 0x0 ;       /* 0x0000000014007950 */