More accurate version of exp2f() with no change in performance

njuffa · February 17, 2023, 12:30pm

Today I wrote some code in which I wanted to scale a float by an integer power of two. My assumption was that CUDA’s exp2f() would deliver a mathematically accurate result when passed an integer value. That turns out to be true except for exp2f(-127.0f).

The disassembly (I looked at sm_75, but it should be functionally equivalent for all supported architectures) shows that exp2f() maps to the MUFU.EX2 instruction, with a fixup if the result will be subnormal. This is needed because the special function units do not offer support for subnormal results. The implementation of exp2f() when compiled with -ftz=false in CUDA 11.8 is essentially:

__forceinline__ __device__ float raw_ex2 (float a)
{
    float r;
    asm ("ex2.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a));
    return r;
}

__device__ float cuda11_exp2f (float a)
{
    float t = a;
    if (a < -126.0f) t = t * 0.5f;
    float r = raw_ex2 (t);
    if (a < -126.0f) r= r * r;
    return r;
}

The maximum error observed is 2.38344 ulp. My proposed replacement uses a different fixup, resulting in a maximum error of 2.05498 ulp. Instead of halving the argument and squaring the exponentiation result for fixup, it adds 24 to the argument and multiplies the result by 2^-24. The number of instructions needed is identical, and the constants are incorporated into the immediate field of the instructions. So there is no performance impact.

/*
  Copyright (c) 2023, Norbert Juffa

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

#define EXP2F_TEST_THREADS (128)
#define EXP2F_TEST_DEFLEN  (1 << 24)
#define FTZ                (0)
#define USE_CUDA_BUILTIN   (0)

__forceinline__ __device__ float raw_ex2 (float a)
{
    float r;
    asm ("ex2.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a));
    return r;
}

__device__ float my_exp2f (float a)
{
    float t = a;
    if (a < -126.0f) t = t + 24.0f;
    float r = raw_ex2 (t);
    if (a < -126.0f) r= r * 0x1.0p-24f;
    return r;
}

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

float uintAsFloat (unsigned int x)
{
    float r;
    memcpy (&r, &x, sizeof r);
    return r;
}

unsigned int floatAsUInt (float x)
{
    unsigned int r;
    memcpy (&r, &x, sizeof r);
    return r;
}

unsigned long long int doubleAsULongInt (double x)
{
    unsigned long long int r;
    memcpy (&r, &x, sizeof r);
    return r;
}

double floatUlpErr (float res, double ref)
{
    unsigned long long int i;
    unsigned long long int j;
    unsigned long long int err;
    int expoRef;
    
    if (isnan(res) || isnan (ref) || isinf(res) || isinf (ref) ||
        (res == 0.0f) || (ref == 0.0f)) {
        return 0.0;
    }
    i = ((unsigned long long int)floatAsUInt(res)) << 32;
    expoRef = (int)(((doubleAsULongInt(ref) >> 52) & 0x7ff) - 1023);
    if (expoRef >= 129) {
        j = (doubleAsULongInt(ref) & 0x8000000000000000ULL) |
            0x7fffffffffffffffULL;
    } else if (expoRef < -126) {
        j = ((doubleAsULongInt(ref) << 11) | 0x8000000000000000ULL) >> 8;
        j = j >> (-(expoRef + 126));
        j = j | (doubleAsULongInt(ref) & 0x8000000000000000ULL);
    } else {
        j = ((doubleAsULongInt(ref) << 11) & 0x7fffffffffffffffULL) >> 8;
        j = j | ((unsigned long long int)(expoRef + 127) << 55);
        j = j | (doubleAsULongInt(ref) & 0x8000000000000000ULL);
    }
    err = (i < j) ? (j - i) : (i - j);
    return err / 4294967296.0;
}

__global__ void exp2f_test (const float * __restrict__ src, 
                            float * __restrict__ dst, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
#if USE_CUDA_BUILTIN
        dst[i] = exp2f (src[i]);
#else // USE_CUDA_BUILTIN
        dst[i] = my_exp2f (src[i]);
#endif
    }
}    

int main (void)
{
    float *a, *b;
    float *d_a, *d_b;
    float res, reff, errloc = 0;
    unsigned int argi, resi, refi;
    double darg, ref, ulp, maxulp = 0;
    int ulp1 = 0;

    /* Allocate memory on the host */
    a = (float *)malloc (sizeof(a[0]) * EXP2F_TEST_DEFLEN);
    b = (float *)malloc (sizeof(b[0]) * EXP2F_TEST_DEFLEN);

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0])*EXP2F_TEST_DEFLEN));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0])*EXP2F_TEST_DEFLEN));
    
    argi = 0x00000000;
    do {
        
        for (int i = 0; i < EXP2F_TEST_DEFLEN; i++) {
            a[i] = uintAsFloat (argi);
            argi++;
        }

        CUDA_SAFE_CALL (cudaMemcpy (d_a, a, sizeof(d_a[0])*EXP2F_TEST_DEFLEN, 
                                    cudaMemcpyHostToDevice));

        /* Compute execution configuration */
        dim3 dimBlock(EXP2F_TEST_THREADS);
        int threadBlocks = (EXP2F_TEST_DEFLEN + (dimBlock.x - 1)) / dimBlock.x;
        if (threadBlocks > 65520) threadBlocks = 65520;
        dim3 dimGrid(threadBlocks);

        exp2f_test<<<dimGrid,dimBlock>>>(d_a, d_b, EXP2F_TEST_DEFLEN);
        CHECK_LAUNCH_ERROR();
    
        CUDA_SAFE_CALL (cudaMemcpy (b, d_b, sizeof(b[0]) * EXP2F_TEST_DEFLEN, 
                                    cudaMemcpyDeviceToHost));

        for (int i = 0; i < EXP2F_TEST_DEFLEN; i++) {
            darg = (double)a[i];
#if FTZ == 1
            if ((fabsf(a[i]) > 0) && (fabsf(a[i]) < uintAsFloat (0x00800000))) {
                darg = 0.0 * darg;
            }
#endif
            ref = exp2 (darg);
#if FTZ == 1
            if (ref < uintAsFloat (0x00800000)) {
                ref = 0.0 * ref;
            }
#endif
            if (isnan (ref)) {
                reff = uintAsFloat (0x7fffffff); // canonical NaN
            } else {
                reff = (float)ref;
            }
            refi = floatAsUInt (reff);
            res = b[i];
            resi = floatAsUInt (res);
            if (resi != refi) ulp1++;
            ulp = floatUlpErr (res, ref);
            if (ulp > maxulp) {
                errloc = a[i];
                maxulp = ulp;
            }
            if (llabs ((long long)resi - (long long)refi) > 2) {
                printf ("!!!! error: a=%a (% 15.8e)  b=%a   reff=%a\n",
                        a[i], a[i], b[i], reff);
            }
        }
        printf ("^^^^ argi = %08x  maxulp=%.5f  ulp1=%d  errloc=%15.8e\n", 
                argi, maxulp, ulp1, errloc);
    } while (argi);

    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));

    free (a);
    free (b);

    return EXIT_SUCCESS;
}

Topic		Replies	Views
Accuracy-optimized performance-neutral implementation of erfcxf() CUDA Programming and Performance	2	416	September 4, 2022
Accuracy-optimized implementation of erff(), without performance impact CUDA Programming and Performance	0	668	June 19, 2022
A faster double-precision exp() CUDA Programming and Performance	8	3128	September 5, 2017
Optimized version of single-precision error function, erff() CUDA Programming and Performance	21	4466	December 25, 2017
Anticipating future standards: Implementation of exp2m1f() CUDA Programming and Performance	1	544	August 11, 2021
A more accurate, performance-competitive implementation of expf() CUDA Programming and Performance	24	8269	November 19, 2017
Question about cuda exp() function CUDA Programming and Performance cuda	9	1701	January 6, 2023
A faster and more accurate implementation of expm1f() CUDA Programming and Performance	1	1257	December 18, 2019
Accuracy-optimized implementation of expm1f() without performance penalty CUDA Programming and Performance	0	121	June 30, 2024
Faster and more accurate implementation of log1pf() CUDA Programming and Performance	15	3300	January 25, 2017

More accurate version of exp2f() with no change in performance

Related topics