An accuracy-optimized performance-competitive implementation of `erfcf`

A recent report that compares the accuracy of standard math functions across multiple platform shows that the single-precision variant of the complementary error function, erfcf(), has wider error bounds in the CUDA standard math library than most other math libraries. In particular, when I perform an exhaustive test of the implementation in CUDA 11, I find the maximum error to be 4.48672 ulps.

Below I am showing two implementation variants that reduce maximum error to either 3.64344 ulps (FASTER=1) or 2.63657 ulps (FASTER=0). On my Quadro RTX 4000, the variant FASTER=1 seems to match the performance of CUDA 11’s built-in exactly (to better than measurement noise level of 2%), while the performance with FASTER=0 is about 2.5% lower than in CUDA 11.

That latter difference is just barely outside noise level, and I would expect relative performance to fluctuate slightly based on GPU architecture and toolchain version. Therefore I consider both implementation variants to be performance competitive with CUDA 11.

/*
  Copyright (c) 2017-2022, Norbert Juffa
  All rights reserved.

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Compute exp(a) * 2**scale. Max ulp err = 0.86565 */
__forceinline__ __device__ float expf_scale (float a, int scale)
{
    const float flt_int_cvt = 12582912.0f; // 0x1.8p23 
    float f, r, j, t;
    int i;

    // exp(a) = 2**i * exp(f); i = rintf (a / log(2))
    j = fmaf (1.442695f, a, flt_int_cvt); // // 0x1.715476p0 // log2(e)
    t = j - flt_int_cvt; 
    f = fmaf (t, -6.93145752e-1f, a); // -0x1.62e400p-1  // log_2_hi 
    f = fmaf (t, -1.42860677e-6f, f); // -0x1.7f7d1cp-20 // log_2_lo 
    i = __float_as_int (j);
    // approximate r = exp(f) on interval [-log(2)/2, +log(2)/2]
    r =             1.37805939e-3f;  // 0x1.694000p-10
    r = fmaf (r, f, 8.37312452e-3f); // 0x1.125edcp-7
    r = fmaf (r, f, 4.16695364e-2f); // 0x1.555b5ap-5
    r = fmaf (r, f, 1.66664720e-1f); // 0x1.555450p-3
    r = fmaf (r, f, 4.99999851e-1f); // 0x1.fffff6p-2
    r = fmaf (r, f, 1.00000000e+0f); // 0x1.000000p+0
    r = fmaf (r, f, 1.00000000e+0f); // 0x1.000000p+0
    // exp(a) = 2**(i+scale) * r;
    r = __int_as_float (((i + scale) << 23) + __float_as_int (r));
    return r;
}

/*  
 * Based on: M. M. Shepherd and J. G. Laframboise, "Chebyshev Approximation of 
 * (1+2x)exp(x^2)erfc x in 0 <= x < INF", Mathematics of Computation, Vol. 36,
 * No. 153, January 1981, pp. 249-253.
 *
 * FASTER=0: maximum ulp error = 2.63657
 * FASTER=1: maximum ulp error = 3.64344
 */  
__inline__ __device__ float my_erfcf (float x)
{
    const float TWO_TO_M24 = 5.9604644775390625e-8f;
    float a, d, e, p, q, r, s, t;

    a = fabsf (x);

    /* Compute q = (a-2)/(a+2) accurately. [0, 10.0546875] -> [-1, 0.66818] */
    p = a + 2.0f;
    asm ("rcp.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(p));
#if FASTER
    q = fmaf (a, r, -2.0f * r);

    /* Approximate (1+2*a)*exp(a*a)*erfc(a) as p(q)+1 for q in [-1, 0.66818] */
    p =             -4.00900841e-4f;  // -0x1.a46000p-12
    p = fmaf (p, q, -1.23049226e-3f); // -0x1.4290f0p-10
    p = fmaf (p, q,  1.31353654e-3f); //  0x1.5855f2p-10
    p = fmaf (p, q,  8.63232370e-3f); //  0x1.1add2ep-7
    p = fmaf (p, q, -8.05992913e-3f); // -0x1.081b96p-7
    p = fmaf (p, q, -5.42046241e-2f); // -0x1.bc0b56p-5
    p = fmaf (p, q,  1.64055422e-1f); //  0x1.4ffc4ap-3
    p = fmaf (p, q, -1.66031465e-1f); // -0x1.54084ep-3
    p = fmaf (p, q, -9.27639827e-2f); // -0x1.7bf616p-4
    p = fmaf (p, q,  2.76978403e-1f); //  0x1.1ba03ap-2

#else // FASTER
    q = fmaf (-4.0f, r, 1.0f);
    t = fmaf (q + 1.0f, -2.0f, a); 
    e = fmaf (-a, q, t); 
    q = fmaf (r, e, q); 

    /* Approximate (1+2*a)*exp(a*a)*erfc(a) as p(q)+1 for q in [-1, 0.66818] */
    p =             -4.01020050e-4f;  // -0x1.a48000p-12
    p = fmaf (p, q, -1.23079563e-3f); // -0x1.42a54cp-10
    p = fmaf (p, q,  1.31352816e-3f); //  0x1.585562p-10
    p = fmaf (p, q,  8.63240846e-3f); //  0x1.1adde4p-7
    p = fmaf (p, q, -8.05994961e-3f); // -0x1.081bc2p-7
    p = fmaf (p, q, -5.42045869e-2f); // -0x1.bc0b42p-5
    p = fmaf (p, q,  1.64055437e-1f); //  0x1.4ffc4cp-3
    p = fmaf (p, q, -1.66031420e-1f); // -0x1.540848p-3 
    p = fmaf (p, q, -9.27639827e-2f); // -0x1.7bf616p-4
    p = fmaf (p, q,  2.76978403e-1f); //  0x1.1ba03ap-2
#endif // FASTER

    /* Divide (1+p) by (1+2*a) ==> exp(a*a)*erfc(a) */
    d = fmaf (2.0f, a, 1.0f);
    asm ("rcp.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(d));
    q = fmaf (p, r, r); // q = (p+1)/(1+2*a)
    e = fmaf (fmaf (q, -a, 0.5f), 2.0f, p - q); // residual: (p+1)-q*(1+2*a) 
    r = fmaf (e, r, q);

    /* Multiply by exp(-a*a) ==> erfc(a) */
    s = a * a; 
    e = expf_scale (-s, 24); /* scale by 2**24 to avoid intermediate underflow*/
    t = fmaf (a, -a, s);
    r = fmaf (r, e, r * e * t);
    r = r * TWO_TO_M24;

    /* Clamp result for large arguments */
    if (a > 10.0546875f) r = 0.0f;

    /* Handle negative arguments to erfc() */
    r = (x < 0.0f) ? (2.0f - r) : r;

    return r;
}
1 Like

Ooops. Looks like I navigated to the wrong sub-forum. I meant to post this in the “CUDA Programming and Performance” sub-forum. I will post over there as well, I don’t think it is too far off-topic here so will let this remain in place.

1 Like