An accuracy-optimized performance-competitive implementation of `erfcf`

njuffa · August 2, 2022, 7:02am

A recent report that compares the accuracy of standard math functions across multiple platform shows that the single-precision variant of the complementary error function, erfcf(), has wider error bounds in the CUDA standard math library than most other math libraries. In particular, when I perform an exhaustive test of the implementation in CUDA 11, I find the maximum error to be 4.48672 ulps.

Below I am showing two implementation variants that reduce maximum error to either 3.64344 ulps (FASTER=1) or 2.63657 ulps (FASTER=0). On my Quadro RTX 4000, the variant FASTER=1 seems to match the performance of CUDA 11’s built-in exactly (to better than measurement noise level of 2%), while the performance with FASTER=0 is about 2.5% lower than in CUDA 11.

That latter difference is just barely outside noise level, and I would expect relative performance to fluctuate slightly based on GPU architecture and toolchain version. Therefore I consider both implementation variants to be performance competitive with CUDA 11.

/*
  Copyright (c) 2017-2022, Norbert Juffa
  All rights reserved.

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Compute exp(a) * 2**scale. Max ulp err = 0.86565 */
__forceinline__ __device__ float expf_scale (float a, int scale)
{
    const float flt_int_cvt = 12582912.0f; // 0x1.8p23 
    float f, r, j, t;
    int i;

    // exp(a) = 2**i * exp(f); i = rintf (a / log(2))
    j = fmaf (1.442695f, a, flt_int_cvt); // // 0x1.715476p0 // log2(e)
    t = j - flt_int_cvt; 
    f = fmaf (t, -6.93145752e-1f, a); // -0x1.62e400p-1  // log_2_hi 
    f = fmaf (t, -1.42860677e-6f, f); // -0x1.7f7d1cp-20 // log_2_lo 
    i = __float_as_int (j);
    // approximate r = exp(f) on interval [-log(2)/2, +log(2)/2]
    r =             1.37805939e-3f;  // 0x1.694000p-10
    r = fmaf (r, f, 8.37312452e-3f); // 0x1.125edcp-7
    r = fmaf (r, f, 4.16695364e-2f); // 0x1.555b5ap-5
    r = fmaf (r, f, 1.66664720e-1f); // 0x1.555450p-3
    r = fmaf (r, f, 4.99999851e-1f); // 0x1.fffff6p-2
    r = fmaf (r, f, 1.00000000e+0f); // 0x1.000000p+0
    r = fmaf (r, f, 1.00000000e+0f); // 0x1.000000p+0
    // exp(a) = 2**(i+scale) * r;
    r = __int_as_float (((i + scale) << 23) + __float_as_int (r));
    return r;
}

/*  
 * Based on: M. M. Shepherd and J. G. Laframboise, "Chebyshev Approximation of 
 * (1+2x)exp(x^2)erfc x in 0 <= x < INF", Mathematics of Computation, Vol. 36,
 * No. 153, January 1981, pp. 249-253.
 *
 * FASTER=0: maximum ulp error = 2.63657
 * FASTER=1: maximum ulp error = 3.64344
 */  
__inline__ __device__ float my_erfcf (float x)
{
    const float TWO_TO_M24 = 5.9604644775390625e-8f;
    float a, d, e, p, q, r, s, t;

    a = fabsf (x);

    /* Compute q = (a-2)/(a+2) accurately. [0, 10.0546875] -> [-1, 0.66818] */
    p = a + 2.0f;
    asm ("rcp.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(p));
#if FASTER
    q = fmaf (a, r, -2.0f * r);

    /* Approximate (1+2*a)*exp(a*a)*erfc(a) as p(q)+1 for q in [-1, 0.66818] */
    p =             -4.00900841e-4f;  // -0x1.a46000p-12
    p = fmaf (p, q, -1.23049226e-3f); // -0x1.4290f0p-10
    p = fmaf (p, q,  1.31353654e-3f); //  0x1.5855f2p-10
    p = fmaf (p, q,  8.63232370e-3f); //  0x1.1add2ep-7
    p = fmaf (p, q, -8.05992913e-3f); // -0x1.081b96p-7
    p = fmaf (p, q, -5.42046241e-2f); // -0x1.bc0b56p-5
    p = fmaf (p, q,  1.64055422e-1f); //  0x1.4ffc4ap-3
    p = fmaf (p, q, -1.66031465e-1f); // -0x1.54084ep-3
    p = fmaf (p, q, -9.27639827e-2f); // -0x1.7bf616p-4
    p = fmaf (p, q,  2.76978403e-1f); //  0x1.1ba03ap-2

#else // FASTER
    q = fmaf (-4.0f, r, 1.0f);
    t = fmaf (q + 1.0f, -2.0f, a); 
    e = fmaf (-a, q, t); 
    q = fmaf (r, e, q); 

    /* Approximate (1+2*a)*exp(a*a)*erfc(a) as p(q)+1 for q in [-1, 0.66818] */
    p =             -4.01020050e-4f;  // -0x1.a48000p-12
    p = fmaf (p, q, -1.23079563e-3f); // -0x1.42a54cp-10
    p = fmaf (p, q,  1.31352816e-3f); //  0x1.585562p-10
    p = fmaf (p, q,  8.63240846e-3f); //  0x1.1adde4p-7
    p = fmaf (p, q, -8.05994961e-3f); // -0x1.081bc2p-7
    p = fmaf (p, q, -5.42045869e-2f); // -0x1.bc0b42p-5
    p = fmaf (p, q,  1.64055437e-1f); //  0x1.4ffc4cp-3
    p = fmaf (p, q, -1.66031420e-1f); // -0x1.540848p-3 
    p = fmaf (p, q, -9.27639827e-2f); // -0x1.7bf616p-4
    p = fmaf (p, q,  2.76978403e-1f); //  0x1.1ba03ap-2
#endif // FASTER

    /* Divide (1+p) by (1+2*a) ==> exp(a*a)*erfc(a) */
    d = fmaf (2.0f, a, 1.0f);
    asm ("rcp.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(d));
    q = fmaf (p, r, r); // q = (p+1)/(1+2*a)
    e = fmaf (fmaf (q, -a, 0.5f), 2.0f, p - q); // residual: (p+1)-q*(1+2*a) 
    r = fmaf (e, r, q);

    /* Multiply by exp(-a*a) ==> erfc(a) */
    s = a * a; 
    e = expf_scale (-s, 24); /* scale by 2**24 to avoid intermediate underflow*/
    t = fmaf (a, -a, s);
    r = fmaf (r, e, r * e * t);
    r = r * TWO_TO_M24;

    /* Clamp result for large arguments */
    if (a > 10.0546875f) r = 0.0f;

    /* Handle negative arguments to erfc() */
    r = (x < 0.0f) ? (2.0f - r) : r;

    return r;
}

njuffa · August 2, 2022, 7:06am

Ooops. Looks like I navigated to the wrong sub-forum. I meant to post this in the “CUDA Programming and Performance” sub-forum. I will post over there as well, I don’t think it is too far off-topic here so will let this remain in place.

Topic		Replies	Views
An accuracy-optimized performance-competitive implementation of erfcf() CUDA Programming and Performance	0	749	August 2, 2022
Optimized version of single-precision error function, erff() CUDA Programming and Performance	21	4656	December 25, 2017
Accuracy-optimized performance-neutral implementation of erfcxf() CUDA Programming and Performance	2	471	September 4, 2022
Calling all Juffas! What's up with erfcf() nowadays? CUDA Programming and Performance	5	907	August 28, 2023
Larger than expected / documented error in erfcinvf() CUDA Programming and Performance	7	606	February 13, 2020
A very compact implementation of the inverse error function erfinvf() CUDA Programming and Performance	0	1142	June 4, 2018
Math function performance Legacy PGI Compilers	2	825	September 24, 2019
expf vs __expf ? CUDA Programming and Performance	3	4268	April 23, 2009
Accuracy-optimized implementation of erff(), without performance impact CUDA Programming and Performance	0	694	June 19, 2022
Error while using erf and erfc function Legacy PGI Compilers	2	4870	October 20, 2010

An accuracy-optimized performance-competitive implementation of `erfcf`

Related topics