Accuracy-optimized implementation of tanf(), without performance impact

njuffa · July 4, 2022, 9:56pm

A recent report evaluating and comparing the accuracy of math functions of various commonly-used libraries: Vincenzo Innocente and Paul Zimmermann, “Accuracy of Mathematical Functions in Single, Double, Extended Double and Quadruple Precision”, (HAL Preprint hal-03141101v2), 2022, highlights the fact that many of the math functions in CUDA are worthy of improvement in terms of accuracy. Looking at the details of some functions, it seems that this is possible without a negative impact on performance.

A case in point is the function tanf(), where I was able to reduce the maximum error of 3.096 ulps in CUDA 11 to 1.955 ulps, with no negative performance impact observed on my Quadro RTX 4000 (compute capability 7.5).

/*
  Copyright (c) 2022, Norbert Juffa
  All rights reserved.

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Approximate tangent on [-PI/4,+PI/4]. Maximum ulp error = 0.99698 */
__forceinline__ __device__ float tanf_poly (float s)
{
    float r;

    r =             4.38117981e-3f;  // 0x1.1f2000p-8
    r = fmaf (r, s, 8.94600598e-5f); // 0x1.773902p-14
    r = fmaf (r, s, 1.08341556e-2f); // 0x1.63037cp-7
    r = fmaf (r, s, 2.12811474e-2f); // 0x1.5cab9ap-6
    r = fmaf (r, s, 5.40602170e-2f); // 0x1.badc7ep-5
    r = fmaf (r, s, 1.33326918e-1f); // 0x1.110db4p-3
    r = fmaf (r, s, 3.33333433e-1f); // 0x1.110db4p-3
    r = r * s;
    return r;
}

/* GPU's built-in reciprocal approximation, MUFU.RCP */
__forceinline__ __device__ float rcp_gpu_raw (float a)
{
    float r;

    asm ("rcp.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a)); // r = 1.0f / a
    return r;
}

/* Map tangent value based on quadrant */
__forceinline__ __device__ float tanf_core (float a, int i)
{
    float r, s, t;
    
    s = a * a;
    t = tanf_poly (s);
    r = fmaf (t, a, a);
    if (i & 1) {
        /* -1/r = -1/(r_hi+r_lo) ~= -1/r_hi + r_lo/(r_hi*r_hi) */
        s = a - r;        
        s = fmaf (t, a, s);    // r_lo
        t = rcp_gpu_raw (-r);  // -1/r_hi
        r = fmaf (r, t, 1.0f); // -1/r_hi = t+u
        r = fmaf (s, t, r);
        r = fmaf (r, t, t);
    }
    return r;
}

/* 190 bits of 2/PI for Payne-Hanek style argument reduction. */
__constant__ unsigned int two_over_pi_f [] = 
{
    0x28be60db,
    0x9391054a,
    0x7f09d5f4,
    0x7d4d3770,
    0x36d8a566,
    0x4f10e410
};

__forceinline__ __device__ float trig_red_slowpath_f (float a, int *quadrant)
{
    unsigned long long int p;
    unsigned int ia, hi, mid, lo, tmp, i;
    int e, q;
    float r;

    ia = ((((unsigned int)__float_as_int (a)) & 0x007fffff) << 8) | 0x80000000;
    e = ((((unsigned int)__float_as_int (a)) >> 23) & 0xff) - 126;
    
    /* compute product x * 2/pi in 2.62 fixed-point format */
    i = (unsigned int)e >> 5;
    e = (unsigned int)e & 31;

    hi  = i ? two_over_pi_f [i-1] : 0;
    mid = two_over_pi_f [i+0];
    lo  = two_over_pi_f [i+1];
    tmp = two_over_pi_f [i+2];
 
    if (e) {
        hi  = (hi  << e) | (mid >> (32 - e));
        mid = (mid << e) | (lo  >> (32 - e));
        lo  = (lo  << e) | (tmp >> (32 - e));
    }

    p = (unsigned long long int)ia * lo;
    p = (unsigned long long int)ia * mid + (p >> 32);
    p = ((unsigned long long int)(ia * hi) << 32) + p;

    q = (int)(p >> 62);                // integral portion = quadrant<1:0>
    p = p & 0x3fffffffffffffffULL;     // fraction
    if (p & 0x2000000000000000ULL) {   // fraction >= 0.5
        p = p - 0x4000000000000000ULL; // fraction - 1.0
        q = q + 1;
    }

    /* compute remainder of x / (pi/2) */
    double d = (double)(long long int)p;
    d = d * 3.4061215800865545e-19; // 0x1.921fb54442d18p-62 // pi/2 * 0x1.p-62 
    r = (float)d;
    if (a < 0.0f) {
        r = -r;
        q = -q;
    }
    *quadrant = q;
    return r;
}

/* Argument reduction for trigonometric functions that reduces the argument
   to the interval [-PI/4, +PI/4] and also returns the quadrant. It returns 
   -0.0f for an input of -0.0f 
*/
__forceinline__ __device__ float trig_red_f (float a, float switch_over, int *q)
{    
    float j, r;

    if (fabsf (a) > switch_over) {
        /* Payne-Hanek style reduction. M. Payne and R. Hanek, Radian reduction
           for trigonometric functions. SIGNUM Newsletter, 18:19-24, 1983
        */
        r = trig_red_slowpath_f (a, q);
    } else {
        /* FMA-enhanced Cody-Waite style reduction. W. J. Cody and W. Waite, 
           "Software Manual for the Elementary Functions", Prentice-Hall 1980
        */
        j = fmaf (a, 0.636619747f, 1.2582912e+7f);//0x1.45f306p-1,0x1.800000p+23
        *q = __float_as_int (j);
        j = j - 1.25829120e+7f; // 0x1.800000p+23

        r = fmaf (j, -1.57079601e+00f, a); // -0x1.921fb0p+00 // pio2_high
        r = fmaf (j, -3.13916473e-07f, r); // -0x1.5110b4p-22 // pio2_mid
        r = fmaf (j, -5.39030253e-15f, r); // -0x1.846988p-48 // pio2_low
    }
    return r;
}

/* maximum ulp error = 1.95549  ulp1 = 669180566 */
__inline__ __device__ float my_tanf (float a)
{
    float r;
    int i;

    a = fmaf (a, 0.0f, a); // inf -> NaN
    r = trig_red_f (a, 252.898206f, &i);
    r = tanf_core (r, i);
    return r;
}

njuffa · July 5, 2022, 6:20am

In case anyone is wondering: Sine and cosine implementations using the same framework as my_tanf) above are only minimally more accurate and minimally faster than the built-in functions in CUDA 11.

/*
  Copyright (c) 2022, Norbert Juffa
  All rights reserved.

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Approximate sine on [-PI/4,+PI/4]. Maximum ulp error = 0.64196
   Returns -0.0f for an argument of -0.0f
   Polynomial approximation based on T. Myklebust, "Computing accurate 
   Horner form approximations to special functions in finite precision
   arithmetic", http://arxiv.org/abs/1508.03211, retrieved on 8/29/2016
*/
__forceinline__ __device__ float sinf_poly (float a, float s)
{
    float r, t;
    r =              2.86567956e-6f;  //  0x1.80a000p-19
    r = fmaf (r, s, -1.98559923e-4f); // -0x1.a0690cp-13
    r = fmaf (r, s,  8.33338592e-3f); //  0x1.111182p-07
    r = fmaf (r, s, -1.66666672e-1f); // -0x1.555556p-03
    t = fmaf (a, s, 0.0f); // ensure -0 is passed through
    r = fmaf (r, t, a);
    return r;
}

/* Approximate cosine on [-PI/4,+PI/4]. Maximum ulp error = 0.87444 */
__forceinline__ __device__ float cosf_poly (float s)
{
    float r;
    r =              2.44677067e-5f;  //  0x1.9a8000p-16
    r = fmaf (r, s, -1.38877297e-3f); // -0x1.6c0efap-10
    r = fmaf (r, s,  4.16666567e-2f); //  0x1.555550p-05
    r = fmaf (r, s, -5.00000000e-1f); // -0x1.000000p-01
    r = fmaf (r, s,  1.00000000e+0f); //  0x1.000000p+00
    return r;
}

/* Map sine or cosine value based on quadrant */
__forceinline__ __device__ float sinf_cosf_core (float a, int i)
{
    float r, s;

    s = a * a;
    r = (i & 1) ? cosf_poly (s) : sinf_poly (a, s);
    if (i & 2) {
        r = 0.0f - r; // don't change "sign" of NaNs
    }
    return r;
}

/* maximum ulp error = 1.492409  ulp1 = 337613180 */
__inline__ __device__ float my_sinf (float a)
{
    float r;
    int i;

    a = fmaf (a, 0.0f, a); // inf -> NaN
    r = trig_red_f (a, 71476.0625f, &i);
    r = sinf_cosf_core (r, i);
    return r;
}  

/* maximum ulp error = 1.495098  ulp1 = 342020912 */
__inline__ __device__ float my_cosf (float a)
{
    float r;
    int i;

    a = fmaf (a, 0.0f, a); // inf -> NaN
    r = trig_red_f (a, 71476.0625f, &i);
    r = sinf_cosf_core (r, i + 1);
    return r;
}

Topic		Replies	Views
An implementation of single-precision tanpi() for CUDA CUDA Programming and Performance	1	783	July 26, 2017
More accurate and somewhat faster implementation of atanf() CUDA Programming and Performance	0	873	May 15, 2022
A faster and more accurate implementation of sincosf() CUDA Programming and Performance	25	9259	August 6, 2017
Hardware-accelerated tanh() on Turing CUDA Programming and Performance	1	1766	April 7, 2021
Implementation of atan2f() with improved accuracy (no negative impact on performance) CUDA Programming and Performance	0	852	February 27, 2023
sincospif() implementation with improved performance and accuracy CUDA Programming and Performance	8	3351	August 16, 2016
Gamma function implementation with significantly improved accuracy CUDA Programming and Performance	0	759	March 21, 2022
Weekend project: Fast arctangent computation with Borchardt's algorithm CUDA Programming and Performance	3	258	February 23, 2025
Optimized version of single-precision error function, erff() CUDA Programming and Performance	21	4520	December 25, 2017
On the utility of SFU instructions for half-precision math functions CUDA Programming and Performance	8	2478	September 16, 2019

Accuracy-optimized implementation of tanf(), without performance impact

Related topics