Faster and more accurate implementation of logf()

njuffa · December 13, 2015, 11:07pm

I figured the approach I used to construct a well-tuned implementation of log1pf() might work well for plain old logf() as well, and indeed it does. On my Quadro K2200 (sm_50) I measure the following throughput with CUDA 7.5:

CUDA 7.5 logf() -ftz=false  14.8 billion function calls per second
CUDA 7.5 logf() -ftz=true   16.5 billion function calls per second
my_logf()                   27.4 billion function calls per second

The accuracy is also improved, with the maximum error compared to the mathematical result reduced from 1.00366 ulps to 0.85089 ulps, which means my_logf() is faithfully rounded.

[code below updated 7/3/2016, 7/18/2016, 1/21/2017, 12/18/2017, 4/23/2018, 4/28/2018, 5/2/2023, 2/12/2025]

/*
  Copyright (c) 2015-2025, Norbert Juffa
  All rights reserved.

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* compute natural logarithm. max ulp error = 0.85095 */
__device__ float my_logf (float a)
{
    float i, m, r, s, t;
    int e;

    i = 0.0f;
    if (a < 1.175494351e-38f){ // 0x1.0p-126
        a = a * 8388608.0f; // 0x1.0p+23
        i = -23.0f;
    }
    e = (__float_as_int (a) - __float_as_int (0.666666667f)) & 0xff800000;
    m = __int_as_float (__float_as_int (a) - e);
    i = fmaf ((float)e, 1.19209290e-7f, i); // 0x1.0p-23
    /* m in [2/3, 4/3] */
    m = m - 1.0f;
    s = m * m;
    /* Compute log1p(m) for m in [-1/3, 1/3] */
    r =             -0.130310059f;  // -0x1.0ae000p-3
    t =              0.140869141f;  //  0x1.208000p-3
    r = fmaf (r, s, -0.121486895f); // -0x1.f19c3ep-4
    t = fmaf (t, s,  0.139815226f); //  0x1.1e5772p-3
    r = fmaf (r, s, -0.166845486f); // -0x1.55b316p-3
    t = fmaf (t, s,  0.200120315f); //  0x1.99d8aep-3
    r = fmaf (r, s, -0.249996230f); // -0x1.fffe06p-3
    r = fmaf (t, m, r);
    r = fmaf (r, m,  0.333331972f); //  0x1.5554fap-2
    r = fmaf (r, m, -0.500000000f); // -0x1.000000p-1  
    r = fmaf (r, s, m);
    r = fmaf (i,  0.693147182f, r); //  0x1.62e430p-1 // log(2)
    if (!((a > 0.0f) && (a < __int_as_float (0x7f800000)))) { // +INF
        asm ("lg2.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a)); // handle NaN, INF
    }
    return r;
}

CudaaduC · December 14, 2015, 12:43am

I figured the approach I used to construct a well-tuned implementation of log1pg() might work well for plain old logf() as well, and indeed it does. On my Quadro K2200 (sm_50) I measure the following throughput with CUDA 7.5:

CUDA 7.5 logf() -ftz=false  14.8 billion function calls per second
CUDA 7.5 logf() -ftz=true   16.5 billion function calls per second
my_logf()                   19.5 billion function calls per second

The accuracy is also improved, with the maximum error compared to the mathematical result reduced from 1.00366 ulps to 0.87972 ulps, which means my_logf() is faithfully rounded.

/*
  Copyright (c) 2015, Norbert Juffa
  All rights reserved.

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* compute natural logarithm. max ulp = 0.87972 */
__device__ float my_logf (float a)
{
    float m, r, i;
    int32_t e;

    i = 0.0f;
    if (a < 1.175494351e-38f){ // 0x1.0p-126
        a = a * 8388608.0f; // 0x1.0p+23
        i = -23.0f;
    }
    e = (__float_as_int (a) - 0x3f400000) & 0xff800000;
    m = __int_as_float (__float_as_int (a) - e);
    i = fmaf ((float)e, 1.19209290e-7f, i); // 0x1.0p-23
    /* m in [0.75, 1.5] */
    m = m - 1.0f;
    /* compute log(1+m), m in [-0.25, 0.5] */ 
    r =             -4.53445576e-2f;  // -0x1.73766ep-5
    r = fmaf (r, m,  1.05476119e-1f); //  0x1.b007bap-4
    r = fmaf (r, m, -1.32298961e-1f); // -0x1.0ef2c2p-3
    r = fmaf (r, m,  1.44913569e-1f); //  0x1.28c872p-3
    r = fmaf (r, m, -1.66415498e-1f); // -0x1.54d1a6p-3
    r = fmaf (r, m,  1.99888408e-1f); //  0x1.995f18p-3
    r = fmaf (r, m, -2.50001967e-1f); // -0x1.000084p-2
    r = fmaf (r, m,  3.33335131e-1f); //  0x1.5555cep-2
    r = fmaf (r, m, -5.00000000e-1f); // -0x1.000000p-1
    r = r * m;
    r = fmaf (r, m, m);
    r = fmaf (i, 6.93147182e-01f, r); //  0x1.62e430p-1 // log(2)
    if (!((a > 0.0f) && (a < __int_as_float (0x7f800000)))) { // +INF
        r = a + a;  // handle inputs of NaN, +Inf
        if (a  < 0.0f) r = __int_as_float (0x7fffffff); //  NaN
        if (a == 0.0f) r = __int_as_float (0xff800000); // -Inf
    }
    return r;
}

While I do not call logf() too often in my simulations ( 10-30 million times per simulation batch) I substituted your implementation in my code and ran a few tests.

NOTE: I changed ‘int32_t’ ‘int’ and used the forceinline qualifier.

Overall the times seemed a bit faster but not enough to make any conclusion.

Did notice a difference in running time based on the build customization using CUDA 6.5 vs CUDA 7.5. CUDA 7.5 seems to be a bit faster but that is probably related to the other more frequent computation.

Have you tested the difference between these two? Should there be a difference?

In general I have noticed that the compile output for CUDA 7.5 indicates more registers used per kernel. For floating point computation heavy applications CUDA 7.5 seems to perform better but for integer computation (like the code for array permutation generation and evaluation) CUDA 6.5 has a significant performance edge.

The results are slightly different (in terms of statistics related to photons detected and fluoreseced) but by a very very small amount.

Thanks for sharing!

njuffa · December 14, 2015, 1:17am

With such a small percentage of your math function calls being logf(), I would not expect a performance differences beyond noise level (2%). In general, the single-precision math functions in CUDA are probably ripe for an overhaul, now that FMA-less platform no longer have to be supported. In addition, Intel has added a bunch of useful instructions to AVX-512 ([url]http://arith22.gforge.inria.fr/slides/s1-cornea.pdf[/url]), for example VSCALEF and VGETMANT, that I expect will significantly enhance the competitiveness of their math library.

I do have CUDA 6.5 installed in addition to CUDA 7.5 as a fall-back position, but my experience so far with CUDA 7.5 has been positive, and I have not had a need to go back. In light of that, I have not done any side-by-side comparisons.

I have seen multiple anecdotal reports that register pressure is higher with CUDA 7.5, but cannot say I have hit a situation where there was a performance regression that could be traced to increased register use. Your mileage may vary. I have also read that CUDA 7.5 still has a compiler bug that keeps the mfaktc folks (primer number search) stuck on CUDA 6.5. I do not know what that bug is about, and I have not run into any CUDA 7.5 compiler bugs myself so far.

There are many compiler optimizations that are desirable in general but result in increased register pressure. The trade-offs are complex and cannot be handled optimally by heuristics for every single piece of code. If the code is performance critical at application level, and the performance regression is > 5%, I would suggest filing a bug.

njuffa · December 14, 2015, 1:22am

With such a small percentage of your math function calls being logf(), I would not expect a performance difference beyond noise level (2%). In general, the single-precision math functions in CUDA are probably ripe for an overhaul, now that FMA-less platform no longer have to be supported. In addition, Intel has added a bunch of useful instructions to AVX-512 (http://arith22.gforge.inria.fr/slides/s1-cornea.pdf), e.g. VSCALEF and VMANTF, that I expect will significantly enhance the competitiveness of their math library.

I do have CUDA 6.5 installed in addition to CUDA 7.5 as a fall-back position, but my experience so far with CUDA 7.5 has been positive, and I have not had a need to go back. In light of that, I have not done any side-by-side comparisons.

I have seen multiple anecdotal reports that register pressure is higher with CUDA 7.5, but cannot say I have hit a situation where there was a performance regression that could be traced to increased register use. Your mileage may vary. I have also read that CUDA 7.5 still has a compiler bug that keeps the mfaktc folks (primer number search) stuck on CUDA 6.5. I do not know what that bug is about, and I have not run into any CUDA 7.5 compiler bugs myself so far.

There are many compiler optimizations that are desirable in general but result in increased register pressure. The trade-offs are complex and cannot be handled optimally by heuristics for every single piece of code. If the code is performance critical at application level, and the performance regression is > 5%, I would suggest filing a bug.

njuffa · December 14, 2015, 5:07pm

I have updated the code for my_logf() in my original post. It now uses a core approximation on a symmetric interval. This reduces the maximum error slightly (but less than I had hoped), and also increases the percentage of correctly rounded results slightly.

njuffa · July 4, 2016, 5:14am

Minor accuracy and performance improvements (increased ILP) to posted code.

njuffa · July 19, 2016, 4:01am

I boosted performance another 5% by reducing the special case handling to a single predicated MUFU instruction. The accuracy is also improved a tiny little bit, and I tried to increase energy efficiency by eliminating one constant memory read (least significant coefficient was reduced to half precision and consequently incorporated into the instruction itself).

njuffa · January 21, 2017, 5:59pm

Boosted performance another 3% by changing the core approximation from a partial Estrin scheme to a partial second-order Horner scheme. This also facilitated reducing a second coefficient to half precision, which therefore does not need to be stored in constant memory and instead is incorporated as an immediate constant into an FFMA. Minor reduction in the maximum error, down to 0.85304 ulps.

njuffa · January 25, 2017, 10:23pm

Tested against CUDA 8.0, and the latest version of my_logf() barely edges out the built-in logf() on my Quadro K2200:

CUDA 8.0 logf: 26.5e9 function calls / second; 0.86416 ulps
latest my_logf: 27.1e9 function calls / second; 0.85304 ulps

njuffa · December 18, 2017, 7:51pm

The maximum error of my_logf() in #1 has been reduced to 0.85095 ulps by improving the core approximation.

njuffa · August 7, 2024, 2:36pm

Those who need something faster than my_logf(), but more accurate than CUDA’s device function intrinsic __logf() may want to try the variant below in which special-case support and support for subnormal arguments have been eliminated, and accuracy has been reduced. It compiles to 15 instructions for an sm_89 target.

/*
  Copyright 2024, Norbert Juffa

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* natural log on [0x1.f7a5ecp-127, 0x1.fffffep127]. Maximum relative error 9.4529e-5 */
__device__ float my_faster_logf (float a)
{
    float m, r, s, t, i, f;
    int32_t e;

    e = (__float_as_int (a) - 0x3f2aaaab) & 0xff800000;
    m = __int_as_float (__float_as_int (a) - e);
    i = (float)e * 1.19209290e-7f; // 0x1.0p-23
    /* m in [2/3, 4/3] */
    f = m - 1.0f;
    s = f * f;
    /* Compute log1p(f) for f in [-1/3, 1/3] */
    r = fmaf (0.230836749f, f, -0.279208571f); // 0x1.d8c0f0p-3, -0x1.1de8dap-2
    t = fmaf (0.331826031f, f, -0.498910338f); // 0x1.53ca34p-2, -0x1.fee25ap-2
    r = fmaf (r, s, t);
    r = fmaf (r, s, f);
    r = fmaf (i, 0.693147182f, r); // 0x1.62e430p-1 // log(2) 
    return r;
}

Topic		Replies	Views
Faster and more accurate implementation of log1pf() CUDA Programming and Performance	15	3297	January 25, 2017
Faster and more accurate implementation of log2f() CUDA Programming and Performance	5	1299	March 7, 2019
Optimized version of single-precision error function, erff() CUDA Programming and Performance	21	4461	December 25, 2017
A more accurate and faster implementation of powf() CUDA Programming and Performance	6	4347	February 13, 2025
A more accurate, performance-competitive implementation of expf() CUDA Programming and Performance	24	8254	November 19, 2017
A more accurate, performance-competitive implementation of lgammaf() CUDA Programming and Performance	0	758	April 23, 2022
On the utility of SFU instructions for half-precision math functions CUDA Programming and Performance	8	2436	September 16, 2019
A faithfully-rounded performance-competitive implementation of log10f CUDA Programming and Performance	0	534	April 20, 2019
Calling all Juffas! What's up with erfcf() nowadays? CUDA Programming and Performance	5	781	August 28, 2023
A faster double-precision exp() CUDA Programming and Performance	8	3127	September 5, 2017

Faster and more accurate implementation of logf()

Related topics