We’ve encountered a possible miscompilation with the following program:
nvhpc_miscompile.cpp
#include <stdio.h>
#include <math.h>
// Computes zs[k] = cosf(xs[k]) * cosf(ys[k]) for the X consecutive indices
// k = G * X + i with i in [0, X), i.e. the G-th chunk of width X.
// xs and ys are only read; zs is written. No bounds checks are performed, so
// the caller must pass arrays that cover index (G + 1) * X - 1.
void f(int X, int G, const float *xs, const float *ys, float *zs) {
for (int i = 0; i < X; i++) {
int ix = G * X + i; // flat index of element i inside chunk G
// problematic expression:
zs[ix] = cosf(xs[ix]) * cosf(ys[ix]);
// also reproducible with:
// zs[ix] = sinf(xs[ix]) * sinf(ys[ix]);
// zs[ix] = cosf(xs[ix]) * sinf(ys[ix]);
// zs[ix] = sinf(xs[ix]) * cosf(ys[ix]);
}
}
#define N (16) // anything less than this and the problem goes away
// Reproducer driver: computes the same element-wise product over indices
// 0..N-1 twice via f, first in N chunks of width 1 (zs1) and then in N/2
// chunks of width 2 (zs2), and prints both values plus their absolute
// difference for every index. Both passes cover exactly the same indices,
// so every printed difference is expected to be 0.
int main(int argc, char *argv[]) {
float xs[N] = {0.f, 1.f, 2.f, 3.f, 0.f, 1.f, 2.f, 3.f, 0.f, 1.f, 2.f, 3.f, 0.f, 1.f, 2.f, 3.f}; // arbitrary fixed values (0..3 repeated), not actually random
float ys[N] = {0.f, 1.f, 2.f, 3.f, 0.f, 1.f, 2.f, 3.f, 0.f, 1.f, 2.f, 3.f, 0.f, 1.f, 2.f, 3.f}; // arbitrary fixed values (0..3 repeated), not actually random
float zs1[N] = {}; // zero-initialised result of the width-1 pass
for (int G = 0; G < (N / 1); G++)
f(1, G, xs, ys, zs1); // X = 1: one element per call
float zs2[N] = {}; // zero-initialised result of the width-2 pass
for (int G = 0; G < (N / 2); G++)
f(2, G, xs, ys, zs2); // X = 2: two elements per call -- wrong answer!
for (int i = 0; i < N; i++) // check for errors
printf("[%d] |%f - %f| = %f\n", i, zs1[i], zs2[i], fabs(zs1[i] - zs2[i]));
return 0;
}
> /opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/bin/nvc++ nvhpc_miscompile.cpp -O2 -g
> ./a.out
[0] |1.000000 - 1.000000| = 0.000000
[1] |0.291927 - 0.540302| = 0.248376
[2] |0.173178 - 0.173178| = 0.000000
[3] |0.980085 - 0.411982| = 0.568103
[4] |1.000000 - 1.000000| = 0.000000
[5] |0.291927 - 0.540302| = 0.248376
[6] |0.173178 - 0.173178| = 0.000000
[7] |0.980085 - 0.411982| = 0.568103
[8] |1.000000 - 1.000000| = 0.000000
[9] |0.291927 - 0.540302| = 0.248376
[10] |0.173178 - 0.173178| = 0.000000
[11] |0.980085 - 0.411982| = 0.568103
[12] |1.000000 - 1.000000| = 0.000000
[13] |0.291927 - 0.540302| = 0.248376
[14] |0.173178 - 0.173178| = 0.000000
[15] |0.980085 - 0.411982| = 0.568103
Absolute difference should be 0 in all cases.
The bug is reproducible with `-O2` or `-fast`, but not with `-O0`.
GCC and Clang report the correct result at any optimisation level, even `-Ofast`:
[0] |1.000000 - 1.000000| = 0.000000
[1] |0.291927 - 0.291927| = 0.000000
[2] |0.173178 - 0.173178| = 0.000000
[3] |0.980085 - 0.980085| = 0.000000
[4] |1.000000 - 1.000000| = 0.000000
[5] |0.291927 - 0.291927| = 0.000000
[6] |0.173178 - 0.173178| = 0.000000
[7] |0.980085 - 0.980085| = 0.000000
[8] |1.000000 - 1.000000| = 0.000000
[9] |0.291927 - 0.291927| = 0.000000
[10] |0.173178 - 0.173178| = 0.000000
[11] |0.980085 - 0.980085| = 0.000000
[12] |1.000000 - 1.000000| = 0.000000
[13] |0.291927 - 0.291927| = 0.000000
[14] |0.173178 - 0.173178| = 0.000000
[15] |0.980085 - 0.980085| = 0.000000
We suspect that the vectorised intrinsics (cos/sin) for 2-wide vectors are incorrect.
Looking at the disassembly, the `f` function got inlined into `main` as expected, and the call site of the problematic expression looks identical to that of a working combination (`tanf(xs[ix]) * cosf(ys[ix])` in this case):
; incorrect result:
; zs[ix] = cosf(xs[ix]) * cosf(ys[ix]);
4011b0: 48 63 db movslq %ebx, %rbx
4011b3: c4 c1 7a 10 04 9f vmovss (%r15,%rbx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
4011b9: e8 a2 fe ff ff callq 0x401060 <__fs_cos_1@plt>
4011be: c5 fa 11 45 d4 vmovss %xmm0, -44(%rbp)
4011c3: c4 c1 7a 10 04 9c vmovss (%r12,%rbx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
4011c9: e8 92 fe ff ff callq 0x401060 <__fs_cos_1@plt>
4011ce: c5 fa 59 45 d4 vmulss -44(%rbp), %xmm0, %xmm0
4011d3: c4 c1 7a 11 04 9e vmovss %xmm0, (%r14,%rbx,4)
; correct result:
; zs[ix] = tanf(xs[ix]) * cosf(ys[ix]);
4011f0: 48 63 db movslq %ebx, %rbx
4011f3: c4 c1 7a 10 04 9f vmovss (%r15,%rbx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
4011f9: e8 72 fe ff ff callq 0x401070 <__fs_cos_1@plt>
4011fe: c5 fa 11 45 d4 vmovss %xmm0, -44(%rbp)
401203: c4 c1 7a 10 04 9c vmovss (%r12,%rbx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
401209: e8 42 fe ff ff callq 0x401050 <__fs_tan_1@plt>
40120e: c5 fa 59 45 d4 vmulss -44(%rbp), %xmm0, %xmm0
401213: c4 c1 7a 11 04 9e vmovss %xmm0, (%r14,%rbx,4)
The issue is reproducible with `-tp=znver3`, `znver2`, `neoverse-v1`, `neoverse-n1`, and `skylake-avx512`, using NVHPC 22.3, 22.5, and 22.7.
For reference, this reproducer was reduced from the BUDE kernel: miniBUDE/fasten.hpp at v2 · UoB-HPC/miniBUDE · GitHub
Cheers.