I’m always hesitant to report bugs on compilers (because it’s usually user error), but this seems to be real.
I want to square int z
, 32 bits, and multiply by a 64 bit value, like so:
int z = 87142; // z*z<0
return (z*z)*3LL;
This works fine when z*z>=0
, but if z*z<0
the result is incorrect because the optimizer is using mul.wide.u32
, the unsigned version. In release mode (/O2) the PTX looks like this:
mul.lo.s32 %r2, %r1, %r1;
mul.wide.u32 %rd1, %r2, 3;
In debug mode, the result is correct but unoptimized:
mul.lo.s32 %r3, %r1, %r1;
cvt.s64.s32 %rd1, %r3;
mul.lo.s64 %rd2, %rd1, 3;
Based on some tests, it appears the optimizer thinks z*z>0
always. For example, it works correctly for z*z*z*3LL
where the compiler can’t make that assumption:
mul.lo.s32 %r2, %r1, %r1;
mul.lo.s32 %r3, %r2, %r1;
mul.wide.s32 %rd1, %r3, 3; <- correct
The workaround I found is forcing a pointless conversion like so: ((int)((__int64)z * z)) * 3LL
. It didn’t add any overhead, but apparently tricks the optimizer to assume the value is signed, and thus use mul.wide.s32
.
mul.lo.s32 %r2, %r1, %r1;
mul.wide.s32 %rd1, %r2, 3;
Example program, runs the same function on the host and device. The host is assumed to be correct. Compiled with visual studio 2019 and cuda_11.0.2_451.48_win10.exe which appears to be current.
#include "stdio.h"
__device__ __host__ __int64 CudaErrorTestInternal(int z) {
printf("z=%d z*z=%d (z*z)*3=%d\n", z, z * z, (z * z) * 3); // multiply by a 32 bit value (works fine)
printf("z=%d z*z=%d (z*z)*3LL=%lld\n", z, z * z, (z * z) * 3LL); // multiply by a 64 bit value that needs 32 bits (fails)
printf("z=%d z*z=%d (z*z)*30000000000LL=%lld\n", z, z * z, (z * z) * 30000000000LL); // multiply by a 64 bit value that needs 64 bits (fails)
printf("z=%d z*z=%d ((int)((__int64)z * z)) * 3LL=%lld\n", z, z * z, ((int)((__int64)z * z)) * 3LL); // a workaround that works correctly without overhead
printf("z=%d z*z*z=%d (z * z * z) * 3LL=%lld\n", z, z * z * z, (z * z * z) * 3LL); // cubing works fine with no workaround
return 0;
}
__global__ void CudaErrorTest(int z) {
CudaErrorTestInternal(z);
}
int main(int argc, char* argv[]) {
int z = 87142; // z*z<0, z*z*z<0
printf("host:\n");
CudaErrorTestInternal(z);
printf("device:\n");
CudaErrorTest <<<1, 1>>> (z);
return 0;
}
Output (release mode, /O2
), has errors:
host:
z=87142 z*z=-996206428 (z*z)*3=1306348012
z=87142 z*z=-996206428 (z*z)*3LL=-2988619284
z=87142 z*z=-996206428 (z*z)*30000000000LL=7007295307419103232
z=87142 z*z=-996206428 ((int)((__int64)z * z)) * 3LL=-2988619284
z=87142 z*z*z=-1541562024 (z * z * z) * 3LL=-4624686072
device:
z=87142 z*z=-996206428 (z*z)*3=1306348012
z=87142 z*z=-996206428 (z*z)*3LL=9896282604 < - this is wrong
z=87142 z*z=-996206428 (z*z)*30000000000LL=6729105671452241920 < - this is wrong
z=87142 z*z=-996206428 ((int)((__int64)z * z)) * 3LL=-2988619284 < - workaround successful
z=87142 z*z*z=-1541562024 (z * z * z) * 3LL=-4624686072
Output (debug mode), all correct:
host:
z=87142 z*z=-996206428 (z*z)*3=1306348012
z=87142 z*z=-996206428 (z*z)*3LL=-2988619284
z=87142 z*z=-996206428 (z*z)*30000000000LL=7007295307419103232
z=87142 z*z=-996206428 ((int)((__int64)z * z)) * 3LL=-2988619284
z=87142 z*z*z=-1541562024 (z * z * z) * 3LL=-4624686072
device:
z=87142 z*z=-996206428 (z*z)*3=1306348012
z=87142 z*z=-996206428 (z*z)*3LL=-2988619284
z=87142 z*z=-996206428 (z*z)*30000000000LL=7007295307419103232
z=87142 z*z=-996206428 ((int)((__int64)z * z)) * 3LL=-2988619284
z=87142 z*z*z=-1541562024 (z * z * z) * 3LL=-4624686072
Version from PTX (assuming my driver version is irrelevant since the error ends up in the PTX):
// Compiler Build ID: CL-28540450
// Cuda compilation tools, release 11.0, V11.0.194
// Based on LLVM 3.4svn
Annotated with the PTX code, each line compiled individually.
printf("z=%d z*z=%d (z*z)*3=%d\n", z, z * z, (z * z) * 3);
mul.lo.s32 %r2, %r1, %r1;
mul.lo.s32 %r3, %r2, 3; <- correct
printf("z=%d z*z=%d (z*z)*3LL=%lld\n", z, z * z, (z * z) * 3LL);
mul.lo.s32 %r2, %r1, %r1;
mul.wide.u32 %rd1, %r2, 3; <- problem, uses unsigned multiply
printf("z=%d z*z=%d (z*z)*30000000000LL=%lld\n", z, z * z, (z * z) * 30000000000LL);
mul.lo.s32 %r2, %r1, %r1;
cvt.u64.u32 %rd1, %r2; <- problem, uses unsigned convert
mul.lo.s64 %rd2, %rd1, 30000000000; <- correct, but garbage in
printf("z=%d z*z=%d ((int)((__int64)z * z)) * 3LL=%lld\n", z, z * z, ((int)((__int64)z * z)) * 3LL);
mul.lo.s32 %r2, %r1, %r1;
mul.wide.s32 %rd1, %r2, 3; <- correct
printf("z=%d z*z*z=%d (z * z * z) * 3LL=%lld\n", z, z * z * z, (z * z * z) * 3LL);
mul.lo.s32 %r2, %r1, %r1;
mul.lo.s32 %r3, %r2, %r1;
mul.wide.s32 %rd1, %r3, 3; <- correct