I have encountered a miscompile of a CUDA program with clang 6.0. It occurs at optimization level '-O1' and disappears at '-O2' and '-O3'. The reduced test case is appended below.
demo.cu:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include <stdio.h>
// Two-field aggregate passed by pointer through the device call chain.
// The kernel `entry` initializes it to {11, 0}; func_3 stores 0 into `irrelevant`.
struct S1 {
// Field the kernel reads before and after the call chain; no function in this file stores to it after initialization.
long shoud_not_change;
// The only field written by the call chain (func_3 sets it to 0).
long irrelevant;
};
// Stores 0 into ptr_arg->irrelevant through a local pointer, then returns
// the int pointer that p_221 points to.
//   p_221   - address of an int pointer; it is dereferenced once and never written.
//   ptr_arg - struct whose `irrelevant` field is overwritten; no other field is touched.
__device__ int * func_3(int ** p_221, struct S1 * ptr_arg)
{
// Take the address of the second field and write through it.
long *l_302 = &ptr_arg->irrelevant;
(*l_302) = 0;
return *p_221;
}
// Calls func_3 with the address of a local null int pointer and the caller's
// struct pointer; the pointer returned by func_3 is discarded.
__device__ void func_2(struct S1 * ptr_arg)
{
// Local whose address is handed to func_3; it is initialized to null and never dereferenced here.
int *l_231 = (int*)0;
func_3(&l_231, ptr_arg);
}
// Forwards ptr_arg unchanged to func_2; adds one level to the call chain.
__device__ void func_1(struct S1 * ptr_arg)
{
func_2(ptr_arg);
}
// Kernel for the reproducer. Each thread builds a local S1 with
// shoud_not_change = 11, records that field in result[0], runs the
// func_1 -> func_2 -> func_3 call chain on the struct, and then records the
// same field again in result[1]. The call chain only stores to `irrelevant`,
// so both slots are expected to hold 11.
//   result - must point to at least two longs; the indices are constant, so
//            every thread writes the same two slots.
extern "C" __global__ void entry(long *result)
{
struct S1 c_783 = {
11L, // shoud_not_change
0L, // irrelevant
};
// Value of the field before the call chain runs.
result[0] = (long)c_783.shoud_not_change;
struct S1* ptr_arg = &c_783;
__syncthreads();
func_1(ptr_arg);
__syncthreads();
// Value of the same field after the call chain, read back through the pointer.
result[1] = (long)ptr_arg->shoud_not_change;
}
// Host driver for the reproducer: allocates two longs on the device, launches
// `entry` with 100 blocks of 100 threads, copies the two values back, and
// prints them. Returns 0 on success and 1 if any CUDA call fails, so that a
// runtime failure is not mistaken for the miscompile under investigation.
int main(void)
{
    // Zero-initialized so that no indeterminate value is ever printed.
    long h_result[2] = {0L, 0L};
    long* result = NULL;
    size_t result_size = sizeof(long)*2;

    cudaError_t err = cudaMalloc((void**)&result, result_size);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    entry<<<100,100>>>(result);
    // A kernel launch returns nothing; launch-configuration errors are
    // reported through cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(result);
        return 1;
    }

    // The blocking copy also surfaces any error raised while the kernel ran.
    err = cudaMemcpy(h_result, result, result_size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        cudaFree(result);
        return 1;
    }

    cudaFree(result);
    printf("result[0] is %ld, result[1] is %ld\n", h_result[0], h_result[1]);
    return 0;
}
The failing and passing behaviors are:
FAIL
$clang++ demo.cu -O1 -o bug --cuda-gpu-arch=sm_50 --cuda-path=/usr/local/cuda-8.0 -I/usr/local/cuda-8.0/targets/x86_64-linux/include/ -L/usr/local/cuda/lib64 -lcudart -lcuda
$./bug
output: result[0] is 11, result[1] is 0
PASS
$clang++ demo.cu -O2 -o bugfree-O2 --cuda-gpu-arch=sm_50 --cuda-path=/usr/local/cuda-8.0 -I/usr/local/cuda-8.0/targets/x86_64-linux/include/ -L/usr/local/cuda/lib64 -lcudart -lcuda
$./bugfree-O2
output: result[0] is 11, result[1] is 11
PASS
$clang++ demo.cu -O3 -o bugfree-O3 --cuda-gpu-arch=sm_50 --cuda-path=/usr/local/cuda-8.0 -I/usr/local/cuda-8.0/targets/x86_64-linux/include/ -L/usr/local/cuda/lib64 -lcudart -lcuda
$./bugfree-O3
output: result[0] is 11, result[1] is 11
PASS
$nvcc -o bugfree-nvcc -arch sm_50 demo.cu
$./bugfree-nvcc
output: result[0] is 11, result[1] is 11
The PTX generated by clang is identical for both CUDA-8 and CUDA-9. However, the SASS generated by ptxas is noticeably different. With CUDA-9 the code is very straightforward and you can see two writes, 8 bytes apart, both with the same value. With CUDA-8, func_3 messes up the store to *l_302 and writes zero to the shoud_not_change field instead.
Configuration:
#uname -a
Linux G1024-workstation1 4.4.0-121-generic #145-Ubuntu SMP Fri Apr 13 13:47:23 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
$ clang++ --version
clang version 6.0.0 (tags/RELEASE_600/final)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/bin
$ cat /etc/*release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=16.04
DISTRIB_CODENAME=xenial
DISTRIB_DESCRIPTION="Ubuntu 16.04.2 LTS"
NAME="Ubuntu"
VERSION="16.04.2 LTS (Xenial Xerus)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 16.04.2 LTS"
VERSION_ID="16.04"
HOME_URL="http://www.ubuntu.com/"
SUPPORT_URL="http://help.ubuntu.com/"
BUG_REPORT_URL="http://bugs.launchpad.net/ubuntu/"
VERSION_CODENAME=xenial
UBUNTU_CODENAME=xenial
Device: NVIDIA Quadro K2200
#nvidia-smi
Wed May 16 15:07:23 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.111                Driver Version: 384.111                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Quadro K2200        Off  | 00000000:03:00.0 Off |                  N/A |
| 42%   52C    P0     2W /  39W |      0MiB /  4040MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
CUDA toolkit version: release 8.0, V8.0.61