PTX miscompile bug report in CUDA 8.0

I have encountered a miscompile of a CUDA program built with clang 6.0. It appears with optimization level '-O1' and disappears with '-O2' and '-O3'. I have reduced the test case, which is appended below.

demo.cu:

#include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "cuda.h"

    #include <stdio.h>

    // Two-field struct used by the reproducer. The kernel checks that
    // `shoud_not_change` keeps its value while `irrelevant` is overwritten.
    struct S1 {
        // Initialized to 11 in entry(); no statement in this file writes it afterwards.
        long shoud_not_change;
        // Set to 0 by func_3().
        long irrelevant;
    };

    // Stores 0 into ptr_arg->irrelevant (through a local pointer to that field)
    // and returns the int* that p_221 points to.
    // It never writes ptr_arg->shoud_not_change.
    __device__ int * func_3(int ** p_221, struct S1 * ptr_arg)
    {
        // Address of the second field only; the first field is not touched here.
        long *l_302 = &ptr_arg->irrelevant;
        (*l_302) = 0;
        return *p_221;
    }

    // Calls func_3() with the address of a local null int* and the caller's
    // struct pointer; the pointer returned by func_3() is discarded.
    __device__ void func_2(struct S1 * ptr_arg)
    {
        int *l_231 = (int*)0;
        func_3(&l_231, ptr_arg);
    }


    // Thin wrapper: forwards ptr_arg unchanged to func_2().
    __device__ void func_1(struct S1 * ptr_arg)
    {
        func_2(ptr_arg);
    }

    // Reproducer kernel.
    // Builds a per-thread struct S1 with shoud_not_change = 11, records that
    // field in result[0], calls func_1() on the struct, then records the same
    // field again in result[1].
    // The call chain func_1 -> func_2 -> func_3 only assigns to `irrelevant`,
    // so result[1] is expected to equal result[0] (11).
    // Every launched thread writes the same two slots; no thread index is used.
    // `result` must point to at least two longs.
    extern "C" __global__ void entry(long *result)
    {
        struct S1 c_783 = {
            11L, // shoud_not_change  
            0L, // irrelevant
        };
        // Value of the field before the call chain runs.
        result[0] = (long)c_783.shoud_not_change;
        struct S1* ptr_arg = &c_783;
        __syncthreads();
        func_1(ptr_arg);
        __syncthreads();
        // Value of the same field after the call chain; compared on the host.
        result[1] = (long)ptr_arg->shoud_not_change;
    }
    // Host driver for the reproducer.
    // Allocates two longs on the device, launches `entry` with 100 blocks of
    // 100 threads, copies the two values back and prints them.
    // Returns 0 on success and 1 if any CUDA call or the kernel launch fails,
    // so a runtime failure cannot be mistaken for a wrong-code result.
    int main(void)
    {
        // Zero-initialized so that no indeterminate values are ever printed.
        long h_result[2] = {0, 0};
        long* result = NULL;
        size_t result_size = sizeof(long)*2;
        cudaError_t err;

        err = cudaMalloc((void**)&result, result_size);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
            return 1;
        }

        entry<<<100,100>>>(result);
        // A kernel launch returns no status; launch errors must be fetched explicitly.
        err = cudaGetLastError();
        if (err == cudaSuccess) {
            // Blocking copy: also reports errors raised while the kernel was running.
            err = cudaMemcpy(h_result, result, result_size, cudaMemcpyDeviceToHost);
        }

        // Release the device buffer on both the success and the failure path.
        cudaFree(result);

        if (err != cudaSuccess) {
            fprintf(stderr, "kernel launch or result copy failed: %s\n", cudaGetErrorString(err));
            return 1;
        }

        printf("result[0] is %ld, result[1] is %ld\n", h_result[0], h_result[1]);
        return 0;
    }

The failing and passing behaviors are:

FAIL
$clang++ demo.cu -O1 -o bug --cuda-gpu-arch=sm_50 --cuda-path=/usr/local/cuda-8.0 -I/usr/local/cuda-8.0/targets/x86_64-linux/include/ -L/usr/local/cuda/lib64 -lcudart -lcuda

$./bug

output: result[0] is 11, result[1] is 0

PASS
$clang++ demo.cu -O2 -o bugfree-O2 --cuda-gpu-arch=sm_50 --cuda-path=/usr/local/cuda-8.0 -I/usr/local/cuda-8.0/targets/x86_64-linux/include/ -L/usr/local/cuda/lib64 -lcudart -lcuda

$./bugfree-O2

output: result[0] is 11, result[1] is 11

PASS
$clang++ demo.cu -O3 -o bugfree-O3 --cuda-gpu-arch=sm_50 --cuda-path=/usr/local/cuda-8.0 -I/usr/local/cuda-8.0/targets/x86_64-linux/include/ -L/usr/local/cuda/lib64 -lcudart -lcuda

$./bugfree-O3

output: result[0] is 11, result[1] is 11

PASS
$nvcc -o bugfree-nvcc -arch sm_50 demo.cu
$./bugfree-nvcc
output: result[0] is 11, result[1] is 11

The PTX generated by clang is identical for both CUDA 8 and CUDA 9. However, the SASS generated by ptxas is noticeably different. With CUDA 9 the code is very straightforward, and you can see two writes, 8 bytes apart, both with the same value. With CUDA 8, func_3 messes up the store to *l_302 and writes zero to the shoud_not_change field instead.

Configuration->

#uname -a
Linux G1024-workstation1 4.4.0-121-generic #145-Ubuntu SMP Fri Apr 13 13:47:23 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux

$ clang++ --version
clang version 6.0.0 (tags/RELEASE_600/final)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/bin

cat /etc/release/
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=16.04
DISTRIB_CODENAME=xenial
DISTRIB_DESCRIPTION="Ubuntu 16.04.2 LTS"
NAME="Ubuntu"
VERSION="16.04.2 LTS (Xenial Xerus)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 16.04.2 LTS"
VERSION_ID="16.04"
HOME_URL="http://www.ubuntu.com/"
SUPPORT_URL="http://help.ubuntu.com/"
BUG_REPORT_URL="http://bugs.launchpad.net/ubuntu/"
VERSION_CODENAME=xenial
UBUNTU_CODENAME=xenial
Device: NVIDIA Quadro K2200

#nvidia-smi
Wed May 16 15:07:23 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.111                Driver Version: 384.111                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Quadro K2200        Off  | 00000000:03:00.0 Off |                  N/A |
| 42%   52C    P0     2W /  39W |      0MiB /  4040MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

CUDA toolkit version:release 8.0, V8.0.61

  1. If the behavior is correct with CUDA 9.0, then certainly the solution and recommendation would be to use CUDA 9.0. There would be no fix applied to CUDA 8.0 for this issue if it is already fixed in CUDA 9.0.

  2. although clang is supported as a host compiler for use by nvcc in certain environments, compilation with clang (instead of nvcc) is not supported by NVIDIA. Issues with the clang toolchain should be brought to the attention of the clang development community.

  3. If you wish to file a bug, please register as a developer and use the bug reporting portal at developer.nvidia.com