bug report for nvcc version 10

I found a strange problem with a CUDA program when it is compiled by nvcc version 10.
I used a tool to generate the CUDA program, and I constructed an `if` condition that is always true; the values it compares are initialized in the host code. This means the code without this `if` condition should produce the same result as the code that has it.
However, in my program I get a different result when I add the `if` condition that is always true.

/*
 * GROUP_DIVERGE(a, b): unless NO_GROUP_DIVERGENCE is defined, expands to
 * get_block_id(a) and ignores the second argument; when NO_GROUP_DIVERGENCE
 * is defined, expands to the second argument alone.
 * NOTE(review): get_block_id is not declared in the visible code (presumably
 * it comes from "CUDA.h"), and this macro is not used by the code shown here.
 */
#ifndef NO_GROUP_DIVERGENCE
#define GROUP_DIVERGE(a, b) get_block_id(a)
#else
#define GROUP_DIVERGE(x, y) (y)
#endif

/*
 * FAKE_DIVERGE(x, y, z): unless NO_FAKE_DIVERGENCE is defined, expands to the
 * difference x - y (z is ignored); when NO_FAKE_DIVERGENCE is defined, expands
 * to z alone.
 * Both operands are parenthesised so that compound arguments such as
 * FAKE_DIVERGE(a, b + c, d) subtract the whole second argument instead of
 * being re-associated by operator precedence.
 */
#ifndef NO_FAKE_DIVERGENCE
#define FAKE_DIVERGE(x, y, z) ((x) - (y))
#else
#define FAKE_DIVERGE(x, y, z) (z)
#endif


// Seed: 47863

#include "CUDA.h"

/*
 * Table of 10 rows in __constant__ memory; each row lists the values 0..11
 * exactly once in a shuffled order.
 * In the code shown, only row 0 is read: the kernel indexes it with
 * get_linear_local_id() when computing the `tid` field of struct S2.
 */
__constant__ uint32_t permutations[10][12] = {
{5,6,8,7,10,9,0,3,4,2,11,1}, // permutation 0
{11,7,9,4,2,0,6,8,5,1,3,10}, // permutation 1
{7,9,0,8,6,3,2,1,10,5,4,11}, // permutation 2
{1,5,10,8,11,2,4,7,0,9,3,6}, // permutation 3
{10,9,6,7,11,0,3,2,1,5,8,4}, // permutation 4
{4,7,5,1,11,9,0,6,2,8,3,10}, // permutation 5
{9,7,11,3,10,8,6,0,1,5,2,4}, // permutation 6
{3,9,5,1,10,0,2,6,4,8,11,7}, // permutation 7
{10,4,1,7,0,11,5,6,3,2,9,8}, // permutation 8
{11,2,8,0,7,10,9,4,5,6,3,1} // permutation 9
};
/* --- Struct/Union Declarations --- */


/* --- Struct/Union Declarations End --- */
/*
 * Per-thread state bundle that the kernel `entry` fills in and passes by
 * pointer to the generated device functions.
 * The field ORDER matters: `entry` initialises this struct with a positional
 * brace initialiser, so reordering fields would silently change which value
 * lands in which field.
 */
struct S2 {
    /* Generated program variables. `entry` gives them their initial values
     * and later feeds each one to transparent_crc; func_6 writes g_19 and
     * uses g_95 / g_93 as its outer / inner loop counters. */
    int32_t g_19;
    int64_t g_93;
    int16_t g_95;
    int32_t g_196;
    /* Initialised to 0 by `entry` and fed to transparent_crc; not otherwise
     * touched in the code shown. */
    uint32_t v_collective;
    /* Values loaded by `entry` from sequence_input[], indexed by
     * get_global_id(d), get_local_id(d) and get_group_id(d) for d = 0..2. */
    uint64_t global_0_offset;
    uint64_t global_1_offset;
    uint64_t global_2_offset;
    uint64_t local_0_offset;  uint64_t local_1_offset;
    uint64_t local_2_offset;
    uint64_t group_0_offset;
    uint64_t group_1_offset;
    uint64_t group_2_offset;
    /* Set by `entry` from get_linear_group_id() * 12 plus
     * permutations[0][get_linear_local_id()] (via the safe_* helpers). */
    uint32_t tid;
    /* Points at the __shared__ l_atomic_reduction array declared in `entry`. */
     volatile uint32_t *l_atomic_reduction;
    /* Copy of the kernel argument of the same name. */
     volatile uint32_t *g_atomic_reduction;
    /* Copy of the kernel argument tg_input; func_6 reads elements 4 and 5. */
     int32_t *tg_input;
    /* Points at the __shared__ l_comm_values array (12 entries) in `entry`. */
     int64_t *l_comm_values;
    /* Copy of the kernel argument g_comm_values. */
     int64_t *g_comm_values;
};


__device__ int16_t  func_1(struct S2 * p_283);
__device__ int32_t  func_6(int32_t * p_7, struct S2 * p_283);

/*
 * Top of the generated call chain: invokes func_6 on the shared state and
 * returns the resulting p_283->g_93.
 *
 * p_283   per-thread state; func_6 modifies it through this pointer.
 * returns p_283->g_93 implicitly converted from int64_t to int16_t (the
 *         narrowing is part of the generated test case and is kept as is).
 */
__device__ int16_t  func_1(struct S2 * p_283)
{ /* block id: 4 */
    /* Initialise the slot to a null pointer: the original left it
     * uninitialised and then read it to pass it by value, which is a read of
     * an indeterminate value. func_6 never dereferences this argument in the
     * code shown, so null is a safe placeholder. */
    int32_t *l_13[1] = { 0 };

    /* The int32_t result of func_6 is intentionally discarded; the call is
     * made for its side effects on *p_283. */
    func_6(l_13[0] , p_283);
    return p_283->g_93;
}
/*
 * Generated loop nest that uses p_283->g_95 (outer) and p_283->g_93 (inner)
 * as loop counters and conditionally stores a constant into p_283->g_19.
 *
 * p_7     unused in the code shown.
 * p_283   per-thread state; g_95, g_93 and g_19 are modified.
 * returns always 1 (the value of l_194).
 */
__device__ int32_t  func_6(int32_t * p_7, struct S2 * p_283)
{ /* block id: 56 */
    int32_t l_194 = 1UL;
    /* Fix: the step expression previously read from `p_282`, which is not
     * declared anywhere; every other access uses p_283. */
    for (p_283->g_95 = 0; (p_283->g_95 >= (-3)); p_283->g_95 = safe_sub_func_int16_t_s_s(p_283->g_95, 4))
    { /* block id: 64 */
        int32_t *l_151 = &p_283->g_19;
        /* NOTE(review): g_93 starts at 0 and moves in steps of 5, so it can
         * only hit -19 after wrapping around. If
         * safe_sub_func_uint64_t_u_u is a plain wrapping unsigned
         * subtraction (its definition is not shown -- TODO confirm), that
         * takes roughly (2^64 + 19) / 5 iterations. A timeout is then the
         * expected outcome whenever the compiler cannot remove the loop,
         * independent of the condition below. */
        for (p_283->g_93 = 0; (p_283->g_93 != (-19)); p_283->g_93 = safe_sub_func_uint64_t_u_u(p_283->g_93, 5))
        { /* block id: 69 */
            /* "TG" condition from the report: the host initialises tg_input
             * so that this is always true. Reported behaviour: with the
             * condition the program times out; without it the program
             * produces a result. */
            if ((p_283->tg_input[4] >= p_283->tg_input[5]))
                (*l_151) = ((VECTOR_MAKE(int32_t, 2))(9383, 886)).y;
        }
    }
    return l_194;
}

/*
 * Kernel entry point: builds the per-thread struct S2, runs func_1 on it and
 * writes one hash value per thread to result[get_linear_global_id()].
 *
 * NOTE(review): the 12-entry l_comm_values array, the 12-entry rows of
 * `permutations`, and the "* 12" in the tid / g_comm_values index suggest the
 * kernel expects 12 threads per block -- confirm against the host launch code.
 * Static shared memory used: one uint32_t plus twelve int64_t per block.
 */
extern "C" __global__ void entry( long *result,  volatile uint *g_atomic_input,  volatile uint *g_special_values,  volatile int *g_atomic_reduction,  int *tg_input , int *sequence_input,  long *g_comm_values) {
    int i;

    /* One thread per block zeroes the shared reduction slot ... */
    __shared__ volatile uint32_t l_atomic_reduction[1];
    if (get_linear_local_id() == 0)
        for (i = 0; i < 1; i++)
            l_atomic_reduction[i] = 0;

    /* ... and fills the shared communication values with 1. The
     * __syncthreads() further down makes both visible to the whole block. */
    __shared__ int64_t l_comm_values[12];
    if (get_linear_local_id() == 0)
        for (i = 0; i < 12; i++)
            l_comm_values[i] = 1;

    struct S2 c_284;
    struct S2* p_283 = &c_284;
    /* Positional initialiser: entries must stay in struct S2 field order. */
    struct S2 c_285 = {
        0x0E82CA96L, // p_283->g_19
        6L, // p_283->g_93
        0x066AL, // p_283->g_95
        0x69ECBE81L, // p_283->g_196
        0, // p_283->v_collective
        sequence_input[get_global_id(0)], // p_283->global_0_offset
        sequence_input[get_global_id(1)], // p_283->global_1_offset
        sequence_input[get_global_id(2)], // p_283->global_2_offset
        sequence_input[get_local_id(0)], // p_283->local_0_offset
        sequence_input[get_local_id(1)], // p_283->local_1_offset
        sequence_input[get_local_id(2)], // p_283->local_2_offset
        sequence_input[get_group_id(0)], // p_283->group_0_offset
        sequence_input[get_group_id(1)], // p_283->group_1_offset
        sequence_input[get_group_id(2)], // p_283->group_2_offset
        (safe_add_func_uint32_t_u_u((safe_mul_func_uint32_t_u_u(get_linear_group_id(), 12)), permutations[0][get_linear_local_id()])), // p_283->tid
        l_atomic_reduction, // l_atomic_reduction
        /* Explicit cast: the kernel parameter is `volatile int *` while the
         * field is `volatile uint32_t *`; C++ has no implicit conversion
         * between the two pointer types. */
        (volatile uint32_t *)g_atomic_reduction, // g_atomic_reduction
        tg_input, // tg_input
        l_comm_values, // l_comm_values
        g_comm_values, // g_comm_values
    };
    c_284 = c_285;

    /* Barrier reached by every thread: shared-memory initialisation above is
     * complete before any thread runs the generated code. */
    __syncthreads();
    func_1(p_283);
    __syncthreads();

    /* NOTE(review): crc64_context is never named again below except in the
     * final store, so transparent_crc presumably updates it implicitly
     * (e.g. as a macro) -- its definition is not shown. */
    uint64_t crc64_context = 0xFFFFFFFFFFFFFFFFUL;
    int print_hash_value = 0;
    transparent_crc(p_283->g_19, "p_283->g_19", print_hash_value);
    transparent_crc(p_283->g_93, "p_283->g_93", print_hash_value);
    transparent_crc(p_283->g_95, "p_283->g_95", print_hash_value);
    transparent_crc(p_283->g_196, "p_283->g_196", print_hash_value);
    transparent_crc(p_283->v_collective, "p_283->v_collective", print_hash_value);
    transparent_crc(p_283->l_comm_values[get_linear_local_id()], "p_283->l_comm_values[get_linear_local_id()]", print_hash_value);
    transparent_crc(p_283->g_comm_values[get_linear_group_id() * 12 + get_linear_local_id()], "p_283->g_comm_values[get_linear_group_id() * 12 + get_linear_local_id()]", print_hash_value);

    /* Each thread stores its final (bit-inverted) hash at its global index. */
    result[get_linear_global_id()] = crc64_context ^ 0xFFFFFFFFFFFFFFFFUL;
}

I cannot add attachments here, so I have not shown the header file. If anyone has a suggestion, please tell me and I will send you the complete code.

Configuration:

#uname -a
Linux wxy-Inspiron-7460 4.15.0-35-generic #38-Ubuntu SMP Wed Sep 12 10:38:24 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux

#nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130

#nvidia-smi
Mon Nov 5 21:08:27 2018
+----------------------------------------------------------------------------+
| NVIDIA-SMI 410.48 Driver Version: 410.48 |
|-------------------------------+---------------------+---------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce 940MX Off | 00000000:01:00.0 Off | N/A |
| N/A 49C P0 N/A / N/A | 439MiB / 2004MiB | 8% Default |
+------------------------------+---------------------+---------------------+

+----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 2238 G /usr/lib/xorg/Xorg 230MiB |
| 0 4577 G /usr/bin/gnome-shell 202MiB |
| 0 11782 G /usr/lib/firefox/firefox 1MiB |
| 0 16810 G /usr/lib/firefox/firefox 1MiB |
+----------------------------------------------------------------------------+
#device
NVIDIA GeForce 940MX

#lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 18.04.1 LTS
Release: 18.04
Codename: bionic

#cat /proc/version
Linux version 4.15.0-35-generic (buildd@lcy01-amd64-024) (gcc version 7.3.0 (Ubuntu 7.3.0-16ubuntu3)) #38-Ubuntu SMP Wed Sep 12 10:38:24 UTC 2018


I have uploaded the complete code to Baidu Cloud; you can download the source code from this link:

[url]https://pan.baidu.com/s/1bTjZuVY2bphyis8TdNY0wg[/url]

It’s probably best if you file a bug at developer.nvidia.com