I found a strange problem with a CUDA program compiled with nvcc version 10.
I used a tool to generate the CUDA program, and I constructed an `if` condition that is always true; the values it compares are initialized in the host code. This means the code without this `if` condition should produce the same result as the code with it.
However, in my program I get a different result when I add the `if` condition that is always true.
/*
 * Divergence helper macros.
 *
 * GROUP_DIVERGE(a, b): unless NO_GROUP_DIVERGENCE is defined, expands to
 * get_block_id(a); otherwise expands to its second argument.
 * NOTE(review): get_block_id is not defined in this file, and the kernel
 * below uses get_group_id -- confirm which name the header provides.
 *
 * FAKE_DIVERGE(x, y, z): unless NO_FAKE_DIVERGENCE is defined, expands to
 * the difference x - y; otherwise expands to its third argument.
 * The parameters are parenthesised so that compound arguments (e.g. "a + b")
 * are subtracted as a whole instead of being split by operator precedence.
 */
#ifndef NO_GROUP_DIVERGENCE
#define GROUP_DIVERGE(a, b) get_block_id(a)
#else
#define GROUP_DIVERGE(x, y) (y)
#endif
#ifndef NO_FAKE_DIVERGENCE
#define FAKE_DIVERGE(x, y, z) ((x) - (y))
#else
#define FAKE_DIVERGE(x, y, z) (z)
#endif
// Seed: 47863
#include "CUDA.h"
/*
 * Table of 10 rows, each holding the values 0..11 in a different order.
 * Row 0 is indexed by get_linear_local_id() in entry() when computing the
 * per-thread `tid` field; the other rows are not referenced in this file.
 */
__constant__ uint32_t permutations[10][12] = {
{5,6,8,7,10,9,0,3,4,2,11,1}, // permutation 0
{11,7,9,4,2,0,6,8,5,1,3,10}, // permutation 1
{7,9,0,8,6,3,2,1,10,5,4,11}, // permutation 2
{1,5,10,8,11,2,4,7,0,9,3,6}, // permutation 3
{10,9,6,7,11,0,3,2,1,5,8,4}, // permutation 4
{4,7,5,1,11,9,0,6,2,8,3,10}, // permutation 5
{9,7,11,3,10,8,6,0,1,5,2,4}, // permutation 6
{3,9,5,1,10,0,2,6,4,8,11,7}, // permutation 7
{10,4,1,7,0,11,5,6,3,2,9,8}, // permutation 8
{11,2,8,0,7,10,9,4,5,6,3,1} // permutation 9
};
/* --- Struct/Union Declarations --- */
/* --- Struct/Union Declarations End --- */
/*
 * Per-thread state record handed to every generated function through a
 * pointer. The member order is significant: entry() fills the record with a
 * positional aggregate initialiser, so members must not be reordered.
 */
struct S2 {
    /* Generated "global" variables; these are folded into the result hash. */
    int32_t  g_19;
    int64_t  g_93;
    int16_t  g_95;
    int32_t  g_196;
    uint32_t v_collective;

    /* Values read from sequence_input[], indexed by global id per dimension. */
    uint64_t global_0_offset;
    uint64_t global_1_offset;
    uint64_t global_2_offset;

    /* Values read from sequence_input[], indexed by local id per dimension. */
    uint64_t local_0_offset;
    uint64_t local_1_offset;
    uint64_t local_2_offset;

    /* Values read from sequence_input[], indexed by group id per dimension. */
    uint64_t group_0_offset;
    uint64_t group_1_offset;
    uint64_t group_2_offset;

    /* Linear group id * 12 plus an entry of permutations[0]. */
    uint32_t tid;

    /* Reduction counters: block-shared array and kernel-argument buffer. */
    volatile uint32_t *l_atomic_reduction;
    volatile uint32_t *g_atomic_reduction;

    /* Kernel-argument input buffer consulted by the generated condition. */
    int32_t *tg_input;

    /* Communication values: block-shared array and kernel-argument buffer. */
    int64_t *l_comm_values;
    int64_t *g_comm_values;
};
/* Forward declarations of the generated device functions. */
__device__ int16_t func_1(struct S2 * p_283);
__device__ int32_t func_6(int32_t * p_7, struct S2 * p_283);

/*
 * Top-level generated function: calls func_6 and then returns the g_93
 * field of the shared state record.
 *
 * p_283  - per-thread state record; func_6 updates it in place.
 * returns  p_283->g_93, implicitly narrowed from int64_t to int16_t.
 */
__device__ int16_t func_1(struct S2 * p_283)
{ /* block id: 4 */
    /* Initialised to a null pointer: the element was previously passed to
     * func_6 while still uninitialised, i.e. an indeterminate value was
     * read. func_6 does not dereference this argument. */
    int32_t *l_13[1] = { 0 };
    func_6(l_13[0] , p_283);
    return p_283->g_93;
}
/*
 * Generated function containing the two nested loops under investigation.
 *
 * p_7    - unused by this function.
 * p_283  - per-thread state record; g_95, g_93 and possibly g_19 are written.
 * returns  the constant local l_194 (1).
 */
__device__ int32_t func_6(int32_t * p_7, struct S2 * p_283)
{ /* block id: 56 */
    int32_t l_194 = 1UL;
    /* The step expression previously referred to `p_282`, which is not
     * declared anywhere in scope; the state record is `p_283`.
     * Assuming safe_sub_func_int16_t_s_s is a plain subtraction when no
     * overflow occurs (TODO confirm in the header), g_95 goes 0 -> -4 and
     * this outer loop body executes once. */
    for (p_283->g_95 = 0; (p_283->g_95 >= (-3)); p_283->g_95 = safe_sub_func_int16_t_s_s(p_283->g_95, 4))
    { /* block id: 64 */
        int32_t *l_151 = &p_283->g_19;
        /* NOTE(review): g_93 starts at 0 and is decreased by 5 per step, so
         * it passes -15 and -20 without ever being -19. Assuming
         * safe_sub_func_uint64_t_u_u is a wrapping unsigned subtraction
         * (TODO confirm in the header), the exit value -19 is only reached
         * after wrapping around the 64-bit range, roughly (2^64 + 19) / 5
         * (about 3.7e18) iterations. With an empty body the compiler can
         * presumably remove or collapse this loop; with the memory reads in
         * the condition below it apparently cannot, which would explain the
         * timeout observed only when the condition is present. */
        for (p_283->g_93 = 0; (p_283->g_93 != (-19)); p_283->g_93 = safe_sub_func_uint64_t_u_u(p_283->g_93, 5))
        { /* block id: 69 */
            /* This is the "TG" condition: with it the program times out,
             * without it the program produces a result. */
            if ((p_283->tg_input[4] >= p_283->tg_input[5]))
                (*l_151) = ((VECTOR_MAKE(int32_t, 2))(9383, 886)).y;
        }
    }
    return l_194;
}

/*
 * Kernel entry point: builds the per-thread state record, runs func_1 and
 * stores a hash of the resulting state into result[].
 *
 * result             - output, one hash per thread at get_linear_global_id().
 * g_atomic_input     - not used in this kernel body.
 * g_special_values   - not used in this kernel body.
 * g_atomic_reduction - stored in the state record.
 * tg_input           - stored in the state record; read by func_6.
 * sequence_input     - indexed by global/local/group ids to fill the
 *                      *_offset fields.
 * g_comm_values      - indexed at linear_group_id * 12 + linear_local_id.
 *
 * NOTE(review): the fixed 12-element shared array, the "* 12" indexing and
 * the 12-entry permutation row suggest a launch with 12 threads per block --
 * confirm against the host code.
 */
extern "C" __global__ void entry( long *result, volatile uint *g_atomic_input, volatile uint *g_special_values, volatile int *g_atomic_reduction, int *tg_input , int *sequence_input, long *g_comm_values) {
    int i, j, k; /* j and k are not referenced in the visible code */
    /* Thread 0 of each block initialises the block-shared arrays; the
     * __syncthreads() further down makes them visible to the whole block. */
    __shared__ volatile uint32_t l_atomic_reduction[1];
    if (get_linear_local_id() == 0)
        for (i = 0; i < 1; i++)
            l_atomic_reduction[i] = 0;
    __shared__ int64_t l_comm_values[12];
    if (get_linear_local_id() == 0)
        for (i = 0; i < 12; i++)
            l_comm_values[i] = 1;
    struct S2 c_284;
    struct S2* p_283 = &c_284;
    /* Positional initialiser: the order must match the field order of S2. */
    struct S2 c_285 = {
        0x0E82CA96L, // p_283->g_19
        6L, // p_283->g_93
        0x066AL, // p_283->g_95
        0x69ECBE81L, // p_283->g_196
        0, // p_283->v_collective
        sequence_input[get_global_id(0)], // p_283->global_0_offset
        sequence_input[get_global_id(1)], // p_283->global_1_offset
        sequence_input[get_global_id(2)], // p_283->global_2_offset
        sequence_input[get_local_id(0)], // p_283->local_0_offset
        sequence_input[get_local_id(1)], // p_283->local_1_offset
        sequence_input[get_local_id(2)], // p_283->local_2_offset
        sequence_input[get_group_id(0)], // p_283->group_0_offset
        sequence_input[get_group_id(1)], // p_283->group_1_offset
        sequence_input[get_group_id(2)], // p_283->group_2_offset
        (safe_add_func_uint32_t_u_u((safe_mul_func_uint32_t_u_u(get_linear_group_id(), 12)), permutations[0][get_linear_local_id()])), // p_283->tid
        l_atomic_reduction, // l_atomic_reduction
        /* Explicit casts: the kernel parameters are `volatile int *` and
         * `long *`, while the fields are `volatile uint32_t *` and
         * `int64_t *`; C++ has no implicit conversion between distinct
         * pointer types. */
        (volatile uint32_t *)g_atomic_reduction, // g_atomic_reduction
        tg_input, // tg_input
        l_comm_values, // l_comm_values
        (int64_t *)g_comm_values, // g_comm_values
    };
    c_284 = c_285;
    __syncthreads();
    func_1(p_283);
    __syncthreads();
    /* Fold the final state into a 64-bit checksum. NOTE(review):
     * transparent_crc comes from the header; presumably it updates
     * crc64_context -- confirm. */
    uint64_t crc64_context = 0xFFFFFFFFFFFFFFFFUL;
    int print_hash_value = 0;
    transparent_crc(p_283->g_19, "p_283->g_19", print_hash_value);
    transparent_crc(p_283->g_93, "p_283->g_93", print_hash_value);
    transparent_crc(p_283->g_95, "p_283->g_95", print_hash_value);
    transparent_crc(p_283->g_196, "p_283->g_196", print_hash_value);
    transparent_crc(p_283->v_collective, "p_283->v_collective", print_hash_value);
    transparent_crc(p_283->l_comm_values[get_linear_local_id()], "p_283->l_comm_values[get_linear_local_id()]", print_hash_value);
    transparent_crc(p_283->g_comm_values[get_linear_group_id() * 12 + get_linear_local_id()], "p_283->g_comm_values[get_linear_group_id() * 12 + get_linear_local_id()]", print_hash_value);
    result[get_linear_global_id()] = crc64_context ^ 0xFFFFFFFFFFFFFFFFUL;
}
I cannot add attachments here, so I have not shown the header file. If anyone has a suggestion, please tell me and I will send you the complete code.