Ok spent a whole day to find out what the issue was inside a rather large project. I have reproduced a very simple test case which highlights the behaviour I cant explain (below). Essentially it performs three conditions tests in slightly different ways each should always evaluate to false however in x64 build with CUDA 2.3 (using SDK template) Test 1 always evaluates true. I have tested this in both emulation mode and on a separate machine with 32 bit windows XP. Both of which behave as expected. It seems that Test 1 only evaluates true when running on the device in 64 bit windows. Can anyone confirm this behaviour or explain what is going on???
Paul
int2* h_data;
int2* d_data;
int3* h_result;
int3* d_result;
const int THREADS = 1024;
__global__ void test(int2* data, int3* result){
int i = threadIdx.x + blockIdx.x*blockDim.x;
//set result value to 0
result[i].x = 1;
result[i].y = 2;
result[i].z = 3;
//some variable from data
int2 compare = make_int2(data[i].x, data[i].y);
//TEST 1) FAIL: WHY IS THIS EVALUATING TRUE AS ALL Y VALUES ARE -1
if ((compare.x==0)&&(compare.y==0))
result[i].x = 0;
/*
//TEST 2) PASS: BLOCK OUT ABOVE AND USE THIS INSTEAD AND CONDITIONSA ARE AS EXPECTED
if (compare.x==0)
if(compare.y==0)
result[i].y = 0;
//TEST 3) PASS: WHY IS THIS EVALUATING TRUE AS ALL Y VALUES ARE -1
if ((data[i].x==0)&&(data[i].y==0))
result[i].z = 0;
*/
}
int main( int argc, char** argv)
{
//allocate host and device
h_data = (int2*)malloc(THREADS*sizeof(int2));
h_result = (int3*)malloc(THREADS*sizeof(int3));
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_data, THREADS*sizeof(int2)));
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_result, THREADS*sizeof(int3)));
//fill with data
for (int i=0; i<THREADS; i++){
h_data[i].x = 0;
h_data[i].y = -1;
}
//upload
CUDA_SAFE_CALL( cudaMemcpy( d_data, h_data, THREADS*sizeof(int2), cudaMemcpyHostToDevice));
//run kernal
dim3 grid = dim3(8,1,1);
dim3 threads = dim3(128,1,1);
test<<<grid, threads>>>(d_data, d_result);
//copy back and check result in debugger. All results should be 0
CUDA_SAFE_CALL( cudaMemcpy( h_result, d_result, THREADS*sizeof(int3), cudaMemcpyDeviceToHost));
for (int i=0; i<THREADS; i++){
printf("%i) ", i);
//print test 1 results
if(h_result[i].x)
printf("TEST1:PASS, ");
else
printf("TEST1:FAIL, ");
//print test 2 results
if(h_result[i].y)
printf("TEST2:PASS, ");
else
printf("TEST2:FAIL, ");
//print test 3 results
if(h_result[i].y)
printf("TEST3:PASS \n");
else
printf("TEST3:FAIL \n");
}
}