I’ve ported an algorithm to CUDA. In DeviceEmu mode it works fine, but when I compile the Release version, the results from the kernel are completely different, even under CPU emulation.
I’ve tried writing a simple test program and the result is the same: the value read back from the thread is always 0, as if it were never changed.
Here is the code:
// Threads per block; main() sizes the launch grid from this value.
#define THREADS_PER_BLOCK 64
typedef unsigned long long long64;  // unsigned 64-bit integer
typedef unsigned char u8;           // unsigned 8-bit integer
// `unsigned int`, not `unsigned long`: long is 64 bits on LP64 platforms, which
// would make "u32" 8 bytes wide there and double the size of anything built from it.
typedef unsigned int u32;
// Request 1-byte member packing for the declaration below; the matching
// pop after the struct restores the previous packing setting.
#pragma pack(push, 1)
// Pair of u32 fields (hi, lo) with 16-byte alignment requested via __align__ —
// presumably the high and low halves of one 64-bit value.
// NOTE(review): pack(1) and __align__(16) pull in opposite directions, and this
// type is copied between host and device with cudaMemcpy — confirm that
// sizeof(fake_u64) and the member offsets agree in the host and device compilers.
struct __align__(16) fake_u64
{
u32 hi;  // first member (offset 0)
 u32 lo;  // second member
};
#pragma pack(pop)
// Short alias used by the kernel and host code.
typedef fake_u64 u64;
// Test kernel: the thread whose global 1-D index is 0 stores the pair
// (hi = 1, lo = 2) into *d_result; every other thread exits without writing.
//
//   shedule  - accepted but never read by this kernel.
//   d_result - destination for the single result written by thread 0.
__global__ void CUDA_unit(u64 *shedule, u64 *d_result)
{
 // Flat thread index across the whole 1-D launch.
 const u32 tid = threadIdx.x + blockIdx.x*blockDim.x;

 // Only one thread in the entire grid is allowed to touch the result.
 if (tid != 0) {
  return;
 }

 d_result->hi = 1;
 d_result->lo = 2;
}
// Host driver for CUDA_unit.
//
// Fills a small schedule array on the host, uploads it together with a zeroed
// result slot, launches CUDA_unit, copies the result back and prints its two
// halves as hexadecimal.
//
// Returns 0 on success and 1 when a host allocation fails or the kernel
// launch/execution reports an error.
int main(void)
{
 u64  *d_result = NULL;
 u64  *h_result = NULL;
 u64  *d_shedule = NULL;
 u64  *h_shedule = NULL;
 u64  in;
 int  shedule_size = 2;   // number of schedule entries uploaded to the device
 int  num_blocks;
 int  i;
 cudaError_t err;

 CUT_DEVICE_INIT(NULL, NULL);
 in.hi = 1;
 in.lo = 2;

 h_shedule = (u64 *)malloc(sizeof(u64)*shedule_size);
 if (!h_shedule) {
  printf("No memory on host for shedule array\n");
  return 1;
 }
 h_result = (u64 *)malloc(sizeof(u64));
 if (!h_result) {
  printf("No memory on host for result\n");
  free(h_shedule);   // do not leak the first buffer on this failure path
  return 1;
 }

 // Start from a known zero result so an untouched device value is detectable.
 h_result->hi = 0;
 h_result->lo = 0;
 CUDA_SAFE_CALL( cudaMalloc((void**) &d_result, sizeof(u64)) );
 CUDA_SAFE_CALL( cudaMemcpy(d_result, h_result, sizeof(u64), cudaMemcpyHostToDevice) );

 for (i = 0; i < shedule_size; i++) {
  h_shedule[i] = in;
 }
 CUDA_SAFE_CALL( cudaMalloc((void **)&d_shedule, shedule_size*sizeof(u64)) );
 CUDA_SAFE_CALL( cudaMemcpy(d_shedule, h_shedule, shedule_size*sizeof(u64), cudaMemcpyHostToDevice) );
 free(h_shedule);
 h_shedule = NULL;

 // Ceil-divide so the grid covers every schedule entry, and launch with the
 // same block size the grid was computed from.
 num_blocks = (shedule_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
 CUDA_unit<<<num_blocks, THREADS_PER_BLOCK>>>(d_shedule, d_result);

 // A kernel launch returns no status itself: configuration errors show up in
 // cudaGetLastError() and execution errors at the next synchronizing call.
 // Check both explicitly so a failed kernel is not mistaken for a zero result.
 // NOTE(review): the cutil CUDA_SAFE_CALL / CUT_CHECK_ERROR macros may only
 // verify results in debug builds — confirm against the cutil headers in use.
 err = cudaGetLastError();
 if (err == cudaSuccess) {
  err = cudaThreadSynchronize();
 }
 if (err != cudaSuccess) {
  printf("CUDA_unit failed: %s\n", cudaGetErrorString(err));
  cudaFree(d_result);
  cudaFree(d_shedule);
  free(h_result);
  return 1;
 }

 CUDA_SAFE_CALL( cudaMemcpy(h_result, d_result, sizeof(u64), cudaMemcpyDeviceToHost) );

 // Cast so the arguments match %X regardless of how u32 is typedef'd.
 printf ("%08X:%08X\n", (unsigned int)h_result->hi, (unsigned int)h_result->lo);

 CUDA_SAFE_CALL( cudaFree(d_result) );
 CUDA_SAFE_CALL( cudaFree(d_shedule) );
 free(h_result);
 CUT_EXIT(NULL, NULL);
 return 0;
}
In my algorithm only one thread writes to d_result, so after calling cudaThreadSynchronize() d_result should contain either 0 or the value that thread wrote.
When I copy it to h_result, it is always 0 in Release mode but correct in DebugEmu mode.
I’ve tried disabling optimisation, disabling loop unrolling, and changing the memory alignment, all with no effect. Where is my mistake?
P.S. env = VC2005, CUDA 2.0