Problems passing doubles to/from kernel - they become 0!

Wonder if anyone can help with this.

The test rig - which I’ve attached - seems to indicate that there is a problem with the passing of doubles to and from the host and device.

Note that this code all works fine when run in emulation, but we see the same (wrong) results when running on either an NVS290 or a C1060.

You can see from the code that the double array is being set correctly to 3 on the device and then copied into the integer array but when the double array is passed back to the host it is set to 0!

Similarly, if I try to pass a double into the kernel and use it, it always appears to be zero when used in the kernel code.

When I change the type from double to float it all works!

[font=“Courier New”]

=== Kernel =======================================

#define VARTYPE double

[font=“Courier New”]
// Init1: test kernel used to check that VARTYPE values survive the
// host <-> device round trip.
//
// For every thread index tid < size it
//   1. stores 3.0 into d_array[tid], and
//   2. stores that same value, converted to int, into d_iArray[tid].
//
// Parameters:
//   d_array  - output array of VARTYPE, at least `size` elements
//   d_iArray - output array of int, at least `size` elements
//   ivalue   - unused by the kernel body
//   value    - only referenced by the commented-out diagnostic line below
//   size     - number of elements to write; threads with tid >= size do nothing
//
// Indexing is one-dimensional (blockDim.x * blockIdx.x + threadIdx.x), so the
// launch must supply at least `size` threads along x.
//
// NOTE: with VARTYPE defined as double, build with -arch sm_13 (see the reply
// at the end of this thread); otherwise the double results read back as 0.
__global__ void Init1(
    VARTYPE *d_array,
    int *d_iArray,
    int ivalue,
    VARTYPE value,
    int size
){
    // Thread index
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard: the grid may contain more threads than there are elements.
    if(tid < size)
    {
        d_array[tid] = 3.0f;                // double array shown as zero on host

        d_iArray[tid] = (int)d_array[tid];  // integer array is set correctly using this line

        //d_iArray[tid] = value; // integer array is set to zero using this line
    }
}
[/font]

=== Main program ======================================

#define SIZE 1000

#define VARTYPE double

// Test rig for the Init1 kernel.
//
// Allocates one VARTYPE array and one int array of SIZE elements on both the
// host and the device, runs Init1 once over the device arrays, copies both
// results back and prints every element so the values written on the device
// can be compared with what arrives on the host.
//
// Returns 1 if a host allocation fails; otherwise finishes through CUT_EXIT.
int main(int argc, char **argv){
    // 'h_' prefix - CPU (host) memory space
    VARTYPE *h_results;
    int *h_Iresults;

    // 'd_' prefix - GPU (device) memory space
    VARTYPE *d_vector;
    int *d_IVector;

    double gpuTime;

    unsigned int hTimer;
    int i;

    // Launch configuration: just enough blocks to cover SIZE elements
    // (the kernel's tid < size guard handles the partial last block).
    const int threadsPerBlock = 256;
    const int numBlocks = (SIZE + threadsPerBlock - 1) / threadsPerBlock;

    CUT_DEVICE_INIT(argc, argv);
    CUT_SAFE_CALL( cutCreateTimer(&hTimer) );

    h_results = (VARTYPE *)malloc(SIZE*sizeof(VARTYPE));
    h_Iresults = (int *)malloc(SIZE*sizeof(int));
    if(h_results == NULL || h_Iresults == NULL)
    {
        // Bail out instead of handing a NULL destination to cudaMemcpy.
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_results);    // free(NULL) is a no-op
        free(h_Iresults);
        CUT_SAFE_CALL( cutDeleteTimer(hTimer) );
        return 1;
    }

    CUDA_SAFE_CALL( cudaMalloc((void**)&d_vector, SIZE*sizeof(VARTYPE)));
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_IVector, SIZE*sizeof(int)));

    // Make sure prior device work has finished before starting the timer.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutResetTimer(hTimer) );
    CUT_SAFE_CALL( cutStartTimer(hTimer) );

    Init1<<<numBlocks, threadsPerBlock>>>(
        d_vector,
        d_IVector,
        2,
        1.0,
        SIZE
        );

    CUT_CHECK_ERROR("execution failed\n");
    // The launch is asynchronous; wait for the kernel before stopping the timer.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer(hTimer) );
    gpuTime = cutGetTimerValue(hTimer);
    // Value is reported in whatever unit cutGetTimerValue returns.
    printf("GPU time: %f\n", gpuTime);

    printf("Reading back GPU results...\n");
    //Read back GPU results to compare them to CPU results
    CUDA_SAFE_CALL( cudaMemcpy(h_results, d_vector, SIZE*sizeof(VARTYPE), cudaMemcpyDeviceToHost) );
    CUDA_SAFE_CALL( cudaMemcpy(h_Iresults, d_IVector, SIZE*sizeof(int), cudaMemcpyDeviceToHost) );

    // Dump index, int result and VARTYPE result for every element.
    for(i=0; i < SIZE; i++)
        printf("i=%d vi=%d vd=%.3f; ", i, h_Iresults[i], h_results[i]);
    printf("\n");


    printf("Shutting down...\n");
    printf("...releasing GPU memory.\n");
    CUDA_SAFE_CALL( cudaFree(d_vector)  );
    CUDA_SAFE_CALL( cudaFree(d_IVector)  );

    printf("...releasing CPU memory.\n");
    free(h_results);
    free(h_Iresults);
    CUT_SAFE_CALL( cutDeleteTimer(hTimer) );
    printf("Shutdown done.\n");

    CUT_EXIT(argc, argv);

}
[/font]

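Compile with -arch sm_13 when using doubles. Without that flag nvcc targets a lower compute capability that has no double-precision support, so doubles in device code are demoted to float and no longer match the 8-byte doubles used on the host.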

Yep. That was it. Many thanks. Shame that it’s not actually documented anywhere except in the example makefiles!