Wonder if anyone can help with this.
The test rig - which I’ve attached - seems to indicate that there is a problem with the passing of doubles to and from the host and device.
Note that this code all works fine when run in emulation, and we see the same - wrong - results when running on either an NVS290 or a C1060.
You can see from the code that the double array is being set correctly to 3 on the device and then copied into the integer array, but when the double array is passed back to the host it is set to 0!
Similarly, if I try to pass a double into the kernel and use it, it always appears to be zero when used in the kernel code.
When I change the type from double to float it all works!
[font=“Courier New”]
=== Kernel =======================================
#define VARTYPE double

/*
 * Init1 - fill a VARTYPE array with 3 and mirror each element into an int array.
 *
 * Parameters:
 *   d_array  - device buffer of at least `size` VARTYPE elements (written).
 *   d_iArray - device buffer of at least `size` ints (written).
 *   ivalue   - unused by the kernel; kept so the launch signature is unchanged.
 *   value    - scalar passed by value; only used by the commented-out probe below.
 *   size     - number of elements to initialise.
 *
 * Launch layout: 1D grid of 1D blocks. Any grid with at least `size` threads
 * works; surplus threads are masked off by the bounds check.
 *
 * NOTE: when VARTYPE is double, this must be compiled for a target that
 * supports double precision (nvcc -arch=sm_13 or higher) and run on a device
 * of compute capability >= 1.3. For lower targets nvcc demotes double to
 * float in device code, so the kernel writes 4-byte floats into a buffer the
 * host reads as 8-byte doubles, and a double kernel argument is misread.
 */
__global__ void Init1(
    VARTYPE *d_array,
    int *d_iArray,
    int ivalue,
    VARTYPE value,
    int size
){
    // Flat global thread index for a 1D launch.
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard the grid tail: the launch may contain more threads than elements.
    if (tid < size)
    {
        // Literal is converted to VARTYPE so the same source works for float and double.
        d_array[tid] = (VARTYPE)3.0;

        // Read the value back on the device and store it as an int.
        d_iArray[tid] = (int)d_array[tid];

        // Alternative probe: store the by-value scalar argument instead.
        //d_iArray[tid] = (int)value;
    }
}
=== Main program ======================================
#define SIZE 1000
#define VARTYPE double

/*
 * Test rig: launches Init1 over SIZE elements, copies both device arrays
 * back to the host and prints them side by side.
 *
 * When VARTYPE is double the program must be built with a double-capable
 * target (nvcc -arch=sm_13 or higher) and run on a device of compute
 * capability >= 1.3. Otherwise device code works in float while the host
 * works in double, and the values read back are wrong. A warning is printed
 * at start-up when the selected device cannot support double precision.
 */
int main(int argc, char **argv){
    //'h_' prefix - CPU (host) memory space
    VARTYPE *h_results;
    int *h_Iresults;
    //'d_' prefix - GPU (device) memory space
    VARTYPE *d_vector;
    int *d_IVector;
    double gpuTime;
    unsigned int hTimer;
    int i;
    int device;
    cudaDeviceProp prop;
    // Launch configuration: enough 256-thread blocks to cover SIZE elements.
    const int threadsPerBlock = 256;
    const int blocks = (SIZE + threadsPerBlock - 1) / threadsPerBlock;

    CUT_DEVICE_INIT(argc, argv);

    // Warn (do not abort) when doubles are requested on a device below
    // compute capability 1.3, which has no double-precision hardware.
    CUDA_SAFE_CALL( cudaGetDevice(&device) );
    CUDA_SAFE_CALL( cudaGetDeviceProperties(&prop, device) );
    if (sizeof(VARTYPE) == sizeof(double) &&
        (prop.major < 1 || (prop.major == 1 && prop.minor < 3)))
    {
        fprintf(stderr,
                "Warning: device \"%s\" has compute capability %d.%d; "
                "double precision needs 1.3 or higher (build with -arch=sm_13).\n",
                prop.name, prop.major, prop.minor);
    }

    CUT_SAFE_CALL( cutCreateTimer(&hTimer) );

    h_results = (VARTYPE *)malloc(SIZE*sizeof(VARTYPE));
    h_Iresults = (int *)malloc(SIZE*sizeof(int));
    if (h_results == NULL || h_Iresults == NULL)
    {
        fprintf(stderr, "Host memory allocation failed.\n");
        free(h_results);
        free(h_Iresults);
        exit(EXIT_FAILURE);
    }

    CUDA_SAFE_CALL( cudaMalloc((void**)&d_vector, SIZE*sizeof(VARTYPE)) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_IVector, SIZE*sizeof(int)) );

    // Make sure the device is idle before the timed region starts.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutResetTimer(hTimer) );
    CUT_SAFE_CALL( cutStartTimer(hTimer) );

    Init1<<<blocks, threadsPerBlock>>>(
        d_vector,
        d_IVector,
        2,
        (VARTYPE)1.0,
        SIZE
    );
    CUT_CHECK_ERROR("execution failed\n");

    // Kernel launches are asynchronous; wait for completion before stopping the timer.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer(hTimer) );
    gpuTime = cutGetTimerValue(hTimer);
    printf("GPU time: %f msecs.\n", gpuTime);

    printf("Reading back GPU results...\n");
    //Read back GPU results to compare them to CPU results
    CUDA_SAFE_CALL( cudaMemcpy(h_results, d_vector, SIZE*sizeof(VARTYPE), cudaMemcpyDeviceToHost) );
    CUDA_SAFE_CALL( cudaMemcpy(h_Iresults, d_IVector, SIZE*sizeof(int), cudaMemcpyDeviceToHost) );

    for(i=0; i < SIZE; i++)
        printf("i=%d vi=%d vd=%.3f; ", i, h_Iresults[i], h_results[i]);
    printf("\n");

    printf("Shutting down...\n");
    printf("...releasing GPU memory.\n");
    CUDA_SAFE_CALL( cudaFree(d_vector) );
    CUDA_SAFE_CALL( cudaFree(d_IVector) );
    printf("...releasing CPU memory.\n");
    free(h_results);
    free(h_Iresults);
    CUT_SAFE_CALL( cutDeleteTimer(hTimer) );
    printf("Shutdown done.\n");
    CUT_EXIT(argc, argv);
}
[/font]