casting & deviceemu vs actual GPU problem with casting on actual GPU


I just tried to write a small test case for comparing the cast and add performance vs. simply adding a constant to a vector.

My code behaves as expected when run in deviceemu mode but not if run on the real card. I am using CUDA 2.2 on Suse 11.1 and a 9500 GT.

  1. Can someone spot the bug (it should be simply adding a constant)?

  2. Why do deviceemu and the actual GPU differ in this case? (This is actually the more interesting question.)



P.S. this is test code so not too nice :-).


Test case for comparing the performance impact of casting between double and float.





#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <time.h>

#include <cuda.h>


// code

using namespace std;

// Forward declaration only: no definition or call of Test appears in this listing.
void Test(int N, int b);

// Returns the double-precision sum a + b.
//
// Callable from both host and device code, so the same helper can be used to
// build a CPU reference value.
// NOTE: when compiled for a GPU target without double-precision support
// (compute capability below 1.3) the toolchain demotes device-side `double`
// to `float`, so the device version is then effectively a float add.
__host__ __device__ double addd(double a, double b)
{
    return a + b;
}


// Returns the single-precision sum a + b.
//
// Callable from both host and device code, so the same helper can be used to
// build a CPU reference value.
__host__ __device__ float addf(float a, float b)
{
    return a + b;
}


// Adds the scalar b to each of the first n elements of a and writes the sums
// to result, using double-precision addition (addd).
//
// Parameters:
//   a      - device pointer to at least n input doubles
//   b      - integer constant added to every element (converted to double)
//   result - device pointer to at least n output doubles
//   n      - number of elements to process
//
// Launch layout: 1D grid of 1D blocks, any size. The loop is grid-stride, so a
// <<<1, 1>>> launch walks all n elements sequentially in a single thread.
// No shared memory is used.
// Requires a target with double-precision support (compute capability 1.3+);
// otherwise `double` in device code is demoted to `float` and the 8-byte host
// doubles in a/result are misread.
__global__ void testKernel(double* a, int b, double* result, int n)
{
    const int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
    {
        result[i] = addd(a[i], b);

        // Debug output, usable in device-emulation builds only:
        // printf("%f result %f \n", a[i], result[i]);
    }
}



// Adds the scalar b to each of the first n elements of a in single precision:
// both operands are cast to float, summed with addf, and the float sum is
// stored in resultf and, cast back to double, in result.
//
// Parameters:
//   a       - device pointer to at least n input doubles
//   b       - integer constant added to every element (converted to float)
//   resultf - device pointer to at least n output floats
//   result  - device pointer to at least n output doubles
//   n       - number of elements to process
//
// Launch layout: 1D grid of 1D blocks, any size. The loop is grid-stride, so a
// <<<1, 1>>> launch walks all n elements sequentially in a single thread.
// No shared memory is used.
// Requires a target with double-precision support (compute capability 1.3+)
// for the double loads and stores to be real 8-byte accesses.
__global__ void testKernel_cast(double* a, int b, float* resultf, double* result, int n)
{
    const int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
    {
        // Keep the sum in a register so result[i] does not re-read global memory.
        const float sum = addf((float)a[i], (float)b);
        resultf[i] = sum;
        result[i] = (double)sum;

        // Debug output, usable in device-emulation builds only:
        // printf("%f result %f \n", a[i], resultf[i]);
    }
}



// Aborts with a readable message when a CUDA runtime call has failed.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Reports launch errors and blocks until all queued device work has finished.
// (On toolkits older than CUDA 4.0 the equivalent call is cudaThreadSynchronize.)
static void syncDevice(const char* what)
{
    checkCuda(cudaGetLastError(), what);
    checkCuda(cudaDeviceSynchronize(), what);
}

// Test driver: runs the plain double add kernel and the cast-to-float add
// kernel on the same input, times both, and prints every element where the two
// results differ, followed by the accumulated absolute difference.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main()
{
    // number of items per vector
    const int n = 10;

    // number of times each kernel is launched inside its timed loop
    const int factor = 1;

    // Double precision needs compute capability 1.3 or newer. On older devices
    // the double kernels cannot produce meaningful results, so say so up front.
    int device = 0;
    checkCuda(cudaGetDevice(&device), "cudaGetDevice");
    cudaDeviceProp prop;
    checkCuda(cudaGetDeviceProperties(&prop, device), "cudaGetDeviceProperties");
    if (prop.major == 1 && prop.minor < 3)
    {
        fprintf(stderr,
                "Warning: %s (compute capability %d.%d) has no double-precision support; "
                "results of the double kernels are not meaningful.\n",
                prop.name, prop.major, prop.minor);
    }

    // host variables
    double* array1_h = new double[n];
    for (int i = 0; i < n; i++)
    {
        array1_h[i] = i;
    }
    double* cpuResult = new double[n];
    float* cpuResultf = new float[n];

    // device variables
    double* result = 0;
    double* result1 = 0; // double output of the cast kernel; never copied back
    float* resultf = 0;
    double* array1_d = 0;

    // allocate & copy data to device
    checkCuda(cudaMalloc((void**) &array1_d, sizeof(double) * n), "cudaMalloc(array1_d)");
    checkCuda(cudaMemcpy(array1_d, array1_h, sizeof(double) * n, cudaMemcpyHostToDevice),
              "cudaMemcpy(array1_d)");
    checkCuda(cudaMalloc((void**) &result, sizeof(double) * n), "cudaMalloc(result)");
    checkCuda(cudaMalloc((void**) &result1, sizeof(double) * n), "cudaMalloc(result1)");
    checkCuda(cudaMalloc((void**) &resultf, sizeof(float) * n), "cudaMalloc(resultf)");

    clock_t start, end, diff[2];

    // run kernel 1: simply adding
    start = clock();
    for (int j = 0; j < factor; j++)
    {
        testKernel<<< 1, 1 >>>(array1_d, 2, result, n);
    }
    // Kernel launches are asynchronous: wait for completion before stopping
    // the clock, otherwise only the launch overhead is measured.
    syncDevice("testKernel");
    end = clock();
    diff[0] = (end - start);

    // run kernel 2: casting to float and back
    start = clock();
    for (int j = 0; j < factor; j++)
    {
        testKernel_cast<<< 1, 1 >>>(array1_d, 2, resultf, result1, n);
    }
    syncDevice("testKernel_cast");
    end = clock();
    diff[1] = (end - start);

    // copy data back
    checkCuda(cudaMemcpy(cpuResult, result, sizeof(double) * n, cudaMemcpyDeviceToHost),
              "cudaMemcpy(cpuResult)");
    checkCuda(cudaMemcpy(cpuResultf, resultf, sizeof(float) * n, cudaMemcpyDeviceToHost),
              "cudaMemcpy(cpuResultf)");

    // check for differences
    double dif = 0;
    for (int i = 0; i < n; i++)
    {
        // std::fabs keeps the comparison in floating point; an unqualified
        // abs() can bind to the integer overload and truncate the difference.
        const double delta = std::fabs(cpuResult[i] - cpuResultf[i]);
        dif += delta;
        if (delta > 0)
        {
            std::cout << i << " " << delta << " " << cpuResult[i] << " " << cpuResultf[i] << std::endl;
        }
    }

    printf("\n Differenz %f\n", dif);
    // Disabled: diff[0] can be 0 for such a small workload.
    //printf("\nTimeDiff %f", ((long double)diff[1]) / ((long double)diff[0]));

    // release device and host memory (arrays from new[] need delete[])
    checkCuda(cudaFree(array1_d), "cudaFree(array1_d)");
    checkCuda(cudaFree(result), "cudaFree(result)");
    checkCuda(cudaFree(result1), "cudaFree(result1)");
    checkCuda(cudaFree(resultf), "cudaFree(resultf)");
    delete[] array1_h;
    delete[] cpuResult;
    delete[] cpuResultf;

    return 0;
}


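Your card doesn’t support double precision. The 9500 GT is a compute capability 1.1 device, and double-precision arithmetic requires compute capability 1.3 or higher. When nvcc compiles for a target without double support it demotes every double in device code to float (with a warning), so the kernels read and write your 8-byte host doubles as if they were 4-byte floats. In deviceemu mode the kernels run on the CPU, where double really is a double, which is why the result is correct there.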

Ahhhhhhhhhhh thanks!