Casting & deviceemu vs. actual GPU: problem with casting on the actual GPU

Hi,

I just tried to write a small test case for comparing the cast and add performance vs. simply adding a constant to a vector.

My code behaves as expected when run in deviceemu mode but not if run on the real card. I am using CUDA 2.2 on Suse 11.1 and a 9500 GT.

  1. Can someone spot the bug (it should be simply adding a constant)?

  2. Why do deviceemu and the actual GPU differ in this case (actually the more interesting question)?

Thanks

joerg

P.S. this is test code so not too nice :-).

[codebox]/*

Test Case for comparing the performance impact of double float cast.

*/

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include <iostream>

#include <time.h>

#include <cuda.h>

#include <cuda_runtime.h>

// code

// Makes the std names used further down (cout, endl) visible unqualified.
using namespace std;

// Forward declaration only; no definition or call of Test appears in this listing.
void Test(int N, int b);

// Device helper: double-precision sum of a and b.
// The `__device__` qualifier is required for this function to be callable
// from a kernel; the bare word `device` is not valid CUDA.
__device__ double addd(double a, double b)
{
    return a + b;
}

// Device helper: single-precision sum of a and b.
// The `__device__` qualifier is required for this function to be callable
// from a kernel; the bare word `device` is not valid CUDA.
__device__ float addf(float a, float b)
{
    return a + b;
}

// Kernel: result[i] = a[i] + b for every i in [0, n), computed in double.
//
// a      - device pointer to n input doubles
// b      - integer constant added to every element (converted to double)
// result - device pointer to n output doubles
// n      - number of elements
//
// Written as a grid-stride loop: with the <<<1, 1>>> launch used by this test
// the single thread walks all n elements exactly as before, while larger
// launches split the work instead of having every thread redo all of it.
//
// NOTE: double precision requires compute capability 1.3 or newer and a
// matching -arch flag; see the CUDA programming guide for how older targets
// treat `double`.
__global__ void testKernel(double* a, int b, double* result, int n)
{
    const int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
    {
        result[i] = addd(a[i], (double)b);
        //only emu
        //printf("%f result %f \n",a[i], result[i]);
    }
}

// Kernel: same addition as testKernel, but performed in single precision.
// Each input is cast to float, added to (float)b, stored in resultf, and the
// float sum is also widened back to double and stored in result.
//
// a       - device pointer to n input doubles
// b       - integer constant added to every element (converted to float)
// resultf - device pointer to n output floats (the single-precision sums)
// result  - device pointer to n output doubles (the sums cast back to double)
// n       - number of elements
//
// Written as a grid-stride loop: with the <<<1, 1>>> launch used by this test
// the single thread walks all n elements exactly as before.
//
// NOTE: double precision requires compute capability 1.3 or newer and a
// matching -arch flag; see the CUDA programming guide for how older targets
// treat `double`.
__global__ void testKernel_cast(double* a, int b, float* resultf, double* result, int n)
{
    const int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
    {
        // Keep the sum in a register so it is not re-read from global memory.
        const float sum = addf((float)a[i], (float)b);
        resultf[i] = sum;
        result[i] = (double)sum;
        //only emu
        //printf("%f result %f \n",a[i], resultf[i]);
    }
}

// Perform time-consuming operation

// Aborts the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Test driver: fills a vector with 0..n-1, runs the double-precision add
// kernel and the cast-to-float add kernel (each `factor` times, timed with
// clock()), copies both results back and prints every element where they
// differ plus the accumulated absolute difference.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main()
{
    //number of items per vector
    const int n = 10;
    //number of iterations to perform each loop
    const int factor = 1;

    const size_t doubleBytes = sizeof(double) * n;
    const size_t floatBytes = sizeof(float) * n;

    // The kernels work on `double`, which needs compute capability 1.3 or
    // newer. Warn (but keep going) when the active device is older, because
    // the device results are then not trustworthy.
    int device = 0;
    cudaDeviceProp prop;
    checkCuda(cudaGetDevice(&device), "cudaGetDevice");
    checkCuda(cudaGetDeviceProperties(&prop, device), "cudaGetDeviceProperties");
    if (prop.major < 1 || (prop.major == 1 && prop.minor < 3))
    {
        fprintf(stderr,
                "Warning: device '%s' (compute capability %d.%d) does not support "
                "double precision; the double results will not be valid.\n",
                prop.name, prop.major, prop.minor);
    }

    //host variables
    double* array1_h = new double[n];
    double* cpuResult = new double[n];
    float* cpuResultf = new float[n];

    //initialize
    for (int i = 0; i < n; i++)
    {
        array1_h[i] = i;
    }

    //device variables
    double* result = 0;
    double* result1 = 0; //dummy for cast
    float* resultf = 0;
    double* array1_d = 0;

    //allocate & copy data to device
    checkCuda(cudaMalloc((void**) &array1_d, doubleBytes), "cudaMalloc(array1_d)");
    checkCuda(cudaMemcpy(array1_d, array1_h, doubleBytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(array1_d)");
    checkCuda(cudaMalloc((void**) &result, doubleBytes), "cudaMalloc(result)");
    checkCuda(cudaMalloc((void**) &result1, doubleBytes), "cudaMalloc(result1)");
    checkCuda(cudaMalloc((void**) &resultf, floatBytes), "cudaMalloc(resultf)");

    clock_t start, end, diff[2];

    //run kernel1 simply adding
    start = clock();
    for (int j = 0; j < factor; j++)
    {
        testKernel<<< 1, 1 >>>(array1_d, 2, result, n);
        // Launch errors are only visible through cudaGetLastError();
        // execution errors surface at the synchronizing call.
        checkCuda(cudaGetLastError(), "testKernel launch");
        checkCuda(cudaThreadSynchronize(), "testKernel execution");
    }
    end = clock();
    diff[0] = (end - start);

    //run kernel 2 with casting to float and back
    start = clock();
    for (int j = 0; j < factor; j++)
    {
        testKernel_cast<<< 1, 1 >>>(array1_d, 2, resultf, result1, n);
        checkCuda(cudaGetLastError(), "testKernel_cast launch");
        checkCuda(cudaThreadSynchronize(), "testKernel_cast execution");
    }
    end = clock();
    diff[1] = (end - start);

    //copy data back (cudaMemcpy blocks until the copy has completed)
    checkCuda(cudaMemcpy(cpuResult, result, doubleBytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(cpuResult)");
    checkCuda(cudaMemcpy(cpuResultf, resultf, floatBytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(cpuResultf)");

    //check for differences
    double dif = 0;
    for (int i = 0; i < n; i++)
    {
        // fabs, not abs: an integer abs overload would truncate the difference.
        const double delta = fabs(cpuResult[i] - cpuResultf[i]);
        dif += delta;
        if (delta > 0)
        {
            std::cout << i << " " << delta << " " << cpuResult[i] << " " << cpuResultf[i] << std::endl;
        }
    }

    printf("\n Differenz %f\n", dif);
    //printf("\nTimeDiff %f", ((long double)diff[1])/ ((long double)diff[0] ) );
    (void)diff; // only read by the commented-out timing output above

    // Release every device and host buffer allocated above.
    checkCuda(cudaFree(result), "cudaFree(result)");
    checkCuda(cudaFree(result1), "cudaFree(result1)");
    checkCuda(cudaFree(resultf), "cudaFree(resultf)");
    checkCuda(cudaFree(array1_d), "cudaFree(array1_d)");

    delete[] array1_h;
    delete[] cpuResult;
    delete[] cpuResultf;

    return 0;
}[/codebox]

Your card doesn’t support double precision.

Ahhhhhhhhhhh thanks!