Hi,
I just tried to write a small test case for comparing the cast and add performance vs. simply adding a constant to a vector.
My code behaves as expected when run in deviceemu mode but not if run on the real card. I am using CUDA 2.2 on Suse 11.1 and a 9500 GT.
-
Can someone spot the bug (it should be simply adding a constant)?
-
Why do deviceemu and actual GPU differ in this case (the actually more interesting question).
Thanks
joerg
P.S. this is test code so not too nice :-).
[codebox]/*
Test Case for comparing the performance impact of double float cast.
*/
#include <time.h>
#include <cuda.h>
// code
using namespace std;
void Test(int N, int b);
device double addd(double a, double b)
{
return a + b;
}
device float addf(float a, float b)
{
return a + b;
}
global void testKernel(double* a, int b, double* result,int n)
{
for (int i=0; i< n; i++)
{
result[i] = addd(a[i], b);
//only emu
//printf("%f result %f \n",a[i], result[i]);
}
}
global void testKernel_cast(double* a, int b, float* resultf, double* result,int n)
{
for (int i=0; i< n; i++)
{
resultf[i] = addf((float)a[i], (float)b);
result[i] = (double)resultf[i];
//only emu
//printf("%f result %f \n",a[i], resultf[i]);
}
}
// Perform time-consuming operation
int main()
{
//number of items per vector
int n=10;
//number of iterations to perform each loop
int factor =1;
//host variables
double* array1_h = new double[n];
//initialize
for (int i=0; i<n; i++)
{
array1_h[i] = i;
}
double* cpuResult = new double[n];
float* cpuResultf = new float[n];
//device variables
double* result;
double* result1; //dummy for cast
float* resultf;
double* array1_d;
//allocate & copy data to device
cudaMalloc((void**) &array1_d, sizeof(double)*n);
cudaMemcpy(array1_d, array1_h, sizeof(double)*n, cudaMemcpyHostToDevice);
cudaMalloc((void**) &result, sizeof(double)*n);
cudaMalloc((void**) &result1, sizeof(double)*n);
cudaMalloc((void**) &resultf, sizeof(float)*n);
clock_t start, end, diff[2];
//run kernel1 simply adding
start = clock();
for (int j=0; j<factor; j++)
{
testKernel<<< 1, 1 >>>( array1_d, 2, result,n);
cudaThreadSynchronize();
}
end = clock();
diff[0]= (end-start);
//run kernel 2 with casting to float and back
start = clock();
for (int j=0; j<factor; j++)
{
testKernel_cast<<< 1, 1 >>>( array1_d, 2, resultf,result1, n);
cudaThreadSynchronize();
}
end = clock();
diff[1]= (end-start);
//copy data back
cudaMemcpy(cpuResult, result, sizeof(double)*n, cudaMemcpyDeviceToHost);
cudaMemcpy(cpuResultf, resultf, sizeof(float)*n, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
//check for differences
double dif=0;
for (int i=0; i<n; i++)
{
dif+= abs(cpuResult[i]- cpuResultf[i]);
if (abs(cpuResult[i]- cpuResultf[i]) >0)
cout <<i <<" "<<abs(cpuResult[i]- cpuResultf[i])<<" "<<cpuResult[i]<<" "<<cpuResultf[i] <<endl;
}
printf(“\n Differenz %f\n”, dif);
//printf(“\nTimeDiff %f”, ((long double)diff[1])/ ((long double)diff[0] ) );
cudaFree(result);
cudaFree(resultf);
cudaFree(array1_d);
delete array1_h;
return 0;
}[/codebox]