Hi,
I am developing a CUDA program to run a simulation, and in addition I am running similar code on the host computer in order to compare performance and results. The kernel is run repeatedly to effectively execute a simple multiply-and-accumulate (MAC).
For the case where the MAC starts at 0.0, there is an error between the host and device calculations which disappears after 5 iterations.
D = device result, H = host result, E = error (D - H)
D0.000000000000000000 H0.000000000000000000 E0.000000000000000000
D0.000945000094361603 H0.000945000094361603 E0.000000000000000000
D0.001890000188723207 H0.001890000188723207 E0.000000000000000000
D0.002835000166669488 H0.002835000399500132 E-0.000000000232830644
D0.003780000144615769 H0.003780000377446413 E-0.000000000232830644
D0.004725000355392694 H0.004725000355392694 E0.000000000000000000
D0.005670000333338976 H0.005670000333338976 E0.000000000000000000
For cases with large initial values there is no error, but whenever the initial value is 0.0 or small (1e-4), a similar error is present.
Is this a normal error for single-precision calculations? I would have expected the rounding errors to accumulate and cause the two calculations to diverge, rather than the error disappearing again.
In addition, when I change the code to execute all the calculations in a single invocation of the kernel, the error disappears.
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#define N 32
#define N2 (N*N)
/*
 * testKernel - device-side accumulate step for one row of N floats.
 *
 * Each thread handles one column: it reads the element N floats before its
 * own slot (the same column of the preceding row) and stores that value
 * plus p[0]*p[1] into its own slot.
 *
 * Launch layout: the thread index is taken from threadIdx.x only, so the
 * kernel expects a single 1-D block with one thread per column (N threads).
 *
 * Preconditions:
 *   - data must point at least N floats past the start of the device
 *     allocation, because data[i-N] is read.
 *   - p must point to two floats in device memory: p[0] = w, p[1] = dt.
 *
 * NOTE(review): the compiler may merge w*dt + data[i-N] into a single
 * multiply-add instruction whose rounding can differ from a separately
 * rounded multiply followed by an add on the host. If bit-exact agreement
 * with the host reference is required, __fadd_rn(__fmul_rn(w, dt), ...)
 * prevents that merging - confirm against the target architecture.
 */
__global__ void testKernel(float *data,float *p)
{
    int i = threadIdx.x;      /* column handled by this thread */
    float w = p[0];
    float dt = p[1];
    data[i] = w*dt + data[i-N];   /* previous row's value plus one step */
}
/*
 * HtestKernel - host reference for one row of N floats.
 *
 * For every column of the row starting at `data`, stores the value found
 * N floats earlier (same column, preceding row) plus p[0]*p[1].
 *
 * data : start of the row to write; data[-N] .. data[-1] must be readable.
 * p    : two floats, p[0] = w and p[1] = dt.
 */
void HtestKernel(float *data, float *p)
{
    float w = p[0];
    float dt = p[1];
    float *prev = data - N;    /* same column, one row back */
    float *cur = data;
    float *rowEnd = data + N;

    while (cur != rowEnd)
    {
        *cur = w*dt + *prev;
        ++cur;
        ++prev;
    }
}
/* Abort with file/line and the runtime's message when a CUDA call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Runs the same row-by-row accumulate on the device (testKernel) and on the
 * host (HtestKernel) over an N x N float grid, then prints the first column
 * of every row as device value, host value and their difference.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA runtime call fails.
 */
int main()
{
    const size_t paramBytes = sizeof(float)*2;
    const size_t dataBytes = sizeof(float)*N2;
    int i;

    /* Host buffers: parameters, host-computed grid, copy of the device grid. */
    float *Hp = (float*)malloc(paramBytes);
    float *Hdata = (float*)malloc(dataBytes);
    float *DHdata = (float*)malloc(dataBytes);
    if (Hp == NULL || Hdata == NULL || DHdata == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(Hp);
        free(Hdata);
        free(DHdata);
        return EXIT_FAILURE;
    }

    Hp[0] = 12.6f;        /* w  */
    Hp[1] = 0.000075f;    /* dt */

    /* Every cell starts at the same value; row 0 is never rewritten. */
    for(i = 0; i < N2; i++)
    {
        Hdata[i] = 0.0001f;
    }

    /* Device copies of the parameters and of the initial grid. */
    float *Dp = NULL;
    float *Ddata = NULL;
    CUDA_CHECK(cudaMalloc((void**)&Dp, paramBytes));
    CUDA_CHECK(cudaMalloc((void**)&Ddata, dataBytes));
    CUDA_CHECK(cudaMemcpy((void*)Dp, (void*)Hp, paramBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy((void*)Ddata, (void*)Hdata, dataBytes, cudaMemcpyHostToDevice));

    dim3 grid(1);
    dim3 block(N);

    /* Start at row 1: each step reads the row before the one it writes. */
    for(i = 1; i < N; i++)
    {
        testKernel<<<grid,block>>>(Ddata+i*N,Dp);
        CUDA_CHECK(cudaGetLastError());          /* launch-configuration errors */
        /* Deprecated in CUDA 4.0+ in favour of cudaDeviceSynchronize(); kept
           so the file still builds with the toolkit it was written for. */
        CUDA_CHECK(cudaThreadSynchronize());     /* execution errors */
        HtestKernel(Hdata+i*N,Hp);
    }

    CUDA_CHECK(cudaMemcpy((void*)DHdata, (void*)Ddata, dataBytes, cudaMemcpyDeviceToHost));

    /* One line per row: column 0 of the device grid, host grid, difference. */
    for(i = 0; i < N2; i += N)
    {
        printf("D%1.18f H%1.18f E%1.18f\n",DHdata[i],Hdata[i],DHdata[i]-Hdata[i]);
    }

    CUDA_CHECK(cudaFree(Ddata));
    CUDA_CHECK(cudaFree(Dp));
    free(DHdata);
    free(Hdata);
    free(Hp);
    return 0;
}
[/codebox]
Thanks,
Josh