Hi,

I am developing a cuda program to run simulation and in addition I am running similar code on the host computer in order to compare performance and results. The kernel is run repeatedly to effectively execute a simple multiply and accumulate.

For the case where the MAC starts at 0.0 there is an error between the host and device calculations which disappears after 5 iteration.

D=Device result, H = Host result, E = error

D0.000000000000000000 H0.000000000000000000 E0.000000000000000000

D0.000945000094361603 H0.000945000094361603 E0.000000000000000000

D0.001890000188723207 H0.001890000188723207 E0.000000000000000000

D0.002835000166669488 H0.002835000399500132 E-0.000000000232830644

D0.003780000144615769 H0.003780000377446413 E-0.000000000232830644

D0.004725000355392694 H0.004725000355392694 E0.000000000000000000

D0.005670000333338976 H0.005670000333338976 E0.000000000000000000

For cases with large initial values there is no error but whenever the initial value is 0.0 or small (1e-4) a similar error is present.

Is this normal error for single precision calculations? I would have expected the rounding errors to accumulate and cause divergence of the two calculations?

In addition when I change the code to execute all the calculations as a single invocation of the kernel the error disappears.

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <cutil.h>

#define N 32

#define N2 (N*N)

**global** void testKernel(float *data,float *p)

{

int i = threadIdx.x;

float w = p[0];

float dt = p[1];

data[i] = w*dt + data[i-N];

}

void HtestKernel(float *data, float *p)

{

int i;

float w = p[0];

float dt = p[1];

for(i = 0; i < N; i++)

```
{
data[i] = w*dt + data[i-N];
}
```

}

int main()

{

float *Hp = (float*)malloc(sizeof(float)*2);

Hp[0] = 12.6f;

Hp[1] = 0.000075f;

float *Dp;

cudaMalloc((void**)&Dp, sizeof(float)*2);

cudaMemcpy((void*)Dp,(void*)Hp,sizeof(float)*2,cudaMemcpyHos

tToDevice);

int i;

float *Hdata = (float*)malloc(sizeof(float)*N2);

for(i = 0; i < N2; i++)

```
{
Hdata[i] = 0.0001;
}
```

float *Ddata,*DHdata = (float*)malloc(sizeof(float)*N2);

cudaMalloc((void**)&Ddata,sizeof(float)*N2);

cudaMemcpy((void*)Ddata,(void*)Hdata,sizeof(float)*N2,cudaMe

mcpyHostToDevice);

dim3 grid(1);

dim3 block(N);

for(i = 1; i < N; i++)

```
{
testKernel<<<grid,block>>>(Ddata+i*N,Dp);
cudaThreadSynchronize();
HtestKernel(Hdata+i*N,Hp);
}
```

cudaMemcpy((void*)DHdata, (void*)Ddata,sizeof(float)*N2,cudaMemcpyDeviceToHost);

for(i = 0; i < N2; i += N)

```
{
printf("D%1.18f H%1.18f E%1.18f\n",DHdata[i],Hdata[i],DHdata[i]-Hdata[i]);
}
```

return 0;

}

[/codebox]

Thanks,

Josh