I would like to have some clarifications on GPU computing accuracy and determinism.

Basically, I’m trying to implement a 1-D Fourier transform by a cublasCgemm routine call and to compile it as a Matlab mex function. So, I have two kernels: one to calculate the vector to be transformed and one to calculate the Fourier kernel matrix. Then, I have the mex function calling the two kernels and the cublasCgemm function.

Here is the excitation filling kernel.

[codebox]// *******************************

// * KERNEL - EXCITATION FILLING *

// *******************************

**global** void Excitation_Filling(float *d_X, float *d_Y, float *d_Z, float d_alfa, float d_z0, float *d_x_terzo_vett, float *d_y_terzo_vett, float *d_z_terzo_vett, float *d_r_vett, float *d_theta_vett, float* d_fase_vett, cuFloatComplex *d_campo_feed, int dimx, int dimy, float beta, float mfatt)

{

```
/* Evaluates the thread indices */
int idx = __umul24(blockIdx.x,BLOCK_SIZE_X) + threadIdx.x;
int idy = __umul24(blockIdx.y,BLOCK_SIZE_Y) + threadIdx.y;
```

/* Evaluates the matrix filling index */

```
int id_vett = idx*dimx+idy;
```

if (idx < dimx && idy < dimy) {

d_x_terzo_vett[id_vett]=-d_X[id_vett];

```
d_y_terzo_vett[id_vett]=__fadd_rn(__fmul_rn(d_Y[id_vett],cos
```

f(d_alfa)),__fmul_rn(__fadd_rn(d_Z[id_vett],-(d_z0)),sinf(d_alfa)));

```
d_z_terzo_vett[id_vett]=__fadd_rn(__fmul_rn(d_Y[id_vett],sin
```

f(d_alfa)),-__fmul_rn(__fadd_rn(d_Z[id_vett],-(d_z0)),cosf(d_alfa)));

d_r_vett[id_vett]=sqrtf(_*fmul_rn(d_x_terzo_vett[id_vett],d*

x_terzo_vett[id_vett])+_*fmul_rn(d_y_terzo_vett[id_vett],d_y*

terzo_vett[id_vett])+__fmul_rn(d_z_terzo_vett[id_vett],d_z_te

rzo_vett[id_vett]));

```
d_theta_vett[id_vett]=asinf(__fdividef(sqrtf(__fadd_rn(__fmu
```

l_rn(d_x_terzo_vett[id_vett],d_x_terzo_vett[id_vett]),

```
__fmul_rn(d_y_terzo_vett[id_vett],d_y_terzo_vett[id_vett])))
```

,d_r_vett[id_vett]));

d_campo_feed[id_vett]=make_cuFloatComplex(cosf(__fmul_rn(bet

a,d_r_vett[id_vett])),-sinf(__fmul_rn(beta,d_r_vett[id_vett])));

```
d_campo_feed[id_vett]=cuCdivf(d_campo_feed[id_vett],make_cuF
```

loatComplex(d_r_vett[id_vett],0.0f));

```
d_campo_feed[id_vett]=cuCmulf(d_campo_feed[id_vett],make_cuF
```

loatComplex(powf(cosf(d_theta_vett[id_vett]),mfatt),0.0f));

d_campo_feed[id_vett]=cuCmulf(d_campo_feed[id_vett],make_cuF

loatComplex(cosf(d_fase_vett[id_vett]),sinf(d_fase_vett[id_ve

tt])));

}

}

[/codebox]

Here is the Fourier kernel matrix filling kernel

[codebox]// **********************************

// * KERNEL - KERNEL MATRIX FILLING *

// **********************************

**global** void Kernel_Matrix_Filling(float *d_X, float *d_Y, float *d_Z, float *d_u_vett, float *d_v_vett, cuFloatComplex *d_Kernel_Matrix,

```
int dimx, int dimy, int dimu)
```

{

```
/* Evaluates the thread indices */
int idx = __umul24(blockIdx.x,BLOCK_SIZE_X) + threadIdx.x;
int idy = __umul24(blockIdx.y,BLOCK_SIZE_Y) + threadIdx.y;
```

/* Evaluates the matrix filling index */

```
int id_vett = idx*dimu+idy;
```

if (idx < dimx*dimy && idy < dimu) {

d_Kernel_Matrix[id_vett]=make_cuFloatComplex(cosf(__fadd_rn(

__fmul_rn(d_u_vett[idy],d_X[idx]),__fmul_rn(d_v_vett[idy],d_Y

[idx]))),

```
sinf(__fadd_rn(__fmul_rn(d_u_vett[idy],d_X[idx]),__fmul_rn(d
```

_v_vett[idy],d_Y[idx]))));

}

}

[/codebox]

Since the beginning, I had accuracy issues, which were mitigated by use of __fadd_rn and __fmul_rn. However, the final accuracy I get is 1e-3 or 1e-4, which is not enough for my application. I’m using single precision since I have a GT 230M card.

What could be the source of such large errors ? Use of cos/sin/sqrt functions ? Synchronization ?

Strangely, the result changes when I change blocksize and dimgrid, the faster the algorithm runs, the poorer the result is. I have separately tested the cublasCgemm routine call and checked it returns with a satisfactory accuracy.