Hi all,

I'm implementing the following code, which calculates the parameter d_epsilon 4 times (blockDim.y) and repeats the same calculation 200 times (gridDim.y).

```
/**
 * Fills d_epsilon with one complex value per (data sample, y-thread, y-block).
 *
 * For every sample `index` and every `threadIdx.y` the kernel evaluates
 *
 *   term1 = 2 * pi * d_thickness[threadIdx.y] * cos(d_phi[src]) / cFactor
 *   term2 = d_complex_skal_mult(term1 * d_f[index], (0, 1))
 *   value = d_complex_exp(d_complex_mult(term2, d_nComplex[src]))
 *
 * with src = index + numData * threadIdx.y.
 * The d_complex_* helpers are defined elsewhere; presumably scalar*complex,
 * complex*complex and complex exponential -- confirm against their definitions.
 *
 * Launch layout, as implied by the indexing below:
 *   - x: one thread per data sample; threads with index >= numData do nothing.
 *   - threadIdx.y: selects d_thickness[threadIdx.y] and the numData-sized
 *     slice of d_phi / d_nComplex starting at numData * threadIdx.y.
 *   - blockIdx.y: selects the output group. No input index depends on
 *     blockIdx.y, so every group receives the same values.
 *
 * Output layout:
 *   d_epsilon[index + numData * (threadIdx.y + blockIdx.y * blockDim.y)]
 * i.e. consecutive numData-sized slices of d_epsilon cycle through
 * threadIdx.y = 0 .. blockDim.y-1 before moving on to the next blockIdx.y.
 * d_epsilon therefore has to hold numData * blockDim.y * gridDim.y elements.
 *
 * No shared memory is used and no thread reads another thread's result,
 * so no block-level barrier is required.
 *
 * NOTE(review): numLayer is not used; d_thickness[threadIdx.y] is not
 * bounds-checked against it. Presumably blockDim.y must not exceed the number
 * of thickness entries -- verify against the caller.
 */
__global__ void makeEpsilonKernel( float *d_f, float* d_phi, float *d_thickness,
float2 *d_nComplex, float2 *d_epsilon, int numData, int numLayer)
{
    // Single-precision literals keep the whole computation in float
    // instead of silently promoting it to double.
    const float2 cImagOne = make_float2(0.0f, 1.0f);
    const float pi = 3.14159265358979f;
    // NOTE(review): 299.789 may be intended as the speed of light (299.792...) -- confirm.
    const float cFactor = 299.789f;

    const int ty = threadIdx.y;
    const int index = (blockIdx.x * blockDim.x) + threadIdx.x;

    // Grid tail: the x dimension is rounded up, so surplus threads exit here.
    if (index >= numData)
    {
        return;
    }

    // Input slice depends on threadIdx.y only.
    const int srcIdx = index + (numData * ty);
    // Output slice additionally depends on blockIdx.y.
    const int dstIdx = index + (numData * ty) + blockIdx.y * blockDim.y * numData;

    const float term1 = (2.0f * pi * d_thickness[ty] * cosf(d_phi[srcIdx])) / cFactor;
    const float2 term2 = d_complex_skal_mult(term1 * d_f[index], cImagOne);

    d_epsilon[dstIdx] = d_complex_exp(d_complex_mult(term2, d_nComplex[srcIdx]));
}
```

```
/**
 * Host-side launcher for makeEpsilonKernel.
 *
 * Launch configuration:
 *   block = (blockSize, 4, 1)
 *   grid  = (iDivUp(numData, blockSize), 200, 1)
 *
 * d_phi, d_nComplex and d_epsilon are not parameters of this function; they
 * have to be visible from an enclosing scope that is not shown here.
 *
 * After the launch, the result of cudaPeekAtLastError() and then of
 * cudaDeviceSynchronize() is handed to gpuErrchk (defined elsewhere), so the
 * function does not return before the kernel has finished.
 *
 * @param blockSize    number of threads per block in x
 * @param d_f          forwarded to the kernel unchanged
 * @param d_thickness  forwarded to the kernel unchanged
 * @param numData      number of data samples covered by the x dimension
 * @param numLayer     forwarded to the kernel unchanged
 */
extern "C" void makeEpsilon( int blockSize, float *d_f,
float *d_thickness, int numData, int numLayer)
{
    // Fixed extents of the y dimension of the block and of the grid.
    const int yThreadsPerBlock = 4;
    const int yBlocksInGrid = 200;

    dim3 blockShape(blockSize, yThreadsPerBlock, 1);
    dim3 gridShape(iDivUp(numData, blockShape.x), yBlocksInGrid, 1);

    makeEpsilonKernel<<<gridShape, blockShape>>>(
        d_f, d_phi, d_thickness, d_nComplex, d_epsilon, numData, numLayer);

    // First the launch status, then the status of the finished kernel.
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
```

The first calculation is correct. The following ones give wrong results.

This means:

blockIdx.y = 0 : correct results

blockIdx.y = 1 : wrong results

blockIdx.y = 2 : wrong results

blockIdx.y = 3 : wrong results

blockIdx.y = 4 : correct results

The same pattern repeats across all 200 calculations: only every fourth result is correct.

Could this be a synchronization error?

I’m grateful for any help. Many thanks in advance.

Sorry for my English :-)

