I have written a a couple of kernels in my project of implementing subdivision using CUDA. The problem that I am seing is that while in emulation mode I get a total difference in the vertices between a regular CPU implementation and my CUDA implementation in the order of 10^-5 which I am happy with. But, when running in device mode, this error climbs to 10^-2 which is quite a lot to say the least! The code for the kernels that calculate new vertices is is given under.

Does anyone see something that could cause this strange behaviour? Any help is appreciated.

```
__global__
void vertTransKernelV2(float4* vertsIn, float4* vertsOut, uint2** neighList, bool cpy)
{
int index = blockIdx.x*blockDim.x+threadIdx.x;
float4 v = vertsIn[index];
uint2* neighs = getNeighList(index, neighList, cpy);
int n = neighs[0].y;
float4 sumNeigh = make_float4(0.0, 0.0, 0.0, 0.0);
for(int i = 1; i <= n; i++)
sumNeigh = sumNeigh+vertsIn[neighs[i].y];
float alpha = calcAlpha((float)n);
float4 val = v*(1.0-alpha)+sumNeigh*(alpha*(1.0/(float)n));
val.w = -1.0;
vertsOut[index] = val;
}
__device__
float calcAlpha(float numNeigh)
{
return (1.0/9.0)*(4.0-2.0*__cosf((2.0*M_PI)/numNeigh));
}
__global__
void vertGenKernelV2(float4* vertsIn, float4* vertsOut, uint4* indsIn, uint4* indsOut, int lastIndex, uint2** neighList)
{
int index = blockIdx.x*blockDim.x+threadIdx.x;
uint4 currTri = indsIn[index];
float4 v1 = vertsIn[currTri.x];
float4 v2 = vertsIn[currTri.y];
float4 v3 = vertsIn[currTri.z];
float4 centroid = (v1+v2+v3)/3.0;
centroid.w = 0.0;
int vertOutIndex = lastIndex+index;
vertsOut[vertOutIndex] = centroid;
}
```