Hi!

I’ve written a kernel that does some calculations. At a certain point, my kernel calls the following function:

```
// Kernel sketch (parameters and most of the body are elided in this listing).
// It declares a per-thread result buffer and passes it to xcorr() to be filled.
__global__ void mykern(....)
{
......
// One array per thread: 1536 float2 values at 8 bytes each = 12 KB per thread.
// NOTE(review): an automatic array this large presumably cannot live in
// registers — check the per-thread local-memory/stack limit of the target GPU.
float2 r[N3CELL2]; // N3CELL2 = 1536
......
// NOTE(review): in this listing xcorr() is defined after the kernel; a
// prototype must be visible before this call — confirm one exists.
// x and y are declared in the elided part of the kernel.
xcorr(x,y,r);
.......
}
/*
 * Returns sample j of y as if y were zero-padded to N3CELL entries:
 * y[j] for j < NCELL_DEF, (0, 0) otherwise. Replaces a per-thread
 * N3CELL-element scratch copy of y.
 */
static __device__ __forceinline__ float2 xcorr_padded_y(const float2 *y, int j)
{
    float2 v;
    if (j < NCELL_DEF) {          // NCELL_DEF = 256
        v = y[j];
    } else {
        v.x = 0.0f;
        v.y = 0.0f;
    }
    return v;
}

/*
 * Cross-correlation series of x against y (zero-padded to N3CELL samples),
 * normalised by sqrt(sum((x-mx)^2) * sum((y-my)^2)).
 *
 * x : input, read at indices 0 .. N3CELL-1.
 * y : input, read at indices 0 .. NCELL_DEF-1 (treated as zero beyond that).
 * r : output, written at indices 0 .. 2*N3CELL-1; r[d + N3CELL] holds the
 *     coefficient for delay d, with d running from -N3CELL to N3CELL-1.
 *
 * c_add / c_sub / c_mul / c_div / c_sqrt are defined elsewhere; presumably
 * they implement complex arithmetic on float2 (x = real, y = imaginary) —
 * confirm against their definitions.
 *
 * Every thread that calls this function performs the whole O(N3CELL^2)
 * computation on its own.
 */
__device__ void xcorr(float2 *x, float2 *y, float2 *r)
{
    const int n = N3CELL;         // N3CELL = 768
    const int maxdelay = N3CELL;
    int delay, i, lo, hi;
    float2 nf2, mx, my, sx, sy, sxy, denom, partSubx, partSuby;

    /* Calculate the mean of the two series x[], y[] (y zero-padded). */
    mx.x = 0.0f;
    mx.y = 0.0f;
    my.x = 0.0f;
    my.y = 0.0f;
    for (i = 0; i < n; ++i) {
        mx = c_add(mx, x[i]);
        my = c_add(my, xcorr_padded_y(y, i));
    }
    nf2.x = (float)n;
    nf2.y = 0.0f;
    mx = c_div(mx, nf2);
    my = c_div(my, nf2);

    /* Calculate the denominator. */
    sx.x = 0.0f;
    sx.y = 0.0f;
    sy.x = 0.0f;
    sy.y = 0.0f;
    for (i = 0; i < n; ++i) {
        partSubx = c_sub(x[i], mx);
        partSuby = c_sub(xcorr_padded_y(y, i), my);
        sx = c_add(sx, c_mul(partSubx, partSubx));
        sy = c_add(sy, c_mul(partSuby, partSuby));
    }
    // NOTE(review): if c_mul(sx, sy) is zero (e.g. a constant series), every
    // division below has a zero divisor; what c_div returns in that case
    // depends on its definition, which is not visible here.
    denom = c_sqrt(c_mul(sx, sy));

    /* Calculate the correlation series. */
    for (delay = -maxdelay; delay < maxdelay; ++delay) {
        // Only i with 0 <= i + delay < n contributes; compute that range once
        // instead of testing every i. For delay == -maxdelay the range is empty.
        lo = (delay < 0) ? -delay : 0;
        hi = (delay > 0) ? n - delay : n;

        sxy.x = 0.0f;
        sxy.y = 0.0f;
        for (i = lo; i < hi; ++i) {
            sxy = c_add(sxy, c_mul(c_sub(x[i], mx),
                                   c_sub(xcorr_padded_y(y, i + delay), my)));
        }

        /* r[delay + maxdelay] is the correlation coefficient at "delay". */
        r[delay + maxdelay] = c_div(sxy, denom);
    }
}
```

The problem is that it returns an array in which every value is NaN. This is strange, because the same function, written in plain C in another program, works very well. I’ve also tried launching my kernel with only one thread, but the result doesn’t change. I don’t know why :(