Hi!

I need to write in CUDA a function that calculate cross-correlation like the native xcorr() Matlab function. I’ve found this page http://local.wasp.uwa.edu.au/~pbourke/misc…eous/correlate/ which explains how to write a “brute-force” routine in C (i.e. without using FFT). I’ve adapted it for my needs in this way (I need only raw cross-correlation without normalization):

```
// x has 512 elements and y has 256 elements
// NCELL_DEF = 256, N2CELL=512
// r is the result array which has 1023 elements (2*N2CELL-1)
__device__ void xcorr(float2 *x, float2 *y, float2 *r)
{
int delay,i,j,k=0;
float2 nf2,mx,my,sx,sy,sxy,denom,partSubx,partSuby;
const int n = N2CELL;
const int maxdelay = N2CELL-1;
float2 newy[n];
for(i=0; i<NCELL_DEF; ++i)
newy[i] = y[i];
for(; i<N2CELL; ++i){
newy[i].x = 0.0f;
newy[i].y = 0.0f;
}
// Calculate the mean of the two series x[], y[]
mx.x = 0.0f;
mx.y = 0.0f;
my.x = 0.0f;
my.y = 0.0f;
for (i=0;i<n;i++) {
mx = c_add(mx,x[i]);
my = c_add(my,newy[i]);
}
nf2.x = n;
nf2.y = 0.0f;
mx = c_div(mx,nf2);
my = c_div(my,nf2);
//Calculate the correlation series
for (delay=-maxdelay;delay<maxdelay;++delay) {
sxy.x = 0.0f;
sxy.y = 0.0f;
for (i=0;i<n;++i) {
j = i + delay;
if (j < 0 || j>=n)
sxy = c_add(sxy,c_mul(c_sub(x[i],mx),c_con(my)));
else
sxy = c_add(sxy,c_mul(c_sub(x[i],mx),c_sub(newy[j],my)));
}
r[k] = sxy;
++k;
//r is the correlation coefficient at "delay"
}
}
```

But this code gave me different results against Matlab function. How can I do?