I am trying to compare cross-correlation using FFT vs using windowing method.
My Matlab code is:
[codebox]isize = 20;
n = 7;
for i = 1:n %%7x7 xcorr
for j = 1:n
xcout(i,j) = sum(sum(ffcorr1 .* ref(i:i+isize-1,j:j+isize-1))); %%ref is 676 element array and ffcorr1 is a 400 element array
end
end[/codebox]
similar CUDA kernel:
[codebox]global void xc_corr(double* in_im, double* ref_im, int pix3, int isize, int n, double* out1, double* temp1, double* sum_temp1)
{
int p = blockIdx.x * blockDim.x + threadIdx.x;
int q = 0;
int i = 0;
int j = 0;
int summ = 0;
for(i = 0; i < n; ++i)
{
for(j = 0; j < n; ++j)
{
summ = 0; //force update
for(p = 0; p < pix1; ++p)
{
for(q = 0; q < pix1; ++q)
{
temp1[((i*n+j)*pix1*pix1)+p*pix1+q] = in_im[p*pix1+q] * ref_im[(p+i)*pix1+(q+j)];
sum_temp1[((i*n+j)*pix1*pix1)+p*pix1+q] += temp1[((i*n+j)*pix1*pix1)+p*pix1+q];
out1[i*n+j] = sum_temp1[((i*n+j)*pix1*pix1)+p*pix1+q];
}
}
}
}[/codebox]
I have called this in my kernel as
[codebox] int blocksize = 64; //multiple of 32
int nblocks = (pix3+blocksize-1)/blocksize; //round to max pix3 = 400
xc_corr <<< nblocks,blocksize >>> (ffcorr1, ref_d, pix3, isize, npix, xcout, xc_partial);
cudaThreadSynchronize();[/codebox]
Somehow, when I do a diff on the output file, I see that the CUDA kernel computes for only the first 400 elements.
What is the correct way to write this kernel??
Thanks in advance.