I’m trying to do a 2D-FFT for cross-correlation between two images: keypoint_d
of size 128x128 and image_d of size 256x256. One way to do that is by using the cuFFT Library.
So far, here are the steps I used for an in-place C2C transform:
- Add zero padding to Pattern_img so that it has the same size as image_d: (256x256) <==> NXxNY
- I created my 2D C2C plan.
- Performed the forward 2D transform for each image.
- Multiplied both images in the frequency domain using the appropriate complex multiplication.
- Perform IFFT of the multiplication-result.
- Copy to host and display the IFFT result
So my goal is to find any resemblance between the two images. I suppose that the highest value (in the IFFT of the multiplication) will be located where the two images match the most, isn’t it? If so, then I will just find the highest value and save its coordinates.
I tried to read what’s inside the buffer inversed_h, but it is full of very large numbers … Any help?
Here is my code:
Note that NX = NY = 256
fftSize = NX*NY
// Aborts the program with a diagnostic when a cuFFT call does not succeed.
static void checkCufft(cufftResult status, const char* what)
{
    if (status != CUFFT_SUCCESS) {
        fprintf(stderr, "cuFFT error %d during %s\n", (int)status, what);
        exit(EXIT_FAILURE);
    }
}

/*
 * Circular cross-correlation of two real images through in-place 2D C2C FFTs.
 *
 * Parameters:
 *   winBuffer - search window, NX*NY floats. It is handed straight to a kernel
 *               launch, so it must be a device pointer.
 *   kpBuffer  - keypoint/pattern, also a device pointer to NX*NY floats. The
 *               conversion kernel reads NX*NY elements from it, so the pattern
 *               must already be zero-padded to the window size.
 *   fftSize   - number of complex elements copied back; expected to be NX*NY.
 *   NX, NY    - padded image dimensions.
 *
 * Returns a malloc'ed host array of fftSize Complex values holding the
 * correlation surface; the caller owns it and must free() it. Returns NULL
 * when the sizes are not positive or the host allocation fails.
 *
 * The result is not normalized by image energy, so its magnitudes scale with
 * pixel intensity and image size; only the position of the peak is meaningful
 * for matching.
 */
Complex* CrossCorelationMonoPlan(float* winBuffer, float* kpBuffer, int fftSize, int NX, int NY)
{
    if (fftSize <= 0 || NX <= 0 || NY <= 0) {
        return NULL;
    }

    Complex* inversed_h = (Complex*)malloc(sizeof(Complex) * fftSize);
    if (inversed_h == NULL) {
        fprintf(stderr, "CrossCorelationMonoPlan: host allocation failed\n");
        return NULL;
    }

    // Ceil-division in integer arithmetic. The previous ceil(NX*NY / 256)
    // truncated before ceil() ran and dropped a partial last block.
    const int threads  = 256;
    const int numElems = NX * NY;
    const int blocks   = (numElems + threads - 1) / threads;

    // NOTE(review): cuFFT treats the first size passed to cufftPlan2d as the
    // slowest-varying dimension, while the kernels below use NX as the row
    // width. The two agree when NX == NY; confirm before using non-square sizes.
    cufftHandle plan;
    checkCufft(cufftPlan2d(&plan, NX, NY, CUFFT_C2C), "cufftPlan2d");

    // mem_size_pad is defined outside this function; presumably
    // sizeof(cufftComplex) * NX * NY -- TODO confirm.
    cufftComplex* kpBuffer_pad_d;
    gpuErrchk(cudaMalloc((void **)&kpBuffer_pad_d, mem_size_pad));
    dataTransfer_R2C<<<blocks, threads>>>(kpBuffer, kpBuffer_pad_d, NX, NY);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cufftComplex* winBuffer_d;
    gpuErrchk(cudaMalloc((void **)&winBuffer_d, mem_size_pad));
    dataTransfer_R2C<<<blocks, threads>>>(winBuffer, winBuffer_d, NX, NY);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // .. Start FFT :: in-place
    printf("Transforming signal cufftExecC2C\n");
    // Correlation needs FFT(win) * conj(FFT(kp)); a plain product of the two
    // forward spectra would be a convolution. The keypoint buffer is purely
    // real (dataTransfer_R2C writes 0 into every imaginary part) and cuFFT
    // transforms are unnormalized, so running it through the INVERSE transform
    // yields exactly conj(FFT(kp)).
    checkCufft(cufftExecC2C(plan, kpBuffer_pad_d, kpBuffer_pad_d, CUFFT_INVERSE),
               "cufftExecC2C (keypoint, conjugate spectrum)");
    checkCufft(cufftExecC2C(plan, winBuffer_d, winBuffer_d, CUFFT_FORWARD),
               "cufftExecC2C (window, forward)");

    printf("Launching Complex multiplication<<< >>>\n");
    // The 1/fftSize factor compensates for the unnormalized inverse FFT below.
    ComplexMulAndScale<<<blocks, threads>>>(winBuffer_d, kpBuffer_pad_d, fftSize, 1.0f / fftSize);
    gpuErrchk(cudaGetLastError());

    printf("Transforming signal back cufftExecC2C\n");
    checkCufft(cufftExecC2C(plan, winBuffer_d, winBuffer_d, CUFFT_INVERSE),
               "cufftExecC2C (inverse)");

    // Blocking copy: also waits for the preceding GPU work to finish.
    gpuErrchk(cudaMemcpy(inversed_h, winBuffer_d, sizeof(Complex) * fftSize, cudaMemcpyDeviceToHost));
    Findmax(inversed_h, fftSize, NX, NY);

    // Release every GPU resource, including the plan, which was leaked before.
    checkCufft(cufftDestroy(plan), "cufftDestroy");
    gpuErrchk(cudaFree(kpBuffer_pad_d));
    gpuErrchk(cudaFree(winBuffer_d));
    return inversed_h;
}
where my two kernels are:
// Pointwise complex product with scaling: a[i] = (a[i] * b[i]) * scale.
// The result overwrites a; b is only read. Each thread handles the element at
// its flat 1D global index, and threads whose index is >= size do nothing.
__global__ void ComplexMulAndScale (cufftComplex *a, cufftComplex *b, int size, float scale)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size)
        return;

    // Read both operands once before a[idx] is overwritten.
    const cufftComplex lhs = a[idx];
    const cufftComplex rhs = b[idx];

    Complex product;
    product.x = (lhs.x * rhs.x - lhs.y * rhs.y) * scale;  // real part
    product.y = (lhs.x * rhs.y + lhs.y * rhs.x) * scale;  // imaginary part
    a[idx] = product;
}
// Expands a real array into a complex one: each input float becomes the real
// part of the matching output element and the imaginary part is set to zero.
// Each thread handles the element at its flat 1D global index; threads at or
// beyond paddedWidth * paddedHeight do nothing. dataIn_d is read at every index
// below that bound.
__global__ void dataTransfer_R2C(float* dataIn_d, float2* dataOut_d, int paddedWidth, int paddedHeight)
{
    const int flat = blockIdx.x * blockDim.x + threadIdx.x;
    if (flat >= paddedWidth * paddedHeight)
        return;

    // column + row * paddedWidth is the flat index itself, so index with it directly.
    dataOut_d[flat] = make_float2(dataIn_d[flat], 0.0f);
}
/*
 * Scans arr for the element with the largest real part, prints its flat
 * index, its (X, Y) position for a row length of `width`, and its value.
 *
 * Parameters:
 *   arr    - host array of `size` complex values.
 *   size   - number of elements in arr.
 *   width  - row length used to turn the flat index into X / Y.
 *   height - not used by this function.
 *
 * Returns the flat index of the first element holding the maximum real part,
 * or -1 when arr is null or size is 0.
 */
int Findmax(cufftComplex * arr, unsigned int size, int width, int height)
{
    if (!arr || size == 0)
        return -1;

    Complex max = arr[0];
    // Start at 0: if arr[0] is the maximum the loop never assigns index, and
    // it was previously printed and returned uninitialized.
    int index = 0;
    for (unsigned int i = 1; i < size; i++) {
        if (arr[i].x > max.x) {
            // num_of_occurs is declared outside this function; it is only ever
            // set to 1 here -- presumably a global occurrence counter, verify.
            num_of_occurs = 1;
            max = arr[i];
            index = i;
        }
    }
    cout << "index = " << index << endl;
    if (width > 0) {
        // Guarded to avoid a division by zero when width is 0.
        cout << "X = " << index % width << "Y = " << index / width << endl;
    }
    cout << "Real = " << max.x << " " << "Imag = " << max.y << endl;
    return index;
}