2D cuFFT using C2C

I’m trying to do a 2D-FFT for cross-correlation between two images: keypoint_d of size 128x128 and image_d of size 256x256. One way to do that is by using the cuFFT Library.

So far, here are the steps I used for a for an IN-PLACE C2C transform: :

  1. Add 0 padding to Pattern_img to have an equal size with regard to image_d : (256x256) <==> NXxNY
  2. I created my 2D C2C plan.
  3. Performed the forward 2D transform for each image.
  4. Multiplied both images in the frequency domain using the appropriate complex multiplication.
  5. Perform IFFT of the multiplication-result.
  6. Copy to host and display the IFFT result

So my goal is to find any resemblance between the two images. I suppose that the highest value (in the IFFT of the multiplication) will be located where the two images match the most, isn’t it? If so, then I will just find the highest value and save its coordinates.

I tried to read what’s inside the buffer inversed_h but it is full of big number … Any help?
Here is my code:

Note that NX = NY = 256
fftSize = NX*NY

Complex* CrossCorelationMonoPlan(float* winBuffer, float* kpBuffer, int fftSize, int NX, int NY)
{
    Complex* inversed_h = (Complex*)malloc(sizeof(Complex)*fftSize);
    cufftHandle plan;
    cufftPlan2d(&plan, NX, NY, CUFFT_C2C);

    cufftComplex* kpBuffer_pad_d;
    gpuErrchk(cudaMalloc((void **)&kpBuffer_pad_d, mem_size_pad));
    dataTransfer_R2C << < ceil(NX*NX/ 256), 256 >> > (kpBuffer, kpBuffer_pad_d, NX, NY);
    gpuErrchk(cudaDeviceSynchronize());

    cufftComplex* winBuffer_d;
    gpuErrchk(cudaMalloc((void **)&winBuffer_d, mem_size_pad));
    dataTransfer_R2C << < ceil(NX*NY / 256), 256 >> > (winBuffer, winBuffer_d, NX, NY);
    gpuErrchk(cudaDeviceSynchronize());


    // .. Start FFT :: in-place
    printf("Transforming signal cufftExecR2C\n");

    cufftExecC2C(plan, (cufftComplex *)kpBuffer_pad_d, (cufftComplex *)kpBuffer_pad_d, CUFFT_FORWARD);
    cufftExecC2C(plan, (cufftComplex *)winBuffer_d, (cufftComplex *)winBuffer_d, CUFFT_FORWARD);

    printf("Launching Complex multiplication<<< >>>\n");
    ComplexMulAndScale << < ceil(NX*NY/256), 256 >> >(winBuffer_d, kpBuffer_pad_d, fftSize, 1.0f / fftSize);

    printf("Transforming signal back cufftExecC2C\n");
    cufftExecC2C(plan, (cufftComplex *)winBuffer_d, (cufftComplex *)winBuffer_d, CUFFT_INVERSE);

    gpuErrchk(cudaMemcpy(inversed_h, winBuffer_d, sizeof(Complex)*fftSize, cudaMemcpyDeviceToHost));

    Findmax(inversed_h, fftSize, NX, NY);

    //free(inversed_h);

    gpuErrchk(cudaFree(kpBuffer_pad_d));
    gpuErrchk(cudaFree(winBuffer_d));

    return inversed_h;
}

where my two kernels are:

__global__ void ComplexMulAndScale (cufftComplex *a, cufftComplex *b, int size, float scale)
{
    const int tId = blockIdx.x * blockDim.x + threadIdx.x;
    if(tId < size)
    {
        Complex c;
        c.x = (a[tId].x * b[tId].x - a[tId].y * b[tId].y)*scale;
        c.y = (a[tId].x * b[tId].y + a[tId].y * b[tId].x)*scale;
        a[tId] = c;
    }
}     

__global__ void dataTransfer_R2C(float* dataIn_d, float2* dataOut_d, int paddedWidth, int paddedHeight)
{
    int tId = blockIdx.x * blockDim.x + threadIdx.x;
    int x = tId % paddedWidth;
    int y = tId / paddedWidth;

    if (tId < paddedWidth * paddedHeight)
    {
    		dataOut_d[x + y*paddedWidth].x = dataIn_d[tId];
	    	dataOut_d[x + y*paddedWidth].y = 0;
    }
}

int Findmax(cufftComplex * arr, unsigned int size, int width, int height)
{
    Complex max = arr[0];
    int index;
    for (unsigned int i = 1; i < size; i++) {
	    if (arr[i].x > max.x) {
		    num_of_occurs = 1;
		    max = arr[i];
		    index = i;
	    }
    }
    cout << "index = " << index << endl;
    cout << "X = " << index % width << "Y  = " << index / width << endl;
    cout << "Real = " << max.x << "           " << "Imag = " << max.y << endl;
    return index;
}

Update:
I noticed that the multiplication kernel was wrong, therefore here is an update.
I’m also trying to implement a 2D R2C fft to compare the two techniques, but it does not work as well. I don’t want to post a new question in that regard since this one still unsolved …

I would appreciate any kind of help!

Complex* CrossCorelationMonoPlan(float* winBuffer, float* kpBuffer, int fftSize, int NX, int NY)
{
    cufftComplex* kpBuffer_pad_d;
    gpuErrchk(cudaMalloc((void **)&kpBuffer_pad_d, mem_size_pad));
    //dataTransfer_R2C << < ceil(NX*NY/ 256), 256 >> > (kpBuffer, kpBuffer_pad_d, NX, NY);
    for (int i = 0; i<NX; ++i) {
        gpuErrchk(cudaMemcpy2D(kpBuffer_pad_d + i*NY, sizeof(cufftComplex), kpBuffer + NY* i, sizeof(float), sizeof(float), NY, cudaMemcpyHostToDevice));
    }
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(res_h, kpBuffer_pad_d, sizeof(Complex)*fftSize, cudaMemcpyDeviceToHost)); // TEST


// Copy the input winBuffer into a new cufftComplex buffer 

    cufftComplex* winBuffer_d;
    gpuErrchk(cudaMalloc((void **)&winBuffer_d, mem_size_pad));
    //dataTransfer_R2C << < ceil(NX*NY / 256), 256 >> > (winBuffer, winBuffer_d, NX, NY);
    for (int i = 0; i<NX; ++i) {
        gpuErrchk(cudaMemcpy2D(winBuffer_d + i*NY, sizeof(cufftComplex), winBuffer + i*NY, sizeof(float), sizeof(float), NY, cudaMemcpyHostToDevice));
    }

    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(res_h, winBuffer_d, sizeof(Complex)*fftSize, cudaMemcpyDeviceToHost)); // TEST

// .. Start FFT :: in-place
    printf("Transforming signal cufftExecR2C\n");
    if (cufftExecC2C(plan, (cufftComplex *)kpBuffer_pad_d, (cufftComplex *)kpBuffer_pad_d, CUFFT_FORWARD) != CUFFT_SUCCESS)
    {
        fprintf(stderr, "CUFFT Error: Unable to execute the plan\n");
    }

    if (cufftExecC2C(plan, (cufftComplex *)winBuffer_d, (cufftComplex *)winBuffer_d, CUFFT_FORWARD) != CUFFT_SUCCESS)
    {
        fprintf(stderr, "CUFFT Error: Unable to execute the plan\n");
    }

    // Complex* res_h = (Complex*)malloc(sizeof(Complex)*fftSize);
    //cufftExecC2C(plan, (cufftComplex *)winBuffer_d, (cufftComplex *)winBuffer_d, CUFFT_INVERSE);
    //gpuErrchk(cudaMemcpy(res_h, kpBuffer_pad_d, sizeof(Complex)*fftSize, cudaMemcpyDeviceToHost)); // TEST

    printf("Launching Complex multiplication<<< >>>\n");
    cufftComplex *res_mult_d;
    gpuErrchk(cudaMalloc((void **)&res_mult_d, mem_size_pad));
    ComplexPointwiseMulAndScale << < ceil(NX*NY / 256), 256>> > (winBuffer_d, kpBuffer_pad_d, res_mult_d, fftSize, NX, NY, 1.0f);

    printf("Transforming signal back cufftExecC2C\n");
    if (cufftExecC2C(plan, (cufftComplex *)res_mult_d, (cufftComplex *)winBuffer_d, CUFFT_INVERSE) !=CUFFT_SUCCESS)
    {
    	fprintf(stderr, "CUFFT Error: Unable to execute the plan\n");
    }
    gpuErrchk(cudaMemcpy(inversed_h, winBuffer_d, sizeof(Complex)*fftSize, cudaMemcpyDeviceToHost));
    
    // Find max
    Complex max = arr[0];
    int index = 0;
    for ( int i = 1; i < size; i++) 
    {
    	if (inversed_h[i].x > max.x ) {
    		max = inversed_h[i];
    		index = i;
    	}	
    }

    gpuErrchk(cudaFree(kpBuffer_pad_d));
    gpuErrchk(cudaFree(winBuffer_d));

Multiplication kernel update:

 __global__ void ComplexMulAndScale(cufftComplex *a, cufftComplex *b, cufftComplex *c, int size, int width, int height, float scale)    
 const int tId = blockIdx.x * blockDim.x + threadIdx.x;
 if(tId < size)
{
	int x = tId % width;
	int y = tId / width;

	Complex tmp;
	tmp.x = 0;
	tmp.y = 0;
	for (int i = 0; i < width; i++)
	{
		Complex ae;
		Complex be;
		ae = a[y*width + i];
		be = b[i*width + x];
		tmp.x += ae.x * be.x - ae.y * be.y;
		tmp.y += ae.x * be.y + ae.y * be.x;
	}
	c[tId] = tmp;
}
}

I’m back with a new update. i have compared this algo (the updated one) with the one matlab and here is what i did; i took the same window twice of size 8x8. i did fft of each matrix, multiply their responses and compute the ifft of the multiplication. Both the result on matlab (using fft2 & ifft2) and cuFFT (C2C) are correct.
I am thinking that since both windows are 8x8, the correlation ( Highest value in the IFFT) should be located in (0,0), Do you agree? If no, then could you please explain why? This is the step i am missing …