Image filtering in frequency domain

RemsNems · September 8, 2014, 11:02am

Hello everyone,
I have a program in Matlab and I want to translate it in C++/Cuda. I think succeed quite well except for the filtering part.
In my Matlab code, I define the filter (a Difference of Gaussian) directly in the frequency domain. I want to do the same in CUDA. The problem is that my CUDA code does not work well.

I tried to reduce the code to only filter the images.
The matlab version is :

im = ...; % Read the image
IM = fftshift(fft2(im));
[L, C] = size(IM);
u = ((1:C)-1)/(C-1) - 0.5
v = ((1:L)-1)/(L-1) - 0.5;
[U, V] = meshgrid(u, v);
rad2 = U.*U + V.*V;
sigma1 = 0.2; sigma2 = 0.1;
dog = exp(-rad2/(2*sigma1*sigma1)) - exp(-rad2/(2*sigma2*sigma2));
imb_matlab = real(ifft2(ifftshift(dog.*IM)));

And this is what I get with matlab :

In CUDA the result is different :

Here is the C++/CUDA code :

main

#include <iostream>
#include <cmath>
 
#include "FileManager.h"
#include "HCS_kernel.h"
using namespace std;
 
int main(int argc, char* argv[])
{
    int *size;
    int dim, imH, imW;
    double *im, *im_d;
    FileManager FM;
    FileManager_Error FMerror;
    cudaError_t error;          // To handle CUDA errors (time handler)
    GPU_HCS_Error reserr = GPU_HCS_SUCCESS; // Result for the main program
    cufftResult cuError;            // cuFFT errors
    cufftDoubleComplex *IM;         // Image spectrum on GPU
    cufftHandle planD2Z, planZ2D;           // Plans for FFT and IFFT
     
    // Read Images
    FMerror = FM.readImage("../HCS/lena1.bin", &im, &dim, &size);
    if(FMerror != FM_SUCCESS)
        return FMerror;
    imH = size[0];
    imW = size[1];
     
    // Memory allocation
    error = cudaMalloc((void**) &im_d, imW*imH*sizeof(double));
    if(error != cudaSuccess)
    {
        cout<<"[HCS] Error Memory Allocation (line "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_ALLOCMEM;
        return reserr;
    }
     
    error = cudaMalloc((void**) &IM, imW*imH*sizeof(cufftDoubleComplex));
    if(error != cudaSuccess)
    {
        cout<<"[HCS] Error Memory Allocation (line "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_ALLOCMEM;
        return reserr;
    }
     
    // Copy data
    error = cudaMemcpy(im_d, (double*)im, imW*imH*sizeof(double), cudaMemcpyHostToDevice);
    if(error != cudaSuccess)
    {
        cout<<"[HCS] Erreur lors de la copie de l'image 1 sur le peripherique (ligne "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_CPYMEM;
        return reserr;
    }
     
    // ---- Compute FFT ---- //
    // Create 2D plans
    cuError = cufftPlan2d(&planD2Z, imW, imH, CUFFT_D2Z);
    if(cuError != CUFFT_SUCCESS)
    {
        cout<<"[HCS] Erreur lors de la creattion du plan FFT (ligne "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_PLAN;
        return reserr;
    }
 
    cuError = cufftPlan2d(&planZ2D, imW, imH, CUFFT_Z2D);
    if(cuError != CUFFT_SUCCESS)
    {
        cout<<"[HCS] Erreur lors de la creattion du plan IFFT (ligne "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_PLAN;
        return reserr;
    }
     
    // Compute FFT
    cuError = cufftExecD2Z(planD2Z, (cufftDoubleReal*)im_d, IM); // Implicitly FORWARD
    if(cuError != CUFFT_SUCCESS)
    {
        cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_FFT;
        return reserr;
    }
     
    // Dimensions for a block and a grid
    dim3 dimBlock(NB_THREADS, NB_THREADS);
    int nbBlocsW = imW/NB_THREADS;
    if((imW%NB_THREADS) != 0)
        nbBlocsW++;
    int nbBlocsH = imH/NB_THREADS;
    if((imH%NB_THREADS) != 0)
        nbBlocsH++;
    dim3 dimGrid(nbBlocsW, nbBlocsH);
 
 
    // Compute FFT
    cuError = cufftExecD2Z(planD2Z, (cufftDoubleReal*)im_d, IM); // Implicitly FORWARD
    if(cuError != CUFFT_SUCCESS)
    {
        cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_FFT;
        return reserr;
    }
     
    kernel_fftshift2D<<<dimGrid, dimBlock>>>(IM, imH, imW);
 
 
    // ---- Filtering ---- //
    kernel_DoGFiltering<<<dimGrid, dimBlock>>>(IM, imH, imW);
 
    kernel_fftshift2D<<<dimGrid, dimBlock>>>(IM, imH, imW);
 
     
    // compute IFFTs
    cuError = cufftExecZ2D(planZ2D, IM, (cufftDoubleReal*)im_d); // Implicitly INVERSE
    if(cuError != CUFFT_SUCCESS)
    {
        cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_FFT;
        return reserr;
    }
     
     
    cudaFree(im_d);
    cudaFree(IM);
    cufftDestroy(planD2Z);
    cufftDestroy(planZ2D);
 
    delete[] im;
    delete[] size;
 
    return 0;
}

kernel.h

#ifndef DEF_HCSKERNEL
#define DEF_HCSKERNEL
 
#include <cuda_runtime.h>
#include <cufft.h>
 
// On the device
#define NB_THREADS      32
 
 
enum GPU_HCS_Error
{
    GPU_HCS_SUCCESS,
    GPU_HCS_ERROR_ALLOCMEM,
    GPU_HCS_ERROR_FREEMEM,
    GPU_HCS_ERROR_CPYMEM,
    GPU_HCS_ERROR_TIME,
    GPU_HCS_ERROR_PLAN,
    GPU_HCS_ERROR_FFT
};
 
__global__ void kernel_fftshift2D(cufftDoubleComplex *IM, int imH, int imW);
__global__ void kernel_DoGFiltering(cufftDoubleComplex *IM, int imH, int imW);
 
#endif

kernel.cu

#include "HCS_kernel.h"
 
 
__global__ void kernel_fftshift2D(cufftDoubleComplex *IM, int imH, int imW)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int ind = idy*imW + idx;
    int x, y, indshift;
    cufftDoubleComplex v;
     
     
    if(idx < imW && idy < imH/2)
    {
        // A OPTIMISER
        if(idx<imW/2 && idy<imH/2)
        {
            x=idx+imW/2;
            y=idy+imH/2;
        }
        else if(idx>=imW/2 && idy<imH/2)
        {
            x=idx-imW/2;
            y=idy+imH/2;
        }
     
        indshift = y*imW+x;
        v.x = IM[ind].x;
        v.y = IM[ind].y;
     
        IM[ind].x = IM[indshift].x;
        IM[ind].y = IM[indshift].y;
     
        IM[indshift].x = v.x;
        IM[indshift].y = v.y;
    }
}
 
 
__global__ void kernel_DoGFiltering(cufftDoubleComplex *IM, int imH, int imW)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int ind = idy*imW + idx;
    double u, v, rad, rad2, DoG, fact;
    double sigma1 = 0.2, sigma2 = 0.1;
     
    // Mask the treads
    if(idx < imW && idy < imH)
    {
        // Convert indices 0..(N-1) -> -0.5..0.5
        u = (((double)idx)/(((double)imW)-1.)) - 0.5;
        v = (((double)idx)/(((double)imH)-1.)) - 0.5;
        rad2 = u*u + v*v;
        rad = sqrt(rad2);
         
        // Compute the Difference of Gaussian
        DoG = exp(-rad2/(2*sigma1*sigma1)) - exp(-rad2/(2*sigma2*sigma2));
             
        fact = (double) 1./(imH*imW);
        IM[ind].x = IM[ind].x * DoG * fact;
        IM[ind].y = IM[ind].y * DoG * fact;
    }
}

I found some ways to shift the spectrum in a more efficient way but I wanted a simple implementation to better understand what happen.

For me, the Matlab and CUDA codes are equivalent, but the results show that they are not and I could not find what is wrong in my CUDA implementation.

Do you have any idea?
Thanks in advance.

Robert_Crovella · September 8, 2014, 1:39pm

This code in your kernel_DoGFiltering looks broken:

    u = (((double)idx)/(((double)imW)-1.)) - 0.5;
    v = (((double)idx)/(((double)imH)-1.)) - 0.5;

shouldn’t one of those idx actually (for v) be idy ?

RemsNems · September 9, 2014, 10:44am

Thanks for your answer,
you are right, I made a mistake at this line.
Unfortunately, the shifted image is only attenuate, as you can see in the following image :

Just in case, I also changed the line 60 in the same kernel :

fact = 1./((double) (imH*imW));

It does not change the result either.

Does Matlab handle some stuff when computing the FFT that cuFFT does not?
Both are based on the fftw algorithms but maybe I missed something in this way?

Robert_Crovella · September 9, 2014, 2:12pm

You may want to take a look at the Data Layout section of the CUFFT documentation.

[url]http://docs.nvidia.com/cuda/cufft/index.html#data-layout[/url]

A CUFFT Real-to-Complex transform does not produce the same number of output elements as the input:

“Finally, R2C demands an input array (X1,X2,…,Xn) of real values and returns an array (x1,x2,…,x[n/2]+1) of non-redundant complex elements”

Likewise the C2R (inverse case) has a similar disparity between input and output elements. This may be the source of your shifted image.

You may want to review the simpleCUFFT cuda sample code, which does a similar operation (1D forward transform, filtering in the frequency domain, followed by 1D inverse transform):

[url]http://docs.nvidia.com/cuda/cuda-samples/index.html#simple-cufft[/url]

RemsNems · September 10, 2014, 1:54pm

Thanks for the links. Indeed, I missed the Data layout section.

However, I saw the simpleCUFFT example and there is a little difference between this code and what I want to do : I define the filter directly in the frequency domain whereas the filter is defined in the spatial domain and then its FFT is compute in this example.

This link is still hard to me to understand.
The complex array only carries non redundant information because of the symmetry of the spectrum, am I right? In this case, is the fftshift necessary?

So I have to allocate an array of size n/2+1? I have juste tried this and it does not work. I assume the solution is more subtle. My main question is more about the filter definition. Do I just have to define it for the positive frequency only?

RemsNems · October 2, 2014, 11:06am

Hi all,
I tried to modify my programs in several ways. I applied the filter only for idy < (imH/2+1). Since my data are stored ina arow-major way, I also tried for idx < (imW/2+1). I still can not obtain the good filtered image.

Do you have any clue?

Thanks in advance.

Robert_Crovella · October 3, 2014, 3:26am

I think your problem is that you haven’t taken into account data layout correctly. After fiddling with your code to create something I could run, I was able to reproduce your ghost images.

As a first step to get around this problem, I advise converting all of your FFT operations to complex-to-complex (Z2Z) transforms. Once you understand and agree that this produces the output you want, then you can work on understanding how to do a D2Z (and Z2D) transform.

Here’s a fully worked example for me that does not produce the ghost images (using Z2Z forward and inverse transforms):

#include <iostream>
#include <cmath>
#include <cufft.h>
#include "pgm.h"

// On the device
#define NB_THREADS      32


enum GPU_HCS_Error
{
    GPU_HCS_SUCCESS,
    GPU_HCS_ERROR_ALLOCMEM,
    GPU_HCS_ERROR_FREEMEM,
    GPU_HCS_ERROR_CPYMEM,
    GPU_HCS_ERROR_TIME,
    GPU_HCS_ERROR_PLAN,
    GPU_HCS_ERROR_FFT
};

__global__ void kernel_fftshift2D(cufftDoubleComplex *IM, int imH, int imW);
__global__ void kernel_DoGFiltering(cufftDoubleComplex *IM, int imH, int imW);


__global__ void kernel_fftshift2D(cufftDoubleComplex *IM, int imH, int imW)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int ind = idy*imW + idx;
    int x, y, indshift;
    cufftDoubleComplex v;


    if(idx < imW && idy < imH/2)
    {
        // A OPTIMISER
        if(idx<imW/2 && idy<imH/2)
        {
            x=idx+imW/2;
            y=idy+imH/2;
        }
        else if(idx>=imW/2 && idy<imH/2)
        {
            x=idx-imW/2;
            y=idy+imH/2;
        }

        indshift = y*imW+x;
        v.x = IM[ind].x;
        v.y = IM[ind].y;

        IM[ind].x = IM[indshift].x;
        IM[ind].y = IM[indshift].y;

        IM[indshift].x = v.x;
        IM[indshift].y = v.y;
    }
}


__global__ void kernel_DoGFiltering(cufftDoubleComplex *IM, int imH, int imW)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int ind = idy*imW + idx;
    double u, v, rad2, DoG, fact;
    double sigma1 = 0.2, sigma2 = 0.1;

    // Mask the treads
    if(idx < imW && idy < imH)
    {
        // Convert indices 0..(N-1) -> -0.5..0.5
        u = (((double)idx)/(((double)imW)-1.)) - 0.5;
        v = (((double)idy)/(((double)imH)-1.)) - 0.5;
        rad2 = u*u + v*v;
        //rad = sqrt(rad2);

        // Compute the Difference of Gaussian
        DoG = exp(-rad2/(2*sigma1*sigma1)) - exp(-rad2/(2*sigma2*sigma2));

        fact = (double) 1./(imH*imW);
        IM[ind].x = IM[ind].x * DoG * fact;
        IM[ind].y = IM[ind].y * DoG * fact;
    }
}

using namespace std;

int main(int argc, char* argv[])
{
    int imH, imW;
    cufftDoubleComplex *im, *im_d;
    cudaError_t error;          // To handle CUDA errors (time handler)
    GPU_HCS_Error reserr = GPU_HCS_SUCCESS; // Result for the main program
    cufftResult cuError;            // cuFFT errors
    cufftDoubleComplex *IM;         // Image spectrum on GPU
    cufftHandle planFZ2Z, planIZ2Z;           // Plans for FFT and IFFT
    PGMImage img_in, img_out;
    // Read Images

    getPGMfile("Lena.pgm", &img_in);
    imH = img_in.height;
    imW = img_in.width;
    img_out.height = img_in.height;
    img_out.width = img_in.width;
    img_out.maxVal = img_in.maxVal;

    im = (cufftDoubleComplex *)malloc(imW*imH*sizeof(cufftDoubleComplex));
    for (int i = 0; i < imH; i++)
      for (int j = 0; j < imW; j++){
        im[(imW*i)+j].x = (img_in.data[i][j].red + img_in.data[i][j].green + img_in.data[i][j].blue)/(3.0*img_in.maxVal);
        im[(imW*i)+j].y = 0.0;}
    // Memory allocation
    error = cudaMalloc((void**) &im_d, imW*imH*sizeof(cufftDoubleComplex));
    if(error != cudaSuccess)
    {
        cout<<"[HCS] Error Memory Allocation (line "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_ALLOCMEM;
        return reserr;
    }

    error = cudaMalloc((void**) &IM, imW*imH*sizeof(cufftDoubleComplex));
    if(error != cudaSuccess)
    {
        cout<<"[HCS] Error Memory Allocation (line "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_ALLOCMEM;
        return reserr;
    }

    // Copy data
    error = cudaMemcpy(im_d, im, imW*imH*sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice);
    if(error != cudaSuccess)
    {
        cout<<"[HCS] Erreur lors de la copie de l'image 1 sur le peripherique (ligne "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_CPYMEM;
        return reserr;
    }

    // ---- Compute FFT ---- //
    // Create 2D plans
    cuError = cufftPlan2d(&planFZ2Z, imW, imH, CUFFT_Z2Z);
    if(cuError != CUFFT_SUCCESS)
    {
        cout<<"[HCS] Erreur lors de la creattion du plan FFT (ligne "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_PLAN;
        return reserr;
    }

    cuError = cufftPlan2d(&planIZ2Z, imW, imH, CUFFT_Z2Z);
    if(cuError != CUFFT_SUCCESS)
    {
        cout<<"[HCS] Erreur lors de la creattion du plan IFFT (ligne "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_PLAN;
        return reserr;
    }

    // Compute FFT
    cuError = cufftExecZ2Z(planFZ2Z, im_d, IM, CUFFT_FORWARD); 
    if(cuError != CUFFT_SUCCESS)
    {
        cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_FFT;
        return reserr;
    }

    // Dimensions for a block and a grid
    dim3 dimBlock(NB_THREADS, NB_THREADS);
    int nbBlocsW = imW/NB_THREADS;
    if((imW%NB_THREADS) != 0)
        nbBlocsW++;
    int nbBlocsH = imH/NB_THREADS;
    if((imH%NB_THREADS) != 0)
        nbBlocsH++;
    dim3 dimGrid(nbBlocsW, nbBlocsH);



    kernel_fftshift2D<<<dimGrid, dimBlock>>>(IM, imH, imW);


    // ---- Filtering ---- //
    kernel_DoGFiltering<<<dimGrid, dimBlock>>>(IM, imH, imW);

    kernel_fftshift2D<<<dimGrid, dimBlock>>>(IM, imH, imW);


    // compute IFFTs
    cuError = cufftExecZ2Z(planIZ2Z, IM, im_d, CUFFT_INVERSE); 
    if(cuError != CUFFT_SUCCESS)
    {
        cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
        reserr = GPU_HCS_ERROR_FFT;
        return reserr;
    }

    cudaMemcpy(im, im_d, imW*imH*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost);
#define SHIFT 0.2
    for (int i = 0; i < imH; i++)
      for (int j = 0; j < imW; j++){
        if ((im[(i*imW)+j].x+SHIFT)< 0.0) printf("%f\n", im[(i*imW)+j].x);
        unsigned temp = ((im[(i*imW)+j].x+SHIFT)/((double)(1)))*img_out.maxVal;
        if (temp > img_out.maxVal) {printf(" output saturation: %u\n", temp); return 1;}
        img_out.data[i][j].red =   temp;
        img_out.data[i][j].green = temp;
        img_out.data[i][j].blue =  temp;}
    save(&img_out);
    cudaFree(im_d);
    cudaFree(IM);
    cufftDestroy(planFZ2Z);
    cufftDestroy(planIZ2Z);

    delete[] im;

    return 0;
}

I used the Lena.pgm image that gets installed with the CUDA samples.
The pgm.cpp and pgm.h files I used can be found here:

http://sun.iwu.edu/~shelley/sie/zoo/journal/pgm.c.html
http://sun.iwu.edu/~shelley/sie/zoo/journal/pgm.h.html

I compiled like so:

nvcc -arch=sm_20 -o t503 t503.cu pgm.cpp -lcufft

Notice how I provided a complete example that doesn’t depend on any unknown libraries or files?
In the future if you want help, I suggest you do the same.

I also noticed while fiddling with the code you have posted here, that you are doing the forward transform twice. I don’t think this affects the results, but my code differs from yours noticeably in that respect also. The only change I made to the kernels was to fix the issue I already pointed out.

Topic		Replies	Views
cuFFT and fftw CUDA Programming and Performance	10	4297	August 25, 2010
Using 1D Kernel on 2D fft Image filtering CUDA Programming and Performance	1	4285	June 28, 2009
CUDA FFT different from Matlab FFT CUDA Programming and Performance	32	9571	March 29, 2011
Cufft 1D transform CUDA Programming and Performance	6	3090	January 8, 2015
2D CUFFT wrong result GPU-Accelerated Libraries cufft	8	3198	November 7, 2023
cufft questions CUDA Programming and Performance	5	1523	April 2, 2012
Data layout on device of the output from cufftExecR2C CUDA Programming and Performance	0	10235	September 7, 2011
about frequency spectrum computed from cuFFT GPU-Accelerated Libraries	9	1605	January 5, 2018
fftshift with a kernel An error ocurre CUDA Programming and Performance	0	9310	July 23, 2010
[SOLVED] cuFFT data storage, maybe I'm operating on the wrong elements CUDA Programming and Performance	16	2287	June 11, 2022

Image filtering in frequency domain

Related topics