Image filtering in frequency domain

Hello everyone,
I have a program in Matlab and I want to translate it in C++/Cuda. I think succeed quite well except for the filtering part.
In my Matlab code, I define the filter (a Difference of Gaussian) directly in the frequency domain. I want to do the same in CUDA. The problem is that my CUDA code does not work well.

I tried to reduce the code to only filter the images.
The matlab version is :

im = ...; % Read the image
IM = fftshift(fft2(im));
[L, C] = size(IM);
u = ((1:C)-1)/(C-1) - 0.5
v = ((1:L)-1)/(L-1) - 0.5;
[U, V] = meshgrid(u, v);
rad2 = U.*U + V.*V;
sigma1 = 0.2; sigma2 = 0.1;
dog = exp(-rad2/(2*sigma1*sigma1)) - exp(-rad2/(2*sigma2*sigma2));
imb_matlab = real(ifft2(ifftshift(dog.*IM)));

And this is what I get with matlab :

In CUDA the result is different :

Here is the C++/CUDA code :

  • main
  • #include <iostream>
    #include <cmath>
     
    #include "FileManager.h"
    #include "HCS_kernel.h"
    using namespace std;
     
    int main(int argc, char* argv[])
    {
        int *size;
        int dim, imH, imW;
        double *im, *im_d;
        FileManager FM;
        FileManager_Error FMerror;
        cudaError_t error;          // To handle CUDA errors (time handler)
        GPU_HCS_Error reserr = GPU_HCS_SUCCESS; // Result for the main program
        cufftResult cuError;            // cuFFT errors
        cufftDoubleComplex *IM;         // Image spectrum on GPU
        cufftHandle planD2Z, planZ2D;           // Plans for FFT and IFFT
         
        // Read Images
        FMerror = FM.readImage("../HCS/lena1.bin", &im, &dim, &size);
        if(FMerror != FM_SUCCESS)
            return FMerror;
        imH = size[0];
        imW = size[1];
         
        // Memory allocation
        error = cudaMalloc((void**) &im_d, imW*imH*sizeof(double));
        if(error != cudaSuccess)
        {
            cout<<"[HCS] Error Memory Allocation (line "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_ALLOCMEM;
            return reserr;
        }
         
        error = cudaMalloc((void**) &IM, imW*imH*sizeof(cufftDoubleComplex));
        if(error != cudaSuccess)
        {
            cout<<"[HCS] Error Memory Allocation (line "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_ALLOCMEM;
            return reserr;
        }
         
        // Copy data
        error = cudaMemcpy(im_d, (double*)im, imW*imH*sizeof(double), cudaMemcpyHostToDevice);
        if(error != cudaSuccess)
        {
            cout<<"[HCS] Erreur lors de la copie de l'image 1 sur le peripherique (ligne "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_CPYMEM;
            return reserr;
        }
         
        // ---- Compute FFT ---- //
        // Create 2D plans
        cuError = cufftPlan2d(&planD2Z, imW, imH, CUFFT_D2Z);
        if(cuError != CUFFT_SUCCESS)
        {
            cout<<"[HCS] Erreur lors de la creattion du plan FFT (ligne "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_PLAN;
            return reserr;
        }
     
        cuError = cufftPlan2d(&planZ2D, imW, imH, CUFFT_Z2D);
        if(cuError != CUFFT_SUCCESS)
        {
            cout<<"[HCS] Erreur lors de la creattion du plan IFFT (ligne "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_PLAN;
            return reserr;
        }
         
        // Compute FFT
        cuError = cufftExecD2Z(planD2Z, (cufftDoubleReal*)im_d, IM); // Implicitly FORWARD
        if(cuError != CUFFT_SUCCESS)
        {
            cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_FFT;
            return reserr;
        }
         
        // Dimensions for a block and a grid
        dim3 dimBlock(NB_THREADS, NB_THREADS);
        int nbBlocsW = imW/NB_THREADS;
        if((imW%NB_THREADS) != 0)
            nbBlocsW++;
        int nbBlocsH = imH/NB_THREADS;
        if((imH%NB_THREADS) != 0)
            nbBlocsH++;
        dim3 dimGrid(nbBlocsW, nbBlocsH);
     
     
        // Compute FFT
        cuError = cufftExecD2Z(planD2Z, (cufftDoubleReal*)im_d, IM); // Implicitly FORWARD
        if(cuError != CUFFT_SUCCESS)
        {
            cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_FFT;
            return reserr;
        }
         
        kernel_fftshift2D<<<dimGrid, dimBlock>>>(IM, imH, imW);
     
     
        // ---- Filtering ---- //
        kernel_DoGFiltering<<<dimGrid, dimBlock>>>(IM, imH, imW);
     
        kernel_fftshift2D<<<dimGrid, dimBlock>>>(IM, imH, imW);
     
         
        // compute IFFTs
        cuError = cufftExecZ2D(planZ2D, IM, (cufftDoubleReal*)im_d); // Implicitly INVERSE
        if(cuError != CUFFT_SUCCESS)
        {
            cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_FFT;
            return reserr;
        }
         
         
        cudaFree(im_d);
        cudaFree(IM);
        cufftDestroy(planD2Z);
        cufftDestroy(planZ2D);
     
        delete[] im;
        delete[] size;
     
        return 0;
    }
    
  • kernel.h
  • #ifndef DEF_HCSKERNEL
    #define DEF_HCSKERNEL
     
    #include <cuda_runtime.h>
    #include <cufft.h>
     
    // On the device
    #define NB_THREADS      32
     
     
    enum GPU_HCS_Error
    {
        GPU_HCS_SUCCESS,
        GPU_HCS_ERROR_ALLOCMEM,
        GPU_HCS_ERROR_FREEMEM,
        GPU_HCS_ERROR_CPYMEM,
        GPU_HCS_ERROR_TIME,
        GPU_HCS_ERROR_PLAN,
        GPU_HCS_ERROR_FFT
    };
     
    __global__ void kernel_fftshift2D(cufftDoubleComplex *IM, int imH, int imW);
    __global__ void kernel_DoGFiltering(cufftDoubleComplex *IM, int imH, int imW);
     
    #endif
    
  • kernel.cu
  • #include "HCS_kernel.h"
     
     
    __global__ void kernel_fftshift2D(cufftDoubleComplex *IM, int imH, int imW)
    {
        int idx = blockIdx.x*blockDim.x + threadIdx.x;
        int idy = blockIdx.y*blockDim.y + threadIdx.y;
        int ind = idy*imW + idx;
        int x, y, indshift;
        cufftDoubleComplex v;
         
         
        if(idx < imW && idy < imH/2)
        {
            // A OPTIMISER
            if(idx<imW/2 && idy<imH/2)
            {
                x=idx+imW/2;
                y=idy+imH/2;
            }
            else if(idx>=imW/2 && idy<imH/2)
            {
                x=idx-imW/2;
                y=idy+imH/2;
            }
         
            indshift = y*imW+x;
            v.x = IM[ind].x;
            v.y = IM[ind].y;
         
            IM[ind].x = IM[indshift].x;
            IM[ind].y = IM[indshift].y;
         
            IM[indshift].x = v.x;
            IM[indshift].y = v.y;
        }
    }
     
     
    __global__ void kernel_DoGFiltering(cufftDoubleComplex *IM, int imH, int imW)
    {
        int idx = blockIdx.x*blockDim.x + threadIdx.x;
        int idy = blockIdx.y*blockDim.y + threadIdx.y;
        int ind = idy*imW + idx;
        double u, v, rad, rad2, DoG, fact;
        double sigma1 = 0.2, sigma2 = 0.1;
         
        // Mask the treads
        if(idx < imW && idy < imH)
        {
            // Convert indices 0..(N-1) -> -0.5..0.5
            u = (((double)idx)/(((double)imW)-1.)) - 0.5;
            v = (((double)idx)/(((double)imH)-1.)) - 0.5;
            rad2 = u*u + v*v;
            rad = sqrt(rad2);
             
            // Compute the Difference of Gaussian
            DoG = exp(-rad2/(2*sigma1*sigma1)) - exp(-rad2/(2*sigma2*sigma2));
                 
            fact = (double) 1./(imH*imW);
            IM[ind].x = IM[ind].x * DoG * fact;
            IM[ind].y = IM[ind].y * DoG * fact;
        }
    }
    

    I found some ways to shift the spectrum in a more efficient way but I wanted a simple implementation to better understand what happen.

    For me, the Matlab and CUDA codes are equivalent, but the results show that they are not and I could not find what is wrong in my CUDA implementation.

    Do you have any idea?
    Thanks in advance.

    This code in your kernel_DoGFiltering looks broken:

        u = (((double)idx)/(((double)imW)-1.)) - 0.5;
        v = (((double)idx)/(((double)imH)-1.)) - 0.5;
    

    shouldn’t one of those idx actually (for v) be idy ?

    Thanks for your answer,
    you are right, I made a mistake at this line.
    Unfortunately, the shifted image is only attenuate, as you can see in the following image :

    Just in case, I also changed the line 60 in the same kernel :

    fact = 1./((double) (imH*imW));
    

    It does not change the result either.

    Does Matlab handle some stuff when computing the FFT that cuFFT does not?
    Both are based on the fftw algorithms but maybe I missed something in this way?

    You may want to take a look at the Data Layout section of the CUFFT documentation.

    http://docs.nvidia.com/cuda/cufft/index.html#data-layout

    A CUFFT Real-to-Complex transform does not produce the same number of output elements as the input:

    “Finally, R2C demands an input array (X1,X2,…,Xn) of real values and returns an array (x1,x2,…,x[n/2]+1) of non-redundant complex elements”

    Likewise the C2R (inverse case) has a similar disparity between input and output elements. This may be the source of your shifted image.

    You may want to review the simpleCUFFT cuda sample code, which does a similar operation (1D forward transform, filtering in the frequency domain, followed by 1D inverse transform):

    http://docs.nvidia.com/cuda/cuda-samples/index.html#simple-cufft

    Thanks for the links. Indeed, I missed the Data layout section.

    However, I saw the simpleCUFFT example and there is a little difference between this code and what I want to do : I define the filter directly in the frequency domain whereas the filter is defined in the spatial domain and then its FFT is compute in this example.

    This link is still hard to me to understand.
    The complex array only carries non redundant information because of the symmetry of the spectrum, am I right? In this case, is the fftshift necessary?

    So I have to allocate an array of size n/2+1? I have juste tried this and it does not work. I assume the solution is more subtle. My main question is more about the filter definition. Do I just have to define it for the positive frequency only?

    Hi all,
    I tried to modify my programs in several ways. I applied the filter only for idy < (imH/2+1). Since my data are stored ina arow-major way, I also tried for idx < (imW/2+1). I still can not obtain the good filtered image.

    Do you have any clue?

    Thanks in advance.

    I think your problem is that you haven’t taken into account data layout correctly. After fiddling with your code to create something I could run, I was able to reproduce your ghost images.

    As a first step to get around this problem, I advise converting all of your FFT operations to complex-to-complex (Z2Z) transforms. Once you understand and agree that this produces the output you want, then you can work on understanding how to do a D2Z (and Z2D) transform.

    Here’s a fully worked example for me that does not produce the ghost images (using Z2Z forward and inverse transforms):

    #include <iostream>
    #include <cmath>
    #include <cufft.h>
    #include "pgm.h"
    
    // On the device
    #define NB_THREADS      32
    
    
    enum GPU_HCS_Error
    {
        GPU_HCS_SUCCESS,
        GPU_HCS_ERROR_ALLOCMEM,
        GPU_HCS_ERROR_FREEMEM,
        GPU_HCS_ERROR_CPYMEM,
        GPU_HCS_ERROR_TIME,
        GPU_HCS_ERROR_PLAN,
        GPU_HCS_ERROR_FFT
    };
    
    __global__ void kernel_fftshift2D(cufftDoubleComplex *IM, int imH, int imW);
    __global__ void kernel_DoGFiltering(cufftDoubleComplex *IM, int imH, int imW);
    
    
    __global__ void kernel_fftshift2D(cufftDoubleComplex *IM, int imH, int imW)
    {
        int idx = blockIdx.x*blockDim.x + threadIdx.x;
        int idy = blockIdx.y*blockDim.y + threadIdx.y;
        int ind = idy*imW + idx;
        int x, y, indshift;
        cufftDoubleComplex v;
    
    
        if(idx < imW && idy < imH/2)
        {
            // A OPTIMISER
            if(idx<imW/2 && idy<imH/2)
            {
                x=idx+imW/2;
                y=idy+imH/2;
            }
            else if(idx>=imW/2 && idy<imH/2)
            {
                x=idx-imW/2;
                y=idy+imH/2;
            }
    
            indshift = y*imW+x;
            v.x = IM[ind].x;
            v.y = IM[ind].y;
    
            IM[ind].x = IM[indshift].x;
            IM[ind].y = IM[indshift].y;
    
            IM[indshift].x = v.x;
            IM[indshift].y = v.y;
        }
    }
    
    
    __global__ void kernel_DoGFiltering(cufftDoubleComplex *IM, int imH, int imW)
    {
        int idx = blockIdx.x*blockDim.x + threadIdx.x;
        int idy = blockIdx.y*blockDim.y + threadIdx.y;
        int ind = idy*imW + idx;
        double u, v, rad2, DoG, fact;
        double sigma1 = 0.2, sigma2 = 0.1;
    
        // Mask the treads
        if(idx < imW && idy < imH)
        {
            // Convert indices 0..(N-1) -> -0.5..0.5
            u = (((double)idx)/(((double)imW)-1.)) - 0.5;
            v = (((double)idy)/(((double)imH)-1.)) - 0.5;
            rad2 = u*u + v*v;
            //rad = sqrt(rad2);
    
            // Compute the Difference of Gaussian
            DoG = exp(-rad2/(2*sigma1*sigma1)) - exp(-rad2/(2*sigma2*sigma2));
    
            fact = (double) 1./(imH*imW);
            IM[ind].x = IM[ind].x * DoG * fact;
            IM[ind].y = IM[ind].y * DoG * fact;
        }
    }
    
    using namespace std;
    
    int main(int argc, char* argv[])
    {
        int imH, imW;
        cufftDoubleComplex *im, *im_d;
        cudaError_t error;          // To handle CUDA errors (time handler)
        GPU_HCS_Error reserr = GPU_HCS_SUCCESS; // Result for the main program
        cufftResult cuError;            // cuFFT errors
        cufftDoubleComplex *IM;         // Image spectrum on GPU
        cufftHandle planFZ2Z, planIZ2Z;           // Plans for FFT and IFFT
        PGMImage img_in, img_out;
        // Read Images
    
        getPGMfile("Lena.pgm", &img_in);
        imH = img_in.height;
        imW = img_in.width;
        img_out.height = img_in.height;
        img_out.width = img_in.width;
        img_out.maxVal = img_in.maxVal;
    
        im = (cufftDoubleComplex *)malloc(imW*imH*sizeof(cufftDoubleComplex));
        for (int i = 0; i < imH; i++)
          for (int j = 0; j < imW; j++){
            im[(imW*i)+j].x = (img_in.data[i][j].red + img_in.data[i][j].green + img_in.data[i][j].blue)/(3.0*img_in.maxVal);
            im[(imW*i)+j].y = 0.0;}
        // Memory allocation
        error = cudaMalloc((void**) &im_d, imW*imH*sizeof(cufftDoubleComplex));
        if(error != cudaSuccess)
        {
            cout<<"[HCS] Error Memory Allocation (line "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_ALLOCMEM;
            return reserr;
        }
    
        error = cudaMalloc((void**) &IM, imW*imH*sizeof(cufftDoubleComplex));
        if(error != cudaSuccess)
        {
            cout<<"[HCS] Error Memory Allocation (line "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_ALLOCMEM;
            return reserr;
        }
    
        // Copy data
        error = cudaMemcpy(im_d, im, imW*imH*sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice);
        if(error != cudaSuccess)
        {
            cout<<"[HCS] Erreur lors de la copie de l'image 1 sur le peripherique (ligne "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_CPYMEM;
            return reserr;
        }
    
        // ---- Compute FFT ---- //
        // Create 2D plans
        cuError = cufftPlan2d(&planFZ2Z, imW, imH, CUFFT_Z2Z);
        if(cuError != CUFFT_SUCCESS)
        {
            cout<<"[HCS] Erreur lors de la creattion du plan FFT (ligne "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_PLAN;
            return reserr;
        }
    
        cuError = cufftPlan2d(&planIZ2Z, imW, imH, CUFFT_Z2Z);
        if(cuError != CUFFT_SUCCESS)
        {
            cout<<"[HCS] Erreur lors de la creattion du plan IFFT (ligne "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_PLAN;
            return reserr;
        }
    
        // Compute FFT
        cuError = cufftExecZ2Z(planFZ2Z, im_d, IM, CUFFT_FORWARD); 
        if(cuError != CUFFT_SUCCESS)
        {
            cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_FFT;
            return reserr;
        }
    
        // Dimensions for a block and a grid
        dim3 dimBlock(NB_THREADS, NB_THREADS);
        int nbBlocsW = imW/NB_THREADS;
        if((imW%NB_THREADS) != 0)
            nbBlocsW++;
        int nbBlocsH = imH/NB_THREADS;
        if((imH%NB_THREADS) != 0)
            nbBlocsH++;
        dim3 dimGrid(nbBlocsW, nbBlocsH);
    
    
    
        kernel_fftshift2D<<<dimGrid, dimBlock>>>(IM, imH, imW);
    
    
        // ---- Filtering ---- //
        kernel_DoGFiltering<<<dimGrid, dimBlock>>>(IM, imH, imW);
    
        kernel_fftshift2D<<<dimGrid, dimBlock>>>(IM, imH, imW);
    
    
        // compute IFFTs
        cuError = cufftExecZ2Z(planIZ2Z, IM, im_d, CUFFT_INVERSE); 
        if(cuError != CUFFT_SUCCESS)
        {
            cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
            reserr = GPU_HCS_ERROR_FFT;
            return reserr;
        }
    
        cudaMemcpy(im, im_d, imW*imH*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost);
    #define SHIFT 0.2
        for (int i = 0; i < imH; i++)
          for (int j = 0; j < imW; j++){
            if ((im[(i*imW)+j].x+SHIFT)< 0.0) printf("%f\n", im[(i*imW)+j].x);
            unsigned temp = ((im[(i*imW)+j].x+SHIFT)/((double)(1)))*img_out.maxVal;
            if (temp > img_out.maxVal) {printf(" output saturation: %u\n", temp); return 1;}
            img_out.data[i][j].red =   temp;
            img_out.data[i][j].green = temp;
            img_out.data[i][j].blue =  temp;}
        save(&img_out);
        cudaFree(im_d);
        cudaFree(IM);
        cufftDestroy(planFZ2Z);
        cufftDestroy(planIZ2Z);
    
        delete[] im;
    
        return 0;
    }
    

    I used the Lena.pgm image that gets installed with the CUDA samples.
    The pgm.cpp and pgm.h files I used can be found here:

    http://sun.iwu.edu/~shelley/sie/zoo/journal/pgm.c.html
    http://sun.iwu.edu/~shelley/sie/zoo/journal/pgm.h.html

    I compiled like so:

    nvcc -arch=sm_20 -o t503 t503.cu pgm.cpp -lcufft

    Notice how I provided a complete example that doesn’t depend on any unknown libraries or files?
    In the future if you want help, I suggest you do the same.

    I also noticed while fiddling with the code you have posted here, that you are doing the forward transform twice. I don’t think this affects the results, but my code differs from yours noticeably in that respect also. The only change I made to the kernels was to fix the issue I already pointed out.