Hello everyone,
I have a program in Matlab and I want to translate it in C++/Cuda. I think succeed quite well except for the filtering part.
In my Matlab code, I define the filter (a Difference of Gaussian) directly in the frequency domain. I want to do the same in CUDA. The problem is that my CUDA code does not work well.
I tried to reduce the code to only filter the images.
The matlab version is :
im = ...; % Read the image
IM = fftshift(fft2(im));
[L, C] = size(IM);
u = ((1:C)-1)/(C-1) - 0.5
v = ((1:L)-1)/(L-1) - 0.5;
[U, V] = meshgrid(u, v);
rad2 = U.*U + V.*V;
sigma1 = 0.2; sigma2 = 0.1;
dog = exp(-rad2/(2*sigma1*sigma1)) - exp(-rad2/(2*sigma2*sigma2));
imb_matlab = real(ifft2(ifftshift(dog.*IM)));
And this is what I get with matlab :
In CUDA the result is different :
Here is the C++/CUDA code :
#include <iostream>
#include <cmath>
#include "FileManager.h"
#include "HCS_kernel.h"
using namespace std;
int main(int argc, char* argv[])
{
int *size;
int dim, imH, imW;
double *im, *im_d;
FileManager FM;
FileManager_Error FMerror;
cudaError_t error; // To handle CUDA errors (time handler)
GPU_HCS_Error reserr = GPU_HCS_SUCCESS; // Result for the main program
cufftResult cuError; // cuFFT errors
cufftDoubleComplex *IM; // Image spectrum on GPU
cufftHandle planD2Z, planZ2D; // Plans for FFT and IFFT
// Read Images
FMerror = FM.readImage("../HCS/lena1.bin", &im, &dim, &size);
if(FMerror != FM_SUCCESS)
return FMerror;
imH = size[0];
imW = size[1];
// Memory allocation
error = cudaMalloc((void**) &im_d, imW*imH*sizeof(double));
if(error != cudaSuccess)
{
cout<<"[HCS] Error Memory Allocation (line "<<__LINE__<<")"<<endl;
reserr = GPU_HCS_ERROR_ALLOCMEM;
return reserr;
}
error = cudaMalloc((void**) &IM, imW*imH*sizeof(cufftDoubleComplex));
if(error != cudaSuccess)
{
cout<<"[HCS] Error Memory Allocation (line "<<__LINE__<<")"<<endl;
reserr = GPU_HCS_ERROR_ALLOCMEM;
return reserr;
}
// Copy data
error = cudaMemcpy(im_d, (double*)im, imW*imH*sizeof(double), cudaMemcpyHostToDevice);
if(error != cudaSuccess)
{
cout<<"[HCS] Erreur lors de la copie de l'image 1 sur le peripherique (ligne "<<__LINE__<<")"<<endl;
reserr = GPU_HCS_ERROR_CPYMEM;
return reserr;
}
// ---- Compute FFT ---- //
// Create 2D plans
cuError = cufftPlan2d(&planD2Z, imW, imH, CUFFT_D2Z);
if(cuError != CUFFT_SUCCESS)
{
cout<<"[HCS] Erreur lors de la creattion du plan FFT (ligne "<<__LINE__<<")"<<endl;
reserr = GPU_HCS_ERROR_PLAN;
return reserr;
}
cuError = cufftPlan2d(&planZ2D, imW, imH, CUFFT_Z2D);
if(cuError != CUFFT_SUCCESS)
{
cout<<"[HCS] Erreur lors de la creattion du plan IFFT (ligne "<<__LINE__<<")"<<endl;
reserr = GPU_HCS_ERROR_PLAN;
return reserr;
}
// Compute FFT
cuError = cufftExecD2Z(planD2Z, (cufftDoubleReal*)im_d, IM); // Implicitly FORWARD
if(cuError != CUFFT_SUCCESS)
{
cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
reserr = GPU_HCS_ERROR_FFT;
return reserr;
}
// Dimensions for a block and a grid
dim3 dimBlock(NB_THREADS, NB_THREADS);
int nbBlocsW = imW/NB_THREADS;
if((imW%NB_THREADS) != 0)
nbBlocsW++;
int nbBlocsH = imH/NB_THREADS;
if((imH%NB_THREADS) != 0)
nbBlocsH++;
dim3 dimGrid(nbBlocsW, nbBlocsH);
// Compute FFT
cuError = cufftExecD2Z(planD2Z, (cufftDoubleReal*)im_d, IM); // Implicitly FORWARD
if(cuError != CUFFT_SUCCESS)
{
cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
reserr = GPU_HCS_ERROR_FFT;
return reserr;
}
kernel_fftshift2D<<<dimGrid, dimBlock>>>(IM, imH, imW);
// ---- Filtering ---- //
kernel_DoGFiltering<<<dimGrid, dimBlock>>>(IM, imH, imW);
kernel_fftshift2D<<<dimGrid, dimBlock>>>(IM, imH, imW);
// compute IFFTs
cuError = cufftExecZ2D(planZ2D, IM, (cufftDoubleReal*)im_d); // Implicitly INVERSE
if(cuError != CUFFT_SUCCESS)
{
cout<<"[HCS] Error "<<cuError<<" during executing CUFFT (line "<<__LINE__<<")"<<endl;
reserr = GPU_HCS_ERROR_FFT;
return reserr;
}
cudaFree(im_d);
cudaFree(IM);
cufftDestroy(planD2Z);
cufftDestroy(planZ2D);
delete[] im;
delete[] size;
return 0;
}
#ifndef DEF_HCSKERNEL
#define DEF_HCSKERNEL
#include <cuda_runtime.h>
#include <cufft.h>
// On the device
#define NB_THREADS 32
enum GPU_HCS_Error
{
GPU_HCS_SUCCESS,
GPU_HCS_ERROR_ALLOCMEM,
GPU_HCS_ERROR_FREEMEM,
GPU_HCS_ERROR_CPYMEM,
GPU_HCS_ERROR_TIME,
GPU_HCS_ERROR_PLAN,
GPU_HCS_ERROR_FFT
};
__global__ void kernel_fftshift2D(cufftDoubleComplex *IM, int imH, int imW);
__global__ void kernel_DoGFiltering(cufftDoubleComplex *IM, int imH, int imW);
#endif
#include "HCS_kernel.h"
__global__ void kernel_fftshift2D(cufftDoubleComplex *IM, int imH, int imW)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
int idy = blockIdx.y*blockDim.y + threadIdx.y;
int ind = idy*imW + idx;
int x, y, indshift;
cufftDoubleComplex v;
if(idx < imW && idy < imH/2)
{
// A OPTIMISER
if(idx<imW/2 && idy<imH/2)
{
x=idx+imW/2;
y=idy+imH/2;
}
else if(idx>=imW/2 && idy<imH/2)
{
x=idx-imW/2;
y=idy+imH/2;
}
indshift = y*imW+x;
v.x = IM[ind].x;
v.y = IM[ind].y;
IM[ind].x = IM[indshift].x;
IM[ind].y = IM[indshift].y;
IM[indshift].x = v.x;
IM[indshift].y = v.y;
}
}
__global__ void kernel_DoGFiltering(cufftDoubleComplex *IM, int imH, int imW)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
int idy = blockIdx.y*blockDim.y + threadIdx.y;
int ind = idy*imW + idx;
double u, v, rad, rad2, DoG, fact;
double sigma1 = 0.2, sigma2 = 0.1;
// Mask the treads
if(idx < imW && idy < imH)
{
// Convert indices 0..(N-1) -> -0.5..0.5
u = (((double)idx)/(((double)imW)-1.)) - 0.5;
v = (((double)idx)/(((double)imH)-1.)) - 0.5;
rad2 = u*u + v*v;
rad = sqrt(rad2);
// Compute the Difference of Gaussian
DoG = exp(-rad2/(2*sigma1*sigma1)) - exp(-rad2/(2*sigma2*sigma2));
fact = (double) 1./(imH*imW);
IM[ind].x = IM[ind].x * DoG * fact;
IM[ind].y = IM[ind].y * DoG * fact;
}
}
I found some ways to shift the spectrum in a more efficient way but I wanted a simple implementation to better understand what happen.
For me, the Matlab and CUDA codes are equivalent, but the results show that they are not and I could not find what is wrong in my CUDA implementation.
Do you have any idea?
Thanks in advance.