Hello,
I computed a 2D FFT and read back the real and imaginary components on a GeForce 10-series GPU and on a GeForce 20-series GPU, and the two cards do not give the same result. On a 32-bit floating-point image, I see differences of up to 1000 in the real component and up to 1300 in the imaginary component.
Is this normal?
Here is the code:
#include <iostream>
#include <opencv2/highgui.hpp>
#include <opencv2/imgcodecs.hpp>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <kernel.hpp>
#include <assert.h>
// Evaluates a CUDA runtime call (or any cudaError_t expression) and forwards
// the result to gpuAssert together with the call site's file and line.
// The do/while(0) wrapper makes the macro expand to a single statement, so
// `if (cond) checkCudaError(x); else ...` parses as intended.
#define checkCudaError(code) \
    do { \
        gpuAssert((code), __FILE__, __LINE__); \
    } while (0)
// Reports a failed CUDA runtime call.
//
// code  : status returned by the CUDA runtime call being checked.
// file  : source file of the call site (printed in the message).
// line  : source line of the call site (printed in the message).
// abort : when true (the default) the process exits with `code` as its
//         exit status after the message is printed.
//
// Does nothing when `code` is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "CUDA assert: %s %s %d\n", cudaGetErrorString(code), file, line);

    if (!abort)
        return;

    exit(code);
}
// Loads a SIZE x SIZE image, converts it to 32-bit float, runs a 2D
// real-to-complex FFT on the GPU with cuFFT, splits the complex spectrum into
// separate real and imaginary planes via extractFFTParts, and writes both
// planes to disk.
//
// Returns 0 on success and 1 when the input image is unusable or an output
// image cannot be written. CUDA runtime failures terminate the process through
// checkCudaError; cuFFT failures print the cufftResult code and exit with 1.
int main()
{
    constexpr size_t SIZE { 1024 };
    constexpr size_t HALFSIZE { (SIZE / 2) + 1 };
    constexpr int ROWS { static_cast<int>(SIZE) };
    constexpr int SPECTRUM_COLS { static_cast<int>(HALFSIZE) };

    // cuFFT reports failures through cufftResult, not through the CUDA runtime
    // "last error" state, so its return codes have to be checked explicitly.
    const auto checkCufft = [](cufftResult status, const char* what) {
        if (status != CUFFT_SUCCESS) {
            std::cerr << "cuFFT error " << static_cast<int>(status) << " in " << what << '\n';
            exit(1);
        }
    };

    const auto input_image = [] {
        const auto image { cv::imread("C:/tmp/input.tiff", cv::ImreadModes::IMREAD_ANYDEPTH) };
        if (image.empty())
            return cv::Mat();
        cv::Mat float_image;
        image.convertTo(float_image, CV_32F);
        return float_image;
    }();

    // The upload below copies SIZE * SIZE floats straight out of the Mat
    // buffer, so the image must really be a contiguous single-channel
    // SIZE x SIZE float image; anything else would read out of bounds or
    // feed garbage to the FFT.
    if (input_image.empty() || input_image.type() != CV_32FC1 || input_image.rows != ROWS
        || input_image.cols != ROWS || !input_image.isContinuous()) {
        std::cerr << "Input image must be a readable single-channel " << SIZE << "x" << SIZE
                  << " image\n";
        return 1;
    }

    float* device_input { nullptr };
    cufftComplex* device_fft { nullptr };
    float* device_real { nullptr };
    float* device_imag { nullptr };
    cufftHandle plan;

    checkCudaError(cudaMalloc((void**)&device_input, SIZE * SIZE * sizeof(float)));
    checkCudaError(cudaMalloc((void**)&device_fft, SIZE * HALFSIZE * sizeof(cufftComplex)));
    checkCudaError(cudaMalloc((void**)&device_real, SIZE * HALFSIZE * sizeof(float)));
    checkCudaError(cudaMalloc((void**)&device_imag, SIZE * HALFSIZE * sizeof(float)));

    checkCufft(cufftPlan2d(&plan, ROWS, ROWS, CUFFT_R2C), "cufftPlan2d");

    checkCudaError(cudaMemcpy(device_input, input_image.data, SIZE * SIZE * sizeof(float),
        cudaMemcpyHostToDevice));

    checkCufft(cufftExecR2C(plan, device_input, device_fft), "cufftExecR2C");
    checkCudaError(cudaDeviceSynchronize());

    // NOTE(review): (SIZE, HALFSIZE) are passed as (w, h), while the host
    // images below are SIZE rows by HALFSIZE columns. This is harmless only if
    // extractFFTParts copies the buffer element-for-element by flat index
    // x + y * w over the full w * h range - confirm against its launch code.
    extractFFTParts(device_fft, device_real, device_imag, SIZE, HALFSIZE);
    checkCudaError(cudaPeekAtLastError());
    checkCudaError(cudaDeviceSynchronize());

    cv::Mat real_image(ROWS, SPECTRUM_COLS, CV_32F);
    checkCudaError(cudaMemcpy(real_image.data, device_real, SIZE * HALFSIZE * sizeof(float),
        cudaMemcpyDeviceToHost));
    cv::Mat imag_image(ROWS, SPECTRUM_COLS, CV_32F);
    checkCudaError(cudaMemcpy(imag_image.data, device_imag, SIZE * HALFSIZE * sizeof(float),
        cudaMemcpyDeviceToHost));

    checkCufft(cufftDestroy(plan), "cufftDestroy");
    checkCudaError(cudaFree(device_imag));
    checkCudaError(cudaFree(device_real));
    checkCudaError(cudaFree(device_fft));
    checkCudaError(cudaFree(device_input));

    const bool real_written = cv::imwrite("C:/tmp/real.tiff", real_image);
    const bool imag_written = cv::imwrite("C:/tmp/imag.tiff", imag_image);
    if (!real_written || !imag_written) {
        std::cerr << "Failed to write output image(s)\n";
        return 1;
    }
    return 0;
}
And here is the kernel launched by extractFFTParts:
// Splits an interleaved complex buffer into separate real and imaginary
// planes.
//
// fft  : input complex values, read at flat index x + y * w.
// real : output, receives the .x member of each input element.
// imag : output, receives the .y member of each input element.
// w, h : extent of the index space in x and y; w is also the row stride used
//        to form the flat index.
//
// Uses the x and y components of the block/thread indices; threads whose
// (x, y) falls outside w x h do nothing, so the launch grid may overshoot.
// Indices are computed in size_t so they compare cleanly against the size_t
// bounds and the flat index cannot overflow a 32-bit int on large buffers.
__global__ void kernel(cufftComplex* fft, float* real, float* imag, size_t w, size_t h)
{
    const size_t x = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t y = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
    if (x < w && y < h) {
        const size_t i = x + y * w;
        const cufftComplex value = fft[i];
        real[i] = value.x;
        imag[i] = value.y;
    }
}
System information:
CUDA Toolkit: 10.2
Windows : 10
IDE : Visual Studio 2019