cufftExecR2C generates different results on GeForce 10 series and 20 series GPUs

Hello,

I computed an FFT and extracted the real and imaginary components on a GeForce 10 series GPU and on a GeForce 20 series GPU, and I did not obtain the same results. On a 32-bit floating-point image, I see differences of up to 1000 for the real component and up to 1300 for the imaginary component.

Is this normal?

Here is the code:

#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <opencv2/highgui.hpp>
#include <opencv2/imgcodecs.hpp>

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cufft.h>

#include <kernel.hpp>

#include <assert.h>

/**
 * Checks the result of a CUDA runtime call and reports a failure.
 *
 * Wrapped in do { } while (0) so that `checkCudaError(x);` behaves as a single
 * statement everywhere, including as the unbraced body of an if/else.
 */
#define checkCudaError(code)                   \
    do {                                       \
        gpuAssert((code), __FILE__, __LINE__); \
    } while (0)

/**
 * Prints a diagnostic when a CUDA runtime call did not succeed.
 *
 * @param code  Status returned by the CUDA runtime call.
 * @param file  Source file of the call site (normally __FILE__).
 * @param line  Source line of the call site (normally __LINE__).
 * @param abort When true (the default), terminate the process with the error
 *              code as exit status after printing the diagnostic.
 */
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "CUDA assert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort)
            exit(code);
    }
}

/**
 * Loads a SIZE x SIZE image, runs a 2D real-to-complex FFT on the GPU, splits
 * the spectrum into real and imaginary planes and writes both as TIFF files.
 *
 * The spectrum buffers hold SIZE * HALFSIZE elements, where
 * HALFSIZE = SIZE / 2 + 1.
 *
 * @return EXIT_SUCCESS when both output images were written, EXIT_FAILURE when
 *         the input image is unusable or an output image could not be written.
 *         CUDA and cuFFT failures terminate the process from the check helpers.
 */
int main()
{
    constexpr size_t SIZE { 1024 };
    constexpr size_t HALFSIZE { (SIZE / 2) + 1 };

    // cuFFT calls report their status through the returned cufftResult; the
    // CUDA runtime's last-error state is not guaranteed to reflect a cuFFT
    // failure, so every cuFFT call is checked explicitly.
    const auto checkCufft = [](cufftResult status, const char* call) {
        if (status != CUFFT_SUCCESS) {
            std::cerr << "cuFFT error " << static_cast<int>(status) << " in " << call << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    const auto input_image = [] {
        const auto image { cv::imread("C:/tmp/input.tiff", cv::ImreadModes::IMREAD_ANYDEPTH) };
        cv::Mat float_image;
        image.convertTo(float_image, CV_32F);
        return float_image;
    }();

    // The upload below copies exactly SIZE * SIZE floats from the image buffer.
    // Refuse anything that does not provide that many contiguous values
    // (missing file, wrong dimensions, extra channels) instead of reading
    // out of bounds.
    if (input_image.empty() || input_image.channels() != 1 || !input_image.isContinuous()
        || static_cast<size_t>(input_image.rows) != SIZE
        || static_cast<size_t>(input_image.cols) != SIZE) {
        std::cerr << "Expected a single-channel " << SIZE << "x" << SIZE
                  << " image at C:/tmp/input.tiff" << std::endl;
        return EXIT_FAILURE;
    }

    float* device_input { nullptr };
    cufftComplex* device_fft { nullptr };
    float* device_real { nullptr };
    float* device_imag { nullptr };
    cufftHandle plan;

    checkCudaError(cudaMalloc((void**)&device_input, SIZE * SIZE * sizeof(float)));
    checkCudaError(cudaMalloc((void**)&device_fft, SIZE * HALFSIZE * sizeof(cufftComplex)));
    checkCudaError(cudaMalloc((void**)&device_real, SIZE * HALFSIZE * sizeof(float)));
    checkCudaError(cudaMalloc((void**)&device_imag, SIZE * HALFSIZE * sizeof(float)));
    checkCufft(cufftPlan2d(&plan, SIZE, SIZE, CUFFT_R2C), "cufftPlan2d");

    checkCudaError(cudaMemcpy(device_input, input_image.data, SIZE * SIZE * sizeof(float), cudaMemcpyHostToDevice));

    checkCufft(cufftExecR2C(plan, device_input, device_fft), "cufftExecR2C");
    // Synchronize so that an execution failure of the transform is reported
    // here rather than at a later, unrelated call.
    checkCudaError(cudaDeviceSynchronize());

    extractFFTParts(device_fft, device_real, device_imag, SIZE, HALFSIZE);
    checkCudaError(cudaPeekAtLastError());
    checkCudaError(cudaDeviceSynchronize());

    // Host images with SIZE rows and HALFSIZE columns receive the two planes.
    cv::Mat real_image(static_cast<int>(SIZE), static_cast<int>(HALFSIZE), CV_32F);
    checkCudaError(cudaMemcpy(real_image.data, device_real, SIZE * HALFSIZE * sizeof(float), cudaMemcpyDeviceToHost));

    cv::Mat imag_image(static_cast<int>(SIZE), static_cast<int>(HALFSIZE), CV_32F);
    checkCudaError(cudaMemcpy(imag_image.data, device_imag, SIZE * HALFSIZE * sizeof(float), cudaMemcpyDeviceToHost));

    checkCufft(cufftDestroy(plan), "cufftDestroy");
    checkCudaError(cudaFree(device_imag));
    checkCudaError(cudaFree(device_real));
    checkCudaError(cudaFree(device_fft));
    checkCudaError(cudaFree(device_input));

    // Attempt both writes even if the first one fails, then report.
    const bool real_written = cv::imwrite("C:/tmp/real.tiff", real_image);
    const bool imag_written = cv::imwrite("C:/tmp/imag.tiff", imag_image);
    if (!real_written || !imag_written) {
        std::cerr << "Failed to write output image(s) to C:/tmp" << std::endl;
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}

And here is the kernel called by extractFFTParts:

/**
 * Splits an array of complex values into separate real and imaginary arrays.
 *
 * Each thread derives a 2D position (x, y) from its block and thread indices
 * and handles the element at flat index x + y * w. Threads whose position
 * falls outside x < w, y < h do nothing, so the launch grid may be larger
 * than the data.
 *
 * @param fft  Input complex values, at least w * h elements.
 * @param real Output for the .x members, at least w * h elements.
 * @param imag Output for the .y members, at least w * h elements.
 * @param w    Extent along the x (fastest-varying) dimension.
 * @param h    Extent along the y dimension.
 */
__global__ void kernel(cufftComplex* fft, float* real, float* imag, size_t w, size_t h)
{
    // Index arithmetic is done in size_t: it matches the type of w and h (no
    // signed/unsigned comparison) and the flat index is not narrowed to int,
    // which would be wrong once w * h no longer fits in an int.
    const size_t x = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t y = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;

    if (x < w && y < h) {
        const size_t i = x + y * w;
        const cufftComplex value = fft[i];
        real[i] = value.x;
        imag[i] = value.y;
    }
}

System information:
CUDA ToolKit : 10.2
Windows : 10
IDE : Visual Studio 2019

Yes, this is expected: bitwise-identical results are not guaranteed across different GPU models. From the cuFFT section of the CUDA Toolkit Documentation:

Results produced by the cuFFT library are deterministic (ie, bitwise reproducible) as long as the following are kept constant between runs: plan input parameters, cuFFT version, and GPU model.

1 Like

Thanks, I did not see that.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.