cufftExecR2C generates different results on GeForce 10 series and GeForce 20 series GPUs

Hello,

I computed the FFT and read back the real and imaginary components on a GeForce 10 series GPU and on a GeForce 20 series GPU, and I did not obtain the same results. On a 32-bit floating-point image, the differences reach up to 1000 for the real component and up to 1300 for the imaginary component.

Is this normal?

Here is the code:

#include <iostream>

#include <opencv2/highgui.hpp>
#include <opencv2/imgcodecs.hpp>

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cufft.h>

#include <kernel.hpp>

#include <assert.h>

// Checks the cudaError_t produced by `code` and reports a failure together with
// the source location of the call site.
// The body is wrapped in do { } while (0) so the macro expands to exactly one
// statement and stays safe inside an unbraced if/else.
#define checkCudaError(code)                   \
    do {                                       \
        gpuAssert((code), __FILE__, __LINE__); \
    } while (0)

// Prints the CUDA error string plus file/line to stderr when `code` is not
// cudaSuccess, then terminates the process with `code` as the exit status
// unless `abort` is false. Does nothing on cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "CUDA assert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}

// Loads a SIZE x SIZE image, runs a 2D real-to-complex cuFFT on it, splits the
// complex spectrum into real and imaginary planes on the device, and writes
// both planes to disk as 32-bit float images.
// Returns 0 on success and 1 on an input, cuFFT, or output failure; CUDA
// runtime failures terminate the process through checkCudaError.
int main()
{
    constexpr size_t SIZE { 1024 };
    // An R2C transform only stores the non-redundant half of the last dimension.
    constexpr size_t HALFSIZE { (SIZE / 2) + 1 };

    // cuFFT reports failures through its own cufftResult codes, which the CUDA
    // runtime's last-error state does not reflect, so they are checked here.
    const auto cufftOk = [](cufftResult status, const char* call) {
        if (status == CUFFT_SUCCESS)
            return true;
        std::cerr << "cuFFT error " << static_cast<int>(status) << " returned by " << call << '\n';
        return false;
    };

    const auto input_image = [] {
        const auto image { cv::imread("C:/tmp/input.tiff", cv::ImreadModes::IMREAD_ANYDEPTH) };
        if (image.empty())
            return cv::Mat {};
        cv::Mat float_image;
        image.convertTo(float_image, CV_32F);
        return float_image;
    }();

    // The upload below copies SIZE * SIZE floats straight out of
    // input_image.data, so anything smaller or non-contiguous would be read
    // out of bounds.
    if (input_image.empty() || !input_image.isContinuous() || input_image.channels() != 1
        || static_cast<size_t>(input_image.rows) != SIZE
        || static_cast<size_t>(input_image.cols) != SIZE) {
        std::cerr << "Input image must be a readable single-channel " << SIZE << "x" << SIZE << " image\n";
        return 1;
    }

    float* device_input { nullptr };
    cufftComplex* device_fft { nullptr };
    float* device_real { nullptr };
    float* device_imag { nullptr };
    cufftHandle plan;

    checkCudaError(cudaMalloc((void**)&device_input, SIZE * SIZE * sizeof(float)));
    checkCudaError(cudaMalloc((void**)&device_fft, SIZE * HALFSIZE * sizeof(cufftComplex)));
    checkCudaError(cudaMalloc((void**)&device_real, SIZE * HALFSIZE * sizeof(float)));
    checkCudaError(cudaMalloc((void**)&device_imag, SIZE * HALFSIZE * sizeof(float)));
    if (!cufftOk(cufftPlan2d(&plan, SIZE, SIZE, CUFFT_R2C), "cufftPlan2d"))
        return 1;

    checkCudaError(cudaMemcpy(device_input, input_image.data, SIZE * SIZE * sizeof(float), cudaMemcpyHostToDevice));

    if (!cufftOk(cufftExecR2C(plan, device_input, device_fft), "cufftExecR2C"))
        return 1;
    checkCudaError(cudaDeviceSynchronize());

    // extractFFTParts comes from kernel.hpp (not visible here); presumably it
    // launches the extraction kernel - verify against that header.
    extractFFTParts(device_fft, device_real, device_imag, SIZE, HALFSIZE);
    checkCudaError(cudaPeekAtLastError());
    checkCudaError(cudaDeviceSynchronize());

    // SIZE rows of HALFSIZE columns, matching the SIZE * HALFSIZE element buffers.
    cv::Mat real_image(static_cast<int>(SIZE), static_cast<int>(HALFSIZE), CV_32F);
    checkCudaError(cudaMemcpy(real_image.data, device_real, SIZE * HALFSIZE * sizeof(float), cudaMemcpyDeviceToHost));

    cv::Mat imag_image(static_cast<int>(SIZE), static_cast<int>(HALFSIZE), CV_32F);
    checkCudaError(cudaMemcpy(imag_image.data, device_imag, SIZE * HALFSIZE * sizeof(float), cudaMemcpyDeviceToHost));

    // A failed plan teardown is reported but does not prevent the results,
    // which are already on the host, from being written.
    cufftOk(cufftDestroy(plan), "cufftDestroy");
    checkCudaError(cudaFree(device_imag));
    checkCudaError(cudaFree(device_real));
    checkCudaError(cudaFree(device_fft));
    checkCudaError(cudaFree(device_input));

    // Attempt both writes even if the first one fails.
    const bool real_written = cv::imwrite("C:/tmp/real.tiff", real_image);
    const bool imag_written = cv::imwrite("C:/tmp/imag.tiff", imag_image);
    if (!real_written || !imag_written) {
        std::cerr << "Failed to write one or both output images\n";
        return 1;
    }

    return 0;
}

And here is the kernel called by extractFFTParts:

// Splits an array of complex values into separate real and imaginary arrays.
//
// Each thread handles one element, addressed by a 2D thread index (x, y) and
// flattened as i = y * w + x. All three buffers are indexed with that same
// linear offset, so each must hold at least w * h elements.
//
// Launch: a 2D grid of 2D blocks covering at least w threads in x and h
// threads in y; threads outside that range do nothing.
//
// fft  - input complex values
// real - output, receives the .x member of each input element
// imag - output, receives the .y member of each input element
// w, h - extents of the index space in x and y
__global__ void kernel(cufftComplex* fft, float* real, float* imag, size_t w, size_t h)
{
    // Do the index arithmetic in size_t: it avoids signed/unsigned comparisons
    // against w and h and keeps the linear index from overflowing int on
    // large buffers.
    const size_t x = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t y = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;

    if (x < w && y < h) {
        const size_t i = y * w + x;
        const cufftComplex value = fft[i];
        real[i] = value.x;
        imag[i] = value.y;
    }
}

System information:
CUDA ToolKit : 10.2
Windows : 10
IDE : Visual Studio 2019

Yes, this is expected. From the cuFFT 12.3 documentation:

Results produced by the cuFFT library are deterministic (ie, bitwise reproducible) as long as the following are kept constant between runs: plan input parameters, cuFFT version, and GPU model.

1 Like

Thanks, I did not see that.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.