Performance Degradation in cuFFT callback from CUDA 11.8 to 12.9+

Starting with CUDA 11.8, the performance of cuFFT load callbacks has degraded significantly. My application loads interleaved int16 complex data and performs C2C FFTs, using a load callback to convert the int16 complex samples to float complex as they are read.

  • Version 11.7 took 113 ms.
  • Version 12.9 took 459 ms.

Can the new LTO callbacks resolve this performance degradation?
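
For reference, here is the direction I was considering for the LTO path, pieced together from the cuFFT documentation. This is only a sketch: it assumes cufftXtSetJITCallback from the newer cuFFT releases, that the load callback has been compiled separately to LTO-IR, and that lto_callback_blob / lto_callback_size hold those bytes. I have not measured it yet:

// Sketch only: registering an LTO load callback, per my reading of the cuFFT docs.
// The callback would live in a separate translation unit compiled to LTO-IR
// (e.g. nvcc -dlto -fatbin), with its bytes loaded into lto_callback_blob.
cufftHandle lto_plan;
CUFFT_CHECK(cufftCreate(&lto_plan));

// Unlike the legacy API, the JIT/LTO callback is attached before the plan is
// made, so cuFFT can link it into the FFT kernels at plan-generation time.
CUFFT_CHECK(cufftXtSetJITCallback(lto_plan, lto_callback_blob, lto_callback_size,
                                  CUFFT_CB_LD_COMPLEX, nullptr));

size_t work_size = 0;
CUFFT_CHECK(cufftMakePlan1d(lto_plan, fft_length, CUFFT_C2C, batch_size, &work_size));

The full legacy-callback reproducer (a.cu) follows: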

// 11.7:
//     /usr/local/cuda-11.7/bin/nvcc -O2 -dc -o a.o a.cu
//     /usr/local/cuda-11.7/bin/nvcc -O2 -o a.out a.o -lcufft_static -lculibos
//     ./a.out
// 12.9:
//     /usr/local/cuda-12.9/bin/nvcc -O2 -dc -o a.o a.cu
//     /usr/local/cuda-12.9/bin/nvcc -O2 -o a.out a.o -lcufft_static -lculibos
//     ./a.out
#include <cufft.h>
#include <cufftXt.h>
#include <cuda_runtime.h>
#include <iostream>
#include <complex>
#include <vector>
#include <chrono>
#include <cstring>


#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " \
                      << cudaGetErrorString(err) << std::endl; \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

#define CUFFT_CHECK(call) \
    do { \
        cufftResult err = call; \
        if (err != CUFFT_SUCCESS) { \
            std::cerr << "cuFFT error at " << __FILE__ << ":" << __LINE__ << ": " \
                      << static_cast<int>(err) << std::endl; \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Legacy load callback: reinterpret the input buffer as interleaved int16 I/Q
// pairs and widen each pair to a cufftComplex as cuFFT reads it.
__device__ cufftComplex loadCallback(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer) {
    const int16_t *inputData = (const int16_t*)dataIn;

    // Each complex element at `offset` maps to two consecutive int16 values.
    size_t inputOffset = offset * 2;

    cufftComplex result{};
    result.x = static_cast<float>(inputData[inputOffset]);      // real part
    result.y = static_cast<float>(inputData[inputOffset + 1]);  // imaginary part
    return result;
}
// Device-side pointer to the callback, copied back to the host in setupCallback().
__device__ cufftCallbackLoadC d_loadCallbackPtr = loadCallback;

class CUDAFFTDemo {
private:
    const size_t fft_length = 50000;
    const size_t batch_size = 4000;
    const size_t warmup_runs = 10;
    const size_t benchmark_runs = 10;
    
    int16_t *d_input_data;      
    cufftComplex *d_fft_output; 
    cufftHandle plan;

public:
    CUDAFFTDemo() : d_input_data(nullptr), d_fft_output(nullptr), plan(0) {
        setup();
    }
    
    ~CUDAFFTDemo() {
        cleanup();
    }

private:
    void setup() {
        std::cout << "Setting up CUDA FFT demo..." << std::endl;
        std::cout << "FFT Length: " << fft_length << ", Batch: " << batch_size << std::endl;
        
        // Two int16 values (I and Q) per complex input sample.
        size_t input_size = fft_length * batch_size * 2;
        size_t output_size = fft_length * batch_size;
        
        // The extra factor of 2 makes the input allocation as large as a full
        // cufftComplex input buffer for the same plan would be.
        CUDA_CHECK(cudaMalloc(&d_input_data, input_size * sizeof(int16_t) * 2));
        CUDA_CHECK(cudaMalloc(&d_fft_output, output_size * sizeof(cufftComplex)));
        
        std::vector<int16_t> host_input(input_size);
        for (size_t i = 0; i < input_size; ++i) {
            // host_input[i] = static_cast<int16_t>(i % batch_size);
            host_input[i] = 1; // constant fill is enough for timing
        }
        CUDA_CHECK(cudaMemcpy(d_input_data, host_input.data(), 
                             input_size * sizeof(int16_t), cudaMemcpyHostToDevice));
        
        CUFFT_CHECK(cufftPlan1d(&plan, fft_length, CUFFT_C2C, batch_size));
        
        setupCallback();
        
        std::cout << "Setup completed successfully." << std::endl;
    }
    
    void setupCallback() {
        // Fetch the device-side function pointer from the __device__ symbol,
        // then register it with the plan as a legacy complex load callback.
        cufftCallbackLoadC h_loadCallback;
        CUDA_CHECK(cudaMemcpyFromSymbol(&h_loadCallback, d_loadCallbackPtr, sizeof(cufftCallbackLoadC)));
        
        CUFFT_CHECK(cufftXtSetCallback(plan, (void **)&h_loadCallback, CUFFT_CB_LD_COMPLEX, nullptr));
    }
    
    void cleanup() {
        if (d_input_data) {
            cudaFree(d_input_data);
            d_input_data = nullptr;
        }
        if (d_fft_output) {
            cudaFree(d_fft_output);
            d_fft_output = nullptr;
        }
        if (plan) {
            cufftDestroy(plan);
        }
    }

public:
    void runWarmup() {
        std::cout << "Running warmup (" << warmup_runs << " iterations)..." << std::endl;
        
        for (size_t i = 0; i < warmup_runs; ++i) {
            // The int16 buffer is passed as cufftComplex*; the load callback
            // reinterprets it as int16 data during the transform.
            CUFFT_CHECK(cufftExecC2C(plan, (cufftComplex*)d_input_data, d_fft_output, CUFFT_FORWARD));
            CUDA_CHECK(cudaDeviceSynchronize());
        }
        
        std::cout << "Warmup completed." << std::endl;
    }
    
    void runBenchmark() {
        std::cout << "Running benchmark (" << benchmark_runs << " iterations)..." << std::endl;
        
        auto start_time = std::chrono::high_resolution_clock::now();
        
        for (size_t i = 0; i < benchmark_runs; ++i) {
            CUFFT_CHECK(cufftExecC2C(plan, (cufftComplex*)d_input_data, d_fft_output, CUFFT_FORWARD));
            CUDA_CHECK(cudaDeviceSynchronize());
        }
        
        auto end_time = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
        
        double total_time_ms = duration.count();
        double avg_time_ms = total_time_ms / benchmark_runs;
        double total_samples = fft_length * batch_size * benchmark_runs;
        double throughput = total_samples / (total_time_ms / 1000.0);
        
        std::cout << "Benchmark Results:" << std::endl;
        std::cout << "  Total time: " << total_time_ms << " ms" << std::endl;
        std::cout << "  Average time per FFT: " << avg_time_ms << " ms" << std::endl;
        std::cout << "  Throughput: " << throughput / 1e6 << " million samples/second" << std::endl;
        std::cout << "  FFTs per second: " << (1000.0 / avg_time_ms) * batch_size << std::endl;
    }
    
    void verifyResults() {

        std::vector<cufftComplex> host_output(fft_length * batch_size);
        CUDA_CHECK(cudaMemcpy(host_output.data(), d_fft_output, 
                             fft_length * batch_size * sizeof(cufftComplex), 
                             cudaMemcpyDeviceToHost));
        for (size_t i = 0; i < 10; ++i) {
            printf("%d %f %f\n", (int)i, host_output[i].x, host_output[i].y);
        }
        
        size_t non_zero_count = 0;
        for (size_t i = 0; i < fft_length * batch_size; ++i) {
            if (host_output[i].x != 0.0f || host_output[i].y != 0.0f) {
                non_zero_count++;
            }
        }
        
        std::cout << "Verification: " << non_zero_count << " out of " 
                  << fft_length * batch_size << " output values are non-zero." << std::endl;
    }
};

int main() {
    try {
        int device_count;
        CUDA_CHECK(cudaGetDeviceCount(&device_count));
        if (device_count == 0) {
            std::cerr << "No CUDA devices found!" << std::endl;
            return EXIT_FAILURE;
        }
        
        cudaDeviceProp prop;
        CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
        std::cout << "Using CUDA device: " << prop.name << std::endl;
        std::cout << "Compute capability: " << prop.major << "." << prop.minor << std::endl;
        
        CUDAFFTDemo demo;
        
        demo.runWarmup();
        
        demo.runBenchmark();
        
        demo.verifyResults();
        
        std::cout << "Demo completed successfully!" << std::endl;
        
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return EXIT_FAILURE;
    }
    
    return EXIT_SUCCESS;
}