Starting with CUDA 11.8, the performance of cuFFT load callbacks has degraded significantly for me. In my application I load complex int16 data and run C2C FFTs, using a load callback to convert the int16 complex samples to float complex during the load. With the reproducer below:
- Version 11.7 took 113ms.
- Version 12.9 took 459ms.
Can the new LTO callbacks resolve this performance regression? I have included, after the reproducer below, a sketch of what I think the LTO-callback path would look like.
// 11.7:
// /usr/local/cuda-11.7/bin/nvcc -O2 -dc -o a.o a.cu
// /usr/local/cuda-11.7/bin/nvcc -O2 -o a.out a.o -lcufft_static -lculibos
// ./a.out
// 12.9:
// /usr/local/cuda-12.9/bin/nvcc -O2 -dc -o a.o a.cu
// /usr/local/cuda-12.9/bin/nvcc -O2 -o a.out a.o -lcufft_static -lculibos
// ./a.out
#include <cufft.h>
#include <cufftXt.h>
#include <cuda_runtime.h>
#include <iostream>
#include <complex>
#include <vector>
#include <chrono>
#include <cstring>
#include <cstdio>   // printf in verifyResults
#include <cstdint>  // int16_t
#define CUDA_CHECK(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " \
<< cudaGetErrorString(err) << std::endl; \
exit(EXIT_FAILURE); \
} \
} while(0)
#define CUFFT_CHECK(call) \
do { \
cufftResult err = call; \
if (err != CUFFT_SUCCESS) { \
std::cerr << "cuFFT error at " << __FILE__ << ":" << __LINE__ << ": " \
<< static_cast<int>(err) << std::endl; \
exit(EXIT_FAILURE); \
} \
} while(0)
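// Load callback: converts the pair of interleaved int16 samples (I, Q) at 'offset' into a cufftComplex.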
__device__ cufftComplex loadCallback(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer) {
int16_t *inputData = (int16_t*)dataIn;
size_t inputOffset = offset * 2;
cufftComplex result{};
result.x = static_cast<float>(inputData[inputOffset]);
result.y = static_cast<float>(inputData[inputOffset + 1]);
return result;
}
__device__ cufftCallbackLoadC d_loadCallbackPtr = loadCallback;
class CUDAFFTDemo {
private:
const size_t fft_length = 50000;
const size_t batch_size = 4000;
const size_t warmup_runs = 10;
const size_t benchmark_runs = 10;
int16_t *d_input_data;
cufftComplex *d_fft_output;
cufftHandle plan;
public:
CUDAFFTDemo() : d_input_data(nullptr), d_fft_output(nullptr), plan(0) {
setup();
}
~CUDAFFTDemo() {
cleanup();
}
private:
void setup() {
std::cout << "Setting up CUDA FFT demo..." << std::endl;
std::cout << "FFT Length: " << fft_length << ", Batch: " << batch_size << std::endl;
size_t input_size = fft_length * batch_size * 2; // two int16 values (I, Q) per complex sample
size_t output_size = fft_length * batch_size;
CUDA_CHECK(cudaMalloc(&d_input_data, input_size * sizeof(int16_t)));
CUDA_CHECK(cudaMalloc(&d_fft_output, output_size * sizeof(cufftComplex)));
std::vector<int16_t> host_input(input_size);
for (size_t i = 0; i < input_size; ++i) {
// host_input[i] = static_cast<int16_t>(i % batch_size); // alternative: varying data
host_input[i] = 1; // constant dummy data; real inputs span the full int16 range (-32768 to 32767)
}
CUDA_CHECK(cudaMemcpy(d_input_data, host_input.data(),
input_size * sizeof(int16_t), cudaMemcpyHostToDevice));
CUFFT_CHECK(cufftPlan1d(&plan, fft_length, CUFFT_C2C, batch_size));
setupCallback();
std::cout << "Setup completed successfully." << std::endl;
}
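// Legacy (non-LTO) callback setup: copy the device-side function pointer to the host and register it with the plan.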
void setupCallback() {
cufftCallbackLoadC h_loadCallback;
CUDA_CHECK(cudaMemcpyFromSymbol(&h_loadCallback, d_loadCallbackPtr, sizeof(cufftCallbackLoadC)));
CUFFT_CHECK(cufftXtSetCallback(plan, (void **)&h_loadCallback, CUFFT_CB_LD_COMPLEX, nullptr));
}
void cleanup() {
if (d_input_data) {
cudaFree(d_input_data);
d_input_data = nullptr;
}
if (d_fft_output) {
cudaFree(d_fft_output);
d_fft_output = nullptr;
}
if (plan) {
cufftDestroy(plan);
}
}
public:
void runWarmup() {
std::cout << "Running warmup (" << warmup_runs << " iterations)..." << std::endl;
for (size_t i = 0; i < warmup_runs; ++i) {
CUFFT_CHECK(cufftExecC2C(plan, (cufftComplex*)d_input_data, d_fft_output, CUFFT_FORWARD));
CUDA_CHECK(cudaDeviceSynchronize());
}
std::cout << "Warmup completed." << std::endl;
}
void runBenchmark() {
std::cout << "Running benchmark (" << benchmark_runs << " iterations)..." << std::endl;
auto start_time = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < benchmark_runs; ++i) {
CUFFT_CHECK(cufftExecC2C(plan, (cufftComplex*)d_input_data, d_fft_output, CUFFT_FORWARD));
CUDA_CHECK(cudaDeviceSynchronize());
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
double total_time_ms = duration.count();
double avg_time_ms = total_time_ms / benchmark_runs;
double total_samples = fft_length * batch_size * benchmark_runs;
double throughput = total_samples / (total_time_ms / 1000.0);
std::cout << "Benchmark Results:" << std::endl;
std::cout << " Total time: " << total_time_ms << " ms" << std::endl;
std::cout << " Average time per FFT: " << avg_time_ms << " ms" << std::endl;
std::cout << " Throughput: " << throughput / 1e6 << " million samples/second" << std::endl;
std::cout << " FFTs per second: " << (1000.0 / avg_time_ms) * batch_size << std::endl;
}
void verifyResults() {
std::vector<cufftComplex> host_output(fft_length * batch_size);
CUDA_CHECK(cudaMemcpy(host_output.data(), d_fft_output,
fft_length * batch_size * sizeof(cufftComplex),
cudaMemcpyDeviceToHost));
for (size_t i = 0; i < 10; ++i) {
printf("%d %f %f\n", (int)i, host_output[i].x, host_output[i].y);
}
size_t non_zero_count = 0;
for (size_t i = 0; i < fft_length * batch_size; ++i) {
if (host_output[i].x != 0.0f || host_output[i].y != 0.0f) {
non_zero_count++;
}
}
std::cout << "Verification: " << non_zero_count << " out of "
<< fft_length * batch_size << " output values are non-zero." << std::endl;
}
};
int main() {
try {
int device_count;
CUDA_CHECK(cudaGetDeviceCount(&device_count));
if (device_count == 0) {
std::cerr << "No CUDA devices found!" << std::endl;
return EXIT_FAILURE;
}
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
std::cout << "Using CUDA device: " << prop.name << std::endl;
std::cout << "Compute capability: " << prop.major << "." << prop.minor << std::endl;
CUDAFFTDemo demo;
demo.runWarmup();
demo.runBenchmark();
demo.verifyResults();
std::cout << "Demo completed successfully!" << std::endl;
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
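For reference, here is roughly the LTO-callback variant I would try, based on my reading of the cuFFT LTO callback documentation. This is an untested sketch: I am assuming the cufftXtSetJITCallback signature, that the callback is compiled separately to LTO-IR (the nvcc flags below are my guess), and that it must be registered after cufftCreate but before cufftMakePlan1d.

// callback_lto.cu -- compiled separately to LTO-IR, e.g. (flags are my assumption):
//   nvcc -dc -fatbin -gencode arch=compute_86,code=lto_86 -o callback_lto.fatbin callback_lto.cu
#include <cufft.h>
#include <cufftXt.h>
#include <cstdint>

// Same int16 -> float conversion as the classic load callback in the reproducer.
// extern "C" keeps the symbol name unmangled for cufftXtSetJITCallback (not sure if that is required).
extern "C" __device__ cufftComplex loadCallbackLTO(void *dataIn, size_t offset,
                                                   void *callerInfo, void *sharedPointer) {
    const int16_t *in = static_cast<const int16_t *>(dataIn);
    cufftComplex out;
    out.x = static_cast<float>(in[2 * offset]);
    out.y = static_cast<float>(in[2 * offset + 1]);
    return out;
}

On the host side, instead of cufftPlan1d + cufftXtSetCallback, my understanding is that the plan would be built like this (fatbin holds the bytes of callback_lto.fatbin, loaded from disk by whatever means; CUFFT_CHECK is the macro from the reproducer):

#include <vector>
// Sketch: attach the LTO-IR callback between cufftCreate and cufftMakePlan1d.
cufftHandle makeLtoPlan(const std::vector<char> &fatbin, int fft_length, int batch_size) {
    cufftHandle plan;
    CUFFT_CHECK(cufftCreate(&plan));
    CUFFT_CHECK(cufftXtSetJITCallback(plan, "loadCallbackLTO",
                                      fatbin.data(), fatbin.size(),
                                      CUFFT_CB_LD_COMPLEX, nullptr));
    size_t workSize = 0;
    CUFFT_CHECK(cufftMakePlan1d(plan, fft_length, CUFFT_C2C, batch_size, &workSize));
    return plan;
}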