cuFFT 函数 cufftPlanMany 运行时报错(错误码 5,CUFFT_INTERNAL_ERROR)
我的代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <time.h>
// CUDA runtime error-check macro.
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// file, line, numeric code, cudaGetErrorString() text and the literal call
// text to stderr, then terminates the process with EXIT_FAILURE.
// The argument is parenthesized and the temporary uses a reserved-looking
// name so that an expression such as CUDA_CHECK(err) cannot collide with it.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_err_ = (call); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA错误在 %s:%d 代码=%d(%s) \"%s\"\n", \
                    __FILE__, __LINE__, (int)cuda_check_err_, \
                    cudaGetErrorString(cuda_check_err_), #call); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// cuFFT error-check macro.
// Evaluates `call` exactly once. If the result is not CUFFT_SUCCESS, prints
// the file, line, numeric cufftResult code and the literal call text to
// stderr, then terminates the process with EXIT_FAILURE.
// The call text is included (as CUDA_CHECK does) so a failure can be traced
// to the exact cuFFT API call without counting source lines.
// The argument is parenthesized and the temporary uses a reserved-looking
// name so that an expression such as CUFFT_CHECK(err) cannot collide with it.
#define CUFFT_CHECK(call) \
    do { \
        cufftResult cufft_check_err_ = (call); \
        if (cufft_check_err_ != CUFFT_SUCCESS) { \
            fprintf(stderr, "cuFFT错误在 %s:%d 代码=%d \"%s\"\n", \
                    __FILE__, __LINE__, (int)cufft_check_err_, #call); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// STFT parameters
const int fft_size = 8192; // Fourier transform length
const int hop_size = 2048; // hop size
const int signal_length = 1000000; // signal length; can be adjusted to fit GPU memory
// Build a random real-valued test signal on the host.
// Allocates `length` floats with malloc and fills each one with
// rand() / RAND_MAX, i.e. a value in [0, 1]. If the allocation fails an
// error is written to stderr and the process exits.
// Returns the malloc'ed buffer.
float* generate_real_signal(int length) {
    float* samples = (float*)malloc(length * sizeof(float));
    if (samples == NULL) {
        fprintf(stderr, "主机内存分配失败\n");
        exit(EXIT_FAILURE);
    }
    int idx = 0;
    while (idx < length) {
        // One rand() draw per sample, scaled into [0, 1].
        samples[idx] = (float)rand() / RAND_MAX;
        ++idx;
    }
    return samples;
}
// Build a random complex-valued test signal on the host.
// Allocates `length` cufftComplex elements with malloc; for each element the
// real part (.x) is drawn first and the imaginary part (.y) second, both as
// rand() / RAND_MAX. If the allocation fails an error is written to stderr
// and the process exits.
// Returns the malloc'ed buffer.
cufftComplex* generate_complex_signal(int length) {
    cufftComplex* samples = (cufftComplex*)malloc(length * sizeof(cufftComplex));
    if (samples == NULL) {
        fprintf(stderr, "主机内存分配失败\n");
        exit(EXIT_FAILURE);
    }
    int idx = 0;
    while (idx < length) {
        cufftComplex* cur = &samples[idx];
        cur->x = (float)rand() / RAND_MAX; // real part
        cur->y = (float)rand() / RAND_MAX; // imaginary part
        ++idx;
    }
    return samples;
}
// Real-input STFT benchmark.
//
// Copies `h_signal` to the GPU and executes a batched 1-D real-to-complex
// cuFFT plan over frames of `fft_size` samples whose starts are `hop_size`
// samples apart. The host-to-device copy and the transform are repeated
// `n_runs` times and timed separately with CUDA events; run 0 is a warm-up
// and is excluded from the statistics printed at the end.
//
// Parameters:
//   h_signal       host buffer; signal_length floats are copied from it
//   signal_length  number of samples in h_signal (must be >= fft_size)
//   fft_size       transform length per frame (must be > 0)
//   hop_size       distance in samples between frame starts (must be > 0)
//   n_runs         number of iterations including the warm-up (must be > 0)
//
// Invalid arguments are reported on stderr and the function returns without
// allocating anything. CUDA / cuFFT failures terminate the process through
// CUDA_CHECK / CUFFT_CHECK.
void real_stft(float* h_signal, int signal_length, int fft_size, int hop_size, int n_runs) {
    // Guard against a zero divisor (hop_size) and a non-positive segment count.
    if (h_signal == NULL || fft_size <= 0 || hop_size <= 0 || n_runs <= 0 ||
        signal_length < fft_size) {
        fprintf(stderr,
                "实数STFT参数无效: signal_length=%d fft_size=%d hop_size=%d n_runs=%d\n",
                signal_length, fft_size, hop_size, n_runs);
        return;
    }

    // Batch size = number of frames that fit entirely inside the signal.
    const int num_segments = (signal_length - fft_size) / hop_size + 1;
    printf("实数STFT - 分段数: %d\n", num_segments);

    // An R2C transform of length fft_size writes fft_size / 2 + 1 complex bins.
    const int bins_per_segment = fft_size / 2 + 1;

    // Byte counts are computed in size_t so large signals cannot overflow int.
    const size_t signal_bytes = (size_t)signal_length * sizeof(float);
    const size_t spectrum_bytes =
        (size_t)num_segments * (size_t)bins_per_segment * sizeof(cufftComplex);

    // Device input and output buffers.
    float* d_signal = NULL;
    cufftComplex* d_spectrum = NULL;
    CUDA_CHECK(cudaMalloc(&d_signal, signal_bytes));
    CUDA_CHECK(cudaMalloc(&d_spectrum, spectrum_bytes));

    // Batched plan through cufftPlanMany (advanced data layout).
    // NOTE(review): idist is hop_size, so when hop_size < fft_size consecutive
    // input frames overlap inside d_signal. If plan creation fails with code 5
    // (CUFFT_INTERNAL_ERROR), first confirm that the cuFFT library actually
    // linked supports this GPU / driver (cufftGetVersion), then try copying
    // the frames into a contiguous buffer (idist == fft_size) to isolate
    // whether the overlapping layout is the cause — TODO confirm.
    cufftHandle plan;
    int rank = 1;                        // 1-D FFT
    int n[1] = {fft_size};               // transform length
    int inembed[1] = {fft_size};         // input storage dimension
    int onembed[1] = {bins_per_segment}; // output storage dimension (R2C)
    int istride = 1;                     // distance between input elements
    int ostride = 1;                     // distance between output elements
    int idist = hop_size;                // distance between input frame starts
    int odist = bins_per_segment;        // distance between output frame starts
    CUFFT_CHECK(cufftPlanMany(&plan, rank, n, inembed, istride, idist,
                              onembed, ostride, odist, CUFFT_R2C, num_segments));

    // CUDA events used as timers.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Accumulated timings in milliseconds (warm-up run excluded).
    float min_time = 1e9f;
    float max_time = 0.0f;
    float htod_time = 0.0f;
    float compute_time = 0.0f;

    for (int run = 0; run < n_runs; run++) {
        float elapsed;

        // Time the host-to-device copy.
        CUDA_CHECK(cudaEventRecord(start, 0));
        CUDA_CHECK(cudaMemcpy(d_signal, h_signal, signal_bytes, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaEventRecord(stop, 0));
        CUDA_CHECK(cudaEventSynchronize(stop));
        CUDA_CHECK(cudaEventElapsedTime(&elapsed, start, stop));
        if (run > 0) { // statistics start from the second run
            htod_time += elapsed;
        }

        // Time the batched real-to-complex transform.
        CUDA_CHECK(cudaEventRecord(start, 0));
        CUFFT_CHECK(cufftExecR2C(plan, d_signal, d_spectrum));
        CUDA_CHECK(cudaEventRecord(stop, 0));
        CUDA_CHECK(cudaEventSynchronize(stop));
        CUDA_CHECK(cudaEventElapsedTime(&elapsed, start, stop));
        if (run > 0) { // statistics start from the second run
            compute_time += elapsed;
            if (elapsed < min_time) min_time = elapsed;
            if (elapsed > max_time) max_time = elapsed;
            printf("实数STFT 第%d次运行时间: %.2f ms\n", run, elapsed);
        } else {
            printf("实数STFT 第%d次运行时间: %.2f ms (预热,不计入统计)\n", run, elapsed);
        }
    }

    // Release GPU resources.
    CUFFT_CHECK(cufftDestroy(plan));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_signal));
    CUDA_CHECK(cudaFree(d_spectrum));

    // With n_runs == 1 only the warm-up ran: report zeros instead of dividing
    // by zero (NaN average) and printing the 1e9 sentinel as the minimum.
    const int measured_runs = n_runs - 1;
    const float avg_time = (measured_runs > 0) ? compute_time / measured_runs : 0.0f;
    if (measured_runs == 0) {
        min_time = 0.0f;
    }

    // Summary report.
    printf("\n实数STFT统计报告:\n");
    printf("运行次数: %10d\n", measured_runs);
    printf("算法名称: %10s\n", "Real STFT");
    printf("总计算时间: %10.2f ms\n", compute_time);
    printf("平均计算时间: %8.2f ms\n", avg_time);
    printf("最大计算时间: %8.2f ms\n", max_time);
    printf("最小计算时间: %8.2f ms\n", min_time);
    printf("htod拷贝时间: %8.2f ms\n", htod_time);
}
// Complex-input STFT benchmark.
//
// Copies `h_signal` to the GPU and executes a batched 1-D complex-to-complex
// forward cuFFT plan over frames of `fft_size` samples whose starts are
// `hop_size` samples apart. The host-to-device copy and the transform are
// repeated `n_runs` times and timed separately with CUDA events; run 0 is a
// warm-up and is excluded from the statistics printed at the end.
//
// Parameters:
//   h_signal       host buffer; signal_length cufftComplex values are copied
//   signal_length  number of samples in h_signal (must be >= fft_size)
//   fft_size       transform length per frame (must be > 0)
//   hop_size       distance in samples between frame starts (must be > 0)
//   n_runs         number of iterations including the warm-up (must be > 0)
//
// Invalid arguments are reported on stderr and the function returns without
// allocating anything. CUDA / cuFFT failures terminate the process through
// CUDA_CHECK / CUFFT_CHECK.
void complex_stft(cufftComplex* h_signal, int signal_length, int fft_size, int hop_size, int n_runs) {
    // Guard against a zero divisor (hop_size) and a non-positive segment count.
    if (h_signal == NULL || fft_size <= 0 || hop_size <= 0 || n_runs <= 0 ||
        signal_length < fft_size) {
        fprintf(stderr,
                "复数STFT参数无效: signal_length=%d fft_size=%d hop_size=%d n_runs=%d\n",
                signal_length, fft_size, hop_size, n_runs);
        return;
    }

    // Batch size = number of frames that fit entirely inside the signal.
    const int num_segments = (signal_length - fft_size) / hop_size + 1;
    printf("复数STFT - 分段数: %d\n", num_segments);

    // Byte counts are computed in size_t so large signals cannot overflow int.
    const size_t signal_bytes = (size_t)signal_length * sizeof(cufftComplex);
    const size_t spectrum_bytes =
        (size_t)num_segments * (size_t)fft_size * sizeof(cufftComplex);

    // Device input and output buffers.
    cufftComplex* d_signal = NULL;
    cufftComplex* d_spectrum = NULL;
    CUDA_CHECK(cudaMalloc(&d_signal, signal_bytes));
    CUDA_CHECK(cudaMalloc(&d_spectrum, spectrum_bytes));

    // Batched plan through cufftPlanMany (advanced data layout).
    // NOTE(review): idist is hop_size, so when hop_size < fft_size consecutive
    // input frames overlap inside d_signal. If plan creation fails with code 5
    // (CUFFT_INTERNAL_ERROR), first confirm that the cuFFT library actually
    // linked supports this GPU / driver (cufftGetVersion), then try copying
    // the frames into a contiguous buffer (idist == fft_size) to isolate
    // whether the overlapping layout is the cause — TODO confirm.
    cufftHandle plan;
    int rank = 1;                // 1-D FFT
    int n[1] = {fft_size};       // transform length
    int inembed[1] = {fft_size}; // input storage dimension
    int onembed[1] = {fft_size}; // output storage dimension (C2C)
    int istride = 1;             // distance between input elements
    int ostride = 1;             // distance between output elements
    int idist = hop_size;        // distance between input frame starts
    int odist = fft_size;        // distance between output frame starts
    CUFFT_CHECK(cufftPlanMany(&plan, rank, n, inembed, istride, idist,
                              onembed, ostride, odist, CUFFT_C2C, num_segments));

    // CUDA events used as timers.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Accumulated timings in milliseconds (warm-up run excluded).
    float min_time = 1e9f;
    float max_time = 0.0f;
    float htod_time = 0.0f;
    float compute_time = 0.0f;

    for (int run = 0; run < n_runs; run++) {
        float elapsed;

        // Time the host-to-device copy.
        CUDA_CHECK(cudaEventRecord(start, 0));
        CUDA_CHECK(cudaMemcpy(d_signal, h_signal, signal_bytes, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaEventRecord(stop, 0));
        CUDA_CHECK(cudaEventSynchronize(stop));
        CUDA_CHECK(cudaEventElapsedTime(&elapsed, start, stop));
        if (run > 0) { // statistics start from the second run
            htod_time += elapsed;
        }

        // Time the batched complex-to-complex forward transform.
        CUDA_CHECK(cudaEventRecord(start, 0));
        CUFFT_CHECK(cufftExecC2C(plan, d_signal, d_spectrum, CUFFT_FORWARD));
        CUDA_CHECK(cudaEventRecord(stop, 0));
        CUDA_CHECK(cudaEventSynchronize(stop));
        CUDA_CHECK(cudaEventElapsedTime(&elapsed, start, stop));
        if (run > 0) { // statistics start from the second run
            compute_time += elapsed;
            if (elapsed < min_time) min_time = elapsed;
            if (elapsed > max_time) max_time = elapsed;
            printf("复数STFT 第%d次运行时间: %.2f ms\n", run, elapsed);
        } else {
            printf("复数STFT 第%d次运行时间: %.2f ms (预热,不计入统计)\n", run, elapsed);
        }
    }

    // Release GPU resources.
    CUFFT_CHECK(cufftDestroy(plan));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_signal));
    CUDA_CHECK(cudaFree(d_spectrum));

    // With n_runs == 1 only the warm-up ran: report zeros instead of dividing
    // by zero (NaN average) and printing the 1e9 sentinel as the minimum.
    const int measured_runs = n_runs - 1;
    const float avg_time = (measured_runs > 0) ? compute_time / measured_runs : 0.0f;
    if (measured_runs == 0) {
        min_time = 0.0f;
    }

    // Summary report.
    printf("\n复数STFT统计报告:\n");
    printf("运行次数: %10d\n", measured_runs);
    printf("算法名称: %10s\n", "Complex STFT");
    printf("总计算时间: %10.2f ms\n", compute_time);
    printf("平均计算时间: %8.2f ms\n", avg_time);
    printf("最大计算时间: %8.2f ms\n", max_time);
    printf("最小计算时间: %8.2f ms\n", min_time);
    printf("htod拷贝时间: %8.2f ms\n", htod_time);
}
// Program entry point.
// Usage: <program> [n_runs]   (n_runs defaults to 10 and must be > 0)
// Selects GPU 0, prints the CUDA runtime / driver / cuFFT versions, generates
// one random real and one random complex host signal of `signal_length`
// samples, and runs the real and complex STFT benchmarks on them.
int main(int argc, char* argv[]) {
    // Seed the host RNG used by the signal generators.
    srand((unsigned int)time(NULL));

    // Device setup; failures are reported instead of being silently ignored.
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaDeviceReset());

    // Print the versions actually in use. nvidia-smi only reports the highest
    // CUDA version the driver supports, not the toolkit / cuFFT the binary was
    // linked against, so this is the first thing to check when cuFFT fails.
    int runtime_version = 0;
    int driver_version = 0;
    int cufft_version = 0;
    CUDA_CHECK(cudaRuntimeGetVersion(&runtime_version));
    CUDA_CHECK(cudaDriverGetVersion(&driver_version));
    CUFFT_CHECK(cufftGetVersion(&cufft_version));
    printf("CUDA运行时版本: %d, 驱动支持的CUDA版本: %d, cuFFT版本: %d\n",
           runtime_version, driver_version, cufft_version);

    // Number of runs: first command-line argument, default 10.
    int n_runs = 10;
    if (argc > 1) {
        n_runs = atoi(argv[1]);
        if (n_runs <= 0) {
            fprintf(stderr, "运行次数必须大于0\n");
            return EXIT_FAILURE;
        }
    }
    printf("运行次数设置为: %d\n", n_runs);

    // Host test signals. Both generators terminate the process themselves if
    // allocation fails, so no NULL check is needed here.
    float* h_real_signal = generate_real_signal(signal_length);
    cufftComplex* h_complex_signal = generate_complex_signal(signal_length);

    // Real-input benchmark, then complex-input benchmark.
    real_stft(h_real_signal, signal_length, fft_size, hop_size, n_runs);
    complex_stft(h_complex_signal, signal_length, fft_size, hop_size, n_runs);

    // Release host memory.
    free(h_real_signal);
    free(h_complex_signal);
    return 0;
}
我的报错信息如下:
运行次数设置为: 10
实数STFT - 分段数: 485
cuFFT错误在 stft.cu:83 代码=5
我已经反复检查过代码,仍然找不到原因。(cufftResult 代码 5 对应 CUFFT_INTERNAL_ERROR。)
我使用的显卡信息(nvidia-smi 输出)如下。注意:其中的 "CUDA Version" 是驱动支持的最高 CUDA 版本,不代表实际安装的 CUDA Toolkit / cuFFT 版本:
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.07 Driver Version: 570.133.07 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 4060 Ti Off | 00000000:01:00.0 Off | N/A |
| 36% 44C P8 11W / 165W | 83MiB / 16380MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 1121 G /usr/lib/xorg/Xorg 56MiB |
| 0 N/A N/A 1253 G /usr/bin/gnome-shell 6MiB |
+-----------------------------------------------------------------------------------------+
请问这是什么原因导致的?