CUFFT Issues

Hey everyone,

I’m having some problems getting the CUFFT library to do what I want. Basically, I have 1024 separate signals, each with 1024 points, that I want to run 1D FFTs on. It seems like batching would be the best way to implement this, but I have found the documentation on batching a little thin…

As of now, to my understanding, I can run 64 1D FFTs at the same time using Batching, but get a Runtime API error when I increase the Batch to 128, and a CUFFT error when I try to set the Batch size to 256, 512 or 1024.

Anyone have any idea why I am getting this behavior in errors? Or how I can write some working code to perform the FFTs I want? Any help is greatly appreciated! I’ll provide my code below. Thanks!
Mike


/* Example showing the use of CUFFT for batched 1D FFTs. */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cufft.h>
#include <cutil_inline.h>

// Complex data type
typedef float2 Complex;

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char** argv);

// define Signal and Batch sizes
#define SIGNAL_SIZE 1024
#define BATCH 128

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    // Run the FFT test, then pass the command line on to the cutil exit helper.
    runTest(argc, argv);
    cutilExit(argc, argv);

    // Explicit success status; identical to the implicit return from main.
    return 0;
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////

void runTest(int argc, char** argv)
{
// Selects a CUDA device, fills a host buffer with SIGNAL_SIZE * BATCH complex
// samples, copies it to the GPU, runs a forward complex-to-complex CUFFT on it,
// and copies the transformed data back to the host.

// Use the device picked on the command line if the "device" flag is present;
// otherwise fall back to the device cutil reports as having the highest GFLOPS.
if( cutCheckCmdLineFlag(argc, (const char**)argv, “device”) )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );

// Size in BYTES of one full buffer (all BATCH signals of SIGNAL_SIZE points).
int mem_size = sizeof(Complex) * SIGNAL_SIZE * BATCH;

// Allocate host memory for the signal
Complex* h_signal = (Complex*)malloc(mem_size);
Complex* h_transformed_signal = (Complex*)malloc(mem_size);
printf(“Host memory allocation: DONE \n”);

// Initialize the memory for the signal: real part is the running sample
// index across the whole buffer, imaginary part is zero.
for (unsigned int i = 0; i < (SIGNAL_SIZE * BATCH); ++i) {
h_signal[i].x = (float)i;
h_signal[i].y = 0;
}
printf(“Host memory initialization: DONE \n”);

// Allocate device memory for signal
Complex* d_signal;
Complex* d_transformed_signal;
cutilSafeCall(cudaMalloc((void**)&d_signal, mem_size));
cutilSafeCall(cudaMalloc((void**)&d_transformed_signal, mem_size));
printf(“CUDA memory allocation: DONE \n”);

// Copy host memory to device
cutilSafeCall(cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice));
printf(“Host → GPU memory copy: DONE \n”);

// CUFFT plan
// NOTE(review): the second argument of cufftPlan1d is the length of ONE
// transform in points, but mem_size is the byte size of the whole batch.
// With BATCH transforms of that length the plan addresses far more data
// than was allocated; SIGNAL_SIZE looks like the intended value here.
cufftHandle plan;
cufftSafeCall(cufftPlan1d(&plan, mem_size, CUFFT_C2C, BATCH));

// Transform signal (out-of-place, forward direction)
cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_transformed_signal, CUFFT_FORWARD));
printf(“FFT Complete \n”);

// Copy device memory to host
cutilSafeCall(cudaMemcpy(h_transformed_signal, d_transformed_signal, mem_size,
cudaMemcpyDeviceToHost));
printf(“GPU → Host memory copy: DONE \n\n”);

From your code:

// CUFFT plan
cufftHandle plan;
cufftSafeCall(cufftPlan1d(&plan, mem_size, CUFFT_C2C, BATCH));

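You are using mem_size as the FFT size. mem_size is the size in bytes of the whole buffer, whereas the second argument of cufftPlan1d is the number of points in a single transform. I think what you want is: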
cufftSafeCall(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, BATCH));

Wow. Yep. That was it for sure. Nice catch, man! Thanks A TON! Really appreciate it!

Mike