We have been using CUFFT on the Tesla C1060. When we ran the same test program on the Tesla C2050 we expected better performance, but instead we found it to be almost half the speed. We are running a large number of small FFTs, i.e. 1,000,000 CUFFT transforms of size 32x32.
This is the message I am getting on C1060 (Red Hat 5.2, CUDA 2.2):
running fft on 1000000 chips of size=32x32… OK (18506 msec)
This is the message I am getting on C2050 (Red Hat 5.4, CUDA 3.1):
running fft on 1000000 chips of size=32x32… OK (32505 msec)
I have included example code that demonstrates the problem. Are we doing something wrong? Is there something in the Makefile with respect to compiler flags?
Thanks in Advance
Sample Code:
#include <stdio.h>
#include <sys/time.h>

#include <cuda_runtime.h>
#include <cufft.h>
// Build-time flag; it is not referenced anywhere in the code shown here.
#define FERMI 1
// Exit codes returned by main(): OK on success, ERROR on any failure.
#define OK 0
#define ERROR -1
int main(void)
size_t i;
cufftHandle fft;
float* src;
cufftComplex* dst;
const size_t dim = 32;
const size_t size = dim * dim;
const size_t max = 1000000;
timeval timer[2];
// create fft plan
if (cufftPlan2d(&fft, dim, dim, CUFFT_R2C) != CUFFT_SUCCESS) {
fprintf(stderr, “unable to create fft plan\n”);
return ERROR;
if (cufftSetCompatibilityMode(fft, CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS) {
fprintf(stderr, “unable to set fft plan to native mode\n”);
return ERROR;
// allocate input chip
if (cudaMalloc(reinterpret_cast<void**>(&src), size * sizeof(float)) != cudaSuccess) {
fprintf(stderr, “unable to allocate input chip\n”);
return ERROR;
// allocate output chip
if (cudaMalloc(reinterpret_cast<void**>(&dst), size * sizeof(cufftComplex)) != cudaSuccess) {
fprintf(stderr, “unable to allocate output chip\n”);
return ERROR;
fprintf(stderr, “running fft on %zu chips of size=%zux%zu…”, max, dim, dim);
// start timer
gettimeofday(&timer[0], NULL);
// execute real->complex fft plan
for (i = 0; i < max; i++) {
if (cufftExecR2C(fft, src, dst) != CUFFT_SUCCESS) {
fprintf(stderr, " FAIL\nunable to execute real->complex fft plan\n");
return ERROR;
// synchronize cuda threads
if (cudaThreadSynchronize() != cudaSuccess) {
fprintf(stderr, " FAIL\nunable to synchronize cuda threads\n");
return ERROR;
// stop timer
gettimeofday(&timer[1], NULL);
if (timer[1].tv_usec < timer[0].tv_usec) {
timer[1].tv_usec += 1000000;
timer[1].tv_sec -= timer[0].tv_sec;
timer[1].tv_usec -= timer[0].tv_usec;
fprintf(stderr, " OK (%lu msec)\n", timer[1].tv_sec * 1000 + timer[1].tv_usec / 1000);
// deallocate input chip
if (cudaFree(src) != cudaSuccess) {
fprintf(stderr, “unable to deallocate input chip\n”);
return ERROR;
// deallocate output chip
if (cudaFree(dst) != cudaSuccess) {
fprintf(stderr, “unable to deallocate output chip\n”);
return ERROR;
// destroy fft plan
if (cufftDestroy(fft) != CUFFT_SUCCESS) {
fprintf(stderr, “unable to destroy fft plan\n”);
return ERROR;
fprintf(stderr, “OK\n”);
return OK;
The Following Makefile:
# Builds the cuFFT benchmark with the host compiler and links it against the
# CUDA runtime and cuFFT libraries from /usr/local/cuda.
.PHONY: clean

# Libraries are listed after the source file so that the linker resolves the
# symbols the object actually references.
fft-test: fft-test.cpp
	g++ -o $@ -O2 -fpic -fPIC -pipe -DNDEBUG -DNO_BLAS -I/usr/local/cuda/include $< -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -lcufft -lcudart

# Removes the built binary; kept as its own target so a build does not delete
# its own output.
clean:
	rm -f fft-test