FFT Timing

I’m timing the real to complex and complex to complex forward CUFFT and the complex to complex is extremely faster than the real to complex. Is it ok to use gettimeofday before and after the cufft call to get the execution time? Using this method the complex to complex is 10 to 70 times faster for FFT sizes of 512 -> 8192 elements with batch sizes of 500. Does anyone know why the huge discrepancy between the two? I saw an application note on how the real to complex is primarily for convenience and that the performance should equal or be less than the complex to complex but 70 times slower seems extreme?

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <math.h>

#include <sys/time.h>

#include <cufft.h>

#include <cutil_inline.h>

#include <cuda.h>

typedef float2 Complex;

int main(int argc, char **argv)

{

/* Parse command line args */

if (argc < 3) {

	printf("gpuBench <iterations> <num_elements> <fft_size>\n");

	printf("Example: gpuBench 1000 1000 1024\n");

}

int iterations 		= atoi(argv[1]);

int num_elements 	= atoi(argv[2]);

int fft_size	 		= atoi(argv[3]);

int mem_size_C = num_elements * fft_size * sizeof(Complex);

int mem_size_R = num_elements * fft_size * sizeof(float);

/* Set device */

cudaSetDevice(cutGetMaxGflopsDeviceId());

/* Define some variables */

timeval start_time, stop_time;

double *execute_time = (double *) malloc(3 * 2 * sizeof(double));

/* Complex in/out data */

Complex *host_in_data_C = (Complex *) malloc(mem_size_C);

Complex *host_out_data_C = (Complex *) malloc(mem_size_C);

Complex *device_in_data_C;

Complex *device_out_data_C = (Complex *) malloc(mem_size_C);

/* Real in/out data */

float *host_in_data_R = (float *) malloc(mem_size_R);

Complex *host_out_data_CR = (Complex *) malloc(mem_size_C);



float *device_in_data_R;

Complex *device_out_data_CR = (Complex *) malloc(mem_size_C);

/* Load the data */

for (int j = 0; j < num_elements; j++) {

	for (int i = 0; i < fft_size; i++) {

		host_in_data_C[fft_size * j + i].x = i;

		host_in_data_C[fft_size * j + i].y = 0;

		

		host_in_data_R[fft_size * j + i] = i;

	}

}                                         

/* Allocate devive data */

for (int i = 0; i < iterations; i++) {

cutilSafeCall(cudaMalloc((void **) &device_in_data_C, mem_size_C));

cutilSafeCall(cudaMalloc((void **) &device_in_data_R, mem_size_R));

cutilSafeCall(cudaMalloc((void **) &device_out_data_C, mem_size_C));

cutilSafeCall(cudaMalloc((void **) &device_out_data_CR, mem_size_C));

/* Copy over the host data to the device */

/* Copy Complex data */

gettimeofday(&start_time, NULL);

cutilSafeCall(cudaMemcpy(device_in_data_C, host_in_data_C,

												 mem_size_C, cudaMemcpyHostToDevice));

gettimeofday(&stop_time, NULL);

execute_time[0] += (stop_time.tv_sec-start_time.tv_sec) +

  						 (stop_time.tv_usec-start_time.tv_usec) / 1.e6;

/* Copy Real data */

gettimeofday(&start_time, NULL);

cutilSafeCall(cudaMemcpy(device_in_data_R, host_in_data_R, 

												 mem_size_R, cudaMemcpyHostToDevice));

gettimeofday(&stop_time, NULL);

execute_time[3] += (stop_time.tv_sec-start_time.tv_sec) +

  						 (stop_time.tv_usec-start_time.tv_usec) / 1.e6;

/* Set up and compute the FFT */

cufftHandle	fft_plan_C2C, fft_plan_R2C;

cufftSafeCall(cufftPlan1d(&fft_plan_C2C, fft_size, CUFFT_C2C, num_elements));

cufftSafeCall(cufftPlan1d(&fft_plan_R2C, fft_size, CUFFT_R2C, num_elements));

/* Complex 2 Complex FFT */

gettimeofday(&start_time, NULL);

cufftSafeCall(cufftExecC2C(fft_plan_C2C,

													 (cufftComplex *) device_in_data_C,

													 (cufftComplex *) device_out_data_C,

													 CUFFT_FORWARD));

gettimeofday(&stop_time, NULL);

execute_time[1] += (stop_time.tv_sec-start_time.tv_sec) +

  						 (stop_time.tv_usec-start_time.tv_usec) / 1.e6;



/* Real 2 Complex FFT */

gettimeofday(&start_time, NULL);

cufftSafeCall(cufftExecR2C(fft_plan_R2C,

													 (cufftReal *) device_in_data_R,

													 (cufftComplex *) device_out_data_CR));

gettimeofday(&stop_time, NULL);

execute_time[4] += (stop_time.tv_sec-start_time.tv_sec) +

  						 (stop_time.tv_usec-start_time.tv_usec) / 1.e6;

/* Copy device data to host */

gettimeofday(&start_time, NULL);

cutilSafeCall(cudaMemcpy(host_out_data_C, device_out_data_C, 

												 mem_size_C, cudaMemcpyDeviceToHost));

gettimeofday(&stop_time, NULL);

execute_time[2] += (stop_time.tv_sec-start_time.tv_sec) +

  						 (stop_time.tv_usec-start_time.tv_usec) / 1.e6;



gettimeofday(&start_time, NULL);

cutilSafeCall(cudaMemcpy(host_out_data_CR, device_out_data_CR, 

												 mem_size_C, cudaMemcpyDeviceToHost));

gettimeofday(&stop_time, NULL);

execute_time[5] += (stop_time.tv_sec-start_time.tv_sec) +

  						 (stop_time.tv_usec-start_time.tv_usec) / 1.e6;

cufftSafeCall(cufftDestroy(fft_plan_C2C));

cufftSafeCall(cufftDestroy(fft_plan_R2C));

cutilSafeCall(cudaFree(device_in_data_C));

cutilSafeCall(cudaFree(device_in_data_R));

cutilSafeCall(cudaFree(device_out_data_C));

cutilSafeCall(cudaFree(device_out_data_CR));

}



printf("%d\tCopy_H2D\tFFT\t\tCopy_D2H\n", fft_size);

printf("C2C\t%f\t%f\t%f\n", execute_time[0] / iterations,

													 execute_time[1] / iterations,

													 execute_time[2] / iterations);



printf("R2C\t%f\t%f\t%f\n", execute_time[3] / iterations,

													 execute_time[4] / iterations,

													 execute_time[5] / iterations);

cudaThreadExit();

/* for (int i = 0; i < floor(fft_size/2); i++) {

	printf("%f, %f, %f, %f\n", host_out_data_CR[i].x, host_out_data_CR[i].y,

															host_out_data_C[i].x, host_out_data_C[i].y);

}*/

// cutilExit(argc, argv);

}[/codebox]

You need to insert cudaThreadSynchronize before and after you start timing if you use gettimeofday or any CPU clock because kernel calls are asynchronous. The ideal method is to use the cudaEventRecord & friends, but I’ve resisted since they’re a bit cumbersome.

I encountered the same anomally as you earlier. This was because for CUDA 2.3, the R2C and C2R transforms waits for the transform to finish before returning (probably waits for C2C transform to finish before doing some conversion), while the C2C version doesn’t.

From my experience, the 2D FFTs are 2.5x slower than C2C and 1D FFTs are 7.5x slower. I’ve been looking into implementing a specialized 2D FFT for reals, so if you know anything about it, tell me.