Hi, I am new to CUDA and I am trying to perform an FFT on my GPU using the cuFFT library. The problem is that when I run the compiled code for the first time it takes around 500 us, but if I run it again immediately, without waiting, it takes around 175 us (I am running the .exe from cmd). Whenever I pause between runs, it again takes 500 us to compute the FFT. For different values of NX (the NX-point DFT size) I get almost the same timing. I am measuring the time from the start of the FFT to its end, not including the data-copy time. Can anyone please tell me what the problem is? I was expecting a longer execution time for a larger NX-point FFT, but it takes almost the same time. My code is attached below.
I am using Visual C++ 2010 Express and CUDA v6.0. System specs: Core i7 at 3.60 GHz, 16 GB RAM, GPU: GeForce GT 640 (the same GPU is used for display and computation).
Any suggestions and help would be appreciated.
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <windows.h>
#include <time.h>
// includes,project
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_functions.h>
#include <helper_cuda.h>
// Raw Data Generation
// sin_func() writes CYCLES*TABLE_SIZE samples, advancing the phase by
// TWO_PI/TABLE_SIZE per sample, i.e. TABLE_SIZE samples per sine period.
#define TABLE_SIZE 1000 // samples per sine period
#define TWO_PI (3.14159 * 2) // approximate 2*pi; the expression has type double
#define CYCLES 20 // number of sine periods generated
#define NUMBER_OF_SAMPLES (TABLE_SIZE*CYCLES) // element count of the host sample buffers
// FFT Values
// NX and BATCH are passed straight to cufftPlan1d(); the R2C output buffer is
// sized as (NX/2+1)*BATCH complex values in main().
#define NX 2048 // NX-point DFT
#define BATCH 1 // batch argument given to cufftPlan1d
// Sine Generator Function
// Sine Generator Function
//
// Fills sample_ptr[0 .. CYCLES*TABLE_SIZE-1] with CYCLES periods of a
// unit-amplitude sine wave, TABLE_SIZE samples per period.
//
// sample_ptr: destination buffer; must have room for at least
//             CYCLES*TABLE_SIZE floats. A NULL pointer is ignored.
void sin_func(float *sample_ptr)
{
    // TWO_PI is a double expression; narrow once so the loop stays in float.
    const float phaseIncrement = (float)(TWO_PI / TABLE_SIZE);
    int i;

    if (sample_ptr == NULL) {
        return;
    }

    for (i = 0; i < CYCLES * TABLE_SIZE; i++) {
        // Derive the phase from the sample index instead of accumulating
        // "phase += increment": a running float sum picks up rounding error
        // on every one of the 20000 additions, whereas this keeps every
        // period identical. Indexing also replaces the original
        // "ptr + sizeof(float)/4" step, which only advanced by one element
        // when sizeof(float) happened to be 4.
        sample_ptr[i] = sinf((float)(i % TABLE_SIZE) * phaseIncrement);
    }
}
void main()
{
const int ARRAY_SIZE = NUMBER_OF_SAMPLES*sizeof(float);
const int FFT_OUT_SIZE = sizeof(cufftComplex)*(NX/2+1)*BATCH;
// Variable Declaration for execution time computation
LARGE_INTEGER ticksPerSecond;
LARGE_INTEGER startTick; // A point in time
LARGE_INTEGER starttime; // For converting tick into real time
LARGE_INTEGER endTick; // A point in time
LARGE_INTEGER endtime; // For converting tick into real time
// get the high resolution counter's accuracy
QueryPerformanceFrequency(&ticksPerSecond);
//
// Initialization of input data on Host
float h_rawdata[NUMBER_OF_SAMPLES];
float h_checkdata[NUMBER_OF_SAMPLES];
sin_func(&h_rawdata[0]);
// Display values in the resulting array
for (int i =0; i < 12 ; i++) {
printf("%f", h_rawdata[i]);
printf(((i % 4) != 3) ? "\t" : "\n");
}
//Initializing output array on Host
cufftComplex h_fftout[FFT_OUT_SIZE];
//Allocate memory on GPU
float *d_rawdata;
float *d_checkdata;
cufftHandle plan;
cufftComplex *d_fftout;
cudaMalloc((void**)&d_rawdata, ARRAY_SIZE);
cudaMalloc((void**)&d_checkdata, ARRAY_SIZE); // For Testing Only
cudaMalloc((void**)&d_fftout, FFT_OUT_SIZE);
//copying data to device(GPU) memory
cudaMemcpy (d_rawdata, h_rawdata, ARRAY_SIZE, cudaMemcpyHostToDevice);
// ** Doing FFT ** //
if (cudaGetLastError() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to allocate\n");
return;
}
if (cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan creation failed");
return;
}
// fft starting
QueryPerformanceCounter(&startTick); // Time stamp at start of FFT
if (cufftExecR2C(plan, d_rawdata, d_fftout) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
return;
}
if (cudaDeviceSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return;
}
QueryPerformanceCounter(&endTick); // Time stamp at end End of FFT
// ** Doing Inverse FFT ** //
if (cufftPlan1d(&plan, NX, CUFFT_C2R, BATCH) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan creation failed");
return;
}
if (cufftExecC2R(plan, d_fftout, d_checkdata) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
return;
}
if (cudaDeviceSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return;
}
// Copying Data Back to Host
cudaMemcpy (h_fftout, d_fftout, FFT_OUT_SIZE, cudaMemcpyDeviceToHost);
cudaMemcpy (h_checkdata, d_checkdata, ARRAY_SIZE, cudaMemcpyDeviceToHost);
cufftDestroy(plan);
cudaFree(d_rawdata);
cudaFree(d_fftout);
printf("\n");
// Displaying the resulting array
for (int i =0; i < 12 ; i++) {
printf("%f", h_checkdata[i]/NX);
printf(((i % 4) != 3) ? "\t" : "\n");
}
/// Ticks conversion
// convert the tick number into the number of seconds
// since the system was started...
starttime.QuadPart = startTick.QuadPart/ticksPerSecond.QuadPart;
endtime.QuadPart = endTick.QuadPart/ticksPerSecond.QuadPart;
//get the number of hours
int starthours = starttime.QuadPart/3600;
int endhours = endtime.QuadPart/3600;
//get the number of minutes
starttime.QuadPart = starttime.QuadPart - (starthours * 3600);
endtime.QuadPart = endtime.QuadPart - (endhours * 3600);
int startminutes = starttime.QuadPart/60;
int endminutes = endtime.QuadPart/60;
//get the number of seconds
int startseconds = starttime.QuadPart - (startminutes * 60);
int endseconds = starttime.QuadPart - (endminutes *60);
double ticks_per_micro= (double)ticksPerSecond.QuadPart/1000000;
//printf ("\n div = %f",ticks_per_micro);
//get the number of Microseconds
double startmicroSecondes = (double)((startTick.QuadPart % ticksPerSecond.QuadPart) / ticks_per_micro);
double endmicroSecondes = (double)((endTick.QuadPart % ticksPerSecond.QuadPart) / ticks_per_micro);
printf ("\n FFT Started %d:%d:%d::%.2f",starthours, startminutes, startseconds, startmicroSecondes);
printf ("\n FFT Ended %d:%d:%d::%.2f \n",endhours, endminutes, endseconds, endmicroSecondes);
printf ("\nFFT computation time for %d point DFT: %.2fus \n", NX, endmicroSecondes - startmicroSecondes);
}