FFT Computation Timing constraint on GPU.

Hi, I am new to CUDA and I am trying to perform an FFT on my GPU using the cuFFT library. The problem is that when I run the compiled code for the first time it takes around 500us, but if I run it again immediately, without waiting, it takes around 175us. (I am running the .exe from cmd.) Whenever I pause between runs, it takes 500us again to compute the FFT. For different NX-point DFT sizes it gives me almost the same timing. I am measuring the time from the start of the FFT to its end, not including the data-copying time. Can anyone please tell me what the problem is? I was expecting a higher execution time for a larger NX-point FFT, but it takes almost the same time. My code is attached below.

I am using Visual C++ 2010 Express and CUDA v6.0. System specs: Core i7 3.60 GHz, RAM: 16 GB, GPU: GeForce GT 640 (the same GPU is used for both display and computation).

Any suggestions and help would be appreciated.

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <windows.h>
#include <time.h> 

// includes,project
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_functions.h>
#include <helper_cuda.h>

// Raw Data Generation
#define TABLE_SIZE 1000
#define TWO_PI (3.14159 * 2)
#define CYCLES 20
#define NUMBER_OF_SAMPLES (TABLE_SIZE*CYCLES)

// FFT Values 
#define NX 2048  // NX-point DFT
#define BATCH 1 

// Sine Generator Function 
// Fills sample_ptr[0 .. CYCLES*TABLE_SIZE - 1] with consecutive samples of a
// sine wave whose period is TABLE_SIZE samples (phase step TWO_PI/TABLE_SIZE).
//
// sample_ptr: destination buffer; the caller must provide room for at least
//             CYCLES*TABLE_SIZE floats.
void sin_func(float *sample_ptr)
{
	const float phaseIncrement = TWO_PI/TABLE_SIZE;
	float currentPhase = 0.0f;

	// Index the buffer directly: pointer arithmetic is already scaled by the
	// element size, so no sizeof-based stepping is needed (the previous
	// "sizeof(float)/4" step only worked because sizeof(float) happens to be 4).
	for (int i = 0; i < CYCLES*TABLE_SIZE; i++){
		sample_ptr[i] = sin(currentPhase);
		currentPhase += phaseIncrement;
	}
}


// Releases whatever device buffers and cuFFT plans main() managed to create.
// Pointers that were never allocated are expected to be NULL (cudaFree treats
// a null pointer as a no-op); plans are destroyed only when their flag is set.
static void release_gpu_resources(float *d_rawdata, float *d_checkdata, cufftComplex *d_fftout,
                                  cufftHandle forwardPlan, bool forwardPlanValid,
                                  cufftHandle inversePlan, bool inversePlanValid)
{
	if (forwardPlanValid) {
		cufftDestroy(forwardPlan);
	}
	if (inversePlanValid) {
		cufftDestroy(inversePlan);
	}
	cudaFree(d_rawdata);
	cudaFree(d_checkdata);
	cudaFree(d_fftout);
}

// Generates a sine wave on the host, runs a forward R2C FFT of NX points on
// the GPU (timed with QueryPerformanceCounter), runs the inverse C2R FFT as a
// sanity check, and prints the first samples before/after plus the timing.
//
// Returns 0 on success and 1 if any CUDA or cuFFT call fails.
int main()
{
	// The FFT reads NX*BATCH floats from a buffer holding NUMBER_OF_SAMPLES floats.
#if (NX * BATCH) > NUMBER_OF_SAMPLES
#error "NX*BATCH must not exceed NUMBER_OF_SAMPLES"
#endif

	const int ARRAY_SIZE = NUMBER_OF_SAMPLES*sizeof(float);          // bytes of raw input
	const int FFT_OUT_COUNT = (NX/2+1)*BATCH;                        // complex bins produced by R2C
	const int FFT_OUT_SIZE = sizeof(cufftComplex)*FFT_OUT_COUNT;     // bytes of FFT output
	const int CHECK_SIZE = sizeof(float)*NX*BATCH;                   // bytes written by the inverse FFT

	// Variables for execution time computation
	LARGE_INTEGER ticksPerSecond;
	LARGE_INTEGER startTick;   // A point in time
	LARGE_INTEGER starttime;   // For converting tick into real time
	LARGE_INTEGER endTick;     // A point in time
	LARGE_INTEGER endtime;     // For converting tick into real time

	// get the high resolution counter's frequency
	QueryPerformanceFrequency(&ticksPerSecond);
	startTick.QuadPart = 0;
	endTick.QuadPart = 0;

	// Initialization of input data on Host
	float h_rawdata[NUMBER_OF_SAMPLES];
	float h_checkdata[NUMBER_OF_SAMPLES] = {0};

	sin_func(&h_rawdata[0]);
	// Display values in the resulting array
	for (int i = 0; i < 12; i++) {
		printf("%f", h_rawdata[i]);
		printf(((i % 4) != 3) ? "\t" : "\n");
	}

	// Output array on Host: sized in elements, not bytes
	cufftComplex h_fftout[FFT_OUT_COUNT];

	// Device resources; initialised so cleanup is safe on every path
	float *d_rawdata = NULL;
	float *d_checkdata = NULL;
	cufftComplex *d_fftout = NULL;
	cufftHandle forwardPlan = 0;
	cufftHandle inversePlan = 0;
	bool forwardPlanValid = false;
	bool inversePlanValid = false;
	bool succeeded = false;

	// Single-pass block: "break" jumps to the common cleanup below.
	do {
		// Allocate memory on GPU
		if (cudaMalloc((void**)&d_rawdata, ARRAY_SIZE) != cudaSuccess ||
		    cudaMalloc((void**)&d_checkdata, ARRAY_SIZE) != cudaSuccess ||   // For Testing Only
		    cudaMalloc((void**)&d_fftout, FFT_OUT_SIZE) != cudaSuccess) {
			fprintf(stderr, "Cuda error: Failed to allocate\n");
			break;
		}

		// copying data to device(GPU) memory
		if (cudaMemcpy(d_rawdata, h_rawdata, ARRAY_SIZE, cudaMemcpyHostToDevice) != cudaSuccess) {
			fprintf(stderr, "Cuda error: Failed to copy input to device\n");
			break;
		}

		// ** Doing FFT ** //
		if (cufftPlan1d(&forwardPlan, NX, CUFFT_R2C, BATCH) != CUFFT_SUCCESS) {
			fprintf(stderr, "CUFFT error: Plan creation failed");
			break;
		}
		forwardPlanValid = true;

		// NOTE(review): only a single, first cufftExecR2C call is timed, so any
		// one-time start-up cost of that call is part of the figure. Consider a
		// warm-up execution and averaging over many runs when comparing NX values.
		QueryPerformanceCounter(&startTick);  // Time stamp at start of FFT

		if (cufftExecR2C(forwardPlan, d_rawdata, d_fftout) != CUFFT_SUCCESS) {
			fprintf(stderr, "CUFFT error: ExecR2C Forward failed");
			break;
		}
		if (cudaDeviceSynchronize() != cudaSuccess) {
			fprintf(stderr, "Cuda error: Failed to synchronize\n");
			break;
		}
		QueryPerformanceCounter(&endTick);    // Time stamp at end of FFT

		// ** Doing Inverse FFT ** //
		// Separate handle so the forward plan is not overwritten and leaked.
		if (cufftPlan1d(&inversePlan, NX, CUFFT_C2R, BATCH) != CUFFT_SUCCESS) {
			fprintf(stderr, "CUFFT error: Plan creation failed");
			break;
		}
		inversePlanValid = true;

		if (cufftExecC2R(inversePlan, d_fftout, d_checkdata) != CUFFT_SUCCESS) {
			fprintf(stderr, "CUFFT error: ExecC2R Inverse failed");
			break;
		}
		if (cudaDeviceSynchronize() != cudaSuccess) {
			fprintf(stderr, "Cuda error: Failed to synchronize\n");
			break;
		}

		// Copying Data Back to Host; only the NX*BATCH samples the inverse FFT wrote
		if (cudaMemcpy(h_fftout, d_fftout, FFT_OUT_SIZE, cudaMemcpyDeviceToHost) != cudaSuccess ||
		    cudaMemcpy(h_checkdata, d_checkdata, CHECK_SIZE, cudaMemcpyDeviceToHost) != cudaSuccess) {
			fprintf(stderr, "Cuda error: Failed to copy results to host\n");
			break;
		}

		succeeded = true;
	} while (0);

	release_gpu_resources(d_rawdata, d_checkdata, d_fftout,
	                      forwardPlan, forwardPlanValid,
	                      inversePlan, inversePlanValid);

	if (!succeeded) {
		return 1;
	}

	printf("\n");
	// Displaying the resulting array (cuFFT transforms are unnormalised, hence /NX)
	for (int i = 0; i < 12; i++) {
		printf("%f", h_checkdata[i]/NX);
		printf(((i % 4) != 3) ? "\t" : "\n");
	}

	/// Ticks conversion

	// convert the tick number into the number of seconds
	// since the system was started...
	starttime.QuadPart = startTick.QuadPart/ticksPerSecond.QuadPart;
	endtime.QuadPart = endTick.QuadPart/ticksPerSecond.QuadPart;

	// get the number of hours
	int starthours = (int)(starttime.QuadPart/3600);
	int endhours = (int)(endtime.QuadPart/3600);

	// get the number of minutes
	starttime.QuadPart = starttime.QuadPart - (starthours * 3600);
	endtime.QuadPart = endtime.QuadPart - (endhours * 3600);

	int startminutes = (int)(starttime.QuadPart/60);
	int endminutes = (int)(endtime.QuadPart/60);

	// get the number of seconds (the end value must come from endtime)
	int startseconds = (int)(starttime.QuadPart - (startminutes * 60));
	int endseconds = (int)(endtime.QuadPart - (endminutes * 60));

	double ticks_per_micro = (double)ticksPerSecond.QuadPart/1000000;

	// get the number of microseconds within the current second
	double startmicroSecondes = (double)((startTick.QuadPart % ticksPerSecond.QuadPart) / ticks_per_micro);
	double endmicroSecondes = (double)((endTick.QuadPart % ticksPerSecond.QuadPart) / ticks_per_micro);

	// Elapsed time from the raw tick difference, so an interval that crosses a
	// second boundary is still reported correctly.
	double elapsedMicroSeconds = (double)(endTick.QuadPart - startTick.QuadPart) / ticks_per_micro;

	printf ("\n FFT Started %d:%d:%d::%.2f",starthours, startminutes, startseconds, startmicroSecondes);
	printf ("\n FFT Ended %d:%d:%d::%.2f \n",endhours, endminutes, endseconds, endmicroSecondes);

	printf ("\nFFT computation time for %d point DFT: %.2fus \n", NX, elapsedMicroSeconds);

	return 0;
}