Why is cuFFT running slowly?

I am trying to run a 2D FFT using cuFFT. It runs fine and the result is correct.

The problem is that it runs very slowly. For the 2D FFT I am using 256*128 input data: 256 sampling points and 128 chirps.

Running this takes around 150 ms, whereas it should take less than 1 ms.

Does anyone have any idea why?

The code is shared below.

#include <cstdio>
#include <cstdlib>

#include <cufft.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "helper_cuda.h"
#include "helper_functions.h"

// Number of ADC samples captured per chirp.
int Num_ADC_Samples = 256;
// Number of chirps that make up one frame.
int NChirps = 128;

int main()
{
int count = 0;

// Host data
cufftComplex* h_idata = (cufftComplex*)malloc(sizeof(cufftComplex) * Num_ADC_Samples * NChirps); // Host input signal variable
cufftComplex* h_odata = (cufftComplex*)malloc(sizeof(cufftComplex) * Num_ADC_Samples * NChirps); // host output variable

// Construct the host data (complex 2D data)
for (unsigned int count = 0; count < Num_ADC_Samples * NChirps; count++)
{
	h_idata[count].x = static_cast <float> (rand()) / static_cast <float> (1000);;
	h_idata[count].y = static_cast <float> (rand()) / static_cast <float> (1000);;
}

// Perform FFT on Device
cufftHandle plan;

// Device Memory
cufftComplex* d_idata;
checkCudaErrors(cudaMalloc((void**)&d_idata, Num_ADC_Samples * NChirps * sizeof(cufftComplex)));


// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_idata, h_idata, Num_ADC_Samples * NChirps * sizeof(cufftComplex), cudaMemcpyHostToDevice));

// Threads per CTA (Cooperative Thread Array) (1024 threads per CTA)
int NUM_THREADS = 1 << 10;
int NUM_BLOCKS = ((Num_ADC_Samples * NChirps) + NUM_THREADS - 1) / NUM_THREADS;

/* Create a 1D/2D FFT plan. */
cufftPlan2d(&plan, Num_ADC_Samples, NChirps, CUFFT_C2C);

/* Use the CUFFT plan to transform the signal out of place. */
cufftExecC2C(plan, d_idata, d_idata, CUFFT_FORWARD);

checkCudaErrors(cudaDeviceSynchronize());

// Copy device memory to host
checkCudaErrors(cudaMemcpy(h_odata, d_idata, Num_ADC_Samples * NChirps * sizeof(cufftComplex), cudaMemcpyDeviceToHost));

cufftDestroy(plan);
cudaFree(h_idata), cudaFree(h_odata), cudaFree(d_idata);

}