We have two very simple kernels: one fills a device buffer with data for the FFT to process, and the other computes the magnitude squared of the FFT output. What's odd is that our kernels take 50% longer than the FFT itself. My only suspicion is how we chose the number of threads per block and the number of blocks.

tpb = 1024; // threads per block

fftSize = 1024; // no relationship to tpb ... just coincidence

int numBatches = 40000;

dataLength = fftSize * numBatches; // e.g. 1024 * 40000

// Number of blocks needed to cover dataLength elements. This must be an
// integer ceil-division: ceil(dataLength / tpb) divides as integers first,
// so the remainder is already gone before ceil() runs and a trailing
// partial block would never be launched.
const int numBlocks = (dataLength + tpb - 1) / tpb;

// Allocate the ComplexShortDevice and cufftComplexDevice buffers of size dataLength

// Allocate floatDeviceBuffer for mag squared output of size dataLength

// Copy complex short host data to the complexShortDeviceBuffer

// Fill the array - note that numBlocks = dataLength / tpb = 40000 in this example

fillFFTArray<<<numBlocks, tpb, 0, stream1>>>(cufftComplexDeviceBuffer, complexShortDeviceBuffer, dataLength);

// Run the FFT (in place).
// NOTE(review): the kernels run on stream1; confirm plan1 was bound to the
// same stream with cufftSetStream so the three steps are ordered without an
// implicit default-stream synchronization.
// NOTE(review): the returned cufftResult should be checked against
// CUFFT_SUCCESS; the stray ")" in the original suggests a checking wrapper
// was removed when the snippet was pasted.

cufftExecC2C(plan1, cufftComplexDeviceBuffer, cufftComplexDeviceBuffer, CUFFT_FORWARD);

// Calc mag squared

calcMagnitudeSquared<<<numBlocks, tpb, 0, stream1>>>(floatDeviceBuffer, cufftComplexDeviceBuffer, dataLength);

// Converts an array of complex short samples into cufftComplex (float) samples.
//
// Launch layout: 1D grid of 1D blocks, one thread per element; the grid must
// supply at least `count` threads. Threads whose index is >= count do nothing.
// Uses no shared memory.
//
//   fltData  output, `count` elements written
//   shData   input, `count` elements read; must not alias fltData
//   count    number of elements to convert
__global__ void fillFFTArray(cufftComplex* __restrict__ fltData,
                             const ComplexShort* __restrict__ shData,
                             int count) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= count) {
        return;  // tail threads of the last block
    }

    // Read the input element once and write the output element as a whole,
    // so the compiler is free to issue one load and one store per thread
    // instead of separate accesses for .x and .y.
    const ComplexShort in = shData[i];
    cufftComplex out;
    out.x = in.x;
    out.y = in.y;
    fltData[i] = out;
}

// Writes the squared magnitude (x*x + y*y) of each complex sample.
//
// Launch layout: 1D grid of 1D blocks, one thread per element; the grid must
// supply at least `count` threads. Threads whose index is >= count do nothing.
// Uses no shared memory.
//
//   realData   output, `count` floats written
//   cmplxData  input, `count` complex samples read; must not alias realData
//   count      number of elements to process
__global__ void calcMagnitudeSquared(float* __restrict__ realData,
                                     const cufftComplex* __restrict__ cmplxData,
                                     int count) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= count) {
        return;  // tail threads of the last block
    }

    // Load the complex sample once; both components are then reused from
    // registers rather than re-indexing global memory four times.
    const cufftComplex v = cmplxData[i];
    realData[i] = (v.x * v.x) + (v.y * v.y);
}