The input sample is generated within the code.
************/
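/**********************************************************************
HEADERS
**********************************************************************/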
#include <cuda.h>
#include <cufft.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
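/**********************************************************************
MACROS
**********************************************************************/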
// Number of complex samples in one transform. main() also uses this value as
// the transform length and as the distance between consecutive batches.
#define DATASIZE 4096 // Input size
// Number of transforms executed by one plan; main() passes it as the batch count.
#define BATCH 1 // Number of batches of transform done by a plan
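/**********************************************************************
CUDA ERROR CHECK
**********************************************************************/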
// Wraps a CUDA runtime call: evaluates it once and forwards the returned
// status together with the call site's file name and line number to gpuAssert.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**
 * Reports a failed CUDA runtime call.
 *
 * @param code  Status returned by the CUDA runtime call.
 * @param file  Name of the source file containing the call (from __FILE__).
 * @param line  Line number of the call (from __LINE__).
 * @param abort When true (the default), terminate the process with `code` as
 *              the exit status after printing the diagnostic.
 *
 * Does nothing when `code` is cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        // Message layout: "<error string> <file> <line>", written to stderr.
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/**********************************************************************
MAIN
**********************************************************************/
// Wraps a cuFFT call: evaluates it once and forwards the returned status
// together with the call site's file name and line number to cufftAssert.
#define cufftErrchk(ans) { cufftAssert((ans), __FILE__, __LINE__); }

/**
 * Reports a failed cuFFT call and terminates the process.
 *
 * @param code Status returned by the cuFFT call.
 * @param file Name of the source file containing the call (from __FILE__).
 * @param line Line number of the call (from __LINE__).
 *
 * Does nothing when `code` is CUFFT_SUCCESS. cuFFT has no error-string
 * helper in its public API, so the numeric status is printed instead.
 */
static inline void cufftAssert(cufftResult code, const char *file, int line)
{
    if (code != CUFFT_SUCCESS)
    {
        fprintf(stderr, "cuFFTassert: error code %d %s %d\n", (int)code, file, line);
        exit((int)code);
    }
}

/**
 * Runs BATCH forward 1D complex-to-complex FFTs of DATASIZE points each on a
 * generated ramp signal and prints the GPU time taken by the transform.
 *
 * Steps: build the plan, fill the host input, copy it to the device, time the
 * transform with CUDA events, copy the result back, release all resources.
 *
 * @return 0 on success, EXIT_FAILURE if host allocation fails. Any CUDA or
 *         cuFFT failure terminates the process through gpuErrchk / cufftErrchk.
 */
int main ()
{
    // Declaring iterators
    int batchIter, dataLengthIter;
    // Declaring parameters to create plan
    cufftHandle plan;
    // Rank denotes the dimension of the transform
    int rank = 1;
    // Size of the Fourier transform (one entry per dimension; cufftPlanMany takes int *)
    int n[] = { DATASIZE };
    // Distance between two successive input/output elements
    int iStride = 1, oStride = 1;
    // Distance between batches
    int iDist = DATASIZE, oDist = DATASIZE;
    // Number of batched executions
    int batch = BATCH;
    // Input size with pitch (ignored for 1D transforms)
    int inEmbed[] = { 0 };
    // Output size with pitch (ignored for 1D transforms)
    int onEmbed[] = { 0 };
    // Size in bytes of every host and device buffer used below
    const size_t numBytes = (size_t)DATASIZE * BATCH * sizeof( cufftComplex );

    // Declaring events to record kernel execution
    cudaEvent_t start, stop;
    gpuErrchk( cudaEventCreate(&start) );
    gpuErrchk( cudaEventCreate(&stop) );
    float milliseconds = 0;

    // Declaring device variable for input and output
    cufftComplex *deviceInputData = NULL;
    cufftComplex *deviceOutputData = NULL;

    // Declaring and allocating host variable for input
    cufftComplex *hostInputData = (cufftComplex *)malloc( numBytes );
    // Declaring and allocating host variable for output
    cufftComplex *hostOutputData = (cufftComplex *)malloc( numBytes );
    if ( hostInputData == NULL || hostOutputData == NULL ) {
        fprintf( stderr, "Host allocation of %zu bytes failed\n", numBytes );
        free( hostInputData );   // free(NULL) is a no-op
        free( hostOutputData );
        return EXIT_FAILURE;
    }

    // Allocating memory in the device for input and output
    gpuErrchk( cudaMalloc( (void**)&deviceInputData, numBytes ) );
    gpuErrchk( cudaMalloc( (void**)&deviceOutputData, numBytes ) );

    // Creating a plan for batched 1D complex-to-complex n-point Fourier transform
    cufftErrchk( cufftPlanMany( &plan, rank, n, inEmbed, iStride, iDist, onEmbed,
                                oStride, oDist, CUFFT_C2C, batch ) );

    // Loop to generate real and complex input: sample k of every batch is (k+1) + (k+1)i
    for ( batchIter = 0; batchIter < BATCH; batchIter++ ) {
        for ( dataLengthIter = 0; dataLengthIter < DATASIZE; dataLengthIter++ ) {
            hostInputData[batchIter * DATASIZE + dataLengthIter].x = dataLengthIter + 1;
            hostInputData[batchIter * DATASIZE + dataLengthIter].y = dataLengthIter + 1;
            /* printf( "%f + %f I\n", hostInputData[batchIter * DATASIZE + dataLengthIter].x,
                       hostInputData[batchIter * DATASIZE + dataLengthIter].y ); */
        }
    }

    // Copying input from host to device variable
    gpuErrchk( cudaMemcpy( deviceInputData, hostInputData, numBytes, cudaMemcpyHostToDevice ) );

    // Executing the plan. The input and output pointers and direction of transformation is specified.
    // The stop event is recorded right after the launch and then waited on, so
    // the measured interval covers the transform and not host-side sync latency.
    gpuErrchk( cudaEventRecord(start) );
    cufftErrchk( cufftExecC2C( plan, deviceInputData, deviceOutputData, CUFFT_FORWARD ) );
    gpuErrchk( cudaEventRecord(stop) );
    gpuErrchk( cudaEventSynchronize(stop) );

    // Device->Host copy of the results
    gpuErrchk( cudaMemcpy( hostOutputData, deviceOutputData, numBytes, cudaMemcpyDeviceToHost ) );

    // Displaying the output
    /* for (int batchIter = 0; batchIter < BATCH; batchIter++) {
        for (int dataLengthIter = 0; dataLengthIter < DATASIZE; dataLengthIter++) {
            printf( "%i %i %f %f\n", batchIter, dataLengthIter,
                    hostOutputData[batchIter * DATASIZE + dataLengthIter].x,
                    hostOutputData[batchIter * DATASIZE + dataLengthIter].y );
        }
    } */

    gpuErrchk( cudaEventElapsedTime(&milliseconds, start, stop) );
    printf(" Execution Time in milliseconds: %f\n", milliseconds);

    // Destroying the plan
    cufftErrchk( cufftDestroy( plan ) );

    // Free-ing memory
    gpuErrchk( cudaFree(deviceOutputData) );
    gpuErrchk( cudaFree(deviceInputData) );
    free( hostOutputData );
    free( hostInputData );

    // Releasing the timing events
    gpuErrchk( cudaEventDestroy(stop) );
    gpuErrchk( cudaEventDestroy(start) );

    return 0;
}