Thanks a lot for your answer AastaLLL.
I played around a little with the FFT example.
My use case is to calculate two FFTs, where the output of the first FFT is the input to the second FFT.
fft1 (length 512) is executed 98304 times.
fft2 (length 1024) is executed 24576 times.
Before calculating fft2 I have to reorder the data. To do this, I want to use the advanced data layout provided by cufftPlanMany.
Here is the timing (in milliseconds) with contiguous input for fft2, which looks really good to me:
execute fft1 : 28.778000
execute fft2 : 12.420000
But as soon as I change the input layout of fft2 (strided input instead of contiguous input), I get this timing:
execute fft1 : 27.136999
execute fft2 : 686.557007
The execution time of fft2 increased by a factor of about 55.
It seems I'm doing something wrong. Below is the code I use. The fast (contiguous) configuration can be enabled by replacing #if 0 with #if 1.
Can someone please let me know what I'm doing wrong?
Many thanks and have a nice weekend.
Dirk
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Example showing two chained, batched 1D FFTs with CUFFT (cufftPlanMany) on mapped host memory. */
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include <unistd.h>
typedef float2 Complex;
//#define ZERO_COPY
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
// The filter size is assumed to be a number smaller than the signal size
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    /* All of the work (setup, timing loop, cleanup) lives in runTest;
     * main only forwards the command line to it. */
    runTest(argc, argv);

    return 0;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
/*
 * Reads `count` floats from the binary file `path` into a newly malloc'ed
 * buffer. Returns NULL (after printing a message) if the buffer cannot be
 * allocated or the file cannot be opened. If the file holds fewer than
 * `count` floats, a warning is printed and the remainder is zero-filled so
 * that no uninitialised memory is ever fed to the FFT. Caller must free().
 */
static float *loadRawData(const char *path, size_t count)
{
    float *data = (float *)malloc(count * sizeof(float));
    if (!data) {
        printf("memory allocation error\n");
        return NULL;
    }

    FILE *fp = fopen(path, "rb");
    if (fp == NULL) {
        printf("no valid file");
        free(data);
        return NULL;
    }

    size_t got = fread(data, sizeof(float), count, fp);
    fclose(fp);

    if (got < count) {
        printf("warning: %s holds only %zu of %zu floats, zero-filling the rest\n",
               path, got, count);
        memset(data + got, 0, (count - got) * sizeof(float));
    }
    return data;
}

/*
 * Writes the valid part of a batched R2C result to `path`.
 * Batch b starts at element b * `dist` of `data` and only its first `bins`
 * elements are written; the gap between `bins` and `dist` is skipped.
 * Failures are reported on stdout and are not fatal.
 */
static void writeR2COutput(const char *path, const Complex *data,
                           int bins, int dist, int batch)
{
    FILE *fp = fopen(path, "wb");
    if (fp == NULL) {
        printf("cannot open %s for writing\n", path);
        return;
    }

    for (int b = 0; b < batch; b++) {
        const Complex *src = data + (size_t)b * (size_t)dist;
        if (fwrite(src, sizeof(Complex), (size_t)bins, fp) != (size_t)bins) {
            printf("write error on %s\n", path);
            break;
        }
    }
    fclose(fp);
}

/*
 * Times two chained batched FFTs that operate on mapped (zero-copy) pinned
 * host memory:
 *   FFT1: 512-point R2C, 98304 batches, input read from "rawData.bin".
 *   FFT2: 1024-point C2C forward, 24576 batches, reading the FFT1 output
 *         buffer and writing to its own output buffer.
 * The three stages (host memcpy, FFT1, FFT2) are timed and printed for 10
 * iterations, then the FFT1 result is written to "FftOut.bin".
 *
 * argc/argv are only forwarded to findCudaDevice().
 *
 * Switch the `#if 0` below to `#if 1` to give FFT2 a contiguous input layout
 * instead of the strided ("reordering") one.
 */
void runTest(int argc, char **argv)
{
    /* problem dimensions */
    const int fft1Length = 512;
    const int fft1Batch  = 1024 * 32 * 3;       /* 98304 transforms */
    const int fft1Bins   = fft1Length / 2 + 1;  /* non-redundant R2C bins */
    const int fft2Length = 1024;
    const int fft2Batch  = 256 * 32 * 3;        /* 24576 transforms */
    const int iterations = 10;

    /* buffer sizes in bytes (kept as int to match the %i printouts below) */
    int memSizeInFFT1  = fft1Length * fft1Batch * sizeof(float);
    int memSizeOutFFT1 = fft1Length * fft1Batch * sizeof(Complex);
    int memSizeOutFFT2 = fft2Length * fft2Batch * sizeof(Complex) * 2;

    /* host-side views of the mapped pinned buffers */
    float   *pDataInFFT1  = NULL;
    Complex *pDataOutFFT1 = NULL;
    Complex *pDataOutFFT2 = NULL;

    /* device-side aliases of the same buffers */
    float   *pCudaFftInFFT1  = NULL;
    Complex *pCudaFftOutFFT1 = NULL;
    Complex *pCudaFftOutFFT2 = NULL;

    float *pFileData = NULL;
    float elapsedTimeInMs = 0.0f;
    StopWatchInterface *timer = NULL;
    cufftHandle planFFT1;
    cufftHandle planFFT2;

    /* FFT1 layout: contiguous input and output, one batch every 512 elements */
    int rankFFT1            = 1;
    int rankFFT1Array[1]    = { fft1Length };
    int inembedArrayFFT1[1] = { fft1Length };
    int onembedArrayFFT1[1] = { fft1Length };
    int istrideFFT1 = 1, idistFFT1 = fft1Length;
    int ostrideFFT1 = 1, odistFFT1 = fft1Length;

    /* FFT2 layout: output is always contiguous, input layout is selectable */
    int rankFFT2            = 1;
    int rankFFT2Array[1]    = { fft2Length };
    int inembedArrayFFT2[1] = { fft2Length };
    int onembedArrayFFT2[1] = { fft2Length };
    int ostrideFFT2 = 1, odistFFT2 = fft2Length;
#if 0
    /* contiguous input: consecutive samples of one transform are adjacent */
    int istrideFFT2 = 1;
    int idistFFT2   = fft2Length;
#else
    /* strided input: samples of one transform are fft2Batch elements apart
     * and consecutive transforms start one element apart.
     * NOTE(review): confirm these strides match the intended reordering of
     * the FFT1 output.
     * NOTE(review): the buffers are mapped host memory, so this scattered
     * access pattern is presumably what makes this variant much slower;
     * a device-resident buffer or an explicit transpose may help - verify
     * with a profiler. */
    int istrideFFT2 = fft2Batch;
    int idistFFT2   = 1;
#endif

    /* default call */
    findCudaDevice(argc, (const char **)argv);
    printf("[simpleCUFFT] is starting...\n");

    printf("memsite FFT1_In:%i\n", memSizeInFFT1);
    printf("memsite FFT1_Out:%i\n", memSizeOutFFT1);
    printf("memsite FFT2_Out:%i\n", memSizeOutFFT2);

    /* Load the input before acquiring any CUDA resources so that a missing
     * or unreadable file leaves nothing behind to clean up. */
    pFileData = loadRawData("rawData.bin", (size_t)fft1Length * (size_t)fft1Batch);
    if (!pFileData) {
        return;
    }

    sdkCreateTimer(&timer);

    /* memory allocation: pinned host memory mapped into the device address space.
     * NOTE(review): the result of cudaSetDeviceFlags is left unchecked as in
     * the original flow; it may report an error when the device is already
     * active - confirm before making it fatal. */
    cudaSetDeviceFlags(cudaDeviceMapHost);
    checkCudaErrors(cudaHostAlloc((void **)&pDataInFFT1,  memSizeInFFT1,  cudaHostAllocMapped));
    checkCudaErrors(cudaHostAlloc((void **)&pDataOutFFT1, memSizeOutFFT1, cudaHostAllocMapped));
    checkCudaErrors(cudaHostAlloc((void **)&pDataOutFFT2, memSizeOutFFT2, cudaHostAllocMapped));
    memset(pDataOutFFT1, 0, memSizeOutFFT1);
    memset(pDataOutFFT2, 0, memSizeOutFFT2);

    /* range fft */
    checkCudaErrors(cufftPlanMany(&planFFT1, rankFFT1, rankFFT1Array,
                                  inembedArrayFFT1, istrideFFT1, idistFFT1,
                                  onembedArrayFFT1, ostrideFFT1, odistFFT1,
                                  CUFFT_R2C, fft1Batch));
    /* speed fft */
    checkCudaErrors(cufftPlanMany(&planFFT2, rankFFT2, rankFFT2Array,
                                  inembedArrayFFT2, istrideFFT2, idistFFT2,
                                  onembedArrayFFT2, ostrideFFT2, odistFFT2,
                                  CUFFT_C2C, fft2Batch));

    /* convert host pointers to their device aliases */
    checkCudaErrors(cudaHostGetDevicePointer((void **)&pCudaFftInFFT1,  (void *)pDataInFFT1,  0));
    checkCudaErrors(cudaHostGetDevicePointer((void **)&pCudaFftOutFFT1, (void *)pDataOutFFT1, 0));
    checkCudaErrors(cudaHostGetDevicePointer((void **)&pCudaFftOutFFT2, (void *)pDataOutFFT2, 0));

    for (int i = 0; i < iterations; i++) {
        /* time memcpy */
        sdkResetTimer(&timer);
        sdkStartTimer(&timer);
        memcpy(pDataInFFT1, pFileData, memSizeInFFT1);
        sdkStopTimer(&timer);
        elapsedTimeInMs = sdkGetTimerValue(&timer);
        printf("copy rawdata from CPU RAM to GPU: %f\n", elapsedTimeInMs);

        /* time first fft */
        sdkResetTimer(&timer);
        sdkStartTimer(&timer);
        checkCudaErrors(cufftExecR2C(planFFT1,
                                     (cufftReal *)pCudaFftInFFT1,
                                     (cufftComplex *)pCudaFftOutFFT1));
        /* synchronise so the timer covers the execution, not just the launch */
        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&timer);
        elapsedTimeInMs = sdkGetTimerValue(&timer);
        printf("execute fft1 : %f\n", elapsedTimeInMs);

        /* time second fft: out of place, FFT1 output -> FFT2 output.
         * Running it in place on the FFT2 buffer would ignore the FFT1
         * result, and with differing input/output layouts the contiguous
         * output of one batch would overwrite input elements that other
         * batches still have to read. */
        sdkResetTimer(&timer);
        sdkStartTimer(&timer);
        checkCudaErrors(cufftExecC2C(planFFT2,
                                     (cufftComplex *)pCudaFftOutFFT1,
                                     (cufftComplex *)pCudaFftOutFFT2,
                                     CUFFT_FORWARD));
        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&timer);
        elapsedTimeInMs = sdkGetTimerValue(&timer);
        printf("execute fft2 : %f\n", elapsedTimeInMs);
    }

    /* free plan resources */
    checkCudaErrors(cufftDestroy(planFFT1));
    checkCudaErrors(cufftDestroy(planFFT2));

    /* write result to file: only the fft1Bins valid bins of each batch, which
     * sit odistFFT1 elements apart in the output buffer */
    writeR2COutput("FftOut.bin", pDataOutFFT1, fft1Bins, odistFFT1, fft1Batch);

    /* free memory */
    checkCudaErrors(cudaFreeHost(pDataInFFT1));
    checkCudaErrors(cudaFreeHost(pDataOutFFT1));
    checkCudaErrors(cudaFreeHost(pDataOutFFT2));
    free(pFileData);
    sdkDeleteTimer(&timer);
}