Hi,
I’m trying to run multiple kinds of FFTs on a stream of input images as fast as possible.
One idea was to execute the FFTs by batch and not one by one.
I ran a benchmark on my computer (i9-9960X, 128 GB RAM, TITAN RTX) with the following code and got a strange result.
#include <cufft.h>
#include <stdio.h>
#include <stdlib.h>
#include <cufftXt.h>
#include <cuda_fp16.h>
#include <assert.h>
#include <iostream>
#include <conio.h>
#define checkCudaErrors(ans) { checkCudaErrors_((ans), __FILE__, __LINE__); }
// Checks the result of a CUDA runtime call; on failure prints the error
// string plus the call site (file/line) to stderr and, unless abort is
// false, terminates the process with the error code as exit status.
inline void checkCudaErrors_(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;  // fast path: nothing to report
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
// Checks the result of a cuFFT call; on failure prints the numeric cuFFT
// error code and the call site, waits for a keypress, and exits.
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if (CUFFT_SUCCESS != err) {
        // Fixed: the original format string ended in ".n" (missing backslash,
        // so no newline was printed) and did not report which error occurred.
        fprintf(stderr, "cufftSafeCall() CUFFT error %d in file <%s>, line %i.\n",
            (int)err, file, line);
        // NOTE(review): getch() is Windows-only (<conio.h>) — consider getchar()
        // for portability; kept to preserve existing behavior.
        getch(); exit(-1);
    }
}
#define BATCH 256
using namespace std;
int main()
{
    // Benchmark: BATCH individual 1D C2C FFTs launched in a loop vs. one
    // batched plan executing all BATCH transforms in a single call.
    // Sweeps transform sizes from 2^8 (256) to 2^20 (1048576) samples.
    for (int e = 8; e <= 20; ++e)
    {
        /* Init data */
        long long image_size = 1LL << e;
        cuComplex *d_idata, *d_odata, *d_odata_batched;
        checkCudaErrors(cudaMallocManaged(&d_idata, BATCH * sizeof(cuComplex) * image_size));
        checkCudaErrors(cudaMallocManaged(&d_odata, BATCH * sizeof(cuComplex) * image_size));
        checkCudaErrors(cudaMallocManaged(&d_odata_batched, BATCH * sizeof(cuComplex) * image_size));
        // Fill input with deterministic values (host-side write to managed memory).
        for (long long j = 0; j < image_size * BATCH; ++j)
        {
            d_idata[j].x = 0.5f + j;
            d_idata[j].y = 0.5f + j + 2;
        }

        /* Non-batched benchmark: BATCH independent single-FFT executions */
        cufftHandle plan;
        cufftSafeCall(cufftCreate(&plan));
        size_t ws;
        cufftSafeCall(cufftEstimate1d(image_size, CUFFT_C2C, 1, &ws));
        cufftSafeCall(cufftMakePlan1d(plan, image_size, CUFFT_C2C, 1, &ws));
        // Warm-up run (untimed): forces managed-memory migration to the device
        // and any lazy plan/kernel initialization so the timed region measures
        // only FFT execution, not first-touch overhead.
        cufftSafeCall(cufftXtExec(plan, d_idata, d_odata, CUFFT_FORWARD));
        checkCudaErrors(cudaDeviceSynchronize());
        cudaEvent_t start, stop;
        checkCudaErrors(cudaEventCreate(&start));
        checkCudaErrors(cudaEventCreate(&stop));
        checkCudaErrors(cudaEventRecord(start));
        for (int b = 0; b < BATCH; ++b)
            cufftSafeCall(cufftXtExec(plan, d_idata + b * image_size, d_odata + b * image_size, CUFFT_FORWARD));
        checkCudaErrors(cudaEventRecord(stop));
        checkCudaErrors(cudaEventSynchronize(stop));
        float et;
        checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
        // %lld matches long long; the original %ld is undefined behavior on
        // LLP64 platforms (e.g. Windows) where long is 32-bit.
        printf("none batched loop forward FFT time for %lld samples: %fms\n", image_size, et);

        /* Batched benchmark: all BATCH transforms in one cufftXtExec call */
        cufftHandle plan_batched;
        cufftSafeCall(cufftCreate(&plan_batched));
        size_t ws_batched;
        cufftSafeCall(cufftEstimate1d(image_size, CUFFT_C2C, BATCH, &ws_batched));
        cufftSafeCall(cufftMakePlan1d(plan_batched, image_size, CUFFT_C2C, BATCH, &ws_batched));
        // Warm-up run (untimed), for the same reason as above.
        cufftSafeCall(cufftXtExec(plan_batched, d_idata, d_odata_batched, CUFFT_FORWARD));
        checkCudaErrors(cudaDeviceSynchronize());
        cudaEvent_t start_batched, stop_batched;
        checkCudaErrors(cudaEventCreate(&start_batched));
        checkCudaErrors(cudaEventCreate(&stop_batched));
        checkCudaErrors(cudaEventRecord(start_batched));
        cufftSafeCall(cufftXtExec(plan_batched, d_idata, d_odata_batched, CUFFT_FORWARD));
        checkCudaErrors(cudaEventRecord(stop_batched));
        checkCudaErrors(cudaEventSynchronize(stop_batched));
        float et_batched;
        checkCudaErrors(cudaEventElapsedTime(&et_batched, start_batched, stop_batched));
        printf("batched forward FFT time for %lld samples: %fms\n", image_size, et_batched);

        /* Free resources — plans and events were previously leaked on every
           iteration (13 plans + 13 batched plans + 52 events over the sweep). */
        cufftSafeCall(cufftDestroy(plan));
        cufftSafeCall(cufftDestroy(plan_batched));
        checkCudaErrors(cudaEventDestroy(start));
        checkCudaErrors(cudaEventDestroy(stop));
        checkCudaErrors(cudaEventDestroy(start_batched));
        checkCudaErrors(cudaEventDestroy(stop_batched));
        checkCudaErrors(cudaFree(d_idata));
        checkCudaErrors(cudaFree(d_odata));
        checkCudaErrors(cudaFree(d_odata_batched));
        cout << endl;
    }
    return 0;
}
Which gave me the following result :
none batched loop forward FFT time for 256 samples: 1.059200ms
batched forward FFT time for 256 samples: 0.006304ms
none batched loop forward FFT time for 512 samples: 1.148192ms
batched forward FFT time for 512 samples: 0.008032ms
none batched loop forward FFT time for 1024 samples: 1.235424ms
batched forward FFT time for 1024 samples: 0.012000ms
none batched loop forward FFT time for 2048 samples: 1.365248ms
batched forward FFT time for 2048 samples: 0.020192ms
none batched loop forward FFT time for 4096 samples: 1.940544ms
batched forward FFT time for 4096 samples: 0.031360ms
none batched loop forward FFT time for 8192 samples: 3.039808ms
batched forward FFT time for 8192 samples: 0.079520ms
none batched loop forward FFT time for 16384 samples: 4.212608ms
batched forward FFT time for 16384 samples: 0.263136ms
none batched loop forward FFT time for 32768 samples: 3.456544ms
batched forward FFT time for 32768 samples: 0.491072ms
none batched loop forward FFT time for 65536 samples: 3.774560ms
batched forward FFT time for 65536 samples: 0.984832ms
none batched loop forward FFT time for 131072 samples: 4.375968ms
batched forward FFT time for 131072 samples: 1.972576ms
none batched loop forward FFT time for 262144 samples: 7.563072ms
batched forward FFT time for 262144 samples: 3.947424ms
none batched loop forward FFT time for 524288 samples: 11.003968ms
batched forward FFT time for 524288 samples: 12.497760ms
none batched loop forward FFT time for 1048576 samples: 25.798401ms
batched forward FFT time for 1048576 samples: 24.676672ms
As far as I understand, a single small FFT does not generate enough work to saturate the GPU, so batching lets many of them run concurrently and amortizes the per-launch overhead — hence the large speedup. For big images, a single FFT (e.g. one 1048576-point transform) already occupies all the GPU's resources, so the batched transforms execute essentially one after another and the two approaches converge to the same time.
Am I correct or am I missing something ?