FFT takes about 40% longer than expected on the Jetson Orin Nano Super. It has 1024 cores running at 1020 MHz, so I expected it to be about 2.6x slower than my RTX 2060, which has 1920 cores running at 1395 MHz (1920 × 1395 / (1024 × 1020) ≈ 2.6). In practice, however, it runs 3.6x slower. What might be the underlying reason for this? Is there anything we could do to tune it? It makes a big difference in our time-constrained application.
Test code (note that I am timing only the loop, not the setup):
#include <chrono>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>
#include <cufft.h>
/**
 * Creates a batched 2D cuFFT plan for tightly packed data.
 *
 * The input and output layouts are identical: element stride 1, with each
 * batch entry placed rows * cols elements after the previous one.
 *
 * @param rows    Size of the first (slowest-varying) transform dimension.
 * @param cols    Size of the second (fastest-varying) transform dimension.
 * @param nbatch  Number of transforms in the batch.
 * @param type    cuFFT transform type (e.g. CUFFT_C2C).
 * @return        The plan handle; release it with cufftDestroy().
 *
 * On invalid sizes or a cuFFT failure, an error is printed to stderr and the
 * process exits, instead of returning an uninitialised handle.
 */
cufftHandle getCufftPlan(size_t rows, size_t cols, int nbatch, cufftType type) {
    constexpr int rank = 2;
    constexpr int stride = 1;

    // cufftPlanMany takes `int` sizes and distances, so both dimensions and
    // their product must fit in an int. Check before narrowing so oversized
    // inputs are rejected rather than silently truncated or overflowed.
    const size_t maxInt = static_cast<size_t>(INT_MAX);
    if (rows == 0 || cols == 0 || rows > maxInt || cols > maxInt / rows) {
        std::fprintf(stderr,
                     "getCufftPlan: unsupported size %zu x %zu\n", rows, cols);
        std::exit(EXIT_FAILURE);
    }

    int dims[rank] = { static_cast<int>(rows), static_cast<int>(cols) };
    const int vol = dims[0] * dims[1];  // distance between consecutive batch entries

    cufftHandle handle = 0;
    const cufftResult status = cufftPlanMany(&handle, rank, dims,
                                             dims, stride, vol,   // input layout
                                             dims, stride, vol,   // output layout
                                             type, nbatch);
    if (status != CUFFT_SUCCESS) {
        std::fprintf(stderr, "cufftPlanMany failed: cufftResult %d\n",
                     static_cast<int>(status));
        std::exit(EXIT_FAILURE);
    }
    return handle;
}
int main(int argc, char** argv) {
int w = 4912/2;
int h = 3684/2;
cufftComplex* input;
cufftComplex* output;
cudaMalloc(&input, w * h * sizeof(cufftComplex));
cudaMalloc(&output, w * h * sizeof(cufftComplex));
cudaMemset(input, 0x00, w * h * sizeof(cufftComplex));
cudaMemset(output, 0x00, w * h * sizeof(cufftComplex));
cufftHandle plan = getCufftPlan(h, w, 1, CUFFT_C2C);
//auto time0 = std::chrono::high_resolution_clock::now();
for (int i=0; i<100; ++i) {
cufftExecC2C(plan, input, output, CUFFT_FORWARD);
}
//auto time1= std::chrono::high_resolution_clock::now();
//auto duration = std::chrono::duration_cast<std::chrono::microseconds>(time1 - time0);
//std::cout << duration.count() << std::endl;
cufftDestroy(plan);
cudaFree(output);
cudaFree(input);
}