Hello, I recently tested the performance of cuFFT for several transform sizes, and I found that the results differ between GPUs.
I am running a 2D complex-to-complex FFT with the dimensions set to 723*723 (the prime factors of 723 are 3 and 241).
I use cufftGetSize2d to measure the plan's memory overhead (I also tried cufftEstimate2d, but the result is the same).
However, when I run this API on a GTX 1080 and on a V100, it gives different results, even though I checked that both builds link against the same libcufft.so.10.
I find this very strange: why does the memory overhead depend on which GPU card is used?
GTX1080:
worksize : 16867328 and complex size 4181832
V100:
worksize : 4181832 and complex size 4181832
The code is as follows:
#include "stdio.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "cufft.h"
#include "device_launch_parameters.h"
/* Terminate with a diagnostic if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Terminate with a diagnostic if a cuFFT call did not succeed. */
static void checkCufft(cufftResult status, const char *what) {
    if (status != CUFFT_SUCCESS) {
        fprintf(stderr, "cuFFT error in %s: code %d\n", what, (int) status);
        exit(EXIT_FAILURE);
    }
}

/*
 * Times 100 forward 2D complex-to-complex cuFFT transforms of size
 * dimx x dimy and prints the work-area size reported by cufftGetSize2d next
 * to the size in bytes of one dimx x dimy complex buffer.
 *
 * Parameters:
 *   dimx, dimy - transform dimensions, passed to cufftPlan2d in that order;
 *                both must be positive.
 *
 * Returns the elapsed time in milliseconds, measured with CUDA events, for
 * all 100 cufftExecC2C calls together (not per transform).
 *
 * Any failing allocation, CUDA runtime call or cuFFT call prints a message
 * to stderr and terminates the process, so a returned timing always comes
 * from a run in which every call succeeded.
 */
float test2dfft(int dimx,int dimy) {
    if (dimx <= 0 || dimy <= 0) {
        fprintf(stderr, "test2dfft: dimensions must be positive (got %d x %d)\n", dimx, dimy);
        exit(EXIT_FAILURE);
    }

    /* Compute sizes in size_t so large dimensions cannot overflow int. */
    const size_t count = (size_t) dimx * (size_t) dimy;
    const size_t bytes = count * sizeof(cufftComplex);

    cufftComplex *input = (cufftComplex*) malloc(bytes);
    cufftComplex *output = (cufftComplex*) malloc(bytes);
    if (input == NULL || output == NULL) {
        fprintf(stderr, "test2dfft: host allocation of %zu bytes failed\n", bytes);
        exit(EXIT_FAILURE);
    }

    /* Deterministic test signal: real part cycles 0..999, imaginary part is 0. */
    for (size_t i = 0; i < count; i++) {
        input[i].x = (float) (i % 1000);
        input[i].y = 0;
    }

    cufftComplex *d_inputData = NULL, *d_outData = NULL;
    checkCuda(cudaMalloc((void**) &d_inputData, bytes), "cudaMalloc(d_inputData)");
    checkCuda(cudaMalloc((void**) &d_outData, bytes), "cudaMalloc(d_outData)");
    checkCuda(cudaMemcpy(d_inputData, input, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    cufftHandle plan;
    checkCufft(cufftPlan2d(&plan, dimx, dimy, CUFFT_C2C), "cufftPlan2d");

    /*
     * NOTE(review): per the cuFFT documentation, cufftGetSize2d returns an
     * estimate for the given parameters, while cufftGetSize(plan, &size)
     * returns the work area of an already-created plan. Consider the latter
     * if the actual allocation is what should be reported - confirm.
     */
    size_t workSize = 0;
    checkCufft(cufftGetSize2d(plan, dimx, dimy, CUFFT_C2C, &workSize), "cufftGetSize2d");
    /* %zu is the correct conversion for size_t; %ld was a type mismatch. */
    printf("worksize : %zu and complex size %zu \n", workSize, bytes);

    cudaEvent_t start1, stop1;
    checkCuda(cudaEventCreate(&start1), "cudaEventCreate(start1)");
    checkCuda(cudaEventCreate(&stop1), "cudaEventCreate(stop1)");

    checkCuda(cudaEventRecord(start1, 0), "cudaEventRecord(start1)");
    for (int i = 0; i < 100; i++) {
        checkCufft(cufftExecC2C(plan, d_inputData, d_outData, CUFFT_FORWARD), "cufftExecC2C");
    }
    checkCuda(cudaEventRecord(stop1, 0), "cudaEventRecord(stop1)");
    /* Block until stop1 has completed so the elapsed time can be read. */
    checkCuda(cudaEventSynchronize(stop1), "cudaEventSynchronize(stop1)");

    float msecTotal1 = 0.0f;
    checkCuda(cudaEventElapsedTime(&msecTotal1, start1, stop1), "cudaEventElapsedTime");

    /* The result is copied back to the host but not inspected here. */
    checkCuda(cudaMemcpy(output, d_outData, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");

    /* Release everything created in this call, including the timing events. */
    checkCuda(cudaEventDestroy(start1), "cudaEventDestroy(start1)");
    checkCuda(cudaEventDestroy(stop1), "cudaEventDestroy(stop1)");
    checkCufft(cufftDestroy(plan), "cufftDestroy");
    checkCuda(cudaFree(d_inputData), "cudaFree(d_inputData)");
    checkCuda(cudaFree(d_outData), "cudaFree(d_outData)");
    free(input);
    free(output);

    return msecTotal1;
}
/*
 * Benchmark driver: calls test2dfft(723, 723) 100 times, prints the time
 * returned by each call, then prints the mean of those times.
 */
int main() {
    enum { RUNS = 100 };
    double perRunMs[RUNS];
    double totalMs = 0;

    int run = 0;
    while (run < RUNS) {
        const double elapsed = test2dfft(723, 723);
        perRunMs[run] = elapsed;
        printf("ITER %f \n", perRunMs[run]);
        totalMs += elapsed;
        ++run;
    }

    const double meanMs = totalMs / RUNS;
    printf("\n AVER %f \n", meanMs);
    return 0;
}