Hello All,
I am having a bit of trouble understanding the performance of my code. I find that when I do a forward FFT (D2Z) using CUFFT 2.3 in double precision, subsequent calls to cudaMalloc() are slow (they take seconds). I originally thought that perhaps the cufftExecD2Z function makes asynchronous kernel launches, so that when I timed the cudaMalloc() call I was also including the time for the FFT to finish. However, if this were the case, then it wouldn't matter how much memory I allocated after the FFT; it would be slow because the FFT was slow. Here is the program I ran:
[codebox]#include <cuda_runtime.h>
#include <cufft.h>
#include <iostream>
#include <time.h>
using namespace std;
/*
 * Kernel: writes the value 1 to every element of a row-major
 * n_rows x n_columns device buffer.
 *
 * Launch layout: 2-D grid of 2-D blocks. The x dimension indexes columns and
 * the y dimension indexes rows, so consecutive threads in x write consecutive
 * addresses. The grid may overshoot the matrix; out-of-range threads do
 * nothing. No shared memory is used.
 *
 * buffer    - device pointer with room for at least n_rows * n_columns elements of T
 * n_rows    - number of rows to fill
 * n_columns - number of columns to fill (also the row stride, in elements)
 */
template <typename T>
__global__ void fill_buffer(T *buffer, unsigned long n_rows, unsigned long n_columns) {
    // Widen before multiplying so the index arithmetic is done in unsigned long.
    const unsigned long row = static_cast<unsigned long>(blockIdx.y) * blockDim.y + threadIdx.y;
    const unsigned long col = static_cast<unsigned long>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Guard the grid tail: the launch rounds the grid up to whole blocks.
    if (row < n_rows && col < n_columns) {
        buffer[row * n_columns + col] = 1;
    }
}
/*
 * Creates a 2-D double-precision real-to-complex (CUFFT_D2Z) cuFFT plan.
 *
 * plan      - receives the plan handle; must not be null
 * n_rows    - forwarded as the first dimension argument of cufftPlan2d
 * n_columns - forwarded as the second dimension argument of cufftPlan2d
 *
 * Returns 0 on success, 1 if the arguments are unusable or cufftPlan2d fails.
 */
int createPlan(cufftHandle *plan, unsigned long n_rows, unsigned long n_columns) {
    // cufftPlan2d takes its dimensions as int. Narrow explicitly and verify the
    // round trip so an oversized value cannot silently wrap to a different size.
    const int nx = static_cast<int>(n_rows);
    const int ny = static_cast<int>(n_columns);

    if (!plan || nx <= 0 || ny <= 0 ||
        static_cast<unsigned long>(nx) != n_rows ||
        static_cast<unsigned long>(ny) != n_columns) {
        return 1;
    }

    return (cufftPlan2d(plan, nx, ny, CUFFT_D2Z) == CUFFT_SUCCESS) ? 0 : 1;
}
// Frees both device buffers and destroys the plan. Null buffer pointers are
// passed straight through to cudaFree, so this can be used on every exit path
// once the plan exists.
static void releaseAll(double *idata, cuDoubleComplex *odata, cufftHandle plan) {
    cudaFree(idata);
    cudaFree(odata);
    cufftDestroy(plan);
}

// Prints an error message, waits for a character on stdin (keeps a console
// window open), and returns the process failure status.
static int failAndWait(const char *message) {
    char input;
    cout << message << endl;
    cin >> input;
    return 1;
}

// Times a single cudaMalloc of `bytes` bytes with clock(), then frees the
// buffer. On success stores the elapsed seconds in *seconds and returns true;
// returns false if the allocation fails.
static bool timeDummyAlloc(size_t bytes, double *seconds) {
    void *dummy_buffer = 0;
    const clock_t start = clock();
    const cudaError_t status = cudaMalloc(&dummy_buffer, bytes);
    const clock_t stop = clock();
    if (status != cudaSuccess) {
        return false;
    }
    cudaFree(dummy_buffer);
    *seconds = ((double)(stop - start)) / ((double)CLOCKS_PER_SEC);
    return true;
}

/*
 * Measures how long a small cudaMalloc takes before and after a 2-D D2Z FFT.
 *
 * Kernel launches and cuFFT execution calls can return to the host before the
 * GPU work has finished, and a following cudaMalloc may block until that work
 * drains. The device is therefore synchronized before each timed allocation so
 * that the reported figures cover the allocation alone.
 *
 * Returns 0 on success and 1 on any failure.
 */
int main() {
    const unsigned long n_rows = 1799;
    const unsigned long n_columns = 1799;
    const size_t dummy_bytes = sizeof(double) * 40000;

    cufftHandle double_plan;
    if (createPlan(&double_plan, n_rows, n_columns) != 0) {
        return failAndWait("Error creating plan");
    }

    double *idata = 0;
    cuDoubleComplex *odata = 0;

    if (cudaMalloc((void **) &idata, sizeof(double) * n_rows * n_columns) != cudaSuccess) {
        releaseAll(idata, odata, double_plan);
        return failAndWait("Could not allocate idata!");
    }
    // A real-to-complex transform stores n_columns/2 + 1 complex values per row.
    if (cudaMalloc((void **) &odata, sizeof(cuDoubleComplex) * n_rows * (n_columns / 2 + 1)) != cudaSuccess) {
        releaseAll(idata, odata, double_plan);
        return failAndWait("Could not allocate odata!");
    }

    // Round the grid up so partial tiles at the right/bottom edges are covered.
    dim3 threadBlock(16, 16);
    dim3 threadGrid((threadBlock.x + n_columns - 1) / threadBlock.x,
                    (threadBlock.y + n_rows - 1) / threadBlock.y);
    fill_buffer<double><<<threadGrid, threadBlock>>>(idata, n_rows, n_columns);

    // cudaGetLastError reports launch-configuration errors; the synchronize
    // reports faults raised while the kernel ran and also keeps the kernel's
    // run time out of the first timed allocation.
    // (On toolkits older than CUDA 4.0 this call is spelled cudaThreadSynchronize.)
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
        releaseAll(idata, odata, double_plan);
        return failAndWait("Kernel execution error!");
    }

    double seconds = 0.0;
    if (!timeDummyAlloc(dummy_bytes, &seconds)) {
        releaseAll(idata, odata, double_plan);
        return failAndWait("Could not allocate temporary buffer");
    }
    cout << "Time to allocate a dummy buffer before FFT: " << seconds << endl;

    // Execute forward FFT
    if (cufftExecD2Z(double_plan, idata, odata) != CUFFT_SUCCESS) {
        releaseAll(idata, odata, double_plan);
        return failAndWait("Could not execute forward plan!");
    }
    // Wait for the FFT to finish; otherwise the next cudaMalloc absorbs its
    // execution time and the measurement is meaningless.
    if (cudaDeviceSynchronize() != cudaSuccess) {
        releaseAll(idata, odata, double_plan);
        return failAndWait("Could not execute forward plan!");
    }

    if (!timeDummyAlloc(dummy_bytes, &seconds)) {
        releaseAll(idata, odata, double_plan);
        return failAndWait("Could not allocate temporary buffer");
    }
    cout << "Time to allocate a dummy buffer after FFT: " << seconds << endl;

    releaseAll(idata, odata, double_plan);

    // Keep the console window open until the user types something.
    char input;
    cin >> input;
    return 0;
}[/codebox]
If I reduce the amount of space I allocate after the FFT (say to 1000 bytes) then this problem disappears. I have also noticed that the time it takes to allocate space after the FFT depends greatly on the size of the FFT. Using a matrix size of 2000 x 2000 makes it run much faster. Is there something particularly inefficient about 1799 x 1799 FFTs? Was my original thought correct, i.e. am I actually timing the FFT and not the cudaMalloc()? If so, why does the amount of memory I allocate after the FFT matter? Am I missing something here? I compiled this using VS 2008 on 32-bit Windows XP, with a GeForce GTX 285 with 1024 MB of memory. Any help would be appreciated. Here is the output I get when I run the code with a 1799 x 1799 FFT:
Time to allocate a dummy buffer before FFT: 0
Time to allocate a dummy buffer after FFT: 2.047
Thank you,
Zach