Hi,
I discovered today that my 1D FFT plans using cuFFT were allocating large amounts of device memory.
This is quite confusing as I am of course already preparing a buffer for the CUFFT routines to utilize. Using cudaMemGetInfo before and after the plan creation revealed that the CUFFT plans were occupying as much as ~140+ MiB, which is quite prohibitive.
It was easy getting around this issue in my example (see simple code example below) by simply executing a smaller plan several times and moving the pointers.
Can someone tell me why the CUFFT routines work in this manner? It is clearly possible to drastically decrease memory utilization when working with large enough datasets and get the same performance. Does the CPU based FFTW exhibit the same behaviour?
Many thanks!
// Printouts of below code
Small diff: 80.000 MiB
Big diff: 160.000 MiB
Small plan exec time: 6.999 ms
BIg plan exec time: 6.927 ms
// This was written on the fly for Windows, but you're smart and will understand anyway...
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <windows.h>
// Returns a high-resolution wall-clock timestamp in seconds, based on the
// Windows performance counter (QueryPerformanceCounter / Frequency).
double get_clock()
{
    LARGE_INTEGER frequency;
    LARGE_INTEGER now;
    QueryPerformanceFrequency(&frequency);
    QueryPerformanceCounter(&now);
    // Ticks divided by ticks-per-second gives seconds.
    return (double)now.QuadPart / (double)frequency.QuadPart;
}
// Benchmark: compare device-memory footprint and execution time of one big
// batched cuFFT plan versus a half-sized plan executed twice with offset
// pointers over the same data, then verify both produce the same output.
int main(int argc, char* argv[])
{
    int totalBatches = 40960;              // batches handled by the big plan
    int numBatches = totalBatches / 2;     // batches handled by the small plan
    int batchSize = 512;                   // FFT length per batch
    int els = numBatches * batchSize;      // elements covered by one small-plan run
    size_t size = (size_t)totalBatches * batchSize * sizeof(float2);
    int N = (int)(size / sizeof(float2));  // total complex elements

    float2* h_input        = (float2*)malloc(size);
    float2* h_output_small = (float2*)malloc(size);
    float2* h_output_big   = (float2*)malloc(size);
    if (!h_input || !h_output_small || !h_output_big) {
        fprintf(stderr, "\n Host allocation failed");
        return EXIT_FAILURE;
    }

    float2* d_input  = NULL;
    float2* d_output = NULL;
    if (cudaMalloc((void**)&d_input, size)  != cudaSuccess ||
        cudaMalloc((void**)&d_output, size) != cudaSuccess) {
        fprintf(stderr, "\n cudaMalloc failed");
        return EXIT_FAILURE;
    }

    // Fill the input with uniform random complex samples in [0, 1].
    for (int i = 0; i < N; i++) {
        h_input[i].x = float(rand()) / float(RAND_MAX);
        h_input[i].y = float(rand()) / float(RAND_MAX);
    }
    cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);

    size_t before, after;
    size_t total;

    // Device memory consumed by creating the half-size plan.
    cudaMemGetInfo(&before, &total);
    cufftHandle smallPlan;
    if (cufftPlan1d(&smallPlan, batchSize, CUFFT_C2C, numBatches) != CUFFT_SUCCESS) {
        fprintf(stderr, "\n cufftPlan1d (small) failed");
        return EXIT_FAILURE;
    }
    cudaMemGetInfo(&after, &total);
    printf("\n Small diff: %0.3f", float(before - after) / (1024.0f * 1024.0f));

    // Device memory consumed by creating the full-size plan.
    cudaMemGetInfo(&before, &total);
    cufftHandle bigPlan;
    if (cufftPlan1d(&bigPlan, batchSize, CUFFT_C2C, totalBatches) != CUFFT_SUCCESS) {
        fprintf(stderr, "\n cufftPlan1d (big) failed");
        return EXIT_FAILURE;
    }
    cudaMemGetInfo(&after, &total);
    printf("\n Big diff: %0.3f", float(before - after) / (1024.0f * 1024.0f));

    // Execute the small plan twice, offsetting the pointers by one half.
    double smallTime = get_clock();
    cufftExecC2C(smallPlan, d_input, d_output, CUFFT_FORWARD);
    cufftExecC2C(smallPlan, d_input + els, d_output + els, CUFFT_FORWARD);
    cudaDeviceSynchronize();  // FFT launches are async; sync before stopping the clock
    smallTime = get_clock() - smallTime;
    printf("\n Small plan exec time: %0.3f ms", smallTime * float(1E3));
    cudaMemcpy(h_output_small, d_output, size, cudaMemcpyDeviceToHost);

    // Execute the big plan once over the whole input.
    double bigTime = get_clock();
    cufftExecC2C(bigPlan, d_input, d_output, CUFFT_FORWARD);
    cudaDeviceSynchronize();
    bigTime = get_clock() - bigTime;
    printf("\n BIg plan exec time: %0.3f ms", bigTime * float(1E3));
    cudaMemcpy(h_output_big, d_output, size, cudaMemcpyDeviceToHost);

    // Check that both approaches produced numerically identical results.
    // (els * 2 == N, so this covers every element.)
    for (int i = 0; i < N; i++) {
        float diff_x = fabs(h_output_small[i].x - h_output_big[i].x);
        float diff_y = fabs(h_output_small[i].y - h_output_big[i].y);
        if (diff_x > float(1E-5) || diff_y > float(1E-5))
            printf("\n Mhmmmm");
    }

    // Fix: a float2 struct cannot be passed through a "%0.3f" varargs slot
    // (undefined behavior) — print the real component instead.
    printf("\n OUtput val: %0.3f", h_output_small[33].x);

    // Release plans, device buffers, and host buffers.
    cufftDestroy(smallPlan);
    cufftDestroy(bigPlan);
    cudaFree(d_input);
    cudaFree(d_output);
    free(h_input);
    free(h_output_small);
    free(h_output_big);

    system("pause");
    return 0;
}