There is a problem running many FFTs using CUFFT. After running approximately 400,000 2D FFTs, the CUDA program crashes. Sometimes the crash requires unloading and reloading the kernel module, other times I can start another CUDA program immediately.
Here is some stripped-down code that shows the problem:
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cutil.h>
#include <cufft.h>
#include <iostream>
#include <sstream>
#include <iomanip>
using namespace std;
// Transform dimensions; main passes these to testFFT as nx and ny.
int nxfft = 256;
int nyfft = 64;
// Alternative configuration (disabled): 280 transforms per run, 1500 runs.
//int batchSize = 280;
//const int num_runs = 1500;
// Number of transforms executed inside each testFFT call.
int batchSize = 1;
// Number of iterations of main's loop; each iteration runs one forward and
// one inverse testFFT call.
const int num_runs = 1500 * 280;
// Width, in characters, of every column of the printed results table.
const int colwidth = 13;
///////////////////////////////////////////////////////////////////////////////
// Fills `size` elements of `array` with random x/y components in [0, 1].
void fillrand(float2* array, unsigned int size);
// Runs and times `count` in-place C2C transforms of size nx x ny in the given
// direction (CUFFT_FORWARD or CUFFT_INVERSE), printing one table row.
// Returns 0 on success, non-zero on failure.
int testFFT(int nx, int ny, int count, int direction);
///////////////////////////////////////////////////////////////////////////////
// Program entry point: prints the table header, then alternates forward and
// inverse testFFT calls for num_runs iterations. Terminates with exit code 1
// as soon as any testFFT call reports failure.
int main()
{
    cudaThreadSynchronize(); // forces CUDA runtime to initialize
    srand((int)time(NULL));

    // Header row: each title is left-aligned and padded with '-' to colwidth.
    static const char* const headings[] = {
        "-Run", "-Size", "-Direction", "-Time (ms)", "-Time/xform"
    };
    const int numHeadings = (int)(sizeof(headings) / sizeof(headings[0]));

    cout << setfill('-');
    for (int col = 0; col < numHeadings; ++col)
    {
        cout << setw(colwidth) << left << headings[col] << "|";
    }
    cout << setfill(' ');
    cout << endl;

    // Each run is a forward pass followed by an inverse pass.
    const int directions[2] = { CUFFT_FORWARD, CUFFT_INVERSE };

    int run = 0;
    while (run < num_runs)
    {
        for (int pass = 0; pass < 2; ++pass)
        {
            // The run number is shown only on the first row of the pair;
            // the second row gets a blank cell in that column.
            if (pass == 0)
            {
                cout << setw(colwidth) << left << run << "|";
            }
            else
            {
                cout << setw(colwidth) << left << " " << "|";
            }

            const int status = testFFT(nxfft, nyfft, batchSize, directions[pass]);
            if (status != 0)
            {
                cout << endl << "===FAILED===" << endl;
                exit(1);
            }
        }
        ++run;
    }

    return 0;
}
///////////////////////////////////////////////////////////////////////////////
int testFFT(int nx, int ny, int count, int direction)
{
ostringstream buf;
buf << nx << "x" << ny << "x" << count;
cout << setw(colwidth) << left << buf.str() << "|";
cout << setw(colwidth) << left << (direction == CUFFT_FORWARD ? "forward" : "inverse") << "|";
unsigned int timer = 0;
cutCreateTimer(&timer);
int size = nx * ny;
int byte_size = sizeof(float2) * size;
// Allocate and initalize host memory
float2* data = (float2*)malloc(byte_size * count);
if(data == NULL)
{
cout << "Allocating host buffer failed" << endl;
return 1;
}
fillrand(data, size * count);
// Allocate device memory and copy to device
float2* d_data;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_data, byte_size * count));
CUDA_SAFE_CALL(cudaMemcpy(d_data, data, byte_size * count, cudaMemcpyHostToDevice));
// Run the transform
cufftHandle cufftplan;
CUFFT_SAFE_CALL(cufftPlan2d(&cufftplan, ny, nx, CUFFT_C2C));
cutResetTimer(timer);
cutStartTimer(timer);
for(int n = 0; n < count; n++)
{
float2* d_ptr = d_data + n * nx * ny;
CUFFT_SAFE_CALL(cufftExecC2C(cufftplan, (cufftComplex*)d_ptr, (cufftComplex*)d_ptr, direction));
}
cudaThreadSynchronize();
cutStopTimer(timer);
cout << setw(colwidth) << left << fixed << cutGetTimerValue(timer) << "|";
cout << setw(colwidth) << left << fixed << cutGetTimerValue(timer) / (float)count << "|" << endl;
// cleanup
CUFFT_SAFE_CALL(cufftDestroy(cufftplan));
CUDA_SAFE_CALL(cudaFree(d_data));
free(data);
cutDeleteTimer(timer);
return 0;
}
///////////////////////////////////////////////////////////////////////////////
// Writes random values into the first `size` elements of `array`.
// For each element, x is drawn first and then y, each computed as
// rand() divided by RAND_MAX in single precision (so within [0, 1]).
// The sequence depends on the current rand() state; seeding is left to
// the caller.
void fillrand(float2* array, unsigned int size)
{
    const float scale = (float)RAND_MAX;
    float2* const end = array + size;

    for (float2* elem = array; elem != end; ++elem)
    {
        elem->x = rand() / scale;
        elem->y = rand() / scale;
    }
}
Snippet from program output at crash:
196803 |256x64x1 |forward |0.135000 |0.135000 |
|256x64x1 |inverse |0.136000 |0.136000 |
196804 |256x64x1 |forward |0.136000 |0.136000 |
|256x64x1 |inverse |0.135000 |0.135000 |
196805 |256x64x1 |forward |0.135000 |0.135000 |
|256x64x1 |inverse |0.136000 |0.136000 |
cufft: ERROR: execute.cu, line 992
cufft: ERROR: CUFFT_EXEC_FAILED
cufft: ERROR: execute.cu, line 286
cufft: ERROR: CUFFT_EXEC_FAILED
cufft: ERROR: cufft.cu, line 115
cufft: ERROR: CUFFT_EXEC_FAILED
196806 |256x64x1 |forward |1.785000 |1.785000 |
cufft: ERROR: plan.cu, line 41
cufft: ERROR: CUFFT_INTERNAL_ERROR
cufft: ERROR: context.cu, line 27
Aborted
When I start the program, dmesg shows:
NVRM: API mismatch: the client has the version 100.14.10, but
NVRM: this kernel module has the version 100.14.11. Please
NVRM: make sure that this kernel module and all NVIDIA driver
NVRM: components have the same version.
(I also get this message for CUDA programs that run normally.) After the crash, dmesg contains:
NVRM: Xid (0001:00): 13, 0001 00000000 000050c0 00000368 00000000 00000080
The system specs are:
Q6600
MSI P6N Diamond motherboard (nForce 680i)
4GB RAM
2 XFX 8800GTX
Ultra X3 1600W power supply
CentOS 4.4 (RHEL 4.4)
CUDA 1.0
This is a particular problem for me - over 50 million FFTs are executed during the run of my algorithm. Let me know if I’ve overlooked something obvious or if I need to submit this as a bug.
Jim