EDIT: GeForce 8800 GTS, GeForce 8800 GTX, GeForce 8800 Ultra, and Quadro FX5600 results are now posted below.
Although not a G8x owner myself (yet!), I am very interested to know how quickly an 8800GTX could perform 1D FFTs with 128K elements, now that the 16K limit has been removed.
For the benefit of all, I’ve written the attached benchmarking tool and invite anyone with an 8800-series card to run it and post your results.
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <cufft.h>
#include <cutil.h>
#define MIN_NX 1024
#define MAX_NX 262144
#define MIN_BATCH 1
#define MAX_BATCH 64
#define FFTS_PER_TEST 32
int main(int argc, char **argv)
{
// force the program to run on a single processor
DWORD processAffinityMask;
DWORD systemAffinityMask;
GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask);
if (processAffinityMask!=1)
{
SetProcessAffinityMask(GetCurrentProcess(), 1);
system(argv[0]);
return 0;
}
// introduce the program
printf("\nCUFFT BENCHMARKING TOOL v1.0\n\n");
printf("This program evaluates the utility of using CUDA devices as\n");
printf("FFT coprocessors for digital signal processing applications.\n");
printf("Each table entry is an estimate of the maximum number of FFTs\n");
printf("that can be performed per second, considering both the time\n");
printf("needed to calculate the FFTs and the time needed to copy data\n");
printf("to and from the CUDA device.\n\n");
printf("To minimize interference from the OS and other programs, each\n");
printf("estimate is based on the fastest of %i identical calculations.\n", FFTS_PER_TEST);
// perform CUDA device initialization
CUT_DEVICE_INIT();
// display CUDA device info
int deviceCount;
CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
for (int dev = 0; dev < deviceCount; ++dev)
{
cudaDeviceProp deviceProp;
CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev));
printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
printf(" Major revision number: %d\n", deviceProp.major);
printf(" Minor revision number: %d\n", deviceProp.minor);
printf(" Total amount of global memory: %d bytes\n", deviceProp.totalGlobalMem);
printf(" Clock rate: %d kilohertz\n", deviceProp.clockRate);
}
// initialize host PC arrays
int hostArrayLength = MAX_NX * MAX_BATCH;
cufftComplex *hostArrayA = (cufftComplex *)calloc(hostArrayLength, sizeof(cufftComplex));
cufftComplex *hostArrayB = (cufftComplex *)calloc(hostArrayLength, sizeof(cufftComplex));
float *element = (float *)hostArrayA;
float *elementLimit = (float *)(hostArrayA+hostArrayLength);
while (element<elementLimit)
*element++ = rand();
// run timing tests for in-place and out-of-place FFTs
for (int out_of_place=0; out_of_place<2; ++out_of_place)
{
// print table headings
printf("\n");
printf("--------+--------"); for (int batch=MIN_BATCH+1; batch<MAX_BATCH+1; batch*=4) printf("---------", batch); printf("\n");
if (out_of_place)
printf("1D Complex-to-Complex Out-of-Place FFTs\n");
else
printf("1D Complex-to-Complex In-Place FFTs\n");
printf("--------+--------"); for (int batch=MIN_BATCH+1; batch<MAX_BATCH+1; batch*=4) printf("---------", batch); printf("\n");
printf(" nx | batch\n");
printf(" "); for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4) printf("+--------", batch); printf("\n");
printf(" "); for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4) printf("|%8i", batch); printf("\n");
printf("--------"); for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4) printf("+--------", batch); printf("\n");
// run timing tests for a variety of FFT array lengths
for (int nx=MIN_NX; nx<MAX_NX+1; nx*=2)
{
printf("%8i", nx);
// run timing tests for a variety of batch settings
for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4)
{
// generate CUFFT plan
cufftHandle plan;
CUFFT_SAFE_CALL(cufftPlan1d(&plan, nx, CUFFT_C2C, batch));
// allocate arrays on host PC and CUDA device, fill host array with random data
size_t arraySize = sizeof(cufftComplex) * nx * batch;
cufftComplex *deviceArrayA;
cufftComplex *deviceArrayB;
CUDA_SAFE_CALL(cudaMalloc((void**)&deviceArrayA, arraySize));
if (out_of_place)
CUDA_SAFE_CALL(cudaMalloc((void**)&deviceArrayB, arraySize));
// run a series of identical timing tests, looking for the fastest one (the one with the least OS interference)
int fastestRateFound = 0;
char *spinner = "|/-\";
int spindex = 0;
for (int rep=0; rep<FFTS_PER_TEST; ++rep)
{
__int64 startCount;
__int64 stopCount;
__int64 countsPerSec;
QueryPerformanceFrequency((LARGE_INTEGER *)&countsPerSec);
if (out_of_place)
{
QueryPerformanceCounter((LARGE_INTEGER *)&startCount);
CUDA_SAFE_CALL(cudaMemcpy(deviceArrayA, hostArrayA, arraySize, cudaMemcpyHostToDevice));
CUFFT_SAFE_CALL(cufftExecC2C(plan, deviceArrayA, deviceArrayB, CUFFT_FORWARD));
CUDA_SAFE_CALL(cudaMemcpy(hostArrayB, deviceArrayB, arraySize, cudaMemcpyDeviceToHost));
QueryPerformanceCounter((LARGE_INTEGER *)&stopCount);
}
else
{
QueryPerformanceCounter((LARGE_INTEGER *)&startCount);
CUDA_SAFE_CALL(cudaMemcpy(deviceArrayA, hostArrayA, arraySize, cudaMemcpyHostToDevice));
CUFFT_SAFE_CALL(cufftExecC2C(plan, deviceArrayA, deviceArrayA, CUFFT_FORWARD));
CUDA_SAFE_CALL(cudaMemcpy(hostArrayB, deviceArrayA, arraySize, cudaMemcpyDeviceToHost));
QueryPerformanceCounter((LARGE_INTEGER *)&stopCount);
}
int fftsPerSec = (int)( (countsPerSec*batch) / (stopCount-startCount) );
if (fastestRateFound<fftsPerSec)
fastestRateFound = fftsPerSec;
// provide some entertainment
printf("%c\b", spinner[spindex++]);
if (spindex==4)
spindex = 0;
}
printf("|%8i", fastestRateFound);
CUFFT_SAFE_CALL(cufftDestroy(plan));
CUDA_SAFE_CALL(cudaFree(deviceArrayA));
if (out_of_place)
CUDA_SAFE_CALL(cudaFree(deviceArrayB));
}
printf("\n");
}
printf("--------+--------"); for (int batch=MIN_BATCH+1; batch<MAX_BATCH+1; batch*=4) printf("+--------", batch); printf("\n");
}
free(hostArrayA);
free(hostArrayB);
printf("\nPress ENTER to exit...\n");
fflush( stdout);
fflush( stderr);
getchar();
exit(EXIT_SUCCESS);
}