Hi all. I’m interested in doing 1D FFTs as quickly as possible, so I wrote some CUDA code to measure performance (in complex, 8-byte samples per second).
I’m not seeing the performance I’d like to be seeing – either my expectations are unrealistic, or I’m doing something wrong.
Pseudo code:
pts_per_fft = 256
num_batches = 128
num_iterations = 10000
start_timer()
for i in range(num_iterations):
    transfer pts_per_fft*num_batches complex samples from host mem to device mem
    perform in-place fft
    transfer pts_per_fft*num_batches complex samples from device mem to host mem
stop_timer()
I have two test machines. One is a MacBook Pro (GeForce 8600M GT, PCIe x16, 512MB), the other is an HP xw8600 containing a GTX280.
I have tried many different combinations of pts_per_fft and num_batches, and can’t seem to squeeze out more than 50MSamp/sec on the MacBook, or 115MSamp/sec on the GTX280. Is this about what I should expect?
Here is my code:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
// includes, project
#include <cufft.h>
#include <cutil.h>
// Complex sample type: a float2 whose .x and .y members hold the two components.
typedef float2 Complex;
// Number of points in each individual FFT (transform size passed to cufftPlan1d).
#define PTS_PER_FFT 256
// Number of FFTs per batch (batch count passed to cufftPlan1d).
#define NUM_BATCHES 1024
// Number of copy/FFT/copy iterations executed inside the timed loop.
#define NUM_ITER 2500
// Total number of complex samples in one signal buffer.
#define SIG_SIZE_PTS (PTS_PER_FFT*NUM_BATCHES)
// Size of one signal buffer in bytes.
#define SIG_SIZE_BYTES (sizeof(Complex)*SIG_SIZE_PTS)
/*
 * Returns a pseudo-random float obtained by dividing the next rand() value
 * by (RAND_MAX + 1), with every operand converted to float first.
 *
 * The result is nominally in [0, 1). Because the arithmetic is done in
 * single precision, a rand() value close to a large RAND_MAX can round up
 * so that the quotient is exactly 1.0f.
 */
float ranf() {
    const float numerator = (float)rand();
    const float denominator = (float)(RAND_MAX) + (float)(1);
    return numerator / denominator;
}
/*
 * Produces two independent, normally distributed pseudo-random values using
 * the Marsaglia polar method.
 *
 * A point (x1, x2) is drawn uniformly from the square [-1, 1] x [-1, 1] via
 * ranf() and redrawn until it lies strictly inside the unit circle and is not
 * the origin; the accepted point is then scaled by sqrt(-2*ln(w)/w), where
 * w = x1^2 + x2^2.
 *
 * r1, r2: output references that receive the two generated values.
 */
void getRandomGaussianPair(float &r1, float &r2) {
    float x1, x2, w;
    do {
        x1 = 2.0 * ranf() - 1.0;
        x2 = 2.0 * ranf() - 1.0;
        w = x1 * x1 + x2 * x2;
        // Reject points on or outside the unit circle, and also the exact
        // origin: with w == 0 the scale factor below becomes infinite and
        // 0 * inf would hand NaN back to the caller.
    } while ( w >= 1.0 || w == 0.0 );
    w = sqrt( (-2.0 * log( w ) ) / w );
    r1 = x1 * w;
    r2 = x2 * w;
}
/*
 * Fills `signal` with a noisy complex tone.
 *
 * Sample n has phase n * (2*pi*fWave/fSamp). Its .x member is the sine of
 * that phase and its .y member is the sine of the phase advanced by pi/2;
 * each member additionally receives one value from getRandomGaussianPair()
 * scaled by 0.1.
 *
 * signal: destination array with room for at least numPts samples.
 * numPts: number of samples to write; nothing is written when it is <= 0.
 * fSamp:  sampling frequency (same unit as fWave).
 * fWave:  tone frequency.
 */
void genSignal(Complex* signal, int numPts, double fSamp, double fWave) {
    const double phaseStep = 2*M_PI*fWave/fSamp;
    float noiseX, noiseY;
    for (int n = 0; n < numPts; ++n) {
        // Phase is recomputed from the index instead of being accumulated.
        const double theta = n*phaseStep;
        getRandomGaussianPair(noiseX, noiseY);
        Complex& sample = signal[n];
        sample.x = sin(theta) + 0.1*noiseX;
        sample.y = sin(theta + M_PI_2) + 0.1*noiseY;
    }
}
/*
 * Benchmark driver.
 *
 * Generates a noisy complex tone on the host, then NUM_ITER times:
 *   1. copies the SIG_SIZE_BYTES input buffer from host to device,
 *   2. runs an in-place batched forward C2C FFT on the device
 *      (NUM_BATCHES transforms of PTS_PER_FFT points each),
 *   3. copies the transformed buffer back to the host.
 * The elapsed time for the whole loop is printed afterwards.
 *
 * Results are copied into a separate host buffer (h_result) so that every
 * iteration transforms the same input. Writing the output back over the
 * input would feed each unnormalised FFT result into the next iteration and
 * overflow the data to inf/NaN after a few dozen iterations.
 *
 * Returns EXIT_SUCCESS when every allocation, copy and FFT succeeded, and
 * EXIT_FAILURE otherwise. All resources are released on every path.
 */
int main(int argc, char** argv)
{
    int i;
    int exitCode = EXIT_SUCCESS;
    cufftResult result;
    cudaError_t cudaStatus;
    cufftHandle plan;
    double t1, t2;
    Complex* d_signal = NULL;

    // Host buffers: h_signal is the (constant) input, h_result receives the spectra.
    // NOTE: these are pageable allocations; page-locked host memory
    // (cudaMallocHost) typically gives higher host<->device copy bandwidth.
    Complex* h_signal = (Complex*)malloc(SIG_SIZE_BYTES);
    Complex* h_result = (Complex*)malloc(SIG_SIZE_BYTES);
    if (h_signal == NULL || h_result == NULL) {
        printf("Problem encountered during host memory allocation\n");
        free(h_signal);   // free(NULL) is a no-op
        free(h_result);
        return EXIT_FAILURE;
    }

    //Create the FFT plan
    result = cufftPlan1d(&plan, PTS_PER_FFT, CUFFT_C2C, NUM_BATCHES);
    if (result != CUFFT_SUCCESS) {
        printf("Problem encountered during FFT Plan creation\n");
        free(h_signal);
        free(h_result);
        return EXIT_FAILURE;
    }

    // Fixed seed so the generated signal is reproducible from run to run.
    srand(1234);
    genSignal(h_signal, SIG_SIZE_PTS, 1.2e9, 1.2e9/16);

    // Allocate device memory for signal
    cudaStatus = cudaMalloc((void**)&d_signal, SIG_SIZE_BYTES);
    if (cudaStatus != cudaSuccess) {
        printf("Problem encountered during device memory allocation: %s\n",
               cudaGetErrorString(cudaStatus));
        cufftDestroy(plan);
        free(h_signal);
        free(h_result);
        return EXIT_FAILURE;
    }

    // NOTE: clock() is kept from the original benchmark; on POSIX systems it
    // reports processor time rather than wall-clock time.
    t1 = clock();
    for (i = 0; i < NUM_ITER; i++) {
        // Copy host memory to device
        cudaStatus = cudaMemcpy(d_signal, h_signal, SIG_SIZE_BYTES, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            printf("Problem encountered during host-to-device copy: %s\n",
                   cudaGetErrorString(cudaStatus));
            exitCode = EXIT_FAILURE;
            break;
        }

        //Perform FFT
        result = cufftExecC2C(plan, d_signal, d_signal, CUFFT_FORWARD);
        if (result != CUFFT_SUCCESS) {
            printf("Problem encountered during FFT Exec\n");
            exitCode = EXIT_FAILURE;
            break;
        }

        // Blocking copy back to the host; it also waits for the FFT to finish,
        // so the loop (and the timer) only advances once the result has arrived.
        cudaStatus = cudaMemcpy(h_result, d_signal, SIG_SIZE_BYTES, cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            printf("Problem encountered during device-to-host copy: %s\n",
                   cudaGetErrorString(cudaStatus));
            exitCode = EXIT_FAILURE;
            break;
        }
    }
    t2 = clock();

    // A timing taken over a failed run would be meaningless, so only report on success.
    if (exitCode == EXIT_SUCCESS) {
        printf("Elapsed time (secs): %f\n", (t2-t1)/CLOCKS_PER_SEC);
    }

    //Print freq domain data
    /*printf("Abs Freq domain data:\n");
    for (int b = 0; b < NUM_BATCHES; b++) {
        for (int k = 0; k < PTS_PER_FFT; k++) {
            int ind = b*PTS_PER_FFT + k;
            printf("%i\t%i\t%f\n", b, k, sqrt(h_result[ind].x*h_result[ind].x + h_result[ind].y*h_result[ind].y));
        }
    }*/

    //Clean up
    cudaFree(d_signal);
    cufftDestroy(plan);
    free(h_signal);
    free(h_result);
    return exitCode;
}