I’m trying to overlap CUFFTs and cudaMemcpyAsync(), but I’m getting large fluctuations in throughput.
I’ve measured the raw time for 100 4k x 4k FFTs at a consistent 400ms. But once I add a cudaMemcpyAsync(), the time goes anywhere from 453 ms to 578 ms. I’m expecting some increase in time due to 2 bandwidth hungry functions competing for bandwidth, but I don’t think the variability should be so high (Visual profiler shows the cudaMemcpyAsync() bandwidth varying between 11.6 GiB/s on most calls to 1.5 GiB/s on a bad call. I’m using a GTX 690 in a 1U server, and running SUSE 11.
Can someone suggest what could be causing the variations and what can be done about them? Also, can someone run my code, preferably a Tesla Kepler (i.e. GTX K10 or GTX K20), and see if the variability is also that big?
Thanks for helping
#include <iostream>
#include <cufft.h>
#include <cuda_runtime.h>
#include <vector>
using namespace std;
#define cudaCheckErrors(expr) { int e = expr; if (e != cudaSuccess) { printf("error %d\n", e); throw 0; } }
double Clock()
{
#ifdef _WIN32
LARGE_INTEGER t;
static double tick_interval = 0;
if (tick_interval == 0)
{
QueryPerformanceFrequency(&t);
tick_interval = 1.0 / t.QuadPart;
}
QueryPerformanceCounter(&t);
return t.QuadPart * tick_interval;
#else
timespec t;
clock_gettime(CLOCK_REALTIME, &t);
return t.tv_sec + t.tv_nsec * 0.000000001;
#endif
}
int main()
{
const int N = 4096;
cudaStream_t streams[3];
for (int i = 0; i < 3; ++i)
cudaCheckErrors(cudaStreamCreate(&streams[i]));
cufftHandle plan;
float *data, *outData[3];
float *copy[3];
cudaCheckErrors(cudaMalloc(&data, N * N * sizeof(float)));
for (int i = 0; i < 3; ++i)
{
cudaCheckErrors(cudaMalloc(&outData[i], N * (N + 2) * sizeof(float)));
cudaCheckErrors(cudaHostAlloc(©[i], N * N * sizeof(float), cudaHostAllocMapped));
}
cudaCheckErrors(cufftPlan2d(&plan, N, N, CUFFT_R2C));
int queueIndex = 0;
for (int warmup = 1; warmup >= 0; --warmup)
{
cudaCheckErrors(cudaThreadSynchronize());
double t0 = Clock();
for (int i = -2; i < 100; ++i)
{
// 3 stage pipeline: iteration i issues i + 2 FFT, does nothing for i + 1 FFT, and waits on ith FFT
int nextIndex = (queueIndex + 2) % 3;
if (i + 2 < 100)
{
//cout << "issue " << streams[nextIndex] << endl;
cudaCheckErrors(cufftSetStream(plan, streams[nextIndex]));
cudaCheckErrors(cufftExecR2C(plan, data, (cufftComplex *)outData[nextIndex]));
cudaCheckErrors(cudaMemcpyAsync(copy[nextIndex], outData[nextIndex], N * N * 2, cudaMemcpyDeviceToHost, streams[nextIndex]));
}
if (i >= 0)
{
//cout << "waiting on " << streams[queueIndex] << endl;
//cudaCheckErrors(cudaStreamSynchronize(streams[queueIndex]));
}
queueIndex = (queueIndex + 1) % 3;
}
cudaCheckErrors(cudaThreadSynchronize());
if (!warmup)
cout << Clock() - t0 << endl;
}
cudaDeviceReset();
}