Hi there,
I am trying to implement a simple FFT transform using cuFFT with streams. Looks like I am getting incorrect results with more than 1 stream, while results are correct with 1 stream. This tells me there is something wrong with synchronization. I am using events. Please let me know what I could be doing wrong. The code is below.
Thanks so much!
#include <stdio.h>
#include <cufft.h>
#include <stdlib.h>
#include <complex>
using namespace std;
// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
double checkError(complex<double> *a,int n, int m) ;
double checkError(complex<double> *a,int n, int m)
{
int i,j;
complex<double> ans;
double d, diff = 0.0;
for(j=0;j<m;j++)
for(i=0;i<n;i++) {
if(i == j+1)
ans = complex<double> (0.0,-0.5*n);
else if(i == n-1-j)
ans = complex<double> (0.0,0.5*n);
else
ans = 0.0;
d = abs(ans-a[i+j*n]);
if(d > diff) {
diff = d;
printf("diff=%lg, a=(%lg,%lg), ans=(%lg,%lg) at %d %d\n",diff,a[i+j*n].real(),a[i+j*n\
].imag(),ans.real(),ans.imag(),i,j);
}
}
return(diff);
}
float maxError(float *a, int n)
{
float maxE = 0;
for (int i = 0; i < n; i++) {
float error = fabs(a[i]-1.0f);
if (error > maxE) maxE = error;
}
return maxE;
}
int main(int argc, char **argv)
{
const int blockSize = 256, nStreams = 2;
// const int n = 4 * 1024 * blockSize * nStreams;
const int n = 4 * 16 * blockSize * nStreams;
const int streamSize = n / nStreams;
const int streamBytes = streamSize * sizeof(complex<double>);
const int bytes = n * sizeof(complex<double>);
int devId = 0;
if (argc > 1) devId = atoi(argv[1]);
cudaDeviceProp prop;
checkCuda( cudaGetDeviceProperties(&prop, devId));
printf("Device : %s\n", prop.name);
checkCuda( cudaSetDevice(devId) );
// allocate pinned host memory and device memory
complex<double> *a, *d_a,*b;
checkCuda( cudaMallocHost((void**)&a, bytes) ); // host pinned
checkCuda( cudaMallocHost((void**)&b, bytes) ); // host pinned
checkCuda( cudaMalloc((void**)&d_a, bytes) ); // device
float ms; // elapsed time in milliseconds
// create events and streams
cudaEvent_t startEvent, stopEvent, dummyEvent;
cudaStream_t stream[nStreams];
checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
checkCuda( cudaEventCreate(&dummyEvent) );
int *inembed=NULL;
int *onembed=NULL;
int idist = streamSize;
int istride=1;
int odist = streamSize;
int ostride=1;
int batch = nStreams;
double twopi = atan(1.0)*8.0;
int status;
for (int i = 0; i < nStreams; ++i)
checkCuda( cudaStreamCreate(&stream[i]) );
for (int i = 0; i < nStreams; i++)
{
checkCuda(cudaStreamCreate(&stream[i]));
}
// create cufft plans and set them in streams
cufftHandle* fftPlans = (cufftHandle*)malloc(sizeof(cufftHandle)*nStreams);
for(int i = 0; i < nStreams; i++)
{
status = cufftPlanMany(&fftPlans[i],1,(int *) &n,inembed,istride,idist,onembed,ostride,odist,CU\
FFT_Z2Z,batch/nStreams);
if (status != CUFFT_SUCCESS)
{
printf("Cufft FFT plan error: %d\n", status);
}
cufftSetStream(fftPlans[i],stream[i]);
}
// asynchronous version 1: loop over {copy, kernel, copy}
for(int j=0;j<batch;j++)
for(int i=0;i<streamSize;i++)
a[i+j*streamSize] = complex<double> (sin(twopi*i*(j+1)/streamSize),0.0);
checkCuda( cudaEventRecord(startEvent,0) );
for (int i = 0; i < nStreams; ++i) {
int offset = i * streamSize;
checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
streamBytes, cudaMemcpyHostToDevice,
stream[i]) );
status = cufftExecZ2Z(fftPlans[i],(cufftDoubleComplex *) &d_a[offset],(cufftDoubleComplex *) &d_a\
[offset],CUFFT_FORWARD);
if (status != CUFFT_SUCCESS)
printf("Cufft FFT work error: %d\n",status);
checkCuda( cudaMemcpyAsync(&b[offset], &d_a[offset],
streamBytes, cudaMemcpyDeviceToHost,
stream[i]) );
checkCuda( cudaEventRecord(stopEvent,stream[i]));
}
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
printf(" max error: %lg\n", checkError(b, streamSize,batch));
// cleanup
checkCuda( cudaEventDestroy(startEvent) );
checkCuda( cudaEventDestroy(stopEvent) );
checkCuda( cudaEventDestroy(dummyEvent) );
for (int i = 0; i < nStreams; ++i)
checkCuda( cudaStreamDestroy(stream[i]) );
cudaFree(d_a);
cudaFreeHost(a);
return 0;
}