Implementing cuFFT with streams problem

Hi there,
I am trying to implement a simple FFT transform using cuFFT with streams. Looks like I am getting incorrect results with more than 1 stream, while results are correct with 1 stream. This tells me there is something wrong with synchronization. I am using events. Please let me know what I could be doing wrong. The code is below.
Thanks so much!

#include <stdio.h>
#include <cufft.h>
#include <stdlib.h>
#include <complex>

using namespace std;


// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
#endif
  return result;
}


double checkError(complex<double> *a,int n, int m) ;
double checkError(complex<double> *a,int n, int m)
{
        int i,j;
        complex<double> ans;
        double d, diff = 0.0;

        for(j=0;j<m;j++)
          for(i=0;i<n;i++) {
             if(i == j+1)
                ans = complex<double> (0.0,-0.5*n);
             else if(i == n-1-j)
                ans = complex<double> (0.0,0.5*n);
             else
                ans = 0.0;

             d = abs(ans-a[i+j*n]);
             if(d > diff) {
                diff = d;
                printf("diff=%lg, a=(%lg,%lg), ans=(%lg,%lg) at %d %d\n",diff,a[i+j*n].real(),a[i+j*n\
].imag(),ans.real(),ans.imag(),i,j);
}
          }
          return(diff);
}


float maxError(float *a, int n)
{
  float maxE = 0;
  for (int i = 0; i < n; i++) {
    float error = fabs(a[i]-1.0f);
    if (error > maxE) maxE = error;
  }
  return maxE;
}

int main(int argc, char **argv)
{
  const int blockSize = 256, nStreams = 2;
//  const int n = 4 * 1024 * blockSize * nStreams;
  const int n = 4 * 16 * blockSize * nStreams;
  const int streamSize = n / nStreams;
  const int streamBytes = streamSize * sizeof(complex<double>);
  const int bytes = n * sizeof(complex<double>);

  int devId = 0;
  if (argc > 1) devId = atoi(argv[1]);
  cudaDeviceProp prop;
  checkCuda( cudaGetDeviceProperties(&prop, devId));
  printf("Device : %s\n", prop.name);
  checkCuda( cudaSetDevice(devId) );

  // allocate pinned host memory and device memory
  complex<double> *a, *d_a,*b;
  checkCuda( cudaMallocHost((void**)&a, bytes) );      // host pinned
  checkCuda( cudaMallocHost((void**)&b, bytes) );      // host pinned
  checkCuda( cudaMalloc((void**)&d_a, bytes) ); // device

  float ms; // elapsed time in milliseconds

  // create events and streams
  cudaEvent_t startEvent, stopEvent, dummyEvent;
  cudaStream_t stream[nStreams];
  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );
  checkCuda( cudaEventCreate(&dummyEvent) );

  int *inembed=NULL;
  int *onembed=NULL;
  int idist = streamSize;
  int istride=1;
  int odist = streamSize;
  int ostride=1;
  int batch = nStreams;
  double twopi = atan(1.0)*8.0;
  int status;

  for (int i = 0; i < nStreams; ++i)
    checkCuda( cudaStreamCreate(&stream[i]) );

    for (int i = 0; i < nStreams; i++)
        {
                checkCuda(cudaStreamCreate(&stream[i]));
        }
        // create cufft         plans and set them in streams
   cufftHandle* fftPlans =       (cufftHandle*)malloc(sizeof(cufftHandle)*nStreams);
   for(int i = 0; i < nStreams; i++)
   {
      status = cufftPlanMany(&fftPlans[i],1,(int *) &n,inembed,istride,idist,onembed,ostride,odist,CU\
FFT_Z2Z,batch/nStreams);
if (status != CUFFT_SUCCESS)
           {
                        printf("Cufft FFT plan error: %d\n", status);
                             }
      cufftSetStream(fftPlans[i],stream[i]);
    }
  // asynchronous version 1: loop over {copy, kernel, copy}
  for(int j=0;j<batch;j++)
    for(int i=0;i<streamSize;i++)
       a[i+j*streamSize] = complex<double> (sin(twopi*i*(j+1)/streamSize),0.0);

  checkCuda( cudaEventRecord(startEvent,0) );
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
                               streamBytes, cudaMemcpyHostToDevice,
                               stream[i]) );
    status = cufftExecZ2Z(fftPlans[i],(cufftDoubleComplex *) &d_a[offset],(cufftDoubleComplex *) &d_a\
[offset],CUFFT_FORWARD);
    if (status != CUFFT_SUCCESS)
       printf("Cufft FFT work error: %d\n",status);
    checkCuda( cudaMemcpyAsync(&b[offset], &d_a[offset],
                               streamBytes, cudaMemcpyDeviceToHost,
                               stream[i]) );
    checkCuda( cudaEventRecord(stopEvent,stream[i]));
  }
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
  printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
  printf("  max error: %lg\n", checkError(b, streamSize,batch));
  // cleanup
  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
  checkCuda( cudaEventDestroy(dummyEvent) );
  for (int i = 0; i < nStreams; ++i)
    checkCuda( cudaStreamDestroy(stream[i]) );
  cudaFree(d_a);
  cudaFreeHost(a);

  return 0;
}

I’m pretty sure you need to set n in cufftPlanMany to streamSize.

Device : NVIDIA TITAN RTX
Time for asynchronous V1 transfer and execute (ms): 0.346048
diff=1.84288e-15, a=(1.84288e-15,0), ans=(0,0) at 0 0
diff=9.50431e-13, a=(-9.50431e-13,-8192), ans=(0,-8192) at 1 0
diff=9.55542e-13, a=(-2.93052e-13,-9.09495e-13), ans=(0,0) at 8191 0
diff=2.01511e-12, a=(-2.01511e-12,-8192), ans=(0,-8192) at 2 1
  max error: 2.01511e-12

For examples: GitHub - mnicely/cufft_examples: cuFFT and cuFFTDx example

Thank you, this solves it.