cuFFT, MemcpyAsync = gain? How to use streams

Hi,

I am running SDK 3.2 on Ubuntu 10.04 with a GTX 460, and I am doing some tests of 1D R2C FFTs on 16384 samples.

At first I was simply doing:

  1. Memcpy Host to Device

  2. R2C FFT (NX=16384 in a batch of 50)

  3. Memcpy Device to Host

As you can no doubt imagine, most of the time is spent in the data transfers. So first I used cudaHostAlloc with the

cudaHostAllocPortable flag to switch to pinned memory. The gain is already nice: from 60 usec down to 40 usec (per FFT plus its two data transfers).
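For reference, the baseline version (steps 1-3 above, no streams) boiled down to something like this. This is just a minimal sketch: h_in, h_out, d_in, d_out and BATCH are illustrative names, not the ones used in the full program below.

  int NX = 16384, BATCH = 50;
  cufftReal*    h_in;   cufftReal*    d_in;
  cufftComplex* h_out;  cufftComplex* d_out;
  // pinned host buffers (this alone brought 60 usec -> 40 usec per FFT)
  cudaHostAlloc((void**)&h_in,  NX*BATCH*sizeof(cufftReal),    cudaHostAllocPortable);
  cudaHostAlloc((void**)&h_out, NX*BATCH*sizeof(cufftComplex), cudaHostAllocPortable);
  cudaMalloc((void**)&d_in,  NX*BATCH*sizeof(cufftReal));
  cudaMalloc((void**)&d_out, NX*BATCH*sizeof(cufftComplex));

  cufftHandle plan;
  cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH);  // one batched plan for all 50 FFTs

  // 1. Memcpy Host to Device
  cudaMemcpy(d_in, h_in, NX*BATCH*sizeof(cufftReal), cudaMemcpyHostToDevice);
  // 2. R2C FFT (NX=16384 in a batch of 50)
  cufftExecR2C(plan, d_in, d_out);
  // 3. Memcpy Device to Host
  cudaMemcpy(h_out, d_out, NX*BATCH*sizeof(cufftComplex), cudaMemcpyDeviceToHost);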

Then I looked at streams and wrote a small test program (see below). However, I do not see any improvement switching from 1 stream to 2, 5, or 10.

Maybe my code is not well written. Do you have any comments on it?

Thanks

JE

PS: have a look at the second edit (right after the first code sample).

/*
  JEC 27/11/10 v5
  Streams
*/
/* Example timing batched 1D R2C FFTs with CUFFT, async copies, and streams. */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>

// includes, project
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cutil_inline.h>

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
int runTest(int argc, char** argv);
unsigned int delay(const char* tag, struct timeval& time_start, struct timeval& time_stop);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv) {
  runTest(argc, argv);
  // destroy the active GPU context
  cudaThreadExit();
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
int runTest(int argc, char** argv) {
  // number of streams, settable on the command line
  size_t NSTREAMS = 5;
  if (argc == 2) {
    NSTREAMS = (size_t)(atoi(argv[1]));
  } else {
    printf("usage: myExamCUFFT1Dv5 <number of streams>\n");
    return 0;
  }

  cudaSetDevice( cutGetMaxGflopsDeviceId() );

  size_t NX      = 16384;          // number of time samples per FFT
  size_t WORKERS = 50/NSTREAMS;    // FFTs (batch size) per stream; integer
                                   // division, so NSTREAMS should divide 50
  size_t NFFT    = NSTREAMS*WORKERS;

  // create one batched 1D FFT plan per stream
  cufftHandle plan[NSTREAMS];
  int DIM = 1;
  int rank[DIM];
  rank[0] = NX;
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cufftSafeCall(cufftPlanMany(&plan[sid], DIM, rank,
                                NULL, 1, 0, NULL, 1, 0,
                                CUFFT_R2C, WORKERS));
  }

  // create the streams and attach one plan to each
  cudaStream_t streams[NSTREAMS];
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cudaStreamCreate(&streams[sid]);
    cufftSetStream(plan[sid], streams[sid]);
    cudaStreamSynchronize(streams[sid]);
  }

  // host arrays, pinned so that cudaMemcpyAsync can really be asynchronous
  size_t NDATA = NX*WORKERS;
  printf("Total Memory in DATA %zu MBytes\n", sizeof(cufftComplex)*NDATA/1000000);

  cufftReal*    data[NSTREAMS];
  cufftComplex* hostPtrOut[NSTREAMS];
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cudaHostAlloc((void**)&data[sid],       NDATA*sizeof(cufftReal),
                  cudaHostAllocPortable);
    // R2C only needs (NX/2+1) complex values per transform; the full NX
    // is allocated here for simplicity
    cudaHostAlloc((void**)&hostPtrOut[sid], NDATA*sizeof(cufftComplex),
                  cudaHostAllocPortable);
  }

  // source data creation
  for (int sid = 0; sid < NSTREAMS; sid++) {
    for (size_t i = 0; i < NDATA; i++) {
      data[sid][i] = rand() / (float)RAND_MAX;
    }
  }

  struct timeval time_start, time_stop; // for profiling
  gettimeofday(&time_start, NULL);

  // device array pointers
  cufftReal    *devPtrIn[NSTREAMS];
  cufftComplex *devPtrOut[NSTREAMS];
  size_t memusedIn  = sizeof(cufftReal)*NX*WORKERS;
  size_t memusedOut = sizeof(cufftComplex)*NX*WORKERS;

  for (int sid = 0; sid < NSTREAMS; sid++) {
    // GPU memory allocation
    cutilSafeCall(cudaMalloc((void**)&devPtrIn[sid],  memusedIn));
    cutilSafeCall(cudaMalloc((void**)&devPtrOut[sid], memusedOut));
    // transfer to GPU memory
    cutilSafeCall(cudaMemcpyAsync(devPtrIn[sid], data[sid], memusedIn,
                                  cudaMemcpyHostToDevice, streams[sid]));
  }

  // execute the FFTs
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cufftSafeCall(cufftExecR2C(plan[sid], devPtrIn[sid], devPtrOut[sid]));
  }

  // transfer results from GPU memory
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cutilSafeCall(cudaMemcpyAsync(hostPtrOut[sid], devPtrOut[sid], memusedOut,
                                  cudaMemcpyDeviceToHost, streams[sid]));
  }

  // synchronize and destroy
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cudaStreamSynchronize(streams[sid]);
    cudaStreamDestroy(streams[sid]);
  }

  gettimeofday(&time_stop, NULL);
  printf(" Total per FFT      = %10.2f usec\n",
         delay("Total", time_start, time_stop)/(float)NFFT);

  // free GPU memory
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cutilSafeCall(cudaFree(devPtrIn[sid]));
    cutilSafeCall(cudaFree(devPtrOut[sid]));
  }

  // destroy the CUFFT plans
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cufftSafeCall(cufftDestroy(plan[sid]));
  }

  // free the pinned host buffers
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cutilSafeCall(cudaFreeHost(data[sid]));
    cutilSafeCall(cudaFreeHost(hostPtrOut[sid]));
  }

  return 0;
} // end runTest

// elapsed time in microseconds between time_start and time_stop
// (the tag argument is currently unused)
unsigned int delay(const char* tag, struct timeval& time_start, struct timeval& time_stop) {
  if (time_stop.tv_usec < time_start.tv_usec) {
    printf("adjust Stop\n");
    time_stop.tv_sec--;
    time_stop.tv_usec += 1000000;
  }
  time_stop.tv_sec  -= time_start.tv_sec;
  time_stop.tv_usec -= time_start.tv_usec;
  return time_stop.tv_sec*1000000 + time_stop.tv_usec;
}
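For completeness, this is roughly how I build and run it (the cutil library name and the SDK paths may differ on your installation):

  nvcc myExamCUFFT1Dv5.cu -o myExamCUFFT1Dv5 \
      -I$CUDA_SDK/C/common/inc -L$CUDA_SDK/C/lib -lcutil_x86_64 -lcufft
  ./myExamCUFFT1Dv5 5    # 5 streams, i.e. 10 FFTs per stream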

Compared to the previous code, I get a 10% performance improvement by staging the work the way the simpleMultiCopy SDK example does (see the code below).

Also, the time per FFT decreases from 1 stream up to 5 streams, but beyond that there is no gain in using more streams.

Any idea what explains this behavior?

//////////////// START
  // start_event and the cycleDone[] events are cudaEvent_t created (and
  // recorded once) beforehand, as in the simpleMultiCopy SDK example
  cudaEventRecord(start_event, 0);

  int current_stream = 0;

  // upload data for the first stream
  cutilSafeCall(cudaMemcpyAsync(devPtrIn[0], data[0], memusedIn,
                                cudaMemcpyHostToDevice, streams[0]));

  int next_stream = 0; // initialized in case NSTREAMS == 1
  for (current_stream = 0; current_stream < NSTREAMS-1; ++current_stream) {
    next_stream = current_stream + 1;

    // ensure that processing and copying of the last cycle have finished
    cudaEventSynchronize(cycleDone[next_stream]);

    // execute FFT on current_stream
    cufftSafeCall(cufftExecR2C(plan[current_stream],
                               devPtrIn[current_stream],
                               devPtrOut[current_stream]));

    // upload Host -> Device for next_stream
    cutilSafeCall(cudaMemcpyAsync(devPtrIn[next_stream],
                                  data[next_stream],
                                  memusedIn,
                                  cudaMemcpyHostToDevice,
                                  streams[next_stream]));

    // download Device -> Host for current_stream
    cutilSafeCall(cudaMemcpyAsync(hostPtrOut[current_stream],
                                  devPtrOut[current_stream],
                                  memusedOut,
                                  cudaMemcpyDeviceToHost,
                                  streams[current_stream]));

    cutilSafeCall(cudaEventRecord(cycleDone[current_stream],
                                  streams[current_stream]));
  }

  current_stream = next_stream; // the last one
  // execute FFT on the last stream
  cufftSafeCall(cufftExecR2C(plan[current_stream],
                             devPtrIn[current_stream],
                             devPtrOut[current_stream]));
  // download Device -> Host for the last stream
  cutilSafeCall(cudaMemcpyAsync(hostPtrOut[current_stream],
                                devPtrOut[current_stream],
                                memusedOut,
                                cudaMemcpyDeviceToHost,
                                streams[current_stream]));
  //////////////// END

  // synchronize and destroy
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cudaStreamSynchronize(streams[sid]);
    cudaStreamDestroy(streams[sid]);
  }

Hi,

I have had similar experiences when using the cuFFT functions together with CUDA streams. When measuring execution time, it seems that the FFT functions prevent concurrent asynchronous data transfers. Until now I assumed there was a bug in the cuFFT library.
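The kind of measurement I mean looks roughly like this (a minimal sketch reusing the names from the code above; the ev_start/ev_stop events are my own naming, not from that code):

  // bracket each stream's copy-FFT-copy sequence with events; if the work
  // of the streams really overlapped, the per-stream spans would overlap
  // and their sum would clearly exceed the wall-clock time
  cudaEvent_t ev_start[NSTREAMS], ev_stop[NSTREAMS];
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cudaEventCreate(&ev_start[sid]);
    cudaEventCreate(&ev_stop[sid]);

    cudaEventRecord(ev_start[sid], streams[sid]);
    cudaMemcpyAsync(devPtrIn[sid], data[sid], memusedIn,
                    cudaMemcpyHostToDevice, streams[sid]);
    cufftExecR2C(plan[sid], devPtrIn[sid], devPtrOut[sid]);
    cudaMemcpyAsync(hostPtrOut[sid], devPtrOut[sid], memusedOut,
                    cudaMemcpyDeviceToHost, streams[sid]);
    cudaEventRecord(ev_stop[sid], streams[sid]);
  }
  cudaThreadSynchronize();  // wait for all streams (CUDA 3.x API)

  float ms, sum_ms = 0.0f;
  for (int sid = 0; sid < NSTREAMS; sid++) {
    cudaEventElapsedTime(&ms, ev_start[sid], ev_stop[sid]);
    sum_ms += ms;
  }
  printf("sum of per-stream times: %.3f ms\n", sum_ms);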

Today I realized that an older CUDA version was installed on my computer. After updating to CUDA 3.2, everything works fine now.