cufft1D and MemcpyAsync = gain? optimization of Host/Device transfert

jean-eric · November 28, 2010, 4:41pm

Hi,

I am running SDK 3.2 on Ubuntu 10.04 with a GTX 460. I perform some tests of FFT 1D R2C on 16384 samples.

At first I was doing simply

Memcpy Host to Device
R2C FFT (NX=16384 in batch of 50)
Memcpy Debice to Host

Ok, no doubt that you imagine the result most of the time is spent in data transfert. So, first I have used cudaHostAlloc with

cudaHostAllocPortable flag to turn to pinned data. The gain is already nice from 60usec to 40usec (per FFT + 2 data transfers).

Then, I have looked at streams. Well, I have write a small code (see below). However I do not see any improvments switching from 1 stream to 2, 5 or10.

May be my code is not well written. Have you some comments on it?

Thanks

JE

/*

  JEC 27/11/10 v5

  Streams

*/

/* Example showing the use of CUFFT for fast 1D-convolution using FFT. */

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <sys/time.h>

// includes, project

#include <cuda.h>

#include <cuda_runtime.h>

#include <cufft.h>

#include <cutil_inline.h>

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

int runTest(int argc, char** argv);

unsigned int delay(const char* tag, struct timeval& time_start,struct timeval& time_stop);

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int main(int argc, char** argv) {

  runTest(argc, argv);

//Destroy active GPU context

  cudaThreadExit();

}

///////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////

//! Run a simple test for CUDA

////////////////////////////////////////////////////////////////////////////////

int runTest(int argc, char** argv) {

//Create Streams

  size_t NSTREAMS = 5;  

if (argc==2) {

    //change number of workers in cmd line

    NSTREAMS=(size_t)(atoi(argv[1]));

  } else {

    printf("usage: myExamCUFFT1Dv5 <nomber of streams>\n");

    return 0;

  }

cudaSetDevice( cutGetMaxGflopsDeviceId() );

size_t NX     = 16384; //number of time samplings

  size_t WORKERS= 50/NSTREAMS;    //number of workers per stream

size_t NFFT = NSTREAMS*WORKERS; 

cufftHandle plan[NSTREAMS];

// creates 1D FFT plan 

  int DIM = 1;

  int rank[DIM];

  rank[0] = NX;

  for (int sid=0; sid<NSTREAMS; sid++) {

    cufftSafeCall(cufftPlanMany

		  (&plan[sid],DIM, rank,NULL,1,0,NULL,1,0,CUFFT_R2C, WORKERS)

		  );

  }

cudaStream_t streams[NSTREAMS];

  for (int sid=0; sid<NSTREAMS; sid++){

    cudaStreamCreate(&streams[sid]);

    cufftSetStream(plan[sid],streams[sid]);

    cudaStreamSynchronize(streams[sid]);

  }

// Host Array 

  size_t NDATA = NX*WORKERS;

  printf("Total Memory in DATA %d MBytes\n",sizeof(cufftComplex)*NDATA/1000000);

cufftReal* data[NSTREAMS];

  cufftComplex* hostPtrOut[NSTREAMS];

	

for (int sid=0; sid<NSTREAMS; sid++){

    cudaHostAlloc((void**)&data[sid],NDATA*sizeof(cufftReal),

		  cudaHostAllocPortable);

    cudaHostAlloc((void**)&hostPtrOut[sid],NDATA*sizeof(cufftComplex),

		  cudaHostAllocPortable);

  }

// source data creation 

  for (int sid=0; sid<NSTREAMS; sid++){

    for(size_t i= 0 ; i < NDATA ; i++){

      data[sid][i] = rand() / (float)RAND_MAX;

    }

  }

struct timeval time_start, time_stop; //for profiling

gettimeofday(&time_start,NULL);

// Device Array Ptr 

  cufftReal *devPtrIn[NSTREAMS];

  cufftComplex *devPtrOut[NSTREAMS];

  size_t memusedIn  = sizeof(cufftReal)*NX*WORKERS;

  size_t memusedOut = sizeof(cufftComplex)*NX*WORKERS;

for (int sid=0; sid<NSTREAMS; sid++){

    // GPU memory allocation 

    cutilSafeCall(cudaMalloc((void**)&devPtrIn[sid], memusedIn));

    cutilSafeCall(cudaMalloc((void**)&devPtrOut[sid],memusedOut));

// transfer to GPU memory 

    cutilSafeCall(cudaMemcpyAsync(devPtrIn[sid], data[sid], memusedIn, 

				  cudaMemcpyHostToDevice,streams[sid]));

  }

// executes FFT processes 

  for (int sid=0; sid<NSTREAMS; sid++){

    cufftSafeCall(cufftExecR2C(plan[sid], devPtrIn[sid], devPtrOut[sid]));

  }

// transfer results from GPU memory 

for (int sid=0; sid<NSTREAMS; sid++){

    cutilSafeCall(cudaMemcpyAsync(hostPtrOut[sid], devPtrOut[sid], memusedOut,

				  cudaMemcpyDeviceToHost,streams[sid]));

  }  

// Synchronize and destroy

  for (int sid=0; sid<NSTREAMS; sid++){

    cudaStreamSynchronize(streams[sid]);

    cudaStreamDestroy(streams[sid]);

  }

gettimeofday(&time_stop, NULL);

printf(" Total              = %10.2f\n",delay("Host -> Device",time_start,time_stop)/(float)NFFT);

// frees GPU memory 

  for (int sid=0; sid<NSTREAMS; sid++){

    cutilSafeCall(cudaFree(devPtrIn[sid]));

    cutilSafeCall(cudaFree(devPtrOut[sid]));

  }

// deletes CUFFT plan 

  for (int sid=0; sid<NSTREAMS; sid++){  

    cufftSafeCall(cufftDestroy(plan[sid]));

  }

	  // delete data

  for (int sid=0; sid<NSTREAMS; sid++){  

    cutilSafeCall(cudaFreeHost(data[sid]));

    cutilSafeCall(cudaFreeHost(hostPtrOut[sid]));

  }

return 0;

}//end runTest

unsigned int delay(const char* tag, struct timeval& time_start,struct timeval& time_stop) {

if(time_stop.tv_usec < time_start.tv_usec){

    printf("adjust Stop\n");

    time_stop.tv_sec--;

    time_stop.tv_usec +=1000000;

  }

time_stop.tv_sec  -= time_start.tv_sec;

  time_stop.tv_usec -= time_start.tv_usec;

return time_stop.tv_sec*1000000+time_stop.tv_usec;

}

jean-eric · November 28, 2010, 4:41pm

Hi,