Hi,
I am running SDK 3.2 on Ubuntu 10.04 with a GTX 460. I perform some tests of FFT 1D R2C on 16384 samples.
At first I was doing simply
-
Memcpy Host to Device
-
R2C FFT (NX=16384 in batch of 50)
-
Memcpy Device to Host
Ok, as you can no doubt imagine, most of the time is spent in data transfer. So, first I used cudaHostAlloc with the
cudaHostAllocPortable flag to turn to pinned data. The gain is already nice from 60usec to 40usec (per FFT + 2 data transfers).
Then, I looked at streams. Well, I have written a small piece of code (see below). However, I do not see any improvement switching from 1 stream to 2, 5 or 10.
Maybe my code is not well written. Do you have any comments on it?
Thanks
JE
/*
JEC 27/11/10 v5
Streams
*/
/* Example showing the use of CUFFT for fast 1D-convolution using FFT. */
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
// includes, project
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cutil_inline.h>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
int runTest(int argc, char** argv);
unsigned int delay(const char* tag, struct timeval& time_start,struct timeval& time_stop);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Program entry point: run the stream benchmark and propagate its status.
int main(int argc, char** argv) {
    int status = runTest(argc, argv);
    // Destroy the active GPU context so profilers see a clean exit.
    // (cudaThreadExit is the CUDA 3.x API; cudaDeviceReset replaces it in 4.0+.)
    cudaThreadExit();
    // Previously runTest's result was discarded and main fell off the end,
    // so the process always exited 0 even on failure.
    return status;
}
///////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
// Runs NSTREAMS concurrent batched 1D R2C FFTs (NX samples each) with
// asynchronous H2D/D2H copies, and prints the average time per FFT.
// Returns 0 on success (including the "print usage" path), 1 on bad input.
int runTest(int argc, char** argv) {
// Number of CUDA streams to split the batched FFT work across.
size_t NSTREAMS = 5;
if (argc==2) {
//change number of streams on the command line
NSTREAMS=(size_t)(atoi(argv[1]));
if (NSTREAMS == 0) {
printf("number of streams must be >= 1\n");
return 1;
}
} else {
printf("usage: myExamCUFFT1Dv5 <number of streams>\n");
return 0;
}
cudaSetDevice( cutGetMaxGflopsDeviceId() );
size_t NX = 16384; //number of time samples per FFT
size_t WORKERS= 50/NSTREAMS; //number of FFTs (batch size) per stream
// Integer division drops the remainder: warn instead of silently running
// fewer than 50 FFTs when NSTREAMS does not divide 50.
if (50 % NSTREAMS != 0) {
printf("warning: %lu streams do not divide 50 FFTs evenly; running %lu FFTs\n",
(unsigned long)NSTREAMS, (unsigned long)(NSTREAMS*WORKERS));
}
size_t NFFT = NSTREAMS*WORKERS; //total number of FFTs actually executed
// One batched 1D R2C plan per stream.
cufftHandle plan[NSTREAMS];
const int DIM = 1;
int rank[DIM];
rank[0] = NX;
for (int sid=0; sid<NSTREAMS; sid++) {
// NULL inembed/onembed => data is packed/contiguous; the stride/dist
// arguments are then ignored by cuFFT.
cufftSafeCall(cufftPlanMany
(&plan[sid],DIM, rank,NULL,1,0,NULL,1,0,CUFFT_R2C, WORKERS)
);
}
// Create the streams and bind each plan to its own stream so that
// cufftExecR2C launches asynchronously into that stream.
cudaStream_t streams[NSTREAMS];
for (int sid=0; sid<NSTREAMS; sid++){
cutilSafeCall(cudaStreamCreate(&streams[sid]));
cufftSafeCall(cufftSetStream(plan[sid],streams[sid]));
// (no synchronize needed here: the stream is empty right after creation)
}
// Pinned (page-locked) host buffers: required for cudaMemcpyAsync to
// actually overlap copies with computation.
size_t NDATA = NX*WORKERS; //real samples per stream
printf("Total Memory in DATA %lu MBytes\n",
(unsigned long)(sizeof(cufftComplex)*NDATA/1000000));
cufftReal* data[NSTREAMS];
cufftComplex* hostPtrOut[NSTREAMS];
for (int sid=0; sid<NSTREAMS; sid++){
cutilSafeCall(cudaHostAlloc((void**)&data[sid],NDATA*sizeof(cufftReal),
cudaHostAllocPortable));
// NOTE: NX complex values per FFT are allocated although R2C only
// produces NX/2+1; this over-allocation is harmless and kept as-is.
cutilSafeCall(cudaHostAlloc((void**)&hostPtrOut[sid],NDATA*sizeof(cufftComplex),
cudaHostAllocPortable));
}
// source data creation
for (int sid=0; sid<NSTREAMS; sid++){
for(size_t i= 0 ; i < NDATA ; i++){
data[sid][i] = rand() / (float)RAND_MAX;
}
}
// Device buffers. cudaMalloc is synchronous and serializes the device,
// so allocate BEFORE the timed/overlapped region: in the previous version
// the allocations sat inside the per-stream async-copy loop, which
// prevented any copy/compute overlap and is the most likely reason adding
// streams showed no speedup. (Also note the GTX 460 has a single copy
// engine, so H2D and D2H copies cannot themselves overlap each other.)
cufftReal *devPtrIn[NSTREAMS];
cufftComplex *devPtrOut[NSTREAMS];
size_t memusedIn = sizeof(cufftReal)*NX*WORKERS;
size_t memusedOut = sizeof(cufftComplex)*NX*WORKERS;
for (int sid=0; sid<NSTREAMS; sid++){
cutilSafeCall(cudaMalloc((void**)&devPtrIn[sid], memusedIn));
cutilSafeCall(cudaMalloc((void**)&devPtrOut[sid],memusedOut));
}
struct timeval time_start, time_stop; //for profiling
gettimeofday(&time_start,NULL);
// Breadth-first issue order: all H2D copies, then all FFTs, then all
// D2H copies, so work in different streams can be interleaved.
for (int sid=0; sid<NSTREAMS; sid++){
cutilSafeCall(cudaMemcpyAsync(devPtrIn[sid], data[sid], memusedIn,
cudaMemcpyHostToDevice,streams[sid]));
}
// executes FFT processes (async: each plan is bound to its stream)
for (int sid=0; sid<NSTREAMS; sid++){
cufftSafeCall(cufftExecR2C(plan[sid], devPtrIn[sid], devPtrOut[sid]));
}
// transfer results from GPU memory
for (int sid=0; sid<NSTREAMS; sid++){
cutilSafeCall(cudaMemcpyAsync(hostPtrOut[sid], devPtrOut[sid], memusedOut,
cudaMemcpyDeviceToHost,streams[sid]));
}
// Wait for every stream to drain before stopping the clock; stream
// destruction is deferred until after timing (it was inside the timed
// region before).
for (int sid=0; sid<NSTREAMS; sid++){
cutilSafeCall(cudaStreamSynchronize(streams[sid]));
}
gettimeofday(&time_stop, NULL);
printf(" Total = %10.2f\n",delay("Host -> Device",time_start,time_stop)/(float)NFFT);
// Release all resources: streams, device buffers, plans, pinned host buffers.
for (int sid=0; sid<NSTREAMS; sid++){
cudaStreamDestroy(streams[sid]);
cutilSafeCall(cudaFree(devPtrIn[sid]));
cutilSafeCall(cudaFree(devPtrOut[sid]));
cufftSafeCall(cufftDestroy(plan[sid]));
cutilSafeCall(cudaFreeHost(data[sid]));
cutilSafeCall(cudaFreeHost(hostPtrOut[sid]));
}
return 0;
}//end runTest
// Elapsed time in microseconds between time_start and time_stop.
// `tag` is unused (kept so existing call sites keep compiling).
// Unlike the previous version, this does NOT modify the caller's structs and
// does not print anything: the old code destructively rewrote time_stop and
// emitted "adjust Stop" to stdout as a side effect of a pure timing query.
// Intervals longer than ~71 minutes overflow the unsigned int return value.
unsigned int delay(const char* tag, struct timeval& time_start,struct timeval& time_stop) {
(void)tag; // reserved for labelling; intentionally unused
// Signed arithmetic handles the microsecond borrow automatically:
// a negative usec difference is absorbed by the seconds term.
long sec  = (long)time_stop.tv_sec  - (long)time_start.tv_sec;
long usec = (long)time_stop.tv_usec - (long)time_start.tv_usec;
return (unsigned int)(sec * 1000000L + usec);
}