Hi,
I am running SDK 3.2 on Ubuntu 10.04 with a GTX 460, doing some tests of 1D R2C FFTs on 16384 samples.
At first I was simply doing:
- Memcpy Host to Device
- R2C FFT (NX=16384, in a batch of 50)
- Memcpy Device to Host
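In code, that first version was essentially this (a condensed sketch, not the exact program; h_in/h_out/d_in/d_out are just my names for buffers allocated elsewhere):
// Baseline sketch: one batched plan, blocking copies, default stream.
cufftHandle plan;
cufftPlan1d(&plan, 16384, CUFFT_R2C, 50); // 50 FFTs of 16384 points each
cudaMemcpy(d_in, h_in, 50*16384*sizeof(cufftReal),
cudaMemcpyHostToDevice);
cufftExecR2C(plan, d_in, d_out);
cudaMemcpy(h_out, d_out, 50*(16384/2+1)*sizeof(cufftComplex),
cudaMemcpyDeviceToHost);
cufftDestroy(plan);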
As you can no doubt imagine, most of the time is spent in the data transfers. So, first I used cudaHostAlloc with the cudaHostAllocPortable flag to switch to pinned memory. The gain is already nice: from 60 usec down to 40 usec (per FFT + 2 data transfers).
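The only change for the pinned version is the host allocation, something like:
// Page-locked host buffers: the DMA engine can read them directly
// (no internal staging copy), and pinned memory is also what makes
// cudaMemcpyAsync truly asynchronous later on.
cufftReal* h_in;
cufftComplex* h_out;
cudaHostAlloc((void**)&h_in, 50*16384*sizeof(cufftReal),
cudaHostAllocPortable);
cudaHostAlloc((void**)&h_out, 50*(16384/2+1)*sizeof(cufftComplex),
cudaHostAllocPortable);
// ... same copies and FFT as above ...
cudaFreeHost(h_in);
cudaFreeHost(h_out);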
Then I looked at streams. I wrote a small test program (see below), but I do not see any improvement when switching from 1 stream to 2, 5, or 10 streams.
Maybe my code is not well written. Do you have any comments on it?
Thanks
JE
PS: have a look at the second edit (right after the first code sample)
/*
JEC 27/11/10 v5
Streams
*/
/* Example: batched 1D R2C CUFFT transforms distributed over several streams. */
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
// includes, project
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cutil_inline.h>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
int runTest(int argc, char** argv);
unsigned int delay(const char* tag, struct timeval& time_start,struct timeval& time_stop);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv) {
runTest(argc, argv);
//Destroy active GPU context
cudaThreadExit();
}
///////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
int runTest(int argc, char** argv) {
//number of streams, taken from the command line
size_t NSTREAMS = 5;
if (argc==2) {
NSTREAMS = (size_t)(atoi(argv[1]));
} else {
printf("usage: myExamCUFFT1Dv5 <number of streams>\n");
return 0;
}
cudaSetDevice( cutGetMaxGflopsDeviceId() );
size_t NX = 16384; //number of time samples per FFT
size_t WORKERS = 50/NSTREAMS; //FFTs batched per stream (integer division)
size_t NFFT = NSTREAMS*WORKERS; //total number of FFTs actually launched
cufftHandle plan[NSTREAMS];
// create one batched 1D R2C plan per stream
int DIM = 1; //transform rank
int n[1];
n[0] = (int)NX; //transform size
for (int sid=0; sid<NSTREAMS; sid++) {
cufftSafeCall(cufftPlanMany
(&plan[sid],DIM, n,NULL,1,0,NULL,1,0,CUFFT_R2C, WORKERS)
);
}
cudaStream_t streams[NSTREAMS];
for (int sid=0; sid<NSTREAMS; sid++){
cudaStreamCreate(&streams[sid]);
cufftSetStream(plan[sid],streams[sid]);
}
// Host arrays (pinned)
size_t NDATA = NX*WORKERS; //samples per stream
printf("Output buffer per stream: %zu MBytes\n",sizeof(cufftComplex)*NDATA/1000000);
cufftReal* data[NSTREAMS];
cufftComplex* hostPtrOut[NSTREAMS];
for (int sid=0; sid<NSTREAMS; sid++){
cudaHostAlloc((void**)&data[sid],NDATA*sizeof(cufftReal),
cudaHostAllocPortable);
cudaHostAlloc((void**)&hostPtrOut[sid],NDATA*sizeof(cufftComplex),
cudaHostAllocPortable);
}
// source data creation
for (int sid=0; sid<NSTREAMS; sid++){
for(size_t i= 0 ; i < NDATA ; i++){
data[sid][i] = rand() / (float)RAND_MAX;
}
}
struct timeval time_start, time_stop; //for profiling
gettimeofday(&time_start,NULL);
// Device array pointers
cufftReal *devPtrIn[NSTREAMS];
cufftComplex *devPtrOut[NSTREAMS];
size_t memusedIn = sizeof(cufftReal)*NX*WORKERS;
// note: an R2C transform of NX real points only produces NX/2+1 complex
// outputs, so reserving NX per transform over-allocates (harmlessly)
size_t memusedOut = sizeof(cufftComplex)*NX*WORKERS;
for (int sid=0; sid<NSTREAMS; sid++){
// GPU memory allocation (beware: cudaMalloc is synchronous and blocks
// the device, so calling it here, inside the timed loop, serializes
// the asynchronous copies below)
cutilSafeCall(cudaMalloc((void**)&devPtrIn[sid], memusedIn));
cutilSafeCall(cudaMalloc((void**)&devPtrOut[sid],memusedOut));
// asynchronous transfer to GPU memory
cutilSafeCall(cudaMemcpyAsync(devPtrIn[sid], data[sid], memusedIn,
cudaMemcpyHostToDevice,streams[sid]));
}
// execute the batched FFT on each stream
for (int sid=0; sid<NSTREAMS; sid++){
cufftSafeCall(cufftExecR2C(plan[sid], devPtrIn[sid], devPtrOut[sid]));
}
// transfer results from GPU memory
for (int sid=0; sid<NSTREAMS; sid++){
cutilSafeCall(cudaMemcpyAsync(hostPtrOut[sid], devPtrOut[sid], memusedOut,
cudaMemcpyDeviceToHost,streams[sid]));
}
// Synchronize and destroy
for (int sid=0; sid<NSTREAMS; sid++){
cudaStreamSynchronize(streams[sid]);
cudaStreamDestroy(streams[sid]);
}
gettimeofday(&time_stop, NULL);
printf(" Total = %10.2f\n",delay("Host -> Device",time_start,time_stop)/(float)NFFT);
// frees GPU memory
for (int sid=0; sid<NSTREAMS; sid++){
cutilSafeCall(cudaFree(devPtrIn[sid]));
cutilSafeCall(cudaFree(devPtrOut[sid]));
}
// deletes CUFFT plan
for (int sid=0; sid<NSTREAMS; sid++){
cufftSafeCall(cufftDestroy(plan[sid]));
}
// delete data
for (int sid=0; sid<NSTREAMS; sid++){
cutilSafeCall(cudaFreeHost(data[sid]));
cutilSafeCall(cudaFreeHost(hostPtrOut[sid]));
}
return 0;
}//end runTest
// elapsed time in microseconds (modifies time_stop in place; tag is currently unused)
unsigned int delay(const char* tag, struct timeval& time_start, struct timeval& time_stop) {
if(time_stop.tv_usec < time_start.tv_usec){
printf("adjust Stop\n");
time_stop.tv_sec--;
time_stop.tv_usec +=1000000;
}
time_stop.tv_sec -= time_start.tv_sec;
time_stop.tv_usec -= time_start.tv_usec;
return time_stop.tv_sec*1000000+time_stop.tv_usec;
}
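By the way, I time with gettimeofday on the host; that is valid here because everything is synchronized before time_stop is taken. The same measurement could be done with CUDA events, along these lines (a sketch, not in the program above):
cudaEvent_t ev_start, ev_stop;
cudaEventCreate(&ev_start);
cudaEventCreate(&ev_stop);
cudaEventRecord(ev_start, 0);
// ... enqueue the copies and FFTs ...
cudaEventRecord(ev_stop, 0);
cudaEventSynchronize(ev_stop); // wait for all preceding work
float ms = 0.0f;
cudaEventElapsedTime(&ms, ev_start, ev_stop); // elapsed time in milliseconds
cudaEventDestroy(ev_start);
cudaEventDestroy(ev_stop);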
EDIT: Compared to the previous code, I get a 10% performance improvement by pipelining the work as in the simpleMultiCopy SDK example (see the code below). Also, the time per FFT decreases going from 1 stream to 5 streams, but beyond that there is no gain from using more streams.
Any idea to explain this behavior?
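The snippet below uses a start_event and per-stream cycleDone events that are created earlier in the program; I did not paste that part, but it is just:
// Setup assumed by the snippet below (not shown in this excerpt):
cudaEvent_t start_event;
cudaEvent_t cycleDone[NSTREAMS];
cutilSafeCall(cudaEventCreate(&start_event));
for (int sid=0; sid<NSTREAMS; sid++){
cutilSafeCall(cudaEventCreate(&cycleDone[sid]));
}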
//////////////// START
cudaEventRecord(start_event, 0);
int current_stream = 0;
//Upload Data for first Stream
cutilSafeCall(cudaMemcpyAsync(devPtrIn[0], data[0], memusedIn,
cudaMemcpyHostToDevice,streams[0]));
int next_stream = 0; //initialized: with NSTREAMS==1 the loop below is skipped
for( current_stream = 0; current_stream<NSTREAMS-1; ++current_stream ) {
next_stream = current_stream + 1;
// Ensure that processing and copying of the last cycle has finished
cudaEventSynchronize(cycleDone[next_stream]);
// executes FFT current_stream
cufftSafeCall(cufftExecR2C(plan[current_stream],
devPtrIn[current_stream],
devPtrOut[current_stream]));
// Upload Host -> Device next_stream
cutilSafeCall(cudaMemcpyAsync(devPtrIn[next_stream],
data[next_stream],
memusedIn,
cudaMemcpyHostToDevice,
streams[next_stream]));
// Download Device->Host current_stream
cutilSafeCall(cudaMemcpyAsync(hostPtrOut[current_stream],
devPtrOut[current_stream],
memusedOut,
cudaMemcpyDeviceToHost,
streams[current_stream]));
cutilSafeCall( cudaEventRecord(
cycleDone[current_stream],
streams[current_stream]) );
}
current_stream = next_stream; //last one
// executes FFT current_stream
cufftSafeCall(cufftExecR2C(plan[current_stream],
devPtrIn[current_stream],
devPtrOut[current_stream]));
// Download Device->Host current_stream
cutilSafeCall(cudaMemcpyAsync(hostPtrOut[current_stream],
devPtrOut[current_stream],
memusedOut,
cudaMemcpyDeviceToHost,
streams[current_stream]));
//////////////// END
// Synchronize and destroy
for (int sid=0; sid<NSTREAMS; sid++){
cudaStreamSynchronize(streams[sid]);
cudaStreamDestroy(streams[sid]);
}
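For completeness, the matching event cleanup (also not pasted above) is:
// destroy the events created for the pipeline
cutilSafeCall(cudaEventDestroy(start_event));
for (int sid=0; sid<NSTREAMS; sid++){
cutilSafeCall(cudaEventDestroy(cycleDone[sid]));
}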