Hi Folks,
I want to write a code which performs a 3D FFT transformation on large (2,4,8,… GIGS) data sets. I use cuFFT of the 3.1 Toolkit and OpenMP on 4 TESLA C1060 GPUs in a Supermicro machine. I wrote a synchronous code with cudaMemcpy() and cufftExec…() statements, and it works fine even on 4 GPUs. In order to increase speed, I use page locked host memory (cudaHostAlloc and cudaFreeHost) for the splitted data set, and now I want to use streams to achieve concurrent copy and execute (cudaMemCpyAsync, cudaStreamCreate, cudaStreamDestory and cufftSetStream). But the asynchronous version doesn’t seem to work in multi-GPU mode. It works fine on 1 GPU, in the case of 2 GPUs sometimes it stops with miscellaneous errors, mainly with “invalid resource handle”, and the situation is worse in the case of 4 GPUs, where the program almost never runs but stops with this error. I think that the error is in the cufftSetStream() statement, but I don’t know why. It seems that the program looses the stream id attached to the fft plan. Has anyone any experience in this? Streams, OpenMP & cuFFT.
Thanks, Gyula
[codebox]#include<stdio.h>
#include<stdlib.h>
#include<cufft.h>
#include<omp.h>
#define ASYNC 1
#define STREAMMINPOW (ASYNC ? 1 : 0)
#define NGPUSPOW 1
#define NGPUS (1<<NGPUSPOW)
#define NXPOW 9
#define NYPOW 9
#define NZPOW 9
#define NX (1<<NXPOW)
#define NY (1<<NYPOW)
#define NZ (1<<NZPOW)
#define PNZ (NZ/2+1)
#define BLOCKMAXPOW 31
#define STREAMMAXPOW 30
#define SIZERPOW 2
#define SIZER (1<<SIZERPOW)
#define SIZEC (sizeof(cufftComplex))
#define NBLOCKSPOWTMP (NXPOW+NYPOW+NZPOW+SIZERPOW-NGPUSPOW-BLOCKMAXPOW)
#define NBLOCKSPOW (NBLOCKSPOWTMP >= 0 ? NBLOCKSPOWTMP : 0)
#define NBLOCKS (1<<NBLOCKSPOW)
#define BLOCKCSIZE ((1<<(NXPOW+NYPOW-NGPUSPOW-NBLOCKSPOW))*PNZ)
#define NSTREAMSPOWTMP (NXPOW+NYPOW+NZPOW+SIZERPOW-NGPUSPOW-NBLOCKSPOW-STREAMMAXPOW)
#define NSTREAMSPOW (NSTREAMSPOWTMP >= STREAMMINPOW ? NBLOCKSPOWTMP : STREAMMINPOW)
#define NSTREAMS (1<<NSTREAMSPOW)
#define STREAMCSIZE ((1<<(NXPOW+NYPOW-NGPUSPOW-NBLOCKSPOW-NSTREAMSPOW))*PNZ)
#define MEGA (1<<20)
#define STEPS 100
device cufftComplex *gpu_array[NSTREAMS];
main()
{
printf(“Number of GPUs : %d\n”,NGPUS);
printf(“Number of Blocks per GPU : %d\n”,NBLOCKS);
printf(“Data size per GPU = %lf Mb.\n”,(float)BLOCKCSIZE/MEGA*SIZEC);
printf(“Number of Streams per GPU Block : %d\n”,NSTREAMS);
printf(“Data size per GPU Stream = %lf Mb.\n”,(float)STREAMCSIZE/MEGA*SIZEC);
float cpu_array[NGPUSNBLOCKS*NSTREAMS];
cufftHandle fft2d[NSTREAMS],fft2di[NSTREAMS],fft1d[NSTREAMS],fft1di[NSTR
EAMS];
#if (ASYNC)
cudaStream_t streams[NSTREAMS];
int size1d[1];
size1d[0]=NX;
int batch1d=NY*PNZ/NGPUS/NBLOCKS/NSTREAMS;
int size2d[2];
size2d[0]=NY;
size2d[1]=NZ;
int batch2d=NX/NGPUS/NBLOCKS/NSTREAMS;
int gid;
// Allocate CPU & GPU memory, create cuFFT plans and streams
omp_set_num_threads(NGPUS);
#if (ASYNC)
#pragma omp parallel private(gid,fft1d,fft1di,fft2d,fft2di,streams) shared(cpu_array)
#pragma omp parallel private(gid,fft1d,fft1di,fft2d,fft2di) shared(cpu_array)
{
gid=omp_get_thread_num();
cudaSetDevice(gid);
for (int sid=0; sid<NSTREAMS; sid++)
{
cudaMalloc((void **)&gpu_array[sid],STREAMCSIZE*SIZEC);
for (int bid=0; bid<NBLOCKS; bid++)
cudaHostAlloc((void **)&cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],STREAMCSIZE*SI
ZEC,cudaHostAllocPortable);
cufftPlanMany(&fft1d[sid],1,size1d,NULL,1,0,NULL,1,0,CUFFT_C2C,batch1d)
;
cufftPlanMany(&fft1di[sid],1,size1d,NULL,1,0,NULL,1,0,CUFFT_C2C,batch1d
);
cufftPlanMany(&fft2d[sid],2,size2d,NULL,1,0,NULL,1,0,CUFFT_R2C,batch2d)
;
cufftPlanMany(&fft2di[sid],2,size2d,NULL,1,0,NULL,1,0,CUFFT_C2R,batch2d
);
}
for (int s=0; s<STEPS; s++)
{
// Test FFT
#if (ASYNC)
for (int sid=0; sid<NSTREAMS; sid++)
{
cudaStreamCreate(&streams[sid]);
cufftSetStream(fft2d[sid],streams[sid]);
cudaStreamSynchronize(streams[sid]);
}
#endif
// Perform 2D R2C batched FFT
for (int bid=0; bid<NBLOCKS; bid++)
{
#if (!ASYNC)
for (int sid=0; sid<NSTREAMS; sid++)
cudaMemcpy(gpu_array[sid],cpu_array[(gid*NBLOCKS+bid)*NSTREA
MS+sid],STREAMCSIZE*SIZEC,cudaMemcpyHostToDevice);
for (int sid=0; sid<NSTREAMS; sid++)
cufftExecR2C(fft2d[sid],(cufftReal *)gpu_array[sid],gpu_array[sid]);
for (int sid=0; sid<NSTREAMS; sid++)
cudaMemcpy(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],gpu_arr
ay[sid],STREAMCSIZE*SIZEC,cudaMemcpyDeviceToHost);
#else
for (int sid=0; sid<NSTREAMS; sid++)
cudaMemcpyAsync(gpu_array[sid],cpu_array[(gid*NBLOCKS+bid)*N
STREAMS+sid],STREAMCSIZE*SIZEC,cudaMemcpyHostToDevice,stream
s
[sid]);
for (int sid=0; sid<NSTREAMS; sid++)
cufftExecR2C(fft2d[sid],(cufftReal *)gpu_array[sid],gpu_array[sid]);
for (int sid=0; sid<NSTREAMS; sid++)
cudaMemcpyAsync(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],gp
u_array[sid],STREAMCSIZE*SIZEC,cudaMemcpyDeviceToHost,stream
s
[sid]);
#endif
}
#if (ASYNC)
for (int sid=0; sid<NSTREAMS; sid++)
{
cudaStreamSynchronize(streams[sid]);
cudaStreamDestroy(streams[sid]);
}
#endif
// Transpose array
// Perform 1D C2C batched FFT
#if (ASYNC)
for (int sid=0; sid<NSTREAMS; sid++)
{
cudaStreamCreate(&streams[sid]);
cufftSetStream(fft1d[sid],streams[sid]);
cudaStreamSynchronize(streams[sid]);
}
#endif
for (int bid=0; bid<NBLOCKS; bid++)
{
#if (!ASYNC)
for (int sid=0; sid<NSTREAMS; sid++)
cudaMemcpy(gpu_array[sid],cpu_array[(gid*NBLOCKS+bid)*NSTREA
MS+sid],STREAMCSIZE*SIZEC,cudaMemcpyHostToDevice);
for (int sid=0; sid<NSTREAMS; sid++)
cufftExecC2C(fft1d[sid],gpu_array[sid],gpu_array[sid],CUFFT_
FORWARD);
for (int sid=0; sid<NSTREAMS; sid++)
cudaMemcpy(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],gpu_arr
ay[sid],STREAMCSIZE*SIZEC,cudaMemcpyDeviceToHost);
#else
for (int sid=0; sid<NSTREAMS; sid++)
cudaMemcpyAsync(gpu_array[sid],cpu_array[(gid*NBLOCKS+bid)*N
STREAMS+sid],STREAMCSIZE*SIZEC,cudaMemcpyHostToDevice,stream
s
[sid]);
for (int sid=0; sid<NSTREAMS; sid++)
cufftExecC2C(fft1d[sid],gpu_array[sid],gpu_array[sid],CUFFT_
FORWARD);
for (int sid=0; sid<NSTREAMS; sid++)
cudaMemcpyAsync(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],gp
u_array[sid],STREAMCSIZE*SIZEC,cudaMemcpyDeviceToHost,stream
s
[sid]);
#endif
}
#if (ASYNC)
for (int sid=0; sid<NSTREAMS; sid++)
{
cudaStreamSynchronize(streams[sid]);
cudaStreamDestroy(streams[sid]);
}
#endif
}
printf("Calculation done on GPU #%d.\n",gid);
// Free GPU memory and destroy FFT plans
for (int sid=0; sid<NSTREAMS; sid++)
{
cudaFree(gpu_array[sid]);
cufftDestroy(fft1d[sid]);
cufftDestroy(fft1di[sid]);
cufftDestroy(fft2d[sid]);
cufftDestroy(fft2di[sid]);
for (int bid=0; bid<NBLOCKS; bid++)
cudaFreeHost(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid]);
}
// Destroy active GPU context
cudaThreadExit();
}
}[/codebox]
Compile with “nvcc -lcufft -arch sm_13 -Xcompiler -fopenmp xxx.cu” on Linux (Ubuntu 9.04)