cuFFT & OpenMP Asynchronous multi-GPU 3D FFT transformation

Hi Folks,

I want to write code that performs a 3D FFT on large (2, 4, 8, … GB) data sets. I use cuFFT from the 3.1 Toolkit and OpenMP on 4 Tesla C1060 GPUs in a Supermicro machine. I wrote a synchronous version with cudaMemcpy() and cufftExec…() calls, and it works fine even on 4 GPUs. To increase speed, I use page-locked host memory (cudaHostAlloc and cudaFreeHost) for the split data set, and now I want to use streams to achieve concurrent copy and execute (cudaMemcpyAsync, cudaStreamCreate, cudaStreamDestroy and cufftSetStream). But the asynchronous version doesn't seem to work in multi-GPU mode. It works fine on 1 GPU; with 2 GPUs it sometimes stops with miscellaneous errors, mainly "invalid resource handle", and the situation is worse with 4 GPUs, where the program almost never runs to completion before stopping with this error. I think the error is in the cufftSetStream() call, but I don't know why; it seems the program loses the stream id attached to the FFT plan. Does anyone have experience with this combination of streams, OpenMP & cuFFT?
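In outline, the per-stream pattern I am trying to get working is the following (condensed from the full listing below; the host-side index arithmetic is elided):

[codebox]
// Per OpenMP thread (one thread per GPU), per time step and block --
// condensed from the full code below.
for (int sid = 0; sid < NSTREAMS; sid++) {
    cudaStreamCreate(&streams[sid]);
    cufftSetStream(fft2d[sid], streams[sid]);   // attach plan to its stream
}
for (int sid = 0; sid < NSTREAMS; sid++)
    cudaMemcpyAsync(gpu_array[sid], /* host chunk */, STREAMCSIZE*SIZEC,
                    cudaMemcpyHostToDevice, streams[sid]);
for (int sid = 0; sid < NSTREAMS; sid++)
    cufftExecR2C(fft2d[sid], (cufftReal *)gpu_array[sid], gpu_array[sid]);
for (int sid = 0; sid < NSTREAMS; sid++)
    cudaMemcpyAsync(/* host chunk */, gpu_array[sid], STREAMCSIZE*SIZEC,
                    cudaMemcpyDeviceToHost, streams[sid]);
for (int sid = 0; sid < NSTREAMS; sid++) {
    cudaStreamSynchronize(streams[sid]);
    cudaStreamDestroy(streams[sid]);
}
[/codebox]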

Thanks, Gyula

[codebox]
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <omp.h>

#define ASYNC 1
#define STREAMMINPOW (ASYNC ? 1 : 0)
#define NGPUSPOW 1
#define NGPUS (1<<NGPUSPOW)
#define NXPOW 9
#define NYPOW 9
#define NZPOW 9
#define NX (1<<NXPOW)
#define NY (1<<NYPOW)
#define NZ (1<<NZPOW)
#define PNZ (NZ/2+1)
#define BLOCKMAXPOW 31
#define STREAMMAXPOW 30
#define SIZERPOW 2
#define SIZER (1<<SIZERPOW)
#define SIZEC (sizeof(cufftComplex))
#define NBLOCKSPOWTMP (NXPOW+NYPOW+NZPOW+SIZERPOW-NGPUSPOW-BLOCKMAXPOW)
#define NBLOCKSPOW (NBLOCKSPOWTMP >= 0 ? NBLOCKSPOWTMP : 0)
#define NBLOCKS (1<<NBLOCKSPOW)
#define BLOCKCSIZE ((1<<(NXPOW+NYPOW-NGPUSPOW-NBLOCKSPOW))*PNZ)
#define NSTREAMSPOWTMP (NXPOW+NYPOW+NZPOW+SIZERPOW-NGPUSPOW-NBLOCKSPOW-STREAMMAXPOW)
#define NSTREAMSPOW (NSTREAMSPOWTMP >= STREAMMINPOW ? NSTREAMSPOWTMP : STREAMMINPOW)
#define NSTREAMS (1<<NSTREAMSPOW)
#define STREAMCSIZE ((1<<(NXPOW+NYPOW-NGPUSPOW-NBLOCKSPOW-NSTREAMSPOW))*PNZ)
#define MEGA (1<<20)
#define STEPS 100

__device__ cufftComplex *gpu_array[NSTREAMS];

int main()
{
    printf("Number of GPUs : %d\n",NGPUS);
    printf("Number of Blocks per GPU : %d\n",NBLOCKS);
    printf("Data size per GPU = %f MB\n",(float)BLOCKCSIZE/MEGA*SIZEC);
    printf("Number of Streams per GPU Block : %d\n",NSTREAMS);
    printf("Data size per GPU Stream = %f MB\n",(float)STREAMCSIZE/MEGA*SIZEC);

    float *cpu_array[NGPUS*NBLOCKS*NSTREAMS];
    cufftHandle fft2d[NSTREAMS],fft2di[NSTREAMS],fft1d[NSTREAMS],fft1di[NSTREAMS];
#if (ASYNC)
    cudaStream_t streams[NSTREAMS];
#endif
    int size1d[1];
    size1d[0]=NX;
    int batch1d=NY*PNZ/NGPUS/NBLOCKS/NSTREAMS;
    int size2d[2];
    size2d[0]=NY;
    size2d[1]=NZ;
    int batch2d=NX/NGPUS/NBLOCKS/NSTREAMS;
    int gid;

    // Allocate CPU & GPU memory, create cuFFT plans and streams
    omp_set_num_threads(NGPUS);
#if (ASYNC)
#pragma omp parallel private(gid,fft1d,fft1di,fft2d,fft2di,streams) shared(cpu_array)
#else
#pragma omp parallel private(gid,fft1d,fft1di,fft2d,fft2di) shared(cpu_array)
#endif
    {
        gid=omp_get_thread_num();
        cudaSetDevice(gid);
        for (int sid=0; sid<NSTREAMS; sid++)
        {
            cudaMalloc((void **)&gpu_array[sid],STREAMCSIZE*SIZEC);
            for (int bid=0; bid<NBLOCKS; bid++)
                cudaHostAlloc((void **)&cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],
                              STREAMCSIZE*SIZEC,cudaHostAllocPortable);
            cufftPlanMany(&fft1d[sid],1,size1d,NULL,1,0,NULL,1,0,CUFFT_C2C,batch1d);
            cufftPlanMany(&fft1di[sid],1,size1d,NULL,1,0,NULL,1,0,CUFFT_C2C,batch1d);
            cufftPlanMany(&fft2d[sid],2,size2d,NULL,1,0,NULL,1,0,CUFFT_R2C,batch2d);
            cufftPlanMany(&fft2di[sid],2,size2d,NULL,1,0,NULL,1,0,CUFFT_C2R,batch2d);
        }

        for (int s=0; s<STEPS; s++)
        {
            // Test FFT
#if (ASYNC)
            for (int sid=0; sid<NSTREAMS; sid++)
            {
                cudaStreamCreate(&streams[sid]);
                cufftSetStream(fft2d[sid],streams[sid]);
                cudaStreamSynchronize(streams[sid]);
            }
#endif
            // Perform 2D R2C batched FFT
            for (int bid=0; bid<NBLOCKS; bid++)
            {
#if (!ASYNC)
                for (int sid=0; sid<NSTREAMS; sid++)
                    cudaMemcpy(gpu_array[sid],cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],
                               STREAMCSIZE*SIZEC,cudaMemcpyHostToDevice);
                for (int sid=0; sid<NSTREAMS; sid++)
                    cufftExecR2C(fft2d[sid],(cufftReal *)gpu_array[sid],gpu_array[sid]);
                for (int sid=0; sid<NSTREAMS; sid++)
                    cudaMemcpy(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],gpu_array[sid],
                               STREAMCSIZE*SIZEC,cudaMemcpyDeviceToHost);
#else
                for (int sid=0; sid<NSTREAMS; sid++)
                    cudaMemcpyAsync(gpu_array[sid],cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],
                                    STREAMCSIZE*SIZEC,cudaMemcpyHostToDevice,streams[sid]);
                for (int sid=0; sid<NSTREAMS; sid++)
                    cufftExecR2C(fft2d[sid],(cufftReal *)gpu_array[sid],gpu_array[sid]);
                for (int sid=0; sid<NSTREAMS; sid++)
                    cudaMemcpyAsync(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],gpu_array[sid],
                                    STREAMCSIZE*SIZEC,cudaMemcpyDeviceToHost,streams[sid]);
#endif
            }
#if (ASYNC)
            for (int sid=0; sid<NSTREAMS; sid++)
            {
                cudaStreamSynchronize(streams[sid]);
                cudaStreamDestroy(streams[sid]);
            }
#endif
            // Transpose array
            // Perform 1D C2C batched FFT
#if (ASYNC)
            for (int sid=0; sid<NSTREAMS; sid++)
            {
                cudaStreamCreate(&streams[sid]);
                cufftSetStream(fft1d[sid],streams[sid]);
                cudaStreamSynchronize(streams[sid]);
            }
#endif
            for (int bid=0; bid<NBLOCKS; bid++)
            {
#if (!ASYNC)
                for (int sid=0; sid<NSTREAMS; sid++)
                    cudaMemcpy(gpu_array[sid],cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],
                               STREAMCSIZE*SIZEC,cudaMemcpyHostToDevice);
                for (int sid=0; sid<NSTREAMS; sid++)
                    cufftExecC2C(fft1d[sid],gpu_array[sid],gpu_array[sid],CUFFT_FORWARD);
                for (int sid=0; sid<NSTREAMS; sid++)
                    cudaMemcpy(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],gpu_array[sid],
                               STREAMCSIZE*SIZEC,cudaMemcpyDeviceToHost);
#else
                for (int sid=0; sid<NSTREAMS; sid++)
                    cudaMemcpyAsync(gpu_array[sid],cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],
                                    STREAMCSIZE*SIZEC,cudaMemcpyHostToDevice,streams[sid]);
                for (int sid=0; sid<NSTREAMS; sid++)
                    cufftExecC2C(fft1d[sid],gpu_array[sid],gpu_array[sid],CUFFT_FORWARD);
                for (int sid=0; sid<NSTREAMS; sid++)
                    cudaMemcpyAsync(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],gpu_array[sid],
                                    STREAMCSIZE*SIZEC,cudaMemcpyDeviceToHost,streams[sid]);
#endif
            }
#if (ASYNC)
            for (int sid=0; sid<NSTREAMS; sid++)
            {
                cudaStreamSynchronize(streams[sid]);
                cudaStreamDestroy(streams[sid]);
            }
#endif
        }
        printf("Calculation done on GPU #%d.\n",gid);

        // Free GPU memory and destroy FFT plans
        for (int sid=0; sid<NSTREAMS; sid++)
        {
            cudaFree(gpu_array[sid]);
            cufftDestroy(fft1d[sid]);
            cufftDestroy(fft1di[sid]);
            cufftDestroy(fft2d[sid]);
            cufftDestroy(fft2di[sid]);
            for (int bid=0; bid<NBLOCKS; bid++)
                cudaFreeHost(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid]);
        }
        // Destroy active GPU context
        cudaThreadExit();
    }
    return 0;
}
[/codebox]

Compile with "nvcc -lcufft -Xcompiler -fopenmp xxx.cu" on Linux (Ubuntu 9.04).

It’s quite a lot to work out, but I see one thing that might cause trouble, unless I am very wrong.

You declare the pointer array gpu_array with the __device__ qualifier, i.e. on a device. Which one? It is usual to keep the pointers to device arrays in host memory.

So I would begin by removing the __device__ qualifier from the declaration of gpu_array.

There is no need to qualify pointers just because they refer to device memory; it is simply up to you not to confuse them with host pointers…
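Something like this is what I mean (a sketch of mine, untested; making the array two-dimensional is my own addition, since gpu_array is a shared global and with a single row the OpenMP threads would overwrite each other's pointers):

[codebox]
// Plain host array of device pointers -- no __device__ qualifier.
// One row of pointer slots per GPU/thread (my addition).
cufftComplex *gpu_array[NGPUS][NSTREAMS];

// inside the parallel region, per thread gid:
cudaMalloc((void **)&gpu_array[gid][sid], STREAMCSIZE*SIZEC);
[/codebox]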

EDIT: On my VS2008 installation, nvcc does not appear to pass settings for OpenMP to cl.exe. I have to name the file .cpp (and not .cu) to get the desired number of threads.

EDIT: Works fine (with the mod above and as .cpp), but I have just 2 GPUs, one of which is really only good enough for screen output.

When executing with repetitions, I have seen the "invalid resource handle" error as well; running in a command box, it executed 50+ times without error.

EDIT: In http://forums.nvidia.com/index.php?showtop…t=#entry1113859 (message of Sep 7 2010, 02:14 PM), avidday maintains that OpenMP is not well suited to this because its relation to the actual host threads is opaque, and he suggests another method of threading. I would also check the SDK example simpleMultiGPU.
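For reference, this is roughly the shape of the threading he suggests, one persistent host thread per GPU (a minimal pthreads sketch of mine, not taken from the SDK sample; the worker body with plans, streams and copies is elided):

[codebox]
#include <pthread.h>
#include <cuda_runtime.h>

#define NGPUS 4   /* assumed; set to your GPU count */

// One persistent worker thread per GPU. The CUDA context is created by
// and stays bound to the same host thread for its whole lifetime --
// the property an OpenMP parallel region does not guarantee.
static void *worker(void *arg)
{
    int gid = (int)(size_t)arg;
    cudaSetDevice(gid);   // bind this thread to its GPU once, up front
    /* ... allocate, create plans and streams, run all steps, clean up ... */
    return NULL;
}

int main(void)
{
    pthread_t threads[NGPUS];
    for (int gid = 0; gid < NGPUS; gid++)
        pthread_create(&threads[gid], NULL, worker, (void *)(size_t)gid);
    for (int gid = 0; gid < NGPUS; gid++)
        pthread_join(threads[gid], NULL);
    return 0;
}
[/codebox]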

Final edit: There is also an OpenMP example in the SDK (cudaOpenMp), so I give up speculating about the suitability of OpenMP for running several GPUs.
