cuFFT & OpenMP: asynchronous multi-GPU 3D FFT

Hi Folks,

I want to write code that performs a 3D FFT on large (2, 4, 8, … GB) data sets. I use cuFFT from the 3.1 Toolkit and OpenMP on 4 Tesla C1060 GPUs in a Supermicro machine. I wrote a synchronous version with cudaMemcpy() and cufftExec…() calls, and it works fine even on 4 GPUs. To increase speed, I use page-locked host memory (cudaHostAlloc and cudaFreeHost) for the split data set, and now I want to use streams to achieve concurrent copy and execute (cudaMemcpyAsync, cudaStreamCreate, cudaStreamDestroy and cufftSetStream). But the asynchronous version doesn't seem to work in multi-GPU mode. It works fine on 1 GPU; with 2 GPUs it sometimes stops with miscellaneous errors, mostly "invalid resource handle", and the situation is worse with 4 GPUs, where the program almost never runs to completion but stops with this error. I think the error is in the cufftSetStream() call, but I don't know why. It seems that the program loses the stream id attached to the FFT plan. Does anyone have experience with this combination of streams, OpenMP & cuFFT?

Thanks, Gyula
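
First, a stripped-down sketch of the pattern I am trying to use: one OpenMP thread per GPU, each thread creating its stream in its own context and attaching it to its own plan with cufftSetStream() before issuing the async copies. N, BATCH and NGPUS_SKETCH are placeholder values for illustration, not the sizes from the full code below.

[codebox]// Minimal sketch of the stream-per-plan pattern (placeholder sizes).
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <omp.h>

#define NGPUS_SKETCH 2
#define N            256
#define BATCH        16
#define BYTES        (sizeof(cufftComplex)*N*BATCH)

int main()
{
  omp_set_num_threads(NGPUS_SKETCH);   // one host thread per GPU
  #pragma omp parallel
  {
    int gid = omp_get_thread_num();
    cudaSetDevice(gid);                // everything below lives in this thread's context

    cufftComplex *h_buf, *d_buf;       // per-thread pointers, not shared globals
    cudaHostAlloc((void **)&h_buf, BYTES, cudaHostAllocPortable); // pinned host memory
    cudaMalloc((void **)&d_buf, BYTES);

    cudaStream_t stream;
    cudaStreamCreate(&stream);         // created in the same context as the plan

    cufftHandle plan;
    cufftPlan1d(&plan, N, CUFFT_C2C, BATCH);
    cufftSetStream(plan, stream);      // plan now launches its kernels into 'stream'

    // Asynchronous copy-in, transform, copy-out, all queued on the same stream
    cudaMemcpyAsync(d_buf, h_buf, BYTES, cudaMemcpyHostToDevice, stream);
    cufftExecC2C(plan, d_buf, d_buf, CUFFT_FORWARD);
    cudaMemcpyAsync(h_buf, d_buf, BYTES, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);     // wait for this GPU's work to finish

    cufftDestroy(plan);
    cudaStreamDestroy(stream);
    cudaFree(d_buf);
    cudaFreeHost(h_buf);
    printf("GPU #%d done.\n", gid);
  }
  return 0;
}[/codebox]

The full code follows: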

[codebox]#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <omp.h>

#define ASYNC 1
#define STREAMMINPOW (ASYNC ? 1 : 0)
#define NGPUSPOW 1
#define NGPUS (1<<NGPUSPOW)
#define NXPOW 9
#define NYPOW 9
#define NZPOW 9
#define NX (1<<NXPOW)
#define NY (1<<NYPOW)
#define NZ (1<<NZPOW)
#define PNZ (NZ/2+1)
#define BLOCKMAXPOW 31
#define STREAMMAXPOW 30
#define SIZERPOW 2
#define SIZER (1<<SIZERPOW)
#define SIZEC (sizeof(cufftComplex))
#define NBLOCKSPOWTMP (NXPOW+NYPOW+NZPOW+SIZERPOW-NGPUSPOW-BLOCKMAXPOW)
#define NBLOCKSPOW (NBLOCKSPOWTMP >= 0 ? NBLOCKSPOWTMP : 0)
#define NBLOCKS (1<<NBLOCKSPOW)
#define BLOCKCSIZE ((1<<(NXPOW+NYPOW-NGPUSPOW-NBLOCKSPOW))*PNZ)
#define NSTREAMSPOWTMP (NXPOW+NYPOW+NZPOW+SIZERPOW-NGPUSPOW-NBLOCKSPOW-STREAMMAXPOW)
#define NSTREAMSPOW (NSTREAMSPOWTMP >= STREAMMINPOW ? NSTREAMSPOWTMP : STREAMMINPOW)
#define NSTREAMS (1<<NSTREAMSPOW)
#define STREAMCSIZE ((1<<(NXPOW+NYPOW-NGPUSPOW-NBLOCKSPOW-NSTREAMSPOW))*PNZ)
#define MEGA (1<<20)
#define STEPS 100

// Device buffers, one per stream; note this array is shared by all OpenMP threads
cufftComplex *gpu_array[NSTREAMS];

int main()
{
  printf("Number of GPUs : %d\n", NGPUS);
  printf("Number of Blocks per GPU : %d\n", NBLOCKS);
  printf("Data size per GPU = %f MB.\n", (float)BLOCKCSIZE/MEGA*SIZEC);
  printf("Number of Streams per GPU Block : %d\n", NSTREAMS);
  printf("Data size per GPU Stream = %f MB.\n", (float)STREAMCSIZE/MEGA*SIZEC);

  // Page-locked host buffers, one per (GPU, block, stream) chunk
  float *cpu_array[NGPUS*NBLOCKS*NSTREAMS];
  cufftHandle fft2d[NSTREAMS], fft2di[NSTREAMS], fft1d[NSTREAMS], fft1di[NSTREAMS];
#if (ASYNC)
  cudaStream_t streams[NSTREAMS];
#endif

  int size1d[1];
  size1d[0] = NX;
  int batch1d = NY*PNZ/NGPUS/NBLOCKS/NSTREAMS;
  int size2d[2];
  size2d[0] = NY;
  size2d[1] = NZ;
  int batch2d = NX/NGPUS/NBLOCKS/NSTREAMS;
  int gid;

  // Allocate CPU & GPU memory, create cuFFT plans and streams
  omp_set_num_threads(NGPUS);
#if (ASYNC)
  #pragma omp parallel private(gid,fft1d,fft1di,fft2d,fft2di,streams) shared(cpu_array)
#else
  #pragma omp parallel private(gid,fft1d,fft1di,fft2d,fft2di) shared(cpu_array)
#endif
  {
    gid = omp_get_thread_num();
    cudaSetDevice(gid);  // one OpenMP thread per GPU

    for (int sid = 0; sid < NSTREAMS; sid++)
    {
      cudaMalloc((void **)&gpu_array[sid], STREAMCSIZE*SIZEC);
      for (int bid = 0; bid < NBLOCKS; bid++)
        cudaHostAlloc((void **)&cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],
                      STREAMCSIZE*SIZEC, cudaHostAllocPortable);
      cufftPlanMany(&fft1d[sid], 1, size1d, NULL, 1, 0, NULL, 1, 0, CUFFT_C2C, batch1d);
      cufftPlanMany(&fft1di[sid], 1, size1d, NULL, 1, 0, NULL, 1, 0, CUFFT_C2C, batch1d);
      cufftPlanMany(&fft2d[sid], 2, size2d, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, batch2d);
      cufftPlanMany(&fft2di[sid], 2, size2d, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, batch2d);
    }

    for (int s = 0; s < STEPS; s++)
    {
      // Test FFT
#if (ASYNC)
      for (int sid = 0; sid < NSTREAMS; sid++)
      {
        cudaStreamCreate(&streams[sid]);
        cufftSetStream(fft2d[sid], streams[sid]);
        cudaStreamSynchronize(streams[sid]);
      }
#endif
      // Perform 2D R2C batched FFT
      for (int bid = 0; bid < NBLOCKS; bid++)
      {
#if (!ASYNC)
        for (int sid = 0; sid < NSTREAMS; sid++)
          cudaMemcpy(gpu_array[sid], cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],
                     STREAMCSIZE*SIZEC, cudaMemcpyHostToDevice);
        for (int sid = 0; sid < NSTREAMS; sid++)
          cufftExecR2C(fft2d[sid], (cufftReal *)gpu_array[sid], gpu_array[sid]);
        for (int sid = 0; sid < NSTREAMS; sid++)
          cudaMemcpy(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid], gpu_array[sid],
                     STREAMCSIZE*SIZEC, cudaMemcpyDeviceToHost);
#else
        for (int sid = 0; sid < NSTREAMS; sid++)
          cudaMemcpyAsync(gpu_array[sid], cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],
                          STREAMCSIZE*SIZEC, cudaMemcpyHostToDevice, streams[sid]);
        for (int sid = 0; sid < NSTREAMS; sid++)
          cufftExecR2C(fft2d[sid], (cufftReal *)gpu_array[sid], gpu_array[sid]);
        for (int sid = 0; sid < NSTREAMS; sid++)
          cudaMemcpyAsync(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid], gpu_array[sid],
                          STREAMCSIZE*SIZEC, cudaMemcpyDeviceToHost, streams[sid]);
#endif
      }
#if (ASYNC)
      for (int sid = 0; sid < NSTREAMS; sid++)
      {
        cudaStreamSynchronize(streams[sid]);
        cudaStreamDestroy(streams[sid]);
      }
#endif

      // Transpose array
      // Perform 1D C2C batched FFT
#if (ASYNC)
      for (int sid = 0; sid < NSTREAMS; sid++)
      {
        cudaStreamCreate(&streams[sid]);
        cufftSetStream(fft1d[sid], streams[sid]);
        cudaStreamSynchronize(streams[sid]);
      }
#endif
      for (int bid = 0; bid < NBLOCKS; bid++)
      {
#if (!ASYNC)
        for (int sid = 0; sid < NSTREAMS; sid++)
          cudaMemcpy(gpu_array[sid], cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],
                     STREAMCSIZE*SIZEC, cudaMemcpyHostToDevice);
        for (int sid = 0; sid < NSTREAMS; sid++)
          cufftExecC2C(fft1d[sid], gpu_array[sid], gpu_array[sid], CUFFT_FORWARD);
        for (int sid = 0; sid < NSTREAMS; sid++)
          cudaMemcpy(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid], gpu_array[sid],
                     STREAMCSIZE*SIZEC, cudaMemcpyDeviceToHost);
#else
        for (int sid = 0; sid < NSTREAMS; sid++)
          cudaMemcpyAsync(gpu_array[sid], cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid],
                          STREAMCSIZE*SIZEC, cudaMemcpyHostToDevice, streams[sid]);
        for (int sid = 0; sid < NSTREAMS; sid++)
          cufftExecC2C(fft1d[sid], gpu_array[sid], gpu_array[sid], CUFFT_FORWARD);
        for (int sid = 0; sid < NSTREAMS; sid++)
          cudaMemcpyAsync(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid], gpu_array[sid],
                          STREAMCSIZE*SIZEC, cudaMemcpyDeviceToHost, streams[sid]);
#endif
      }
#if (ASYNC)
      for (int sid = 0; sid < NSTREAMS; sid++)
      {
        cudaStreamSynchronize(streams[sid]);
        cudaStreamDestroy(streams[sid]);
      }
#endif
    }

    printf("Calculation done on GPU #%d.\n", gid);

    // Free GPU memory and destroy FFT plans
    for (int sid = 0; sid < NSTREAMS; sid++)
    {
      cudaFree(gpu_array[sid]);
      cufftDestroy(fft1d[sid]);
      cufftDestroy(fft1di[sid]);
      cufftDestroy(fft2d[sid]);
      cufftDestroy(fft2di[sid]);
      for (int bid = 0; bid < NBLOCKS; bid++)
        cudaFreeHost(cpu_array[(gid*NBLOCKS+bid)*NSTREAMS+sid]);
    }

    // Destroy active GPU context
    cudaThreadExit();
  }
}[/codebox]
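
For completeness, note that gpu_array is a file-scope array shared by all OpenMP threads, so every thread writes device pointers from its own context into the same slots. A variant that keeps the device pointers thread-local would look like the sketch below (untested; only the declaration moves, the rest of the parallel region stays as above):

[codebox]// Untested variant: move the device-pointer array inside the parallel region
// so each OpenMP thread keeps pointers from its own GPU context only.
#pragma omp parallel private(gid,fft1d,fft1di,fft2d,fft2di,streams) shared(cpu_array)
{
  cufftComplex *gpu_array[NSTREAMS];  // thread-local, no longer file scope
  gid = omp_get_thread_num();
  cudaSetDevice(gid);
  for (int sid = 0; sid < NSTREAMS; sid++)
    cudaMalloc((void **)&gpu_array[sid], STREAMCSIZE*SIZEC);
  // ... plan creation, the STEPS loop and the cleanup stay unchanged ...
}[/codebox]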

Compile with "nvcc -lcufft -arch sm_13 -Xcompiler -fopenmp xxx.cu" on Linux (Ubuntu 9.04).

Sorry, I missed the right section. See the “CUDA Programming & Development” section for this topic. Thx.