CUDA FORTRAN shared memory warp-level sum reduction

Hi All,

Does anyone know where I can find a CUDA Fortran shared-memory warp-level sum reduction example for compute capability 2.0 and higher? I’ve included an attempt below that isn’t working. The first example performs what should be unnecessary thread synchronization for threads with index less than or equal to 32 (a single warp), and it gives the correct answer. In the second example, I add the volatile attribute to the shared array and remove that synchronization, but this produces an incorrect answer.

kfernandes@kepler:/scratch/kfernandes/gpu
-254-$ cat SumRed.f90 
subroutine Run
use CUDAFOR
implicit none
real*8 :: Global(1)
real*8,device :: DevGlobal(1)
type(dim3) :: GridDim,BlockDim

  GridDim = Dim3(1,1,1)
  BlockDim = Dim3(512,1,1)

  call Kernel<<<GridDim,BlockDim>>>(DevGlobal)
  Global = DevGlobal

  print *,Global

end subroutine Run

attributes(global) subroutine Kernel(DevGlobal)
implicit none
real*8 :: DevGlobal(*)
real*8,shared :: Shared(512)

  Shared(ThreadIdx%X) = 1.0d0
  call SyncThreads()

  if (ThreadIdx%X.le.256) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+256)
  end if
  call SyncThreads()
  if (ThreadIdx%X.le.128) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+128)
  end if
  call SyncThreads()
  if (ThreadIdx%X.le.64) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+64)
  end if
  call SyncThreads()
  if (ThreadIdx%X.le.32) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+32)
  end if
  call SyncThreads()
  if (ThreadIdx%X.le.32) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+16)
  end if
  call SyncThreads()
  if (ThreadIdx%X.le.32) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+8)
  end if
  call SyncThreads()
  if (ThreadIdx%X.le.32) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+4)
  end if
  call SyncThreads()
  if (ThreadIdx%X.le.32) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+2)
  end if
  call SyncThreads()
  if (ThreadIdx%X.le.32) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+1)
  end if

  if (ThreadIdx%X.eq.1) then
    DevGlobal(1) = Shared(1)
  end if

end subroutine Kernel

program Prog
implicit none

  call Run

end program Prog

compiled with

kfernandes@kepler:/scratch/kfernandes/gpu
-255-$ pgf90 -fast -O3 -Mcuda=cuda6.0,cc35,keepgpu SumRed.f90

results in

kfernandes@kepler:/scratch/kfernandes/gpu
-256-$ ./a.out 
    512.0000000000000

which is correct and the corresponding .gpu file is

kfernandes@kepler:/scratch/kfernandes/gpu
-276-$ cat SumRed.n001.gpu 
#include "cuda_runtime.h"
#include "pgi_cuda_runtime.h"
#include "SumRed.n001.h"
extern "C" __global__ void kernel_(signed char* _pdevglobal)
{
__shared__ __align__(8) long long n_shared[512];
signed char* et7;
et7 = (((signed char*)(&n_shared))-8)+((long long)(((int)(threadIdx.x+1))*(8)));
*( double*)(et7) = (1.00000000000000000E+0);
__syncthreads();
if( (((int)(threadIdx.x+1))>(256)))  goto _BB_5;
*( double*)(et7) = (*( double*)((((signed char*)(&n_shared))+2040)+((long long)(((int)(threadIdx.x+1))*(8)))))+(*( double*)(et7));
_BB_5: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(128)))  goto _BB_7;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+1016)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_7: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(64)))  goto _BB_9;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+504)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_9: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32)))  goto _BB_11;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+248)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_11: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32)))  goto _BB_13;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+120)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_13: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32)))  goto _BB_15;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+56)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_15: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32)))  goto _BB_17;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+24)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_17: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32)))  goto _BB_19;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+8)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_19: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32)))  goto _BB_21;
*( double*)(et7) = (*( double*)(et7))+(*( double*)(((signed char*)(&n_shared))+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_21: ;
if( (((int)(threadIdx.x+1))!=(1)))  goto _BB_23;
*( double*)(_pdevglobal) = (( double*)&n_shared)[0];
_BB_23: ;
}

However,

kfernandes@kepler:/scratch/kfernandes/gpu
-279-$ cat WarpSumRed.f90 
subroutine Run
use CUDAFOR
implicit none
real*8 :: Global(1)
real*8,device :: DevGlobal(1)
type(dim3) :: GridDim,BlockDim

  GridDim = Dim3(1,1,1)
  BlockDim = Dim3(512,1,1)

  call Kernel<<<GridDim,BlockDim>>>(DevGlobal)
  Global = DevGlobal

  print *,Global

end subroutine Run

attributes(global) subroutine Kernel(DevGlobal)
implicit none
real*8 :: DevGlobal(*)
real*8,shared,volatile :: Shared(512)

  Shared(ThreadIdx%X) = 1.0d0
  call SyncThreads()

  if (ThreadIdx%X.le.256) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+256)
  end if
  call SyncThreads()
  if (ThreadIdx%X.le.128) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+128)
  end if
  call SyncThreads()
  if (ThreadIdx%X.le.64) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+64)
  end if
  call SyncThreads()
  if (ThreadIdx%X.le.32) then
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+32)
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+16)
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+8)
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+4)
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+2)
    Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+1)
  end if

  if (ThreadIdx%X.eq.1) then
    DevGlobal(1) = Shared(1)
  end if

end subroutine Kernel

program Prog
implicit none

  call Run

end program Prog

compiled with

kfernandes@kepler:/scratch/kfernandes/gpu
-255-$ pgf90 -fast -O3 -Mcuda=cuda6.0,cc35,keepgpu WarpSumRed.f90

results in

kfernandes@kepler:/scratch/kfernandes/gpu
-256-$ ./a.out 
    56.00000000000000

which is not correct and the corresponding .gpu file is

kfernandes@kepler:/scratch/kfernandes/gpu
-283-$ cat WarpSumRed.n001.gpu 
#include "cuda_runtime.h"
#include "pgi_cuda_runtime.h"
#include "WarpSumRed.n001.h"
extern "C" __global__ void kernel_(signed char* _pdevglobal)
{
__shared__ __align__(8) volatile long long n_shared[512];
int _i_1;
signed char* et7;
et7 = (((signed char*)(&n_shared))-8)+((long long)(((int)(threadIdx.x+1))*(8)));
*( double*)(et7) = (1.00000000000000000E+0);
__syncthreads();
if( (((int)(threadIdx.x+1))>(256)))  goto _BB_5;
*( double*)(et7) = (*( double*)((((signed char*)(&n_shared))+2040)+((long long)(((int)(threadIdx.x+1))*(8)))))+(*( double*)(et7));
_BB_5: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(128)))  goto _BB_7;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+1016)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_7: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(64)))  goto _BB_9;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+504)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_9: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32)))  goto _BB_11;
_i_1 = ((int)(threadIdx.x+1))*(8);
*( double*)(et7) = ((((((*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+248)+((long long)(_i_1)))))+(*( double*)((((signed char*)(&n_shared))+120)+((long long)(_i_1)))))+(*( double*)((((signed char*)(&n_shared))+56)+((long long)(_i_1)))))+(*( double*)((((signed char*)(&n_shared))+24)+((long long)(_i_1)))))+(*( double*)((((signed char*)(&n_shared))+8)+((long long)(_i_1)))))+(*( double*)(((signed char*)(&n_shared))+((long long)(_i_1))));
_BB_11: ;
if( (((int)(threadIdx.x+1))!=(1)))  goto _BB_13;
*( double*)(_pdevglobal) = (( double*)&n_shared)[0];
_BB_13: ;
}

I think the issues are that the volatile attribute is not being passed on to the et7 pointer and that the reduction is being over-optimized into a single sum. Any ideas what I can do to get this to work? I’m probably going to try declaring a volatile pointer in the CUDA Fortran code that points to the shared array, but I’m not sure how the compiler will treat/optimize that pointer. Right now I’m just hoping it won’t create a non-volatile pointer to my pointer :).

Regards,
Kyle

Hi Kyle,

While “volatile” isn’t getting passed on to the local pointer, the problem persists even when no optimization is applied (i.e. compiling with “-O0 -Mcuda=keep,O0”). In that case only the shared array is used (no local pointer) and “volatile” is applied to it, yet the back-end CUDA compiler still issues the loads and stores to shared memory out of order.

Looking at the CUDA C SDK version, they work around this by adding “if” statements before the stores. The if statements just check the block size, but that is enough to force the loads and stores to stay in order. Applying the same change to the CUDA Fortran version works as well; the relevant section is excerpted just below.
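
For reference, here is the guarded last-warp section, excerpted from the Kernel subroutine in the full example further down (only the comment is added here):

    if (ThreadIdx%X.le.32) then
      ! the blockDim%x tests are what force the volatile shared-memory
      ! loads and stores to be issued in order within the warp
      if (blockDim%x.gt.64) Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+32)
      if (blockDim%x.gt.32) Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+16)
      if (blockDim%x.gt.16) Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+8)
      if (blockDim%x.gt.8)  Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+4)
      if (blockDim%x.gt.4)  Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+2)
      if (blockDim%x.gt.2)  Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+1)
    end if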

However, if you are using a Kepler-class GPU or later, you may wish to consider using the shuffle operations instead. Based on this article: https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/, I wrote the following example, which may be helpful.

% cat warp_red.CUF
#ifndef KIND
#define KIND 8
#endif

module red

use cudafor

integer, parameter :: rkind=KIND
real(rkind),parameter :: initVal = 1.23456789012343456789_rkind

contains

subroutine Run
 use CUDAFOR
 implicit none
 real(rkind) :: Global(1)
 real(rkind),device :: DevGlobal(1)
 real(rkind),device :: DevGlobal2(512)
 real(rkind) :: redVal
 real(rkind), device :: redValD
 integer :: istat
 type(dim3) :: GridDim,BlockDim

   GridDim = Dim3(1,1,1)
   BlockDim = Dim3(512,1,1)

   call Kernel<<<GridDim,BlockDim>>>(DevGlobal)
   istat = cudaGetLastError()
   if (istat.ne.0) print *, cudaGetErrorString(istat)
   Global = DevGlobal
   write(*,'(F32.25)') Global

   DevGlobal2=initVal
   call KernelReduce<<<GridDim,BlockDim>>>(DevGlobal2, 512, redValD)
   istat = cudaDeviceSynchronize()
   istat = cudaGetLastError()
   if (istat.ne.0) print *, cudaGetErrorString(istat)
   redVal = redValD
   write(*,'(F32.25)') redVal

 end subroutine Run

 attributes(global) subroutine Kernel(DevGlobal)
 implicit none
 real(rkind) :: DevGlobal(*)
 real(rkind),shared,volatile :: Shared(512)

   Shared(ThreadIdx%X) = initVal
   call SyncThreads()

   if (ThreadIdx%X.le.256) then
     Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+256)
   end if
   call SyncThreads()
   if (ThreadIdx%X.le.128) then
     Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+128)
   end if
   call SyncThreads()
   if (ThreadIdx%X.le.64) then
     Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+64)
   end if
   call SyncThreads()
   if (ThreadIdx%X.le.32) then
     if (blockDim%x.gt.64) Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+32)
     if (blockDim%x.gt.32) Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+16)
     if (blockDim%x.gt.16) Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+8)
     if (blockDim%x.gt.8)  Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+4)
     if (blockDim%x.gt.4)  Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+2)
     if (blockDim%x.gt.2)  Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+1)
   end if
   call SyncThreads()
   if (ThreadIdx%X.eq.1) DevGlobal(1) = Shared(1)
 end subroutine Kernel

 ! Reduce "val" across the 32 lanes of a warp using shuffle-down;
 ! after the loop, lane 0 of the warp holds the warp-wide sum.
 attributes(device) subroutine WarpReduce(val)
   implicit none
   real(rkind) :: val
   integer :: offset
   offset = warpsize/2
   do
      if (offset.le.0) exit
      val = val + __shfl_down(val,offset)   ! add the value held "offset" lanes away
      offset = offset / 2
   enddo
 end subroutine WarpReduce

 ! Reduce "val" across the whole thread block: each warp reduces its own
 ! values, the per-warp sums are staged in shared memory, and the first
 ! warp then reduces those partial sums. The result is valid in thread 1.
 attributes(device) function BlockReduce(val)
   implicit none
   real(rkind) :: BlockReduce
   real(rkind) :: val
   real(rkind),shared :: svalues(32)
   integer :: tid, lane, wid

   tid = threadIdx%x
   lane = mod(tid-1,warpsize)     ! 0-based lane index within the warp
   wid = (tid-1)/warpsize         ! 0-based warp index within the block
   call warpReduce(val)
   if (lane.eq.0) svalues(wid+1) = val   ! lane 0 of each warp stores its partial sum
   call syncthreads()
   ! the first warp picks up the per-warp partial sums (zero for unused lanes)
   if (tid .le. ((blockdim%x) / warpsize)) then
        val = svalues(lane+1)
   else
        val = 0
   endif
   if (wid.eq.0) call warpReduce(val)
   BlockReduce = val

 end function  BlockReduce

 attributes(global) subroutine KernelReduce(DevGlobal, N, red)
   real(rkind), dimension(*) :: DevGlobal
   integer, value :: N
   real(rkind) :: red
   integer :: tid
   real(rkind) :: val

   tid = (blockidx%x-1)*blockdim%x + threadidx%x
   val = 0                        ! threads past N contribute nothing to the sum
   if (tid .le. N) then
     val = DevGlobal(tid)
   endif
   val = BlockReduce(val)
   if (tid .eq. 1) red = val      ! thread 1 writes the block-wide result
 end subroutine KernelReduce

end module red

 program Prog
  use red
  implicit none
  call Run
 end program Prog
% pgf90 warp_red.CUF -DKIND=8 -Mcuda=cc35 -fast ; a.out
   632.0987597431984568174812011
   632.0987597431984568174812011
- Mat