Hi All,
Does anyone know where I can find a CUDA FORTRAN shared memory warp-level sum reduction example for compute capability 2.0 and higher? I’ve provided an attempt below which isn’t working. I’ve first shown an example which does what should be unnecessary thread synchronization for threads less than or equal to 32 (1 warp). This gives the correct answer. In the second example, I add the volatile attribute to the shared array and remove the synchronization, but this causes an incorrect answer.
kfernandes@kepler:/scratch/kfernandes/gpu
-254-$ cat SumRed.f90
subroutine Run
use CUDAFOR
implicit none
real*8 :: Global(1)
real*8,device :: DevGlobal(1)
type(dim3) :: GridDim,BlockDim
GridDim = Dim3(1,1,1)
BlockDim = Dim3(512,1,1)
call Kernel<<<GridDim,BlockDim>>>(DevGlobal)
Global = DevGlobal
print *,Global
end subroutine Run
attributes(global) subroutine Kernel(DevGlobal)
implicit none
real*8 :: DevGlobal(*)
real*8,shared :: Shared(512)
Shared(ThreadIdx%X) = 1.0d0
call SyncThreads()
if (ThreadIdx%X.le.256) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+256)
end if
call SyncThreads()
if (ThreadIdx%X.le.128) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+128)
end if
call SyncThreads()
if (ThreadIdx%X.le.64) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+64)
end if
call SyncThreads()
if (ThreadIdx%X.le.32) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+32)
end if
call SyncThreads()
if (ThreadIdx%X.le.32) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+16)
end if
call SyncThreads()
if (ThreadIdx%X.le.32) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+8)
end if
call SyncThreads()
if (ThreadIdx%X.le.32) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+4)
end if
call SyncThreads()
if (ThreadIdx%X.le.32) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+2)
end if
call SyncThreads()
if (ThreadIdx%X.le.32) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+1)
end if
if (ThreadIdx%X.eq.1) then
DevGlobal(1) = Shared(1)
end if
end subroutine Kernel
program Prog
implicit none
call Run
end program Prog
compiled with
kfernandes@kepler:/scratch/kfernandes/gpu
-255-$ pgf90 -fast -O3 -Mcuda=cuda6.0,cc35,keepgpu SumRed.f90
results in
kfernandes@kepler:/scratch/kfernandes/gpu
-256-$ ./a.out
512.0000000000000
which is correct and the corresponding .gpu file is
kfernandes@kepler:/scratch/kfernandes/gpu
-276-$ cat SumRed.n001.gpu
#include "cuda_runtime.h"
#include "pgi_cuda_runtime.h"
#include "SumRed.n001.h"
extern "C" __global__ void kernel_(signed char* _pdevglobal)
{
__shared__ __align__(8) long long n_shared[512];
signed char* et7;
et7 = (((signed char*)(&n_shared))-8)+((long long)(((int)(threadIdx.x+1))*(8)));
*( double*)(et7) = (1.00000000000000000E+0);
__syncthreads();
if( (((int)(threadIdx.x+1))>(256))) goto _BB_5;
*( double*)(et7) = (*( double*)((((signed char*)(&n_shared))+2040)+((long long)(((int)(threadIdx.x+1))*(8)))))+(*( double*)(et7));
_BB_5: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(128))) goto _BB_7;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+1016)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_7: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(64))) goto _BB_9;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+504)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_9: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32))) goto _BB_11;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+248)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_11: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32))) goto _BB_13;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+120)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_13: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32))) goto _BB_15;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+56)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_15: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32))) goto _BB_17;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+24)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_17: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32))) goto _BB_19;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+8)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_19: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32))) goto _BB_21;
*( double*)(et7) = (*( double*)(et7))+(*( double*)(((signed char*)(&n_shared))+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_21: ;
if( (((int)(threadIdx.x+1))!=(1))) goto _BB_23;
*( double*)(_pdevglobal) = (( double*)&n_shared)[0];
_BB_23: ;
}
However,
kfernandes@kepler:/scratch/kfernandes/gpu
-279-$ cat WarpSumRed.f90
subroutine Run
use CUDAFOR
implicit none
real*8 :: Global(1)
real*8,device :: DevGlobal(1)
type(dim3) :: GridDim,BlockDim
GridDim = Dim3(1,1,1)
BlockDim = Dim3(512,1,1)
call Kernel<<<GridDim,BlockDim>>>(DevGlobal)
Global = DevGlobal
print *,Global
end subroutine Run
attributes(global) subroutine Kernel(DevGlobal)
implicit none
real*8 :: DevGlobal(*)
real*8,shared,volatile :: Shared(512)
Shared(ThreadIdx%X) = 1.0d0
call SyncThreads()
if (ThreadIdx%X.le.256) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+256)
end if
call SyncThreads()
if (ThreadIdx%X.le.128) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+128)
end if
call SyncThreads()
if (ThreadIdx%X.le.64) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+64)
end if
call SyncThreads()
if (ThreadIdx%X.le.32) then
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+32)
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+16)
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+8)
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+4)
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+2)
Shared(ThreadIdx%X) = Shared(ThreadIdx%X)+Shared(ThreadIdx%X+1)
end if
if (ThreadIdx%X.eq.1) then
DevGlobal(1) = Shared(1)
end if
end subroutine Kernel
program Prog
implicit none
call Run
end program Prog
compiled with
kfernandes@kepler:/scratch/kfernandes/gpu
-255-$ pgf90 -fast -O3 -Mcuda=cuda6.0,cc35,keepgpu WarpSumRed.f90
results in
kfernandes@kepler:/scratch/kfernandes/gpu
-256-$ ./a.out
56.00000000000000
which is not correct and the corresponding .gpu file is
kfernandes@kepler:/scratch/kfernandes/gpu
-283-$ cat WarpSumRed.n001.gpu
#include "cuda_runtime.h"
#include "pgi_cuda_runtime.h"
#include "WarpSumRed.n001.h"
extern "C" __global__ void kernel_(signed char* _pdevglobal)
{
__shared__ __align__(8) volatile long long n_shared[512];
int _i_1;
signed char* et7;
et7 = (((signed char*)(&n_shared))-8)+((long long)(((int)(threadIdx.x+1))*(8)));
*( double*)(et7) = (1.00000000000000000E+0);
__syncthreads();
if( (((int)(threadIdx.x+1))>(256))) goto _BB_5;
*( double*)(et7) = (*( double*)((((signed char*)(&n_shared))+2040)+((long long)(((int)(threadIdx.x+1))*(8)))))+(*( double*)(et7));
_BB_5: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(128))) goto _BB_7;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+1016)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_7: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(64))) goto _BB_9;
*( double*)(et7) = (*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+504)+((long long)(((int)(threadIdx.x+1))*(8)))));
_BB_9: ;
__syncthreads();
if( (((int)(threadIdx.x+1))>(32))) goto _BB_11;
_i_1 = ((int)(threadIdx.x+1))*(8);
*( double*)(et7) = ((((((*( double*)(et7))+(*( double*)((((signed char*)(&n_shared))+248)+((long long)(_i_1)))))+(*( double*)((((signed char*)(&n_shared))+120)+((long long)(_i_1)))))+(*( double*)((((signed char*)(&n_shared))+56)+((long long)(_i_1)))))+(*( double*)((((signed char*)(&n_shared))+24)+((long long)(_i_1)))))+(*( double*)((((signed char*)(&n_shared))+8)+((long long)(_i_1)))))+(*( double*)(((signed char*)(&n_shared))+((long long)(_i_1))));
_BB_11: ;
if( (((int)(threadIdx.x+1))!=(1))) goto _BB_13;
*( double*)(_pdevglobal) = (( double*)&n_shared)[0];
_BB_13: ;
}
I think the issues are that the volatile attribute is not being passed onto the et7 pointer and that the reduction is being over optimised into a single sum. Any ideas what I can do to get this to work? I’m probably going to try declaring a volatile pointer in the CUDA FORTRAN code which points to the shared array, but I’m not sure how the compiler is going to treat/optimize the pointer. Right now I’m just hoping it won’t create a non-volatile pointer to my pointer :).
Regards,
Kyle