Hello,
I am performing a sum reduction on the GPU using CUDA. The GPU gives a different value from the CPU, and the GPU result also differs when I change the block size of the kernel launch. Here is the code in Fortran:
!> Host/device data and the block-level sum-reduction kernel.
module test
  implicit none
  integer, parameter :: dp = selected_real_kind(15,307)
  !> Number of elements in the input vector (host copy).
  integer :: n
  real(dp), allocatable, dimension(:) :: Vec, tmp, Vec1
  !> Number of valid elements in Vec_d (device copy); the host must set it
  !> before launching `reduction`.
  integer, device :: n_d
  !> Vec_d: input vector on the device; tmp_d: one partial sum per block.
  real(dp), device, allocatable, dimension(:) :: Vec_d, tmp_d
contains

  !> Sum the slice of Vec_d owned by each thread block and store the block's
  !> partial sum in tmp_d(blockIdx%x).
  !>
  !> Launch requirements: a 1-D grid covering at least n_d elements, and
  !> blockDim%x * (bytes per real(dp)) of dynamic shared memory (third
  !> launch-configuration argument). Any block size is accepted; it does not
  !> have to be a power of two.
  attributes(global) subroutine reduction()
    implicit none
    integer :: i, tid, stride
    real(dp), shared :: sdata(blockDim%x)

    i   = (blockIdx%x-1)*blockDim%x + threadIdx%x
    tid = threadIdx%x

    ! The grid is rounded up to a whole number of blocks, so the last block
    ! can contain threads with i > n_d. Those must contribute zero instead of
    ! reading past the end of Vec_d.
    if (i <= n_d) then
      sdata(tid) = Vec_d(i)
    else
      sdata(tid) = 0.0_dp
    end if

    ! Start from the smallest power of two that covers half of the block, so
    ! that no element is dropped when blockDim%x is not a power of two.
    ! For power-of-two block sizes this is exactly blockDim%x/2.
    stride = 1
    do while (2*stride < blockDim%x)
      stride = 2*stride
    end do

    do while (stride >= 1)
      ! Make the previous level's writes visible before reading them.
      call syncthreads()
      ! Second condition skips partners that lie beyond the block
      ! (only possible for non-power-of-two block sizes).
      if (tid <= stride .and. tid + stride <= blockDim%x) then
        sdata(tid) = sdata(tid) + sdata(tid + stride)
      end if
      stride = stride/2
    end do

    call syncthreads()
    if (tid == 1) tmp_d(blockIdx%x) = sdata(1)
  end subroutine reduction

end module test

!> Read a vector from 'vector.dat', sum it on the GPU (per-block reduction
!> followed by a host-side sum of the partial results) and on the CPU, and
!> print both sums.
program main
  use cudafor, only: cudaGetLastError, cudaGetErrorString, cudaSuccess
  use test
  implicit none
  integer :: i, threads, blocks
  integer :: u, ios, istat, shmem_bytes
  real(dp) :: total

  n = 438048
  allocate(Vec(n))
  allocate(Vec_d(n))

  open(newunit=u, file='vector.dat', status='old', action='read', iostat=ios)
  if (ios /= 0) then
    print *, 'cannot open vector.dat, iostat = ', ios
    error stop 1
  end if
  do i = 1, n
    read(u, *, iostat=ios) Vec(i)
    if (ios /= 0) then
      print *, 'read error in vector.dat at record ', i, ', iostat = ', ios
      error stop 1
    end if
  end do
  close(u)

  ! Copy the data and its length to the device; the kernel uses n_d to mask
  ! out the padding threads of the last block.
  Vec_d = Vec
  n_d   = n

  threads = 256
  blocks  = (n-1)/threads + 1
  allocate(tmp(blocks))
  allocate(tmp_d(blocks))

  ! One real(dp) of dynamic shared memory per thread, sized from the kind
  ! rather than a hard-coded 8 bytes.
  shmem_bytes = threads*(storage_size(total)/8)
  call reduction<<<blocks, threads, shmem_bytes, 0>>>()
  istat = cudaGetLastError()
  if (istat /= cudaSuccess) then
    print *, 'kernel launch failed: ', trim(cudaGetErrorString(istat))
    error stop 1
  end if

  tmp = tmp_d

  ! NOTE: the GPU and CPU sums are not expected to agree bit-for-bit.
  ! Floating-point addition is not associative; the GPU adds the elements in
  ! a per-block tree order and the host then adds the block results, while
  ! the CPU loop below adds strictly left to right. Changing the block size
  ! changes the grouping and therefore the rounding, so the last few digits
  ! vary. Compare the results with a relative tolerance, not for equality.
  total = 0.0_dp
  do i = 1, blocks
    total = total + tmp(i)
  end do
  print*, 'sum gpu: ', total

  total = 0.0_dp
  do i = 1, n
    total = total + Vec(i)
  end do
  print*, 'sum cpu: ', total

  print*, 'Blocks: ', blocks
  print*, 'Threads: ', threads
end program main
The input vector is extracted from a different program, uploaded here,
(File on MEGA)
Sample output for several block sizes (compiled with: pgfortran -Mr8 code.cuf):
sum gpu: 9.9229944015915367E+028
sum cpu: 9.9229944015916106E+028
Blocks: 3423
Threads: 128
sum gpu: 9.9229944015915402E+028
sum cpu: 9.9229944015916106E+028
Blocks: 6845
Threads: 64
sum gpu: 9.9229944015915438E+028
sum cpu: 9.9229944015916106E+028
Blocks: 13689
Threads: 32
The results are from Tesla V100 and Intel Xeon CPU. Why are the results different?
Any advice would be appreciated.
Thank you.