Hello
I have a source code.
module test_gpu
use cudafor
contains
attributes(global) subroutine force_gpu(q,UEV1_m, block,koef,m3)
real,shared :: bsum(32)
real :: q(96),UEV1_m(block)
integer, value :: block,m3
real,value :: koef
integer :: i,tindex,tneighbor
i = (blockidx%x-1)*blockdim%x + threadidx%x
if (i .LE. m3) then
Ud_i=0
DO 3 J=1,m3
if (i.ne.j) then
aq=koef/2
Udj=aq
Ud_i= Ud_i + Udj
endif
call syncthreads()
3 continue
UEV1 =UEV1 + Ud_i
end if
tindex = threadIdx%x
bsum(tindex) = UEV1
call syncthreads()
tneighbor=16
do while( tneighbor >= 1 )
if(tindex<=tneighbor) then
bsum(tindex)=bsum(tindex)+bsum(tindex+tneighbor)
endif
tneighbor = tneighbor / 2
call syncthreads()
end do
if( tindex == 1 ) then
UEV1_m(blockidx%x) = bsum(1)
endif
call syncthreads()
endsubroutine
endmodule
program test
use test_gpu
real,device :: q_d(96)
real :: koef
real :: q(96)
integer :: block
real,device,allocatable :: UEV1_m(:)
real,allocatable :: UEV1_cp(:)
koef = 1.30462647E-04
m3=96
do 10 z = 1, 5000
block=(m3/32)+1
UEV1=0
UEV2=0
allocate(UEV1_m(block),UEV1_cp(block))
call force_gpu <<<block,32>>>(q_d,UEV1_m,block,koef,m3)
cudasync = cudaThreadSynchronize()
UEV1_cp = UEV1_m
do i = 1,block
UEV1=UEV1+UEV1_cp(i)
enddo
deallocate(UEV1_m,UEV1_cp)
do 2 i = 1,m3
Ud_i=0
DO 3 J=1,m3
if (i.ne.j) then
aq=koef/2
Udj=aq
Ud_i= Ud_i + Udj
endif
3 continue
UEV2 =UEV2 + Ud_i
2 continue
10 continue
write (*,*), UEV1
write (*,*), UEV2
end program
Results:
UEV1 = 0.5949091
UEV2 = 0.5949095
I do not understand why the results are different?