Hi Mat,
Your example works “ok” for me as well.
After several hours I have detected where the error in my program comes from, but it seems very strange.
If I run the code below with -Mcuda=emu, I get the good results.
If I run the code without “emu”, the program gives wrong results.
But the really strange thing is that if I erase the first subroutine in the file (the one called “force_over_my_particule”), everything works without “emu”!
And this is even more strange, because as you can see this subroutine
is not called from the kernell! (it’s commented!)
Thanks a lot for your help!
Alberto.
module foo
use cudafor
type float4
real :: x !!! X-position
real :: y !!! Y-position
real :: z !!! Z-position
real :: d !!! diameter
end type float4
type float3
real :: x !!! X-position
real :: y !!! Y-position
real :: z !!! Z-position
end type float3
contains
attributes(device) subroutine force_over_my_particule(X_shared,grad_shared,Npart)
type(float4), shared, dimension(8), intent(in) :: X_shared
type(float3), shared, dimension(8), intent(out) :: grad_shared
integer, value, intent(in) :: Npart
real :: force
real :: dx, dy, dz, r, invr
real :: diam, invdiam, ptemp, fr
integer :: i
type(float3) :: temp_grad
temp_grad%x = 0. !! No olvidar el "punto" al final.
temp_grad%y = 0.
temp_grad%z = 0.
do i=1,Npart
dx = X_Shared(threadidx%x)%x - X_Shared(i)%x
dx = dx-nint(dx)
dy = X_Shared(threadidx%x)%y - X_Shared(i)%y
dy = dy - nint(dy)
dz = X_Shared(threadidx%x)%z - X_Shared(i)%z
dz = dz - nint(dz)
r = sqrt(dx*dx+dy*dy+dz*dz)
invr = 1.0D0/r
diam = (X_Shared(threadidx%x)%d + X_Shared(i)%d)/2.0D0
invdiam = 1.0D0/diam
if ((r > 0) .AND. (r<diam)) then
ptemp = (1.0D0- r * invdiam)
fr= invdiam*invr*ptemp
temp_grad%x = temp_grad%x - fr*dx
temp_grad%y = temp_grad%y - fr*dy
temp_grad%z = temp_grad%z - fr*dz
end if
enddo
grad_shared(threadidx%x)%x=temp_grad%x
grad_shared(threadidx%x)%y=temp_grad%y
grad_shared(threadidx%x)%z=temp_grad%z
end subroutine force_over_my_particule
attributes(global) subroutine steepest_kernell(X_global,Nmin,Npart,step_size)
integer, value :: Nmin, Npart
real, value :: step_size
type(float4), device, dimension(Npart,Nmin) :: X_global
type(float4), shared, dimension(8) :: X_shared
type(float3), shared, dimension(8) :: grad_shared
type(float3) :: temp
!!! Copy to shared memory
X_shared(threadidx%x)%x = X_global(threadidx%x,blockidx%x)%x
X_shared(threadidx%x)%y = X_global(threadidx%x,blockidx%x)%y
X_shared(threadidx%x)%z = X_global(threadidx%x,blockidx%x)%z
X_shared(threadidx%x)%d = X_global(threadidx%x,blockidx%x)%d
call syncthreads()
!!! Compute the gradient
grad_shared(threadidx%x)%x = step_size
grad_shared(threadidx%x)%y = step_size
grad_shared(threadidx%x)%z = step_size
! call force_over_my_particule(X_shared,grad_shared,Npart)
!!! end compute gradient
call syncthreads()
temp%x = X_shared(threadidx%x)%x - step_size*grad_shared(threadidx%x)%x
temp%y = X_shared(threadidx%x)%y - step_size*grad_shared(threadidx%x)%y
temp%z = X_shared(threadidx%x)%z - step_size*grad_shared(threadidx%x)%z
call syncthreads()
X_global(threadidx%x,blockidx%x)%x = temp%x
X_global(threadidx%x,blockidx%x)%y = temp%y
X_global(threadidx%x,blockidx%x)%z = temp%z
end subroutine steepest_kernell
subroutine minimize_steepest_gpu(Npart,Nmin,X0,Xf,step_size)
integer :: Npart
integer :: Nmin
real :: step_size
type(float4), dimension(Npart,Nmin) :: X0, Xf
type(float4), device, dimension(Npart,Nmin) :: X_global
X_global=X0
call steepest_kernell<<<Nmin,Npart>>>(X_global,Nmin,Npart,step_size)
Xf = X_global
end subroutine minimize_steepest_gpu
subroutine initialize_positions(X0)
type(float4), dimension(:,:), intent(inout) :: X0
integer :: i,j
real :: ran
do i=1, size(X0(1,:))
do j=1,size(X0(:,1))
call random_number(ran)
X0(j,i)%x = ran-0.5
call random_number(ran)
X0(j,i)%y = ran-0.5
X0(j,i)%z = 0
enddo
enddo
end subroutine initialize_positions
subroutine initialize_diameters(X0,r,PHI)
type(float4), dimension(:,:), intent(inout) :: X0
real, intent(in) :: r,PHI
real, parameter :: PI=3.14159265358979D0
real :: d1,d2
integer :: npart, nmin, i, j
nmin = size(X0(1,:))
npart = size(X0(:,1))
d1 = sqrt(8*PHI/(PI*npart*(1+r*r)))
d2 = d1*r
do i=1,npart
if (i<=npart/2) then
X0(i,:)%d = d1
else
X0(i,:)%d = d2
end if
end do
end subroutine initialize_diameters
end module foo
program test
use foo
integer, parameter :: Nmin = 1
integer, parameter :: Npart = 8
real,parameter :: step_size = 1.0
type(float4), dimension(Npart,Nmin) :: X0, Xf !!! Posiciones iniciales
real, parameter :: phi = 0.9 !!! Volume ration
real, parameter :: r = 1.4 !!! Diameter ratio
call initialize_positions(X0)
call initialize_diameters(X0,r,phi)
write(*,*) 'x0'
write(*,*) X0(:,1)
call minimize_steepest_gpu(Npart,Nmin,X0,Xf,step_size)
write(*,*) 'xf'
write(*,*), Xf(:,1)
end program test