Sorry for reviving an old thread, but I decided to create the most simplified form of the code to test acc routines that I could. Since this is completely self-contained there’s no possibility of data corruption.
program cuBLAS_example
use iso_c_binding, only: c_double, c_int
use cublas
use openacc
implicit none
integer, parameter :: nelems = 50000
integer, parameter :: Np = 16, Nvar = 4, Nflux = 16
integer, parameter :: chunkSize = 10
type mesh2d
real(c_double) :: u(Np, Nvar, nelems)
real(c_double) :: ux(Np, Nvar, nelems)
real(c_double) :: uflux(Nflux, Nvar, nelems)
real(c_double) :: ucommon(Nflux, Nvar, nelems)
end type mesh2d
real(c_double) :: Lgrad(Np, Np), phi(Np, Nflux)
real(c_double) :: uelem_tmp(Np, Nvar*chunkSize), ucorr_tmp(Nflux, Nvar*chunkSize)
real(c_double) :: ugrad_tmp(Np, Nvar*chunkSize)
type(mesh2d) :: mesh
integer(c_int) :: i, j, k, iter
do i = 1, nelems
do j = 1, Nvar
do k = 1, Np
mesh%u(k, j, i) = i+j+k
end do
do k = 1, Nflux
mesh%uflux(k, j, i) = i-j-k
mesh%ucommon(k, j, i) = i-j+k
end do
end do
end do
do j = 1, Np
do k = 1, Np
Lgrad(j, k) = k*j
end do
do k = 1, Nflux
phi(j, k) = k*j + k + j
end do
end do
!$acc enter data copyin(mesh)
!$acc enter data copyin(mesh%u, mesh%uflux, mesh%ucommon)
!$acc enter data copyin(mesh%ux)
!$acc enter data create(uelem_tmp, ucorr_tmp, ugrad_tmp, Lgrad, phi)
do iter = 1, nelems/chunkSize
!$acc parallel loop present(mesh, mesh%u, uelem_tmp)
do i = 1, chunkSize
do j = 1, Nvar
do k = 1, Np
uelem_tmp(k, Nvar*(i-1) + j) = mesh%u(k, j, (iter-1)*chunkSize + i)
end do
end do
end do
!$acc end parallel
!$acc parallel loop present(mesh, mesh%uflux, mesh%ucommon, ucorr_tmp)
do i = 1, chunkSize
do j = 1, Nvar
do k = 1, Nflux
ucorr_tmp(k, Nvar*(i-1) + j) = mesh%uflux(k, j, (iter-1)*chunkSize + i) &
- mesh%ucommon(k, j, (iter-1)*chunkSize + i)
end do
end do
end do
!$acc end parallel
!$acc host_data use_device(Lgrad, uelem_tmp, ugrad_tmp)
call cublasDGEMM('N', 'N', Np, Nvar*chunkSize, Np, &
1.0_c_double, Lgrad, Np, uelem_tmp, Np, &
0.0_c_double, ugrad_tmp, Np)
!$acc end host_data
!$acc host_data use_device(phi, uelem_tmp, ugrad_tmp)
call cublasDGEMM('N', 'N', Np, Nvar*chunkSize, Np, &
1.0_c_double, phi, Np, uelem_tmp, Np, &
1.0_c_double, ugrad_tmp, Np)
!$acc end host_data
!$acc parallel loop present(mesh, mesh%ux, ugrad_tmp)
do i = 1, chunkSize
do j = 1, Nvar
do k = 1, Nflux
mesh%ux(k, j, (iter-1)*chunkSize + i) = ugrad_tmp(k, Nvar*(i-1) + j)
end do
end do
end do
!$acc end parallel
end do
!$acc update self(mesh%ux)
!$acc exit data delete(mesh%ucommon, mesh%uflux, mesh%u, mesh%ux)
!$acc exit data delete(uelem_tmp, ugrad_tmp, ucorr_tmp)
!$acc exit data
end program cuBLAS_example
Here’s the compiler output
pgfortran -o speed -acc -Minfo=accel speedup.f90 -L/usr/local/cuda/lib64 -lcublas -Mcuda
cublas_example:
49, Generating enter data copyin(mesh)
50, Generating enter data copyin(mesh%ucommon(:,:,:),mesh%uflux(:,:,:),mesh%u(:,:,:))
51, Generating enter data copyin(mesh%ux(:,:,:))
52, Generating enter data create(phi(:,:),lgrad(:,:),ugrad_tmp(:,:),ucorr_tmp(:,:),uelem_tmp(:,:))
55, Generating present(mesh,mesh%u(:,:,:),uelem_tmp(:,:))
Accelerator kernel generated
Generating Tesla code
56, !$acc loop gang ! blockidx%x
58, !$acc loop vector(128) ! threadidx%x
57, Loop is parallelizable
58, Loop is parallelizable
65, Generating present(mesh,mesh%uflux(:,:,:),mesh%ucommon(:,:,:),ucorr_tmp(:,:))
Accelerator kernel generated
Generating Tesla code
66, !$acc loop gang ! blockidx%x
68, !$acc loop vector(128) ! threadidx%x
67, Loop is parallelizable
68, Loop is parallelizable
92, Generating present(mesh,mesh%ux(:,:,:),ugrad_tmp(:,:))
Accelerator kernel generated
Generating Tesla code
93, !$acc loop gang ! blockidx%x
Accelerator restriction: scalar variable live-out from loop: mesh
94, Accelerator restriction: scalar variable live-out from loop: mesh
95, Accelerator restriction: scalar variable live-out from loop: mesh
104, Generating update self(mesh%ux(:,:,:))
106, Generating exit data delete(mesh%ux(:,:,:),mesh%u(:,:,:),mesh%uflux(:,:,:),mesh%ucommon(:,:,:))
107, Generating exit data delete(ucorr_tmp(:,:),ugrad_tmp(:,:),uelem_tmp(:,:))
and here’s the output
(null) lives at 0x608500 size 25600000 present
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 3.0
host:0x608500 device:0x700680000 size:102400000 presentcount:1 line:49 name:mesh
FATAL ERROR: variable in data clause was already present on device 1: name=(null)
file:/home/Experiments/Nvidia/OpenACC/acc_demo/speedup.f90 cublas_example line:50
I guess if all errors can be sorted out at this level, then I can build everything else out of this.[/code]