Hi qijiin21c,
You’ll want to make the CUDA Fortran device allocatable arrays OpenMP threadprivate.
For example, I modified our same sgemm code to include OpenMP.
% cat sgemm.cuf
!
! Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
!
! NVIDIA CORPORATION and its licensors retain all intellectual property
! and proprietary rights in and to this software, related documentation
! and any modifications thereto. Any use, reproduction, disclosure or
! distribution of this software and related documentation without an express
! license agreement from NVIDIA CORPORATION is strictly prohibited.
!
!
! An example of single precision matrix multiply
! Build for running without optimizations:
! pgfortran sgemm.cuf
! Build for running with optimizations:
! pgfortran -O2 sgemm.cuf
!
MODULE saxpy_sgemm
CONTAINS
attributes(device) subroutine saxpy16(a, b, c)
real, device :: a
real, dimension(16) :: b
real, device, dimension(16) :: c
c = c + a * b
end subroutine
attributes(global) subroutine sgemmNN_16x16(a, b, c, m, n, k, alpha, beta)
real, device :: a(m,*), b(k,*), c(m,*)
integer, value :: m, n, k
real, value :: alpha, beta
real, shared, dimension(17,16) :: bs
real, device, dimension(16) :: cloc
inx = threadidx%x
iny = threadidx%y
ibx = (blockidx%x-1) * 256
iby = (blockidx%y-1) * 16
ia = ibx + (iny-1)*16 + inx
ib = inx
ic = ia
jb = iby + iny
jc = iby + 1
cloc = 0.0
do ik = 1, k, 16
bs(iny,inx) = b(ib,jb)
call syncthreads()
do j = 1, 16
call saxpy16(a(ia,ik+j-1), bs(1,j), cloc)
end do
ib = ib + 16
call syncthreads()
end do
do ii = 1, 16
c(ic,jc+ii-1) = alpha*cloc(ii) + beta*c(ic,jc+ii-1)
end do
end subroutine
END MODULE
subroutine sgemm_cpu(a, b, c, m, n, k, alpha, beta)
real, dimension(m,k) :: a
real, dimension(k,n) :: b
real, dimension(m,n) :: c
real alpha, beta
do im = 1, m
do in = 1, n
temp = 0.0
do ik = 1, k
temp = temp + a(im,ik) * b(ik,in)
end do
c(im,in) = alpha*temp + beta*c(im,in)
end do
end do
end subroutine
program main
use cudafor
use saxpy_sgemm
use omp_lib
integer, parameter :: N = 256
integer, parameter :: NREPS = 1000
! matrix data
real, dimension(N,N) :: A, B, C, gold
real, allocatable, device, dimension(:,:) :: dA, dB, dC
!$omp threadprivate(dA, dB, dC, C)
!
real alpha, beta
type(cudaDeviceProp) :: prop
type(cudaEvent) :: start, stop
type(dim3) :: blocks
type(dim3) :: threads
integer idevice, nGPUs
idevice = 0
call random_number(A)
call random_number(B)
alpha = 1
beta = 0
m = N
k = N
call sgemm_cpu(A, B, gold, m, N, k, alpha, beta)
istat = cudaGetDeviceCount(nGPUs)
print *, "Num devices: ", nGPUs
call omp_set_num_threads(nGPUs)
!$omp parallel private(idevice,istat,blocks,threads,start,stop,time,nerrors)
idevice = omp_get_thread_num()
print *, "Using device: ", idevice
istat = cudaSetDevice(idevice)
istat = cudaGetDeviceProperties(prop,idevice)
ilen = verify(prop%name, ' ', .true.)
write (*,900) prop%name(1:ilen), &
real(prop%clockRate)/1000.0, &
real(prop%totalGlobalMem)/1024.0/1024.0
istat = cudaEventCreate(start)
istat = cudaEventCreate(stop)
allocate(dA(N,N))
allocate(dB(N,N))
allocate(dC(N,N))
dA = A
dB = B
dC = 0.0
blocks = dim3(N/256, N/16, 1)
threads = dim3(16, 16, 1)
! timing experiment
time = 0.0
istat = cudaEventRecord(start, 0)
do j = 1, NREPS
call sgemmNN_16x16<<<blocks, threads>>>(dA, dB, dC, m, N, k, alpha, beta)
end do
istat = cudaEventRecord(stop, 0)
istat = cudaThreadSynchronize()
istat = cudaEventElapsedTime(time, start, stop)
time = time / (NREPS*1.0e3)
C = dC
nerrors = 0
do j = 1, N
do i = 1, N
if (abs(gold(i,j) - C(i,j)) .gt. 1.0e-4) then
nerrors = nerrors + 1
end if
end do
end do
if (nerrors .eq. 0) then
print *,"Test PASSED"
else
print *,"Test FAILED"
print *,nerrors," errors were encountered"
endif
gflops = 2.0 * N * N * N/time/1e9
write (*,901) m,k,k,N,time*1.0e3,gflops
900 format('\nDevice:',a,', ',f6.1,' MHz clock, ',f6.1,' MB memory.\n')
901 format(i0,'x',i0,' * ',i0,'x',i0,':\t',f8.3,' ms\t',f8.3,' GFlops/s')
!$omp end parallel
end program
% pgfortran sgemm.cuf -Mcuda=cc60 -mp -fast; a.out
Num devices: 2
Using device: 0
Using device: 1
Device:Tesla P100-PCIE-16GB, 1328.5 MHz clock, ****** MB memory.
Device:Tesla P100-PCIE-16GB, 1328.5 MHz clock, ****** MB memory.
Test PASSED
256x256 * 256x256: 0.060 ms 562.581 GFlops/s
Test PASSED
256x256 * 256x256: 0.060 ms 558.238 GFlops/s