Hello
I want to test the dynamic parallelism in PVF 13.9. The file dgemmdynamic.cuf contains the source code
MODULE dynamic_dgemm
CONTAINS
attributes(global) subroutine add16(a, lda, b, ldb, c, ldc, n)
double precision, device :: a(lda,*), b(ldb,*), c(ldc,*)
integer, value :: lda, ldb, ldc
integer, value :: n
double precision, dimension(4) :: as, bs
inx = threadidx%x
iny = threadidx%y
ibx = (blockidx%x-1) * 256
ia = ibx + (iny-1)*16 + inx
do ij = 1, n, 4
as(1) = a(ia,ij)
as(2) = a(ia,ij+1)
as(3) = a(ia,ij+2)
as(4) = a(ia,ij+3)
bs(1) = b(ia,ij)
bs(2) = b(ia,ij+1)
bs(3) = b(ia,ij+2)
bs(4) = b(ia,ij+3)
as(1) = as(1) + bs(1)
as(2) = as(2) + bs(2)
as(3) = as(3) + bs(3)
as(4) = as(4) + bs(4)
c(ia,ij) = as(1)
c(ia,ij+1) = as(2)
c(ia,ij+2) = as(3)
c(ia,ij+3) = as(4)
end do
end subroutine
attributes(global) subroutine dgemm16(a, lda, b, ldb, c, ldc, m, n, k)
double precision, device :: a(lda,*), b(ldb,*), c(ldc,*)
integer, value :: lda, ldb, ldc
integer, value :: m, n, k
double precision, shared, dimension(16,16) :: bs
double precision, device :: cloc(16), ax
inx = threadidx%x
iny = threadidx%y
ibx = (blockidx%x-1) * 256
iby = (blockidx%y-1) * 16
ia = ibx + (iny-1)*16 + inx
ib = inx
ic = ia
jb = iby + iny
jc = iby + 1
do i = 1, 16
cloc(i) = 0.0d0
end do
do ik = 1, k, 16
bs(iny,inx) = b(ib,jb)
call syncthreads()
do j = 1, 16
ax = a(ia,ik+j-1)
do i = 1, 16
cloc(i) = cloc(i) + ax * bs(i,j)
end do
end do
ib = ib + 16
call syncthreads()
end do
do i = 1, 16
c(ic,jc+i-1) = cloc(i)
end do
call syncthreads()
end subroutine
attributes(global) subroutine dgemmdriver(a, b, c, m, n, k)
integer, value :: m, n, k
double precision, device :: a(m,*), b(k,*), c(m,*)
double precision, device, allocatable :: m1(:,:), m2(:,:), m3(:,:), m4(:,:)
double precision, device, allocatable :: m5(:,:), m6(:,:), m7(:,:), m8(:,:)
type(dim3), device :: devthreads, devblocks
i = threadIdx%x
if (i.eq.1) then
newn = n / 2
allocate(m1(1:m/2,1:k/2))
allocate(m2(1:k/2,1:n/2))
allocate(m3(1:m/2,1:k/2))
allocate(m4(1:k/2,1:n/2))
allocate(m5(1:m/2,1:k/2))
allocate(m6(1:k/2,1:n/2))
allocate(m7(1:m/2,1:k/2))
allocate(m8(1:k/2,1:n/2))
devblocks = dim3(newn/256, newn/16, 1)
devthreads = dim3(16, 16, 1)
call dgemm16<<<devblocks,devthreads>>>(a(1,1), m, b(1,1), k, &
m1(1,1), newn, newn, newn, newn)
call dgemm16<<<devblocks,devthreads>>>(a(1,1+k/2), m, b(1+k/2,1), k, &
m2(1,1), newn, newn, newn, newn)
call dgemm16<<<devblocks,devthreads>>>(a(1,1), m, b(1,1+n/2), k, &
m3(1,1), newn, newn, newn, newn)
call dgemm16<<<devblocks,devthreads>>>(a(1,1+k/2), m, b(1+k/2,1+n/2), k, &
m4(1,1), newn, newn, newn, newn)
call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1), m, b(1,1), k, &
m5(1,1), newn, newn, newn, newn)
call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1+k/2), m, b(1+k/2,1), k, &
m6(1,1), newn, newn, newn, newn)
call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1), m, b(1,1+n/2), k, &
m7(1,1), newn, newn, newn, newn)
call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1+k/2), m, b(1+k/2,1+n/2), k, &
m8(1,1), newn, newn, newn, newn)
istat = cudaDeviceSynchronize()
call add16<<<1,devthreads>>>(m1, newn, m2, newn, c(1,1), m, newn)
call add16<<<1,devthreads>>>(m3, newn, m4, newn, c(1,1+n/2), m, newn)
call add16<<<1,devthreads>>>(m5, newn, m6, newn, c(1+m/2,1), m, newn)
call add16<<<1,devthreads>>>(m7, newn, m8, newn, c(1+m/2,1+n/2), m, newn)
istat = cudaDeviceSynchronize()
!deallocate(m1,m2,m3,m4,m5,m6,m7,m8)
deallocate(m1)
end if
return
end subroutine
END MODULE
program main
use dynamic_dgemm
use cudafor
integer, parameter :: N = 512
integer, parameter :: NREPS = 100
! matrix data
real(8), dimension(N,N) :: A, B, C
real(8), allocatable, device, dimension(:,:) :: dA, dB, dC
real(8) gold, RR(N), RQ(N)
type(cudaEvent) :: start, stop
type(dim3) :: blocks
type(dim3) :: threads
istat = cudaEventCreate(start)
istat = cudaEventCreate(stop)
j = 1
bv = -127.0d0
do i = 1, N/2
B(i,j) = 2.0d0 ** bv
bv = bv + 1.0d0
B(N-i+1,j) = -B(i,j)
end do
call random_number(rr)
A(:,1) = rr
do j = 2, N
RQ = B(:,1)
call random_number(rr)
nn = N - 1
do i = 1, N
ival = int(rr(j) * nn + 1.0d0)
B(i,j) = rq(ival)
do k = ival, nn
rq(k) = rq(k+1)
end do
nn = nn - 1
A(i,j) = A(i,1)
end do
end do
allocate(dA(N,N))
allocate(dB(N,N))
allocate(dC(N,N))
dA = A
dB = B
dC = 4.0d0
m = N
k = N
! timing experiment
call dgemmdriver<<<1, 1>>>(dA, dB, dC, m, N, k)
time = 0.d0
istat = cudaEventRecord(start, 0)
do j = 1, NREPS
call dgemmdriver<<<1, 1>>>(dA, dB, dC, m, N, k)
end do
istat = cudaEventRecord(stop, 0)
istat = cudaDeviceSynchronize()
istat = cudaEventElapsedTime(time, start, stop)
time = time / (NREPS*1.0d3)
C = dC
nerrors = 0
rmaxerr = 0.0d0
rsumerr = 0.0d0
do j = 1, N
do i = 1, N
if (C(i,j) .ne. 0.0d0) then
if (abs(C(i,j)) .gt. rmaxerr) rmaxerr = abs(C(i,j))
nerrors = nerrors + 1
rsumerr = rsumerr + abs(C(i,j))
end if
end do
end do
if (nerrors .eq. 0) then
print *,"Test passed!"
else
print *,nerrors," errors were encountered"
print *,"Max error was ",rmaxerr
print *,"Ave error was ",rsumerr / (N * N)
endif
gflops = 2.0 * N * N * N/time/1d9
write (*,901) m,k,k,N,time*1.0d3,gflops
print *,"### C(1,1)=",C(1,1)
!
901 format(i0,'x',i0,' * ',i0,'x',i0,':\t',f8.3,' ms\t',f8.3,' GFlops/s')
end program
I use -Mcuda=cc35,rdc, Liner-Input-Additional Denpendencies: cudadevrt.lib
But it can not compile, the error messages is:
Deleting intermediate and output files for project ‘CudaDynamicParallel’, configuration ‘Release’
Compiling Project …
dgemmdynamic.cuf
c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1935): error: linkage specification is incompatible with previous “cudaLaunchDevice”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(117): here
c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1948): error: linkage specification is incompatible with previous “cudaGetParameterBuffer”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(116): here
2 errors detected in the compilation of “C:\Users\KANGUA~1\AppData\Local\Temp\pgnvd2a6a1bUyLA3d-B.nv0”.
D:\PGI Visual Fortran 13.9\CudaDynamicParallel\CudaDynamicParallel\dgemmdynamic.cuf(1) : error F0155 : Compiler failed to translate accelerator region (see -Minfo messages): Device compiler exited with error status code
PGF90/x86 Windows 13.9-0: compilation aborted
dgemmdynamic.cuf
c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1935): error: linkage specification is incompatible with previous “cudaLaunchDevice”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(117): here
c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1948): error: linkage specification is incompatible with previous “cudaGetParameterBuffer”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(116): here
2 errors detected in the compilation of “C:\Users\KANGUA~1\AppData\Local\Temp\pgnvd2a6KdcUyPe3duy.nv0”.
D:\PGI Visual Fortran 13.9\CudaDynamicParallel\CudaDynamicParallel\dgemmdynamic.cuf(1) : error F0155 : Compiler failed to translate accelerator region (see -Minfo messages): Device compiler exited with error status code
PGF90/x86 Windows 13.9-0: compilation aborted
CudaDynamicParallel build failed.
How to solve this problem?
Thank you!
Nightwish