Dynamic parallelism in PVF cannot compile

Hello

I want to test the dynamic parallelism in PVF 13.9. The file dgemmdynamic.cuf contains the source code

MODULE dynamic_dgemm

  CONTAINS

  attributes(global) subroutine add16(a, lda, b, ldb, c, ldc, n)
    double precision, device :: a(lda,*), b(ldb,*), c(ldc,*)
    integer, value :: lda, ldb, ldc
    integer, value :: n
    double precision, dimension(4) :: as, bs

    inx = threadidx%x
    iny = threadidx%y
    ibx = (blockidx%x-1) * 256
    ia  = ibx + (iny-1)*16 + inx

    do ij = 1, n, 4
        as(1) = a(ia,ij)
        as(2) = a(ia,ij+1)
        as(3) = a(ia,ij+2)
        as(4) = a(ia,ij+3)

        bs(1) = b(ia,ij)
        bs(2) = b(ia,ij+1)
        bs(3) = b(ia,ij+2)
        bs(4) = b(ia,ij+3)

        as(1) = as(1) + bs(1)
        as(2) = as(2) + bs(2)
        as(3) = as(3) + bs(3)
        as(4) = as(4) + bs(4)

        c(ia,ij)   = as(1)
        c(ia,ij+1) = as(2)
        c(ia,ij+2) = as(3)
        c(ia,ij+3) = as(4)
    end do
    end subroutine

  attributes(global) subroutine dgemm16(a, lda, b, ldb, c, ldc, m, n, k)
    double precision, device :: a(lda,*), b(ldb,*), c(ldc,*)
    integer, value :: lda, ldb, ldc
    integer, value :: m, n, k

    double precision, shared, dimension(16,16) :: bs
    double precision, device  :: cloc(16), ax

    inx = threadidx%x
    iny = threadidx%y
    ibx = (blockidx%x-1) * 256
    iby = (blockidx%y-1) * 16

    ia = ibx + (iny-1)*16 + inx
    ib = inx
    ic = ia

    jb = iby + iny
    jc = iby + 1

    do i = 1, 16
      cloc(i) = 0.0d0
    end do

    do ik = 1, k, 16
      bs(iny,inx) = b(ib,jb)
      call syncthreads()

      do j = 1, 16
        ax = a(ia,ik+j-1)
        do i = 1, 16
          cloc(i) = cloc(i) + ax * bs(i,j)
        end do
      end do

      ib = ib + 16
      call syncthreads()
    end do

    do i = 1, 16
      c(ic,jc+i-1) = cloc(i)
    end do
    call syncthreads()
  end subroutine

  attributes(global) subroutine dgemmdriver(a, b, c, m, n, k)
    integer, value :: m, n, k
    double precision, device :: a(m,*), b(k,*), c(m,*)
    double precision, device, allocatable :: m1(:,:), m2(:,:), m3(:,:), m4(:,:)
    double precision, device, allocatable :: m5(:,:), m6(:,:), m7(:,:), m8(:,:)
    type(dim3), device :: devthreads, devblocks
    i = threadIdx%x
    if (i.eq.1) then
        newn = n / 2
        allocate(m1(1:m/2,1:k/2))
        allocate(m2(1:k/2,1:n/2))
        allocate(m3(1:m/2,1:k/2))
        allocate(m4(1:k/2,1:n/2))
        allocate(m5(1:m/2,1:k/2))
        allocate(m6(1:k/2,1:n/2))
        allocate(m7(1:m/2,1:k/2))
        allocate(m8(1:k/2,1:n/2))
        devblocks = dim3(newn/256, newn/16, 1)
        devthreads = dim3(16, 16, 1)

        call dgemm16<<<devblocks,devthreads>>>(a(1,1), m, b(1,1), k, &
                                m1(1,1), newn, newn, newn, newn)
        call dgemm16<<<devblocks,devthreads>>>(a(1,1+k/2), m, b(1+k/2,1), k, &
                                m2(1,1), newn, newn, newn, newn)
        call dgemm16<<<devblocks,devthreads>>>(a(1,1), m, b(1,1+n/2), k, &
                                m3(1,1), newn, newn, newn, newn)
        call dgemm16<<<devblocks,devthreads>>>(a(1,1+k/2), m, b(1+k/2,1+n/2), k, &
                                m4(1,1), newn, newn, newn, newn)
        call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1), m, b(1,1), k, &
                                m5(1,1), newn, newn, newn, newn)
        call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1+k/2), m, b(1+k/2,1), k, &
                                m6(1,1), newn, newn, newn, newn)
        call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1), m, b(1,1+n/2), k, &
                                m7(1,1), newn, newn, newn, newn)
        call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1+k/2), m, b(1+k/2,1+n/2), k, &
                                m8(1,1), newn, newn, newn, newn)
        istat = cudaDeviceSynchronize()
        call add16<<<1,devthreads>>>(m1, newn, m2, newn, c(1,1), m, newn)
        call add16<<<1,devthreads>>>(m3, newn, m4, newn, c(1,1+n/2), m, newn)
        call add16<<<1,devthreads>>>(m5, newn, m6, newn, c(1+m/2,1), m, newn)
        call add16<<<1,devthreads>>>(m7, newn, m8, newn, c(1+m/2,1+n/2), m, newn)
        istat = cudaDeviceSynchronize()
        !deallocate(m1,m2,m3,m4,m5,m6,m7,m8)
        deallocate(m1)

    end if
    return
    end subroutine

END MODULE

program main
  use dynamic_dgemm
  use cudafor
  integer, parameter :: N = 512
  integer, parameter :: NREPS = 100
  ! matrix data
  real(8), dimension(N,N) :: A, B, C
  real(8), allocatable, device, dimension(:,:) :: dA, dB, dC
  real(8) gold, RR(N), RQ(N)
  type(cudaEvent) :: start, stop
  type(dim3) :: blocks
  type(dim3) :: threads

  istat = cudaEventCreate(start)
  istat = cudaEventCreate(stop)

  j = 1
  bv = -127.0d0
  do i = 1, N/2
    B(i,j) = 2.0d0 ** bv
    bv = bv + 1.0d0
    B(N-i+1,j) = -B(i,j)
  end do

  call random_number(rr)
  A(:,1) = rr

  do j = 2, N
    RQ = B(:,1)
    call random_number(rr)
    nn = N - 1
    do i = 1, N
      ival = int(rr(j) * nn + 1.0d0)
      B(i,j) = rq(ival)
      do k = ival, nn
        rq(k) = rq(k+1)
      end do
      nn = nn - 1
      A(i,j) = A(i,1)
    end do
  end do

  allocate(dA(N,N))
  allocate(dB(N,N))
  allocate(dC(N,N))

  dA = A
  dB = B

  dC = 4.0d0

  m = N
  k = N

  ! timing experiment
  call dgemmdriver<<<1, 1>>>(dA, dB, dC, m, N, k)
  time = 0.d0
  istat = cudaEventRecord(start, 0)
  do j = 1, NREPS
     call dgemmdriver<<<1, 1>>>(dA, dB, dC, m, N, k)
  end do
  istat = cudaEventRecord(stop, 0)
  istat = cudaDeviceSynchronize()
  istat = cudaEventElapsedTime(time, start, stop)
  time = time / (NREPS*1.0d3)

  C = dC

  nerrors = 0
  rmaxerr = 0.0d0
  rsumerr = 0.0d0
  do j = 1, N
    do i = 1, N
      if (C(i,j) .ne. 0.0d0) then
        if (abs(C(i,j)) .gt. rmaxerr) rmaxerr = abs(C(i,j))
        nerrors = nerrors + 1
        rsumerr = rsumerr + abs(C(i,j))
      end if
    end do
  end do

  if (nerrors .eq. 0) then
    print *,"Test passed!"
  else
    print *,nerrors," errors were encountered"
    print *,"Max error was ",rmaxerr
    print *,"Ave error was ",rsumerr / (N * N)
  endif

  gflops = 2.0 * N * N * N/time/1d9
  write (*,901) m,k,k,N,time*1.0d3,gflops
  print *,"### C(1,1)=",C(1,1)
!
901 format(i0,'x',i0,' * ',i0,'x',i0,':\t',f8.3,' ms\t',f8.3,' GFlops/s')
end program

I use -Mcuda=cc35,rdc, Liner-Input-Additional Denpendencies: cudadevrt.lib

But it can not compile, the error messages is:

Deleting intermediate and output files for project ‘CudaDynamicParallel’, configuration ‘Release’
Compiling Project …
dgemmdynamic.cuf
c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1935): error: linkage specification is incompatible with previous “cudaLaunchDevice”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(117): here

c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1948): error: linkage specification is incompatible with previous “cudaGetParameterBuffer”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(116): here

2 errors detected in the compilation of “C:\Users\KANGUA~1\AppData\Local\Temp\pgnvd2a6a1bUyLA3d-B.nv0”.
D:\PGI Visual Fortran 13.9\CudaDynamicParallel\CudaDynamicParallel\dgemmdynamic.cuf(1) : error F0155 : Compiler failed to translate accelerator region (see -Minfo messages): Device compiler exited with error status code
PGF90/x86 Windows 13.9-0: compilation aborted
dgemmdynamic.cuf
c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1935): error: linkage specification is incompatible with previous “cudaLaunchDevice”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(117): here

c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1948): error: linkage specification is incompatible with previous “cudaGetParameterBuffer”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(116): here

2 errors detected in the compilation of “C:\Users\KANGUA~1\AppData\Local\Temp\pgnvd2a6KdcUyPe3duy.nv0”.
D:\PGI Visual Fortran 13.9\CudaDynamicParallel\CudaDynamicParallel\dgemmdynamic.cuf(1) : error F0155 : Compiler failed to translate accelerator region (see -Minfo messages): Device compiler exited with error status code
PGF90/x86 Windows 13.9-0: compilation aborted
CudaDynamicParallel build failed.

How to solve this problem?

Thank you!

Nightwish

Hi Nightwish,

The example works on Linux but fails on Windows. With 14.4 and earlier, I get the same syntax error. With 14.6, it compiles but gets wrong answers. I’ve filed a problem report, TPR#20722, and sent it on to engineering for further investigation.

Thanks!
Mat

TPR 20722 - CUDA Fortran: dgemmdynamic example fails on Windows
is fixed in the current 15.3 release.

thanks,
dave