Dynamic parallelism in PVF cannot compile

yuanrui124 · July 27, 2014, 6:19pm

Hello

I want to test dynamic parallelism in PVF 13.9. The file dgemmdynamic.cuf contains the following source code:

!> Device kernels for a blocked double-precision matrix multiply that is
!> driven from the GPU itself: dgemmdriver runs as a kernel and launches the
!> dgemm16 and add16 child kernels (CUDA dynamic parallelism).
MODULE dynamic_dgemm

  CONTAINS

  !> Element-wise sum c = a + b over n columns.
  !>
  !> Each thread owns one row, ia, derived from its block and thread index,
  !> and walks the columns four at a time.
  !>   a, b, c        device matrices with leading dimensions lda, ldb, ldc
  !>   n              number of columns to process
  !> There is no bounds check on ia or on the column index; the loop reads
  !> columns ij..ij+3, so n is presumably a multiple of 4 -- TODO confirm.
  attributes(global) subroutine add16(a, lda, b, ldb, c, ldc, n)
    double precision, device :: a(lda,*), b(ldb,*), c(ldc,*)
    integer, value :: lda, ldb, ldc
    integer, value :: n
    double precision, dimension(4) :: as, bs

    ! Row handled by this thread: 256 rows per block in x, laid out 16 x 16.
    inx = threadidx%x
    iny = threadidx%y
    ibx = (blockidx%x-1) * 256
    ia  = ibx + (iny-1)*16 + inx

    do ij = 1, n, 4
        ! Load four consecutive columns of each operand before any store.
        as(1) = a(ia,ij)
        as(2) = a(ia,ij+1)
        as(3) = a(ia,ij+2)
        as(4) = a(ia,ij+3)

        bs(1) = b(ia,ij)
        bs(2) = b(ia,ij+1)
        bs(3) = b(ia,ij+2)
        bs(4) = b(ia,ij+3)

        as(1) = as(1) + bs(1)
        as(2) = as(2) + bs(2)
        as(3) = as(3) + bs(3)
        as(4) = as(4) + bs(4)

        c(ia,ij)   = as(1)
        c(ia,ij+1) = as(2)
        c(ia,ij+2) = as(3)
        c(ia,ij+3) = as(4)
    end do
  end subroutine add16

  !> Matrix product c = a * b, accumulated over k in steps of 16.
  !>
  !> Each thread computes 16 consecutive columns of one row of c (row ic,
  !> columns jc..jc+15) and keeps the partial sums in cloc.  For every step
  !> the thread block stages a 16 x 16 tile of b in the shared array bs.
  !>   a, b, c        device matrices with leading dimensions lda, ldb, ldc
  !>   k              inner (summation) dimension
  !>   m, n           accepted but not referenced in the body
  !> No bounds checks are made on any index.
  attributes(global) subroutine dgemm16(a, lda, b, ldb, c, ldc, m, n, k)
    double precision, device :: a(lda,*), b(ldb,*), c(ldc,*)
    integer, value :: lda, ldb, ldc
    integer, value :: m, n, k

    double precision, shared, dimension(16,16) :: bs
    double precision, device  :: cloc(16), ax

    inx = threadidx%x
    iny = threadidx%y
    ibx = (blockidx%x-1) * 256
    iby = (blockidx%y-1) * 16

    ! ia / ic: row of a and c owned by this thread.
    ia = ibx + (iny-1)*16 + inx
    ib = inx
    ic = ia

    ! jb: column of b this thread loads; jc: first column of c it writes.
    jb = iby + iny
    jc = iby + 1

    do i = 1, 16
      cloc(i) = 0.0d0
    end do

    do ik = 1, k, 16
      ! Stage one element of the b tile (stored transposed), then wait until
      ! the whole tile has been written before anyone reads it.
      bs(iny,inx) = b(ib,jb)
      call syncthreads()

      do j = 1, 16
        ax = a(ia,ik+j-1)
        do i = 1, 16
          cloc(i) = cloc(i) + ax * bs(i,j)
        end do
      end do

      ib = ib + 16
      ! All threads must be done with bs before the next step overwrites it.
      call syncthreads()
    end do

    do i = 1, 16
      c(ic,jc+i-1) = cloc(i)
    end do
    call syncthreads()
  end subroutine dgemm16

  !> Device-side driver: computes c = a * b by splitting a, b and c into
  !> 2 x 2 blocks, launching eight dgemm16 child kernels for the partial
  !> products and four add16 child kernels to combine them pairwise into the
  !> four quadrants of c.
  !>
  !> Only the thread with threadIdx%x == 1 does any work.  The child launches
  !> pass newn = n/2 for every problem dimension and for the leading
  !> dimension of the work arrays, so the routine as written presumes
  !> m, n and k are equal -- TODO confirm against callers.
  !>   a(m,*), b(k,*), c(m,*)   device matrices
  !>   m, n, k                  matrix dimensions
  attributes(global) subroutine dgemmdriver(a, b, c, m, n, k)
    integer, value :: m, n, k
    double precision, device :: a(m,*), b(k,*), c(m,*)
    double precision, device, allocatable :: m1(:,:), m2(:,:), m3(:,:), m4(:,:)
    double precision, device, allocatable :: m5(:,:), m6(:,:), m7(:,:), m8(:,:)
    type(dim3), device :: devthreads, devblocks
    i = threadIdx%x
    if (i.eq.1) then
        newn = n / 2
        ! Work arrays for the eight partial products.
        allocate(m1(1:m/2,1:k/2))
        allocate(m2(1:k/2,1:n/2))
        allocate(m3(1:m/2,1:k/2))
        allocate(m4(1:k/2,1:n/2))
        allocate(m5(1:m/2,1:k/2))
        allocate(m6(1:k/2,1:n/2))
        allocate(m7(1:m/2,1:k/2))
        allocate(m8(1:k/2,1:n/2))
        devblocks = dim3(newn/256, newn/16, 1)
        devthreads = dim3(16, 16, 1)

        ! Upper-left quadrant of c:  m1 = A11*B11, m2 = A12*B21
        call dgemm16<<<devblocks,devthreads>>>(a(1,1), m, b(1,1), k, &
                                m1(1,1), newn, newn, newn, newn)
        call dgemm16<<<devblocks,devthreads>>>(a(1,1+k/2), m, b(1+k/2,1), k, &
                                m2(1,1), newn, newn, newn, newn)
        ! Upper-right quadrant:      m3 = A11*B12, m4 = A12*B22
        call dgemm16<<<devblocks,devthreads>>>(a(1,1), m, b(1,1+n/2), k, &
                                m3(1,1), newn, newn, newn, newn)
        call dgemm16<<<devblocks,devthreads>>>(a(1,1+k/2), m, b(1+k/2,1+n/2), k, &
                                m4(1,1), newn, newn, newn, newn)
        ! Lower-left quadrant:       m5 = A21*B11, m6 = A22*B21
        call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1), m, b(1,1), k, &
                                m5(1,1), newn, newn, newn, newn)
        call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1+k/2), m, b(1+k/2,1), k, &
                                m6(1,1), newn, newn, newn, newn)
        ! Lower-right quadrant:      m7 = A21*B12, m8 = A22*B22
        call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1), m, b(1,1+n/2), k, &
                                m7(1,1), newn, newn, newn, newn)
        call dgemm16<<<devblocks,devthreads>>>(a(1+m/2,1+k/2), m, b(1+k/2,1+n/2), k, &
                                m8(1,1), newn, newn, newn, newn)
        ! The partial products must be complete before they are summed.
        istat = cudaDeviceSynchronize()
        call add16<<<1,devthreads>>>(m1, newn, m2, newn, c(1,1), m, newn)
        call add16<<<1,devthreads>>>(m3, newn, m4, newn, c(1,1+n/2), m, newn)
        call add16<<<1,devthreads>>>(m5, newn, m6, newn, c(1+m/2,1), m, newn)
        call add16<<<1,devthreads>>>(m7, newn, m8, newn, c(1+m/2,1+n/2), m, newn)
        ! Wait for the sums so the work arrays are no longer in use.
        istat = cudaDeviceSynchronize()

        ! Release every work array.  Previously only m1 was freed, so each
        ! launch of this driver leaked the other seven arrays.
        deallocate(m1)
        deallocate(m2)
        deallocate(m3)
        deallocate(m4)
        deallocate(m5)
        deallocate(m6)
        deallocate(m7)
        deallocate(m8)

    end if
    return
  end subroutine dgemmdriver

END MODULE

! Host test program for the device-side dgemmdriver kernel: builds N x N
! inputs A and B, runs dgemmdriver on the GPU NREPS times between two CUDA
! events, copies the result back, counts nonzero entries of C as errors and
! prints timing and a GFlops figure.
!
! There is no "implicit none": istat, i, j, k, m, nn, ival, nerrors are
! implicitly integer and bv, time, gflops, rmaxerr, rsumerr are implicitly
! default real.
program main
  use dynamic_dgemm
  use cudafor
  integer, parameter :: N = 512
  integer, parameter :: NREPS = 100
  ! matrix data: host copies A, B, C and their device counterparts dA, dB, dC
  real(8), dimension(N,N) :: A, B, C
  real(8), allocatable, device, dimension(:,:) :: dA, dB, dC
  ! RR holds random numbers, RQ a scratch copy of B(:,1).
  ! NOTE(review): gold is declared but never referenced below.
  real(8) gold, RR(N), RQ(N)
  type(cudaEvent) :: start, stop
  ! NOTE(review): blocks and threads are declared but never referenced below.
  type(dim3) :: blocks
  type(dim3) :: threads

  istat = cudaEventCreate(start)
  istat = cudaEventCreate(stop)

  ! First column of B: powers of two starting at 2**(-127) in the top half,
  ! mirrored with opposite sign in the bottom half (B(N-i+1,1) = -B(i,1)).
  j = 1
  bv = -127.0d0
  do i = 1, N/2
    B(i,j) = 2.0d0 ** bv
    bv = bv + 1.0d0
    B(N-i+1,j) = -B(i,j)
  end do

  ! First column of A is random; the loop below copies it into every other
  ! column of A.
  call random_number(rr)
  A(:,1) = rr

  ! Remaining columns of B: each entry is drawn from rq (a fresh copy of
  ! B(:,1)), and the drawn entry is then removed from rq by shifting the tail
  ! down one place.
  do j = 2, N
    RQ = B(:,1)
    call random_number(rr)
    nn = N - 1
    do i = 1, N
      ! NOTE(review): the index uses rr(j), which is the same value for every
      ! i within this column -- confirm whether rr(i) was intended.
      ival = int(rr(j) * nn + 1.0d0)
      B(i,j) = rq(ival)
      do k = ival, nn
        rq(k) = rq(k+1)
      end do
      nn = nn - 1
      A(i,j) = A(i,1)
    end do
  end do

  allocate(dA(N,N))
  allocate(dB(N,N))
  allocate(dC(N,N))

  ! Host-to-device copies of the inputs.
  dA = A
  dB = B

  ! Pre-fill the result with a nonzero value; presumably so that entries the
  ! kernels never write are counted as errors by the check below.
  dC = 4.0d0

  m = N
  k = N

  ! timing experiment
  ! One launch outside the timed region (presumably a warm-up), then NREPS
  ! launches bracketed by the start/stop events.
  call dgemmdriver<<<1, 1>>>(dA, dB, dC, m, N, k)
  time = 0.d0
  istat = cudaEventRecord(start, 0)
  do j = 1, NREPS
     call dgemmdriver<<<1, 1>>>(dA, dB, dC, m, N, k)
  end do
  istat = cudaEventRecord(stop, 0)
  istat = cudaDeviceSynchronize()
  istat = cudaEventElapsedTime(time, start, stop)
  ! Average per launch, scaled by 1/1000; it is multiplied back by 1.0d3 and
  ! labelled "ms" when printed.
  time = time / (NREPS*1.0d3)

  ! Device-to-host copy of the result.
  C = dC

  ! Every nonzero element of C is counted as an error; track the largest
  ! magnitude and the sum of magnitudes.
  nerrors = 0
  rmaxerr = 0.0d0
  rsumerr = 0.0d0
  do j = 1, N
    do i = 1, N
      if (C(i,j) .ne. 0.0d0) then
        if (abs(C(i,j)) .gt. rmaxerr) rmaxerr = abs(C(i,j))
        nerrors = nerrors + 1
        rsumerr = rsumerr + abs(C(i,j))
      end if
    end do
  end do

  if (nerrors .eq. 0) then
    print *,"Test passed!"
  else
    print *,nerrors," errors were encountered"
    print *,"Max error was ",rmaxerr
    print *,"Ave error was ",rsumerr / (N * N)
  endif

  ! 2*N**3 floating-point operations per product, divided by the per-launch
  ! time and 1e9.
  gflops = 2.0 * N * N * N/time/1d9
  write (*,901) m,k,k,N,time*1.0d3,gflops
  print *,"### C(1,1)=",C(1,1)
!
! Output layout: "<m>x<k> * <k>x<N>:", then the time in ms and the GFlops rate.
901 format(i0,'x',i0,' * ',i0,'x',i0,':\t',f8.3,' ms\t',f8.3,' GFlops/s')
end program

I compile with -Mcuda=cc35,rdc and have set Linker > Input > Additional Dependencies to cudadevrt.lib.

But it does not compile; the error messages are:

Deleting intermediate and output files for project ‘CudaDynamicParallel’, configuration ‘Release’
Compiling Project …
dgemmdynamic.cuf
c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1935): error: linkage specification is incompatible with previous “cudaLaunchDevice”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(117): here

c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1948): error: linkage specification is incompatible with previous “cudaGetParameterBuffer”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(116): here

2 errors detected in the compilation of “C:\Users\KANGUA~1\AppData\Local\Temp\pgnvd2a6a1bUyLA3d-B.nv0”.
D:\PGI Visual Fortran 13.9\CudaDynamicParallel\CudaDynamicParallel\dgemmdynamic.cuf(1) : error F0155 : Compiler failed to translate accelerator region (see -Minfo messages): Device compiler exited with error status code
PGF90/x86 Windows 13.9-0: compilation aborted
dgemmdynamic.cuf
c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1935): error: linkage specification is incompatible with previous “cudaLaunchDevice”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(117): here

c:\program files (x86)\pgi\win32\13.9/include_acc\pgi_cuda_runtime.h(1948): error: linkage specification is incompatible with previous “cudaGetParameterBuffer”
c:\program files (x86)\pgi\win32\2013\cuda\5.0\include\cuda_device_runtime_api.h(116): here

2 errors detected in the compilation of “C:\Users\KANGUA~1\AppData\Local\Temp\pgnvd2a6KdcUyPe3duy.nv0”.
D:\PGI Visual Fortran 13.9\CudaDynamicParallel\CudaDynamicParallel\dgemmdynamic.cuf(1) : error F0155 : Compiler failed to translate accelerator region (see -Minfo messages): Device compiler exited with error status code
PGF90/x86 Windows 13.9-0: compilation aborted
CudaDynamicParallel build failed.

How to solve this problem?

Thank you!

Nightwish

MatColgrove · July 31, 2014, 11:24pm

Hi Nightwish,

The example works on Linux but fails on Windows. With 14.4 and earlier, I get the same syntax error. With 14.6, it compiles but gets wrong answers. I’ve filed a problem report, TPR#20722, and sent it on to engineering for further investigation.

Thanks!
Mat

tull · March 12, 2015, 10:59pm

TPR 20722 - CUDA Fortran: dgemmdynamic example fails on Windows
is fixed in the current 15.3 release.

thanks,
dave

Topic		Replies	Views
About dynamic parallelism of CUDA Fortran Legacy PGI Compilers	7	9202	December 2, 2016
CUDA Dynamic Parallelism undefined reference to __fatbinwrap Legacy PGI Compilers	5	11967	April 28, 2015
How compile the kernel subroutine containing dgetrf Legacy PGI Compilers	4	5718	December 4, 2013
Dynamic Parallelism : code: 30, reason: unknown error from cudaMalloc and cudaMemcpy CUDA Programming and Performance	1	1750	July 14, 2015
MPICH linking failing Legacy PGI Compilers	12	12593	October 25, 2013
Passing dynamically allocated memory in kernel to sub kernel via dynamic parallelism CUDA Programming and Performance	6	686	May 3, 2019
First try compile errors Legacy PGI Compilers	15	14344	August 29, 2013
runtime error with cuda 9.1 CUDA Programming and Performance	18	4131	April 24, 2018
Parallel Reduction CUDA Programming and Performance	10	3653	June 26, 2011
Performance drops with dynamic parallelism CUDA Programming and Performance cuda , dynamic-control	12	489	June 3, 2024

Dynamic parallelism in PVF cannot compile

Related topics