No CUDA device code available

I am trying to compile and run a simple code that a colleague wrote for me as an example. However, the code compiles, but doesn’t run:

stefano@payne:~/home/testgpu/d3b$ make
nvfortran -O3 -mp=gpu -gpu=cc89 -Minfo=all -cuda -o d3btest d3btest.o ran.o
stefano@payne:~/home/testgpu/d3b$ d3btest
No CUDA device code available
stefano@payne:~/home/testgpu/d3b$

What is the problem?
Thanks,
Stefano

Hi Stefano,

Can you show the output from the compilation of “d3btest.o” and “ran.o” using the same compile flags you’re using to link?

The compiler feedback messages will show us if there’s any GPU kernels getting generated. In not, please share the source so I can see how it’s coded.

-Mat

Hi Mat,

sure! Here it is:

stefano@payne:~/home/testgpu/d3b$ nvfortran -O3 -mp=gpu -gpu=cc89 -Minfo=all -cuda -c ran.f90
randn:
71, Loop not vectorized/parallelized: contains call
gaussian:
88, Generated vector simd code for the loop
stefano@payne:~/home/testgpu/d3b$ nvfortran -O3 -mp=gpu -gpu=cc89 -Minfo=all -cuda -c d3btest.f90
addtod3b:
72, Loop not vectorized/parallelized: too deeply nested
73, Loop not vectorized/parallelized: too deeply nested
76, Loop not vectorized/parallelized: loop count too small
77, Outer loop unrolled 4 times (completely unrolled)
78, Loop not fused: dependence chain to sibling loop
Generated vector simd code for the loop
FMA (fused multiply-add) instruction(s) generated
d3btest:
131, Loop unrolled 4 times (completely unrolled)
Loop not fused: function call before adjacent loop
Loop not vectorized: unsupported scatter
FMA (fused multiply-add) instruction(s) generated
134, FMA (fused multiply-add) instruction(s) generated
168, Memory zero idiom, array assignment replaced by call to pgf90_mzeroz16
174, maxval reduction inlined
Loop not fused: function call before adjacent loop
Loop not vectorized/parallelized: too deeply nested
Loop not vectorized: may not be beneficial for target
stefano@payne:~/home/testgpu/d3b$ nvfortran -O3 -mp=gpu -gpu=cc89 -Minfo=all -cuda -o d3btest ran.o d3btest.o
stefano@payne:~/home/testgpu/d3b$

Stefano

From the output, there’s no GPU or OpenMP code at all. Can you share the source? Does it use OpenMP?

Here the whole code (main one, let me know if you also need ran.f90):

module d3bmod
   use, intrinsic :: iso_fortran_env
   implicit none
   integer, private, parameter :: i4=int32
   integer, private, parameter :: i8=int64
   integer, private, parameter :: r8=real64
   complex, private, parameter :: czero=(0.0_r8,0.0_r8)
!
! Put npart in constant memory since it never changes
!
   integer(kind=i4), constant, private :: npart

contains

!
! I prefer to keep variables private and use setter subroutines.
!
   subroutine setnpart(npartin)
   integer(kind=i4), intent(in) :: npartin
   npart=npartin
   end subroutine setnpart

!
! zero d3b on gpu
!
   attributes (global) subroutine zerod3b_gpu(d3b)
   complex(kind=r8), intent(inout) :: d3b(4,4,4,*)
   integer(kind=i4) :: i,j,k,is,js,ks,ijk
   i=blockIdx%x
   j=blockIdx%y
   k=blockIdx%z
   is=threadIdx%x
   js=threadIdx%y
   ks=threadIdx%z
   if ((i.lt.j).and.(j.lt.k)) then
      ijk=i+((j-2)*(j-1)+(k-3)*(k-2)*(k-1)/3)/2
      d3b(is,js,ks,ijk)=czero
   endif
   end subroutine zerod3b_gpu

!
! add to d3b on gpu
!
   attributes(global) subroutine addtod3b_gpu(sxz,fij,d3b)
   complex(kind=r8), intent(in) :: sxz(4,npart,*),fij(*)
   complex(kind=r8), intent(inout) :: d3b(4,4,4,*)
   integer(kind=i4) :: i,j,k,is,js,ks,ijk
   i=blockIdx%x
   j=blockIdx%y
   k=blockIdx%z
   is=threadIdx%x
   js=threadIdx%y
   ks=threadIdx%z
   if ((i.lt.j).and.(j.lt.k)) then
      ijk=i+((j-2)*(j-1)+(k-3)*(k-2)*(k-1)/3)/2
      d3b(is,js,ks,ijk)=d3b(is,js,ks,ijk)+fij(1)*( &
         sxz(is,i,i)*(sxz(js,j,j)*sxz(ks,k,k)-sxz(js,j,k)*sxz(ks,k,j)) &
        +sxz(is,i,j)*(sxz(js,j,k)*sxz(ks,k,i)-sxz(js,j,i)*sxz(ks,k,k)) &
        +sxz(is,i,k)*(sxz(js,j,i)*sxz(ks,k,j)-sxz(js,j,j)*sxz(ks,k,i)))
   endif
   end subroutine addtod3b_gpu

!
! add to d3b on cpu
!
   subroutine addtod3b(sxz,fij,npart,d3b)
   integer(kind=i4), intent(in) :: npart
   complex(kind=r8), intent(in) :: sxz(:,:,:),fij
   complex(kind=r8), intent(inout) :: d3b(:,:,:,:)
   integer(kind=i4) :: i,j,k,js,ks,ijk
   ijk=0
   do k=3,npart
      do j=2,k-1
         do i=1,j-1
            ijk=ijk+1
            do ks=1,4
               do js=1,4
                  d3b(:,js,ks,ijk)=d3b(:,js,ks,ijk)+fij*( &
                    sxz(:,i,i)*(sxz(js,j,j)*sxz(ks,k,k) &
                   -sxz(js,j,k)*sxz(ks,k,j))+sxz(:,i,j) &
                   *(sxz(js,j,k)*sxz(ks,k,i)-sxz(js,j,i)*sxz(ks,k,k)) &
                      +sxz(:,i,k)*(sxz(js,j,i)*sxz(ks,k,j) &
                      -sxz(js,j,j)*sxz(ks,k,i)))
               enddo
            enddo
         enddo
      enddo
   enddo
   end subroutine addtod3b

end module d3bmod

program d3btest
   use cudafor
   use, intrinsic :: iso_fortran_env
   use random
   use random2
   use d3bmod
   implicit none
   integer, parameter :: i4=int32
   integer, parameter :: i8=int64
   integer, parameter :: r8=real64
   complex(kind=r8), parameter :: czero=(0.0_r8,0.0_r8)
   complex(kind=r8), parameter :: cone=(1.0_r8,0.0_r8)
   complex(kind=r8), parameter :: ci=(0.0_r8,1.0_r8)
! pin these for better transfer speed
   complex(kind=r8), pinned, allocatable :: sxz(:,:,:),d3b(:,:,:,:),fij(:)
   complex(kind=r8), allocatable :: d3b2(:,:,:,:)
   complex(kind=r8), device, allocatable :: sxz_d(:,:,:)
   complex(kind=r8), device, allocatable :: d3b_d(:,:,:,:)
   complex(kind=r8), device, allocatable :: fij_d(:)
   real(kind=r8) :: rnc(2)
   real :: time0,time1,time2
   integer(kind=i8) :: iseed
   integer(kind=i4) :: npart,npair,ntrip,i,j,k,ijk,js,ks
   integer :: istat
   type(dim3) :: blocks, threads
   iseed=1717171717
   npart=40
   npair=(npart*(npart-1))/2
   ntrip=(npart*(npart-1)*(npart-2))/6
   allocate(sxz(4,npart,npart))
   allocate(fij(1))
   allocate(d3b(4,4,4,ntrip))
   allocate(d3b2(4,4,4,ntrip))
   allocate(sxz_d(4,npart,npart))
   allocate(fij_d(1))
   allocate(d3b_d(4,4,4,ntrip))
   call setrn(iseed)
   call cpu_time(time0)
   sxz=reshape(randn(4*npart**2),[4,npart,npart]) &
      +ci*reshape(randn(4*npart**2),[4,npart,npart])
   rnc=randn(2)
   fij(1)=rnc(1)+ci*rnc(2)
   call cpu_time(time1)
   write(*,*) 'fij',fij,'cpu time for random junk',time1-time0
!
! set up with 64 threads, I think 2 warps of 32 threads will then have the
! same i<j<k if test result in the gpu code, limiting thread divergence.
!
   threads=dim3(4,4,4)
   blocks=dim3(npart,npart,npart)
!
! Transfer data to gpu
!
   call setnpart(npart)
   call cpu_time(time0)
   sxz_d=sxz
   fij_d=fij
   call cpu_time(time1)
   write (*,*) 'to device time',time1-time0
   istat = cudaDeviceSynchronize()
!
! zero d3b then calculate addition from sxz and fij
!
   call cpu_time(time0)
   call zerod3b_gpu<<<blocks,threads>>>(d3b_d)
   call addtod3b_gpu<<<blocks,threads>>>(sxz_d,fij_d,d3b_d)
   istat = cudaDeviceSynchronize()
   call cpu_time(time1)
   write (*,*) 'kernel time',time1-time0
!
! Transfer result back
!
   d3b=d3b_d
   call cpu_time(time2)
   write (*,*) 'from device time',time2-time1
   d3b2=czero
   call cpu_time(time0)
   call addtod3b(sxz,fij(1),npart,d3b2)
   call cpu_time(time1)
   write (*,*) 'cpu d3b time',time1-time0
   write (*,*) d3b(1,2,3,37),d3b2(1,2,3,37)
   write (*,*) 'max difference',maxval(abs(d3b-d3b2))

end program d3btest

Stefano

Thanks, I didn’t realize it’s a CUDA Fortran code which explains the lack of feedback messages. I presumed that since you were using “-mp=gpu”, you were using OpenMP.

I was able to compile (after removing the missing ran.f90 stuff) and it runs fine on my system.

What GPU do you have, what’s it’s compute capability and what CUDA Driver version? (if you don’t know, you can run “nvaccelinfo” to get the details).

My best guess is that it’s not a cc89 device hence when the binary gets loaded, there’s not device binary for this target.

Good morning,
I was able to solve the problem, I just needed to change the flags that I used to compile the code to “-O3 -gpu=ccnative,ptxinfo -cuda”.
Thanks again!