call to cuLaunchKernel returned error 400: Invalid handle

I am trying to use multiple GPUs on a single node with MPI. Here is the simple code; it fails with "call to cuLaunchKernel returned error 400: Invalid handle".

program test

      use mpi
      use openacc
      implicit none


      !usual mpi variables
      integer                        :: mype, npes, ierr
      integer                        :: status(MPI_STATUS_SIZE)

      integer::ngpus,mydevice,a(10000),i


      call MPI_Init(ierr)
      call MPI_Comm_size(MPI_COMM_WORLD, npes, ierr)
      call MPI_Comm_rank(MPI_COMM_WORLD, mype, ierr)
      ngpus=acc_get_num_devices(acc_device_nvidia)
      mydevice=mod(mype,ngpus)
      call acc_set_device_num(mydevice,acc_device_nvidia)
      print*,mydevice

      !$acc kernels deviceid(mydevice)
      do i=1,10000
        a(i)=a(i)*2
      end do
      !$acc end kernels
      call MPI_finalize(ierr)
end program test

Please help me, I have a contest submission tomorrow night (less than 24 hours).

Hi Masrul,

Looks like a problem with "deviceid", which is an old PGI extension that isn't used very much. I added a problem report (TPR #22703) and sent it off to engineering.

It's not needed here since you're setting the device number via "acc_set_device_num", so the solution is simply to remove "deviceid(mydevice)".

% cat test.f90
program test

       use mpi
       use openacc
       implicit none


       !usual mpi variables
       integer                        :: mype, npes, ierr
       integer                        :: status(MPI_STATUS_SIZE)

       integer::ngpus,mydevice,a(10000),i


       call MPI_Init(ierr)
       call MPI_Comm_size(MPI_COMM_WORLD, npes, ierr)
       call MPI_Comm_rank(MPI_COMM_WORLD, mype, ierr)
       ngpus=acc_get_num_devices(acc_device_nvidia)
       mydevice=mod(mype,ngpus)
       call acc_set_device_num(mydevice,acc_device_nvidia)
       print*,mydevice

       !$acc kernels
       do i=1,10000
         a(i)=a(i)*2
       end do
       !$acc end kernels
       call MPI_finalize(ierr)
 end program test
% mpif90 -acc -Minfo=accel test.f90
test:
     23, Generating implicit copy(a(:))
     24, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         24, !$acc loop gang, vector(128) ! blockidx%x threadidx%x
% mpirun -np 4 a.out
            0
            1
            3
-Mat

TPR #22703 should be fixed in PGI 19.4 and later releases.
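
For reference, OpenACC 2.0 and later also provide a directive-level way to select the device (the "set" directive), so the old PGI "deviceid" clause isn't needed at all. A minimal sketch, assuming the same variables (mype, ngpus, mydevice, a, i) as in the example above; with PGI/NVHPC the directive-level device type is spelled nvidia, corresponding to acc_device_nvidia in the runtime API:

       ! Sketch only: select the device with the OpenACC "set" directive
       ! instead of the old PGI "deviceid" clause. Assumes mype and ngpus
       ! were obtained from MPI and the OpenACC runtime as in the example above.
       mydevice = mod(mype, ngpus)
       !$acc set device_type(nvidia) device_num(mydevice)

       !$acc kernels
       do i = 1, 10000
         a(i) = a(i)*2
       end do
       !$acc end kernels

Either approach (the runtime call acc_set_device_num or the set directive) binds each MPI rank to one GPU; just don't combine them with the deprecated deviceid clause.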