The CUDA function "cudaEventRecord()" put a error

Could you tell me why the CUDA function “cudaEventRecord()” a error message “invalid resource handle” ?

In order to get timing information on echo GPU, I use the CUDA function “cudaEventRecord()”.
However I got a error message "FILE: test.cuf LINE: 72 Error: invalid resource handle ".

When I ran on one GPU, there was no problem.
However when I ran more than two GPUs, the CUDA function “cudaEventRecord()” put the error message.

I suppose it is not possible for me to pass variables with the attribute dimension to the CUDA function “cudaEventRecord()”.

Here is the test program.


#define CHECK(err) _CHECK(err,FILE,LINE)
module EventTest
use cudafor
implicit none

contains
subroutine _CHECK(err, filename, linenum)
integer(kind=4),intent(in) :: err
character(kind=1,len=*), optional :: filename
integer(kind=4), optional :: linenum

if( err /= cudaSuccess ) then
if(present(filename) .and. present(linenum)) then
write(,‘(“FILE:”,1x,a,1x,“LINE:”,1x,i10,1x,“Error:”,1x,a)’),filename,linenum,cudaGetErrorString(err)
else
write(
,‘(“Error:”,1x,a)’)cudaGetErrorString(err)
end if
stop
end if

end subroutine _CHECK

attributes(global) subroutine gpu_kernel(n,d_a)
integer,intent(in),value :: n
integer,dimension(n),intent(out),device :: d_a
integer :: idx

idx = threadIdx%x + (blockIdx%x - 1) * blockDim%x
if( ( idx < 1 ) .or. ( idx > n ) ) return
d_a(idx) = idx

end subroutine gpu_kernel
end module EventTest

program EventTestMain
use EventTest
integer,parameter :: n = 10000000
integer,parameter :: tbx = 128
integer :: devnum
integer :: devId
integer,dimension(:,:),pointer,device :: d_a
type(dim3) :: grid,tblock
type(cudaEvent),dimension(:),pointer :: startEvent, stopEvent


tblock = dim3(tbx,1,1)
grid = dim3( (n + tblock%x - 1) / tblock%x ,1,1)

! Number of GPUs
call CHECK ( cudaGetDeviceCount(devnum) )

allocate( d_a(n,0:devnum -1) )
allocate( startEvent(0:devnum -1) )
allocate( stopEvent(0:devnum -1) )

! Create Events
do devId = 0,devnum - 1
call CHECK( cudaEventCreate(startEvent(devId)) )
call CHECK( cudaEventCreate(stopEvent(devId)) )
end do


do devId = 0,devnum - 1
! Start Event
call CHECK( cudaDeviceSynchronize() )
call CHECK( cudaEventRecord(startEvent(devId),0) )

call CHECK( cudaSetDevice(devId) )
call gpu_kernel<<<grid,tblock>>>(n,d_a(:,devId))

! Stop Event
call CHECK( cudaEventRecord(stopEvent(devId),0) )
call CHECK( cudaDeviceSynchronize() )
end do

! Timing information
do devId = 0,devnum - 1
call CHECK( cudaEventElapsedTime(time,startEvent(devId),stopEvent(devId)) )
write(*,‘(“Device Id:”,3x,i2,3x,“Elapsed Time:”,3x,f10.8)’)devId,time
end do

! Destroy Events
do devId = 0,devnum - 1
call CHECK( cudaEventDestroy(startEvent(devId)) )
call CHECK( cudaEventDestroy(stopEvent(devId)) )

end do

deallocate(startEvent)
deallocate(stopEvent)
deallocate(d_a)

end program EventTestMain

Could you tell me why the CUDA function “cudaEventRecord()” a error message “invalid resource handle” ?

The problem here is that you’re not setting the device number before creating the events, hence both are being created on the default device. Hence when you access the event on the wrong device you get the error.

You’re also going to have issues with the “d_a” array. When you allocate the array, all the data will be created on the default device. The second dimension is not going to implicitly create part of the array on one device and part on the other. Instead, you need to create an array of a type on the host where the data member is a device array.

#define CHECK(err) _CHECK(err,__FILE__,__LINE__)
 module EventTest
 use cudafor
 implicit none

 contains
 subroutine _CHECK(err, filename, linenum)
 integer(kind=4),intent(in) :: err
 character(kind=1,len=*), optional :: filename
 integer(kind=4), optional :: linenum

 if( err /= cudaSuccess ) then
 if(present(filename) .and. present(linenum)) then
 write(*,'("FILE:",1x,a,1x,"LINE:",1x,i10,1x,"Error:",1x,a)'),filename,linenum,cudaGetErrorString(err)
 else
 write(*,'("Error:",1x,a)')cudaGetErrorString(err)
 end if
 stop
 end if

 end subroutine _CHECK

 attributes(global) subroutine gpu_kernel(n,d_a)
 integer,intent(in),value :: n
 integer,dimension(n),intent(out),device :: d_a
 integer :: idx

 idx = threadIdx%x + (blockIdx%x - 1) * blockDim%x
 if( ( idx < 1 ) .or. ( idx > n ) ) return
 d_a(idx) = idx

 end subroutine gpu_kernel
 end module EventTest

 program EventTestMain
 use EventTest
 integer,parameter :: n = 10000000
 integer,parameter :: tbx = 128
 integer :: devnum
 integer :: devId
 type devarray
   integer,dimension(:),allocatable :: h_a
   integer,dimension(:),allocatable,device :: d_a
 end type devarray
 type(devarray), allocatable, dimension(:) :: a
 type(dim3) :: grid,tblock
 type(cudaEvent),dimension(:),pointer :: startEvent, stopEvent


 tblock = dim3(tbx,1,1)
 grid = dim3( (n + tblock%x - 1) / tblock%x ,1,1)

 ! Number of GPUs
 call CHECK ( cudaGetDeviceCount(devnum) )

 allocate( a(0:devnum -1) )
 allocate( startEvent(0:devnum -1) )
 allocate( stopEvent(0:devnum -1) )

 ! Create Events
 do devId = 0,devnum - 1
 call CHECK( cudaSetDevice(devId) )
 allocate(a(devId)%d_a(n))
 allocate(a(devId)%h_a(n))
 call CHECK( cudaEventCreate(startEvent(devId)) )
 call CHECK( cudaEventCreate(stopEvent(devId)) )
 end do


 do devId = 0,devnum - 1
 ! Start Event
 call CHECK( cudaSetDevice(devId) )
 call CHECK( cudaDeviceSynchronize() )
 call CHECK( cudaEventRecord(startEvent(devId),0) )
 call gpu_kernel<<<grid,tblock>>>(n,a(devId)%d_a(:))
 call CHECK( cudaDeviceSynchronize() )
 a(devId)%h_a =  a(devId)%d_a
 print *, a(devId)%h_a(1:10)

 ! Stop Event
 call CHECK( cudaEventRecord(stopEvent(devId),0) )
 end do

 ! Timing information
 do devId = 0,devnum - 1
 call CHECK( cudaSetDevice(devId) )
 call CHECK( cudaEventElapsedTime(time,startEvent(devId),stopEvent(devId)) )
 write(*,'("Device Id:",3x,i2,3x,"Elapsed Time:",3x,f15.10)')devId,time
 end do

 ! Destroy Events
 do devId = 0,devnum - 1
 call CHECK( cudaSetDevice(devId) )
 call CHECK( cudaEventDestroy(startEvent(devId)) )
 call CHECK( cudaEventDestroy(stopEvent(devId)) )

 end do

 deallocate(startEvent)
 deallocate(stopEvent)
 do devId = 0,devnum - 1
   call CHECK( cudaSetDevice(devId) )
   deallocate(a(devId)%d_a)
   deallocate(a(devId)%h_a)
 enddo
 deallocate(a)

        end program EventTestMain



% pgfortran -Mcuda=cc35 -fast test.CUF
% a.out
            1            2            3            4            5            6
            7            8            9           10
            1            2            3            4            5            6
            7            8            9           10
            1            2            3            4            5            6
            7            8            9           10
            1            2            3            4            5            6
            7            8            9           10
Device Id:    0   Elapsed Time:     17.8769283295
Device Id:    1   Elapsed Time:     16.3900794983
Device Id:    2   Elapsed Time:     17.1248321533
Device Id:    3   Elapsed Time:     17.7438392639

Hope this helps,
Mat

Dear Mat-san,

Thank you for your advice. It make sense.

Thing about this a bit more, you might want to do something like the following so the kernels are running on each device at the same time. Currently, they are running sequentially.


 do devId = 0,devnum - 1 
  ! Start Event 
  call CHECK( cudaSetDevice(devId) ) 
  call CHECK( cudaEventRecord(startEvent(devId),0) ) 
  call gpu_kernel<<<grid,tblock>>>(n,a(devId)%d_a(:)) 
 end do 

 do devId = 0,devnum - 1 
  ! Start Event 
  call CHECK( cudaSetDevice(devId) ) 
  call CHECK( cudaDeviceSynchronize() ) 
  a(devId)%h_a =  a(devId)%d_a 
  print *, a(devId)%h_a(1:10) 

  ! Stop Event 
  call CHECK( cudaEventRecord(stopEvent(devId),0) ) 
  end do