Hi. I stumbled upon a problem where my program would run fine, but on hitting ‘stop’ it will segfault. Is CUDA_ERROR_INVALID_CONTEXT indicative of something? I haven’t yet reduced the problem further, could it be in the OpenACC-CUDA Fortran interop?
PGI version
same problem persists in 15.7 and 15.10
compiler command (for all files)
pgf90 -g -Mcuda=cc3x -ta=nvidia,cc3x,keepgpu,keepbin,time -Minfo=accel,inline,ipa -Mneginfo -Minform=inform -I/usr/local/include -r8 -DGPU -c example.f90 -o example.o
output cuda-memcheck
========= CUDA-MEMCHECK
========= Program hit CUDA_ERROR_INVALID_CONTEXT (error 201) due to "invalid device context" on CUDA API call to cuCtxAttach.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib64/libcuda.so (cuCtxAttach + 0x15c) [0x13bd9c]
========= Host Frame:/opt/pgi/linux86-64/15.7/lib/libaccnc.so (__pgi_uacc_cuda_initdev + 0x73e) [0x5fee]
========= Host Frame:/opt/pgi/linux86-64/15.7/lib/libaccg.so (__pgi_uacc_enumerate + 0x152) [0x10002]
========= Host Frame:/opt/pgi/linux86-64/15.7/lib/libaccg.so (__pgi_uacc_initialize + 0x45) [0x10455]
========= Host Frame:/opt/pgi/linux86-64/15.7/lib/libaccg.so (__pgi_uacc_dataenterstart + 0x4e) [0x9ebe]
========= Host Frame:./example_gpu [0x2888]
========= Host Frame:./example_gpu [0x6355]
========= Host Frame:./example_gpu [0x27d4]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
========= Host Frame:./example_gpu [0x26d1]
=========
PGI: CUDA Performance Tools Interface (CUPTI) could not be initialized.
Please disable all profiling tools (including NVPROF) before using PGI_ACC_TIME.
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
calculation complete
test ok
Warning: ieee_inexact is signaling
FORTRAN STOP
Accelerator Kernel Timing data
/home/michel/hybrid/examples/strides/build/gpu/source/example.f90
initialize NVIDIA devicenum=0
time(us): 0
62: data region reached 1 time
/home/michel/hybrid/examples/strides/build/gpu/source/example.f90
cleanup NVIDIA devicenum=0
time(us): 0
68: data region reached 1 time
/home/michel/hybrid/examples/strides/build/gpu/source/example.f90
wrapper NVIDIA devicenum=0
time(us): 839
98: data region reached 1 time
98: data copyin transfers: 4
device time(us): total=472 max=129 min=113 avg=118
124: data region reached 1 time
124: data copyout transfers: 2
device time(us): total=367 max=194 min=173 avg=183
/home/michel/hybrid/examples/strides/build/gpu/source/example.f90
add NVIDIA devicenum=0
time(us): 0
153: compute region reached 4 times
159: kernel launched 4 times
grid: [2x2] block: [16x16]
elapsed time(us): total=828 max=282 min=142 avg=207
153: data region reached 4 times
164: data region reached 4 times
/home/michel/hybrid/examples/strides/build/gpu/source/example.f90
mult NVIDIA devicenum=0
time(us): 0
178: compute region reached 4 times
184: kernel launched 4 times
grid: [2x2] block: [16x16]
elapsed time(us): total=817 max=282 min=140 avg=204
178: data region reached 4 times
189: data region reached 4 times
========= ERROR SUMMARY: 1 error
output valgrind
==626== Memcheck, a memory error detector
==626== Copyright (C) 2002-2012, and GNU GPL'd, by Julian Seward et al.
==626== Using Valgrind-3.8.1 and LibVEX; rerun with -h for copyright info
==626== Command: ./example_gpu
==626==
==626== Warning: noted but unhandled ioctl 0x30000001 with no size/direction hints
==626== This could cause spurious value errors to appear.
==626== See README_MISSING_SYSCALL_OR_IOCTL for guidance on writing a proper wrapper.
==626== Warning: set address range perms: large range [0x800000000, 0x2900000000) (noaccess)
==626== Warning: set address range perms: large range [0x2900000000, 0x3600000000) (noaccess)
==626== Warning: set address range perms: large range [0x4000000000, 0x4d00000000) (noaccess)
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine getSlice3D
calling kernel getSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
entering subroutine storeSlice3D
calling kernel storeSlice3D_kernel with grid size 2 2
calculation complete
test ok
FORTRAN STOP
==626== Thread 3:
==626== Invalid read of size 8
==626== at 0x184D7E51: activityCallback (prof_cuda_cupti.c:167)
==626== by 0x18685F6A: ??? (in /opt/pgi/linux86-64/2015/cuda/6.5/lib64/libcupti.so.6.5.14)
==626== by 0x1888EF98: ??? (in /opt/pgi/linux86-64/2015/cuda/6.5/lib64/libcupti.so.6.5.14)
==626== by 0x3FD56079D0: start_thread (in /lib64/libpthread-2.12.so)
==626== by 0x3FD4EE89DC: clone (in /lib64/libc-2.12.so)
==626== Address 0x28 is not stack'd, malloc'd or (recently) free'd
==626==
==626==
==626== Process terminating with default action of signal 11 (SIGSEGV)
==626== Access not within mapped region at address 0x28
==626== at 0x184D7E51: activityCallback (prof_cuda_cupti.c:167)
==626== by 0x18685F6A: ??? (in /opt/pgi/linux86-64/2015/cuda/6.5/lib64/libcupti.so.6.5.14)
==626== by 0x1888EF98: ??? (in /opt/pgi/linux86-64/2015/cuda/6.5/lib64/libcupti.so.6.5.14)
==626== by 0x3FD56079D0: start_thread (in /lib64/libpthread-2.12.so)
==626== by 0x3FD4EE89DC: clone (in /lib64/libc-2.12.so)
==626== If you believe this happened as a result of a stack
==626== overflow in your program's main thread (unlikely but
==626== possible), you can try to increase the size of the
==626== main thread stack using the --main-stacksize= flag.
==626== The main thread stack size used in this run was 10485760.
==626==
==626== HEAP SUMMARY:
==626== in use at exit: 90,499,843 bytes in 329,646 blocks
==626== total heap usage: 366,966 allocs, 37,320 frees, 93,410,653 bytes allocated
==626==
==626== LEAK SUMMARY:
==626== definitely lost: 20 bytes in 1 blocks
==626== indirectly lost: 0 bytes in 0 blocks
==626== possibly lost: 5,162,594 bytes in 4,383 blocks
==626== still reachable: 85,337,229 bytes in 325,262 blocks
==626== suppressed: 0 bytes in 0 blocks
==626== Rerun with --leak-check=full to see details of leaked memory
==626==
==626== For counts of detected and suppressed errors, rerun with: -v
==626== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 12 from 9)
Killed
example.f90
module example
real(8), dimension(32, 32, 10) :: a_stride, b_stride, c_stride, d_stride
contains
subroutine initialize()
use openacc
use cudafor
!$acc enter data create(a_stride, b_stride, c_stride, d_stride)
end subroutine
subroutine cleanup()
use openacc
use cudafor
!$acc exit data delete(a_stride, b_stride, c_stride, d_stride)
end subroutine
subroutine run(nx, ny, a, b, c, d)
use openacc
use cudafor
implicit none
integer, intent(in) :: nx, ny
real(8), intent(in) :: a(nx, ny, 10), b(nx, ny, 10)
real(8), intent(out) :: c(nx, ny, 10), d(nx, ny, 10)
! ****** additional symbols inserted by framework to emulate device support of language features
integer(4) :: hf_symbols_are_device_present
! ****** end additional symbols
call wrapper(nx, ny, a, b, c, d)
end subroutine
subroutine wrapper(nx, ny, a, b, c, d)
use openacc
use cudafor
use helper_functions_gpu
implicit none
integer, intent(in) :: nx, ny
real(8), intent(in) :: a(nx, ny, 10), b(nx, ny, 10)
real(8), intent(out) :: c(nx, ny, 10), d(nx, ny, 10)
integer(4) :: num_strides_x, num_strides_y, stride_x, stride_y
integer(4) :: stride_x_offset, stride_y_offset
integer(4) :: stride_x_size, stride_y_size
! ****** additional symbols inserted by framework to emulate device support of language features
integer(4) :: hf_symbols_are_device_present
!$acc enter data copyin(a), copyin(c), copyin(b), copyin(d)
hf_symbols_are_device_present = acc_is_present(a)
! ****** end additional symbols
stride_x_size = min(nx, 32)
stride_y_size = min(ny, 32)
num_strides_x = nx / stride_x_size
num_strides_y = ny / stride_y_size
do stride_x = 0, num_strides_x - 1
do stride_y = 0, num_strides_y - 1
stride_x_offset = stride_x * stride_x_size
stride_y_offset = stride_y * stride_y_size
!$acc host_data use_device(a_stride, b_stride, c_stride, d_stride, a, b, c, d)
call getSlice3D(a_stride, a, stride_x_size, stride_y_size, 10, nx, ny, 10, stride_x_offset, stride_y_offset, 0)
call getSlice3D(b_stride, b, stride_x_size, stride_y_size, 10, nx, ny, 10, stride_x_offset, stride_y_offset, 0)
call getSlice3D(c_stride, c, stride_x_size, stride_y_size, 10, nx, ny, 10, stride_x_offset, stride_y_offset, 0)
call getSlice3D(d_stride, d, stride_x_size, stride_y_size, 10, nx, ny, 10, stride_x_offset, stride_y_offset, 0)
!$acc end host_data
call kernel_wrapper(stride_x_size, stride_y_size, a_stride, b_stride, c_stride, d_stride)
!$acc host_data use_device(c_stride, d_stride, c, d)
call storeSlice3D(c, c_stride, nx, ny, 10, stride_x_size, stride_y_size, 10, stride_x_offset, stride_y_offset, 0)
call storeSlice3D(d, d_stride, nx, ny, 10, stride_x_size, stride_y_size, 10, stride_x_offset, stride_y_offset, 0)
!$acc end host_data
end do
end do
!$acc exit data delete(a), copyout(c), delete(b), copyout(d)
end subroutine
subroutine kernel_wrapper(nx, ny, a, b, c, d)
use openacc
use cudafor
implicit none
integer, intent(in) :: nx, ny
real(8), intent(in) :: a(nx, ny, 10), b(nx, ny, 10)
real(8), intent(out) :: c(nx, ny, 10), d(nx, ny, 10)
! ****** additional symbols inserted by framework to emulate device support of language features
integer(4) :: hf_symbols_are_device_present
hf_symbols_are_device_present = acc_is_present(a)
! ****** end additional symbols
call add (nx, ny, a, b, c)
call mult (nx, ny, a, b, d)
end subroutine
subroutine add(nx, ny, a, b, c)
use openacc
use cudafor
implicit none
integer, intent(in) :: nx, ny
real(8), intent(in) :: a(nx, ny, 10), b(nx, ny, 10)
real(8), intent(out) :: c(nx, ny, 10)
integer :: z
integer(4) :: y, x
integer(4) :: hf_symbols_are_device_present
hf_symbols_are_device_present = acc_is_present(c)
!$acc kernels present(c) present(b) present(a)
!$acc loop independent vector(16)
do y=1,ny
!$acc loop independent vector(16)
do x=1,nx
!$acc loop seq
do z=1,10
c(x, y, z)= a(x, y, z)+ b(x, y, z)
end do
end do
end do
!$acc end kernels
end subroutine
subroutine mult(nx, ny, a, b, d)
use openacc
use cudafor
implicit none
integer, intent(in) :: nx, ny
real(8), intent(in) :: a(nx, ny, 10), b(nx, ny, 10)
real(8), intent(out) :: d(nx, ny, 10)
integer :: z
integer(4) :: y, x
integer(4) :: hf_symbols_are_device_present
hf_symbols_are_device_present = acc_is_present(b)
!$acc kernels present(b) present(a) present(d)
!$acc loop independent vector(16)
do y=1,ny
!$acc loop independent vector(16)
do x=1,nx
!$acc loop seq
do z=1,10
d(x, y, z)= a(x, y, z)* b(x, y, z)
end do
end do
end do
!$acc end kernels
end subroutine
end module example
program main
use example
implicit none
integer, parameter :: nx = 64
integer, parameter :: ny = 64
real(8), dimension(nx, ny, 10) :: a, b, c, d
integer :: x, y, z, firstErrorCx, firstErrorCy, firstErrorCz, firstErrorDx, firstErrorDy, firstErrorDz
real :: errorValC, errorValD
logical test
do y=1,ny
do x=1,ny
do z=1,10
a(x, y, z) = (real(x)-1) + 100.d0*(real(y)-1) + 10000.d0*(real(z)-1)
b(x, y, z) = (real(x)-1) + 100.d0*(real(y)-1) + 10000.d0*(real(z)-1)
end do
end do
end do
c(:,:,:) = 0.0d0
d(:,:,:) = 0.0d0
call initialize
call run(nx, ny, a, b, c, d)
call cleanup
write(6,*) "calculation complete"
test = .TRUE.
firstErrorCx = -1
firstErrorCy = -1
firstErrorCz = -1
firstErrorDx = -1
firstErrorDy = -1
firstErrorDz = -1
errorValC = -1.0d0
errorValD = -1.0d0
do y=1,ny
do x=1,ny
do z=1,10
if (c(x, y, z) .NE. 2.d0*((real(x)-1) + 100.d0*(real(y)-1) + 10000.d0*(real(z)-1))) then
test = .FALSE.
if (firstErrorCx .EQ. -1.0d0) then
firstErrorCx = x
firstErrorCy = y
firstErrorCz = z
errorValC = c(x, y, z)
end if
end if
if (d(x, y, z) .NE. ((real(x)-1) + 100.d0*(real(y)-1) + 10000.d0*(real(z)-1))**2) then
test = .FALSE.
if (firstErrorDx .EQ. -1.0d0) then
firstErrorDx = x
firstErrorDy = y
firstErrorDz = z
errorValD = d(x, y, z)
end if
end if
end do
end do
end do
if (test .EQ. .TRUE.) then
write(6,*) "test ok"
else
write(6,*) "test failed"
write(6,*) "First Error in C: ", errorValC, "@x=", firstErrorCx, ",y=", firstErrorCy, ",z=", firstErrorCz
write(6,*) "First Error in D: ", errorValD, "@x=", firstErrorDx, ",y=", firstErrorDy, ",z=", firstErrorDz
stop 2
end if
stop
end program main
helper_functions_gpu
module helper_functions_gpu
implicit none
private
public :: getSlice3D
public :: storeSlice3D
contains
subroutine getSlice3D(sliced, original, nx, ny, nz, nx_original, ny_original, nz_original, offset_x, offset_y, offset_z)
use cudafor
implicit none
integer(4), intent(in) :: nx, ny, nz, nx_original, ny_original, nz_original, offset_x, offset_y, offset_z
real(8), intent(in) ,device :: original(nx_original, ny_original, nz_original)
real(8), intent(out) ,device :: sliced(nx, ny, nz)
! ****** additional symbols inserted by framework to emulate device support of language features
type(dim3) :: cugrid, cublock
integer(4) :: cugridSizeX, cugridSizeY, cugridSizeZ, cuerror, cuErrorMemcopy
write(0,*) 'entering subroutine getSlice3D'
! ****** end additional symbols
cuerror = cudaFuncSetCacheConfig(getSlice3D_kernel, cudaFuncCachePreferL1)
cuerror = cudaGetLastError()
if(cuerror .NE. cudaSuccess) then
write(0, *) 'CUDA error when setting cache configuration for kernel getSlice3D_kernel:', cudaGetErrorString(cuerror)
stop 1
end if
cugridSizeX = ceiling(real(nx) / real(16))
cugridSizeY = ceiling(real(ny) / real(16))
cugridSizeZ = 1
cugrid = dim3(cugridSizeX, cugridSizeY, cugridSizeZ)
cublock = dim3(16, 16, 1)
write(0,*) 'calling kernel getSlice3D_kernel with grid size', cugridSizeX, cugridSizeY
call getSlice3D_kernel <<< cugrid, cublock >>>(sliced, original, nx, ny, nz, nx_original, ny_original, nz_original, offset_x, &
& offset_y, offset_z)
cuerror = cudaThreadSynchronize()
cuerror = cudaGetLastError()
if(cuerror .NE. cudaSuccess) then
write(0, *) 'CUDA error in kernel getSlice3D_kernel:', cudaGetErrorString(cuerror)
stop 1
end if
if(cuerror .NE. cudaSuccess) then
stop 1
end if
end subroutine
attributes(global) subroutine getSlice3D_kernel(sliced, original, nx, ny, nz, nx_original, ny_original, nz_original, offset_x, &
& offset_y, offset_z)
use cudafor
implicit none
integer(4), intent(in) ,value :: nx, ny, nz, nx_original, ny_original, nz_original, offset_x, offset_y, offset_z
real(8), intent(in) ,device :: original(nx_original, ny_original, nz_original)
real(8), intent(out) ,device :: sliced(nx, ny, nz)
integer(4) :: z
integer(4) :: y, x
x = (blockidx%x - 1) * blockDim%x + threadidx%x
y = (blockidx%y - 1) * blockDim%y + threadidx%y
if (x .GT. nx .OR. x .LT. 1 .OR. y .GT. ny .OR. y .LT. 1) then
return
end if
do z = 1, nz
sliced(x, y, z)= original(x+offset_x, y+offset_y, z+offset_z)
end do
end subroutine
subroutine storeSlice3D(original, sliced, nx_original, ny_original, nz_original, nx, ny, nz, offset_x, offset_y, offset_z)
use cudafor
implicit none
integer(4), intent(in) :: nx_original, ny_original, nz_original, nx, ny, nz, offset_x, offset_y, offset_z
real(8), intent(out) ,device :: original(nx_original, ny_original, nz_original)
real(8), intent(in) ,device :: sliced(nx, ny, nz)
! ****** additional symbols inserted by framework to emulate device support of language features
type(dim3) :: cugrid, cublock
integer(4) :: cugridSizeX, cugridSizeY, cugridSizeZ, cuerror, cuErrorMemcopy
write(0,*) 'entering subroutine storeSlice3D'
! ****** end additional symbols
cuerror = cudaFuncSetCacheConfig(storeSlice3D_kernel, cudaFuncCachePreferL1)
cuerror = cudaGetLastError()
if(cuerror .NE. cudaSuccess) then
write(0, *) 'CUDA error when setting cache configuration for kernel storeSlice3D_kernel:', cudaGetErrorString(cuerror)
stop 1
end if
cugridSizeX = ceiling(real(nx) / real(16))
cugridSizeY = ceiling(real(ny) / real(16))
cugridSizeZ = 1
cugrid = dim3(cugridSizeX, cugridSizeY, cugridSizeZ)
cublock = dim3(16, 16, 1)
write(0,*) 'calling kernel storeSlice3D_kernel with grid size', cugridSizeX, cugridSizeY
call storeSlice3D_kernel <<< cugrid, cublock >>>(original, sliced, nx_original, ny_original, nz_original, nx, ny, nz, offset_x, &
& offset_y, offset_z)
cuerror = cudaThreadSynchronize()
cuerror = cudaGetLastError()
if(cuerror .NE. cudaSuccess) then
write(0, *) 'CUDA error in kernel storeSlice3D_kernel:', cudaGetErrorString(cuerror)
stop 1
end if
if(cuerror .NE. cudaSuccess) then
stop 1
end if
end subroutine
attributes(global) subroutine storeSlice3D_kernel(original, sliced, nx_original, ny_original, nz_original, nx, ny, nz, offset_x, &
& offset_y, offset_z)
use cudafor
implicit none
integer(4), intent(in) ,value :: nx_original, ny_original, nz_original, nx, ny, nz, offset_x, offset_y, offset_z
real(8), intent(out) ,device :: original(nx_original, ny_original, nz_original)
real(8), intent(in) ,device :: sliced(nx, ny, nz)
integer(4) :: z
integer(4) :: y, x
x = (blockidx%x - 1) * blockDim%x + threadidx%x
y = (blockidx%y - 1) * blockDim%y + threadidx%y
if (x .GT. nx .OR. x .LT. 1 .OR. y .GT. ny .OR. y .LT. 1) then
return
end if
do z = 1, nz
original(x+offset_x, y+offset_y, z+offset_z)= sliced(x, y, z)
end do
end subroutine
end module