The following example uses dynamic parallelism and passes some device arrays from parent to child.
! Compile with pgfortran -ta=tesla,cc50 test.cuf
MODULE Test_Module
CONTAINS
ATTRIBUTES(global) SUBROUTINE Kernel_Child(a,b)
DOUBLE PRECISION,DIMENSION(:) :: a
DOUBLE PRECISION,DIMENSION(:) :: b
PRINT *,'C',SIZE(a)
END SUBROUTINE Kernel_Child
ATTRIBUTES(global) SUBROUTINE Kernel_Parent(a,b)
DOUBLE PRECISION,DIMENSION(:) :: a
DOUBLE PRECISION,DIMENSION(:) :: b
TYPE(dim3) :: grid, block
PRINT *,'P',SIZE(a)
grid = dim3(1,1,1)
block = dim3(4,1,1)
CALL Kernel_Child<<<grid,block>>>(a,b)
END SUBROUTINE Kernel_Parent
END MODULE Test_Module
!
PROGRAM Test
USE CUDAFOR
USE Test_Module
IMPLICIT NONE
!
INTEGER,PARAMETER :: n = 8
DOUBLE PRECISION,DIMENSION(n) :: x
DOUBLE PRECISION,DIMENSION(n),DEVICE :: a_d
DOUBLE PRECISION,DIMENSION(n),DEVICE :: b_d
TYPE(dim3) :: grid, block
INTEGER :: i
!
CALL Set_Device_Array(a_d,b_d)
grid = dim3(1,1,1)
block = dim3(1,1,1)
CALL Kernel_Parent<<<grid,block>>>(a_d,b_d)
x = a_d
DO i=1, n
PRINT *,x(i)
END DO
CONTAINS
SUBROUTINE Set_Device_Array(a_d,b_d)
DOUBLE PRECISION,DIMENSION(:),DEVICE,INTENT(OUT) :: a_d
DOUBLE PRECISION,DIMENSION(:),DEVICE,INTENT(OUT) :: b_d
a_d = 1.0
b_d = 2.0
END SUBROUTINE Set_Device_Array
END PROGRAM Test
It compiles but crashes with an unspecified launch failure:
PGI$ pgfortran --version
pgfortran 17.4-0 64-bit target on x86-64 Windows -tp haswell
PGI Compilers and Tools
Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
PGI$ pgfortran -ta=tesla,cc50 test.cuf
PGI$ ./test
P 8
0: copyout Memcpy (host=0x0000000140175800, dev=0x0000000704340000, size=64) FAILED: 4(unspecified launch failure)
However the code runs if the arrays of the child are not accessed. E. g. replacing *PRINT ,‘C’,SIZE(a) by *PRINT ,‘C’ produces the expected output:
PGI$ ./test
P 8
C
C
C
C
1.000000000000000
1.000000000000000
1.000000000000000
1.000000000000000
1.000000000000000
1.000000000000000
1.000000000000000
1.000000000000000
Am I missing something or is there a workaround?
Thanks