Hi,
I’m still testing the allocate statements in device subroutines and I come across a new problem.
This is my test code:
MODULE mod_kernel
USE cudafor
CONTAINS
ATTRIBUTES(GLOBAL) SUBROUTINE kernel(x,n)
IMPLICIT NONE
INTEGER,VALUE,INTENT(IN) :: n
INTEGER,DEVICE,INTENT(OUT) :: x(n)
INTEGER,DEVICE,ALLOCATABLE :: A(:)
INTEGER :: tid, STATUS
tid = threadidx%x
ALLOCATE(A(13317),STAT=STATUS)
WRITE(*,*) STATUS
IF (tid<=n) THEN
A = tid
x(tid) = SUM(A)
END IF
CALL SYNCTHREADS()
DEALLOCATE(A)
RETURN
END SUBROUTINE kernel
END MODULE mod_kernel
PROGRAM test
USE cudafor
USE mod_kernel
IMPLICIT NONE
INTEGER,PARAMETER :: n=100
INTEGER :: x(n)
INTEGER,ALLOCATABLE,DEVICE :: dev_x(:)
x = 1
ALLOCATE(dev_x(n))
CALL kernel<<<1,n>>>(dev_x,n)
x = dev_x
DEALLOCATE(dev_x)
WRITE(*,*) x
END PROGRAM test
I find the maximum size of the allocatable array A in the device subroutine kernel of the module mod_kernel is 13316. When a larger number is adopted, errors occur.
Error Message:
NVRM: Xid (0000:04:00): 31 Ch00000002, engmask 00000101, intr 10000000
0: copyout Memcpy (host=0x6b8600, dev=0xb01600000, size=400) FAILED: 4(unspecified launch failure)
From the value of the variable STATUS, we can figure out that some of the threads allocate memory successfully while others not.
Why does this occur?
Can I change the maximum size of the allocatable array to be larger with altering the compiler opition?
Hi OceanCloud,
Looks like you’re going beyond the max heap size. To raise this, call “cudaDeviceSetLimit”.
% cat test.f90
MODULE mod_kernel
USE cudafor
CONTAINS
ATTRIBUTES(GLOBAL) SUBROUTINE kernel(x,n)
IMPLICIT NONE
INTEGER,VALUE,INTENT(IN) :: n
INTEGER,DEVICE,INTENT(OUT) :: x(n)
INTEGER,DEVICE,ALLOCATABLE :: A(:)
INTEGER :: tid, STATUS
tid = threadidx%x
ALLOCATE(A(13317),STAT=STATUS)
! WRITE(*,*) STATUS
IF (tid<=n) THEN
A = tid
x(tid) = SUM(A)
END IF
CALL SYNCTHREADS()
DEALLOCATE(A)
RETURN
END SUBROUTINE kernel
END MODULE mod_kernel
PROGRAM test
USE cudafor
USE mod_kernel
IMPLICIT NONE
INTEGER,PARAMETER :: n=100
INTEGER :: x(n)
INTEGER,ALLOCATABLE,DEVICE :: dev_x(:)
integer(kind=cuda_count_kind) :: val
integer :: rc
rc= cudaDeviceGetLimit( val, cudaLimitMallocHeapSize )
print *, "Max Heap Size: ", val
val = val*2
rc= cudaDeviceSetLimit( cudaLimitMallocHeapSize, val )
rc= cudaDeviceGetLimit( val, cudaLimitMallocHeapSize )
print *, "New Heap Size: ", val
x = 1
ALLOCATE(dev_x(n))
CALL kernel<<<1,n>>>(dev_x,n)
x = dev_x
DEALLOCATE(dev_x)
WRITE(*,*) x
END PROGRAM test
% pgf90 -Mcuda=rdc,cuda5.5,cc35 test.f90 ; a.out
Max Heap Size: 8388608
New Heap Size: 16777216
13317 26634 39951 53268 66585 79902
93219 106536 119853 133170 146487 159804
173121 186438 199755 213072 226389 239706
253023 266340 279657 292974 306291 319608
332925 346242 359559 372876 386193 399510
412827 426144 439461 452778 466095 479412
492729 506046 519363 532680 545997 559314
572631 585948 599265 612582 625899 639216
652533 665850 679167 692484 705801 719118
732435 745752 759069 772386 785703 799020
812337 825654 838971 852288 865605 878922
892239 905556 918873 932190 945507 958824
972141 985458 998775 1012092 1025409 1038726
1052043 1065360 1078677 1091994 1105311 1118628
1131945 1145262 1158579 1171896 1185213 1198530
1211847 1225164 1238481 1251798 1265115 1278432
1291749 1305066 1318383 1331700