I have this subroutine:
! Host-side reference benchmark.
!
! Builds a flattened logical selection from x > threshold, counts the selected
! cells, packs their linear indices, and repeatedly packs the matching values
! of y.  The whole sequence runs ncycles times between two system_clock
! samples; afterwards the count, two checksums and the elapsed time per cycle
! (scaled by 1e6 and truncated with int) are printed.
!
! Arguments:
!   x  - 2-D integer field compared against threshold to build the selection
!   y  - 2-D integer field whose selected elements are packed
!
! ncycles, threshold, jpi and jpj are not declared in this routine; presumably
! they come from the enclosing scope -- verify against the host module/program.
subroutine test2(x, y)
  integer :: x(:,:), y(:,:)
  integer, allocatable :: idx_list(:)
  integer, allocatable :: packed(:)
  logical, allocatable :: selected(:)
  integer :: n_selected, cell
  integer :: cycle_no, rep
  integer :: ticks_per_sec, tick_begin, tick_end

  call system_clock(count_rate=ticks_per_sec)
  call system_clock(tick_begin)

  do cycle_no = 1, ncycles
    ! Flatten the comparison result into a rank-1 mask of jpi*jpj entries.
    selected = reshape(x > threshold, [jpi*jpj])
    n_selected = count(selected)

    ! Linear indices (1..jpi*jpj) of the selected cells.
    idx_list = pack([(cell, cell = 1, jpi*jpj)], selected)

    ! Explicit allocation happens only once, on the first cycle.
    if (.not. allocated(packed)) then
      allocate(packed(n_selected))
    end if

    ! The same pack is repeated 10 times per cycle.
    do rep = 1, 10
      packed = pack(reshape(y, [jpi*jpj]), selected)
    end do
  end do

  call system_clock(tick_end)

  write (*,*) "npti: ", n_selected
  write (*,*) "SUM nptidx: ", sum(idx_list)
  write (*,*) "SUM z: ", sum(packed)
  write (*,*) "time: ", &
    int(real(tick_end - tick_begin) / real(ticks_per_sec) / ncycles * 1e6)

  deallocate(packed)
end subroutine test2
Inspired by section 3.9.4 of the current NVIDIA CUDA Fortran Programming Guide, I’d like to have a version of it running on GPUs, using the CUDA versions of RESHAPE, COUNT and PACK from the CUTENSOREX library. I’m trying to do this with the following version of the code:
! Intended GPU version of test2: same sequence of operations, but with pack,
! reshape and count taken from the cutensorex module instead of the intrinsics.
!
! Arguments:
!   x  - 2-D integer field compared against threshold to build the mask
!   y  - 2-D integer field whose selected elements are packed
!
! ncycles, threshold, jpi and jpj are not declared in this routine; presumably
! they come from the enclosing scope -- verify against the host module/program.
!
! NOTE(review): as written this routine does not compile with nvfortran; the
! two statements flagged by the reported diagnostics are marked below.
SUBROUTINE test3(x, y)
! Only these three generic names are imported from cutensorex.
USE cutensorex, only: pack, reshape, count
INTEGER :: x(:,:), y(:,:)
INTEGER :: i, npti
INTEGER, ALLOCATABLE :: nptidx(:)
LOGICAL, ALLOCATABLE :: mask(:)
INTEGER, ALLOCATABLE :: z(:)
INTEGER, ALLOCATABLE :: y1d(:)
INTEGER :: k, m, count_rate, count_start, count_end
! Only x and y appear in a data clause here; mask, nptidx, y1d and z are not
! listed in any data clause visible in this routine.
!$acc data copyin(x,y)
call system_clock(count_rate = count_rate)
call system_clock(count_start)
do k = 1, ncycles
!$acc host_data use_device(x)
! NOTE(review): nvfortran reports NVFORTRAN-S-0074 (illegal number or type of
! arguments to reshape, keyword argument source) for this statement.
! Presumably the cutensorex reshape generic has no specific that accepts a
! logical expression such as x > threshold -- TODO confirm against the
! cutensorex interface list.
mask = reshape(x > threshold, [jpi*jpj])
!$acc end host_data
! NOTE(review): mask is named in use_device here and below although it is not
! in the enclosing data clause -- verify that this is intended.
!$acc host_data use_device(mask)
npti = count(mask)
!$acc end host_data
!$acc host_data use_device(mask)
! The first argument is an array constructor holding 1..jpi*jpj.
nptidx = pack([(i, i=1,jpi*jpj)], mask)
!$acc end host_data
! Explicit allocation happens only once, on the first cycle.
if (.not. allocated(z)) allocate(z(npti))
DO m = 1, 10
!$acc host_data use_device(y)
y1d = reshape(y, [jpi*jpj])
!$acc end host_data
!$acc host_data use_device(y1d,mask)
! NOTE(review): nvfortran reports NVFORTRAN-S-0099 (illegal use of derived
! type) for this statement.  Presumably the cutensorex pack returns a
! derived-type result whose assignment is not defined for this left-hand side
! (z is a plain allocatable that is not named in use_device) -- TODO confirm.
z = pack(y1d, mask)
!$acc end host_data
END DO
end do
! Synchronize before taking the end timestamp.
! NOTE(review): no `use cudafor` is visible in this routine; verify that
! invoking cudaDeviceSynchronize with `call` matches its actual interface.
call cudaDeviceSynchronize()
call system_clock(count_end)
!$acc end data
write (*,*) "npti: ", npti
write (*,*) "SUM nptidx: ", SUM(nptidx)
write (*,*) "SUM z: ", SUM(z)
! Elapsed ticks / count_rate / ncycles, scaled by 1e6 and truncated by INT.
write (*,*) "time: ", INT(real(count_end - count_start) / real(count_rate) / ncycles * 1e6)
deallocate(z)
END SUBROUTINE
But the compiler reports the following diagnostics:
$ nvfortran -cuda -acc=gpu -O3 -o sc_test sc_test.f90 -cudalib=cutensor
NVFORTRAN-S-0074-Illegal number or type of arguments to reshape - keyword argument source (sc_test.f90: 129)
NVFORTRAN-S-0099-Illegal use of derived type (sc_test.f90: 143)
0 inform, 0 warnings, 2 severes, 0 fatal for test3
where line 129 is mask = reshape(x > threshold, [jpi*jpj]) and line 143 is: z = pack(y1d, mask).
I suspect I’m hitting some limitations here, but I can’t figure out what they are. Is there any way to make this code work?
Best regards,
Alexey