Hi all,
The following code can be compiled with PVF 10.9 and 12.x (last year) but fails with 13.7.
The error messages are:
GPU_RainfallInt.cuf(42) : error S0155 : Illegal call from host code to device subprogram __pgi_get_last_error
GPU_RainfallInt.cuf(44) : error S0155 : Could not resolve generic procedure cudageterrorstring
GPU_RainfallInt.cuf(52) : error S0155 : Illegal call from host code to device subprogram __pgi_get_last_error
GPU_RainfallInt.cuf(54) : error S0155 : Could not resolve generic procedure cudageterrorstring
GPU_RainfallInt.cuf(61) : error S0155 : Illegal call from host code to device subprogram __pgi_get_last_error
GPU_RainfallInt.cuf(62) : error S0155 : Could not resolve generic procedure cudageterrorstring
Should I modify my codes?
Thanks in advance.
Feng
module GPU_OK_Scheme
use cudafor
use cudadevice
use IO_Parameter
implicit none
real, device, allocatable :: dgx(:), dgy(:), sorain(:,:)
real, constant :: sox(1000), soy(1000)
real*8, device, allocatable :: d_inv_cov_matrix(:,:)
real*8, device, allocatable :: d_grid_rain(:)
contains
!*********************************************************************************************************************
subroutine OKgrid_rain2(prop, nobs, ntime, ngrid, var, integral, nugget, vmin, vmax, cov_vec)
implicit none
integer :: nobs, ntime, ngrid
real*8 :: var, integral, nugget, vmin, vmax
real*8 :: cov_vec(nobs+1, ngrid)
integer::MaxBatchNdem, NBatchLoop, NBatch, NB
type(dim3) :: dimGrid, dimBlock
type(cudadeviceprop) :: prop
real*8, device, allocatable :: d_lambda(:,:),d_Cdev(:,:)
integer :: i, j, ss
MaxBatchNdem=9600
if( ngrid <= MaxBatchNdem )then
NBatchLoop=1
else
NBatchLoop= ngrid/MaxBatchNdem + 1
end if
allocate( d_lambda(MaxBatchNdem,Nobs+1) )
allocate( d_Cdev(MaxBatchNdem,Nobs+1) )
do j=1, NBatchLoop
ss=(j-1)*MaxBatchNdem
NBatch= min( Ngrid, Ngrid-(j-1)*MaxBatchNdem, MaxBatchNdem)
dimGrid = dim3( (NBatch-1)/16+1, (Nobs+1-1)/16+1, 1 )
dimBlock = dim3( 16, 16, 1 )
call gpu_assign_B2_matrix<<<dimGrid, dimBlock,0,0>>>(d_lambda, nobs, NBatch, ss, ngrid, dgx, dgy, var, integral, nugget)
i = cudaGetLastError()
if (i .ne. 0)then
print *, cudaGetErrorString(i)
pause
end if
dimGrid = dim3( (NBatch-1)/16+1, (Nobs+1-1)/16+1, 1 )
dimBlock = dim3( 16, 16, 1 )
call gpu_cal_coef<<< dimGrid, dimBlock,0,0>>>( d_lambda, d_inv_cov_matrix, d_Cdev, NBatch, Nobs+1, Nobs+1)
i = cudaGetLastError()
if (i .ne. 0)then
print *, cudaGetErrorString(i)
pause
end if
dimGrid = dim3( NBatch/(prop%maxThreadsPerBlock/1)+1, 1, 1 )
dimBlock = dim3( prop%maxThreadsPerBlock/1, 1, 1 )
call gpu_cal_rain<<<dimGrid,dimBlock,0,0>>>(d_Cdev, sorain, Nobs, NBatch, ss, NHr, Ntime, vmin, vmax)
i = cudaGetLastError()
if (i .ne. 0) print *, cudaGetErrorString(i)
end do
deallocate( d_lambda)
deallocate( d_Cdev )
end subroutine OKgrid_rain2
!*********************************************************************************************************************
attributes(global) subroutine gpu_assign_B2_matrix(d_lambda, nobs, NBatch , ss, ngrid, dgx, dgy, var, integral, nugget )
implicit none
integer, value :: nobs, ngrid, NBatch, ss
real*8, value :: var, integral, nugget
real*8, device :: d_lambda(NBatch,nobs+1), d
real, device :: dgx(ngrid), dgy(ngrid)
real*8, shared :: ssox(16), ssoy(16)
integer, shared :: iby, ibx, ibid
integer,device :: tx, ty, i, j
tx= threadidx%x
ty= threadidx%y
if( tx==1 .and. ty==1 )then
if( NBatch==nobs+1 )then
iby=blockidx%x-1
ibx=blockidx%x + blockidx%y-2
if( ibx .ge. griddim%x) ibx=ibx-griddim%x
else
ibid=griddim%x*(blockidx%y-1)+blockidx%x -1
iby=mod(ibid, griddim%y)
ibx=mod(ibid/griddim%y+iby, griddim%x)
end if
end if
call syncthreads()
i=tx+ ibx*blockdim%x
j=ty+ iby*blockdim%x
if ( tx==1 .and. j<=nobs)then
ssox(ty)=DBLE(sox(j))
ssoy(ty)=DBLE(soy(j))
end if
call syncthreads()
if ( i <= nBatch .and. j<=nobs )then
d = DSQRT( (DBLE(dgx(ss+i)) - ssox(ty))*(DBLE(dgx(ss+i)) -ssox(ty)) + &
(DBLE(dgy(ss+i)) - ssoy(ty))*(DBLE(dgy(ss+i)) - ssoy(ty)) )
d_lambda(i,j) = var - ( nugget + (var-nugget)*(1.d0- exp(-1.d0*d/integral)) )
else if( i <= nBatch .and. j==nobs+1)then
d_lambda(i,nobs+1) = 1.d0
end if
end subroutine gpu_assign_B2_matrix
...
end module GPU_OK_Scheme
the module IO_Parameter is
MODULE IO_Parameter
IMPLICIT NONE
CHARACTER(LEN=200) :: DEM_FILE, VAL_FILE, OUT_FILE, DEM_FILE_TWD97
CHARACTER(LEN=3) :: RAIN_FORM, VAL_TYPE
CHARACTER(LEN=2) :: INTERP_METHOD
LOGICAL::FILE_Existed=.FALSE.
INTEGER :: Nval, Ndem, Nhr, dt, ncols, nrows, dem_cols, dem_rows
END MODULE IO_Parameter