Hi,
I am trying to use a CUDA library function, cusolverDnXgesvd. Before executing this function, a handle must be created and tied to a CUDA stream. For example:
istat = cusolverDnCreate(handle)
istat = cusolverDnSetStream(handle,acc_get_cuda_stream(22))
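(As I understand it, acc_get_cuda_stream(22) returns the CUDA stream that the OpenACC runtime associates with async queue 22.)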
After the cusolverDnXgesvd function finishes, the resulting vector or matrix is usually copied from the device back to the host. In my program, however, this copy must be executed on the device, i.e. inside a kernel on a specified stream. For example:
!$acc kernels async(22) copy(S_c)
!$acc loop
do i = 1, cols
  S_c(i) = S(i)
enddo
!$acc end kernels
I am not sure whether the stream tied to the handle and the queue label in the async clause refer to the same CUDA stream. Will the two parts run in order, or concurrently?
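In other words, is the following ordering guaranteed? This is only a sketch of what I expect; the comments mark my assumptions, not verified behaviour:

istat = cusolverDnSetStream(handle, acc_get_cuda_stream(22))
istat = cusolverDnXgesvd(handle, ...) ! enqueued on stream 22 (my assumption)
!$acc kernels async(22) copy(S_c) ! enqueued on the same stream 22 (my assumption)
!$acc loop
do i = 1, cols
  S_c(i) = S(i)
enddo
!$acc end kernels
!$acc wait(22) ! host waits for all work on queue 22

Below is the test code for the function I use: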
SUBROUTINE test_cusolverDnXgesvd
USE cudafor
USE cusolverDn
USE openacc
IMPLICIT NONE
INTEGER(4) :: i
!============= cuSOLVER APIs =============
type(cusolverDnHandle) :: handle ! Handle to the cuSOLVER library context
type(cusolverDnParams) :: params ! Handle to the cuSOLVER parameters structure
INTEGER(KIND=8), parameter :: rows=2, cols=2 ! Dimensions of the dense matrix
INTEGER(KIND=4) :: istat ! Status returned by the library functions
INTEGER(KIND=8), parameter :: lda=rows, ldu=rows, ldvt=cols
CHARACTER(1) :: jobu, jobvt
integer(8) :: workspaceInBytesOnDevice, workspaceInBytesOnHost
REAL(KIND=8), device:: A(rows,cols)
REAL(KIND=8), device:: S(cols)
REAL(KIND=8), device:: U(ldu,rows)
REAL(KIND=8), device:: VT(ldvt,cols)
INTEGER(1), DIMENSION(:), device, ALLOCATABLE :: bufferOnDevice
INTEGER(1), DIMENSION(:), ALLOCATABLE :: bufferOnHost
integer(4), device :: devinfo
REAL(KIND=8):: S_c(cols)
istat = cusolverDnCreate(handle)
istat = cusolverDnSetStream(handle,acc_get_cuda_stream(22))
istat = cusolverDnCreateParams(params)
istat = cusolverDnSetAdvOptions(params, CUSOLVERDN_GETRF, CUSOLVER_ALG_0)
jobu = "A"
jobvt = "A"
S_c = 0
A(1,:) =(/1,-2/)
A(2,:) =(/1,2/)
! A(3,:) =(/2,1/)
istat = cusolverDnXgesvd_bufferSize(handle,params,jobu,jobvt,rows,cols,cudaDataType(CUDA_R_64F),&
A,lda,cudaDataType(CUDA_R_64F),S,cudaDataType(CUDA_R_64F),U,ldu,&
cudaDataType(CUDA_R_64F),VT,ldvt,cudaDataType(CUDA_R_64F),&
workspaceInBytesOnDevice, workspaceInBytesOnHost)
allocate(bufferOnDevice(workspaceInBytesOnDevice), bufferOnHost(workspaceInBytesOnHost))
istat = cusolverDnXgesvd(handle,params,jobu,jobvt,rows,cols,cudaDataType(CUDA_R_64F),&
A,lda,cudaDataType(CUDA_R_64F),S,cudaDataType(CUDA_R_64F),U,ldu,&
cudaDataType(CUDA_R_64F),VT,ldvt,cudaDataType(CUDA_R_64F),&
bufferOnDevice,workspaceInBytesOnDevice,&
bufferOnHost,workspaceInBytesOnHost,&
devinfo)
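! note (my understanding): cusolverDnXgesvd runs on the stream tied to the
! handle (queue 22 above), so the call returns to the host asynchronously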
!$acc kernels copy(S_c)
!$acc loop
do i = 1, cols
  S_c(i) = S(i)
enddo
!$acc end kernels
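! without async, the kernels region above is synchronous: the copy-out of
! S_c has completed before the write below executes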
write(*,*) S_c
istat = cusolverDnDestroy(handle)
istat = cusolverDnDestroyParams(params)
istat = cudaStreamDestroy(acc_get_cuda_stream(22))
deallocate(bufferOnDevice,bufferOnHost)
stop
END SUBROUTINE test_cusolverDnXgesvd
You can call this subroutine directly and the correct results are obtained (2.828427124, 1.4142135623). However, the results become incorrect if you add "async(22)" to the kernels directive.
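For reference, the failing variant only changes the kernels directive. My guess (not verified) is that an explicit wait would then be needed before the host reads S_c, because the copy-out becomes asynchronous:

!$acc kernels async(22) copy(S_c)
!$acc loop
do i = 1, cols
  S_c(i) = S(i)
enddo
!$acc end kernels
!$acc wait(22) ! my assumption: without this, the write may read S_c too early
write(*,*) S_c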
Best wishes,
Amor