Hi,
I am using the function “cusolverDnXgetrs” to solve the linear system of equations. The subroutine need to be called multiple times. I found when it is called first, the correct result can be obtain. However, using it again, I obtained the incorrect result, and the result (is the “gm” in my codes) after using the function is same as the result before using the function. In other words, it look the “gm” is not be changed form CPU to GPU and then to CPU. It is tricky why the result can be obtained on the first time. I am sure that the “A” and “b” of the “Ax=b” are updated correctly before they are copied on the deviced. Here are my codes:
SUBROUTINE Calculate_Updating_model_Direct_GPU(conductivity)
IMPLICIT NONE
REAL(r8k), intent(inout):: conductivity( Model_Num )
INTEGER(i4k):: i,j,k, moment, point
INTEGER(i4k):: Hessian_row,Hessian_col,WtW_row, idx_tol
INTEGER(i8k)::t1,t2,count_rate !t1 denotes original cpu time , t2 denotes the end cpu time and t=t2-t1
REAL(r8k) ::t
!============= Calculate normal equation APIs =============
TYPE(cusolverDnHandle):: handle_LSE ! A handle that handle to the cuSolver library context
TYPE(cusolverDnParams):: params_LSE ! A handle that handle to the cuSolver library context
INTEGER(i8k)::rows ! The number of rows of the dense matrix
INTEGER(i8k)::cols ! The number of columns of the dense matrix
INTEGER(i8k):: ld, ldb
REAL(r8k), device :: dX(Model_Num) ! Values of the dense vector, i.e. unknown term. Array of size cols
integer(8) :: workspaceInBytesOnDevice, workspaceInBytesOnHost
integer(4), device :: devinfo
INTEGER(i1k), DIMENSION(:), device, ALLOCATABLE :: bufferOnDevice
INTEGER(i1k), DIMENSION(:), ALLOCATABLE :: bufferOnHost
INTEGER(i8k), device:: ipiv(Model_Num) ! Array of size at least min(rows,cols), containing pivot indices
!============= Host problem definition =============
rows = Model_Num
cols = Model_Num
ld = Model_Num
ldb = Model_Num
!============= Device memory management =============
istat = cusolverDnCreate(handle_LSE)
istat = cusolverDnSetStream(handle_LSE,acc_get_cuda_stream(1))
istat = cusolverDnCreateParams(params_LSE)
istat = cusolverDnSetAdvOptions(params_LSE, CUSOLVERDN_GETRF, CUSOLVER_ALG_0)
!============= Record the computation time =============
call system_clock(count_rate=count_rate)
call system_clock (t1)
! d_dense = 0
!$OMP PARALLEL num_threads(Threads_Num)
!$OMP DO PRIVATE(WtW_row,Hessian_col)
do Hessian_row=1, Model_Num
do WtW_row=Mat_Wm2%Pos_Row_Firstdata(Hessian_row), Mat_Wm2%Pos_Row_Firstdata(Hessian_row+1)-1
Hessian_col = Mat_Wm2%Col_List(WtW_row)
d_dense( (Hessian_row-1)*Model_Num + Hessian_col) = d_dense( (Hessian_row-1)*Model_Num + Hessian_col) + laida*Mat_Wm2%Val_List(WtW_row)
enddo
enddo
!$OMP END DO
!$OMP DO PRIVATE(idx_tol,Hessian_row,moment,point)
do Hessian_col=1, Model_Num
do Hessian_row=1, Model_Num
DO moment=1,Timegate_Num
DO point=1,Point_Num
idx_tol=Point_Num*(moment-1)+point
d_dense( (Hessian_row-1)*Model_Num + Hessian_col) = d_dense( (Hessian_row-1)*Model_Num + Hessian_col) + Jacobian_Tran(Hessian_row,idx_tol) /Dataerr_Variance(idx_tol)**2 * Jacobian_Tran(Hessian_col,idx_tol)
ENDDO
enddo
ENDDO
enddo
!$OMP END DO
!$OMP END PARALLEL
!$acc data copyin (d_dense,Gm)
!$acc kernels
!$acc loop
do i=1, Model_Num
dX(i) = Gm(i)
enddo
!$acc end kernels
!============= Calculate the sizes needed for pre-allocated buffer =============
istat = cusolverDnXgetrf_bufferSize( handle_LSE, params_LSE, rows, cols, cudaDataType(CUDA_R_64F), d_dense, ld,cudaDataType(CUDA_R_64F), workspaceInBytesOnDevice, workspaceInBytesOnHost )
write(*,*)istat
allocate(bufferOnDevice(workspaceInBytesOnDevice) , bufferOnHost(workspaceInBytesOnHost) )
!============= Slove the linear system of equations : AX=B =============
istat = cusolverDnXgetrf( handle_LSE, params_LSE, rows, cols, cudaDataType(CUDA_R_64F), d_dense, ld, ipiv, cudaDataType(CUDA_R_64F), bufferOnDevice,workspaceInBytesOnDevice,bufferOnHost, workspaceInBytesOnHost,devinfo )
write(*,*)istat
istat = cusolverDnXgetrs( handle_LSE, params_LSE,CUBLAS_OP_T, rows, 1, cudaDataType(CUDA_R_64F), d_dense, ld, ipiv, cudaDataType(CUDA_R_64F), dX, ldb, devinfo )
write(*,*)istat
!============= Get the Results =============
call system_clock (t2)
t = REAL(t2 - t1) / REAL(count_rate)
print*,'The computing time is:', t
! write(*,*)
! write(*,*)"========Assign the result to the host==========="
! write(*,*)
!$acc kernels
!$acc loop
do i=1, Model_Num
Gm(i) = dX(i)
enddo
!$acc end kernels
!$acc update host(Gm)
!$acc end data
!============= Calculate the sizes needed for pre-allocated buffer =============
deallocate( bufferOnDevice,bufferOnHost )
istat = cusolverDnDestroyParams(params_LSE)
istat = cusolverDnDestroy(handle_LSE)
!============= Update the Conductivity =============
conductivity=conductivity-Gm
RETURN
ENDSUBROUTINE Calculate_Updating_model_Direct_GPU
Regards,
Amore.