I notice the function “cusolverDnSetStream”. Can I use the multiple cuda streams to deal with the linear systems in parallel? Here is my codes:
do n=1, (i-1)*(NYB-2)+j-1
istat = cusolverDnCreate(handle(n))
istat = cusolverDnSetStream(handle(n),acc_get_cuda_stream(n))
istat = cusolverDnCreateParams(param(n))
istat = cusolverDnSetAdvOptions(param(n), CUSOLVERDN_GETRF, CUSOLVER_ALG_0)
end do
istat = cusolverDnXgetrf_bufferSize( handle( 1 ), param( 1 ), NZ-1, NZ-1, cudaDataType(CUDA_R_64F), MatVal_EX_1,NZ-1,cudaDataType(CUDA_R_64F), workspaceInBytesOnDevice_EX_1, workspaceInBytesOnHost_EX_1 )
ALLOCATE(bufferOnDevice_EX_1(workspaceInBytesOnDevice_EX_1) , bufferOnHost_EX_1(workspaceInBytesOnHost_EX_1) )
do I=1,NX
do J=2,NYB-1
!$acc kernels async( (i-1)*(NYB-2)+j-1 )
AZ_1=0
!$acc loop collapse(1) independent private(Collabel)
do k=2,NZB-1
AZ_1(k-1, (k-1) ) = parameter1
!$acc loop seq
do Collabel=1, NZB-2
MatVal_EX_1( (NZ-1)*(k-2) + Collabel ) = AZ_1( k-1,Collabel )
enddo
RHSVal_EX_1(k-1)=paremeter2
enddo
!$acc end kernels
istat = cusolverDnXgetrf( handle((i-1)*(NYB-2)+j-1), param((i-1)*(NYB-2)+j-1), ROW_EX_1, COL_EX_1, cudaDataType(CUDA_R_64F), MatVal_EX_1, LDA_EX_1, ipiv_EX_1, cudaDataType(CUDA_R_64F), bufferOnDevice_EX_1,workspaceInBytesOnDevice_EX_1,bufferOnHost_EX_1, workspaceInBytesOnHost_EX_1,devinfoX )
istat = cusolverDnXgetrs( handle((i-1)*(NYB-2)+j-1), param((i-1)*(NYB-2)+j-1), CUBLAS_OP_T, ROW_EX_1, 1, cudaDataType(CUDA_R_64F), MatVal_EX_1, LDA_EX_1, ipiv_EX_1, cudaDataType(CUDA_R_64F), RHSVal_EX_1, LDB_EX_1, devinfoX )
!$acc kernels async( (i-1)*(NYB-2)+j-1 )
!$acc loop collapse(1)
do k=2,NZB-1
EX_1(I,J,k) = RHSVal_EX_1(k-1)
enddo
!$acc end kernels
enddo
enddo
Unfortunately, these calculations produce incorrect results and require a large amount of memory.