Hi Mat
I am running a test with OpenMP and OpenACC on multi GPU. Here are the codes:
SUBROUTINE Calculate_Jacobian_Transpose
USE CUSPARSE
USE openacc
use omp_lib
use cudafor
IMPLICIT NONE
CHARACTER*4::str_1,str_2,str_3,str_4,str_5
CHARACTER*10::str_6
INTEGER(KIND=8)::t1,t2,count_rate !t1 denotes original cpu time , t2 denotes the end cpu time and t=t2-t1
REAL(KIND=4)::t
REAL(KIND=8):: CA0,CA1,DELX0,DELX1,DELX2,DELY0,DELY1,DELY2,DELZ0,DELZ1,DELZ2 !ca, cb, delx1, dely1, delz1 are all middle variables used in the computation of EM field
REAL(KIND=8):: TEMP_SIG,residual_error,val_temp,hz_observer(8) !Temp_sig and temp_cacb are middle variables used in the computation of EM field
REAL(KIND=8):: time_down,value_down,time_up,value_up,time_intelval
INTEGER(KIND=4):: i,j,k,ii,jj,kk,loop_false_rec,x_pos,y_pos,z_pos,x_pos_observer(8),y_pos_observer(8),z_pos_observer(8),idx_tol,idx_up,idx_down,idx_num,idx1_num
type(cusparseHandle) :: handle1,handle2,handle3
INTEGER(KIND=4) :: point,monment,branch
REAL(KIND=8) :: temp(Model_Num)
INTEGER(KIND=4) :: Max_index_of_loop
INTEGER(KIND=4) :: tid,num_nvidia,index_nvidia
Jacobian_Tran = 0.0
write(str_1,"(I4)")Num_pseudo_forward+1
write(*,*)"========"//TRIM(str_1)//"th Num_pseudo_forward calculation==========="
istat = cusparseCreate(handle1)
istat = cusparseCreate(handle2)
istat = cusparseCreate(handle3)
istat = cusparseSetStream(handle1,acc_get_cuda_stream(11))
istat = cusparseSetStream(handle2,acc_get_cuda_stream(12))
istat = cusparseSetStream(handle3,acc_get_cuda_stream(13))
call system_clock(count_rate=count_rate)
num_nvidia = acc_get_num_devices(acc_device_nvidia)
DO point=1,Point_Num
!$omp parallel do num_threads(2) private(tid,monment,t1,t2,t,loop_false_rec,Is_EX_In_Source_False,Is_EY_In_Source_False,Max_index_of_loop,loop,str_3,str_4,str_5,str_6)
DO monment=1,Timegate_Num
tid = omp_get_thread_num()
istat = cudaSetDevice( tid )
!$acc data copyin(loop_false_rec,Is_EX_In_Source_False,Is_EY_In_Source_False,EX_V,EY_V,EZ_V,HX_V,HY_V,HZ_V)&
!$acc present(Jacobian_Tran,Forward_fields)&
!$acc copyin(val_temp)&
!$acc present(CDELX,CDELY,CDELZ,Data_Calcuated,Data_Measured,Points_Observer,Timegate_Para,Dataerr_Variance)&
!$acc present(Comput_False_Logic,Delt,CA2,CB,CCSIG,Ratio_Rec,Field_Rec,Field_False_Rec,Field_Idx_Rec)&
!$acc present(Field_Idx1_rec,Sigma_Jac,field_orig_pos,field_coord,timegate_rec)&
!$acc present(Ctime_Cutoff)&
!$acc present(EX_V_1,EY_V_1,EZ_V_1)&
!$acc present(A_EX,B_EX,C_EX,D_EX,A_EY,B_EY,C_EY,D_EY,A_EZ,B_EZ,C_EZ,D_EZ)&
!$acc present(A_EX_1,B_EX_1,C_EX_1,D_EX_1,A_EY_1,B_EY_1,C_EY_1,D_EY_1,A_EZ_1,B_EZ_1,C_EZ_1,D_EZ_1)&
!$acc present(C_EX_1C,C_EY_1C,C_EZ_1C,C_EXC,C_EYC,C_EZC)
call system_clock (t1)
loop_false_rec=2*monment !The first index is dependent on the the loop index of the observation monment
Is_EX_In_Source_False=0.D0; Is_EY_In_Source_False=0.D0
!$acc update device(Is_EX_In_Source_False,Is_EY_In_Source_False,loop_false_rec) async(1)
!$acc wait
Max_index_of_loop = Timegate_Rec(loop_false_rec)
DO loop=Max_index_of_loop,Field_Rec_Idxstart+1,-1
!Solve EX_1
!$acc kernels async(11)
!$acc loop collapse(3)
DO I=1,NX
DO J=2,NYB-1
DO k=2,NZB-1
!Genernate the coefficient matrix A and RHS B
ENDDO
ENDDO
ENDDO
!$acc end kernels
!==================================Start solving the equations==================================!
istat = cusparseDgtsvInterleavedBatch(handle1,algo,m_EX_1,A_EX_1,B_EX_1,C_EX_1C,D_EX_1,batchcount_EX_1,buffer_EX_1)
!=================================End Solving the equations=====================================!
!************************************************************************************************
!==========================================Update EX_1===========================================!
!$acc kernels async(11)
!$acc loop collapse(3) independent private(Coeff_EX_1_Num)
do I=1,NX
do J=2,NYB-1
do k=2,NZB-1
Coeff_EX_1_Num=NX *(NYB-2)*(K-2)+(NYB-2)*(I-1)+J-1
EX_V_1(I,J,K) = D_EX_1(Coeff_EX_1_Num)
enddo
enddo
enddo
!$acc end kernels
!*************************************************************************************************
! ======================================end of updating EX_1======================================!
!*************************************************************************************************
!*************************************************************************************************
! -------------------------------------update the value of EY_1-----------------------------------!
!*************************************************************************************************
!Solve EY_1
!$acc kernels async(12)
!$acc loop collapse(3)
do J=1,NY
do K=2,NZB-1
do I=2,NXB-1
!Genernate the coefficient matrix A and RHS B
ENDDO
ENDDO
ENDDO
!$acc end kernels
!==================================Start solving the equations==================================!
istat = cusparseDgtsvInterleavedBatch(handle2,algo,m_EY_1,A_EY_1,B_EY_1,C_EY_1C,D_EY_1,batchcount_EY_1,buffer_EY_1)
!=================================End Solving the equations=====================================!
!*************************************************************************************************
!=======================================Update EY_1=============================================!
!$acc kernels async(12)
!$acc loop collapse(3) independent private(Coeff_EY_1_Num)
do J=1,NY
do K=2,NZB-1
do I=2,NXB-1
Coeff_EY_1_Num = (NZB-2)*NY*(I-2)+(NZB-2)*(J-1)+K-1
EY_V_1(I,J,K) = D_EY_1(Coeff_EY_1_Num)
enddo
enddo
enddo
!$acc end kernels
!*************************************************************************************************
! ======================================end of updating EY_1=======================================!
!*************************************************************************************************
!*************************************************************************************************
! -------------------------------------update the value of EZ_1-----------------------------------!
!*************************************************************************************************
!Solve EZ_1
!$acc kernels async(13)
!$acc loop collapse(3)
DO K=1,NZ
DO I=2,NXB-1
DO J=2,NYB-1
!Genernate the coefficient matrix A and RHS B
ENDDO
ENDDO
ENDDO
!$acc end kernels
!==================================Start solving the equations==================================!
istat = cusparseDgtsvInterleavedBatch(handle3,algo,m_EZ_1,A_EZ_1,B_EZ_1,C_EZ_1C,D_EZ_1,batchcount_EZ_1,buffer_EZ_1)
!=================================End Solving the equations=====================================!
!*************************************************************************************************
!=======================================Update EZ_1==============================================!
!$acc kernels async(13)
!$acc loop collapse(3) independent private(Coeff_EZ_1_Num)
do K=1,NZ
do I=2,NXB-1
do J=2,NYB-1
Coeff_EZ_1_Num = (NXB-2)*NZ*(J-2)+(NXB-2)*(K-1)+I-1
EZ_V_1(I,J,K) = D_EZ_1(Coeff_EZ_1_Num)
enddo
enddo
enddo
!$acc end kernels
!$acc wait
!Write field value
!$acc kernels async(11)
!$acc loop private(x_pos,y_pos,z_pos)
DO i=1,field_num
x_pos=Field_Coord(i)%Coordmesh_X
y_pos=Field_Coord(i)%Coordmesh_Y
z_pos=Field_Coord(i)%Coordmesh_Z
IF(Field_Orig_Pos(i)<=num_1)THEN
Field_False_Rec(i)=EX_V_1(x_pos,y_pos,z_pos) !Since time is backwards, the time of the field values is in the same order as the delt
ELSEIF((Field_Orig_Pos(i)>num_1).and.(Field_Orig_Pos(i)<=num_2))THEN
Field_False_Rec(i)=EY_V_1(x_pos,y_pos,z_pos)
ELSEIF((Field_Orig_Pos(i)>num_2).and.(Field_Orig_Pos(i)<=num_3))THEN
Field_False_Rec(i)=EZ_V_1(x_pos,y_pos,z_pos)
ELSEIF((Field_Orig_Pos(i)>num_3).and.(Field_Orig_Pos(i)<=num_4))THEN
Field_False_Rec(i)=EX_V(x_pos,y_pos,z_pos)
ELSEIF((Field_Orig_Pos(i)>num_4).and.(Field_Orig_Pos(i)<=num_5))THEN
Field_False_Rec(i)=EY_V(x_pos,y_pos,z_pos)
ELSEIF(Field_Orig_Pos(i)>num_5)THEN
Field_False_Rec(i)=EZ_V(x_pos,y_pos,z_pos)
ENDIF
ENDDO
!$acc end kernels
!$acc wait
!Recover the forward fields.
!$acc kernels async(11)
!$acc loop
DO I=1,Model_Num
val_temp=0.0D0
!$acc loop seq
DO jj=1,12
!! Referencing the first equaiton of governing equation---the coefficient of auxiliary electric field is +1
idx_num=Field_Idx_Rec(I,jj)
val_temp=val_temp+Ratio_Rec(I,jj)*Forward_fields(idx_num,Loop-1-Field_Rec_Idxstart+1)*Field_False_Rec(idx_num)
ENDDO
!$acc loop seq
DO jj=13,24
!! Referencing the first equaiton of governing equation---the coefficient of auxiliary electric field is -1 and the one of electric field is +1
idx_num=Field_Idx_Rec(I,jj)
idx1_num=Field_Idx1_Rec(I,jj)
val_temp=val_temp+Ratio_Rec(I,jj)*(Forward_fields(idx_num,Loop-1-Field_Rec_Idxstart+1)-Forward_fields(idx1_num,Loop-1-Field_Rec_Idxstart+1))*Field_False_Rec(idx_num)
ENDDO
Jacobian_Tran( i, Point_Num*(monment-1)+point ) = Jacobian_Tran( i, Point_Num*(monment-1)+point ) - val_temp*(Up_Sigma-Low_SigmA)*DEXP(SIGMA_Jac(I))/(1+DEXP(SIGMA_Jac(I)))**2
enddo
!$acc end kernels
IF(Loop==Timegate_Rec(Loop_False_Rec)) then !It's the same as in interation
IF(Loop_False_Rec>1) THEN
Loop_False_Rec=Loop_False_Rec-1
ELSE
Loop_False_Rec=1
ENDIF
ENDIF
!$acc update device(loop_false_rec)
!$acc wait
ENDDO
!$acc update host( Jacobian_Tran(:,Point_Num*(monment-1)+point ) )
!$acc end data
call system_clock(t2)
t=REAL(t2 - t1) / REAL(count_rate)
write(str_3,"(I4)")tid
write(str_4,"(I4)")monment
write(str_5,"(I4)")point
write(str_6,"(f10.6)")t
write(*,*) "The device"//TRIM(str_3)//"calculating"//TRIM(str_4)//"of Point"//TRIM(str_5)//"The computation time is:"//TRIM(str_6)
ENDDO
!$omp end parallel do
ENDDO
istat=cusparseDestroy(handle1)
istat=cusparseDestroy(handle2)
istat=cusparseDestroy(handle3)
Num_pseudo_forward = Num_pseudo_forward + 1
! write(*,*), "false is ok Jac:", Num_pseudo_forward
RETURN
ENDSUBROUTINE Calculate_Jacobian_Transpose
There are two Tesla V100s on the station. If the thread is set to 1, i.e, “!$omp parallel do num_threads(1) private()”, correct results can be obtained. However, If the thread is set to 2, the results are wrong.
I am not clear how the program works on the multi GPU.