Simulation using Multi GPU with OpenMP+OpenACC

Hi Mat

I am running a test with OpenMP and OpenACC on multiple GPUs. Here is the code:

SUBROUTINE Calculate_Jacobian_Transpose
	USE CUSPARSE
	USE openacc
	use omp_lib
	use cudafor
	IMPLICIT NONE	

CHARACTER*4::str_1,str_2,str_3,str_4,str_5
CHARACTER*10::str_6
INTEGER(KIND=8)::t1,t2,count_rate           !t1 denotes the start CPU time, t2 the end CPU time, and t=t2-t1
REAL(KIND=4)::t

REAL(KIND=8):: CA0,CA1,DELX0,DELX1,DELX2,DELY0,DELY1,DELY2,DELZ0,DELZ1,DELZ2  !ca, cb, delx1, dely1, delz1 are all intermediate variables used in the computation of the EM field
REAL(KIND=8):: TEMP_SIG,residual_error,val_temp,hz_observer(8) !Temp_sig and temp_cacb are intermediate variables used in the computation of the EM field
REAL(KIND=8):: time_down,value_down,time_up,value_up,time_intelval
INTEGER(KIND=4):: i,j,k,ii,jj,kk,loop_false_rec,x_pos,y_pos,z_pos,x_pos_observer(8),y_pos_observer(8),z_pos_observer(8),idx_tol,idx_up,idx_down,idx_num,idx1_num
type(cusparseHandle) :: handle1,handle2,handle3

INTEGER(KIND=4) :: point,monment,branch
REAL(KIND=8) :: temp(Model_Num)

INTEGER(KIND=4) :: Max_index_of_loop
INTEGER(KIND=4) :: tid,num_nvidia,index_nvidia

Jacobian_Tran = 0.0

write(str_1,"(I4)")Num_pseudo_forward+1
write(*,*)"========"//TRIM(str_1)//"th Num_pseudo_forward calculation==========="

istat = cusparseCreate(handle1)
istat = cusparseCreate(handle2)
istat = cusparseCreate(handle3)
istat = cusparseSetStream(handle1,acc_get_cuda_stream(11)) 
istat = cusparseSetStream(handle2,acc_get_cuda_stream(12)) 
istat = cusparseSetStream(handle3,acc_get_cuda_stream(13)) 

call system_clock(count_rate=count_rate)
num_nvidia = acc_get_num_devices(acc_device_nvidia)

DO point=1,Point_Num
	
	!$omp parallel do num_threads(2) private(tid,monment,t1,t2,t,loop_false_rec,Is_EX_In_Source_False,Is_EY_In_Source_False,Max_index_of_loop,loop,str_3,str_4,str_5,str_6)
	DO monment=1,Timegate_Num  

		tid = omp_get_thread_num()
		istat = cudaSetDevice( tid )

		!$acc data copyin(loop_false_rec,Is_EX_In_Source_False,Is_EY_In_Source_False,EX_V,EY_V,EZ_V,HX_V,HY_V,HZ_V)&	
		!$acc      present(Jacobian_Tran,Forward_fields)&
		!$acc	   copyin(val_temp)&
		!$acc      present(CDELX,CDELY,CDELZ,Data_Calcuated,Data_Measured,Points_Observer,Timegate_Para,Dataerr_Variance)&
		!$acc      present(Comput_False_Logic,Delt,CA2,CB,CCSIG,Ratio_Rec,Field_Rec,Field_False_Rec,Field_Idx_Rec)&
		!$acc      present(Field_Idx1_rec,Sigma_Jac,field_orig_pos,field_coord,timegate_rec)&
		!$acc      present(Ctime_Cutoff)&
		!$acc      present(EX_V_1,EY_V_1,EZ_V_1)&
		!$acc      present(A_EX,B_EX,C_EX,D_EX,A_EY,B_EY,C_EY,D_EY,A_EZ,B_EZ,C_EZ,D_EZ)&
		!$acc      present(A_EX_1,B_EX_1,C_EX_1,D_EX_1,A_EY_1,B_EY_1,C_EY_1,D_EY_1,A_EZ_1,B_EZ_1,C_EZ_1,D_EZ_1)&	
		!$acc      present(C_EX_1C,C_EY_1C,C_EZ_1C,C_EXC,C_EYC,C_EZC)
	   
		call system_clock (t1)      							

		loop_false_rec=2*monment !The first index depends on the loop index of the observation moment (monment)
		Is_EX_In_Source_False=0.D0; Is_EY_In_Source_False=0.D0
		
		!$acc update device(Is_EX_In_Source_False,Is_EY_In_Source_False,loop_false_rec) async(1)
		!$acc wait

		Max_index_of_loop = Timegate_Rec(loop_false_rec)

		DO loop=Max_index_of_loop,Field_Rec_Idxstart+1,-1					
			!Solve EX_1
			!$acc kernels async(11)
			!$acc loop collapse(3)
			DO I=1,NX
				DO J=2,NYB-1
					DO k=2,NZB-1

						!Generate the coefficient matrix A and RHS B
							
					ENDDO
				ENDDO
			ENDDO	
			!$acc end kernels
			!==================================Start solving the equations==================================!
			istat = cusparseDgtsvInterleavedBatch(handle1,algo,m_EX_1,A_EX_1,B_EX_1,C_EX_1C,D_EX_1,batchcount_EX_1,buffer_EX_1)
			!=================================End Solving the equations=====================================!		
			!************************************************************************************************
			!==========================================Update EX_1===========================================!
			!$acc kernels async(11)
			!$acc loop collapse(3) independent private(Coeff_EX_1_Num)
			do I=1,NX
				do J=2,NYB-1
					do k=2,NZB-1
						Coeff_EX_1_Num=NX *(NYB-2)*(K-2)+(NYB-2)*(I-1)+J-1
						EX_V_1(I,J,K) = D_EX_1(Coeff_EX_1_Num)
					enddo
				enddo
			enddo
			!$acc end kernels
			!*************************************************************************************************
			! ======================================end of updating EX_1======================================!
			!*************************************************************************************************
			
			!************************************************************************************************* 
			! -------------------------------------update the value of EY_1-----------------------------------!
			!*************************************************************************************************		 
			!Solve EY_1
			!$acc kernels async(12)
			!$acc loop collapse(3) 
			do J=1,NY
				do K=2,NZB-1
					do I=2,NXB-1
			
						!Generate the coefficient matrix A and RHS B

					ENDDO
				ENDDO
			ENDDO
			!$acc end kernels
			!==================================Start solving the equations==================================!		
			istat = cusparseDgtsvInterleavedBatch(handle2,algo,m_EY_1,A_EY_1,B_EY_1,C_EY_1C,D_EY_1,batchcount_EY_1,buffer_EY_1)
			!=================================End Solving the equations=====================================!
			!************************************************************************************************* 
			!=======================================Update EY_1=============================================!
			!$acc kernels async(12)
			!$acc loop collapse(3) independent private(Coeff_EY_1_Num)		
			do J=1,NY
				do K=2,NZB-1
					do I=2,NXB-1
						Coeff_EY_1_Num = (NZB-2)*NY*(I-2)+(NZB-2)*(J-1)+K-1
						EY_V_1(I,J,K) = D_EY_1(Coeff_EY_1_Num)
					enddo
				enddo
			enddo
			!$acc end kernels
			!*************************************************************************************************
			! ======================================end of updating EY_1=======================================!
			!*************************************************************************************************
			
			!************************************************************************************************* 
			! -------------------------------------update the value of EZ_1-----------------------------------!
			!*************************************************************************************************
			!Solve EZ_1
			!$acc kernels async(13)
			!$acc loop collapse(3) 
			DO K=1,NZ
				DO I=2,NXB-1
					DO J=2,NYB-1		
						
						!Generate the coefficient matrix A and RHS B

					ENDDO
				ENDDO
			ENDDO
			!$acc end kernels
			!==================================Start solving the equations==================================!
			istat = cusparseDgtsvInterleavedBatch(handle3,algo,m_EZ_1,A_EZ_1,B_EZ_1,C_EZ_1C,D_EZ_1,batchcount_EZ_1,buffer_EZ_1)
			!=================================End Solving the equations=====================================!				
			!************************************************************************************************* 
			!=======================================Update EZ_1==============================================!		
			!$acc kernels async(13)
			!$acc loop collapse(3) independent private(Coeff_EZ_1_Num)
			do K=1,NZ
				do I=2,NXB-1
					do J=2,NYB-1
						Coeff_EZ_1_Num = (NXB-2)*NZ*(J-2)+(NXB-2)*(K-1)+I-1
						EZ_V_1(I,J,K) = D_EZ_1(Coeff_EZ_1_Num)
					enddo
				enddo
			enddo
			!$acc end kernels
			!$acc wait 		


			!Write field value
			!$acc kernels async(11)
			!$acc loop private(x_pos,y_pos,z_pos)
			DO i=1,field_num
			
				x_pos=Field_Coord(i)%Coordmesh_X
				y_pos=Field_Coord(i)%Coordmesh_Y
				z_pos=Field_Coord(i)%Coordmesh_Z
				IF(Field_Orig_Pos(i)<=num_1)THEN	
					Field_False_Rec(i)=EX_V_1(x_pos,y_pos,z_pos)                   		!Since time runs backwards, the field values are stored in the same order as delt
				ELSEIF((Field_Orig_Pos(i)>num_1).and.(Field_Orig_Pos(i)<=num_2))THEN
					Field_False_Rec(i)=EY_V_1(x_pos,y_pos,z_pos)
				ELSEIF((Field_Orig_Pos(i)>num_2).and.(Field_Orig_Pos(i)<=num_3))THEN
					Field_False_Rec(i)=EZ_V_1(x_pos,y_pos,z_pos)
				ELSEIF((Field_Orig_Pos(i)>num_3).and.(Field_Orig_Pos(i)<=num_4))THEN
					Field_False_Rec(i)=EX_V(x_pos,y_pos,z_pos)
				ELSEIF((Field_Orig_Pos(i)>num_4).and.(Field_Orig_Pos(i)<=num_5))THEN
					Field_False_Rec(i)=EY_V(x_pos,y_pos,z_pos)
				ELSEIF(Field_Orig_Pos(i)>num_5)THEN
					Field_False_Rec(i)=EZ_V(x_pos,y_pos,z_pos)
				ENDIF
			ENDDO
			!$acc end kernels
			!$acc wait
			!Recover the forward fields.

			!$acc kernels async(11)
			!$acc loop
			DO I=1,Model_Num	
			
				val_temp=0.0D0
				!$acc loop seq
				DO jj=1,12
					!! Referencing the first equation of the governing equations: the coefficient of the auxiliary electric field is +1
					idx_num=Field_Idx_Rec(I,jj)
					val_temp=val_temp+Ratio_Rec(I,jj)*Forward_fields(idx_num,Loop-1-Field_Rec_Idxstart+1)*Field_False_Rec(idx_num)
				ENDDO
				!$acc loop seq
				DO jj=13,24
					!! Referencing the first equation of the governing equations: the coefficient of the auxiliary electric field is -1 and that of the electric field is +1
					idx_num=Field_Idx_Rec(I,jj)
					idx1_num=Field_Idx1_Rec(I,jj)
					val_temp=val_temp+Ratio_Rec(I,jj)*(Forward_fields(idx_num,Loop-1-Field_Rec_Idxstart+1)-Forward_fields(idx1_num,Loop-1-Field_Rec_Idxstart+1))*Field_False_Rec(idx_num)
				ENDDO

				Jacobian_Tran( i, Point_Num*(monment-1)+point ) = Jacobian_Tran( i, Point_Num*(monment-1)+point ) - val_temp*(Up_Sigma-Low_SigmA)*DEXP(SIGMA_Jac(I))/(1+DEXP(SIGMA_Jac(I)))**2
			enddo
			!$acc end kernels

			IF(Loop==Timegate_Rec(Loop_False_Rec)) then  								!It's the same as in the iteration
				IF(Loop_False_Rec>1) THEN
					Loop_False_Rec=Loop_False_Rec-1
				ELSE
					Loop_False_Rec=1
				ENDIF
			ENDIF
			!$acc update device(loop_false_rec)
			!$acc wait

		ENDDO
		!$acc update host( Jacobian_Tran(:,Point_Num*(monment-1)+point ) )

		!$acc end data

		call system_clock(t2)
		t=REAL(t2 - t1) / REAL(count_rate)

		write(str_3,"(I4)")tid
		write(str_4,"(I4)")monment
		write(str_5,"(I4)")point
		write(str_6,"(f10.6)")t
		
		write(*,*) "The device"//TRIM(str_3)//"calculating"//TRIM(str_4)//"of Point"//TRIM(str_5)//"The computation time is:"//TRIM(str_6)


	ENDDO
	!$omp end parallel do
ENDDO

istat=cusparseDestroy(handle1)
istat=cusparseDestroy(handle2)
istat=cusparseDestroy(handle3)

Num_pseudo_forward = Num_pseudo_forward + 1

! write(*,*), "false is ok Jac:", Num_pseudo_forward
RETURN	

ENDSUBROUTINE Calculate_Jacobian_Transpose

There are two Tesla V100s in the workstation. If the number of threads is set to 1, i.e., “!$omp parallel do num_threads(1) private()”, correct results are obtained. However, if it is set to 2, the results are wrong.
I am not clear on how the program works across multiple GPUs.

This forum is for DeepStream SDK support; this topic does not seem relevant to DeepStream, so you can move it to the proper forum. Thank you.

Hi SkyCool,

Since you don’t provide a reproducing example, it’s difficult for me to tell why you’re getting wrong answers. Some guesses would be that you’re not copying the data to both devices, you’re not merging the multiple device copies of the data back into the single host array, or there’s a race condition.

Another possible issue I see is that you’re using “cudaSetDevice” instead of “acc_set_device_num”. “cudaSetDevice” has no notion of OpenACC, so it doesn’t set the device for the OpenACC runtime, while “acc_set_device_num” sets the device for both OpenACC and CUDA. In other words, the OpenACC regions and the cuSPARSE calls may be using different devices.
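As a rough sketch (not your full code, and assuming “use openacc” and “use omp_lib” are in scope, with devices numbered 0..N-1 as in CUDA), the per-thread device selection would look something like:

	! Hypothetical sketch: select the device per OpenMP thread with the
	! OpenACC runtime API so the OpenACC regions and the cuSPARSE calls
	! made on that thread all target the same GPU.
	!$omp parallel do num_threads(2) private(tid,monment)
	DO monment=1,Timegate_Num
		tid = omp_get_thread_num()
		! Sets the current device for OpenACC and for the underlying CUDA
		! context used by the cuSPARSE calls on this thread.
		call acc_set_device_num(tid, acc_device_nvidia)
		! ... per-thread handles, data regions, and compute go here ...
	ENDDO
	!$omp end parallel do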

If you can provide a full reproducing example, I’ll try to take a look, but again, my recommendation is to use MPI+OpenACC. It’s far simpler and less error prone.
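For reference, the usual MPI+OpenACC pattern (a minimal sketch with hypothetical program and variable names) binds one rank per GPU and lets each rank own its data:

	! Hypothetical sketch: one MPI rank per GPU; each rank holds its own
	! copy of the fields and its own cuSPARSE handles, and partial results
	! are gathered with MPI at the end.
	program mpi_acc_bind
		use mpi
		use openacc
		implicit none
		integer :: ierr, rank, ndev
		call MPI_Init(ierr)
		call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
		ndev = acc_get_num_devices(acc_device_nvidia)
		! Round-robin the ranks over the available devices
		call acc_set_device_num(mod(rank, ndev), acc_device_nvidia)
		! ... each rank computes its share of the (point, monment) work ...
		call MPI_Finalize(ierr)
	end program mpi_acc_bind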

-Mat

Thanks for your responses. I have addressed that problem.
Now I have a new question. In my code, cusparseDgtsvInterleavedBatch from the cuSPARSE library is used to solve linear systems AX=B. May I call this function on different CUDA streams, as in the following code:

		do monment=1,Timegate_Num 
			istat = cusparseCreate(handle1(monment))
			istat = cusparseCreate(handle2(monment))
			istat = cusparseCreate(handle3(monment))

			istat = cusparseSetStream(handle1(monment),acc_get_cuda_stream(monment))
			istat = cusparseSetStream(handle2(monment),acc_get_cuda_stream(Timegate_Num + monment))
			istat = cusparseSetStream(handle3(monment),acc_get_cuda_stream(Timegate_Num*2 +  monment)) 
		end do
		do monment=1,Timegate_Num
				!$acc kernels async(monment) wait(0)
				!$acc loop
				do ii=1, (NZB-2)*(NYB-2)*NX
					A_EX_1_AUX(ii) = A_EX_1(ii)
					B_EX_1_AUX(ii) = B_EX_1(ii)
					C_EX_1_AUX(ii) = C_EX_1C(ii)
					D_EX_1_AUX(ii) = D_EX_1_G(ii,monment)
				enddo
				!$acc end kernels
				istat = cusparseDgtsvInterleavedBatch(handle1(monment),algo,m_EX_1,A_EX_1_AUX,B_EX_1_AUX,C_EX_1_AUX,D_EX_1_AUX,batchcount_EX_1,buffer_EX_1)
				!$acc kernels async(monment) 
				!$acc loop collapse(3) independent private(Coeff_EX_1_Num,D_EX_1_AUX)	
				do I=1,NX
					do J=2,NYB-1
						do k=2,NZB-1
							Coeff_EX_1_Num=NX *(NYB-2)*(K-2)+(NYB-2)*(I-1)+J-1
							EX_G_1(I,J,K,monment) = D_EX_1_AUX(Coeff_EX_1_Num)
						enddo
					enddo
				enddo
				!$acc end kernels
				
		enddo

I haven’t tried this myself, so I’m not 100% sure, but it looks correct.

I’m sorry to tell you that, in my experiments, the multiple CUDA streams did not run in parallel. Now I need to solve a dense linear system. I notice that the function cusolverDnXgetrs from the cuSOLVER library is a nice tool for this. However, when I try to solve a system with 4000 unknowns using this function, I still cannot obtain results after the code has run for 2 hours. Perhaps a direct solver is impractical for such a large linear system, so I want to find an iterative solver. Could you provide some help with an iterative solver from the HPC SDK, please?

I’m not sure what you mean. Multiple streams definitely can run concurrently provided there are enough device resources. If one kernel is fully utilizing the device, then the second does need to wait until resources become available. Perhaps that’s what you’re seeing?
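As a trivial illustration (hypothetical arrays, not your solver), two kernels queued on different async queues can overlap only while the first one leaves resources free:

	! Minimal sketch: independent work on async queues 1 and 2 may overlap
	! on the device if resources allow; the final wait synchronizes both queues.
	subroutine overlap_demo(a, b, n)
		implicit none
		integer, intent(in) :: n
		real(kind=8), intent(inout) :: a(n), b(n)
		integer :: i
		!$acc data copy(a, b)
		!$acc kernels async(1)
		do i = 1, n
			a(i) = a(i) + 1.0d0
		enddo
		!$acc end kernels
		!$acc kernels async(2)
		do i = 1, n
			b(i) = b(i)*2.0d0
		enddo
		!$acc end kernels
		!$acc wait
		!$acc end data
	end subroutine overlap_demo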

Could you provide some help with the iterative solver from the HPC SDK, please?

That’s out of my area of expertise. Let me see if @mfatica can help.

You are right that multiple streams can run asynchronously on the device when there are enough resources. What I mean is that the following code gives wrong results:

                do monment=1,Timegate_Num
                    istat = cusparseDgtsvInterleavedBatch(handle1(monment),algo,m_EX_1,A_EX_1,B_EX_1,C_EX_1_G(:,monment),D_EX_1_G(:,monment),batchcount_EX_1,buffer_EX_1)
                enddo
                !$acc kernels async(1) 
                !$acc loop collapse(4) independent private(Coeff_EX_1_Num)	
                do monment=1,Timegate_Num
                    do I=1,NX
                        do J=2,NYB-1
                            do k=2,NZB-1
                                Coeff_EX_1_Num=NX *(NYB-2)*(K-2)+(NYB-2)*(I-1)+J-1
                                EX_G_1(I,J,K,monment) = D_EX_1_G(Coeff_EX_1_Num,monment)
                            enddo
                        enddo
                    enddo
                enddo
                !$acc end kernels

where handle1() is an array of Timegate_Num handles, each attached to a stream with a different label. Perhaps some variables are contended for on the GPU when multiple handles run concurrently.
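For example, if the contended variables are the shared buffer_EX_1 workspace and the shared A_EX_1/B_EX_1 arrays, a hypothetical, untested variant with one copy per stream would look like:

	! Hypothetical sketch (untested): A_EX_1_G, B_EX_1_G and buffer_EX_1_G are
	! assumed per-stream copies, so nothing that the concurrent
	! cusparseDgtsvInterleavedBatch calls may modify is shared between streams.
	do monment=1,Timegate_Num
		istat = cusparseDgtsvInterleavedBatch(handle1(monment),algo,m_EX_1, &
				A_EX_1_G(:,monment),B_EX_1_G(:,monment),C_EX_1_G(:,monment), &
				D_EX_1_G(:,monment),batchcount_EX_1,buffer_EX_1_G(:,monment))
	enddo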

I then use the same handle for every call and get the correct results, as follows:

                do monment=1,Timegate_Num
                    istat = cusparseDgtsvInterleavedBatch(handle1(1),algo,m_EX_1,A_EX_1,B_EX_1,C_EX_1_G(:,monment),D_EX_1_G(:,monment),batchcount_EX_1,buffer_EX_1)
                enddo

However, this serialized calculation is not what I was hoping for.