Error while calling module subroutine : call to cuStreamSynchronize returned error 700: Illegal address during kernel execution

I am trying to call a subroutine (named MP5_INTERPOLATE here) that belongs to a module from inside an OpenACC compute region. The code runs fine sequentially under “!$acc loop seq” directives, but the program returns the following error at runtime when parallel directives are used. I have privatized the variables properly and also added the “!$acc routine seq” directive to the module subroutine. What could be the reason for this error?

Failing in Thread:1
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution

This is the subroutine from which I am calling the module subroutine named “MP5_INTERPOLATE”

SUBROUTINE WENO5_FACE_INTERPOLATION_WITH_PRIMS_CHARS()

    ! Reconstructs left/right primitive states on the i+1/2 and j+1/2 cell faces.
    !
    ! For every face the routine
    !   1. forms a Roe-type average of density, pressure and sound speed from the
    !      two cells adjacent to the face,
    !   2. builds the left (L_Eig) and right (R_Eig) eigenvector matrices from that
    !      average and the face metrics (Ix_c.. / Jx_c..),
    !   3. projects the six-cell stencil Qp(-2:3) onto characteristic variables Wp,
    !   4. calls MP5_INTERPOLATE to obtain the two face values W_iph / W_imh,
    !   5. projects back with R_Eig and stores Qp_iphL/Qp_iphR (or Qp_jphL/Qp_jphR).
    !
    ! Despite the WENO5 in the name, the interpolation actually called is MP5.
    !
    ! Everything not declared below (loop indices i, j, k, nbl, n_prim, the extents
    ! nblocks, NImax, NJmax, NKmax, NI, NJ, NK, the arrays Qp, Qp_*ph*, the metrics
    ! and gamma) is taken from the used modules.
    !
    ! The eigenvector tables are written out for exactly five variables, so this
    ! routine assumes nprims == 5.
    !
    ! NOTE(review): the stencil reads Qp(i-2:i+3) with i starting at 0 (and
    ! Qp(j-2:j+3) with j starting at 0), and the metrics are read at index 0 as
    ! well.  Confirm that all of these arrays are allocated with enough ghost
    ! layers; an out-of-range read is one of the ways to obtain the reported
    ! "illegal address" error on the device.

    use declare_variables
    use MP5_subroutines
    implicit none

    integer :: II
    double precision, dimension(nprims) :: W_iph, W_imh
    double precision, dimension(-2:3,nprims) :: Wp
    double precision, dimension(nprims,nprims) :: L_Eig, R_Eig
    double precision :: p, c, c_sq, sqrt_rho, divisor, rho


    !==========================================================================
    ! I-direction faces (i+1/2), i = 0..NImax
    !==========================================================================
    !$acc parallel loop gang collapse(3) default(present)
    DO nbl = 1,nblocks
    DO k = 1,NKmax
    DO j = 1,NJmax
    !$acc loop vector private(L_Eig,R_Eig,Wp,W_iph,W_imh,n_prim)
    DO i = 0,NImax
    ! Loops run to the maximum extents over all blocks; skip the points that
    ! lie outside the current block.
    if (k.le.NK(nbl).and.j.le.NJ(nbl).and.i.le.NI(nbl)) then

        ! Roe-averaged 'Rho' and 'c' on i+1/2 face
        sqrt_rho = sqrt(Qp(i+1,j,k,nbl,1)/Qp(i,j,k,nbl,1))
        rho      = sqrt_rho*Qp(i,j,k,nbl,1)
        divisor  = 1.d0/(sqrt_rho+1.d0)

        p    = (Qp(i,j,k,nbl,5) + (Qp(i+1,j,k,nbl,5)*sqrt_rho))*divisor
        c    = sqrt(gamma*p/rho)
        ! Square once with a multiply: a real-valued exponent (c**2.d0) may be
        ! lowered to a pow() call in device code.
        c_sq = c*c

        ! Left and right Eigen vectors on i+1/2 face
        !--------------------------------------------

        L_Eig(1,1) = 0.d0                           ;  R_Eig(1,1) = 0.5d0/c_sq
        L_Eig(2,1) = Ix_c(i,j,k,nbl)*c_sq           ;  R_Eig(2,1) = -0.5d0*Ix_c(i,j,k,nbl)/rho/c
        L_Eig(3,1) = Iz_c(i,j,k,nbl)*c_sq           ;  R_Eig(3,1) = -0.5d0*Iy_c(i,j,k,nbl)/rho/c
        L_Eig(4,1) = -Iy_c(i,j,k,nbl)*c_sq          ;  R_Eig(4,1) = -0.5d0*Iz_c(i,j,k,nbl)/rho/c
        L_Eig(5,1) = 0.d0                           ;  R_Eig(5,1) = 0.5d0

        L_Eig(1,2) = -Ix_c(i,j,k,nbl)*rho*c         ;  R_Eig(1,2) = Ix_c(i,j,k,nbl)/c_sq
        L_Eig(2,2) = 0.d0                           ;  R_Eig(2,2) = 0.d0
        L_Eig(3,2) = -Iy_c(i,j,k,nbl)               ;  R_Eig(3,2) = -Iz_c(i,j,k,nbl)
        L_Eig(4,2) = -Iz_c(i,j,k,nbl)               ;  R_Eig(4,2) = Iy_c(i,j,k,nbl)
        L_Eig(5,2) = Ix_c(i,j,k,nbl)*rho*c          ;  R_Eig(5,2) = 0.d0

        L_Eig(1,3) = -Iy_c(i,j,k,nbl)*rho*c         ;  R_Eig(1,3) = Iz_c(i,j,k,nbl)/c_sq
        L_Eig(2,3) = -Iz_c(i,j,k,nbl)               ;  R_Eig(2,3) = -Iy_c(i,j,k,nbl)
        L_Eig(3,3) = Ix_c(i,j,k,nbl)                ;  R_Eig(3,3) = Ix_c(i,j,k,nbl)
        L_Eig(4,3) = 0.d0                           ;  R_Eig(4,3) = 0.d0
        L_Eig(5,3) = Iy_c(i,j,k,nbl)*rho*c          ;  R_Eig(5,3) = 0.d0

        L_Eig(1,4) = -Iz_c(i,j,k,nbl)*rho*c         ;  R_Eig(1,4) = -Iy_c(i,j,k,nbl)/c_sq
        L_Eig(2,4) = Iy_c(i,j,k,nbl)                ;  R_Eig(2,4) = -Iz_c(i,j,k,nbl)
        L_Eig(3,4) = 0.d0                           ;  R_Eig(3,4) = 0.d0
        L_Eig(4,4) = Ix_c(i,j,k,nbl)                ;  R_Eig(4,4) = Ix_c(i,j,k,nbl)
        L_Eig(5,4) = Iz_c(i,j,k,nbl)*rho*c          ;  R_Eig(5,4) = 0.d0

        L_Eig(1,5) = 1.d0                           ;  R_Eig(1,5) = 0.5d0/c_sq
        L_Eig(2,5) = -Ix_c(i,j,k,nbl)               ;  R_Eig(2,5) = 0.5d0*Ix_c(i,j,k,nbl)/rho/c
        L_Eig(3,5) = -Iz_c(i,j,k,nbl)               ;  R_Eig(3,5) = 0.5d0*Iy_c(i,j,k,nbl)/rho/c
        L_Eig(4,5) = Iy_c(i,j,k,nbl)                ;  R_Eig(4,5) = 0.5d0*Iz_c(i,j,k,nbl)/rho/c
        L_Eig(5,5) = 1.d0                           ;  R_Eig(5,5) = 0.5d0

        !--------------------------------------------------------------------------------------------

        ! Use the eigenvectors of this face to convert 'Qp' at all II = -2,3 to 'Wp'
        !$acc loop seq
        DO II = -2,3

            !$acc loop seq
            DO n_prim = 1,nprims
                Wp(II,n_prim) = L_Eig(n_prim,1)*Qp(i+II,j,k,nbl,1) + &
                                L_Eig(n_prim,2)*Qp(i+II,j,k,nbl,2) + &
                                L_Eig(n_prim,3)*Qp(i+II,j,k,nbl,3) + &
                                L_Eig(n_prim,4)*Qp(i+II,j,k,nbl,4) + &
                                L_Eig(n_prim,5)*Qp(i+II,j,k,nbl,5)
            ENDDO

        ENDDO

        !========================================================

        Call MP5_INTERPOLATE(Wp, W_iph, W_imh, nprims)

        !========================================================

        ! Back to primitive variables: the state built from W_iph is stored as
        ! the 'L' face value, the one built from W_imh as the 'R' face value.
        !$acc loop seq
        DO n_prim = 1,nprims

            Qp_iphL(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_iph(1) + &
                                        R_Eig(n_prim,2)*W_iph(2) + &
                                        R_Eig(n_prim,3)*W_iph(3) + &
                                        R_Eig(n_prim,4)*W_iph(4) + &
                                        R_Eig(n_prim,5)*W_iph(5)

            Qp_iphR(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_imh(1) + &
                                        R_Eig(n_prim,2)*W_imh(2) + &
                                        R_Eig(n_prim,3)*W_imh(3) + &
                                        R_Eig(n_prim,4)*W_imh(4) + &
                                        R_Eig(n_prim,5)*W_imh(5)
        ENDDO

    endif
    ENDDO
    ENDDO
    ENDDO
    ENDDO


    !==========================================================================
    ! J-direction faces (j+1/2), j = 0..NJmax
    !==========================================================================
    !$acc parallel loop gang collapse(3) default(present)
    DO nbl = 1,nblocks
    DO k = 1,NKmax
    DO j = 0,NJmax
    !$acc loop vector private(L_Eig,R_Eig,Wp,W_iph,W_imh,n_prim)
    DO i = 1,NImax
    ! Same block-extent guard as in the I-direction kernel.
    if (k.le.NK(nbl).and.j.le.NJ(nbl).and.i.le.NI(nbl)) then

        ! Roe-averaged 'Rho' and 'c' on j+1/2 face
        sqrt_rho = sqrt(Qp(i,j+1,k,nbl,1)/Qp(i,j,k,nbl,1))
        rho      = sqrt_rho*Qp(i,j,k,nbl,1)
        divisor  = 1.0d0/(sqrt_rho+1.0d0)

        p    = (Qp(i,j,k,nbl,5) + (Qp(i,j+1,k,nbl,5)*sqrt_rho))*divisor
        c    = sqrt(gamma*p/rho)
        ! Integer square via multiply instead of a real-valued exponent.
        c_sq = c*c

        ! Left and right Eigen vectors on j+1/2 face
        !--------------------------------

        L_Eig(1,1) = 0.d0                           ;  R_Eig(1,1) = 0.5d0/c_sq
        L_Eig(2,1) = Jx_c(i,j,k,nbl)*c_sq           ;  R_Eig(2,1) = -0.5d0*Jx_c(i,j,k,nbl)/rho/c
        L_Eig(3,1) = Jz_c(i,j,k,nbl)*c_sq           ;  R_Eig(3,1) = -0.5d0*Jy_c(i,j,k,nbl)/rho/c
        L_Eig(4,1) = -Jy_c(i,j,k,nbl)*c_sq          ;  R_Eig(4,1) = -0.5d0*Jz_c(i,j,k,nbl)/rho/c
        L_Eig(5,1) = 0.d0                           ;  R_Eig(5,1) = 0.5d0

        L_Eig(1,2) = -Jx_c(i,j,k,nbl)*rho*c         ;  R_Eig(1,2) = Jx_c(i,j,k,nbl)/c_sq
        L_Eig(2,2) = 0.d0                           ;  R_Eig(2,2) = 0.d0
        L_Eig(3,2) = -Jy_c(i,j,k,nbl)               ;  R_Eig(3,2) = -Jz_c(i,j,k,nbl)
        L_Eig(4,2) = -Jz_c(i,j,k,nbl)               ;  R_Eig(4,2) = Jy_c(i,j,k,nbl)
        L_Eig(5,2) = Jx_c(i,j,k,nbl)*rho*c          ;  R_Eig(5,2) = 0.d0

        L_Eig(1,3) = -Jy_c(i,j,k,nbl)*rho*c         ;  R_Eig(1,3) = Jz_c(i,j,k,nbl)/c_sq
        L_Eig(2,3) = -Jz_c(i,j,k,nbl)               ;  R_Eig(2,3) = -Jy_c(i,j,k,nbl)
        L_Eig(3,3) = Jx_c(i,j,k,nbl)                ;  R_Eig(3,3) = Jx_c(i,j,k,nbl)
        L_Eig(4,3) = 0.d0                           ;  R_Eig(4,3) = 0.d0
        L_Eig(5,3) = Jy_c(i,j,k,nbl)*rho*c          ;  R_Eig(5,3) = 0.d0

        L_Eig(1,4) = -Jz_c(i,j,k,nbl)*rho*c         ;  R_Eig(1,4) = -Jy_c(i,j,k,nbl)/c_sq
        L_Eig(2,4) = Jy_c(i,j,k,nbl)                ;  R_Eig(2,4) = -Jz_c(i,j,k,nbl)
        L_Eig(3,4) = 0.d0                           ;  R_Eig(3,4) = 0.d0
        L_Eig(4,4) = Jx_c(i,j,k,nbl)                ;  R_Eig(4,4) = Jx_c(i,j,k,nbl)
        L_Eig(5,4) = Jz_c(i,j,k,nbl)*rho*c          ;  R_Eig(5,4) = 0.d0

        L_Eig(1,5) = 1.d0                           ;  R_Eig(1,5) = 0.5d0/c_sq
        L_Eig(2,5) = -Jx_c(i,j,k,nbl)               ;  R_Eig(2,5) = 0.5d0*Jx_c(i,j,k,nbl)/rho/c
        L_Eig(3,5) = -Jz_c(i,j,k,nbl)               ;  R_Eig(3,5) = 0.5d0*Jy_c(i,j,k,nbl)/rho/c
        L_Eig(4,5) = Jy_c(i,j,k,nbl)                ;  R_Eig(4,5) = 0.5d0*Jz_c(i,j,k,nbl)/rho/c
        L_Eig(5,5) = 1.d0                           ;  R_Eig(5,5) = 0.5d0

        !------------------------------------------------------------------------------------------

        ! Use the eigenvectors of this face to convert 'Qp' at all II = -2,3 to 'Wp'
        !$acc loop seq
        DO II = -2,3
        !$acc loop seq
        DO n_prim = 1,nprims
            Wp(II,n_prim) = L_Eig(n_prim,1)*Qp(i,j+II,k,nbl,1) + &
                            L_Eig(n_prim,2)*Qp(i,j+II,k,nbl,2) + &
                            L_Eig(n_prim,3)*Qp(i,j+II,k,nbl,3) + &
                            L_Eig(n_prim,4)*Qp(i,j+II,k,nbl,4) + &
                            L_Eig(n_prim,5)*Qp(i,j+II,k,nbl,5)
        ENDDO
        ENDDO

        !========================================================

        Call MP5_INTERPOLATE(Wp, W_iph, W_imh, nprims)

        !========================================================

        ! Back to primitive variables on the j+1/2 face.
        !$acc loop seq
        DO n_prim = 1,nprims

            Qp_jphL(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_iph(1) + &
                                        R_Eig(n_prim,2)*W_iph(2) + &
                                        R_Eig(n_prim,3)*W_iph(3) + &
                                        R_Eig(n_prim,4)*W_iph(4) + &
                                        R_Eig(n_prim,5)*W_iph(5)

            Qp_jphR(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_imh(1) + &
                                        R_Eig(n_prim,2)*W_imh(2) + &
                                        R_Eig(n_prim,3)*W_imh(3) + &
                                        R_Eig(n_prim,4)*W_imh(4) + &
                                        R_Eig(n_prim,5)*W_imh(5)
        ENDDO

    endif
    ENDDO
    ENDDO
    ENDDO
    ENDDO


END SUBROUTINE WENO5_FACE_INTERPOLATION_WITH_PRIMS_CHARS

This is the Module in which MP5_INTERPOLATE subroutine is present:

MODULE MP5_subroutines

    implicit none

    contains

    SUBROUTINE MP5_INTERPOLATE(Wp, W_iph, W_imh, nprims)
        !$acc routine seq

        ! MP5 by Suresh and Huynh, Ref- https://doi.org/10.1006/JCPH.1997.5745
        !
        ! Given a six-point stencil of characteristic variables Wp(-2:3, :),
        ! computes for every variable two limited fifth-order face values:
        !   W_iph : built around stencil point 0 from points -2..2
        !   W_imh : built around stencil point 1 from points -1..3 (mirrored)
        ! Each value is the unlimited polynomial interpolant; it is then limited
        ! only when it falls outside the interval spanned by the centre value
        ! and the monotonicity-preserving bound PHI_mp.
        !
        ! Arguments
        !   Wp     (in)  characteristic variables on the stencil, (-2:3, nprims)
        !   W_iph  (out) limited face value from the point-0 side, (nprims)
        !   W_imh  (out) limited face value from the point-1 side, (nprims)
        !   nprims (in)  number of variables
        !
        ! MINMOD2 and MINMOD4 are provided by module some_functions; since this
        ! routine is compiled for the device they must be device-callable too.
        use some_functions
        implicit none

        integer, intent(in) :: nprims
        double precision, dimension(-2:3,nprims), intent(in)  :: Wp
        double precision, dimension(nprims),      intent(out) :: W_iph, W_imh

        integer :: n_prim, II
        double precision :: PHI_mp, D_iph, D_imh, PHI_UL, PHI_AV, PHI_MD, PHI_LC, PHI_min, PHI_max
        double precision :: DI_minus, DI_zero, DI_plus
        double precision, parameter :: eps = 1d-40, alp = 4.d0, B2 = 4.d0/3.d0


        !======================================================================================================
        ! Face value seen from stencil point 0
        !======================================================================================================

        II = 0

        DO n_prim = 1,nprims

            ! This is the interpolation polynomial
            ! Taylor series based interpolation, Legendre polynomial interpolation, Newton polynomial interpolation
            ! any thing can be used
            W_iph(n_prim) = (2.d0*Wp(II-2,n_prim) - 13.d0*Wp(II-1,n_prim) &
                + 47.d0*Wp(II,n_prim) + 27.d0*Wp(II+1,n_prim) - 3.d0*Wp(II+2,n_prim))/60.d0

            PHI_mp = Wp(II,n_prim) + MINMOD2(Wp(II+1,n_prim)-Wp(II,n_prim), alp*(Wp(II,n_prim)-Wp(II-1,n_prim)))

            ! Sometimes the interpolation can create new maxima/minima, so check and correct
            ! Maintain TVD

            IF ((W_iph(n_prim) - Wp(II,n_prim))*(W_iph(n_prim) - PHI_mp) >= eps) THEN

                ! Second differences centred at II-1, II and II+1
                DI_minus = Wp(II-2,n_prim) - 2.d0*Wp(II-1,n_prim) + Wp(II,n_prim)
                DI_zero  = Wp(II-1,n_prim) - 2.d0*Wp(II,n_prim) + Wp(II+1,n_prim)
                DI_plus  = Wp(II,n_prim) - 2.d0*Wp(II+1,n_prim) + Wp(II+2,n_prim)

                D_iph = MINMOD4(4.d0*DI_zero - DI_plus , 4.d0*DI_plus - DI_zero, DI_zero, DI_plus)
                D_imh = MINMOD4(4.d0*DI_zero - DI_minus, 4.d0*DI_minus - DI_zero, DI_zero, DI_minus)

                PHI_UL = Wp(II,n_prim) + alp*(Wp(II,n_prim) - Wp(II-1,n_prim))
                PHI_AV = 0.5d0*(Wp(II,n_prim) + Wp(II+1,n_prim))
                PHI_MD = PHI_AV - 0.5d0*D_iph
                PHI_LC = Wp(II,n_prim) + 0.5d0*(Wp(II,n_prim)-Wp(II-1,n_prim)) + B2*D_imh

                PHI_min = MAX(MIN(Wp(II,n_prim), Wp(II+1,n_prim), PHI_MD), MIN(Wp(II,n_prim), PHI_UL, PHI_LC))
                PHI_max = MIN(MAX(Wp(II,n_prim), Wp(II+1,n_prim), PHI_MD), MAX(Wp(II,n_prim), PHI_UL, PHI_LC))

                ! Move the interpolant into [PHI_min, PHI_max]
                W_iph(n_prim) = W_iph(n_prim) + MINMOD2(PHI_min-W_iph(n_prim), PHI_max-W_iph(n_prim))

            ENDIF

        ENDDO

        !======================================================================================================
        ! Face value seen from stencil point 1 (same formulas with the stencil mirrored)
        !======================================================================================================

        II = 1

        DO n_prim = 1,nprims

            W_imh(n_prim) = (2.d0*Wp(II+2,n_prim) - 13.d0*Wp(II+1,n_prim) &
                + 47.d0*Wp(II,n_prim) + 27.d0*Wp(II-1,n_prim) - 3.d0*Wp(II-2,n_prim))/60.d0

            PHI_mp = Wp(II,n_prim) + MINMOD2(Wp(II-1,n_prim) - Wp(II,n_prim), alp*(Wp(II,n_prim) - Wp(II+1,n_prim)))

            IF ((W_imh(n_prim) - Wp(II,n_prim))*(W_imh(n_prim) - PHI_mp) >= eps) THEN

                ! Second differences centred at II+1, II and II-1
                DI_plus  = Wp(II+2,n_prim) - 2.d0*Wp(II+1,n_prim) + Wp(II,n_prim)
                DI_zero  = Wp(II+1,n_prim) - 2.d0*Wp(II,n_prim) + Wp(II-1,n_prim)
                DI_minus = Wp(II,n_prim) - 2.d0*Wp(II-1,n_prim) + Wp(II-2,n_prim)

                D_iph = MINMOD4(4.d0*DI_zero - DI_plus, 4.d0*DI_plus - DI_zero, DI_zero, DI_plus)
                D_imh = MINMOD4(4.d0*DI_zero - DI_minus, 4.d0*DI_minus - DI_zero, DI_zero, DI_minus)

                PHI_UL = Wp(II,n_prim) + alp*(Wp(II,n_prim) - Wp(II+1,n_prim))
                PHI_AV = 0.5d0*(Wp(II,n_prim) + Wp(II-1,n_prim))
                PHI_MD = PHI_AV - 0.5d0*D_imh
                PHI_LC = Wp(II,n_prim) + 0.5d0*(Wp(II,n_prim)-Wp(II+1,n_prim)) + B2*D_iph

                PHI_min = MAX(MIN(Wp(II,n_prim), Wp(II-1,n_prim), PHI_MD), MIN(Wp(II,n_prim), PHI_UL, PHI_LC))
                PHI_max = MIN(MAX(Wp(II,n_prim), Wp(II-1,n_prim), PHI_MD), MAX(Wp(II,n_prim), PHI_UL, PHI_LC))

                ! Move the interpolant into [PHI_min, PHI_max]
                W_imh(n_prim) = W_imh(n_prim) + MINMOD2(PHI_min-W_imh(n_prim), PHI_max-W_imh(n_prim))

            ENDIF

        ENDDO

    END SUBROUTINE MP5_INTERPOLATE


END MODULE MP5_subroutines

For good or bad, the code runs fine for me on my V100 using NVHPC SDK 21.7. Do I need to modify the source or input file to see the error?

call to cuStreamSynchronize returned error 700: Illegal address during kernel execution

This is a generic device error, similar to a seg fault on the host, where a bad address is being used.

Some possible causes:

  1. A host address is being used on the device.
  2. A derived type with aggregate data members isn’t properly being created on the device
  3. A single object is over 2GB with the -Mlarge_array or -mcmodel=medium flag
  4. Out-of-Bounds error
  5. Heap or Stack overflow

Since you’re using CUDA Unified Memory (‘managed’), I highly doubt it’s #1. There are no derived types, so it’s unlikely to be #2 either (plus UM usually fixes these issues as well). You’re using -mcmodel=medium, so it’s not going to be #3 either.

I ran your program through valgrind (host) and cuda-memcheck (device) and didn’t see any errors, so it’s probably not an out-of-bounds access issue.

There’s no device side allocation that I can see (typically due to using automatics), so it’s not going to be a heap overflow.

Possibly there’s a stack overflow? You only have the one routine call in there, but it does contain a few small fixed size arrays which would be stored on the stack:

                double precision, dimension(3) :: PHI_hat, w, Gma, B
                double precision, dimension(-2:2) :: PHI_L

Try setting the environment variable “NV_ACC_CUDA_STACKSIZE=<N>MB” for various values of N, e.g. 32MB or 64MB.

Otherwise I’m not sure since, again, it runs fine for me.

What GPU and CUDA driver version are you using?

-Mat

% pgfortran -V

pgfortran (aka nvfortran) 21.7-0 64-bit target on x86-64 Linux -tp skylake
PGI Compilers and Tools
Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
dev-sky5:/local/home/mcolgrove/tmp/Fortran_testing% make
make: Warning: File 'Module.f90' has modification time 29866 s in the future
pgfortran -acc -fast -mcmodel=medium -ta=tesla:managed  -c Module.f90
pgfortran -acc -fast -mcmodel=medium -ta=tesla:managed  -c Main.f90
pgfortran -acc -fast -mcmodel=medium -ta=tesla:managed  -c Module.f90
pgfortran -acc -fast -mcmodel=medium -ta=tesla:managed  -c Pre_processing.f90
pgfortran -acc -fast -mcmodel=medium -ta=tesla:managed  Main.o Module.o Pre_processing.o -o a.out
make: warning:  Clock skew detected.  Your build may be incomplete.
% ./a.out
 Allocating variables done...
 Completed reading Disc coefficents...
 (Min, Max) of I_x =     399.9999999901696         400.0000000065968
 (Min, Max) of I_y =   -2.4368470204668486E-011   2.4368470204668289E-011
 (Min, Max) of I_z =     0.000000000000000         0.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of J_x =   -2.4368470204668479E-011   2.4368470204668176E-011
 (Min, Max) of J_y =     399.9999999901696         400.0000000065931
 (Min, Max) of J_z =    -0.000000000000000        -0.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of K_x =     0.000000000000000         0.000000000000000
 (Min, Max) of K_y =     0.000000000000000         0.000000000000000
 (Min, Max) of K_z =     1.000000000000000         1.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of Jac =     319999.9999921206         320000.0000052624

 ___________________________________________________________

 (Min, Max) of I_x_iph =     399.9999999907029         400.0000000189481
 (Min, Max) of I_y_iph =   -5.0330110449676281E-011   2.5031211671313193E-011
 (Min, Max) of I_z_iph =     0.000000000000000         0.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of J_x_iph =   -5.2815621789398101E-011   3.5769110102399421E-011
 (Min, Max) of J_y_iph =     399.9999999906447         400.0000000188774
 (Min, Max) of J_z_iph =     0.000000000000000         0.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of K_x_iph =     0.000000000000000         0.000000000000000
 (Min, Max) of K_y_iph =     0.000000000000000         0.000000000000000
 (Min, Max) of K_z_iph =    0.9999999999999998         1.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of Jac_iph =     319999.9999925573         320000.0000151434

 ___________________________________________________________

 (Min, Max) of I_x_jph =     399.9999999905837         400.0000000188818
 (Min, Max) of I_y_jph =   -5.2815621789397132E-011   3.5769110102398885E-011
 (Min, Max) of I_z_jph =     0.000000000000000         0.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of J_x_jph =   -5.0330110449676275E-011   2.5031211671313404E-011
 (Min, Max) of J_y_jph =     399.9999999906418         400.0000000189524
 (Min, Max) of J_z_jph =     0.000000000000000         0.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of K_x_jph =     0.000000000000000         0.000000000000000
 (Min, Max) of K_y_jph =     0.000000000000000         0.000000000000000
 (Min, Max) of K_z_jph =    0.9999999999999998         1.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of Jac_jph =     319999.9999925086         320000.0000151469

 ___________________________________________________________

 (Min, Max) of I_x_kph =     399.9999999901696         400.0000000065968
 (Min, Max) of I_y_kph =   -2.4368470204668486E-011   2.4368470204668289E-011
 (Min, Max) of I_z_kph =     0.000000000000000         0.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of J_x_kph =   -2.4368470204668479E-011   2.4368470204668176E-011
 (Min, Max) of J_y_kph =     399.9999999901696         400.0000000065931
 (Min, Max) of J_z_kph =    -0.000000000000000        -0.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of K_x_kph =     0.000000000000000         0.000000000000000
 (Min, Max) of K_y_kph =     0.000000000000000         0.000000000000000
 (Min, Max) of K_z_kph =     1.000000000000000         1.000000000000000
 -------------------------------------------------------------------------
 (Min, Max) of Jac_kph =     319999.9999921206         320000.0000052624

 ___________________________________________________________

 grid.xyz file ready
 flow.f file ready!

 ___________________________________________________________

       1  0.0000  0.00023874
       2  0.0002  0.00019773
       3  0.0004  0.00008617
       4  0.0005  0.00004181
       5  0.0006  0.00002056
       6  0.0006  0.00000837
       7  0.0006  0.00000493
       8  0.0006  0.00000212
       9  0.0006  0.00000164
      10  0.0006  0.00000164
 Total wall clock time =    0.2645158767700195      secs

 ___________________________________________________________

 grid.xyz file ready
 flow.f file ready!

 ___________________________________________________________

 SOLUTION - (at domain center)
   0.1380057350288226
    1.205523252704241
    1.206630903436920
    0.000000000000000
   2.9025306460949390E-002
1 Like

No you don’t need to change anything.

GPU info: Tesla V100-PCIE-32GB
CUDA Version: Cuda compilation tools, release 10.1, V10.1.243

I will try this and get back. Thanks!

Ok, so the only real difference, besides you having more memory (my V100 is 16GB), is that you’re using an older CUDA driver version (I’m using CUDA 11.3). Though I just jumped on another system that has a V100 using CUDA 10.1 Update 2 (418.165.2) and it still ran fine for me.

Wish I could reproduce this and be of more help.

1 Like