Unknown error when calling a device subroutine

Hi Mat,

Here is another strange error I have encountered when calling a device subroutine.

C:\Users\Dolf\Desktop\test3>cuda-memcheck.exe --log-file err.dat test3.exe
grid 26 26 1
thread 16 16 0
gethxp1 kernel error
unspecified launch failure

4

0: DEALLOCATE: unspecified launch failure


! test3.f90 
program test3

use kernels
use reyneq_Dev
use Q4_globals_Dev
use flows_Dev
use average_Dev
use filmtk_Dev
use sl_fthk_Dev
use bearing_Dev

nxmax = 402
nymax = 402
nx = nxmax
ny = nymax
threads%x = 16 
threads%y = 16

allocate(hxlnewDev(nxmax,nymax),hxlDev(nxmax,nymax),hxrnewDev(nxmax,nymax),hxrDev(nxmax,nymax), &
		 hydnewDev(nxmax,nymax),hyunewDev(nxmax,nymax),hydDev(nxmax,nymax),hyuDev(nxmax,nymax))
allocate(xrefDev(nxmax),yrefDev(nxmax),hDev(nxmax,nymax),hnewDev(nxmax,nymax),hxyDev(nxmax,nymax))
allocate(zdatDev(1001),qndatDev(1001))
allocate(ztaDev(nxmax,nymax),etaDev(nxmax,nymax),bearxDev(nxmax,nymax),&
		 bearyDev(nxmax,nymax),pDev(nxmax,nymax),poldDev(nxmax,nymax))
allocate(awDev(nxmax,nymax),apDev(nxmax,nymax),aeDev(nxmax,nymax), &
         arDev(nxmax,nymax),asDev(nxmax,nymax),anDev(nxmax,nymax))
allocate(ajDev(nxmax),bjDev(nxmax),cjDev(nxmax),djDev(nxmax),betaDev(nxmax),gammaDev(nxmax))

grid = dim3(ceiling(real(nx-1)/threads%x), & !nx-1
                ceiling(real(ny-1)/threads%y),1) !ny-1
write(*,*)'grid',grid
write(*,*)'thread',threads
call gethxp1_kernel<<<grid,threads>>>(nx,ny,nxmax,nymax,hminDev,hx0Dev,hyDev,ylDev,hxyDev,hnewDev, &
	hxlnewDev,hxrnewDev,hxlDev,hxrDev,hydnewDev,hyunewDev,hydDev,hyuDev,xrefDev,yrefDev,zdatDev,qndatDev)
istat = cudaThreadSynchronize()
if (istat .ne. 0) write(*,*) ' gethxp1 kernel error ',cudaGetErrorString(istat)

deallocate(hxlnewDev,hxlDev,hxrnewDev,hxrDev,hydnewDev,hyunewDev,hydDev,hyuDev)
deallocate(xrefDev,yrefDev,hDev,hnewDev,hxyDev,ztaDev,etaDev,bearxDev,bearyDev,pDev,poldDev)
deallocate(awDev,apDev,aeDev,arDev,asDev,anDev,ajDev,bjDev,cjDev,djDev)
istat = cudaDeviceReset()
end program test3



! kernels.f90

module kernels
contains
!======================================================    
    attributes (global) subroutine gethxp1_kernel(nx,ny,nxmax,nymax,hmin,hx0,hy,yl,hxy,hnew, &
		hxlnew,hxrnew,hxl,hxr,hydnew,hyunew,hyd,hyu,xref,yref,zdat,qndat)
!======================================================
    implicit none
    
    integer, value :: nx,ny,nxmax,nymax
    integer :: i,j
    real(8) :: hnew(nxmax,nymax),hxy(nxmax,nymax),xref(nxmax),yref(nymax)
	real(8) :: hyd(nxmax,nymax),hyu(nxmax,nymax),hydnew(nxmax,nymax),hyunew(nxmax,nymax)
	real(8) :: hxl(nxmax,nymax),hxr(nxmax,nymax),hxlnew(nxmax,nymax),hxrnew(nxmax,nymax)
	real(8) :: zdat(:),qndat(:)
	real(8) :: hmin,hx0,hy,yl
    real(8) :: YV1,YV2,XV1,XV2,ZX1,ZY2,ZX2,ZX,ZY,ZY1,DELY,DELX,XV,YV    

    i = (blockidx%x - 1) * blockDim%x + threadidx%x
    j = (blockidx%y - 1) * blockDim%y + threadidx%y  
    
    !if(i <= nx) then
	!    if(j <= ny) then
    !do i = 2,nx-1
	
	if(i >= 2 .and. i <= nx) then
        delx = (xref(i+1)-xref(i-1)) / 2.0
        !do j = 2,ny-1
		if(j >= 2 .and. j <= ny) then 
            dely=(yref(j+1)-yref(j-1)) / 2.0

            xv = (xref(i-1)+xref(i))/2.0 + 0.5*delx
            yv = (yref(j-1)+yref(j))/2.0
            zx = hmin - hx0 * (1.0 - xv)    !height due to nominal FH and pitch angle
            zy = hy * (yv - 0.5*yl)         !height due to roll angle
            hxlnew(i,j) = hxl(i,j)+zx+zy    !what are hxl and hxr???
            hxrnew(i,j) = hxr(i,j)+zx+zy

            if(j.eq.(ny-1)) then
              !grab edge cases as well
              xv=(xref(i-1)+xref(i))/2.0+0.50*delx
              yv=(yref(j)+yref(j+1))/2.0
              zx=hmin-hx0*(1.0-xv)
              zy=hy*(yv-0.50*yl)
              hxlnew(i,j+1)=hxl(i,j+1)+zx+zy
              hxrnew(i,j+1)=hxr(i,j+1)+zx+zy
            endif

            xv=(xref(i-1)+xref(i))/2.d0
            yv=(yref(j-1)+yref(j))/2.d0+0.5d0*dely
            zx=hmin-hx0*(1.d0-xv)
            zy=hy*(yv-0.5d0*yl)
            hydnew(i,j)=hyd(i,j)+zx+zy
            hyunew(i,j)=hyu(i,j)+zx+zy

            if(i.eq.(nx-1)) then
              !grab edge cases as well
              xv=(xref(i)+xref(i+1))/2.d0
              yv=(yref(j-1)+yref(j))/2.d0+0.5d0*dely
              zx=hmin-hx0*(1.d0-xv)
              zy=hy*(yv-0.5d0*yl)
              hydnew(i+1,j)=hyd(i+1,j)+zx+zy
              hyunew(i+1,j)=hyu(i+1,j)+zx+zy
            endif

            xv=xref(i)
            yv=yref(j)
            zx = hmin - hx0*(1.d0-xv)
            zy = hy*(yv-0.5d0*yl)
            hnew(i,j)=hxy(i,j)+zx+zy
        !enddo
    !enddo        
        endif
    endif
    
    return
    end subroutine gethxp1_kernel
end module kernels



!common.f90
!****************************************
module Q4_globals_Dev
!**************************************** 
    use cudafor
    !use Q4_sizes
    !use Q4_globals
    implicit none
    !cuda device variables
    integer :: istat
    type(dim3) :: grid, threads
	type(cudaDeviceProp) :: prop
    
    !slider info
    !xl = x length, yl = y length, zl=**, xg = **,  xt=taper Length, ht = taper angle, rebase = base recess
    real(8), device :: xlDev, ylDev, zlDev, xgDev, xtDev, htDev, rebaseDev, crownDev, camberDev, twistDev
    !Grid
    !==========
    !integer, device :: nxDev,nx1Dev,nx2Dev,nx3Dev,nx4Dev
    real(8), device, allocatable, dimension(:)   :: xrefDev,xref1Dev,xref2Dev,xref3Dev,xref4Dev, &
        yrefDev,yref1Dev,yref2Dev,yref3Dev,yref4Dev
    !real(8), device, allocatable, dimension(:,:) :: bearxDev, bearx1Dev, bearx2Dev,bearx3Dev,&
    !    bearx4Dev, bearyDev, beary1Dev, beary2Dev, beary3Dev, beary4Dev
    
    !Run Setup and Initial attitude
    real(8), device :: skeDev, raDev, rpmDev, u0Dev, hmDev, hx0Dev, h0Dev, hsDev, hyDev, p0xlDev
    
    !height data (say what hnew and h are)
    real(8), device, allocatable, dimension(:,:) :: hDev, h1Dev, h2Dev, h3Dev, h4Dev, hnewDev, hnew1Dev,&
        hnew2Dev, hnew3Dev, hnew4Dev
    
    !pressure
    !========
    real(8), device, allocatable :: pDev(:,:), p1Dev(:,:), p2Dev(:,:), p3Dev(:,:), p4Dev(:,:)
    
    !constants
    real(8), device :: corCoefDev !d0, gama, pir, pit, t1, t2, t3, t4, slip_beta, slip_gamma, accom, 
    
    !reynolds equation
    !=================
    real(8), device :: akmaxDev, akDev, ak0Dev, ak1Dev, ak2Dev, ak3Dev, ak4Dev
    
    !reynolds eqn residuals
    !=====================
    real(8), device, allocatable, dimension(:,:) :: resDev, res1Dev, res2Dev, res3Dev, res4Dev !, &
       ! su01Dev, su02Dev, su03Dev, su04Dev
    !various parameters needed by the reynolds equation
    !many of these are set in ave_height()
    real(8), device, allocatable, dimension(:,:) :: cohimxDev,cohimx1Dev,cohimx2Dev, &
     		    cohimx3Dev,cohimx4Dev,cohjmxDev,cohjmx1Dev,cohjmx2Dev,cohjmx3Dev,cohjmx4Dev, &
     		    himaxDev,himax1Dev,himax2Dev,himax3Dev,himax4Dev, &
     		    himinDev,himin1Dev,himin2Dev,himin3Dev,himin4Dev, &
     		    hjmaxDev,hjmax1Dev,hjmax2Dev,hjmax3Dev,hjmax4Dev, &
     		    hjminDev,hjmin1Dev,hjmin2Dev,hjmin3Dev,hjmin4Dev, &
     		    recssiDev,recssi1Dev,recssi2Dev,recssi3Dev,recssi4Dev, &
     		    recssjDev,recssj1Dev,recssj2Dev,recssj3Dev,recssj4Dev
    
    !FORCES
    !================
    real(8), device :: emaxDev, errDev, fDev, fnegDev, fposDev, fspDev, fsrDev, xfDev, yfDev, &
               !xint(4), yint(4), hint(4), &
               hminDev, MinFHDev, MinFHLocXDev, MinFHLocYDev, &
               jacDev(3,3), rintDev(4), hintgeDev(4),hgapDev(4), &
               XmomDev, YmomDev, xPosLocDev, yPosLocDev, xNegLocDev, yNegLocDev, &
               fvdw_outputDev !,ZmomDev
    real(8), device, allocatable, dimension (:,:) :: vdwMolecularForceMapDev
    real(8), device, allocatable, dimension (:) :: xintNewDev, yintNewDev, hintNewDev
    
    !contact and surface roughness
    !=============================
    real(8), device :: cpDev  !(nxx,nyx)
    real(8), device :: rsikDev,ctaDev,raspDev,fcrDev,txrDev,tyrDev, &
    		   aratioDev,eyoungDev,ydstDev,ydcoeDev,pratioDev,frcoeDev,ahcDev,bhcDev,elecpotDev
    
    !forces on the slider
    real(8), device :: f0Dev, xf0Dev, yf0Dev, xfsDev, yfsDev, Pitch_StiffnessDev, Roll_StiffnessDev,&
    PSADev, RSADev
    
    logical, device :: crashDev
    
end module Q4_globals_Dev
!***************************************************************
module filmtk_Dev
!***************************************************************
	!use Dyn_sizes_module
	implicit none

	real(8), device, allocatable :: hxlnewDev(:,:), hxrnewDev(:,:), hydnewDev(:,:), hyunewDev(:,:) 
	real(8), device :: zx1Dev,zx2Dev,zy1Dev,zy2Dev,zxDev

end module filmtk_Dev

!***************************************************************
module sl_fthk_Dev
!***************************************************************
	
	implicit none

	real(8), device, allocatable :: hxlDev(:,:), hxrDev(:,:), hydDev(:,:), hyuDev(:,:), hxyDev(:,:), &
	           hsadDev(:,:)

end module sl_fthk_Dev


!***************************************************************
module flows_Dev
!***************************************************************
	!use Dyn_sizes_module
	implicit none

	real(8), device :: t1Dev, t2Dev, t3Dev, t4Dev, gamaDev, pirDev, pitDev, d0Dev
	real(8), device, allocatable :: zdatDev(:), qndatDev(:)
	!integer, device :: nterDev, icoeDev, iqpoDev

end module flows_Dev

!***************************************************************
module average_Dev
!***************************************************************
	!use Dyn_sizes_module
	implicit none

	real(8), device, allocatable :: ztaDev(:,:), etaDev(:,:)

end module average_Dev


!***************************************************************
module bearing_Dev
!***************************************************************
	!use Dyn_sizes_module
	implicit none

	real(8), device, allocatable :: bearxDev(:,:), bearyDev(:, :)

end module bearing_Dev

! added by Dolf 06/04/15
!****************************************
module reyneq_Dev
!****************************************
	!use Dyn_sizes_module
	implicit none

	real(8), device, allocatable :: awDev(:,:),apDev(:,:),aeDev(:,:),arDev(:,:),asDev(:,:),anDev(:,:),&
		aw1Dev(:,:),ap1Dev(:,:),ae1Dev(:,:),as1Dev(:,:),an1Dev(:,:),ar1Dev(:,:),dp1Dev(:,:), &
        aw2Dev(:,:),ap2Dev(:,:),ae2Dev(:,:),as2Dev(:,:),an2Dev(:,:),ar2Dev(:,:),dp2Dev(:,:), &
        aw3Dev(:,:),ap3Dev(:,:),ae3Dev(:,:),as3Dev(:,:),an3Dev(:,:),ar3Dev(:,:),dp3Dev(:,:), &
        aw4Dev(:,:),ap4Dev(:,:),ae4Dev(:,:),as4Dev(:,:),an4Dev(:,:),ar4Dev(:,:),dp4Dev(:,:), &
		poldDev(:,:),ajDev(:),bjDev(:),cjDev(:),djDev(:),betaDev(:),gammaDev(:)
	real(8), device :: qnDev(1001),pnDev(1001)
	real(8), device :: siDev,dtDev,omega0Dev,akdDev,aknDev

end module reyneq_Dev

Hi Dolf,

I see two out-of-bounds accesses in the “gethxp1_kernel”: “xref(i+1)” when “i.eq.nx” and “yref(j+1)” when “j.eq.ny”. When I change the guards from “i<=nx” to “i<nx” and “j<=ny” to “j<ny”, the error goes away. Another possible solution would be to extend “xref” and “yref” by one element.

!!!   if(i >= 2 .and. i <= nx) then
   if(i >= 2 .and. i < nx) then
        delx = (xref(i+1)-xref(i-1)) / 2.0
        !do j = 2,ny-1
!!!      if(j >= 2 .and. j <= ny) then
      if(j >= 2 .and. j < ny) then
            dely=(yref(j+1)-yref(j-1)) / 2.0
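If you prefer the second option, the host-side change would just be the allocations, something like this sketch (you would also need to fill the extra elements with sensible values):

    allocate(xrefDev(nxmax+1), yrefDev(nymax+1))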
  • Mat

Thanks Mat, that comment saved me a bunch.

Any other comments about how I should write things in my kernels or main program?

How can I check for out-of-bounds accesses using cuda-memcheck? How did you know that it's xref and yref that cause this problem?

Again, thanks a lot!

Dolf

Any other comments about how I should write things in my kernels or main program?

It looks fine to me. Memory is coalesced for both the I and J indices, which is good.

One suggestion would be to declare your read-only arrays as “INTENT(IN)”. Depending upon the target device, we’ll use this to move these read-only arrays into texture memory.
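For example, in gethxp1_kernel the declarations could look something along these lines (just a sketch; double-check which arrays really are read-only in your version):

    real(8), intent(in)  :: xref(nxmax), yref(nymax)
    real(8), intent(in)  :: hxy(nxmax,nymax), hxl(nxmax,nymax), hxr(nxmax,nymax), &
                            hyd(nxmax,nymax), hyu(nxmax,nymax)
    real(8), intent(out) :: hnew(nxmax,nymax), hxlnew(nxmax,nymax), hxrnew(nxmax,nymax), &
                            hydnew(nxmax,nymax), hyunew(nxmax,nymax)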

Avoiding if statements (beyond the outer guards) is usually a good thing, though here only a few threads will ever take the edge-case branch, so it doesn't matter as much. The problem occurs when different threads in a warp take different branches. Since threads in a warp are executed in SIMT (single instruction, multiple threads) mode, they all execute the same instruction at the same time. If one thread takes a branch and another takes a different branch, your execution time for this section of code is at least doubled, since the threads need to wait for each other.
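As a contrived illustration (not from your code), something like this splits every warp between the two branches, so each warp ends up executing both paths one after the other:

    if (mod(i,2) == 0) then
        a(i,j) = a(i,j) * 2.d0   ! half the warp runs this while the other half waits
    else
        a(i,j) = a(i,j) + 1.d0   ! ...then the roles reverse
    endif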

How can I check for out-of-bounds accesses using cuda-memcheck?

It should say “out-of-bounds” in the output. Though it won’t tell you which memory reference is out of bounds.

An unspecified launch failure means that the kernel aborted abnormally mid-execution. I have found this almost always means an illegal memory access occurred, with the most common cause being an out-of-bounds access. On the CPU, off-by-one errors typically won't cause the execution to abort. Instead, the program just clobbers memory, potentially leading to verification issues. The GPU is more sensitive to memory access issues.

Because of this, I checked your code for potential OOB accesses. In particular, I looked at all the “i+1” and “j+1” accesses and checked them against the index ranges and the array bounds. No magic or tools; I just read the code and followed the logic.

  • Mat

Hi Mat,
So, I have fixed the problems in my test code and implemented the fixes in my original code (Dyn5.exe). Here is what happens when I run the code:

err.dat (cuda-memcheck error log).

========= CUDA-MEMCHECK
========= Program hit cudaErrorLaunchTimeout (error 6) due to "the launch timed out and was terminated" on CUDA API call to cudaThreadSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xe2362) [0x105d62]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\cudart64_65.dll (cudaThreadSynchronize + 0xf5) [0x19585]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (cudathreadsynchronize_ + 0x12) [0x86872]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (reyneq_ + 0x1555) [0x5d7b5]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (MAIN_ + 0x62ad) [0x7a5d]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (main + 0x70) [0x10e0]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (__tmainCRTStartup + 0x144) [0x128e30]
=========     Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= Program hit cudaErrorLaunchTimeout (error 6) due to "the launch timed out and was terminated" on CUDA API call to cudaLaunch. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xe2362) [0x105d62]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\cudart64_65.dll (cudaLaunch + 0x13a) [0x1c59a]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (kernels_sub_reyneq_p6_kernel_ + 0x19f) [0x37cff]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (reyneq_ + 0x1739) [0x5d999]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (MAIN_ + 0x62ad) [0x7a5d]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (main + 0x70) [0x10e0]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (__tmainCRTStartup + 0x144) [0x128e30]
=========     Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= Program hit cudaErrorLaunchTimeout (error 6) due to "the launch timed out and was terminated" on CUDA API call to cudaThreadSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xe2362) [0x105d62]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\cudart64_65.dll (cudaThreadSynchronize + 0xf5) [0x19585]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (cudathreadsynchronize_ + 0x12) [0x86872]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (reyneq_ + 0x173e) [0x5d99e]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (MAIN_ + 0x62ad) [0x7a5d]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (main + 0x70) [0x10e0]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (__tmainCRTStartup + 0x144) [0x128e30]
=========     Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= Program hit cudaErrorLaunchTimeout (error 6) due to "the launch timed out and was terminated" on CUDA API call to cudaMemcpy2D. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xe2362) [0x105d62]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\cudart64_65.dll (cudaMemcpy2D + 0x178) [0x1f4e8]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (pgf90_dev_copyin_2d + 0x11f) [0x84ebf]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (reyneq_ + 0x18df) [0x5db3f]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (MAIN_ + 0x62ad) [0x7a5d]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (main + 0x70) [0x10e0]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (__tmainCRTStartup + 0x144) [0x128e30]
=========     Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= ERROR SUMMARY: 4 errors

Here is what I get at the command prompt:

C:\Users\Dolf\Desktop\test3>cuda-memcheck --log-file err.dat Dyn5.exe
 ******************************************
 * CML Dynamic GPU Simulator Version 5.0  *
 ******************************************

 Simulation started at:  14:45:52 -- 08/21/2015


 GPU installed: GeForce GTX 960

 *****************************************
 Dynamics.def file version:
 4.039

 Running with non-zero altitude
 Air parameters will be set according to specified altitude
 *****************************************
 CML Version 7.0  RAIL.DAT
 7.0
 *****************************************
Actual Grid Size: nx= 402  ny= 402
Suspension Load(grams) =  2.500
Initial Disk Velocity(m/s) =    8.21991
Initial IDEMA Skew Angle(degre -14.76900
Initial Radial Position(mm) =   14.53600
Intermolecular forces are: OFF

 SIMULATOR SOLVES SLIDER RESPONSE
 TO THE FOLLOWING DYNAMIC INPUTS:

  => Initial Impulse

   => Three DOF Stiffness and Damping Coef. Suspension Model

  => Asperity Contact
     ( GW model )

 PRE-PROCESSING...

  dense meshing ...

  adaptive mesh discretization ...
  reyneq_p5 kernel error
 the launch timed out and was terminated

  reyneq_p6 kernel error
 the launch timed out and was terminated

0: copyin 2D (dst=0xf83800, dpitch=3216, src=0x53fa3d0, spitch=3216, width=8, he
ight=1) FAILED: 6(the launch timed out and was terminated)

C:\Users\Dolf\Desktop\test3>

Now I am using the same PVF compiler (15.7) with Visual Studio 2010 and Windows 8.1 64-bit.

I bought (I might have already told you) a GeForce GTX 960 (Gigabyte).

One time when I ran the code, the computer froze and gave me the error message “DPC_WATCHDOG_VIOLATION”; it said it needed to restart and was collecting information before restarting, and then it restarted.

Any clue why? By the way, I sent the original code plus the two input files that need to be in the same folder as the executable (Dyn5.exe) via e-mail. I told the team to forward the code to you just like last time.

Please advise! I am sinking in an enormous pile of GPU problems here.

cheers,
Dolf

Hi Dolf,

This is the problem I wrote about earlier. The Windows Display Driver Model (WDDM) safeguards against your monitor freezing by deploying a watchdog timer. If any process running on your GTX card takes too long, the watchdog timer will kill it.

There are ways to disable the watchdog timer or at least raise the time limit. However, both require you to hack your Windows registry, which is not something I can recommend. You can certainly try it, but changing registry values may lead to instability in your system.
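For reference only, and from memory (please verify against Microsoft's TDR documentation before changing anything), the relevant values live under:

    HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers
        TdrDelay (REG_DWORD) - seconds the watchdog waits before firing (default 2)
        TdrLevel (REG_DWORD) - 0 disables timeout detection; 3 (the default) detects and recovers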

Though I just reviewed an old post on Stack Overflow that details the problem and noticed that there was an update (look for the April 2015 edit). According to the poster, you can now set the timeout factor, or even disable it, from within the NVIDIA Nsight tool. I have not tried this myself, so I don't know if it works, but it seems promising.

  • Mat

Hi Mat,

I checked the post and changed the registry, setting TdrLevel to 0 to disable the watchdog timer, but that disables the whole GPU card.

Dolf

How about instead just setting the timeout factor to a larger value?

Hi Mat,

So I followed the Stack Overflow post and disabled the WDDM timeout from the Nsight tool. I still get the watchdog violation error and the computer restarts.

The error log looks like this:

========= CUDA-MEMCHECK
========= Program hit cudaErrorLaunchTimeout (error 6) due to "the launch timed out and was terminated" on CUDA API call to cudaThreadSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xe2362) [0x105d62]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\cudart64_65.dll (cudaThreadSynchronize + 0xf5) [0x19585]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (cudathreadsynchronize_ + 0x12) [0x86872]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (reyneq_ + 0x1555) [0x5d7b5]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (MAIN_ + 0x62ad) [0x7a5d]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (main + 0x70) [0x10e0]
=========     Host Frame:C:\Users\Dolf\Desktop\test3\Dyn5.exe (__tmainCRTStartup + 0x144) [0x128e30]
=========     Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========

Any next steps to fix the reyneq_p3_kernel subroutine?

Dolf

Hi Dolf,

There's not much more we can do with regard to the watchdog timer other than increase the timeout length or get a Tesla device, which uses the TCC compute driver instead of WDDM.

On my Tesla K40 running on Linux, the reyneq_p3_kernel runs in ~0.2 seconds, while on a Maxwell card the kernel takes ~0.3 seconds. Both are well under the point where a watchdog timer should kill the kernel. I was also able to run your program without encountering a timeout on my Windows laptop, which has a Quadro K2000M. It is not a wimpy card, but it is far less powerful than your GTX 960. I even tried compiling with “-g” and it still didn't time out.

Are you using a bigger data set than I am?

If not, then my best guess is that you have a configuration issue on your system that's slowing your card down. Maybe it's the wrong PCIe slot? A driver issue? Hardware configuration isn't really my area, so I'm not sure how much help I can provide here. Sorry.

  • Mat

Hi Mat,
No, I am using the same input files I sent you: nx = 402, ny = 402.
Does the code run to the end? It is strange.
I have only one PCIe slot in my Dell XPS 8500. I have changed the power supply to one with 700 watts of total power. My Windows is 8.1 Pro.

Dolf

Here's my run on my laptop. It doesn't look like it's getting correct answers, and it seg faults after the 100th iteration, but there is no timeout. The same output occurs at -O0.

PGI$ ./dolf.exe
 ******************************************
 * CML Dynamic GPU Simulator Version 5.0  *
 ******************************************

 Simulation started at:  15:41:29 -- 08/25/2015


 GPU installed: Quadro K2000M

 *****************************************
 Dynamics.def file version:
 4.039

 Running with non-zero altitude
 Air parameters will be set according to specified altitude
 *****************************************
 CML Version 7.0  RAIL.DAT
 7.0
 *****************************************
Actual Grid Size: nx= 402  ny= 402
Suspension Load(grams) =  2.500
Initial Disk Velocity(m/s) =    8.21991
Initial IDEMA Skew Angle(degre -14.76900
Initial Radial Position(mm) =   14.53600
Intermolecular forces are: OFF

 SIMULATOR SOLVES SLIDER RESPONSE
 TO THE FOLLOWING DYNAMIC INPUTS:

  => Initial Impulse

   => Three DOF Stiffness and Damping Coef. Suspension Model

  => Asperity Contact
     ( GW model )

 PRE-PROCESSING...

  dense meshing ...

  adaptive mesh discretization ...
  T(s)= 0.00000    ,Iteration=  5,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 10,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 15,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 20,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 25,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 30,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 35,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 40,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 45,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 50,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 55,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 60,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 65,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 70,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 75,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 80,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 85,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 90,Residual=         NaN
  T(s)= 0.00000    ,Iteration= 95,Residual=         NaN
  T(s)= 0.00000    ,Iteration=100,Residual=         NaN
Segmentation fault
PGI$

I see now. This should be the output, but the residual should not be NaN.
The reason you are not getting the error I am encountering is that the following lines (lines 160-163 in the Reynolds.f90 file) were commented out:

!call reyneq_p3_kernel<<<grid,threads>>>(nx,ny,nxmax,nymax,pDev,xrefDev,yrefDev, &
		!awDev,apDev,aeDev,arDev,asDev,anDev,ajDev,bjDev,cjDev,djDev,betaDev,gammaDev)
		!istat = cudaThreadSynchronize()
		!if (istat .ne. 0) write(*,*) ' reyneq_p3 kernel error ',cudaGetErrorString(istat)

Can you please uncomment the above lines in Reynolds.f90 and see what you get?
Now I am getting this (with the above lines uncommented):

C:\Users\Dolf\Desktop\test3>cuda-memcheck --log-file err8.dat Dyn5.exe
 ******************************************
 * CML Dynamic GPU Simulator Version 5.0  *
 ******************************************

 Simulation started at:  20:28:53 -- 08/25/2015


 GPU installed: GeForce GTX 960

 *****************************************
 Dynamics.def file version:
 4.039

 Running with non-zero altitude
 Air parameters will be set according to specified altitude
 *****************************************
 CML Version 7.0  RAIL.DAT
 7.0
 *****************************************
Actual Grid Size: nx= 402  ny= 402
Suspension Load(grams) =  2.500
Initial Disk Velocity(m/s) =    8.21991
Initial IDEMA Skew Angle(degre -14.76900
Initial Radial Position(mm) =   14.53600
Intermolecular forces are: OFF

 SIMULATOR SOLVES SLIDER RESPONSE
 TO THE FOLLOWING DYNAMIC INPUTS:

  => Initial Impulse

   => Three DOF Stiffness and Damping Coef. Suspension Model

  => Asperity Contact
     ( GW model )

 PRE-PROCESSING...

  dense meshing ...

  adaptive mesh discretization ...
  reyneq_p3 kernel error
 unspecified launch failure

  reyneq_p4 kernel error
 unspecified launch failure

0: copyin 2D (dst=0xe83800, dpitch=3216, src=0x113b7780, spitch=3216, width=8, h
eight=1) FAILED: 4(unspecified launch failure)

C:\Users\Dolf\Desktop\test3>



I was also able to run your program without encountering a timeout on my Windows laptop, which has a Quadro K2000M.

What OS are you running on your laptop? Is it x64 or x32? What GPU driver version do you have on the laptop?

What OS are you running on your laptop? Is it x64 or x32? What GPU driver version do you have on the laptop?

64-bit Win7, CUDA 6.5 driver. But I’m not sure this is relevant.

Are there any other changes you made? I get the same behavior as before once I uncomment this section.

In looking at the “reyneq_p3_kernel”, it does have some issues. Namely, you have several variables, aj, bj, cj, dj, gamma, and beta, that are shared global variables but really need to be private to the kernel. This reminded me that originally you had these variables declared as automatics, but this caused an illegal memory access (I reported this to engineering as TPR#21851, which is still under investigation). I changed these to be fixed-size arrays. The runtime of the kernel did increase from 20,000 to 100,000 ms (presumably due to the data access pattern) but still didn't cause a timeout on my system. Granted, I set my timeout to 2 seconds, so I wouldn't expect it to.

What I'd suggest is that you add a second “ny” dimension to aj, bj, cj, dj, gamma, and beta so that each thread will have its own copy. Alternatively, only parallelize the “nx” dimension. You'll need to make this change to all of your kernels where these are used.
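For example, just as a sketch of the declaration and indexing change (the corresponding host-side allocations in your modules would need to grow to match):

    real(8) :: aj(nxmax,nymax), bj(nxmax,nymax), cj(nxmax,nymax), dj(nxmax,nymax)   ! was (nxmax)
    ! ...and wherever the kernel had aj(k) = <expression>, write aj(k,j) = <expression>
    ! so that each j thread works on its own column.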

Also, I'm concerned about the correctness of the bottom section of the “reyneq_p3_kernel”. For example:

            do 475 k=1,n !nxm2
              p(k+1,j)=dj1(k)
475         continue

Here all the “i” threads will redundantly execute this loop and “p” is not being accessed in the stride-1 dimension.

Given that you're seeing different errors versus what I'm seeing, and that the errors seem to change, I'm wondering if the root cause could be race conditions and algorithmic issues causing undefined behavior.

  • Mat

Hi Mat,

64-bit Win7, CUDA 6.5 driver. But I’m not sure this is relevant.

It's somehow relevant, since I am investigating both hardware and software issues (Windows drivers/updates and my code).

In looking at the “reyneq_p3_kernel”, it does have some issues. Namely, you have several variables, aj, bj, cj, dj, gamma, and beta, that are shared global variables but really need to be private to the kernel.

The reason those variables are global is that their size depends on input variables (nx and ny in the dynamics.def input file). Since you suggested making them private, I will use the following code:
dimension(aj(sizeof(p)). Will that work? Or I can set the size to the maximum nx value allowed in the code, which is 1000; then I will not pass them to the kernel as I am doing now.
The last part of the code:

n = nxm2
			beta(1)=bj(1)
			gamma(1)=dj(1)/beta(1)
			do 10 k=2,n
				km1=k-1
				beta(k)=bj(k)-aj(k)*cj(km1)/beta(km1)
				gamma(k)=(dj(k)-aj(k)*gamma(km1))/beta(k)
10  continue
			dj(n)=gamma(n)
			do 20 k=n-1,1,-1
				dj(k)=gamma(k)-cj(k)*dj(k+1)/beta(k)
20  continue
			! tridag ends
            do 475 k=1,n !nxm2
              p(k+1,j)=dj(k)
475         continue

I will remove it from reyneq_p3 and put it in another kernel. I hope that's OK and will not ruin the results.

Though the persisting question is: how come you can run the code on your laptop, with all those problems, but I cannot? It does not make any sense at all. My last resort is to install the GTX 680 card in a Windows 7 Pro machine and see if that works.

regards,
Dolf

I will use the following code: dimension(aj(sizeof(p)). Will that work?

No. By manually privatizing, I mean adding an extra dimension, “aj(nym,nxm)”, so that each thread will have its own copy.

Right now you use an “i” and a “j” index for each thread but access “aj(j)”. Hence, for a given value of “j”, all of the threads access the same “aj(j)”, which causes a race condition. Instead, if you use “aj(j,i)” or “aj(i,j)”, each thread has its own value.

However, given the second part of the code, it may actually be better not to use a 2-D thread block and instead just parallelize over “nym” (i.e., the j index). In this case, you don't need to add the second dimension to “aj”.
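Something like this on the host side, sketched against your reyneq_p3 call (the block size of 128 is just an example):

    threads = dim3(128, 1, 1)
    grid    = dim3(ceiling(real(ny)/threads%x), 1, 1)
    call reyneq_p3_kernel<<<grid,threads>>>(nx,ny,nxmax,nymax,pDev,xrefDev,yrefDev, &
        awDev,apDev,aeDev,arDev,asDev,anDev,ajDev,bjDev,cjDev,djDev,betaDev,gammaDev)
    istat = cudaThreadSynchronize()

and inside the kernel, derive only j from the thread index and loop over i serially where needed:

    j = (blockidx%x - 1) * blockDim%x + threadidx%x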

Though the persisting question is: how come you can run the code on your laptop, with all those problems, but I cannot?

Sorry, I have no idea. My current guess is that since the code has race conditions and other issues, the behavior is undefined and manifests itself differently on different systems. But yes, it could be an issue with your system as well.

However, given the second part of the code, it may actually be better not to use a 2-D thread block and instead just parallelize over “nym” (i.e., the j index). In this case, you don't need to add the second dimension to “aj”.

I did not quite understand that. Can you give an example of the correct version of reyneq_p3_kernel so I can better understand the problem? Then I can update all the other subroutines that are similar to this one.

thanks,
Dolf

Let me use pseudo-code and code snippets to illustrate the issues I see.

        i = (blockidx%x - 1) * blockDim%x + threadidx%x
        j = (blockidx%y - 1) * blockDim%y + threadidx%y

Here you've set up the kernel to use a 2-dimensional thread block with the indices i and j. This is perfectly acceptable, and each thread will have a unique (i, j) pair. However, multiple threads may share the same i or j value.

   if ( j >= 2 .and. j <= nym1) then
       ... some code ...
       if(i >= 2 .and. i <= nxm1) then
             ... some code ...
              aj(i)=aw(i,j)
              ... more code
        endif
         ... CODE SEE BELOW ...
    endif

Here you have two globally shared arrays, aj and aw. Each thread will load a unique value of aw but store into an element of aj that's shared by multiple threads. The actual value that gets stored into aj(i) will depend on the order in which the threads are executed. To solve this issue, you have two options.

First, you could make aj private to the thread. In other words, each thread gets its own unique copy of the array. The problem here is that you don't know the size of aj until the routine is called, so you would need to dynamically allocate it every time the routine is executed. While newer NVIDIA devices do support dynamic allocation by individual threads, there is a high performance cost. Also, since the memory is not contiguous among threads in the same warp, the program will suffer from memory divergence and a severe performance degradation. Finally, as we found with your original error, PGI does have a bug when creating automatic arrays. While the bug will be fixed in a later release, the performance issues will remain.

Second, you could add an extra “j” dimension to aj. This will ensure that every thread writes to a unique element of the array. Extra memory is required, but no more than what's needed if the arrays are private. Plus, the memory can be accessed contiguously and does not suffer the performance penalties. When I say to “manually privatize” an array, this is what I mean.

  if ( j >= 2 .and. j <= nym1) then
       ... some code ...
       if(i >= 2 .and. i <= nxm1) then
           ... 
       endif  !! end i
      
      do 20 k=n-1,1,-1
             dj(k)= ... expression ...
20  continue
                     
       do 475 k=1,n 
              p(k+1,j)=dj(k)
475  continue
 
  endif  !!! end j

Now this section of code is even more problematic. Here dj does need to be privatized, otherwise all threads will be updating the same memory in random order. However, to manually privatize it you would need to add both an i and a j dimension so that each thread gets its own “k” array.

p also has a race condition, since threads that share the same j index will populate the same “p(k+1,j)” elements of the array. The final value depends upon the order in which the threads were executed (i.e., it is non-deterministic).

I'm thinking that what might work best here is to separate this kernel into two separate kernels. The first kernel would compute the first block of code above. You would parallelize as you do now using a 2-D thread block and manually privatize the aj array (as well as the others). A second kernel would handle the second section but only use a 1-D thread block across j. You would need to manually privatize dj (and beta and gamma) as well, but would only need to add a single extra dimension instead of two.
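To make the split concrete, here is a rough sketch. The kernel names are made up, the coefficient expressions are placeholders patterned after your reyneq_p5 code, and the boundary-row adjustments are omitted, so treat it as an outline rather than working code:

! Kernel A: 2-D thread block; each (i,j) thread writes its own element of the
! manually privatized coefficient arrays, so no two threads touch the same element.
attributes (global) subroutine reyneq_p3a_kernel(nx,ny,nxmax,nymax,p, &
        aw,ap,ae,ar,as,an,aj,bj,cj,dj)
    implicit none
    integer, value :: nx,ny,nxmax,nymax
    integer :: i,j,im1,ip1,jm1,jp1
    real(8) :: p(nxmax,nymax),aw(nxmax,nymax),ap(nxmax,nymax),ae(nxmax,nymax), &
               ar(nxmax,nymax),as(nxmax,nymax),an(nxmax,nymax)
    real(8) :: aj(nxmax,nymax),bj(nxmax,nymax),cj(nxmax,nymax),dj(nxmax,nymax)

    i = (blockidx%x - 1) * blockDim%x + threadidx%x
    j = (blockidx%y - 1) * blockDim%y + threadidx%y
    if (i >= 2 .and. i <= nx-1 .and. j >= 2 .and. j <= ny-1) then
        im1 = i-1
        ip1 = i+1
        jm1 = j-1
        jp1 = j+1
        aj(im1,j) = aw(im1,jm1)   ! placeholder coefficient expressions
        bj(im1,j) = ap(im1,jm1)
        cj(im1,j) = ae(im1,jm1)
        dj(im1,j) = ar(im1,jm1) - as(im1,jm1)*p(i,jm1) - an(im1,jm1)*p(i,jp1)
    endif
    end subroutine reyneq_p3a_kernel

! Kernel B: 1-D thread block over j only; each thread runs its own tridag using
! its own column of dj/beta/gamma and writes its own row of p, so there is no race.
attributes (global) subroutine reyneq_p3b_kernel(nx,ny,nxmax,nymax,p, &
        aj,bj,cj,dj,beta,gamma)
    implicit none
    integer, value :: nx,ny,nxmax,nymax
    integer :: j,k,km1,n
    real(8) :: p(nxmax,nymax)
    real(8) :: aj(nxmax,nymax),bj(nxmax,nymax),cj(nxmax,nymax),dj(nxmax,nymax), &
               beta(nxmax,nymax),gamma(nxmax,nymax)

    j = (blockidx%x - 1) * blockDim%x + threadidx%x
    if (j >= 2 .and. j <= ny-1) then
        n = nx-2
        beta(1,j)  = bj(1,j)
        gamma(1,j) = dj(1,j)/beta(1,j)
        do k = 2, n
            km1 = k-1
            beta(k,j)  = bj(k,j) - aj(k,j)*cj(km1,j)/beta(km1,j)
            gamma(k,j) = (dj(k,j) - aj(k,j)*gamma(km1,j))/beta(k,j)
        enddo
        dj(n,j) = gamma(n,j)
        do k = n-1, 1, -1
            dj(k,j) = gamma(k,j) - cj(k,j)*dj(k+1,j)/beta(k,j)
        enddo
        do k = 1, n
            p(k+1,j) = dj(k,j)
        enddo
    endif
    end subroutine reyneq_p3b_kernel

Kernel A would keep your current 2-D launch; kernel B would use a 1-D launch over j like the one sketched earlier.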

Similar ideas would be required for your other kernels.

Note that these are just suggestions. I don't know your code as well as you do, nor do I know your algorithm. Please use your own judgment and feel free to disagree with any suggestions I've made.

  • Mat

Hi Mat,

I do agree with you that this part below is problematic.

if ( j >= 2 .and. j <= nym1) then 
       ... some code ... 
       if(i >= 2 .and. i <= nxm1) then 
           ... 
       endif  !! end i 
      
      do 20 k=n-1,1,-1 
             dj(k)= ... expression ... 
20  continue 
                      
       do 475 k=1,n 
              p(k+1,j)=dj(k) 
475  continue 
  
  endif  !!! end j

But as you can see, p gets updated while the i loop is doing its thing.

!=============================================================
attributes (global) subroutine reyneq_p5_kernel(nx,ny,nxmax,nymax,p,xref,yref, &
	aw,ap,ae,ar,as,an,aj,bj,cj,dj,beta,gamma)
!=============================================================

	integer, value :: nx,ny,nxmax,nymax
	integer :: i,j,k,km1,nxm1,nym1,jm1,jp1,im1,ip1,nxm2,nym2,n
	real(8) :: p(nxmax,nymax),xref(nxmax),yref(nxmax),aw(nxmax,nymax),ap(nxmax,nymax), &
		ae(nxmax,nymax),ar(nxmax,nymax),as(nxmax,nymax),an(nxmax,nymax), &
		aj(nxmax,nymax),bj(nxmax,nymax),cj(nxmax,nymax),dj(nxmax,nymax), &
		beta(nxmax,nymax),gamma(nxmax,nymax)
	real(8) :: ymyp,xmxp
   
	i = (blockidx%x - 1) * blockDim%x + threadidx%x
	j = (blockidx%y - 1) * blockDim%y + threadidx%y
	
	nym1 = ny-1
	nxm1 = nx-1
	nxm2 = nx-2 
	nym2 = ny-2
	 
	!do 700 i=2,nxm1
	if (i >= 2 .and. i <= nxm1) then
            im1=i-1
            ip1=i+1
            xmxp=(xref(ip1)-xref(im1))/2.d0
!
            !do 650 j=2,nym1
			if (j >=2 .and. j <= nym1) then
              jm1=j-1
              jp1=j+1
              ymyp=(yref(jp1)-yref(jm1))/2.d0
!
              aj(jm1,i)=as(im1,jm1)
              bj(jm1,i)=ap(im1,jm1)
              cj(jm1,i)=an(im1,jm1)
              dj(jm1,i)=ar(im1,jm1)-aw(im1,jm1)*p(im1,j) - ae(im1,jm1)*p(ip1,j)
!
              if(j.eq.2) then
                dj(jm1,i)=dj(jm1,i)-as(im1,jm1)*p(i,jm1)
                aj(jm1,i)=0.d0
              else if(j.eq.nym1) then
                dj(jm1,i)=dj(jm1,i)-an(im1,jm1)*p(i,jp1)
                cj(jm1,i)=0.d0
              end if
!
!650         continue
			endif ! j
            
			!call tridag(nym2,aj,bj,cj,dj)
			! tridag starts
			n = nym2
			beta(1,i)=bj(1,i)
			gamma(1,i)=dj(1,i)/beta(1,i)
			do 10 k=2,n
				km1=k-1
				beta(k,i)=bj(k,i)-aj(k,i)*cj(km1,i)/beta(km1,i)
				gamma(k,i)=(dj(k,i)-aj(k,i)*gamma(km1,i))/beta(k,i)
10  continue
			dj(n,i)=gamma(n,i)
			do 20 k=n-1,1,-1
				dj(k,i)=gamma(k,i)-cj(k,i)*dj(k+1,i)/beta(k,i)
20  continue
			! tridag ends
            do 675 k=1,n
              p(i,k+1)=dj(k,i)
675         continue
!700       continue
		  endif ! i
	
	end subroutine reyneq_p5_kernel