Integration calls fail

Cattaneo · November 16, 2020, 9:30pm

I’m trying to port some solver code onto GPUs, but unfortunately I’m running into an issue that I’m not really sure how to handle. Specifically I’m running into a problem on the integrate part in that it keeps failing on what seems to be the second integration call. Specifically the error I’m getting is

FORTRAN STOP: 0: Block (2,1,1), Thread (1,1,1)
stop08.cu:28: pgf90_stop08: block: [1,0,0], thread: [0,0,0] Assertion `FORTRAN_STOP_STATEMENT` failed.

This error occurs in this call, it always seems to fail on the a block that has entered but not left this section (N=50).

 fi = (U-V)
 call grad(fi,X,DU)
 do i=1,N
    DU(i)=max(DU(i),1.d-100)
 enddo
 do j=1,N
    do i=1,N
       xvec1(i)=X(i)
       yvec1(i)=DU(i)*Kxold(i,j)
    enddo

    call integrate(2,X(1),X(N),tmp(j),1)
    Upold(j)=U(1)-V(1)+tmp(j)
 enddo

and the subroutines and functions that are causing it are below (all of them are in the module file)

attributes(device) function Diff(mom)
  implicit none
  real*8 :: Diff,mom
  Diff=(mom*mom/dsqrt(1.d0+mom*mom))
  return
end function Diff

attributes(device) function funcexp(z)
  implicit none
  real*8 :: funcexp,z,divdif
  funcexp=dexp(divdif(yvec,xvec,N,z,3))*dexp(z) 
  return
end function funcexp

attributes(device) function funcp1(z)
  implicit none
  real*8 :: funcp1,divdif,z
  funcp1=divdif(yvec1,xvec1,N,z,3)
  return
end function funcp1

attributes(device) function funcp0(z)
  implicit none
  real*8 :: funcp0,divdif,z
  funcp0=dexp(divdif(yvec0,xvec0,N,z,3))
  return
end function funcp0

attributes(device)  function funcxi(z) 
  implicit none
  real*8 :: funcxi,z,divdif
  funcxi=dexp(divdif(yvec,xvec,N,z,3))*dexp(4.d0*z)*dexp(z)/dsqrt(1.d0+dexp(2.d0*z))
  return
end function funcxi

attributes(device) subroutine grad(fi,xi,dfi)
  implicit none
  integer i
  real*8 :: fi(N),xi(N),dfi(N)

  do i=2,N-1
     dfi(i)=(fi(i+1)-fi(i-1))/(xi(i+1)-xi(i-1))
  enddo
  dfi(1)=(fi(2)-fi(1))/(xi(2)-xi(1))
  dfi(N)=(fi(N)-fi(N-1))/(xi(N)-xi(N-1))
  return
end subroutine grad


!!!!	Interpolating subroutine
attributes(device) function DIVDIF(F,A,NN,X,MM)
  implicit none
  integer N,NN,MM,M,MPLUS,IX,IY,MID,NPTS,IP,L,ISUB,J,I,MMAX, errorNumber
  real*8 :: SUM,DIVDIF
  real*8 :: A(NN),F(NN),T(20),D(20),X
  LOGICAL EXTRA

  DATA MMAX/10/
  !     
  !     TABULAR INTERPOLATION USING SYMMETRICALLY PLACED ARGUMENT POINTS.
  !     
  !     START.  FIND SUBSCRIPT IX OF X IN ARRAY A.
  IF( (NN.LT.2) .OR. (MM.LT.1) ) GO TO 20
  N=NN
  M=MM
  MPLUS=M+1
  IX=0
  IY=N+1
  !     (SEARCH INCREASING ARGUMENTS.)
1 MID=(IX+IY)/2
  IF(X.GE.A(MID)) GO TO 2
  IY=MID
  GO TO 3
  !     (IF TRUE.)
2 IX=MID
3 IF(IY-IX.GT.1) GO TO 1
  GO TO 7
  !     COPY REORDERED INTERPOLATION POINTS INTO (T(I),D(I)), SETTING
  !     *EXTRA* TO TRUE IF M+2 POINTS TO BE USED.
7 NPTS=M+2-MOD(M,2)
  IP=0
  L=0
  GO TO 9
8 L=-L
  IF(L.GE.0) L=L+1
9 ISUB=IX+L
  IF((1.LE.ISUB).AND.(ISUB.LE.N)) GO TO 10
  !     (SKIP POINT.)
  NPTS=MPLUS
  GO TO 11
  !     (INSERT POINT.)
10 IP=IP+1
  T(IP)=A(ISUB)
  D(IP)=F(ISUB)
11 IF(IP.LT.NPTS) GO TO 8
  EXTRA=NPTS.NE.MPLUS
  !     
  !     REPLACE D BY THE LEADING DIAGONAL OF A DIVIDED-DIFFERENCE TABLE, SUP-
  !     PLEMENTED BY AN EXTRA LINE IF *EXTRA* IS TRUE.
  DO 14 L=1,M
     IF(.NOT.EXTRA) GO TO 12
     ISUB=MPLUS-L
     D(M+2)=(D(M+2)-D(M))/(T(M+2)-T(ISUB))
12   I=MPLUS
     DO 13 J=L,M
        ISUB=I-L
        D(I)=(D(I)-D(I-1))/(T(I)-T(ISUB))
        I=I-1
13      CONTINUE
14      CONTINUE
        !     
        !     EVALUATE THE NEWTON INTERPOLATION FORMULA AT X, AVERAGING TWO VALUES
        !     OF LAST DIFFERENCE IF *EXTRA* IS TRUE.
        SUM=D(MPLUS)
        IF(EXTRA) SUM=0.5*(SUM+D(M+2))
        J=M
        DO 15 L=1,M
           SUM=D(J)+(X-T(J))*SUM
           J=J-1
15         CONTINUE

           DIVDIF=SUM

           RETURN

20         errorNumber = 1
           errorNumber = 2

           DIVDIF=999999999999999999.

101        errorNumber = 3
102        errorNumber = 4

        END

        subroutine get_time(time_sec) 
          implicit integer(i-n), double precision(a-h, o-z)
          character (len=12) tdum1, tdum2, tdum3
          dimension i_date(8)
          call date_and_time(tdum1, tdum2, tdum3, i_date)
          time_sec = i_date(3)*24.0*3600.0 + i_date(5)*3600.0
          time_sec = time_sec + i_date(6)*60.0 + i_date(7)+ i_date(8)/1.d3
          return
        END subroutine get_time

!!!!------------------------------------------------------------------------------
!!!!		Subroutine for integrals 
        attributes(device) SUBROUTINE integrate(func,a,b,ss,v) 
          INTEGER KMAX,KMAXP,J,JM
          real*8 :: a,b,ss,tol
          integer :: func,v
          PARAMETER (KMAX=14, KMAXP=KMAX+1, J=5, JM=J-1, tol=1.d-1) !#CONTROL Should work even for EPS=1e-1
          INTEGER k
          real*8 :: dss,h(KMAXP),s(KMAXP)

          h(1)=1.d0
          do k=1,KMAX 
             call halfway(func,a,b,s(k),k,v)
             if (k.ge.J) then
                call pol_interpol(h(k-JM),s(k-JM),J,0.d0,ss,dss)
                if (dabs(dss).le.tol*dabs(ss)) return
             endif
             s(k+1)=s(k)
             h(k+1)=h(k)/9.d0
          end do
          !write(*,*)  'ERROR in subroutine Integrate'
          print *, "ERROR in subroutine integrate"
          print *, "Error on thread", threadIdx%x
          print *, "Error on block", blockIdx%x
          !Keep running on other ones
          stop
        END SUBROUTINE integrate

        attributes(device) SUBROUTINE pol_interpol(xa,ya,n,x,y,dy)
          INTEGER n,NMAX
          real*8 :: dy,x,y,xa(n),ya(n)
          PARAMETER (NMAX=10)
          INTEGER i,m,ns
          real*8 :: den,dif,dift,ho,hp,w,c(NMAX),d(NMAX)
          ns=1
          dif=dabs(x-xa(1))
          do i=1,n
             dift=abs(x-xa(i))
             if (dift.lt.dif) then
                ns=i
                dif=dift
             endif
             c(i)=ya(i)
             d(i)=ya(i)
          end do
          y=ya(ns)
          ns=ns-1
          do m=1,n-1
             do i=1,n-m
                ho=xa(i)-x
                hp=xa(i+m)-x
                w=c(i+1)-d(i)
                den=ho-hp
                if(den.eq.0.d0)then
                   !write(*,*)  'ERROR in subroutine interpol'
                   print *, "ERROR in subroutine interpol"
                   stop
                end if
                den=w/den
                d(i)=hp*den
                c(i)=ho*den
             end do
             if (2*ns.lt.n-m)then
                dy=c(ns+1)
             else
                dy=d(ns)
                ns=ns-1
             endif
             y=y+dy
          end do
          return
        END SUBROUTINE pol_interpol

      attributes(device) SUBROUTINE halfway(func,a,b,s,n,v)
      INTEGER n
      real*8 :: a,b,s
      integer :: func,v
      INTEGER :: it,j
      real*8 :: ddel,del,sum,tnm,x
      
      if (v == 1) then
        ! print *, "block", blockIdx%x, "has entered halfway"
        ! print *, "a:", a
        ! print *, "b:", b
        v = 1
      endif
      
      Select Case (func)
      
        case (1)
          if (n.eq.1) then
            s=(b-a)*funcexp(0.5d0*(a+b))
          else
            it=3**(n-2)
            tnm=it
            del=(b-a)/(3.d0*tnm)
            ddel=del+del
            x=a+0.5d0*del
            sum=0.d0
            do j=1,it
              sum=sum+funcexp(x)
              x=x+ddel
              sum=sum+funcexp(x)
              x=x+del
           end do
            s=(s+(b-a)*sum/tnm)/3.d0
          endif
        case (2)
          if (n.eq.1) then
            s=(b-a)*funcp1(0.5d0*(a+b))
          else
            it=3**(n-2)
            tnm=it
            del=(b-a)/(3.d0*tnm)
            ddel=del+del
            x=a+0.5d0*del
            sum=0.d0
            do j=1,it
              sum=sum+funcp1(x)
              x=x+ddel
              sum=sum+funcp1(x)
              x=x+del
           end do
            s=(s+(b-a)*sum/tnm)/3.d0
          endif
        case (3)
          if (n.eq.1) then
            s=(b-a)*funcp0(0.5d0*(a+b))
          else
            it=3**(n-2)
            tnm=it
            del=(b-a)/(3.d0*tnm)
            ddel=del+del
            x=a+0.5d0*del
            sum=0.d0
            do j=1,it
              sum=sum+funcp0(x)
              x=x+ddel
              sum=sum+funcp0(x)
              x=x+del
           end do
            s=(s+(b-a)*sum/tnm)/3.d0
          endif
        case (4)
          if (n.eq.1) then
            s=(b-a)*funcxi(0.5d0*(a+b))
          else
            it=3**(n-2)
            tnm=it
            del=(b-a)/(3.d0*tnm)
            ddel=del+del
            x=a+0.5d0*del
            sum=0.d0
            do j=1,it
              sum=sum+funcxi(x)
              x=x+ddel
              sum=sum+funcxi(x)
              x=x+del
           end do
            s=(s+(b-a)*sum/tnm)/3.d0
          endif
        case default
            return
      end select
      return
      END SUBROUTINE halfway

Unfortunately I don’t know how to better pinpoint the cause of this error. I know it’s not an issue with the init and evolution file I’m using since they work fine on the non-GPU version. I’ve included all the files needed to compile and run the code should this be insufficient.
I’m compiling it with this command

pgfortran -Mcuda=cuda8.0 -o CRAFTest CRAFT_CLEAN.f90 -#

Apologies for the massive amount of text.

CRAFT.zip (12.2 KB)

Robert_Crovella · November 17, 2020, 3:10am

I’m running into an issue that I’m not really sure how to handle

It’s not really clear what you need help with or how familiar you are with the code. When I run your code, I get output like this:

# ./CRAFTest
           12
 All arrays started
 Timesteps detected:           12
    147000000000000.0         294000000000000.0         441000000000000.0
    588000000000000.0         734000000000000.0         881000000000000.0
    1030000000000000.         1180000000000000.         1320000000000000.
    1470000000000000.         1620000000000000.
 starting DSA on thread            3
 starting DSA on thread            8
 starting DSA on thread           12
 starting DSA on thread           11
 starting DSA on thread            6
 starting DSA on thread            7
 starting DSA on thread            1
 starting DSA on thread           10
 starting DSA on thread            2
 starting DSA on thread            5
 starting DSA on thread            9
 starting DSA on thread            4
 running on thread            1 block            8
 running on thread            1 block            3
 running on thread            1 block           11
 running on thread            1 block            6
 running on thread            1 block            1
 running on thread            1 block           12
 running on thread            1 block           10
 running on thread            1 block            7
 running on thread            1 block            2
 running on thread            1 block            9
 running on thread            1 block            5
 running on thread            1 block            4
 block            8 entering loop
 block           11 entering loop
 block            3 entering loop
 block            6 entering loop
 block            1 entering loop
 block           10 entering loop
 block            7 entering loop
 block           12 entering loop
 block            2 entering loop
 block            9 entering loop
 block            5 entering loop
 block            4 entering loop
            3 has completed first loop
            8 has completed first loop
           11 has completed first loop
            6 has completed first loop
            1 has completed first loop
           10 has completed first loop
            7 has completed first loop
            9 has completed first loop
            5 has completed first loop
            2 has completed first loop
            4 has completed first loop
 block            3 about to make first call to integrate
 block            8 about to make first call to integrate
 block           10 about to make first call to integrate
 block            5 about to make first call to integrate
 block           11 about to make first call to integrate
 block            1 about to make first call to integrate
 block            6 about to make first call to integrate
 block            9 about to make first call to integrate
 block            7 about to make first call to integrate
 block            2 about to make first call to integrate
 block            4 about to make first call to integrate
 ERROR in subroutine integrate
 ERROR in subroutine integrate
 Error on thread            1
 Error on thread            1
 Error on block           10
 Error on block            5
FORTRAN STOP: FORTRAN STOP: 0: 0: Block (10,1,1), Thread (1,1,1)
Block (5,1,1), Thread (1,1,1)
stop08.cu:28: pgf90_stop08a: block: [9,0,0], thread: [0,0,0] Assertion `FORTRAN_STOP_STATEMENT` failed.
stop08.cu:28: pgf90_stop08a: block: [4,0,0], thread: [0,0,0] Assertion `FORTRAN_STOP_STATEMENT` failed.
 Made it to the end
#

The first error indication I see looks like this:

  ERROR in subroutine integrate

That seems to correspond to this section of code:

!!!!            Subroutine for integrals
        attributes(device) SUBROUTINE integrate(func,a,b,ss,v)
          INTEGER KMAX,KMAXP,J,JM
          real*8 :: a,b,ss,tol
          integer :: func,v
          PARAMETER (KMAX=14, KMAXP=KMAX+1, J=5, JM=J-1, tol=1.d-1) !#CONTROL Should work even for EPS=1e-1
          INTEGER k
          real*8 :: dss,h(KMAXP),s(KMAXP)

          h(1)=1.d0
          do k=1,KMAX
             call halfway(func,a,b,s(k),k,v)
             if (k.ge.J) then
                call pol_interpol(h(k-JM),s(k-JM),J,0.d0,ss,dss)
                if (dabs(dss).le.tol*dabs(ss)) return
             endif
             s(k+1)=s(k)
             h(k+1)=h(k)/9.d0
          end do
          !write(*,*)  'ERROR in subroutine Integrate'
          print *, "ERROR in subroutine integrate"
          print *, "Error on thread", threadIdx%x
          print *, "Error on block", blockIdx%x
          !Keep running on other ones
          stop
        END SUBROUTINE integrate

It seems evident that when you call that integrate subroutine, the only way to return (successfully) is if this condition is satisfied:

                if (dabs(dss).le.tol*dabs(ss)) return

otherwise you will eventually hit the stop statement.

So it seems like you would want to track down from a calculation perspective why your condition is not being met (appears to be some sort of convergence criteria?)

Cattaneo · November 17, 2020, 3:26am

I apologise for not being clear, I am trying to figure out why it’s not reaching convergence. You are correct in that the error is caused by it not converging, but what I don’t understand is why. This Evolution.dat and Init.dat should converge and do converge on the CPU version (which I’ve attached should you or anyone want to look) so something must have broken on my moving it to CUDA.

Although what is odd is that when you run it the thing doesn’t even get past the first call to integrate, and it manages to put out 2 error messages, whereas on my machine it usually manages to reach at least the 3rd integration but only outputs one error.

I would greatly appreciate your assistance, but I understand if this is outside the purview of this forum.

CRAFT_CLEAN_ORIGINAL.zip (9.2 KB)

njuffa · November 17, 2020, 4:05am

If something seems odd, that is precisely the point where you would start to investigate, e.g. by extracting relevant data items to increase observability.

It usually helps to reproduce failure with as little data as possible and with as little code as possible. E.g. a failure that can be reproduced with a 3x3 matrix is easier to find than one using a 3Kx3K matrix. You can also temporarily disable (or remove) parts of the code not relevant to reproducing the failure, narrowing down the code that causes the failure.

Robert_Crovella · November 17, 2020, 4:21am

debugging is often, in my experience, an incremental, iterative process. You grab the tiger by the tail, and work backwards from there, sometimes taking what seems like baby steps. The tiger will eventually take you to the source of the problem, if you don’t let go. Judiciously use occam’s razor, and arthur conan doyle along the way. If its friday afternoon before the beer party, see if you can round up some colleagues for an exciting round of stadium debugging. “I have this weird problem, when I divide by zero, I don’t get NaN …” "try setting a breakpoint on the else clause, see if it ever gets hit… " “… did you see how she used strace there? that was sweet…”

You already know that you have the ability to print out values from device code.

Have you tried using any print statements to look at, for example, the values of ss and dss ? Perhaps do the same thing with the CPU version? If you restrict your print-out to a single thread (easily done with an if statement) then you should be able to do an apples-to-apples comparison on the convergence process between CPU and GPU. The error output is already indicating which block/thread to start with. The analysis of dss and ss may quickly lead you back to the pol_interpol routine, at which point you could use print statements to track the calculation process in that routine, which appears to be calculating both dss and ss. The process may lead backward from there, which may cause you to inspect the input data to that routine, and compare it to the CPU version. Again, possibly using print statements. And so on.

You could also dust off a debugger. Perhaps, by running 2 debug sessions, one on the CPU code, one on the GPU code, you could track the progress of the code all the way to this subroutine, inspecting data along the way.

I’m mostly still responding to this statement of yours:

I’m running into an issue that I’m not really sure how to handle.

If you are aware of all these ideas then please disregard my comments.

For me, I’ve found that stadium debugging is a lot less fun over the internet, except maybe if you have a video conference going. There is something about a sporting event that makes it more engaging when you are actually there, and can see the action live, as it happens. Reading an article about it just isn’t the same.

Cattaneo · November 17, 2020, 4:27am

I agree and I am, but I’ve always liked to proactively throw my net for solutions. I certainly didn’t mean for my post here to be a replacement for debugging, but if while I work someone else happens to catch something I’ve missed then I see no reason to not get other eyes on it, especially in a forum setting. If that is incorrect or I shouldn’t do that here please let me know and I’ll mark this as solved and go my merry way.

Robert_Crovella · November 17, 2020, 4:31am

within reasonable decorum, you can post whatever you want here…
the only potential penalty is being ignored into obscurity…

Cattaneo · November 17, 2020, 4:32am

In that case I hope this counts as reasonable decorum, I mean no disrespect nor rudeness

Robert_Crovella · November 17, 2020, 4:33am

it’s not rude. It’s fine. I just don’t happen to feel like debugging your code right now. but someone may come along with more stamina or else a magic wand…

njuffa · November 17, 2020, 4:46am

[ That quote from Arthur Conan Doyle made me try the Thumbs-Up button for the first time :-) From “The Sign of the Four”, Doyle’s second Sherlock Holmes novel. Worth (re-)reading]

I don’t know how others feel about it, but one thing I like less than debugging my own code is debugging someone else’s.

I have found that my debugging skills were honed a lot by working through the entire debugging process by myself, baby-step-by-step until the root cause had been identified. Admittedly, part of that was from necessity. Books on debugging had not yet been written, the internet wouldn’t come into existence until years later, and there often was nobody else around who I could have asked for help. I seem to recall the existence of some early interactive debuggers, but mostly debugging was about data reduction, code bisection, strategically adding printfs for logs and data dumps, to be extracted over a serial port or some such.

Much debugging boils down to establishing were exactly one’s mental model deviates from reality, or vice versa. In that context, any unexplained observation is worth following up on, even if it ultimately leads to a dead end.

Robert_Crovella · November 17, 2020, 5:23am

speaking of magic wands:

# cuda-memcheck ./CRAFTest
========= CUDA-MEMCHECK
           12
 All arrays started
 Timesteps detected:           12
    147000000000000.0         294000000000000.0         441000000000000.0
    588000000000000.0         734000000000000.0         881000000000000.0
    1030000000000000.         1180000000000000.         1320000000000000.
    1470000000000000.         1620000000000000.
========= Invalid __global__ read of size 8
=========     at 0x000005a0 in gpufunctions_launchtogpu_
=========     by thread (0,0,0) in block (11,0,0)
=========     Address 0x7f190fc00258 is out of bounds
=========     Device Frame:gpufunctions_launchtogpu_ (gpufunctions_launchtogpu_ : 0x5a0)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2d725e]
=========     Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/lib64/libcudart.so.11.0 [0xf62b]
=========     Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/lib64/libcudart.so.11.0 (cudaLaunchKernel + 0x1c1) [0x4f5b1]
=========     Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/compilers/lib/libcudafor.so (__pgiLaunchKernel + 0x1eb) [0x80276]
=========     Host Frame:./CRAFTest [0x4c39]
=========     Host Frame:./CRAFTest [0x1913]
=========

I don’t know if that helps or not, but it’s another place to grab the tiger’s tail, if you’re so inclined.

Topic		Replies	Views
Annoying problems with memory and/or syntax CUDA Programming and Performance	19	4769	April 8, 2008
Passing dynamically allocated memory in kernel to sub kernel via dynamic parallelism CUDA Programming and Performance	6	688	May 3, 2019
What can't you do in CUDA that you'd like? Requests for the future CUDA Programming and Performance	407	134562	May 26, 2010
CUDA C++ Segmentation Fault CUDA Programming and Performance	7	14638	October 1, 2017
Unexplained stalls in CUDA API calls - reproducer attached Jetson TK1	27	2928	October 18, 2021
Effective Parallelisation of CUDA C code CUDA Programming and Performance	38	1918	December 27, 2021
Running into an endless loop, but why? The strange behaviour of __syncthreads() CUDA Programming and Performance	1	1619	January 27, 2010
Cuda compiler loop unroll bug? CUDA Programming and Performance	14	2393	October 25, 2017
Synchronization hangs sporadically after kernel launch CUDA Programming and Performance	23	7800	August 20, 2015
Why cuda kernel use unexpected stack frame? CUDA Programming and Performance cuda , kernel	8	315	April 3, 2024

Integration calls fail

Related topics