error 700: Illegal address during kernel execution

How to debug this error?
I am using OpenACC to speed up my code and have only added a few simple directives, but this error occurs. I set PGI_ACC_DEBUG=1, and it shows the following information.

pgi_uacc_dataenterstart( file=/home/0401/cfl3dv6/build/cfl/libs/diagj.F, function=diagj, line=1:1, line=78, devid=0 )
pgi_uacc_dataon(hostptr=0x7f6dad10a9b0,stride=1,121,2178,size=121x1x1,extent=121x18x30,eltsize=8,lineno=78,name=dtj,flags=0x2700=present+create+copyin+inexact,async=-1,threadid=1)
pgi_uacc_alloc(size=968,devid=1,threadid=1)
allocate device memory 0x7f6d8ae00000(1024B)
pgi_uacc_alloc(size=968,devid=1,threadid=1) returns 0x7f6d8ae00000
map    dev:0x7f6d8ae00000 host:0x7f6dad10a9b0 dindex:1 size:968 offset:0  (line:78 name:dtj) thread:1
alloc done with devptr at 0x7f6d8ae00000
pgi_uacc_dataupx(devptr=0x7f6d8ae00000,hostptr=0x7f6dad10a9b0,stride=1,size=121,extent=65340,eltsize=8,lineno=78,name=dtj,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x7f6d8ae00000,hostsrc=0x7f6dad10a9b0,offset=0,stride=1,size=121,eltsize=8,lineno=78,name=dtj,thread=1)
pgi_uacc_dataon(hostptr=0x7f6dac570230,stride=1,121,2178,67518,size=121x1x1x5,extent=121x18x31x5,eltsize=8,lineno=78,name=q,flags=0x2700=present+create+copyin+inexact,async=-1,threadid=1)
pgi_uacc_alloc(size=2161544,devid=1,threadid=1)
allocate device memory 0x7f6d8b000000(2161664B)
pgi_uacc_alloc(size=2161544,devid=1,threadid=1) returns 0x7f6d8b000000
map    dev:0x7f6d8b000000 host:0x7f6dac570230 dindex:1 size:2161544 offset:0  (line:78 name:q) thread:1
alloc done with devptr at 0x7f6d8b000000
pgi_uacc_dataupx(devptr=0x7f6d8b000000,hostptr=0x7f6dac570230,stride=1,67518,size=121x5,extent=67518x5,eltsize=8,lineno=78,name=q,async=-1,threadid=1)
pgi_uacc_cuda_dataup2(devdst=0x7f6d8b000000,hostsrc=0x7f6dac570230,offset=0,0,stride=1,67518,size=121,5,eltsize=8,lineno=78,name=q)
pgi_uacc_dataon(hostptr=0x7f6db2f09e78,offset=0,5,stride=1,63162,size=63162x25,extent=63162x35,eltsize=8,lineno=78,name=t,flags=0x2f00=present+create+copyin+copyout+inexact,async=-1,threadid=1)
pgi_uacc_alloc(size=12632400,devid=1,threadid=1)
allocate device memory 0x7f6d8e400000(12632576B)
pgi_uacc_alloc(size=12632400,devid=1,threadid=1) returns 0x7f6d8e400000
map    dev:0x7f6d8e400000 host:0x7f6db3172b88 dindex:1 size:12632400 offset:0  (line:78 name:t) thread:1
alloc done with devptr at 0x7f6d8e1972f0
pgi_uacc_dataupx(devptr=0x7f6d8e1972f0,hostptr=0x7f6db2f09e78,offset=315810,stride=1,size=1579050,extent=2210670,eltsize=8,lineno=78,name=t,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x7f6d8e1972f0,hostsrc=0x7f6db2f09e78,offset=315810,stride=1,size=1579050,eltsize=8,lineno=78,name=t,thread=1)
pgi_uacc_dataon(hostptr=0x7f6db2b5d0e8,stride=1,121,2178,65340,size=121x1x1x5,extent=121x18x30x5,eltsize=8,lineno=78,name=res,flags=0x2700=present+create+copyin+inexact,async=-1,threadid=1)
pgi_uacc_alloc(size=2091848,devid=1,threadid=1)
allocate device memory 0x7f6d8b400000(2092032B)
pgi_uacc_alloc(size=2091848,devid=1,threadid=1) returns 0x7f6d8b400000
map    dev:0x7f6d8b400000 host:0x7f6db2b5d0e8 dindex:1 size:2091848 offset:0  (line:78 name:res) thread:1
alloc done with devptr at 0x7f6d8b400000
pgi_uacc_dataupx(devptr=0x7f6d8b400000,hostptr=0x7f6db2b5d0e8,stride=1,65340,size=121x5,extent=65340x5,eltsize=8,lineno=78,name=res,async=-1,threadid=1)
pgi_uacc_cuda_dataup2(devdst=0x7f6d8b400000,hostsrc=0x7f6db2b5d0e8,offset=0,0,stride=1,65340,size=121,5,eltsize=8,lineno=78,name=res)
pgi_uacc_dataon(hostptr=0x7f6dad08afd0,stride=1,121,2178,size=121x1x1,extent=121x18x30,eltsize=8,lineno=78,name=vol,flags=0x2700=present+create+copyin+inexact,async=-1,threadid=1)
pgi_uacc_alloc(size=968,devid=1,threadid=1)
allocate device memory 0x7f6d8ae00400(1024B)
pgi_uacc_alloc(size=968,devid=1,threadid=1) returns 0x7f6d8ae00400
map    dev:0x7f6d8ae00400 host:0x7f6dad08afd0 dindex:1 size:968 offset:0  (line:78 name:vol) thread:1
alloc done with devptr at 0x7f6d8ae00400
pgi_uacc_dataupx(devptr=0x7f6d8ae00400,hostptr=0x7f6dad08afd0,stride=1,size=121,extent=65340,eltsize=8,lineno=78,name=vol,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x7f6d8ae00400,hostsrc=0x7f6dad08afd0,offset=0,stride=1,size=121,eltsize=8,lineno=78,name=vol,thread=1)
pgi_uacc_dataenterdone( devid=1 )
pgi_uacc_cuda_wait(lineno=-99,async=-1,dindex=1)
pgi_uacc_cuda_wait(sync on stream=0x29b4290)
pgi_uacc_cuda_wait done
pgi_uacc_computestart( file=/home/xll/take_test/cfl3d_xll_0401/cfl3dv6/build/cfl/libs/diagj.F, function=diagj, line=1:1, line=78, devid=0, computeconstruct=9999 )
pgi_uacc_launch funcnum=0 argptr=0x7ffe8733df20 sizeargs=(nil) async=-1 devid=1
Arguments to function 0 diagj_79_gpu dindex=1 threadid=1 device=0: 
               522        522        522        522        120       -121       -121       -121
              -121        121        121        121        121      32621 -1958739968      32621
        0x0000020a 0x0000020a 0x0000020a 0x0000020a 0x00000078 0xffffff87 0xffffff87 0xffffff87
        0xffffff87 0x00000079 0x00000079 0x00000079 0x00000079 0x00007f6d 0x8b400000 0x00007f6d
Launch configuration for function=0=diagj_79_gpu line=79 dindex=1 threadid=1 device=0 <<<(1,1,1),(32,1,1),0>>> async=-1
pgi_uacc_computedone( devid=0, computeconstruct=9999 )
pgi_uacc_cuda_wait(lineno=-99,async=-1,dindex=1)
pgi_uacc_cuda_wait(sync on stream=0x29b4290)
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution

call to cuMemFreeHost returned error 700: Illegal address during kernel execution

The code I changed is as follows:

      kv = npl*kdim
!$acc kernels loop      
      do 1009 j=1,jdim1
      kj = (j-1)*kv+1
      do 1004 l=1,5
c
      jj = 1-jdim
      do 8466 ii=1,kv
      jj = jj+jdim
 8466 t(kj+ii-1,25+l) = -res(j+jj-1,1,i,l)
c      call q8vgathp(kv,res(j,1,i,l),jdim,kv,kv,t(kj,25+l))
c
      jj = 1-jdim
      do 8467 ii=1,kv
      jj = jj+jdim
 8467 t(kj+ii-1,l+5) = q(j+jj-1,1,i,l)
c      call q8vgathp(kv,q(j,1,i,l),jdim,kv,kv,t(kj,l+15))
 1004 continue
c
      jj = 1-jdim
      do 8458 ii=1,kv
      jj = jj+jdim
 8458 t(kj+ii-1,21) = tfacp1*dtj(j+jj-1,1,i)
c      call q8vgathp(kv,dtj(j,1,i),jdim,kv,kv,t(kj,21))
      if(ivisc(2) .gt. 0) then
        jj=1-jdim
        do 9458 ii=1,kv
        jj=jj+jdim
 9458   t(kj+ii-1,12)=vol(j+jj-1,1,i)
      end if
 1009 continue
!$acc end kernels      
c
!$acc kernels loop      
      do 1119 j=1,jdim
      kj = (j-1)*kv+1
      do 1119 l=1,5
c
      jj = 1-jdim
      do 8459 ii=1,kv
      jj = jj+jdim
 8459 t(kj+ii-1,15+l) = sj(j+jj-1,1,i,l)
c      call q8vgathp(kv,sj(j,1,i,l),jdim,kv,kv,t(kj,5+l))
 1119 continue
!$acc end kernels      
      if(ivisc(2) .gt. 1) then
        ic=0
!$acc kernels loop      
        do 8558 ipl=1,npl
          ii=i+ipl-1
          do 8558 k=1,kdim
            ic=ic+1
            if(k .ne. kdim) then
              do 1118 j=1,jdim1
                kj=(j-1)*kv
                t(kj+ic,31)=vist3d(j,k,ii)
 1118         continue
            else
              do 1120 j=1,jdim1
                kj=(j-1)*kv
                t(kj+ic,31)=vist3d(j,kdim1,ii)
 1120         continue
            end if
 8558   continue
!$acc end kernels      
      end if

It is very confusing

Hi xll_bit,

An “illegal address error” is similar to a seg fault on the host where the code is accessing a bad device address. It can be caused by a number of things such as accessing a host pointer on the device, out-of-bounds errors, bad pointers (C/C++), etc.

From the debug output, the error is occurring in the loop at line 79 of the “diagj” routine (i.e. diagj_79_gpu) which I presume is the first kernel.

First, I’d like to see the compiler feedback messages (-Minfo=accel) generated during compilation.

You don’t use data directives (at least you don’t show them) which means the compiler is having to implicitly copy the data for you. Since you’re using computed indices, it’s possible that the compiler is getting confused as to the size of the arrays. The feedback messages will tell us more.

Secondly, I’d like you to add a data region around the compute regions. This will help you with performance, but also ensure the full arrays are being copied.

It will look something like the following, but please add all the needed arrays if I missed any.

      kv = npl*kdim 
!$acc data copy(t) copyin(res,q,dtj,vol,sj,vist3d)
!$acc kernels loop      
      do 1009 j=1,jdim1 
      kj = (j-1)*kv+1 
... cut ...
 1120         continue 
            end if 
 8558   continue 
!$acc end kernels      
!$acc end data
      end if

Hope this helps,
Mat

Hi Mat,
Thank you for your reply. I am trying to add data directives in other parts of the program, but I often get one of these errors: “variable in data clause is partially present on device” or “Illegal address during kernel execution”. I have encountered a lot of such errors these days, and I am trying to find out how to solve them.
Here are the compiler feedback messages.

diagj:
     78, Generating implicit copyin(dtj(:,:1,i),res(:,:1,i,:),q(:,:1,i,:),vol(:,:1,i))
         Generating implicit copy(t(:,6:30))
     79, Parallelization would require privatization of array t(:,i2+26)
         Accelerator kernel generated
         Generating Tesla code
         79, !$acc loop seq
         81, !$acc loop vector(32) ! threadidx%x
         84, !$acc loop seq
         90, !$acc loop seq
         97, !$acc loop seq
        103, !$acc loop seq
     81, Loop is parallelizable
     84, Loop carried reuse of t prevents parallelization
         Inner sequential loop scheduled on accelerator
     90, Loop carried reuse of t prevents parallelization
         Inner sequential loop scheduled on accelerator
     97, Parallelization would require privatization of array t(:,21),t(:,26:30)
    103, Parallelization would require privatization of array t(:,12),t(:,26:30)
    110, Generating implicit copyout(t(:,16:20))
         Generating implicit copyin(sj(:,:1,i,:))
    111, Parallelization would require privatization of array t(:,i2+16)
         Accelerator kernel generated
         Generating Tesla code
        111, !$acc loop seq
        113, !$acc loop vector(32) ! threadidx%x
        116, !$acc loop seq
    113, Loop is parallelizable
    116, Loop carried reuse of t prevents parallelization
         Inner sequential loop scheduled on accelerator
    124, Generating implicit copyin(vist3d(:jdim-1,:kdim,i:npl+i-1))
         Generating implicit copyout(t(:,31))
    125, Parallelization would require privatization of array t(:,31)
    127, Parallelization would require privatization of array t(:,31)
         Accelerator kernel generated
         Generating Tesla code
        125, !$acc loop seq
        127, !$acc loop seq
        130, !$acc loop seq
        135, !$acc loop seq
    130, Parallelization would require privatization of array t(:,31)
    135, Parallelization would require privatization of array t(:,31)

But I often get one of these errors: “variable in data clause is partially present on device” or “Illegal address during kernel execution”. I have encountered a lot of such errors these days, and I am trying to find out how to solve them.

I just posted a reply to your other post and hopefully adding a “present” clause will fix the issue.

The feedback messages show a number of issues which are preventing your code from parallelizing. For example:

Parallelization would require privatization of array t(:,i2+26)

The problem being that you’re using computed indices so the compiler can’t prove that the code does not have any dependencies. Since computed indices could compute the same index, the compiler must assume that they do and hence the loop cannot be parallelized.

If you can guarantee that the loop is parallelizable, then you can override the compiler’s analysis by adding the “independent” clause to the “loop” directive (you’ll need to do this for every loop you want to parallelize) or use the “parallel” construct instead of “kernels”, since loops in a parallel region are considered independent by default.

Note that several of the loops do contain dependencies so do be careful when adding “independent”. For example:

      jj = 1-jdim 
      do 8466 ii=1,kv 
      jj = jj+jdim 
 8466 t(kj+ii-1,25+l) = -res(j+jj-1,1,i,l)

“jj” in the inner loop depends upon the previous loop iteration for the current value.

Also, for the following loop:

        ic=0 
!$acc kernels loop      
        do 8558 ipl=1,npl 
          ii=i+ipl-1 
          do 8558 k=1,kdim 
            ic=ic+1 
            if(k .ne. kdim) then

The incrementing of “ic” will prevent the entire loop from being parallelized. To fix, I’d recommend computing “ic” based on the loop index values.

Hope this helps,
Mat

Thanks for your reply. I modified my code. It works well on the CPU. However, another error occurred at run time. The code and the error output are shown below.

      kv = npl*kdim
!$acc data copy(t,res,q,dtj,vol,sj,vist3d)
      do j=1,jdim1
      kj = (j-1)*kv+1
      jx = 1-jdim
!$acc kernels loop independent present(t,res,q,dtj,vol)
      do ii=1,kv
        jj = jx+jdim*ii
        t(kj+ii-1,26) = -res(j+jj-1,1,i,1)
        t(kj+ii-1,27) = -res(j+jj-1,1,i,2)
        t(kj+ii-1,28) = -res(j+jj-1,1,i,3)
        t(kj+ii-1,29) = -res(j+jj-1,1,i,4)
        t(kj+ii-1,30) = -res(j+jj-1,1,i,5)
        t(kj+ii-1,6) = q(j+jj-1,1,i,1)
        t(kj+ii-1,7) = q(j+jj-1,1,i,2)
        t(kj+ii-1,8) = q(j+jj-1,1,i,3)
        t(kj+ii-1,9) = q(j+jj-1,1,i,4)
        t(kj+ii-1,10) = q(j+jj-1,1,i,5)
        t(kj+ii-1,21) = tfacp1*dtj(j+jj-1,1,i)
        if(ivisc(2) .gt. 0) then
          t(kj+ii-1,12)=vol(j+jj-1,1,i)
        end if
      end do    
!$acc end kernels      
      end do
!$acc end data

In the code, ‘ivisc’ is a variable in a common block.

node   4 is terminating the program due to a cfl3d error check
 see file cfl3d.error
--------------------------------------------------------------------------
MPI_ABORT was invoked on rank 4 in communicator MPI_COMM_WORLD
with errorcode 4.

NOTE: invoking MPI_ABORT causes Open MPI to kill all MPI processes.
You may or may not see output from other processes, depending on
exactly when Open MPI kills them.

node 4 is terminating the program due to a cfl3d error check

This appears to be a program error where it’s most likely doing some type of self consistency check and seeing that the code is getting bad results. Hence, we should be looking for things that would produce incorrect answers.

I’m not seeing any race conditions offhand.

While scalars are private by default, one exception is when the scalar has global storage (i.e. it is a module variable or in a common block). Is “jj” a module variable? If so, add it to a private clause on the loop.

Are you managing “ivisc” in another data region? If not, is the compiler implicitly copying it in (as would be shown in the compiler feedback)?

Could the code be writing beyond the bounds of the arrays?

-Mat