How to pass the device data into inner subroutines?

Hi,
I have a program called cuda kernel function in the inner subroutine, the call relationship is as follows: A->B->C->D

However, it is very time-consuming to malloc device data. I want to create device data in the outer subroutine, and then pass the device data to the inner subroutine to avoid frequent malloc of device data.

Subroutines A, B, C, D are as follows:

      subroutine A(work,mwork,arg1,arg2,...)
        dimension work(mwork)
        real, device :: work_d(mwork)
!       othre statements...

        call B(work_d,work_d(nstart+1+maxbl),...)
      end subroutine 

      subroutine B(w_d,wk_d,arg1,arg2,...)
        real, intent(inout), device :: w_d(:),wk_d(:)
!       othre statements... 

        call C(w_d(lq),w_d(lsj),w_d(lsk),w_d(lsi),w_d(lvol),
     .         w_d(lsnk0),wk_d(lres),wk_d(ltot),w_d(lqj0),
     .         w_d(lqk0),w_d(lvolj0),w_d(lvolk0),...)
      end subroutine 

      subroutine C(q_d,sj_d,sk_d,si_d,vol_d,snk0_d,res_d,wk_d,qj0_d,
     .  qk0_d,volj0_d,volk0_d,arg1,arg2,...)
        real, intent(inout), device :: 
     .    q_d(:,:,:,:),
     .    si_d(:,:),
     .    sj_d(:,:),
     .    sk_d(:,:),
     .    vol_d(:),
     .    snk0_d(:,:,:),
     .    res_d(:,:),
     .    wk_d(:),
     .    qj0_d(:,:,:),
     .    qk0_d(:,:,:),
     .    volj0_d(:,:,:),
     .    volk0_d(:,:,:)
!       othre statements...

        call D(q_d,sj_d,sk_d,si_d,vol_d,snk0_d,wk_d(iwk1),res_d(1,2),
     +         res_d(1,3),wk_d(iwk4),wk_d(iwk32),wk_d(iwk33),qj0_d,
     +         qk0_d,volj0_d,volk0_d,...)
      end subroutine 

      subroutine D(q_d,sj_d,sk_d,si_d,vol_d,smin_d,turre_d,damp1_d,
     +  blend_d,fnu_d,rhside_d,v3dtmp_d,qj0_d,qk0_d,volj0_d,volk0_d,arg1,
     +  arg2,...)
        real, intent(inout), device :: 
     +  sk_d(:,:,:,:),
     +  vol_d(:,:,:), 
     +  turre_d(:,:,:,:),
     +  damp1_d(:,:,:),
     +  sj_d(:,:,:,:),
     +  si_d(:,:,:,:),
     +  smin_d(:,:,:),
     +  q_d(:,:,:,:),
     +  fnu_d(:,:,:),
     +  blend_d(:,:,:),
     +  v3dtmp_d(:,:,:),
     +  rhside_d(:,:,:,:),
     +  qk0_d(:,:,:,:),
     +  volk0_d(:,:,:),
     +  qj0_d(:,:,:,:),
     +  volj0_d(:,:,:)

!       call cuda kernel in this subroutine
        call kernel<<<grid,block>>>(q_d,sj_d,sk_d,si_d,vol_d,smin_d,
     +      turre_d,damp1_d,blend_d,fnu_d,rhside_d,v3dtmp_d,qj0_d,qk0_d,
     +      volj0_d,volk0_d,...)
        call checkCudaError()
      end subroutine

But running the above code will get a runtime error, signal SIGSEGV, Segmentation fault. How to properly pass the device data reference to the inner subroutine?

Thanks!
Zhou Heng

First off, your interfaces between A and B, B and C, etc should be explicit (in the Fortran sense) so the caller knows how the callee declares its dummy arguments. Use modules or interface blocks to do that.

Second, you will have better performance if you do not use assumed-shape dummy arguments (declared with : for bounds). Pass the bounds for each array in as an input argument.

If you do those two things, you should be able to pass device arrays between host subroutines without any problems.

Thank you for your reply.
I try to add interface block in caller subroutine. However, a Segmentation fault (core dumped) error occurred during runtime.
The code is as follow:

      subroutine A(work,mwork,arg1,arg2,...)
        interface
        subroutine B(w_d,wk_d,arg1,arg2,...)
          real, device :: w_d(mgwk),wk_d(nwork)
!         define othre arguments...           
        end subroutine 
        end interface

        dimension work(mwork)
        real, device :: work_d(mwork)
!       othre statements...

        call B(work_d,work_d(nstart+1+maxbl),...)
      end subroutine 

      subroutine B(w_d,wk_d,arg1,arg2,...)
        real, intent(inout), device :: w_d(:),wk_d(:)
        interface 
        subroutine C(q_d,sj_d,sk_d,si_d,vol_d,snk0_d,res_d,wk_d,qj0_d,
     .  qk0_d,volj0_d,volk0_d,arg1,arg2,...)
          real, device :: 
     .      q_d(jdim,kdim,idim,5),
     .      si_d(jdim*kdim*idim,5),
     .      sj_d(jdim*kdim*(idim-1),5),
     .      sk_d(jdim*kdim*(idim-1),5),
     .      vol_d(jdim*kdim*(idim-1)),
     .      snk0_d(jdim-1,kdim-1,idim-1),
     .      res_d(jdim*kdim*(idim-1),5),
     .      wk_d(nwork),
     .      qj0_d(kdim*(idim-1),5,4),
     .      qk0_d(jdim*(idim-1),5,4),
     .      volj0_d(kdim,idim-1,4),
     .      volk0_d(jdim,idim-1,4)
!           define othre arguments...
        end subroutine
        end interface
!       othre statements... 

        call C(w_d(lq),w_d(lsj),w_d(lsk),w_d(lsi),w_d(lvol),
     .         w_d(lsnk0),wk_d(lres),wk_d(ltot),w_d(lqj0),
     .         w_d(lqk0),w_d(lvolj0),w_d(lvolk0),...)
      end subroutine 

      subroutine C(q_d,sj_d,sk_d,si_d,vol_d,snk0_d,res_d,wk_d,qj0_d,
     .  qk0_d,volj0_d,volk0_d,arg1,arg2,...)
        interface 
        subroutine D(q_d,sj_d,sk_d,si_d,vol_d,smin_d,turre_d,damp1_d,
     +  blend_d,fnu_d,rhside_d,v3dtmp_d,qj0_d,qk0_d,volj0_d,volk0_d,arg1,
     +  arg2,...)
          real, device :: 
     +      sk_d(jdim,kdim,idim-1,5),
     +      vol_d(jdim,kdim,idim-1),
     +      turre_d(0-iex:jdim+iex,0-iex:kdim+iex,0-iex2:idim+iex2,2),
     +      damp1_d(jdim-1,kdim-1,idim-1),
     +      sj_d(jdim,kdim,idim-1,5),
     +      si_d(jdim,kdim,idim,5),
     +      smin_d(jdim-1,kdim-1,idim-1),
     +      q_d(jdim,kdim,idim,5),
     +      fnu_d(0:jdim,0:kdim,0-iex3:idim+iex3),
     +      blend_d(jdim-1,kdim-1,idim-1),
     +      v3dtmp_d(0:jdim,0:kdim,0-iex3:idim+iex3),
     +      rhside_d(jdim-1,kdim-1,idim-1,2),
     +      qk0_d(jdim,idim-1,5,4),
     +      volk0_d(jdim,idim-1,4),
     +      qj0_d(kdim,idim-1,5,4),
     +      volj0_d(kdim,idim-1,4)
!           define othre arguments...     
        end subroutine
        end interface

        real, device :: 
     +    sk_d(jdim,kdim,idim-1,5),
     +    vol_d(jdim,kdim,idim-1),
     +    turre_d(0-iex:jdim+iex,0-iex:kdim+iex,0-iex2:idim+iex2,2),
     +    damp1_d(jdim-1,kdim-1,idim-1),
     +    sj_d(jdim,kdim,idim-1,5),
     +    si_d(jdim,kdim,idim,5),
     +    smin_d(jdim-1,kdim-1,idim-1),
     +    q_d(jdim,kdim,idim,5),
     +    fnu_d(0:jdim,0:kdim,0-iex3:idim+iex3),
     +    blend_d(jdim-1,kdim-1,idim-1),
     +    v3dtmp_d(0:jdim,0:kdim,0-iex3:idim+iex3),
     +    rhside_d(jdim-1,kdim-1,idim-1,2),
     +    qk0_d(jdim,idim-1,5,4),
     +    volk0_d(jdim,idim-1,4),
     +    qj0_d(kdim,idim-1,5,4),
     +    volj0_d(kdim,idim-1,4)
!       othre statements...

        call D(q_d,sj_d,sk_d,si_d,vol_d,snk0_d,wk_d(iwk1),res_d(1,2),
     +         res_d(1,3),wk_d(iwk4),wk_d(iwk32),wk_d(iwk33),qj0_d,
     +         qk0_d,volj0_d,volk0_d,...)
      end subroutine 

      subroutine D(q_d,sj_d,sk_d,si_d,vol_d,smin_d,turre_d,damp1_d,
     +  blend_d,fnu_d,rhside_d,v3dtmp_d,qj0_d,qk0_d,volj0_d,volk0_d,arg1,
     +  arg2,...)
        real, device :: 
     +    sk_d(jdim,kdim,idim-1,5),
     +    vol_d(jdim,kdim,idim-1),
     +    turre_d(0-iex:jdim+iex,0-iex:kdim+iex,0-iex2:idim+iex2,2),
     +    damp1_d(jdim-1,kdim-1,idim-1),
     +    sj_d(jdim,kdim,idim-1,5),
     +    si_d(jdim,kdim,idim,5),
     +    smin_d(jdim-1,kdim-1,idim-1),
     +    q_d(jdim,kdim,idim,5),
     +    fnu_d(0:jdim,0:kdim,0-iex3:idim+iex3),
     +    blend_d(jdim-1,kdim-1,idim-1),
     +    v3dtmp_d(0:jdim,0:kdim,0-iex3:idim+iex3),
     +    rhside_d(jdim-1,kdim-1,idim-1,2),
     +    qk0_d(jdim,idim-1,5,4),
     +    volk0_d(jdim,idim-1,4),
     +    qj0_d(kdim,idim-1,5,4),
     +    volj0_d(kdim,idim-1,4)

!       call cuda kernel in this subroutine
        call kernel<<<grid,block>>>(q_d,sj_d,sk_d,si_d,vol_d,smin_d,
     +      turre_d,damp1_d,blend_d,fnu_d,rhside_d,v3dtmp_d,qj0_d,qk0_d,
     +      volj0_d,volk0_d,...)
        call checkCudaError()
      end subroutine

I comment the interface block and the program run normally. It still spent 128 seconds, compared to the serial program 26 seconds.

The analysis using the PGI Profiler is as follows: cuMalloc is called in subroutine A and cuMalloc is called in D.

  8.76%  32.9331s  cuMemAlloc_v2
  8.76%  32.9331s  | ???
  8.76%  32.9331s  |   ???
  8.76%  32.9331s  |     cudaMalloc
  8.76%  32.9331s  |       pgf90_dev_mkdesc
  8.76%  32.9331s  |         twoeqn_
  8.76%  32.9331s  |           resid_
  8.76%  32.9331s  |             mgblk_
  8.76%  32.9331s  |               mgbl_
  8.76%  32.9331s  |                 cfl3d_
  8.76%  32.9331s  |                   MAIN_
  8.76%  32.9331s  |                     main
  8.76%  32.9331s  |                       ???
  3.03%  11.3818s  cuMemFree_v2
  3.03%  11.3818s  | ???
  3.03%  11.3818s  |   cudaFree
  3.03%  11.3818s  |     pgf90_dev_auto_dealloc
  3.03%  11.3818s  |       twoeqn_
  3.03%  11.3818s  |         resid_
  3.03%  11.3818s  |           mgblk_
  3.03%  11.3818s  |             mgbl_
  3.03%  11.3818s  |               cfl3d_
  3.03%  11.3818s  |                 MAIN_
  3.03%  11.3818s  |                   main
  3.03%  11.3818s  |                     ???

Should I pass a device pointer as a parameter to a subroutine?

Thanks!
Zhou Heng

[/code]

You shouldn’t declare an interface block this way:

interface
subroutine B(w_d,wk_d,arg1,arg2,…)
real, device :: w_d(mgwk),wk_d(nwork)
! define othre arguments…
end subroutine
end interface

And then write your actual subroutine declaring the arrays this way:
real, device :: w_d(:)

These are two different methods for passing arguments. That’s why you get the seg fault.

Hi, thank you for your reply.

I defined the device arguments in the subroutine as dummy arguments. The code looks like this:

      subroutine A(work,mwork,arg1,arg2,...)
        interface
        subroutine B(w_d,wk_d,arg1,arg2,...)
          real, device :: w_d(mgwk),wk_d(nwork)
!         define othre arguments...           
        end subroutine 
        end interface

        dimension work(mwork)
        real, device :: work_d(mwork)
!       othre statements...

        call B(work_d,work_d(nstart+1+maxbl),...)
      end subroutine 

      subroutine B(w_d,wk_d,arg1,arg2,...)
        interface 
        subroutine C(q_d,sj_d,sk_d,si_d,vol_d,snk0_d,res_d,wk_d,qj0_d,
     .  qk0_d,volj0_d,volk0_d,arg1,arg2,...)
          real, device :: 
     .      q_d(jdim,kdim,idim,5),
     .      si_d(jdim*kdim*idim,5),
     .      sj_d(jdim*kdim*(idim-1),5),
     .      sk_d(jdim*kdim*(idim-1),5),
     .      vol_d(jdim*kdim*(idim-1)),
     .      snk0_d(jdim-1,kdim-1,idim-1),
     .      res_d(jdim*kdim*(idim-1),5),
     .      wk_d(nwork),
     .      qj0_d(kdim*(idim-1),5,4),
     .      qk0_d(jdim*(idim-1),5,4),
     .      volj0_d(kdim,idim-1,4),
     .      volk0_d(jdim,idim-1,4)
!           define othre arguments...
        end subroutine
        end interface
        
        real, device :: w_d(:),wk_d(:)

!       othre statements... 

        call C(w_d(lq),w_d(lsj),w_d(lsk),w_d(lsi),w_d(lvol),
     .         w_d(lsnk0),wk_d(lres),wk_d(ltot),w_d(lqj0),
     .         w_d(lqk0),w_d(lvolj0),w_d(lvolk0),...)
      end subroutine 

      subroutine C(q_d,sj_d,sk_d,si_d,vol_d,snk0_d,res_d,wk_d,qj0_d,
     .  qk0_d,volj0_d,volk0_d,arg1,arg2,...)
        interface 
        subroutine D(q_d,sj_d,sk_d,si_d,vol_d,smin_d,turre_d,damp1_d,
     +  blend_d,fnu_d,rhside_d,v3dtmp_d,qj0_d,qk0_d,volj0_d,volk0_d,arg1,
     +  arg2,...)
          real, device :: 
     +      sk_d(jdim,kdim,idim-1,5),
     +      vol_d(jdim,kdim,idim-1),
     +      turre_d(0-iex:jdim+iex,0-iex:kdim+iex,0-iex2:idim+iex2,2),
     +      damp1_d(jdim-1,kdim-1,idim-1),
     +      sj_d(jdim,kdim,idim-1,5),
     +      si_d(jdim,kdim,idim,5),
     +      smin_d(jdim-1,kdim-1,idim-1),
     +      q_d(jdim,kdim,idim,5),
     +      fnu_d(0:jdim,0:kdim,0-iex3:idim+iex3),
     +      blend_d(jdim-1,kdim-1,idim-1),
     +      v3dtmp_d(0:jdim,0:kdim,0-iex3:idim+iex3),
     +      rhside_d(jdim-1,kdim-1,idim-1,2),
     +      qk0_d(jdim,idim-1,5,4),
     +      volk0_d(jdim,idim-1,4),
     +      qj0_d(kdim,idim-1,5,4),
     +      volj0_d(kdim,idim-1,4)
!           define othre arguments...     
        end subroutine
        end interface

        real, device :: 
     .    q_d(:,:,:,:),
     .    si_d(:,:),
     .    sj_d(:,:),
     .    sk_d(:,:),
     .    vol_d(:),
     .    snk0_d(:,:,:),
     .    res_d(:,:),
     .    wk_d(:),
     .    qj0_d(:,:,:),
     .    qk0_d(:,:,:),
     .    volj0_d(:,:,:),
     .    volk0_d(:,:,:)
!       othre statements...

        call D(q_d,sj_d,sk_d,si_d,vol_d,snk0_d,wk_d(iwk1),res_d(1,2),
     +         res_d(1,3),wk_d(iwk4),wk_d(iwk32),wk_d(iwk33),qj0_d,
     +         qk0_d,volj0_d,volk0_d,...)
      end subroutine 

      subroutine D(q_d,sj_d,sk_d,si_d,vol_d,smin_d,turre_d,damp1_d,
     +  blend_d,fnu_d,rhside_d,v3dtmp_d,qj0_d,qk0_d,volj0_d,volk0_d,arg1,
     +  arg2,...)
        use cudaModule
        real, device :: 
     +    sk_d(:,:,:,:),
     +    vol_d(:,:,:),
     +    turre_d(:,:,:,:),
     +    damp1_d(:,:,:),
     +    sj_d(:,:,:,:),
     +    si_d(:,:,:,:),
     +    smin_d(:,:,:),
     +    q_d(:,:,:,:),
     +    fnu_d(:,:,:),
     +    blend_d(:,:,:),
     +    v3dtmp_d(:,:,:),
     +    rhside_d(:,:,:,:),
     +    qk0_d(:,:,:,:),
     +    volk0_d(:,:,:),
     +    qj0_d(:,:,:,:),
     +    volj0_d(:,:,:)
     
!       the definition of kernel subroutine is in cudaModule
!       call cuda kernel in this subroutine
        call kernel<<<grid,block>>>(q_d,sj_d,sk_d,si_d,vol_d,smin_d,
     +      turre_d,damp1_d,blend_d,fnu_d,rhside_d,v3dtmp_d,qj0_d,qk0_d,
     +      volj0_d,volk0_d,...)
        call checkCudaError()
      end subroutine

However, seg fault still occurs at runtime.
Does it support passing device array slices as parameters?

Many thanks!


Maybe I’m making it too complicated. My first boss told me one working program was worth a thousand pages of manuals, so here is one working program. I think this is what you are trying to do:

program a
real, device :: awrk(5558)
real :: ah(5
558)
integer idx(8)
do i = 1, 8
idx(i) = 1 + (i-1)555
end do
call b(awrk(idx(1)), &
awrk(idx(2)), &
awrk(idx(3)), &
awrk(idx(4)), &
awrk(idx(5)), &
awrk(idx(6)), &
awrk(idx(7)), &
awrk(idx(8)),5,5,5)
ah = awrk
do i = 1, 8
ilo = idx(i); ihi = idx(i)+5
55-1
print ,i,sum(ah(ilo:ihi))/(55
5)
end do
end program
subroutine b(aa,ab,ac,ad,ae,af,ag,ah,k,n,m)
real, device, dimension(k,n,m) :: aa,ab,ac,ad,ae,af,ag,ah
print ,"in B, k,n,m are ",k,n,m
call c(aa,ab,ac,ad,ae,af,ag,ah,k,n,m)
end subroutine
subroutine c(aa,ab,ac,ad,ae,af,ag,ah,k,n,m)
real, device, dimension(k,n,m) :: aa,ab,ac,ad,ae,af,ag,ah
print ,"in C, k,n,m are ",k,n,m
!$cuf kernel do(3) <<<
,
>>>
do im = 1, m
do in = 1, n
do ik = 1, k
aa(ik,in,im) = 1.0
ab(ik,in,im) = 2.0
ac(ik,in,im) = 3.0
ad(ik,in,im) = 4.0
ae(ik,in,im) = 5.0
af(ik,in,im) = 6.0
ag(ik,in,im) = 7.0
ah(ik,in,im) = 8.0
end do
end do
end do
end subroutine

Thank you very much!
The problem was the description of the arguments. I changed the arguments description to the following:

subroutine A(arg1,arg2,...)
  real, device, dimension(jdim,kdim,idim)::arg1
  real, device, dimension(jdim,kdim,idim)::arg2
end subroutine

Successfully passed the device data of the outer subroutine to the inner subroutine. It just call cuMalloc one time.

==26435== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 14.85%  613.28ms     97503  6.2890us     960ns  1.1752ms  [CUDA memcpy HtoD]
 14.57%  602.06ms      7500  80.274us  75.617us  90.818us  cudamodule_twoeqn_rhside_8_
 13.11%  541.60ms      7500  72.213us  57.921us  126.75us  cudamodule_twoeqn_rhside_1_
 11.24%  464.36ms      7500  61.914us  58.593us  69.377us  cudamodule_twoeqn_rhside_6_
 11.22%  463.32ms      7500  61.775us  58.465us  69.441us  cudamodule_twoeqn_rhside_2_
 11.21%  462.96ms      7500  61.727us  58.240us  68.513us  cudamodule_twoeqn_rhside_3_
  5.30%  219.00ms      7500  29.199us  27.169us  32.160us  cudamodule_twoeqn_rhside_4_
  4.75%  196.39ms      7500  26.184us  24.736us  31.040us  cudamodule_twoeqn_rhside_9_
  4.25%  175.38ms      7500  23.383us  22.017us  27.745us  cudamodule_twoeqn_damp1_2_
  4.07%  168.26ms      7500  22.434us  21.120us  26.785us  cudamodule_twoeqn_damp1_1_
  2.87%  118.62ms      7500  15.816us  14.688us  19.424us  cudamodule_twoeqn_rhside_7_
  2.56%  105.75ms     22503  4.6990us  2.3360us  1.3000ms  [CUDA memcpy DtoH]

==26435== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 81.99%  5.65289s    120006  47.105us  5.2480us  30.456ms  cudaMemcpy
  9.44%  650.97ms     75000  8.6790us  5.1510us  568.32us  cudaLaunchKernel
  8.23%  567.37ms         1  567.37ms  567.37ms  567.37ms  cudaMalloc
  0.32%  22.023ms     75000     293ns     187ns  250.05us  cudaGetLastError
  0.01%  813.75us         1  813.75us  813.75us  813.75us  cuDeviceTotalMem
  0.01%  451.52us        91  4.9610us     361ns  136.40us  cuDeviceGetAttribute
  0.00%  196.77us         1  196.77us  196.77us  196.77us  cudaFree
  0.00%  38.439us         1  38.439us  38.439us  38.439us  cuDeviceGetName
  0.00%  7.9060us         3  2.6350us     669ns  5.6000us  cuDeviceGetCount
  0.00%  2.3540us         3     784ns     615ns  1.0830us  cuDeviceGet