Hi,
I have a program that calls a CUDA kernel function in its innermost subroutine; the call chain is as follows: A->B->C->D
However, allocating device data is very time-consuming. I want to allocate the device data once in the outer subroutine and then pass it down to the inner subroutines, so that the device data is not allocated repeatedly.
Subroutines A, B, C, D are as follows:
! Subroutine A: outermost routine of the A->B->C->D call chain.
! Declares the device work array work_d(mwork) and passes it to B twice:
! once as the whole array and once as the single array element
! work_d(nstart+1+maxbl).
! NOTE(review): B declares both dummies as assumed-shape (w_d(:), wk_d(:)).
! Passing an array element to stand for "the array starting here"
! (sequence association) is only valid when the dummy is explicit-shape
! or assumed-size; an assumed-shape dummy also needs an explicit
! interface in the caller (none is visible here). This is a likely
! contributor to the reported SIGSEGV -- confirm against the full source.
subroutine A(work,mwork,arg1,arg2,...)
! dummy array of length mwork; it carries no device attribute
dimension work(mwork)
! local device array whose extent is the dummy argument mwork
real, device :: work_d(mwork)
! other statements...
call B(work_d,work_d(nstart+1+maxbl),...)
end subroutine
! Subroutine B: receives two device arrays from A and forwards pieces of
! them to C, each piece identified by a starting element (w_d(lq),
! w_d(lsj), ..., wk_d(lres), wk_d(ltot), ...).
! NOTE(review): every actual argument in the call to C is a single array
! element, while C declares the matching dummies as assumed-shape arrays
! of rank 1 to 4. An array element cannot be associated with an
! assumed-shape dummy; this starting-element style requires
! explicit-shape or assumed-size dummies in C. Likely related to the
! reported SIGSEGV -- confirm.
subroutine B(w_d,wk_d,arg1,arg2,...)
! both dummies are rank-1 assumed-shape device arrays
real, intent(inout), device :: w_d(:),wk_d(:)
! other statements...
call C(w_d(lq),w_d(lsj),w_d(lsk),w_d(lsi),w_d(lvol),
. w_d(lsnk0),wk_d(lres),wk_d(ltot),w_d(lqj0),
. w_d(lqk0),w_d(lvolj0),w_d(lvolk0),...)
end subroutine
! Subroutine C: receives device arrays from B and forwards them to D,
! some as whole arrays and some as starting elements (wk_d(iwk1),
! res_d(1,2), res_d(1,3), wk_d(iwk4), wk_d(iwk32), wk_d(iwk33)).
! NOTE(review): the ranks declared here do not all match the ranks D
! declares for the same positional arguments:
!   sj_d, sk_d, si_d : rank 2 here, rank 4 in D
!   vol_d            : rank 1 here, rank 3 in D
!   qj0_d, qk0_d     : rank 3 here, rank 4 in D
! Assumed-shape dummies require the actual argument to have the same
! rank, and they cannot be associated with a single array element.
! Likely related to the reported SIGSEGV -- confirm.
subroutine C(q_d,sj_d,sk_d,si_d,vol_d,snk0_d,res_d,wk_d,qj0_d,
. qk0_d,volj0_d,volk0_d,arg1,arg2,...)
! all array dummies are assumed-shape device arrays
real, intent(inout), device ::
. q_d(:,:,:,:),
. si_d(:,:),
. sj_d(:,:),
. sk_d(:,:),
. vol_d(:),
. snk0_d(:,:,:),
. res_d(:,:),
. wk_d(:),
. qj0_d(:,:,:),
. qk0_d(:,:,:),
. volj0_d(:,:,:),
. volk0_d(:,:,:)
! other statements...
call D(q_d,sj_d,sk_d,si_d,vol_d,snk0_d,wk_d(iwk1),res_d(1,2),
+ res_d(1,3),wk_d(iwk4),wk_d(iwk32),wk_d(iwk33),qj0_d,
+ qk0_d,volj0_d,volk0_d,...)
end subroutine
! Subroutine D: innermost routine; launches the CUDA kernel on the device
! arrays it receives from C and then calls checkCudaError.
! NOTE(review): all array dummies are assumed-shape (rank 3 or 4). C
! passes several of them as single array elements or as arrays of a
! different rank, which assumed-shape dummies do not permit; declaring
! them explicit-shape (with extents passed as arguments) or assumed-size
! is the usual way to accept such arguments. Likely related to the
! reported SIGSEGV -- confirm.
subroutine D(q_d,sj_d,sk_d,si_d,vol_d,smin_d,turre_d,damp1_d,
+ blend_d,fnu_d,rhside_d,v3dtmp_d,qj0_d,qk0_d,volj0_d,volk0_d,arg1,
+ arg2,...)
! all array dummies are assumed-shape device arrays
real, intent(inout), device ::
+ sk_d(:,:,:,:),
+ vol_d(:,:,:),
+ turre_d(:,:,:,:),
+ damp1_d(:,:,:),
+ sj_d(:,:,:,:),
+ si_d(:,:,:,:),
+ smin_d(:,:,:),
+ q_d(:,:,:,:),
+ fnu_d(:,:,:),
+ blend_d(:,:,:),
+ v3dtmp_d(:,:,:),
+ rhside_d(:,:,:,:),
+ qk0_d(:,:,:,:),
+ volk0_d(:,:,:),
+ qj0_d(:,:,:,:),
+ volj0_d(:,:,:)
! call cuda kernel in this subroutine
! the device dummies are passed straight through as kernel arguments;
! grid and block are not declared in the visible code
call kernel<<<grid,block>>>(q_d,sj_d,sk_d,si_d,vol_d,smin_d,
+ turre_d,damp1_d,blend_d,fnu_d,rhside_d,v3dtmp_d,qj0_d,qk0_d,
+ volj0_d,volk0_d,...)
call checkCudaError()
end subroutine
However, running the above code produces a runtime error: signal SIGSEGV, segmentation fault. How do I properly pass the device data to the inner subroutines?
Thanks!
Zhou Heng