Hi Mat,

Thank you very much for this detailed answer. I understand better the issue now.

I still have some questions though concerning the memory allocation and heap space: does subroutine variables with “intent(in or out)” are stored in the global memory (variables A,B,C from my previous examples) and the local variable in the heap space (variable buff from my examples) ? Does the heap space is shared by all the threads, and therefore the heap space needed increases with the number of threads called ?

Currently I have a subroutine which works correctly until a certain number of threads. It works for 512 threads but for 1024 my output variable remains untouched (identical as before calling the subroutine) without any error message. And increasing the heap space doesn’t solve the problem:

```
integer(kind=cuda_count_kind) :: val
istat = cudaDeviceGetLimit(val,cudaLimitMallocHeapSize)
istat = cudaDeviceSetLimit(cudaLimitMallocHeapSize,10*val)
```

So I don’t really know where the issue might come from. Subroutine too big ?

Here is my function if you want to take a look at it :

```
attributes(global) subroutine Section2_km(g_num,g_coord,el_sd,ip_w,dee, &
storkm, nels,nn,nod,nip,ndim,nst,ndof,nodof)
implicit none
integer,value,intent(in) :: nels,nn,nod,nip,ndim,nst,ndof,nodof
integer,device,intent(in) :: g_num(nod,nels)
real(iwp),device,intent(in) :: g_coord(ndim,nn),ip_w(nip), &
el_sd(ndim,nod,nip), dee(nst,nst)
real(iwp),device,intent(out) :: storkm(ndof,ndof,nels)
real(iwp),device :: zero=0.0_iwp,jac(3,3),jacinv(3,3), &
bee(nst,ndof), det, deriv(ndim,nod)
! ndof<60, nod<20, nst=6, ndim=3
integer,value :: iel,ipt,i,j,k,n,m,l,istat
iel = blockDim%x * (blockIdx%x - 1) + threadIdx%x
IF (iel <= nels) THEN; storkm(:,:,iel)=zero;
DO ipt=1,nip
! coord=g_coord(:,g_num(:,iel))
! der=el_sd(:,:,ipt)
! jac=MATMUL(der,coord)
jac=zero
DO i=1,ndim; DO j=1,ndim; DO k=1,nod
jac(i,j)=jac(i,j)+el_sd(i,k,ipt)*g_coord(j,g_num(k,iel))
END DO; END DO; END DO
det=jac(1,1)*(jac(2,2)*jac(3,3)-jac(3,2)*jac(2,3)) &
-jac(1,2)*(jac(2,1)*jac(3,3)-jac(3,1)*jac(2,3)) &
+jac(1,3)*(jac(2,1)*jac(3,2)-jac(3,1)*jac(2,2))
jacinv(1,1)=( jac(2,2)*jac(3,3)-jac(3,2)*jac(2,3))/det
jacinv(2,1)=(-jac(2,1)*jac(3,3)+jac(3,1)*jac(2,3))/det
jacinv(3,1)=( jac(2,1)*jac(3,2)-jac(3,1)*jac(2,2))/det
jacinv(1,2)=(-jac(1,2)*jac(3,3)+jac(3,2)*jac(1,3))/det
jacinv(2,2)=( jac(1,1)*jac(3,3)-jac(3,1)*jac(1,3))/det
jacinv(3,2)=(-jac(1,1)*jac(3,2)+jac(3,1)*jac(1,2))/det
jacinv(1,3)=( jac(1,2)*jac(2,3)-jac(2,2)*jac(1,3))/det
jacinv(2,3)=(-jac(1,1)*jac(2,3)+jac(2,1)*jac(1,3))/det
jacinv(3,3)=( jac(1,1)*jac(2,2)-jac(2,1)*jac(1,2))/det
! deriv=MATMUL(jacinv,der)
deriv=zero
DO i=1,ndim; DO j=1,nod; DO k=1,ndim
deriv(i,j)=deriv(i,j)+jacinv(i,k)*el_sd(k,j,ipt)
END DO; END DO; END DO
bee=zero
DO m=1,nod; n=3*m; k=n-1; l=k-1
bee(1,l)=deriv(1,m); bee(4,k)=deriv(1,m); bee(6,n)=deriv(1,m)
bee(2,k)=deriv(2,m); bee(4,l)=deriv(2,m); bee(5,n)=deriv(2,m)
bee(3,n)=deriv(3,m); bee(5,k)=deriv(3,m); bee(6,l)=deriv(3,m)
END DO
! km=km+MATMUL(MATMUL(TRANSPOSE(bee),dee),bee)*det*ip_w(ipt)
! storkm(:,:,iel) = km
DO i=1,ndof; DO j=1,ndof
DO k=1,nst; DO l=1,nst
storkm(i,j,iel)=storkm(i,j,iel)+bee(l,i)*dee(l,k)*bee(k,j)*det*ip_w(ipt)
! istat = atomicadd( storkm(i,j,iel) , bee(l,i)*dee(l,k)*bee(k,j)*det*ip_w(ipt) )
END DO; END DO;
END DO; END DO
END DO
END IF
end subroutine Section2_km
```

Concerning OpenACC vs CUDA this is not really my call. I’m working on an extension of an already existing code in cuda-fortran (Fraser Kirk might ring a bell to you ? he told me you met and helped him).

Thank you again for your great support,

Sincerely,

Remy