Hi guys,

I got a code up and running today on CUDA Fortran, but I have one problem that I’m not able to figure out and need some suggestions for how to debug. Here is the problem section of the code:

```
DO j = 1,n_steps
CALL build_SC<<<grid_sz,block_sz>>>(Super_set_1_d, Super_set_2_d, Super_set_3_d, Super_set_4_d, Phi2_d, PHI_d, Phi1_d, termsdim4_d, SC_d, NCRYS, SUP_IND)
CALL mmul<<<dim3(NCRYS/16,1,1),dim3(16,9,1)>>>( SC_d, FoMat_d(:,1:9), StressMat_d(:,1:9), NCRYS, 2*SUP_IND, 9 )
.
.
.
StressMat = StressMat_d
.
.
.
w_12,w_13,w_23 updated
.
.
.
CALL QMAT<<<blocks_texture,threads_texture>>>(Phi2_d, PHI_d, Phi1_d)
CALL RMAT<<<blocks_texture,threads_texture>>>(w_12_d, w_13_d, w_23_d)
iostat = cudaDeviceSynchronize()
CALL G_flag<<<blocks_texture,threads_texture>>>(Q_p_sam_d, Phi2_d, PHI_d, Phi1_d, j, n_steps)
ENDDO
```

The main gist of the kernel Module is:

```
MODULE CUDA_Kernels
USE Variables
IMPLICIT NONE
REAL(RP), DIMENSION(3,3,NCRYS), DEVICE :: QMATT, RMATT
CONTAINS
!=================================================================================
!=================================================================================
attributes(global) SUBROUTINE build_SC(SS1, SS2, SS3, SS4, P1, P2, P3, P4, SC, n, m)
END SUBROUTINE build_SC
!=================================================================================
!=================================================================================
attributes(global) SUBROUTINE mmul( A, B, C, N, M, L )
END SUBROUTINE mmul
!=================================================================================
!=================================================================================
attributes(global) SUBROUTINE QMAT(Phi2, PHI, Phi1)
IMPLICIT NONE
REAL(RP), DIMENSION(NCRYS), DEVICE ::PHI, Phi1,Phi2
INTEGER :: i
INTEGER :: tx, bx
REAL(RP), SHARED :: to_rad
!Total shared memory used per block is 6*256*8 = 12288 out of 49152
REAL(RP), DIMENSION(256), SHARED :: SPhi1, CPhi1, SPHI, CPHI, SPhi2, CPhi2
tx = threadidx%x !will be 1-256
bx = blockdim%x !will be 256
i = (blockidx%x-1) * bx + tx
to_rad = 1.e-03_rp * (PI / 180._rp)
IF (i<=NCRYS) THEN
ENDIF
RETURN
END SUBROUTINE QMAT
!=================================================================================
!=================================================================================
attributes(global) SUBROUTINE RMAT(w21_rec, w31_rec, w32_rec)
IMPLICIT NONE
REAL(RP), DIMENSION(NCRYS), DEVICE :: w21_rec, w31_rec, w32_rec
INTEGER :: i
INTEGER :: tx, bx
!Total shared memory used is 7*256*8 = 14336 out of 49152
REAL(RP), DIMENSION(256), SHARED :: ang, axis_1, axis_2, axis_3, w21_recs, w31_recs, w32_recs
tx = threadidx%x !will be 1-256
bx = blockdim%x !will be 256
i = (blockidx%x-1) * bx + tx
IF (i<=NCRYS) THEN
ENDIF
RETURN
END SUBROUTINE RMAT
!=================================================================================
!=================================================================================
attributes(global) SUBROUTINE G_flag(Q_p_sam,Phi2, PHI, Phi1, flag, n_steps)
IMPLICIT NONE
INTEGER, VALUE, INTENT(IN) :: flag, n_steps
REAL(RP), DIMENSION(9), DEVICE :: Q_p_sam
REAL(RP), DIMENSION(NCRYS,9), DEVICE :: G
REAL(RP), DIMENSION(NCRYS), DEVICE :: PHI, Phi1,Phi2
INTEGER :: i
INTEGER :: tx, bx
!Total shared memory used is 9*256*8 + 9*8 + 256*9*8 +256*3*8 + 9*8= 45128 out of 49152
REAL(RP), SHARED :: sQ_p_sam(9), angles(256,3), GG(9), twoPI
REAL(RP), DIMENSION(256), SHARED :: QMATs_11, QMATs_12, QMATs_13, QMATs_21, &
QMATs_22, QMATs_23, QMATs_31, QMATs_32, QMATs_33
REAL(RP), DIMENSION(256), SHARED :: Rs_11, Rs_12, Rs_13, Rs_21, Rs_22, Rs_23, &
Rs_31, Rs_32, Rs_33
REAL(RP) :: to_rad, to_deg
tx = threadidx%x !will be 1-256
bx = blockdim%x !will be 256
i = (blockidx%x-1) * bx + tx
sQ_p_sam = Q_p_sam
to_rad = 1.e-03_rp * (PI / 180._rp)
to_deg = 1._rp / to_rad
twoPI = 2._rp*PI
IF (i<=NCRYS) THEN
QMATs_11(tx) = QMATT(1,1,i)
QMATs_12(tx) = QMATT(1,2,i)
QMATs_13(tx) = QMATT(1,3,i)
QMATs_21(tx) = QMATT(2,1,i)
QMATs_22(tx) = QMATT(2,2,i)
QMATs_23(tx) = QMATT(2,3,i)
QMATs_31(tx) = QMATT(3,1,i)
QMATs_32(tx) = QMATT(3,2,i)
QMATs_33(tx) = QMATT(3,3,i)
Rs_11(tx) = RMATT(1,1,i)
Rs_12(tx) = RMATT(1,2,i)
Rs_13(tx) = RMATT(1,3,i)
Rs_21(tx) = RMATT(2,1,i)
Rs_22(tx) = RMATT(2,2,i)
Rs_23(tx) = RMATT(2,3,i)
Rs_31(tx) = RMATT(3,1,i)
Rs_32(tx) = RMATT(3,2,i)
Rs_33(tx) = RMATT(3,3,i)
G(i,1) = Rs_11(tx)*QMATs_11(tx) + Rs_21(tx)*QMATs_21(tx) + Rs_31(tx)*QMATs_31(tx)
G(i,2) = Rs_12(tx)*QMATs_11(tx) + Rs_22(tx)*QMATs_21(tx) + Rs_32(tx)*QMATs_31(tx)
G(i,3) = Rs_13(tx)*QMATs_11(tx) + Rs_23(tx)*QMATs_21(tx) + Rs_33(tx)*QMATs_31(tx)
G(i,4) = Rs_11(tx)*QMATs_12(tx) + Rs_21(tx)*QMATs_22(tx) + Rs_31(tx)*QMATs_32(tx)
G(i,5) = Rs_12(tx)*QMATs_12(tx) + Rs_22(tx)*QMATs_22(tx) + Rs_32(tx)*QMATs_32(tx)
G(i,6) = Rs_13(tx)*QMATs_12(tx) + Rs_23(tx)*QMATs_22(tx) + Rs_33(tx)*QMATs_32(tx)
G(i,7) = Rs_11(tx)*QMATs_13(tx) + Rs_21(tx)*QMATs_23(tx) + Rs_31(tx)*QMATs_33(tx)
G(i,8) = Rs_12(tx)*QMATs_13(tx) + Rs_22(tx)*QMATs_23(tx) + Rs_32(tx)*QMATs_33(tx)
G(i,9) = Rs_13(tx)*QMATs_13(tx) + Rs_23(tx)*QMATs_23(tx) + Rs_33(tx)*QMATs_33(tx)
IF (flag == n_steps) THEN
GG(:) = G(i,:)
G(i,1) = sQ_p_sam(1)*GG(1) + sQ_p_sam(4)*GG(2) + sQ_p_sam(7)*GG(3)
G(i,2) = sQ_p_sam(2)*GG(1) + sQ_p_sam(5)*GG(2) + sQ_p_sam(8)*GG(3)
G(i,3) = sQ_p_sam(3)*GG(1) + sQ_p_sam(6)*GG(2) + sQ_p_sam(9)*GG(3)
G(i,4) = sQ_p_sam(1)*GG(4) + sQ_p_sam(4)*GG(5) + sQ_p_sam(7)*GG(6)
G(i,5) = sQ_p_sam(2)*GG(4) + sQ_p_sam(5)*GG(5) + sQ_p_sam(8)*GG(6)
G(i,6) = sQ_p_sam(3)*GG(4) + sQ_p_sam(6)*GG(5) + sQ_p_sam(9)*GG(6)
G(i,7) = sQ_p_sam(1)*GG(7) + sQ_p_sam(4)*GG(8) + sQ_p_sam(7)*GG(9)
G(i,8) = sQ_p_sam(2)*GG(7) + sQ_p_sam(5)*GG(8) + sQ_p_sam(8)*GG(9)
G(i,9) = sQ_p_sam(3)*GG(7) + sQ_p_sam(6)*GG(8) + sQ_p_sam(9)*GG(9)
ENDIF
IF (abs(G(i,9)) == 1) THEN
angles(tx,1) = ACOS(G(i,1))
angles(tx,2) = ACOS(G(i,9))
angles(tx,3) = 0._rp
IF (G(i,2) < 0) THEN
angles(tx,1) = twoPI - angles(tx,1)
END IF
ELSE
angles(tx,1) = ATAN2(G(i,7),-1*G(i,8))
angles(tx,2) = ACOS(G(i,9))
angles(tx,3) = ATAN2(G(i,3),G(i,6))
ENDIF
IF (angles(tx,1) < 0) THEN
angles(tx,1) = angles(tx,1)+ twoPI
ENDIF
IF (angles(tx,2) < 0) THEN
angles(tx,2) = angles(tx,2)+ twoPI
ENDIF
IF (angles(tx,3) < 0) THEN
angles(tx,3) = angles(tx,3)+ twoPI
ENDIF
angles(tx,:) = NINT( to_deg * angles(tx,:) )
Phi1(i) = angles(tx,1)
PHI(i) = angles(tx,2)
Phi2(i) = angles(tx,3)
ENDIF
RETURN
END SUBROUTINE G_flag
END MODULE CUDA_Kernels
```

Where the arrays with *_d are allocated on the device from the host and HosttoDevice transfers are done from pinned allocated arrays on host. The other kernel parameters are passed as integer values.

QMATT and RMATT are updated in kernels QMAT and RMAT, then used in G_flag to update the Phi values. After the Phi values are updated the process loops. Everything in the loop except StressMat stays on the device for the duration of the loop.

The kernels are all programmed to use arrays that are powers of 2. Using array size of 1024 and 2048 return the correct results, with no errors. However, when I increase the array size to 4096 I get,

copyout Memcpy (host=0x200000000, dev=0x1b20420000, size=294912) FAILED: 4(unspecified launch failure),

where the size 294912 matches the StressMat = StressMat_d transfer (sizeof(double)*9*4096). When I increase the array size to 8192 i get an invalid argument error directly after the call to G_flag. I tried running G_flag with no arguments and I still received the same error (there is no error before G_flag). With array size 8192 the program completes, only the kernel G_flag does not run. Increasing the array size by powers of 2 from 8192 has same result.

I’m on a Kepler K20.

I checked memory usage right before G_flag launch and I’m nowhere near the card’s capacity.

These are the compiler tags from my makefile:

FLAGS = -V13.4 -g #fast -Mconcur=innermost -mp

FLAGS_CUDA =-Mcuda=cuda5.0,cc35 -tp:x64 #-Mcuda=cuda5.0,cc35,rdc -tp:x64

F90=pgf90[/code]

Thanks!