I have debugged to found the error point to the line of the code “ERR = cudaMemcpyAsync(DEN,m_DEN,dm_NPRT)”
both DEN and m_DEN are declared and the memories are also allocated.They are device and host memory, respectively.
cudaMemcpyAsync are also occurred before the line and those code can work through.
And the code can work in my home computer (Cuda5.5/K20), But it not in my office computer(CUDA7.5/GK110B).
The following is the message occurred when I run the executable program with cuda-memcheck.
…
========= Invalid shared write of size 16
========= at 0x00000250 in nbl_cal_neighborelist_devkernel2c_
========= by thread (0,0,0) in block (14,0,0)
========= Address 0x00000004 is misaligned
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/nvidia/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15865d]
========= Host Frame:/opt/pgi/linux86-64/2015/cuda/7.5/lib64/libcudart.so.7.5 [0x146ad]
========= Host Frame:/opt/pgi/linux86-64/2015/cuda/7.5/lib64/libcudart.so.7.5 (cudaLaunch + 0x143) [0x2ece3]
========= Host Frame:./fasMD.exe [0xe8260]
========= Host Frame:./fasMD.exe [0xe7946]
========= Host Frame:./fasMD.exe [0xe3315]
========= Host Frame:./fasMD.exe [0x5002]
========= Host Frame:./fasMD.exe [0x77fe]
========= Host Frame:./fasMD.exe [0x3b34]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
========= Host Frame:./fasMD.exe [0x3a29]
Checking A003
0: copyout MemcpyAsync (host=0x203707e00, dev=0x704b27e00, size=32000, stream=0) FAILED: 4(unspecified launch failure)
========= ERROR SUMMARY: 168 errors
[baoqin@oums-sfgpu MD]$ cuda-memcheck ./fasMD.exe ipa.thfu 0 1
Some friends from the internet tell me the error may not come from cudaMemcpyAsync,but may come from the kernel subroutine of cal_neighborelist_devkernel2c. And then I check the kernel.
First, I comment out the kernel subroutine, the program can work through. So I think the error should come from the kernel.
Then, I comment part of the kernel or/and add some demo line to check the errror. But it is not successful. And I can’t found the error point. could someone help me? and I will post the kernel subroutine below
use constant
implicit none
!
!— DUMMY VARIABLES
integer, value::NBPC,IP0, NC,NCX,NCY,NCZ, PDX, PDY, PDZ, NPART, CFROM, CTO, mxNAPDEV, mxKVOIS, IA1th0
real(KINDDF), value::cra011,cra021,cra031,cra012,cra022,cra032,cra013,cra023,cra033
real(KINDDF), device::XP(3,NPART)
integer, device::ITYP(NPART), NAC(NC), IA1th(NC)
integer,device::KVOIS(mxNAPDEV)
integer,device::INDI(mxKVOIS,mxNAPDEV)
integer(1),device::mvl(mxKVOIS,mxNAPDEV)
!--- Local variables
!nonshared by threads
real(KINDSF)::POS(3), SEP(3)
integer::IB, IB0, IT, IA, IA0,IA00, JA, NN, I, J, K, ITY
!variables share by all thread
integer,shared::NB, IC, IS0, STARTA, NCXY, NCXYZ,IX0, IY0, IZ0,IC0, NACC0
integer,shared::NS,NACC, IAC, IACE, FROM, TO
integer, shared, dimension(mp_NNC)::CID, IX,IY, IZ, OUT
integer(1), shared, dimension(mp_NNC)::I_mvl
real(KINDSF), shared, dimension(3,mp_NNC)::CXYZ
real(KINDSF), shared, dimension(3,mp_BLOCKSIZE)::SPOS
real(KINDSF), shared::RC2(mp_MXGROUP,mp_MXGROUP)
integer, shared, dimension(mp_BLOCKSIZE)::JTY
IB = (blockidx%y-1) * griddim%x + blockidx%x-1
NB = blockdim%x*blockdim%y
IB0 = IB/NBPC
IP0 = (IB-IB0*NBPC)*NB
IB0 = IB0 + CFROM-1
if(IB0 .GE. CTO) return
IT = (threadidx%y-1)*blockdim%x + threadidx%x
if(IT .EQ. 1) then
RC2(1:mp_MXGROUP,1:mp_MXGROUP) = dcm_RU2(1:mp_MXGROUP,1:mp_MXGROUP)
NCXY = NCX*NCY
NCXYZ = NCXY*NCZ
IS0 = IB0/NCXYZ
IC = IB0-IS0*NCXYZ
IZ0 = IC/NCXY
IY0 = (IC-IZ0*NCXY)/NCX
IX0 = IC-IZ0*NCXY-IY0*NCX
IZ0 = IZ0 + 1
IY0 = IY0 + 1
IX0 = IX0 + 1
IC = IB0 + 1
STARTA = IA1th0 ! IA1th(CFROM)
NACC0 = NAC(IC)
end if
call syncthreads()
if(NACC0 .LE. 0) return
if(IT .LE. mp_NNC) then
OUT(IT)= 0
IZ(IT) = IZ0+mp_NIZ(IT)
IY(IT) = IY0+mp_NIY(IT)
IX(IT) = IX0+mp_NIX(IT)
I_mvl(IT) = 0
CXYZ(1:3,IT) = 0.0d0
If(PDX .AND. IT.GT.1) Then
IF( IX(IT).GT.NCX )THEN
IX(IT) = 1
CXYZ(1,IT) = CXYZ(1,IT) + cra011
CXYZ(2,IT) = CXYZ(2,IT) + cra021
CXYZ(3,IT) = CXYZ(3,IT) + cra031
I_mvl(IT) = ior( I_mvl(IT) , 1)
ELSE IF (IX(IT).LT.1) THEN
IX(IT) = NCX
CXYZ(1,IT) = CXYZ(1,IT) - cra011
CXYZ(2,IT) = CXYZ(2,IT) - cra021
CXYZ(3,IT) = CXYZ(3,IT) - cra031
I_mvl(IT) = ior( I_mvl(IT) , 2)
ENDIF
End If
If(PDY .AND. IT.GT.1) Then
IF( IY(IT).GT.NCY )THEN
IY(IT) = 1
CXYZ(1,IT) = CXYZ(1,IT) + cra012
CXYZ(2,IT) = CXYZ(2,IT) + cra022
CXYZ(3,IT) = CXYZ(3,IT) + cra032
I_mvl(IT) = ior( I_mvl(IT) , 4)
ELSE IF (IY(IT).LT.1) THEN
IY(IT) = NCY
CXYZ(1,IT) = CXYZ(1,IT) - cra012
CXYZ(2,IT) = CXYZ(2,IT) - cra022
CXYZ(3,IT) = CXYZ(3,IT) - cra032
I_mvl(IT) = ior( I_mvl(IT) , 8)
ENDIF
End If
If(PDZ .AND. IT.GT.1) Then
IF( IZ(IT).GT.NCZ )THEN
IZ(IT) = 1
CXYZ(1,IT) = CXYZ(1,IT) + cra013
CXYZ(2,IT) = CXYZ(2,IT) + cra023
CXYZ(3,IT) = CXYZ(3,IT) + cra033
I_mvl(IT) = ior( I_mvl(IT) , 16)
ELSE IF (IZ(IT).LT.1) THEN
IZ(IT) = NCZ
CXYZ(1,IT) = CXYZ(1,IT) - cra013
CXYZ(2,IT) = CXYZ(2,IT) - cra023
CXYZ(3,IT) = CXYZ(3,IT) - cra033
I_mvl(IT) = ior( I_mvl(IT) , 32)
ENDIF
End If
IF( IX(IT) .GT. NCX .OR. IX(IT) .LT. 1) OUT(IT) = 1
IF( IY(IT) .GT. NCY .OR. IY(IT) .LT. 1) OUT(IT) = 1
IF( IZ(IT) .GT. NCZ .OR. IZ(IT) .LT. 1) OUT(IT) = 1
IF(OUT(IT) .EQ. 0) then
CID(IT) = NCXY*(IZ(IT)-1)+NCX*(IY(IT)-1)+IX(IT)+IS0*NCXYZ
ELSE
CID(IT) = 0
END IF
END IF
call syncthreads()
IA = (IT-1) + IA1th(IC) + IP0
IA0 = IA - STARTA + 1
IA00 = IA - IA1th(IC)+1
if(IA00.LE.NACC0) then
POS(1:3)=XP(1:3,IA)
ITY = ITYP(IA)
NN = 0
endif
K=1
NACC = NAC(CID(K))
NS = (NACC-1)/NB+1
IAC = IA1th(CID(K))
IACE = IAC + NACC -1
DO J=1, NS, 1
FROM = min((J-1)*NB+IAC,IACE)
TO = min(FROM+NB-1, IACE)
if(IT+FROM-1<=IACE)then
SPOS(1:3,IT) = XP(1:3, IT+FROM-1) + CXYZ(1:3,K)
JTY(IT) = ITYP(IT+FROM-1)
end if
call syncthreads()
IF(IA00.LE.NACC0) then
DO I=FROM, TO
JA = I-FROM+1
SEP(1:3) = POS(1:3) - SPOS(1:3,JA)
if( SEP(1)*SEP(1)+SEP(2)*SEP(2)+SEP(3)*SEP(3) .LE. RC2(ITY,JTY(JA)) ) then
if(I.NE.IA) then
if(NN<mxKVOIS) then
NN = NN + 1
INDI(NN,IA0) = I
mvl(NN,IA0) = I_mvl(K)
end if
end if
end if
END DO
END IF
call syncthreads()
END DO
DO K=2, mp_NNC
IF(OUT(K)) cycle
NACC = NAC(CID(K))
NS = min((NACC-1)/NB+1, NACC)
IAC = IA1th(CID(K))
IACE = IAC + NACC -1
call syncthreads()
DO J=1, NS
FROM = min((J-1)*NB+IAC,IACE)
TO = min(FROM+NB-1, IACE)
if(IT+FROM-1<=IACE)then
SPOS(1:3,IT) = XP(1:3, IT+FROM-1) + CXYZ(1:3,K)
JTY(IT) = ITYP(IT+FROM-1)
endif
call syncthreads()
IF(IA00.LE.NACC0) then
DO I=FROM, TO
JA = I-FROM+1
SEP(1:3) = POS(1:3) - SPOS(1:3,JA)
if( SEP(1)*SEP(1)+SEP(2)*SEP(2)+SEP(3)*SEP(3) .LE. RC2(ITY,JTY(JA)) ) then
if(NN<mxKVOIS) then
NN = NN + 1
INDI(NN, IA0) = I
mvl(NN, IA0) = I_mvl(K)
end if
end if
END DO
END IF
call syncthreads()
END DO
END DO
IF(IA00 .LE. NACC0) then
KVOIS(IA0) = NN
END IF
return