Hello,
I’m compiling the toy code below with the flags
-Mcuda -ta=nvidia,wait,time
If natoms is greater than 472 I get the following error message:
line 23: cudaEventSynchronize returned status 4: unspecified launch failure
Am I exceeding a resource limitation on the Tesla C2075?
- Sarom
program test
implicit none
integer irad, iang, iatom, jatom, igridpoint
integer, parameter :: natoms = 473
integer, parameter :: nradpt = 96
integer, parameter :: nlebpt = 1202
double precision, dimension(natoms) :: tempa
double precision, dimension(natoms,natoms) :: tempb
double precision, dimension(natoms,nradpt*nlebpt) :: tempc
double precision, dimension(nradpt*nlebpt) :: tempd
double precision valuea,valueb,valuec,valued,valuee
do irad=1,nradpt
!$acc data region
!$acc> copyout(tempc(1:natoms,(irad-1)*nlebpt+1:(irad-1)*nlebpt+nlebpt))
!$acc region
!$acc do parallel, private(tempa(1:natoms),tempb(1:natoms,1:natoms))
do iang=1,nlebpt
igridpoint=(irad-1)*nlebpt+iang
do iatom=1,natoms
tempa(iatom)=1.0D+00
do jatom=1,natoms
tempb(jatom,iatom)=1.0D+00
if (iatom .eq. jatom) cycle
valuea=dble(jatom)/(22.0D+00/7.00D+00)
valueb=sin(valuea)
valuec=dble(iatom)/(22.0D+00/7.00D+00)
valued=cos(valuec)
valuee=abs(atan(valued/valueb))
tempb(jatom,iatom)=valuee**4.0D+00
enddo
tempa(iatom)=product(tempb(1:natoms,iatom),1)
enddo
tempc(1:natoms,igridpoint)=tempa(1:natoms)
enddo
!$acc end region
!$acc end data region
enddo
do irad=1,nradpt
do iang=1,nlebpt
igridpoint=(irad-1)*nlebpt+iang
tempd(igridpoint)=sum(tempc(1:natoms,igridpoint),1)*
> dble(igridpoint)/dble(nlebpt*nradpt)
enddo
enddo
write(*,*) 'SUM:= ', sum(tempd(1:nradpt*nlebpt),1)
end
Build output
------ Build started: Project: test-loop-b, Configuration: Debug x64 ------
Compiling Project ...
test-b.f
test:
17, Generating copyout(tempc(:,(irad-1)*1202+1:(irad-1)*1202+1202))
21, Loop is parallelizable
23, Loop is parallelizable
Accelerator kernel generated
21, !$acc do parallel ! blockidx%y
Using register for 'tempa'
23, !$acc do parallel, vector(256) ! blockidx%x threadidx%x
25, Loop is parallelizable
35, product reduction inlined
Loop is parallelizable
37, Loop is parallelizable
Accelerator kernel generated
21, !$acc do parallel ! blockidx%y
37, !$acc do parallel, vector(256) ! blockidx%x threadidx%x
46, sum reduction inlined
51, sum reduction inlined
Linking...
test-loop-b build succeeded.
Build log was saved at "file://C:\gamessVS\11.28.2011\test\test-loop-b\x64\Debug\BuildLog.htm"