Hi,
45% of my code is in a “A*x” sparse matrix-vector product.
I have spent some amount of time playing with loop clauses, tiles, cache, etc and so far the fastest solution was to just use “loop,loop,loop” as shown below.
I am wondering if there has been an example 3D stencil code where the optimal clauses was found.
Any other ways to optimize the below code?
(I am using Titan XP, P100, and V100 GPUs and PGI 17.9)
!$acc parallel default(present) present(ps) private(ii)
!$acc loop
do k=2,npm1
!$acc loop
do j=2,ntm1
!$acc loop
do i=2,nrm-1
ii=ntm2*(nrm-2)*(k-2)+(nrm-2)*(j-2)+(i-1)
q(ii)=a_r( 1,i,j,k)*ps%r(i ,j ,k-1)
& +a_r( 2,i,j,k)*ps%r(i ,j-1,k )
& +a_r( 3,i,j,k)*ps%r(i-1,j ,k )
& +a_r( 4,i,j,k)*ps%r(i ,j ,k )
& +a_r( 5,i,j,k)*ps%r(i+1,j ,k )
& +a_r( 6,i,j,k)*ps%r(i ,j+1,k )
& +a_r( 7,i,j,k)*ps%r(i ,j ,k+1)
& +a_r( 8,i,j,k)*ps%t(i ,j-1,k )
& +a_r( 9,i,j,k)*ps%t(i+1,j-1,k )
& +a_r(10,i,j,k)*ps%t(i ,j ,k )
& +a_r(11,i,j,k)*ps%t(i+1,j ,k )
& +a_r(12,i,j,k)*ps%p(i ,j ,k-1)
& +a_r(13,i,j,k)*ps%p(i+1,j ,k-1)
& +a_r(14,i,j,k)*ps%p(i ,j ,k )
& +a_r(15,i,j,k)*ps%p(i+1,j ,k )
enddo
enddo
enddo
c
c ****** Get the t component.
c
!$acc loop
do k=2,npm1
!$acc loop
do j=2,ntm-1
!$acc loop
do i=2,nrm1
ii=(npm2*ntm2*(nrm-2))
& +(ntm-2)*nrm2*(k-2)+nrm2*(j-2)+(i-1)
q(ii)=
& a_t( 1,i,j,k)*ps%r(i-1,j ,k )
& +a_t( 2,i,j,k)*ps%r(i ,j ,k )
& +a_t( 3,i,j,k)*ps%r(i-1,j+1,k )
& +a_t( 4,i,j,k)*ps%r(i ,j+1,k )
& +a_t( 5,i,j,k)*ps%t(i ,j ,k-1)
& +a_t( 6,i,j,k)*ps%t(i ,j-1,k )
& +a_t( 7,i,j,k)*ps%t(i-1,j ,k )
& +a_t( 8,i,j,k)*ps%t(i ,j ,k )
& +a_t( 9,i,j,k)*ps%t(i+1,j ,k )
& +a_t(10,i,j,k)*ps%t(i ,j+1,k )
& +a_t(11,i,j,k)*ps%t(i ,j ,k+1)
& +a_t(12,i,j,k)*ps%p(i ,j ,k-1)
& +a_t(13,i,j,k)*ps%p(i ,j+1,k-1)
& +a_t(14,i,j,k)*ps%p(i ,j ,k )
& +a_t(15,i,j,k)*ps%p(i ,j+1,k )
enddo
enddo
enddo
c
c ****** Get the p component.
c
!$acc loop
do k=2,npm-1
!$acc loop
do j=2,ntm1
!$acc loop
do i=2,nrm1
ii=(npm2*ntm2*(nrm-2))+(npm2*(ntm-2)*nrm2)
& +ntm2*nrm2*(k-2)+nrm2*(j-2)+(i-1)
q(ii)=
& a_p( 1,i,j,k)*ps%r(i-1,j ,k )
& +a_p( 2,i,j,k)*ps%r(i ,j ,k )
& +a_p( 3,i,j,k)*ps%r(i-1,j ,k+1)
& +a_p( 4,i,j,k)*ps%r(i ,j ,k+1)
& +a_p( 5,i,j,k)*ps%t(i ,j-1,k )
& +a_p( 6,i,j,k)*ps%t(i ,j ,k )
& +a_p( 7,i,j,k)*ps%t(i ,j-1,k+1)
& +a_p( 8,i,j,k)*ps%t(i ,j ,k+1)
& +a_p( 9,i,j,k)*ps%p(i ,j ,k-1)
& +a_p(10,i,j,k)*ps%p(i ,j-1,k )
& +a_p(11,i,j,k)*ps%p(i-1,j ,k )
& +a_p(12,i,j,k)*ps%p(i ,j ,k )
& +a_p(13,i,j,k)*ps%p(i+1,j ,k )
& +a_p(14,i,j,k)*ps%p(i ,j+1,k )
& +a_p(15,i,j,k)*ps%p(i ,j ,k+1)
enddo
enddo
enddo
!$acc end parallel