Hello,
I wrote a code that transforms a symmetric matrix in the upper triangular stored csr format to general csr format without using the transformation to the dense matrix. When I run the code, it gives results as they should be. However, when I parallelize the code using OpenACC, it gives wrong results. I could not figure out why. Could you help me? I shared the program and Makefile below.
The program
PROGRAM symtogen
USE OPENACC
!$ USE OMP_LIB
IMPLICIT NONE
INTEGER, PARAMETER :: NNA = 10
INTEGER, PARAMETER :: N = 5
INTEGER :: NNZ
DOUBLE PRECISION, DIMENSION(NNA) :: A
INTEGER, DIMENSION(N+1) :: IA
INTEGER, DIMENSION(NNA) :: JA
INTEGER :: C
INTEGER, DIMENSION(NNA) :: JAtrue
LOGICAL, DIMENSION(NNA) :: logJA
INTEGER, DIMENSION(N) :: COUNTER
INTEGER :: T
DOUBLE PRECISION, ALLOCATABLE :: Z(:)
INTEGER, ALLOCATABLE :: IZ(:)
INTEGER, ALLOCATABLE :: IZinc(:)
INTEGER, ALLOCATABLE :: JZ(:)
INTEGER :: k,l,i,p
NNZ = 2*NNA - N
A = (/ 2.0d0, 1.0d0, 2.0d0, 5.0d0, 1.0d0, -2.0d0, 3.0d0, 4.0d0, 2.0d0, 3.0d0 /)
IA = (/ 1, 4, 6, 8, 10, 11 /)
JA = (/ 1, 2, 5, 2, 4, 3, 5, 4, 5, 5 /)
! 2 1 0 0 2
! 0 5 0 1 0
! 0 0 -2 0 3
! 0 0 0 4 2
! 0 0 0 0 3
ALLOCATE(Z(NNZ))
ALLOCATE(IZ(N+1))
ALLOCATE(IZinc(N+1))
ALLOCATE(JZ(NNZ))
! Determining IZ ------------------------------------------------------------------------
IZ(1)= 1
!$ACC DATA COPYIN(JA,IA) CREATE(JAtrue,logJA,IZinc) COPYOUT(IZinc)
!$ACC PARALLEL LOOP
DO k = 1,N
! c finds the count of a value (k) in JA
JAtrue = k
logJA = JA .EQ. JAtrue
C = COUNT(logJA)
IZinc(k) = IA(k+1) - IA(k) + C - 1
END DO !k
!$ACC END PARALLEL LOOP
!$ACC END DATA
DO k = 1,N
IZ(k+1) = IZ(k) + IZinc(k)
END DO !k
COUNTER = 0
Z = 0.0D0
JZ = 0
!$ACC DATA COPYIN(JA,IA,A,COUNTER) COPY(Z,IZ,JZ)
!$ACC PARALLEL LOOP
DO k = 1, NNA
Z(IZ(JA(k))+COUNTER(JA(k))) = A(k)
DO i = 1, N
IF ( k >= IA(i) .AND. k < IA(i+1)) THEN
JZ(IZ(JA(k))+COUNTER(JA(k))) = i
END IF
END DO
COUNTER(JA(k))= COUNTER(JA(k)) + 1
END DO !k
!$ACC END PARALLEL LOOP
!$ACC PARALLEL LOOP
DO l = 1, N
Z(IZ(l)+COUNTER(l)-1 : IZ(l+1)-1) = A(IA(l) : IA(l+1)-1)
JZ(IZ(l)+COUNTER(l)-1 : IZ(l+1)-1) = JA(IA(l) : IA(l+1)-1)
END DO !l
!$ACC END PARALLEL LOOP
!$ACC END DATA
write(*,*) "Z"
DO k = 1,NNZ
write(*,*) Z(k)
END DO
write(*,*) "IZ"
DO k = 1,N+1
write(*,*) IZ(k)
END DO
write(*,*) "JZ"
DO k = 1,NNZ
write(*,*) JZ(k)
END DO
DEALLOCATE(Z)
DEALLOCATE(IZ)
DEALLOCATE(IZinc)
DEALLOCATE(JZ)
END PROGRAM
The Makefile
FC=nvfortran
TIMER=/usr/bin/time
OPT=
NOPT=-fast -Minfo=opt $(OPT)
FCFLAGS = -Mpreprocess -fast -acc=gpu -cuda -cudalib=cusparse
symtogen: symtogen.o
$(TIMER) ./symtogen.o $(STEPS)
symtogen.o: symtogen.f90
$(FC) $(FCFLAGS) -o $@ $< $(NOPT) -Minfo=accel -acc
clean:
rm -f *.o *.exe *.s *.mod a.out
Thank you,
Yunus Altıntop