Hi everyone,
I have a large parallel Fortran code with OpenACC directives in it. It contains a subroutine that is completely executed on the GPU. This routine in turns calls a C routine. When I use the Cray Fortran and C compilers on the Titan supercomputer, the code runs fine and produces the correct output. When I try to use PGI version 15.7 compilers, the code blows up. I have traced the problem to the C routine called from Fortran. If I replace this C routine with an equivalent Fortran subroutine, the code compiled with the PGI compiler runs fine. But I want to keep using the C routine because it gives me much better performance than the Fortran version. The entire C routine called from Fortran is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define mat_elem(a, r, c, n) (a + ((r) * (n) + (c)))
#pragma acc routine seq
void inline swap_row(double *a, double *b, int r1, int r2, int n, int m)
{
double tmp, *p1, *p2;
int i;
if (r1 == r2) return;
for (i = 0; i < n; i++) {
p1 = mat_elem(a, r1, i, n);
p2 = mat_elem(a, r2, i, n);
tmp = *p1, *p1 = *p2, *p2 = tmp;
}
for (i = 0; i < m; i++) {
p1 = mat_elem(b, r1, i, m);
p2 = mat_elem(b, r2, i, m);
tmp = *p1, *p1 = *p2, *p2 = tmp;
}
}
#pragma acc routine seq
extern “C” void gauss_eliminate(double *a, double *b, double *x, int n, int m)
{
#define A(r, c) (*mat_elem(a, r, c, n))
#define B(r, c) (*mat_elem(b, r, c, m))
#define X(r, c) (*mat_elem(x, r, c, m))
int i, j, col, row, max_row,dia;
double max, tmp;
for (dia = 0; dia < n; dia++) {
max_row = dia, max = A(dia, dia);
for (row = dia + 1; row < n; row++)
if ((tmp = fabs(A(row, dia))) > max)
max_row = row, max = tmp;
swap_row(a, b, dia, max_row, n, m);
for (row = dia + 1; row < n; row++) {
tmp = A(row, dia) / A(dia, dia);
for (col = dia+1; col < n; col++)
A(row, col) -= tmp * A(dia, col);
A(row, dia) = 0;
for (col = 0; col < m; col++)
B(row,col) -= tmp * B(dia,col);
}
}
for (row = n - 1; row >= 0; row–) {
for (j = n - 1; j > row; j–) {
for (col = 0; col < m; col++) {
//printf(“row=%d j=%d col=%d x=%f\n”, row, j, col, X(j,col));
//printf(“row=%d j=%d col=%d x=%.16f\n”, row, j, col, X(row, j));
B(row,col) -= X(j,col) * A(row, j);
//printf(“row=%d j=%d col=%d b=%.16f x=%f\n”, row, j, col, B(row,col), X(j,col));
}
}
for (col = 0; col < m; col++) {
//printf(“row=%d col=%d b=%.16f a=%.16f x=%.16f\n”, row, col, B(row,col), A(row,row), X(row,col));
X(row,col) = B(row,col) / A(row,row);
//printf(“row=%d col=%d b=%.16f a=%.16f x=%.16f\n”, row, col, B(row,col), A(row,row), X(row,col));
}
}
#undef A
#undef B
#undef X
}
The Fortran subroutine calls the routine named ‘gauss_eliminate’ in the above C code. The calling Fortran routine contains the following interface:
INTERFACE
SUBROUTINE gauss_eliminate(a, b, x, n, m) BIND(C)
USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_DOUBLE, C_INT
IMPLICIT NONE
real(C_DOUBLE), dimension(n,n), intent(in ) :: a
real(C_DOUBLE), dimension(m,n), intent(in ) :: b
real(C_DOUBLE), dimension(m,n), intent(inout) :: x
INTEGER(C_INT), VALUE :: n
INTEGER(C_INT), VALUE :: m
END SUBROUTINE gauss_eliminate
END INTERFACE
and also has the following declaration:
!$acc routine (gauss_eliminate) seq
Does anyone have any ideas why the C routine creates a problem for the PGI compiler?