Hi everyone,

I have a large parallel Fortran code with OpenACC directives in it. It contains a subroutine that is completely executed on the GPU. This routine in turns calls a C routine. When I use the Cray Fortran and C compilers on the Titan supercomputer, the code runs fine and produces the correct output. When I try to use PGI version 15.7 compilers, the code blows up. I have traced the problem to the C routine called from Fortran. If I replace this C routine with an equivalent Fortran subroutine, the code compiled with the PGI compiler runs fine. But I want to keep using the C routine because it gives me much better performance than the Fortran version. The entire C routine called from Fortran is as follows:

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#define mat_elem(a, r, c, n) (a + (® * (n) + ©))

#pragma acc routine seq

void inline swap_row(double *a, double *b, int r1, int r2, int n, int m)

{

double tmp, *p1, *p2;

int i;

if (r1 == r2) return;

for (i = 0; i < n; i++) {

p1 = mat_elem(a, r1, i, n);

p2 = mat_elem(a, r2, i, n);

tmp = *p1, *p1 = *p2, *p2 = tmp;

}

for (i = 0; i < m; i++) {

p1 = mat_elem(b, r1, i, m);

p2 = mat_elem(b, r2, i, m);

tmp = *p1, *p1 = *p2, *p2 = tmp;

}

}

#pragma acc routine seq

extern “C” void gauss_eliminate(double *a, double *b, double *x, int n, int m)

{

#define A(r, c) (*mat_elem(a, r, c, n))

#define B(r, c) (*mat_elem(b, r, c, m))

#define X(r, c) (*mat_elem(x, r, c, m))

int i, j, col, row, max_row,dia;

double max, tmp;

for (dia = 0; dia < n; dia++) {

max_row = dia, max = A(dia, dia);

for (row = dia + 1; row < n; row++)

if ((tmp = fabs(A(row, dia))) > max)

max_row = row, max = tmp;

swap_row(a, b, dia, max_row, n, m);

for (row = dia + 1; row < n; row++) {

tmp = A(row, dia) / A(dia, dia);

for (col = dia+1; col < n; col++)

A(row, col) -= tmp * A(dia, col);

A(row, dia) = 0;

for (col = 0; col < m; col++)

B(row,col) -= tmp * B(dia,col);

}

}

## for (row = n - 1; row >= 0; row–) {

for (j = n - 1; j > row; j–) {

for (col = 0; col < m; col++) {

//printf(“row=%d j=%d col=%d x=%f\n”, row, j, col, X(j,col));

//printf(“row=%d j=%d col=%d x=%.16f\n”, row, j, col, X(row, j));

B(row,col) -= X(j,col) * A(row, j);

//printf(“row=%d j=%d col=%d b=%.16f x=%f\n”, row, j, col, B(row,col), X(j,col));

}

}

for (col = 0; col < m; col++) {

//printf(“row=%d col=%d b=%.16f a=%.16f x=%.16f\n”, row, col, B(row,col), A(row,row), X(row,col));

X(row,col) = B(row,col) / A(row,row);

//printf(“row=%d col=%d b=%.16f a=%.16f x=%.16f\n”, row, col, B(row,col), A(row,row), X(row,col));

}

}

#undef A

#undef B

#undef X

}

The Fortran subroutine calls the routine named ‘gauss_eliminate’ in the above C code. The calling Fortran routine contains the following interface:

INTERFACE

SUBROUTINE gauss_eliminate(a, b, x, n, m) BIND©

USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_DOUBLE, C_INT

IMPLICIT NONE

real(C_DOUBLE), dimension(n,n), intent(in ) :: a

real(C_DOUBLE), dimension(m,n), intent(in ) :: b

real(C_DOUBLE), dimension(m,n), intent(inout) :: x

INTEGER(C_INT), VALUE :: n

INTEGER(C_INT), VALUE :: m

END SUBROUTINE gauss_eliminate

END INTERFACE

and also has the following declaration:

!$acc routine (gauss_eliminate) seq

Does anyone have any ideas why the C routine creates a problem for the PGI compiler?