# Compiling the HPL benchmark using accelerator directives

I am trying to compile the High Performance Linpack (HPL) benchmark to run on GPUs using PGI accelerator directives.

The most compute intensive part of the program is the HPL_dgemm.c code which does matrix multiplication.

The code can be obtained from:
http://www.netlib.org/benchmark/hpl/hpl-2.0.tar.gz

I’ve added the pragma directives to the nested loops in the code as below:

#include "hpl.h"

#ifndef HPL_dgemm

#ifdef HPL_CALL_VSIPL

/*
 * HPL_dgemmNN computes C := ALPHA * A * B + BETA * C with neither operand
 * transposed.  Every array is addressed as X[ row + col * LDX ].
 *
 * M, N, K     loop extents: i runs over M, j over N, l over K.
 * ALPHA, BETA scalars applied to the product and to C respectively.
 * A, LDA      input array, read at A[ i + l * LDA ].
 * B, LDB      input array, read at B[ l + j * LDB ].
 * C, LDC      in/out array, updated at C[ i + j * LDC ].
 */
#ifdef HPL_STDC_HEADERS
static void HPL_dgemmNN
(
   const int                  M,
   const int                  N,
   const int                  K,
   const double               ALPHA,
   const double               * A,
   const int                  LDA,
   const double               * B,
   const int                  LDB,
   const double               BETA,
   double                     * C,
   const int                  LDC
)
#else
static void HPL_dgemmNN( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC )
   const int                  K, LDA, LDB, LDC, M, N;
   const double               ALPHA, BETA;
   const double               * A, * B;
   double                     * C;
#endif
{
   register double            t0;
   int                        i, iail, iblj, icij, j, jal, jbj, jcj, l;

   #pragma acc region
   {
      /* jbj / jcj are the offsets of column j in B and C. */
      for( j = 0, jbj = 0, jcj  = 0; j < N; j++, jbj += LDB, jcj += LDC )
      {
         /*
          * Presumably scales the M entries of column j of C by BETA.
          * NOTE(review): HPL_dscal is an external routine called from
          * inside the accelerator region; confirm the compiler can
          * inline it, otherwise the region may not be offloaded.
          */
         HPL_dscal( M, BETA, C+jcj, 1 );
         /* jal is the offset of column l in A, iblj indexes B(l,j). */
         for( l = 0, jal = 0, iblj = jbj; l < K; l++, jal += LDA, iblj += 1 )
         {
            t0 = ALPHA * B[iblj];
            /* Column update: C(:,j) += t0 * A(:,l). */
            for( i = 0, iail = jal, icij = jcj; i < M; i++, iail += 1, icij += 1 )
            { C[icij] += A[iail] * t0; }
         }
      }
   }
}

/*
 * HPL_dgemmNT computes C := ALPHA * A * B^T + BETA * C.
 * Every array is addressed as X[ row + col * LDX ].
 *
 * M, N, K     loop extents: i runs over M, j over N, l over K.
 * ALPHA, BETA scalars applied to the product and to C respectively.
 * A, LDA      input array, read at A[ i + l * LDA ].
 * B, LDB      input array, read at B[ j + l * LDB ] (transposed access).
 * C, LDC      in/out array, updated at C[ i + j * LDC ].
 */
#ifdef HPL_STDC_HEADERS
static void HPL_dgemmNT
(
   const int                  M,
   const int                  N,
   const int                  K,
   const double               ALPHA,
   const double               * A,
   const int                  LDA,
   const double               * B,
   const int                  LDB,
   const double               BETA,
   double                     * C,
   const int                  LDC
)
#else
static void HPL_dgemmNT( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC )
   const int                  K, LDA, LDB, LDC, M, N;
   const double               ALPHA, BETA;
   const double               * A, * B;
   double                     * C;
#endif
{
   register double            t0;
   int                        i, iail, ibj, ibjl, icij, j, jal, jcj, l;

   #pragma acc region
   {
      /* ibj is the row index j inside B, jcj the offset of column j in C. */
      for( j = 0, ibj  = 0, jcj  = 0; j < N; j++, ibj += 1, jcj += LDC )
      {
         /*
          * Presumably scales the M entries of column j of C by BETA.
          * NOTE(review): HPL_dscal is an external routine called from
          * inside the accelerator region; confirm the compiler can
          * inline it, otherwise the region may not be offloaded.
          */
         HPL_dscal( M, BETA, C+jcj, 1 );
         /* ibjl walks along row j of B with stride LDB. */
         for( l = 0, jal = 0, ibjl = ibj; l < K; l++, jal += LDA, ibjl += LDB )
         {
            t0 = ALPHA * B[ibjl];
            /* Column update: C(:,j) += t0 * A(:,l). */
            for( i = 0, iail = jal, icij = jcj; i < M; i++, iail += 1, icij += 1 )
            { C[icij] += A[iail] * t0; }
         }
      }
   }
}

/*
 * HPL_dgemmTN computes C := ALPHA * A^T * B + BETA * C.
 * Every array is addressed as X[ row + col * LDX ].
 *
 * M, N, K     loop extents: i runs over M, j over N, l over K.
 * ALPHA, BETA scalars applied to the product and to C respectively.
 * A, LDA      input array, read at A[ l + i * LDA ] (transposed access).
 * B, LDB      input array, read at B[ l + j * LDB ].
 * C, LDC      in/out array, updated at C[ i + j * LDC ].
 */
#ifdef HPL_STDC_HEADERS
static void HPL_dgemmTN
(
   const int                  M,
   const int                  N,
   const int                  K,
   const double               ALPHA,
   const double               * A,
   const int                  LDA,
   const double               * B,
   const int                  LDB,
   const double               BETA,
   double                     * C,
   const int                  LDC
)
#else
static void HPL_dgemmTN( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC )
   const int                  K, LDA, LDB, LDC, M, N;
   const double               ALPHA, BETA;
   const double               * A, * B;
   double                     * C;
#endif
{
   register double            t0;
   int                        i, iai, iail, iblj, icij, j, jbj, jcj, l;

   #pragma acc region
   {
      /* jbj / jcj are the offsets of column j in B and C. */
      for( j = 0, jbj = 0, jcj = 0; j < N; j++, jbj += LDB, jcj += LDC )
      {
         /* iai is the offset of column i in A, icij indexes C(i,j). */
         for( i = 0, icij = jcj, iai = 0; i < M; i++, icij += 1, iai += LDA )
         {
            /* Dot product of column i of A with column j of B. */
            t0 = HPL_rzero;
            for( l = 0, iail = iai, iblj = jbj; l < K; l++, iail += 1, iblj += 1 )
            { t0 += A[iail] * B[iblj]; }
            /*
             * C(i,j) is overwritten, not multiplied, when BETA equals
             * HPL_rzero, so its previous contents are never read.
             */
            if( BETA == HPL_rzero ) C[icij]  = HPL_rzero;
            else                    C[icij] *= BETA;
            C[icij] += ALPHA * t0;
         }
      }
   }
}

/*
 * HPL_dgemmTT computes C := ALPHA * A^T * B^T + BETA * C.
 * Every array is addressed as X[ row + col * LDX ].
 *
 * M, N, K     loop extents: i runs over M, j over N, l over K.
 * ALPHA, BETA scalars applied to the product and to C respectively.
 * A, LDA      input array, read at A[ l + i * LDA ] (transposed access).
 * B, LDB      input array, read at B[ j + l * LDB ] (transposed access).
 * C, LDC      in/out array, updated at C[ i + j * LDC ].
 */
#ifdef HPL_STDC_HEADERS
static void HPL_dgemmTT
(
   const int                  M,
   const int                  N,
   const int                  K,
   const double               ALPHA,
   const double               * A,
   const int                  LDA,
   const double               * B,
   const int                  LDB,
   const double               BETA,
   double                     * C,
   const int                  LDC
)
#else
static void HPL_dgemmTT( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC )
   const int                  K, LDA, LDB, LDC, M, N;
   const double               ALPHA, BETA;
   const double               * A, * B;
   double                     * C;
#endif
{
   register double            t0;
   int                        i, iali, ibj, ibjl, icij, j, jai, jcj, l;

   #pragma acc region
   {
      /* ibj is the row index j inside B, jcj the offset of column j in C. */
      for( j = 0, ibj = 0, jcj  = 0; j < N; j++, ibj += 1, jcj += LDC )
      {
         /* jai is the offset of column i in A, icij indexes C(i,j). */
         for( i = 0, icij = jcj, jai = 0; i < M; i++, icij += 1, jai += LDA )
         {
            /* Dot product of column i of A with row j of B. */
            t0 = HPL_rzero;
            for( l = 0, iali = jai, ibjl = ibj; l < K; l++, iali += 1, ibjl += LDB )
            { t0 += A[iali] * B[ibjl]; }
            /*
             * C(i,j) is overwritten, not multiplied, when BETA equals
             * HPL_rzero, so its previous contents are never read.
             */
            if( BETA == HPL_rzero ) C[icij]  = HPL_rzero;
            else                    C[icij] *= BETA;
            C[icij] += ALPHA * t0;
         }
      }
   }
}

static void HPL_dgemm0
(
const enum HPL_TRANS       TRANSA,
const enum HPL_TRANS       TRANSB,
const int                  M,
const int                  N,
const int                  K,
const double               ALPHA,
const double               * A,
const int                  LDA,
const double               * B,
const int                  LDB,
const double               BETA,
double                     * C,
const int                  LDC
)
#else
static void HPL_dgemm0( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB,
BETA, C, LDC )
const enum HPL_TRANS       TRANSA, TRANSB;
const int                  K, LDA, LDB, LDC, M, N;
const double               ALPHA, BETA;
const double               * A, * B;
double                     * C;
#endif
{
int                        i, j;

if( ( M == 0 ) || ( N == 0 ) ||
( ( ( ALPHA == HPL_rzero ) || ( K == 0 ) ) &&
( BETA == HPL_rone ) ) ) return;

if( ALPHA == HPL_rzero )
{
#pragma acc region
{
for( j = 0; j < N; j++ )
{  for( i = 0; i < M; i++ ) *(C+i+j*LDC) = HPL_rzero; }
}
return;
}

if( TRANSB == HplNoTrans )
{
if( TRANSA == HplNoTrans )
{ HPL_dgemmNN( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ); }
else
{ HPL_dgemmTN( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ); }
}
else
{
if( TRANSA == HplNoTrans )
{ HPL_dgemmNT( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ); }
else
{ HPL_dgemmTT( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ); }
}
}

#endif

void HPL_dgemm
(
const enum HPL_ORDER             ORDER,
const enum HPL_TRANS             TRANSA,
const enum HPL_TRANS             TRANSB,
const int                        M,
const int                        N,
const int                        K,
const double                     ALPHA,
const double *                   A,
const int                        LDA,
const double *                   B,
const int                        LDB,
const double                     BETA,
double *                         C,
const int                        LDC
)
#else
void HPL_dgemm
( ORDER, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC )
const enum HPL_ORDER             ORDER;
const enum HPL_TRANS             TRANSA;
const enum HPL_TRANS             TRANSB;
const int                        M;
const int                        N;
const int                        K;
const double                     ALPHA;
const double *                   A;
const int                        LDA;
const double *                   B;
const int                        LDB;
const double                     BETA;
double *                         C;
const int                        LDC;
#endif
{
/*
* Purpose
* =======
*
* HPL_dgemm performs one of the matrix-matrix operations
*
*     C := alpha * op( A ) * op( B ) + beta * C
*
*  where op( X ) is one of
*
*     op( X ) = X   or   op( X ) = X^T.
*
* Alpha and beta are scalars,  and A,  B and C are matrices, with op(A)
* an m by k matrix, op(B) a k by n matrix and  C an m by n matrix.
*
* Arguments
* =========
*
* ORDER   (local input)                 const enum HPL_ORDER
*         On entry, ORDER  specifies the storage format of the operands
*         as follows:
*            ORDER = HplRowMajor,
*            ORDER = HplColumnMajor.
*
* TRANSA  (local input)                 const enum HPL_TRANS
*         On entry, TRANSA  specifies the form of  op(A)  to be used in
*         the matrix-matrix operation follows:
*            TRANSA==HplNoTrans    : op( A ) = A,
*            TRANSA==HplTrans      : op( A ) = A^T,
*            TRANSA==HplConjTrans  : op( A ) = A^T.
*
* TRANSB  (local input)                 const enum HPL_TRANS
*         On entry, TRANSB  specifies the form of  op(B)  to be used in
*         the matrix-matrix operation follows:
*            TRANSB==HplNoTrans    : op( B ) = B,
*            TRANSB==HplTrans      : op( B ) = B^T,
*            TRANSB==HplConjTrans  : op( B ) = B^T.
*
* M       (local input)                 const int
*         On entry,  M  specifies  the  number  of rows  of the  matrix
*         op(A)  and  of  the  matrix  C.  M  must  be  at least  zero.
*
* N       (local input)                 const int
*         On entry,  N  specifies  the number  of columns of the matrix
*         op(B)  and  the number of columns of the matrix  C. N must be
*         at least zero.
*
* K       (local input)                 const int
*         On entry,  K  specifies  the  number of columns of the matrix
*         op(A) and the number of rows of the matrix op(B).  K  must be
*         be at least  zero.
*
* ALPHA   (local input)                 const double
*         On entry, ALPHA specifies the scalar alpha.   When  ALPHA  is
*         supplied  as  zero  then the elements of the matrices A and B
*         need not be set on input.
*
* A       (local input)                 const double *
*         On entry,  A  is an array of dimension (LDA,ka),  where ka is
*         k  when   TRANSA==HplNoTrans,  and  is  m  otherwise.  Before
*         entry  with  TRANSA==HplNoTrans, the  leading  m by k part of
*         the array  A must contain the matrix A, otherwise the leading
*         k  by  m  part of the array  A  must  contain the  matrix  A.
*
* LDA     (local input)                 const int
*         On entry, LDA  specifies the first dimension of A as declared
*         in the  calling (sub) program. When  TRANSA==HplNoTrans  then
*         LDA must be at least max(1,m), otherwise LDA must be at least
*         max(1,k).
*
* B       (local input)                 const double *
*         On entry, B is an array of dimension (LDB,kb),  where  kb  is
*         n   when  TRANSB==HplNoTrans, and  is  k  otherwise.   Before
*         entry with TRANSB==HplNoTrans,  the  leading  k by n  part of
*         the array  B must contain the matrix B, otherwise the leading
*         n  by  k  part of the array  B  must  contain  the matrix  B.
*
* LDB     (local input)                 const int
*         On entry, LDB  specifies the first dimension of B as declared
*         in the  calling (sub) program. When  TRANSB==HplNoTrans  then
*         LDB must be at least max(1,k), otherwise LDB must be at least
*         max(1,n).
*
* BETA    (local input)                 const double
*         On entry,  BETA  specifies the scalar  beta.   When  BETA  is
*         supplied  as  zero  then  the  elements of the matrix C  need
*         not be set on input.
*
* C       (local input/output)          double *
*         On entry,  C  is an array of dimension (LDC,n). Before entry,
*         the  leading m by n part  of  the  array  C  must contain the
*         matrix C,  except when beta is zero, in which case C need not
*         be set on entry. On exit, the array  C  is overwritten by the
*         m by n  matrix ( alpha*op( A )*op( B ) + beta*C ).
*
* LDC     (local input)                 const int
*         On entry, LDC  specifies the first dimension of C as declared
*         in  the   calling  (sub)  program.   LDC  must  be  at  least
*         max(1,m).
*
* ---------------------------------------------------------------------
*/
#ifdef HPL_CALL_CBLAS
cblas_dgemm( ORDER, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB,
BETA, C, LDC );
#endif
#ifdef HPL_CALL_VSIPL
if( ORDER == HplColumnMajor )
{
HPL_dgemm0( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA,
C, LDC );
}
else
{
HPL_dgemm0( TRANSB, TRANSA, N, M, K, ALPHA, B, LDB, A, LDA, BETA,
C, LDC );
}
#endif
#ifdef HPL_CALL_FBLAS
double                    alpha = ALPHA, beta = BETA;
#ifdef StringSunStyle
#ifdef HPL_USE_F77_INTEGER_DEF
F77_INTEGER               IONE = 1;
#else
int                       IONE = 1;
#endif
#endif
#ifdef StringStructVal
F77_CHAR                  ftransa;
F77_CHAR                  ftransb;
#endif
#ifdef StringStructPtr
F77_CHAR                  ftransa;
F77_CHAR                  ftransb;
#endif
#ifdef StringCrayStyle
F77_CHAR                  ftransa;
F77_CHAR                  ftransb;
#endif
#ifdef HPL_USE_F77_INTEGER_DEF
const F77_INTEGER         F77M   = M,   F77N   = N,   F77K = K,
F77lda = LDA, F77ldb = LDB, F77ldc = LDC;
#else
#define F77M                 M
#define F77N                 N
#define F77K                 K
#define F77lda               LDA
#define F77ldb               LDB
#define F77ldc               LDC
#endif
char                      ctransa, ctransb;

if(      TRANSA == HplNoTrans ) ctransa = 'N';
else if( TRANSA == HplTrans   ) ctransa = 'T';
else                            ctransa = 'C';

if(      TRANSB == HplNoTrans ) ctransb = 'N';
else if( TRANSB == HplTrans   ) ctransb = 'T';
else                            ctransb = 'C';

if( ORDER == HplColumnMajor )
{
#ifdef StringSunStyle
F77dgemm( &ctransa, &ctransb, &F77M, &F77N, &F77K, &alpha, A, &F77lda,
B, &F77ldb, &beta, C, &F77ldc, IONE, IONE );
#endif
#ifdef StringCrayStyle
ftransa = HPL_C2F_CHAR( ctransa ); ftransb = HPL_C2F_CHAR( ctransb );
F77dgemm( ftransa,  ftransb,  &F77M, &F77N, &F77K, &alpha, A, &F77lda,
B, &F77ldb, &beta, C, &F77ldc );
#endif
#ifdef StringStructVal
ftransa.len = 1; ftransa.cp = &ctransa;
ftransb.len = 1; ftransb.cp = &ctransb;
F77dgemm( ftransa,  ftransb,  &F77M, &F77N, &F77K, &alpha, A, &F77lda,
B, &F77ldb, &beta, C, &F77ldc );
#endif
#ifdef StringStructPtr
ftransa.len = 1; ftransa.cp = &ctransa;
ftransb.len = 1; ftransb.cp = &ctransb;
F77dgemm( &ftransa, &ftransb, &F77M, &F77N, &F77K, &alpha, A, &F77lda,
B, &F77ldb, &beta, C, &F77ldc );
#endif
}
else
{
#ifdef StringSunStyle
F77dgemm( &ctransb, &ctransa, &F77N, &F77M, &F77K, &alpha, B, &F77ldb,
A, &F77lda, &beta, C, &F77ldc, IONE, IONE );
#endif
#ifdef StringCrayStyle
ftransa = HPL_C2F_CHAR( ctransa ); ftransb = HPL_C2F_CHAR( ctransb );
F77dgemm( ftransb,  ftransa,  &F77N, &F77M, &F77K, &alpha, B, &F77ldb,
A, &F77lda, &beta, C, &F77ldc );
#endif
#ifdef StringStructVal
ftransa.len = 1; ftransa.cp = &ctransa;
ftransb.len = 1; ftransb.cp = &ctransb;
F77dgemm( ftransb,  ftransa,  &F77N, &F77M, &F77K, &alpha, B, &F77ldb,
A, &F77lda, &beta, C, &F77ldc );
#endif
#ifdef StringStructPtr
ftransa.len = 1; ftransa.cp = &ctransa;
ftransb.len = 1; ftransb.cp = &ctransb;
F77dgemm( &ftransb, &ftransa, &F77N, &F77M, &F77K, &alpha, B, &F77ldb,
A, &F77lda, &beta, C, &F77ldc );
#endif
}
#endif
/*
* End of HPL_dgemm
*/
}

#endif

However, when I compile the program I am not getting any feedback whatsoever from the compiler regarding handling the directives even though I am using -Minfo=all,accel and the compilation is successful:

[sindimo@tlca055 Linux]$ date
Wed Dec 1 11:37:06 AST 2010

[sindimo@tlca055 Linux]$ /usr/local/mpi/mpich2/pgi10.9/bin/mpicc -ta=nvidia,time -Minfo=all,accel -o HPL_dgemm.o -c -DAdd_ -DF77_INTEGER=int -DStringSunStyle -I/home/ecc_11/sindimo/hpl-gpu/hpl-2.0/include -I/home/ecc_11/sindimo/hpl-gpu/hpl-2.0/include/Linux -I/usr/local/mpi/mpich2/pgi10.9/include ../HPL_dgemm.c

[sindimo@tlca055 Linux]$ ls -l HPL_dgemm.o
-rw-r--r-- 1 sindimo eac 3568 Dec 1 11:37 HPL_dgemm.o

Also, when I run the final HPL binary with the environment variable ACC_NOTIFY set to 1, I don’t see any kernels being launched.

Any idea what might be wrong here?

There’s a ton of people trying to get HPL to run on GPUs and I haven’t seen any thread anywhere for attempts to do so using the PGI accelerator directives, so maybe this one should be a good start!

Thank you

Any idea what might be wrong here?

Most likely you don’t have HPL_CALL_VSIPL defined, so the code that contains the acc regions gets preprocessed away. What do you have “HPL_OPTS” set to in your Make configuration file?

#
# - Compile time options -----------------------------------------------
#
# -DHPL_COPY_L           force the copy of the panel L before bcast;
# -DHPL_CALL_CBLAS       call the cblas interface;
# -DHPL_CALL_VSIPL       call the vsip  library;
# -DHPL_DETAILED_TIMING  enable detailed timers;
#
# By default HPL will:
#    *) not copy L before broadcast,
#    *) call the BLAS Fortran 77 interface,
#    *) not display detailed timing information.
#
HPL_OPTS     =
#
# ----------------------------------------------------------------------
• Mat

Dear Mat,

HPL_OPTS is not set to anything in the Make file. What should it be set to so that the pragma regions are not ignored?

Below is the complete Make file being used.

Thank you again for your superior support!

#
#  -- High Performance Computing Linpack Benchmark (HPL)
#     HPL - 2.0 - September 10, 2008
#     Antoine P. Petitet
#     University of Tennessee, Knoxville
#     Innovative Computing Laboratory
#
#  -- Copyright notice and Licensing terms:
#
#  Redistribution  and  use in  source and binary forms, with or without
#  modification, are  permitted provided  that the following  conditions
#  are met:
#
#  1. Redistributions  of  source  code  must retain the above copyright
#  notice, this list of conditions and the following disclaimer.
#
#  2. Redistributions in binary form must reproduce  the above copyright
#  notice, this list of conditions,  and the following disclaimer in the
#  documentation and/or other materials provided with the distribution.
#
#  3. All  advertising  materials  mentioning  features  or  use of this
#  software must display the following acknowledgement:
#  This  product  includes  software  developed  at  the  University  of
#  Tennessee, Knoxville, Innovative Computing Laboratory.
#
#  4. The name of the  University,  the name of the  Laboratory,  or the
#  names  of  its  contributors  may  not  be used to endorse or promote
#  products  derived   from   this  software  without  specific  written
#  permission.
#
#  -- Disclaimer:
#
#  THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
#  OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
#  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ######################################################################
#
# ----------------------------------------------------------------------
# - shell --------------------------------------------------------------
# ----------------------------------------------------------------------
#
SHELL        = /bin/sh
#
CD           = cd
CP           = cp
LN_S         = ln -s
MKDIR        = mkdir
RM           = /bin/rm -f
TOUCH        = touch
#
# ----------------------------------------------------------------------
# - Platform identifier ------------------------------------------------
# ----------------------------------------------------------------------
#
ARCH         = Linux
#
# ----------------------------------------------------------------------
# - HPL Directory Structure / HPL library ------------------------------
# ----------------------------------------------------------------------
#
TOPdir       = /home/ecc_11/sindimo/hpl-gpu/hpl-2.0
INCdir       = $(TOPdir)/include
BINdir       = $(TOPdir)/bin/$(ARCH)
LIBdir       = $(TOPdir)/lib/$(ARCH)
#
HPLlib       = $(LIBdir)/libhpl.a
#
# ----------------------------------------------------------------------
# - Message Passing library (MPI) --------------------------------------
# ----------------------------------------------------------------------
# MPinc tells the  C  compiler where to find the Message Passing library
# header files,  MPlib  is defined  to be the name of  the library to be
# used. The variable MPdir is only used for defining MPinc and MPlib.
#
MPdir        = /usr/local/mpi/mpich2/pgi10.9
MPinc        = -I$(MPdir)/include
MPlib        = $(MPdir)/lib/libmpich.a
#
# ----------------------------------------------------------------------
# - Linear Algebra library (BLAS or VSIPL) -----------------------------
# ----------------------------------------------------------------------
# LAinc tells the  C  compiler where to find the Linear Algebra  library
# header files,  LAlib  is defined  to be the name of  the library to be
# used. The variable LAdir is only used for defining LAinc and LAlib.
#
LAinc        =
#
# ----------------------------------------------------------------------
# - F77 / C interface --------------------------------------------------
# ----------------------------------------------------------------------
# You can skip this section  if and only if  you are not planning to use
# a  BLAS  library featuring a Fortran 77 interface.  Otherwise,  it  is
# necessary  to  fill out the  F2CDEFS  variable  with  the  appropriate
# options.  **One and only one**  option should be chosen in **each** of
# the 3 following categories:
#
# 1) name space (How C calls a Fortran 77 routine)
#
# -DAdd_              : all lower case and a suffixed underscore  (Suns,
#                       Intel, ...),                           [default]
# -DNoChange          : all lower case (IBM RS6000),
# -DUpCase            : all upper case (Cray),
# -DAdd__             : the FORTRAN compiler in use is f2c.
#
# 2) C and Fortran 77 integer mapping
#
# -DF77_INTEGER=int   : Fortran 77 INTEGER is a C int,         [default]
# -DF77_INTEGER=long  : Fortran 77 INTEGER is a C long,
# -DF77_INTEGER=short : Fortran 77 INTEGER is a C short.
#
# 3) Fortran 77 string handling
#
# -DStringSunStyle    : The string address is passed at the string loca-
#                       tion on the stack, and the string length is then
#                       passed as  an  F77_INTEGER  after  all  explicit
#                       stack arguments,                       [default]
# -DStringStructPtr   : The address  of  a  structure  is  passed  by  a
#                       Fortran 77  string,  and the structure is of the
#                       form: struct {char *cp; F77_INTEGER len;},
# -DStringStructVal   : A structure is passed by value for each  Fortran
#                       77 string,  and  the  structure is  of the form:
#                       struct {char *cp; F77_INTEGER len;},
# -DStringCrayStyle   : Special option for  Cray  machines,  which  uses
#                       Cray  fcd  (fortran  character  descriptor)  for
#                       interoperation.
#
#
# ----------------------------------------------------------------------
# - HPL includes / libraries / specifics -------------------------------
# ----------------------------------------------------------------------
#
HPL_INCLUDES = -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc)
HPL_LIBS     = $(HPLlib) $(LAlib) $(MPlib)
#
# - Compile time options -----------------------------------------------
#
# -DHPL_COPY_L           force the copy of the panel L before bcast;
# -DHPL_CALL_CBLAS       call the cblas interface;
# -DHPL_CALL_VSIPL       call the vsip  library;
# -DHPL_DETAILED_TIMING  enable detailed timers;
#
# By default HPL will:
#    *) not copy L before broadcast,
#    *) call the BLAS Fortran 77 interface,
#    *) not display detailed timing information.
#
HPL_OPTS     =
#
# ----------------------------------------------------------------------
#
HPL_DEFS     = $(F2CDEFS) $(HPL_OPTS) $(HPL_INCLUDES)
#
# ----------------------------------------------------------------------
# - Compilers / linkers - Optimization flags ---------------------------
# ----------------------------------------------------------------------
#
CC           = /usr/local/mpi/mpich2/pgi10.9/bin/mpicc -ta=nvidia,time -Minfo=all,accel

CCNOOPT      = $(HPL_DEFS)
CCFLAGS      = $(HPL_DEFS)
#

#
ARCHIVER     = ar
ARFLAGS      = r
RANLIB       = echo
#
# ----------------------------------------------------------------------

HPL_OPTS is not set to anything in the Make file. What should it be set to so that the pragma regions are not ignored?

Well, to do this you would need to define “-DHPL_CALL_VSIPL” however, I doubt you want to use the vsip library.

A better solution would be to call a GPU enabled DGEMM, like CUBLAS. Doing a web search for “hpl cuda” should give you some ideas.

Of course, if your goal was to see the performance of the PGI Accelerator, you could take the NetLib BLAS DGEMM (http://www.netlib.org/blas/dgemm.f) and accelerate it or use a simple accelerated MATMUL routine.

Hope this helps,
Mat

! mm_acc: matrix product a = b * c over the leading m-by-m part of the
! arguments, with the whole loop nest wrapped in an accelerator region.
!
!   a  (out) result; a(1:m,1:m) is overwritten.
!   b  (in)  left operand;  b(1:m,1:m) is read.
!   c  (in)  right operand; c(1:m,1:m) is read.
!   m        extent of all three loops.
!
! The dummy arrays are assumed-shape, so callers need an explicit
! interface for this routine.
subroutine mm_acc( a, b, c, m )
  implicit none
  real, dimension(:,:), intent(out) :: a
  real, dimension(:,:), intent(in)  :: b, c
  integer :: m
  integer :: i, j, k

!$acc region
  do j = 1, m
    ! Clear column j of the result before accumulating into it.
    do i = 1, m
      a(i,j) = 0.0
    enddo

    ! a(i,j) = sum over k of b(i,k) * c(k,j)
    do i = 1, m
      do k = 1, m
        a(i,j) = a(i,j) + b(i,k) * c(k,j)
      enddo
    enddo
  enddo
!$acc end region
end subroutine