I’m encountering two problems with the OpenMPI and OpenACC that ship with the NVHPC SDK 21.3.
Here is a minimal example:
#include <iostream>
#include <cstdint>
#include <openacc.h>
#include <cuda_runtime.h>
#include <cuComplex.h>
#include <mpi.h>

int main(int argc, char* argv[]) {
    try {
        int mpi_rank = -1;
        int mpi_size = 0;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
        MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);

        uint64_t size = 8;
        cuDoubleComplex* psi1 = new cuDoubleComplex[size]();
        cuDoubleComplex* psi2 = new cuDoubleComplex[size]();

        // Put the rank into the real part so each process prints distinct values.
        for (uint64_t i = 0; i < size; ++i)
            psi1[i].x = mpi_rank;

        // Trivial element-wise copy that should parallelize.
        #pragma acc parallel loop copyin(psi1[:size]) copyout(psi2[:size])
        for (uint64_t i = 0; i < size; ++i)
            psi2[i] = psi1[i];

        for (uint64_t i = 0; i < size; ++i)
            std::cout << "[" << mpi_rank << "]: " << psi2[i].x << ", " << psi2[i].y << std::endl;

        delete[] psi2;
        delete[] psi1;
        MPI_Finalize();
    } catch (...) {
        return -1;
    }
    return 0;
}
I compiled and ran it with:
mpic++ --std=c++17 -acc -fast -mp -gopt -gpu=lineinfo -Minfo=accel -Mcuda -Mcudalib=cublas -Wall -Wextra -pedantic minex.cpp -o minex -llapack -lblas -fortranlibs
mpirun -n 2 -q minex
The problems are:
- If the try/catch block is present (not commented out), nvc++ does not parallelize the trivial OpenACC loop (a possible workaround is sketched after this list):
main:
     22, Generating copyout(psi2[:size]) [if not already present]
         Generating copyin(psi1[:size]) [if not already present]
         Generating Tesla code
         25, #pragma acc loop seq
     25, Complex loop carried dependence of psi1->x,psi2->x,psi1->y prevents parallelization
         Loop carried dependence of psi2->x,psi1->y prevents parallelization
         Loop carried backward dependence of psi2->x,psi1->y prevents vectorization
     26, Accelerator restriction: induction variable live-out from loop: i
- Each MPI process and each thread creates an annoying empty file in my directory, named like 0_r0_t1, 1_r1_t1, … Why is that, and how can I turn it off? Additionally, if I don’t run with -q, I see the following output from OpenMPI, which I don’t know what to do with (a way to silence it is sketched after this list):
--------------------------------------------------------------------------
[[23575,1],1]: A high-performance Open MPI point-to-point messaging module
was unable to find any relevant network interfaces:
Module: OpenFabrics (openib)
Host: dxer
Another transport will be used instead, although this may result in
lower performance.
NOTE: You can disable this warning by setting the MCA parameter
btl_base_warn_component_unused to 0.
--------------------------------------------------------------------------
...
[dxer:916980] 1 more process has sent help message help-mpi-btl-base.txt / btl:no-nics
[dxer:916980] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
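For the first problem, a workaround I am considering is to hoist the OpenACC region out of the try block into a separate helper function, so that the loop is compiled in an exception-free context while main keeps its catch-all. This is just a sketch under the assumption that the enclosing try/catch is what defeats the dependence analysis; the helper name copy_state is made up:

static void copy_state(const cuDoubleComplex* psi1, cuDoubleComplex* psi2,
                       uint64_t size) {
    // Same trivial copy loop as above, now outside any try block.
    #pragma acc parallel loop copyin(psi1[:size]) copyout(psi2[:size])
    for (uint64_t i = 0; i < size; ++i)
        psi2[i] = psi1[i];
}

main would then call copy_state(psi1, psi2, size) inside the try block instead of containing the pragma directly.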
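For the second problem’s warning, the message itself names the relevant knob. Assuming the OpenMPI bundled with NVHPC honors the usual MCA mechanisms, the openib notice should be suppressible with

mpirun --mca btl_base_warn_component_unused 0 -n 2 minex

or, equivalently, by exporting OMPI_MCA_btl_base_warn_component_unused=0 in the environment. (This would only silence the warning; it would not explain the empty 0_r0_t1-style files.)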
Version information from mpic++ --version:
nvc++ 21.3-0 LLVM 64-bit target on x86-64 Linux -tp skylake
NVIDIA Compilers and Tools
Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
Thanks and best wishes,
Dennis