Problem with the nvc++ compiler for OpenMP GPU offloading

Hi,

I have written this code, which solves the Laplace equation in C++ and uses the OpenMP library for GPU offloading.

#include <iostream>
#include <cstdlib>
#include <cmath>
#include <chrono>
#include <omp.h>

// Grid size: N rows by M columns; main copies these into n and m.
#define N 3000
#define M 3000

// Iteratively relaxes the Laplace equation on an n x m grid held row-major
// in two flat arrays (A = current values, Anew = next values), using OpenMP
// target directives for each sweep, and prints the elapsed time of the
// iteration loop in milliseconds. argc and argv are not read.
int main(int argc, char** argv)
{
    int n = N;
    int m = M;
    double *A = new double[n*m];
    double *Anew = new double[n*m];
    
    // error holds the largest per-cell change of the latest sweep; it starts
    // above tol so the while loop below runs at least once.
    double error = 1.0;
    double tol = 1e-6;
    //int device_used= -1;

    // Zero every cell of both arrays.
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < m; i++) {
            A[j*m+i] = 0.0;
            Anew[j*m+i] = 0.0;
        }
    }

    // Boundary values: the top row (j = 0) is set to 1.0.
    for (int i = 0; i < m; i++) {
        A[0*m+i] = 1.0;
        Anew[0*m+i] = 1.0;
    }

    // Rows 1..n-2, processed by host threads: the first and last column are
    // set to 1.0 and the interior is reset to 0.0. Row n-1 is not written
    // here and keeps the 0.0 from the initial fill.
    #pragma omp parallel for
    for (int j = 1; j < n-1; j++) {
        A[j*m+0] = 1.0;
        Anew[j*m+0] = 1.0;
        for (int i = 1; i < m-1; i++ ) {
            A[j*m+i] = 0.0;
            Anew[j*m+i] = 0.0;
        }
        A[j*m+m-1] = 1.0;
        Anew[j*m+m-1] = 1.0;
    }

    int iter = 0;

    // Timed section. The target data construct names A and Anew without an
    // array section. The loop stops once error <= tol or after 1000 sweeps.
    auto start = std::chrono::high_resolution_clock::now();
    #pragma omp target data map(to:Anew) map(A)
    while (error > tol && iter < 1000) {
        // Host-side print, once per iteration.
        std::cout<<"NON"<<std::endl;
        // Reset before the max-reduction in the sweep below.
        error = 0.0;
        // Sweep over the interior rows: each interior cell of Anew becomes
        // the mean of its four neighbours in A, and error is reduced to the
        // largest |Anew - A| seen.
        #pragma omp target teams distribute parallel for reduction(max:error) map(error)
        for (int j = 1; j < n-1; j++) {
                // Print placed inside the target loop body.
                std::cout<<"NON!"<<std::endl;
            for (int i = 1; i < m-1; i++ ) {
                Anew[j*m+i] = 0.25 * ( A[j*m+i+1] + A[j*m+i-1]
                                    + A[(j-1)*m+i] + A[(j+1)*m+i]);
                error = fmax( error, fabs(Anew[j*m+i] - A[j*m+i]));
            }
        }

        // Copy the interior of Anew back into A (an element-wise copy, not a
        // pointer swap) so the next sweep reads the updated values.
        #pragma omp target teams distribute parallel for
        for (int j = 1; j < n-1; j++) {
            for (int i = 1; i < m-1; i++ ) {
                A[j*m+i] = Anew[j*m+i];
            }
        }

        iter++;
        //std::cout << "Iteration " << iter << ", error = " << error << std::endl;
    }

    // Elapsed wall-clock time of the loop above, truncated to milliseconds.
    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);

    // Check if GPU was used
    //if (device_used != -1) {
      //  std::cout << "GPU was used." << std::endl;
    //}

        std::cout << "Execution time: " << duration.count() << "ms" << std::endl;


    // Release the two host arrays allocated with new[] above.
    delete[] A;
    delete[] Anew;

    return 0;
}

I compile with this command:
nvc++ -mp=gpu -gpu=cc75 c_omp_gpu.cc -o c_omp_gpu

And the terminal gives me this error:

NVC++-S-0000-Internal compiler error. Call in OpenACC region to support routine - strlen (c_omp_gpu.cc: 400)
NVC++-W-0155-Compiler failed to translate accelerator region (see -Minfo messages): Missing branch target block (c_omp_gpu.cc: 1)
NVC++-F-0704-Compilation aborted due to previous errors. (c_omp_gpu.cc)
NVC++/x86-64 Linux 23.1-0: compilation aborted

I don’t understand the error, and I don’t understand why it mentions OpenACC even though I never use it.

Can you help me?

Hi doncecchicarloelia,

The error is due to the use of “std::cout” inside the offloaded “target” loop. I/O support on the device is limited and the iostream constructs can’t be used. If you do need to print, you can use the C stdio’s “printf”.

Also, the declarations in the “map” clauses are incorrect. Here you’d be just creating the pointers for A and Anew on the device, not the data they point to. Instead, you’ll want to use triplet notation to define the size of the arrays. Something like:

#pragma omp target data map(alloc:Anew[:n*m]) map(tofrom:A[:n*m])

Sorry about the error message. Our OpenMP and OpenACC implementations share common code, so the error messages sometimes name the other model.

Here’s an example of the corrected code:

% cat test.cpp
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <cmath>
#include <chrono>
#include <omp.h>

#define N 3000  // grid rows; copied into n in main
#define M 3000  // grid columns; copied into m in main
// Relaxes the Laplace equation on an n x m row-major grid with OpenMP target sweeps and prints the loop time in ms.
int main(int argc, char** argv)
{
    int n = N;
    int m = M;
    double *A = new double[n*m];     // current values, row-major
    double *Anew = new double[n*m];  // next values, row-major

    double error = 1.0;  // largest per-cell change of the latest sweep; starts above tol so the loop runs
    double tol = 1e-6;   // the loop stops once error <= tol
    //int device_used= -1;

    // Zero every cell of both arrays.
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < m; i++) {
            A[j*m+i] = 0.0;
            Anew[j*m+i] = 0.0;
        }
    }

    // Boundary values: the top row (j = 0) is set to 1.0.
    for (int i = 0; i < m; i++) {
        A[0*m+i] = 1.0;
        Anew[0*m+i] = 1.0;
    }
    // Rows 1..n-2 (host threads): first and last column set to 1.0, interior reset to 0.0.
    #pragma omp parallel for
    for (int j = 1; j < n-1; j++) {
        A[j*m+0] = 1.0;
        Anew[j*m+0] = 1.0;
        for (int i = 1; i < m-1; i++ ) {
            A[j*m+i] = 0.0;
            Anew[j*m+i] = 0.0;
        }
        A[j*m+m-1] = 1.0;
        Anew[j*m+m-1] = 1.0;
    }
    // Row n-1 is not written by the loop above and keeps the 0.0 from the initial fill.
    int iter = 0;
    // Timed section: the target data construct below maps all n*m elements (A: tofrom, Anew: alloc) around the whole loop.
    auto start = std::chrono::high_resolution_clock::now();
    #pragma omp target data map(alloc:Anew[:n*m]) map(tofrom:A[:n*m])
    while (error > tol && iter < 1000) {
        std::cout<<"NON"<<std::endl;  // host-side print, once per iteration
        error = 0.0;  // reset before the max-reduction in the sweep below
        #pragma omp target teams distribute parallel for reduction(max:error) map(error)
        for (int j = 1; j < n-1; j++) {
//          std::cout<<"NON!"<<std::endl;
//          printf("NON!\n");
            for (int i = 1; i < m-1; i++ ) {
                Anew[j*m+i] = 0.25 * ( A[j*m+i+1] + A[j*m+i-1]
                                    + A[(j-1)*m+i] + A[(j+1)*m+i]);  // mean of the four neighbours in A
                error = fmax( error, fabs(Anew[j*m+i] - A[j*m+i]));  // keep the largest change seen
            }
        }

        // Copy the interior of Anew back into A (element-wise copy, not a pointer swap) for the next sweep.
        #pragma omp target teams distribute parallel for
        for (int j = 1; j < n-1; j++) {
            for (int i = 1; i < m-1; i++ ) {
                A[j*m+i] = Anew[j*m+i];
            }
        }

        iter++;
        //std::cout << "Iteration " << iter << ", error = " << error << std::endl;
    }

    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);  // truncated to whole ms

    // Check if GPU was used
    //if (device_used != -1) {
      //  std::cout << "GPU was used." << std::endl;
    //}

        std::cout << "Execution time: " << duration.count() << "ms" << std::endl;


    delete[] A;     // release the host arrays allocated with new[] above
    delete[] Anew;

    return 0;
}
% nvc++ -mp=gpu -gpu=cc80 -Minfo=mp test.cpp ; a.out
main:
      1, include "iostream"
          39, include "ostream"
               38, include "ios"
                    44, include "basic_ios.h"
                         37, #omp parallel
                         56, #omp target teams distribute parallel for
                             56, Generating "nvkernel_main_F225L56_3" GPU kernel
                                 Loop parallelized across teams and threads(128), schedule(static)
                                 Generating reduction(max:error)
                         64, #omp target teams distribute parallel for
                             52, Generating map(tofrom:A[:m*n])
                                 Generating map(alloc:Anew[:m*n])
                             64, Generating "nvkernel_main_F225L64_5" GPU kernel
                             68, Loop parallelized across teams and threads(128), schedule(static)
     56, Generating map(tofrom:error)
NON
NON
... cut ...
NON
NON
Execution time: 4672ms

Thanks.