Hi,

I have written this code that solves the Laplace equation in C++, using the OpenMP library for GPU offloading.

``````#include <iostream>
#include <cstdlib>
#include <cmath>
#include <chrono>
#include <omp.h>

#define N 3000
#define M 3000

int main(int argc, char** argv)
{
int n = N;
int m = M;
double *A = new double[n*m];
double *Anew = new double[n*m];

double error = 1.0;
double tol = 1e-6;
//int device_used= -1;

// Initialize arrays
for (int j = 0; j < n; j++) {
for (int i = 0; i < m; i++) {
A[j*m+i] = 0.0;
Anew[j*m+i] = 0.0;
}
}

// Set boundary conditions
for (int i = 0; i < m; i++) {
A[0*m+i] = 1.0;
Anew[0*m+i] = 1.0;
}

#pragma omp parallel for
for (int j = 1; j < n-1; j++) {
A[j*m+0] = 1.0;
Anew[j*m+0] = 1.0;
for (int i = 1; i < m-1; i++ ) {
A[j*m+i] = 0.0;
Anew[j*m+i] = 0.0;
}
A[j*m+m-1] = 1.0;
Anew[j*m+m-1] = 1.0;
}

int iter = 0;

auto start = std::chrono::high_resolution_clock::now();
#pragma omp target data map(to:Anew) map(A)
while (error > tol && iter < 1000) {
std::cout<<"NON"<<std::endl;
error = 0.0;
#pragma omp target teams distribute parallel for reduction(max:error) map(error)
for (int j = 1; j < n-1; j++) {
std::cout<<"NON!"<<std::endl;
for (int i = 1; i < m-1; i++ ) {
Anew[j*m+i] = 0.25 * ( A[j*m+i+1] + A[j*m+i-1]
+ A[(j-1)*m+i] + A[(j+1)*m+i]);
error = fmax( error, fabs(Anew[j*m+i] - A[j*m+i]));
}
}

// swap A and Anew
#pragma omp target teams distribute parallel for
for (int j = 1; j < n-1; j++) {
for (int i = 1; i < m-1; i++ ) {
A[j*m+i] = Anew[j*m+i];
}
}

iter++;
//std::cout << "Iteration " << iter << ", error = " << error << std::endl;
}

auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);

// Check if GPU was used
//if (device_used != -1) {
//  std::cout << "GPU was used." << std::endl;
//}

std::cout << "Execution time: " << duration.count() << "ms" << std::endl;

delete[] A;
delete[] Anew;

return 0;
}
``````

I compile with this command:
`nvc++ -mp=gpu -gpu=cc75 c_omp_gpu.cc -o c_omp_gpu`

And the terminal gives me this error:

``````NVC++-S-0000-Internal compiler error. Call in OpenACC region to support routine - strlen (c_omp_gpu.cc: 400)
NVC++-W-0155-Compiler failed to translate accelerator region (see -Minfo messages): Missing branch target block (c_omp_gpu.cc: 1)
NVC++-F-0704-Compilation aborted due to previous errors. (c_omp_gpu.cc)
NVC++/x86-64 Linux 23.1-0: compilation aborted
``````

I don’t understand the error, and I don’t understand why the message mentions OpenACC even though I never use it.

Can you help me?

Hi doncecchicarloelia,

The error is due to the use of “std::cout”. I/O support on the device is limited and the iostream constructs can’t be used there. If you do need to print from device code, you can use C stdio’s “printf”.

Also, the declarations in the “map” clauses are incorrect. As written, you’d just be creating the pointers for A and Anew on the device, not the data they point to. Instead, you’ll want to use array-section (triplet) notation to define the size of the arrays. Something like:

`#pragma omp target data map(alloc:Anew[:n*m]) map(tofrom:A[:n*m])`

Sorry about the error message. Our OpenMP and OpenACC implementations share common code so sometimes the error messages may say the other model.

Here’s an example of the corrected code:

``````% cat test.cpp
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <cmath>
#include <chrono>
#include <omp.h>

#define N 3000
#define M 3000

int main(int argc, char** argv)
{
int n = N;
int m = M;
double *A = new double[n*m];
double *Anew = new double[n*m];

double error = 1.0;
double tol = 1e-6;
//int device_used= -1;

// Initialize arrays
for (int j = 0; j < n; j++) {
for (int i = 0; i < m; i++) {
A[j*m+i] = 0.0;
Anew[j*m+i] = 0.0;
}
}

// Set boundary conditions
for (int i = 0; i < m; i++) {
A[0*m+i] = 1.0;
Anew[0*m+i] = 1.0;
}

#pragma omp parallel for
for (int j = 1; j < n-1; j++) {
A[j*m+0] = 1.0;
Anew[j*m+0] = 1.0;
for (int i = 1; i < m-1; i++ ) {
A[j*m+i] = 0.0;
Anew[j*m+i] = 0.0;
}
A[j*m+m-1] = 1.0;
Anew[j*m+m-1] = 1.0;
}

int iter = 0;

auto start = std::chrono::high_resolution_clock::now();
#pragma omp target data map(alloc:Anew[:n*m]) map(tofrom:A[:n*m])
while (error > tol && iter < 1000) {
std::cout<<"NON"<<std::endl;
error = 0.0;
#pragma omp target teams distribute parallel for reduction(max:error) map(error)
for (int j = 1; j < n-1; j++) {
//          std::cout<<"NON!"<<std::endl;
//          printf("NON!\n");
for (int i = 1; i < m-1; i++ ) {
Anew[j*m+i] = 0.25 * ( A[j*m+i+1] + A[j*m+i-1]
+ A[(j-1)*m+i] + A[(j+1)*m+i]);
error = fmax( error, fabs(Anew[j*m+i] - A[j*m+i]));
}
}

// swap A and Anew
#pragma omp target teams distribute parallel for
for (int j = 1; j < n-1; j++) {
for (int i = 1; i < m-1; i++ ) {
A[j*m+i] = Anew[j*m+i];
}
}

iter++;
//std::cout << "Iteration " << iter << ", error = " << error << std::endl;
}

auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);

// Check if GPU was used
//if (device_used != -1) {
//  std::cout << "GPU was used." << std::endl;
//}

std::cout << "Execution time: " << duration.count() << "ms" << std::endl;

delete[] A;
delete[] Anew;

return 0;
}
% nvc++ -mp=gpu -gpu=cc80 -Minfo=mp test.cpp ; a.out
main:
1, include "iostream"
39, include "ostream"
38, include "ios"
44, include "basic_ios.h"
37, #omp parallel
56, #omp target teams distribute parallel for
56, Generating "nvkernel_main_F225L56_3" GPU kernel
Loop parallelized across teams and threads(128), schedule(static)
Generating reduction(max:error)
64, #omp target teams distribute parallel for
52, Generating map(tofrom:A[:m*n])
Generating map(alloc:Anew[:m*n])
64, Generating "nvkernel_main_F225L64_5" GPU kernel
68, Loop parallelized across teams and threads(128), schedule(static)
56, Generating map(tofrom:error)
NON
NON
... cut ...
NON
NON
Execution time: 4672ms
``````

Thanks.