Hi Andreas,
Unfortunately, I don’t believe we support this yet when using OpenMP Target Offload. However, I have filed an RFE (TPR #31121) and sent it to engineering to see what they can do.
Note that we do have this support in OpenACC, so you may want to consider using it instead, at least until support can be added for OpenMP as well.
Example:
% cat test_compute.h
// Adds 1.0 to each of the first `sze` elements of `Arr`, in place.
void addone(double * Arr, int sze);
% cat test_compute.cpp
#include "test_compute.h"
// Increments each of the first `sze` entries of `Arr` by 1.0, in place.
// The loop carries both an OpenMP target-offload directive and an OpenACC
// directive; the build flags used (-mp=gpu or -acc=gpu) select which model
// is enabled for the compilation.
void addone(double * Arr, int sze) {
#pragma omp target teams distribute parallel for map(tofrom:Arr[0:sze])
#pragma acc parallel loop copy(Arr[0:sze])
    for (int idx = 0; idx < sze; ++idx) {
        Arr[idx] = Arr[idx] + 1.0;
    }
}
% cat test.cpp
#include <iostream>
#include <cstdlib>
#include "test_compute.h"
// Driver: fills a 1024-element array with 0..1023, passes it to addone()
// (linked in from libtest_compute), and prints the first ten results.
int main () {
    int sze = 1024;
    double * Arr = new double[sze];
    for (int i=0; i < sze; ++i) {
        Arr[i] = i;
    }
    addone(Arr,sze);
    for (int i=0; i < 10; ++i) {
        // '\n' instead of std::endl: same output, no flush on every line.
        std::cout << i << ": " << Arr[i] << '\n';
    }
    delete[] Arr;  // release the heap buffer allocated above
    return 0;
}
% nvc++ -g -O3 -fpic -mp=gpu -shared test_compute.cpp -o libtest_compute.so
% export OMP_TARGET_OFFLOAD=MANDATORY
% g++ test.cpp -L. -ltest_compute -o test; ./test
Fatal error: Could not run target region on device 0, execution terminated.
Abort
% nvc++ test.cpp -L. -ltest_compute -o test; ./test
Fatal error: Could not run target region on device 0, execution terminated.
Abort
% nvc++ test.cpp -L. -ltest_compute -o test -mp=gpu; ./test
0: 1
1: 2
2: 3
3: 4
4: 5
5: 6
6: 7
7: 8
8: 9
9: 10
// Ok if using OpenACC:
% nvc++ -g -O3 -fpic -acc=gpu -shared test_compute.cpp -o libtest_compute.so
% export NV_ACC_TIME=1
% g++ test.cpp -L. -ltest_compute -o test ; ./test
0: 1
1: 2
2: 3
3: 4
4: 5
5: 6
6: 7
7: 8
8: 9
9: 10
Accelerator Kernel Timing data
test_compute.cpp
_Z6addonePdi NVIDIA devicenum=0
time(us): 49
3: compute region reached 1 time
3: kernel launched 1 time
grid: [8] block: [128]
device time(us): total=5 max=5 min=5 avg=5
elapsed time(us): total=360 max=360 min=360 avg=360
3: data region reached 2 times
3: data copyin transfers: 1
device time(us): total=23 max=23 min=23 avg=23
8: data copyout transfers: 1
device time(us): total=21 max=21 min=21 avg=21