Hi,
I have noticed a problem that I believe could be a compiler bug. OpenMP GPU offloading fails when the mapped pointers are members of a class. If I instead copy them into local pointers inside the member function that contains the `omp target` region,
the error does not occur.
Here is a minimal example to reproduce the issue:
#include <chrono>
#include <cstring>
#include <iostream>
#include <random>
/// Minimal reproducer for OpenMP target offloading of buffers owned by a class.
///
/// The object owns three float buffers A, B and C of N floats each. Every
/// buffer consists of `num_elements` consecutive 32x32 tiles; all tiles of A
/// hold the same random values, likewise for B, and C starts out as zeros.
/// `run()` offloads the element-wise update C[i] += A[i] * B[i] to the device.
class CL {
    /// Number of floats in one 32x32 tile.
    static constexpr std::size_t kTileSize = 32 * 32;

public:
    float* A;  ///< First input buffer, N floats, owned by this object.
    float* B;  ///< Second input buffer, N floats, owned by this object.
    float* C;  ///< Accumulator buffer, N floats, owned by this object.
    const std::size_t num_elements = 100000;         ///< Number of tiles per buffer.
    const std::size_t N = kTileSize * num_elements;  ///< Total floats per buffer.

    /// Builds the reproducer with the original problem size (100000 tiles).
    CL() : CL(100000) {}

    /// Builds the reproducer with a caller-chosen number of tiles.
    ///
    /// @param elements Number of 32x32 tiles per buffer; 0 yields empty buffers.
    explicit CL(std::size_t elements) : num_elements(elements) {
        // N is initialised from num_elements before the body runs because it is
        // declared after it.
        A = new float[N];
        B = new float[N];
        C = new float[N];

        float CoreA[kTileSize]{0.f};
        float CoreB[kTileSize]{0.f};
        float CoreC[kTileSize]{0.f};

        // Default-seeded engine: the generated tile is the same on every run.
        std::default_random_engine generator;
        std::uniform_real_distribution<float> distribution(1.0, 15.0);
        for (std::size_t i = 0; i < kTileSize; ++i) {
            // Fill the template tiles, not A/B directly: the replication loop
            // below overwrites every tile of A and B, including the first one.
            CoreA[i] = distribution(generator);
            CoreB[i] = distribution(generator);
        }

        // Copy the element matrices num_elements times into the element buffers.
        for (std::size_t tile = 0; tile < num_elements; ++tile) {
            const std::size_t offset = kTileSize * tile;
            std::memcpy(&A[offset], &CoreA[0], kTileSize * sizeof(float));
            std::memcpy(&B[offset], &CoreB[0], kTileSize * sizeof(float));
            std::memcpy(&C[offset], &CoreC[0], kTileSize * sizeof(float));
        }
    }

    // The buffers are owned through raw pointers, so a member-wise copy would
    // lead to a double delete[] in the destructor.
    CL(const CL&) = delete;
    CL& operator=(const CL&) = delete;

    ~CL() {
        delete[] A;
        delete[] B;
        delete[] C;
    }

    /// Runs C[i] += A[i] * B[i] fifty times on device 0 and prints the elapsed
    /// wall-clock time (transfers included) in milliseconds to std::cout.
    void run() {
        /*
        Using the members A, B and C directly in the map clauses and in the
        target region fails with

        Failing in Thread:1
        Accelerator Fatal Error: call to cuMemcpyDtoHAsync returned error 700: Illegal address during kernel execution
        File: /home/primrose/Work/Peano/omptest.cpp
        Function: _ZN2CL3runEv:47
        Line: 60

        whereas going through the local copies below works.
        NOTE(review): inside a member function `A` means `this->A`; presumably
        the failure is related to how the implicit `this` is mapped - confirm
        against the OpenMP specification and the compiler documentation.
        */
        float* _A = A;
        float* _B = B;
        float* _C = C;
        // Local copy of the length so the target constructs do not refer to
        // any member of *this.
        const std::size_t count = N;

        auto start = std::chrono::high_resolution_clock::now();
        for (int d = 0; d < 1; d++) {
            // C is mapped `to` as well because the kernel reads it (+=).
#pragma omp target enter data map(to : _A[0 : count], _B[0 : count], _C[0 : count]) device(d)
            for (int j = 0; j < 50; j++) {
#pragma omp target teams distribute parallel for device(d)
                for (std::size_t i = 0; i < count; ++i) {
                    _C[i] += _A[i] * _B[i];
                }
            }
#pragma omp target exit data map(from : _A[0 : count], _B[0 : count], _C[0 : count]) device(d)
        }
        auto stop = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
        std::cout << "Time taken: " << duration.count() << " milliseconds" << std::endl;
    }
};
int main() {
CL cl;
cl.run();
return 0;
}
If I remove the lines `float* _A = A; float* _B = B; float* _C = C;` and replace every occurrence of `_A` with `A`, `_B` with `B`, and `_C` with `C` (i.e. use the class members directly in the map clauses and in the target region), then I get the following error:
/* Fails with
Failing in Thread:1
Accelerator Fatal Error: call to cuMemcpyDtoHAsync returned error 700: Illegal address during kernel execution
File: /home/primrose/Work/Peano/omptest.cpp
Function: _ZN2CL3runEv:47
Line: 60
*/
Could you tell me whether this is a compiler bug, or whether I am doing something that is not allowed (or is undefined behaviour) according to the OpenMP standard?