Yes, this one is a bit tricky to understand.
To back up a bit, the compiler runtime uses a “present” table that includes the host address, the corresponding device address, and the size in bytes of the device memory. A “partially present” error means that the size being looked up is larger than the size found in the present table. Side note: we have a non-standard API call “acc_present_dump”, found in the “accel.h” header, which will show the contents of the present table.
The “this” pointer that’s triggering the error is A_runner’s this, not “a”. A_runner and its member “a” both have the same base host address. However, “A_runner” is 24 bytes (because “aop” is included) while “a” is only 16 bytes. Hence the mismatched sizes and the error.
Parent classes/structs need to create the device data for their members, i.e. data creation is from the top down. So here, you need to create “this” in A_runner’s constructor and then call a’s initDevice to create “x”. The init routine is optional in this case since “x” is public and you could instead add a “create(a.x[:n])” in A_runner’s constructor, but for encapsulation I still prefer using init.
For example:
#include <stdio.h>
#include <cstdlib>
#include <openacc.h>
#include <accel.h>
// Host-side owner of an int array of length n, with helpers that manage the
// matching OpenACC device data for the object and its array.
struct A {
    // Allocates N ints on the host; the elements are left uninitialized.
    A( size_t N ) : n( N ), x( new int[N] ) {}

    // Enters this object and x[0:n] into the device data environment.
    void initDevice() {
        // no-op if the parent 'this' is already created, but keep in case A is used directly elsewhere
        #pragma acc enter data create(this)
        #pragma acc enter data create(x[0:n])
    }

    // Removes x and this object from the device data environment.
    void deleteDevice() {
        #pragma acc exit data delete(x,this)
    }

    // Frees the host array only; device data is not touched here, so
    // deleteDevice() has to be called separately before destruction.
    ~A() {
        // x was allocated with new[], so it must be released with delete[]
        // (scalar delete on an array allocation is undefined behaviour).
        delete[] x;
    }

    size_t n;   // number of elements in x
    int * x;    // host array of n ints, owned by this object
};
// Per-element operation applied to an A.
struct A_operator {
    // Prints element i of a's array followed by a single space.
    void apply( A &a, int i ) {
        const int value = a.x[i];
        printf("%d ", value);
    }
};
// Owns an A together with the operator that is applied to each of its elements.
struct A_runner {
    // Builds the member A with N elements, then sets up device data from the
    // top down: the enclosing object first, the member's own data second.
    A_runner( size_t N ) : a( N ) {
        #pragma acc enter data copyin(this)
        a.initDevice();
    }

    // Tears device data down in the reverse order of construction.
    ~A_runner() {
        a.deleteDevice();
        #pragma acc exit data delete(this)
    }

    // Applies aop to every index of a inside an OpenACC parallel loop,
    // then prints a newline.
    void applyOperator() {
        #pragma acc parallel loop gang present(a)
        for ( int idx = 0; idx < a.n; ++idx ) {
            aop.apply( a, idx );
        }
        printf("\n");
    }

    A a;
    A_operator aop;
};
int main() {
A_runner arun( 10 );
arun.applyOperator();
return 0;
}
And the compile and run:
% nvc++ -acc -Minfo=accel test.cpp; a.out
A::initDevice():
13, Generating enter data create(x[:n],this[:1])
A::deleteDevice():
16, Generating exit data delete(x[:1],this[:1])
A_operator::apply(A&, int):
26, Generating implicit acc routine seq
Generating acc routine seq
Generating NVIDIA GPU code
A_runner::A_runner(unsigned long):
34, Generating enter data copyin(this[:1])
A_runner::~A_runner():
39, Generating exit data delete(this[:1])
A_runner::applyOperator():
41, Generating present(a)
Generating NVIDIA GPU code
43, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
41, Generating implicit create(this[:]) [if not already present]
0 0 0 0 0 0 0 0 0 0
Hope this helps,
Mat