transfer dynamic array of pointers to class to device memory

Hi,

Is the following code a correct and recommended way to transfer a dynamically allocated array of pointers to class objects to device memory so that it can be used in kernels?

Thanks,
LS

// Material record that owns a heap array of n doubles and can request device
// copies of itself and of that array through OpenACC.
class mymaterial {
public:
  // Public because the accelerator loop that uses this class dereferences
  // mat->b directly from outside the class.
  int n;      // number of entries in b
  double* b;  // host array of n doubles, allocated in the constructor

  // Allocates b with n_ entries and initializes it to b[i] = i.
  mymaterial(int n_){
    n = n_;
    b = new double[n];
    for(int i=0;i<n;++i){
      b[i] = i;
    }
  }

  // Drops the device copies requested by todev() and then frees the host array.
  // NOTE(review): this relies on "exit data delete" being harmless when todev()
  // was never called for this object - confirm with the compiler in use.
  ~mymaterial(){
#pragma acc exit data delete(b[0:n], this[0:1])
    delete [] b;
  }

  // Requests device copies of this object and of its b array; pcopyin only
  // copies data that is not already present on the device.
  void todev(){
#pragma acc enter data pcopyin(this[0:1], b[0:n])
  }

//some other member functions

private:
  // Copying is disabled (declared, never defined): a member-wise copy would
  // share b with the original and free it twice on destruction.
  mymaterial(const mymaterial&);
  mymaterial& operator=(const mymaterial&);
};



// Host side: an array of N pointers, each pointing at a separately allocated
// mymaterial constructed with 10 entries.
mymaterial** materials = new mymaterial*[N];
for(int i=0;i<N;++i){
  materials[i] = new mymaterial(10);
} 

// Request a device copy of the pointer array itself.
// NOTE(review): the copied elements are the host pointer values; the reply in
// this thread explains that the device array must additionally be populated
// with the device pointers of the objects.
#pragma acc enter data pcopyin(materials[0:N])

// Ask every object to request device copies of itself and of its b array.
for(int i=0;i<N;++i){
  materials[i]->todev();
} 

// Accelerator region: double the first 10 entries of b for every material.
#pragma acc kernels present(materials)
{
#pragma acc loop independent
  for(int i=0;i<N;++i){
    // Read through the device copy of the pointer array.
    mymaterial* mat = materials[i];
    // 10 matches the size passed to the constructor above.
    for(int j=0;j<10;++j){
      mat->b[j] *= 2;
    }
  }

}

Hi LS,

The easiest thing to do is use CUDA Unified Memory by adding the flag “-ta=tesla:managed”. This will have the CUDA driver manage the memory for you so you don’t need to manage it yourself. The caveats are that Unified Memory is still considered experimental, only allows you to use as much memory as is available on your device, may be slow (but not always), and is only available for use with dynamic data, so you’d need to make sure “materials” is dynamically allocated in your code.

Hopefully the OpenACC 3.0 standard will add better support for aggregates with dynamic data members, but for now if you manage the data yourself, you’ll need to add a bit more code.

When the class or structure contains fixed size data (i.e. the size is known at compile time), the compiler can add some special case code to handle multi-dimensional arrays in OpenACC data clauses. However, when the class contains dynamic data members, the compiler doesn’t have enough information, so the user needs to manually add code which populates the device array with the device pointers to the class objects.

First, I’d recommend moving the unstructured data region which creates the device copy of the object (the “this” pointer) into the class constructor. For dynamic data members, put their “enter data” directive in the same method where they are allocated. (For “b” this would be in the constructor as well.)

In the main code, when you allocate each element of “materials”, you need to capture the device “this” pointer and then assign it to the corresponding element of the device copy of “materials”. I prefer to capture them into an array and then call “acc_memcpy_device” to update the device copy of “materials”, but you could do it one at a time as well (albeit slower).

You’ll also want to add update routines to the mymaterial class. The compiler won’t know how to perform a deep update of the dynamic data members so you’ll need to add this code yourself.

Hope this helps,
Mat

Note that I modified your code so that the size of each “b” is variable.

% cat test2.cpp
#include <iostream>
#ifdef _OPENACC
#include <openacc.h>
#endif
#ifndef N
#define N 20
#endif 

using namespace std;

// Material record that owns a heap array of n doubles. Its constructor
// requests the device copies of the object and of the array, and its
// destructor removes them again, so the device data follows the object's
// lifetime.
class mymaterial {

public:
  int n;      // number of entries in b
  double* b;  // host array of n doubles; the device copy covers b[0:n]

  // Allocates b with n_ entries, initializes it to b[i] = i, then requests
  // device copies of this object and of b (the enter data sits in the same
  // method that allocates the dynamic member).
  mymaterial(int n_){
    n = n_;
    b = new double[n];
    for(int i=0;i<n;++i){
      b[i] = i;
    }
    #pragma acc enter data pcopyin(this, b[0:n])
  }

  // Removes the device copies first, then frees the host array.
  ~mymaterial() {
     #pragma acc exit data delete(b[0:n], this)
     delete [] b;
  }


#ifdef _OPENACC
  // Deep update, device -> host: the compiler cannot do this for a dynamic
  // member on its own, so the array section is named explicitly.
  void update_self() {
#pragma acc update self(b[0:n])
  }
  // Deep update, host -> device, for the same array section.
  void update_device() {
#pragma acc update device(b[0:n])
  }
#endif

private:
  // Copying is disabled (declared, never defined): a member-wise copy would
  // share b with the original, so both destructors would free the same host
  // array and delete the same device data.
  mymaterial(const mymaterial&);
  mymaterial& operator=(const mymaterial&);

};


int main () {

mymaterial** materials = new mymaterial*[N];
#ifdef _OPENACC
size_t devptr[N];
#endif

for(int i=0;i<N;++i){
  materials[i] = new mymaterial(i+1);
#ifdef _OPENACC
  devptr[i] = (size_t) acc_deviceptr(materials[i]);
#endif
}

#ifdef _OPENACC
#pragma acc enter data create(materials[0:N][0:1]) copyin(devptr[0:N])
acc_memcpy_device(acc_deviceptr(materials),acc_deviceptr(devptr),sizeof(size_t)*N);
#endif

#pragma acc kernels loop independent present(materials)
  for(int i=0;i<N;++i){
    mymaterial* mat = materials[i];
    for(int j=0;j<mat->n;++j){
      mat->b[j] *= 2;
    }
  }

#ifdef _OPENACC
  for(int i=0;i<N;++i){
    materials[i]->update_self();
  }
#endif

  for(int i=0;i<N;++i){
    for(int j=0;j<materials[i]->n;++j){
     cout << materials[i]->b[j] << ",";
    }
    cout << endl;
    delete materials[i];
  }
#pragma acc exit data delete(materials[0:N][0:1],devptr)

}

% pgc++ test2.cpp -acc -Minfo=accel; a.out
main:
     57, Generating enter data copyin(devptr[:])
         Generating enter data create(materials[:20][:1])
         Generating present(materials[:][:])
     61, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         61, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     63, Loop is parallelizable
     83, Generating exit data delete(devptr[:],materials[:20][:1])
mymaterial::mymaterial(int):
     21, Generating enter data copyin(b[:n],this[:1])
mymaterial::~mymaterial():
     25, Generating exit data delete(this[:1],b[:n])
mymaterial::update_self():
     32, Generating update self(b[:n])
0,
0,2,
0,2,4,
0,2,4,6,
0,2,4,6,8,
0,2,4,6,8,10,
0,2,4,6,8,10,12,
0,2,4,6,8,10,12,14,
0,2,4,6,8,10,12,14,16,
0,2,4,6,8,10,12,14,16,18,
0,2,4,6,8,10,12,14,16,18,20,
0,2,4,6,8,10,12,14,16,18,20,22,
0,2,4,6,8,10,12,14,16,18,20,22,24,
0,2,4,6,8,10,12,14,16,18,20,22,24,26,
0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,
0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,
0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,
0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,
0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,
0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,

Hi Mat,

Many thanks for the good explanation. I was able to apply it to my code and it works well.

Cheers,
LS