Understanding how to work with derived C++ classes in OpenACC?

Hello everyone,
I’m trying to implement some plasma physics codes on GPUs, and I have run into the following problem:

I have two classes, base and derived. The derived class contains an additional pointer that points to a member array of the base class. The code compiles without any problem, but the following error

FATAL ERROR: variable in data clause is partially present on the device: name=fderived

appears at runtime. I understand why it occurs: it is caused by the member vals1D_ of the derived class (see the code below). My question is how to correctly create vals1D_ on the device, or attach it to vals_ of the base class. In other words, I want the pointer vals1D_ of the derived class to point to vals_ of the base class on the GPU. I have tried several approaches, and all of them led to the error above.

Here is the code:

#include <iostream>
#include <vector>
#include <iomanip>


template <typename U, typename T>
class base 
{
    public:
        U size_;
        T *vals_;

    base(U size): size_{size}
    {
        vals_ = new T [size_];
        #ifdef _OPENACC 
            #pragma acc enter data copyin(this)
            #pragma acc enter data create(vals_[0:size_])
        #endif
    }

    /* Update self */
    void updateSelf()
    {
        #ifdef _OPENACC 
            #pragma acc update self(vals_[0:size_])
        #endif
    }

    /* Update device */
    void updateDevice()
    {
        #ifdef _OPENACC 
            #pragma acc update device(vals_[0:size_])
        #endif
    }
    
    #ifdef _OPENACC
        #pragma acc routine seq
    #endif 
    inline T& operator() (U i)
    {
        return vals_[i];
    }
    /* Destructor */
    ~base()
    {
        #ifdef _OPENACC 
            #pragma acc exit data delete(vals_[0:size_])
        #endif
        if(vals_ != nullptr)
            delete [] vals_;
    }
};

template <typename U, typename T>
class derived : public base<U, T> 
{
    public: 
    T *vals1D_;
    derived(U size) : base<U, T>(size)
    {   
        vals1D_ = base<U, T>::vals_;

        #ifdef _OPENACC 
        
        #endif
    }
    ~derived()
    {
        
    }
};
int main()
{
    unsigned int s = 10;
    derived<unsigned int, double> fderived(s);

    for(unsigned int i = 0; i < s; i++)
        fderived.vals_[i] = i * i;
    
    fderived.updateDevice();

    
    #pragma acc parallel loop present(fderived)
    for(unsigned int i = 0; i < s; i++)
        fderived.vals_[i] = 5;

    return 0;
}

Hi feynman.physik,

You’ll want to move the “enter data” directives from the base class’s constructor to the derived class’s constructor. The problem is that when copyin(this) is placed in the base class, “this” has the base type, so only the base portion of the object is copied to the device. The full derived object is larger, so when it is later referenced in a data clause it is only partially present on the device, hence the “partially present” error.

This may be a good case, especially as your class structure becomes more complex, to use CUDA Unified Memory, i.e. add the flag “-gpu=managed”. You’ll still need to put “fderived” itself in a data region, but all of the other update and data directives can be removed. The caveat is that UM support is an nvhpc feature and may or may not be available in other compilers.

Original example with the fixed data directives:

% cat test.cpp
#include <iostream>
#include <vector>
#include <iomanip>


template <typename U, typename T>
class base
{
    public:
        U size_;
        T *vals_;

    base(U size): size_{size}
    {
        vals_ = new T [size_];
        #ifdef _OPENACC
        #endif
    }

    /* Update self */
    void updateSelf()
    {
        #ifdef _OPENACC
            #pragma acc update self(vals_[0:size_])
        #endif
    }

    /* Update device */
    void updateDevice()
    {
        #ifdef _OPENACC
            #pragma acc update device(vals_[0:size_])
        #endif
    }

    #ifdef _OPENACC
        #pragma acc routine seq
    #endif
    inline T& operator() (U i)
    {
        return vals_[i];
    }
    /* Destructor */
    ~base()
    {
        #ifdef _OPENACC
            #pragma acc exit data delete(vals_[0:size_])
        #endif
        if(vals_ != nullptr)
            delete [] vals_;
    }
};

template <typename U, typename T>
class derived : public base<U, T>
{
    public:
    T *vals1D_;
    derived(U size) : base<U, T>(size)
    {
        vals1D_ = base<U, T>::vals_;
        #ifdef _OPENACC
            #pragma acc enter data copyin(this)
            #pragma acc enter data create(vals_[0:size_])
        #endif


    }
    ~derived()
    {

    }
};
int main()
{
    unsigned int s = 10;
    derived<unsigned int, double> fderived(s);

    for(unsigned int i = 0; i < s; i++)
        fderived.vals_[i] = i * i;

    fderived.updateDevice();


    #pragma acc parallel loop present(fderived)
    for(unsigned int i = 0; i < s; i++)
        fderived.vals_[i] = 5;

    return 0;
}
% nvc++ -Minfo=accel -acc  test.cpp ; a.out
main:
     82, Generating present(fderived)
         Generating implicit firstprivate(s)
         Generating NVIDIA GPU code
         86, #pragma acc loop gang /* blockIdx.x */
base<unsigned int, double>::updateDevice():
     34, Generating update device(vals_[:size_])
base<unsigned int, double>::~base():
     49, Generating exit data delete(vals_[:size_])
derived<unsigned int, double>::derived(unsigned int):
     68, Generating enter data copyin(this[:1])
         Generating enter data create(__b_4baseIjdE.vals_[:__b_4baseIjdE.size_])
%

The pared-down version using CUDA Unified Memory:

% cat test2.cpp
#include <iostream>
#include <vector>
#include <iomanip>


template <typename U, typename T>
class base
{
    public:
        U size_;
        T *vals_;

    base(U size): size_{size}
    {
        vals_ = new T [size_];
    }

    #ifdef _OPENACC
        #pragma acc routine seq
    #endif
    inline T& operator() (U i)
    {
        return vals_[i];
    }
    /* Destructor */
    ~base()
    {
        if(vals_ != nullptr)
            delete [] vals_;
    }
};

template <typename U, typename T>
class derived : public base<U, T>
{
    public:
    T *vals1D_;
    derived(U size) : base<U, T>(size)
    {
        vals1D_ = base<U, T>::vals_;
    }
    ~derived()
    {

    }
};
int main()
{
    unsigned int s = 10;
    derived<unsigned int, double> fderived(s);

    for(unsigned int i = 0; i < s; i++)
        fderived.vals_[i] = i * i;

    #pragma acc enter data copyin(fderived)

    #pragma acc parallel loop present(fderived)
    for(unsigned int i = 0; i < s; i++)
        fderived.vals_[i] = 5;

    #pragma acc exit data delete(fderived)
    return 0;
}
% nvc++ -Minfo=accel -acc -gpu=managed test2.cpp ; a.out
main:
     53, Generating enter data copyin(fderived)
         Generating present(fderived)
         Generating implicit firstprivate(s)
         Generating NVIDIA GPU code
         58, #pragma acc loop gang /* blockIdx.x */
     59, Generating exit data delete(fderived)
%

Hope this helps,
Mat


Thank you, Mat, for your very helpful answer and explanation.

I have a question about the pointer vals1D_: is it copied to the device when the “enter data” directives are moved to the derived class, so that it points to the vals_ array on the device?

And is it true that using unified memory in this case slows down the code (worse performance)?

Thanks,

I believe that since they have the same host address, they’ll map to the same device address.
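
If you want to make the association explicit rather than rely on that, one option (just a sketch based on the OpenACC “attach” clause; I haven’t run it against your code) is to attach vals1D_ in the derived constructor once the array is present on the device:

derived(U size) : base<U, T>(size)
{
    vals1D_ = base<U, T>::vals_;
    #ifdef _OPENACC
        #pragma acc enter data copyin(this)
        #pragma acc enter data create(vals_[0:size_])
        // attach updates the device copy of vals1D_ so it points at the
        // device allocation that was just created for vals_
        #pragma acc enter data attach(vals1D_)
    #endif
}

Since vals1D_ and vals_ hold the same host address, attach should resolve to the same device allocation either way.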

And is it true that using unified memory in this case slows down the code (worse performance)?

It can, but usually only when there is a lot of data movement back and forth between the host and the device. Managed memory is paged, which can be a bit slower than directly copying a large block of memory. For most cases, though, UM gives about the same performance as manually managing the data movement.

The main caveats are that it currently can only be used with dynamically allocated memory and is not performant when used with CUDA-aware MPI.
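
For example (an illustrative sketch, not code from your program), only the dynamically allocated member below would end up in managed memory; a fixed-size array member of a stack object would still need explicit data directives:

template <typename T>
struct holder {
    T  fixed_[16];  // lives wherever the object lives; for a stack object this
                    // is not managed and still needs copyin/update directives
    T *dynamic_;    // new[] goes through the managed allocator when compiled
                    // with -gpu=managed
    holder()  { dynamic_ = new T[16]; }
    ~holder() { delete [] dynamic_; }
};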

