Matching C++ classes with openACC's data handling

Hello,
I currently have a C++ project which I want to accelerate using OpenACC. Unfortunately, I am quite fresh to OpenAcc. However, the access of data members crashes the program.
I have reproduced the error with the following code.

The data class:

class Data{ 
private:
        double * ptr_ = nullptr;
        const size_t size_;

public:
        explicit Data(const size_t size) : size_(size){
            ptr_ = new double[size];

            #pragma acc enter data copyin(this[:1])
            #pragma acc enter data create(ptr_[:size])
        }

        double & at(size_t i){
            return ptr_[i];
        }
        
        double & at(size_t i) const {
            return ptr_[i];
        }
	
	size_t size() const {
		return size_;
	}

        #pragma acc routine
        double & operator()(size_t i){
            return ptr_[i];
        }
        
        #pragma acc routine
        double & operator()(size_t i) const{
            return ptr_[i];
        }

        void updateHostToDev(){
            // send the data stored on the host to the device
            #pragma acc update device(ptr_[0:size_])
        }

        ~Data(){
            #pragma acc exit data delete(ptr_[:size_])
            #pragma acc exit data delete(this[:1])

            delete[] ptr_; 
        }
};

As you can see, it is a data class oriented at the data class of the unstructured data chapter in the OpenACC 2.7 dokumentation.

Now the fun part starts. First there is a data collection base class:

class DataCollection{ 
    public:
        Data data1;
        DataCollection(size_t size) : data1(size) {}    
};

It is supposed to store a minimal amount of data the algorithm needs.
Then, of course, there is a derived class:

class DerivedCollection: public DataCollection { 
   public:
        DerivedCollection(size_t size) : DataCollection(size) {}

        void f(){
               for(size_t i = 0; i < this->data1.size(); ++i){
                    this->data1(i) = 42;
               }
               this->data1.updateHostToDev();
        }
};

Which initializes the data in some way with the method

void f()

All this is then stored in a algorithm class as a reference to the derived class.

class Algorithm{ 
    private:
        DerivedCollection & deriClass;
    
    public:
        Algorithm(DerivedCollection & newDat) : deriClass(newDat){
            #pragma acc enter data copyin(this)
            
            deriClass.f();
        }

        void g(size_t dim){
            #pragma acc parallel loop present(deriClass.data1)
            for(size_t i = 0; i < dim ; ++i){
                printf("data1(%i) = %f", i, deriClass.data1(i));
            }
        }
};

Last, everything, is called in the main function:

int main(){
    const size_t size = 1;
    
    DerivedCollection dc(size);

    Algorithm a(dc);

    a.g(size);
    
    return 0;
}

Now to the problem: I am compiling with pgc++ 19.3-0 LLVM 64-bit target on x86-64 Linux -tp haswell

pgc++ -acc =ta:tesla=lineinfo -Minfo openAcc.cpp -o openAcc

When I am running this code (Tesla K80, CUDA Version: 10.1) it crashes at the printf() statement in the method g(size_t dim).

Does anyone have an idea what is going wrong here?

Maybe the following can help:
Minfo output:

Data::Data(unsigned long):
     17, Generating enter data copyin(this[:1])
         Generating enter data create(ptr_[:size])
Data::operator ()(unsigned long):
     32, Generating acc routine seq
         Generating Tesla code
Data::operator ()(unsigned long) const:
     37, Generating acc routine seq
         Generating Tesla code
Data::updateHostToDev():
     44, Generating update device(ptr_[:size_])
Data::~Data():
     50, Generating exit data delete(this[:1],ptr_[:size_])
Algorithm::Algorithm(DerivedCollection &):
     80, Generating enter data copyin(this[:1])
Algorithm::g(unsigned long):
     83, Generating present(deriClass->__b_14DataCollection.data1)
         Generating Tesla code
         85, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     83, Generating implicit copyin(this[:])

PGI_ACC_NOTIFY=16 ./openAcc

create CUDA data  bytes=16 file=/p/home/jusers/rodekamp1/jureca/openAccTest/openAcc.cpp function=_ZN4DataC1Em line=17 device=0 threadid=1
alloc  CUDA data  devaddr=0x2303a5a000 bytes=512 file=/p/home/jusers/rodekamp1/jureca/openAccTest/openAcc.cpp function=_ZN4DataC1Em line=17 device=0 threadid=1
create CUDA data  bytes=8 file=/p/home/jusers/rodekamp1/jureca/openAccTest/openAcc.cpp function=_ZN4DataC1Em line=17 device=0 threadid=1
alloc  CUDA data  devaddr=0x2303a5a200 bytes=512 file=/p/home/jusers/rodekamp1/jureca/openAccTest/openAcc.cpp function=_ZN4DataC1Em line=17 device=0 threadid=1
create CUDA data  bytes=8 file=/p/home/jusers/rodekamp1/jureca/openAccTest/openAcc.cpp function=_ZN9AlgorithmC1ER17DerivedCollection line=80 device=0 threadid=1
alloc  CUDA data  devaddr=0x2303a5a400 bytes=512 file=/p/home/jusers/rodekamp1/jureca/openAccTest/openAcc.cpp function=_ZN9AlgorithmC1ER17DerivedCollection line=80 device=0 threadid=1
Failing in Thread:1
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution

Failing in Thread:1
call to cuMemFreeHost returned error 700: Illegal address during kernel execution

cuda-memcheck ./openAcc

========= CUDA-MEMCHECK
========= Invalid __global__ read of size 8
=========     at 0x00000018 in Data::__operator_()__(unsigned long)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x7ffe69c67590 is out of bounds
=========     Device Frame:Algorithm::g_83_gpu(unsigned long) (Algorithm::g_83_gpu(unsigned long) : 0xe8)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/nvidia/418.87.00/lib64/libcuda.so (cuLaunchKernel + 0x2fe) [0x282a4e]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_launch3 + 0x1d59) [0x1a64a]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so [0x1b392]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_launch + 0x13a) [0x1b4ce]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccg.so (__pgi_uacc_launch + 0x1ff) [0x18ed2]
=========     Host Frame:./openAcc [0x2c19]
=========     Host Frame:./openAcc [0x2535]
=========     Host Frame:/usr/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x22495]
=========     Host Frame:./openAcc [0x2239]
=========
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuStreamSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/nvidia/418.87.00/lib64/libcuda.so (cuStreamSynchronize + 0x165) [0x282525]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_wait + 0x458) [0x1770d]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccg.so (__pgi_uacc_computedone2 + 0x1a2) [0x9a37]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccg.so (__pgi_uacc_computedone + 0x2b) [0x9ce4]
=========     Host Frame:./openAcc [0x2c2e]
=========     Host Frame:./openAcc [0x2535]
=========     Host Frame:/usr/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x22495]
=========     Host Frame:./openAcc [0x2239]
=========
Failing in Thread:1
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuCtxSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/nvidia/418.87.00/lib64/libcuda.so (cuCtxSynchronize + 0x152) [0x259df2]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_error_handler + 0x258) [0xef30]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_wait + 0x472) [0x17727]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccg.so (__pgi_uacc_computedone2 + 0x1a2) [0x9a37]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccg.so (__pgi_uacc_computedone + 0x2b) [0x9ce4]
=========     Host Frame:./openAcc [0x2c2e]
=========     Host Frame:./openAcc [0x2535]
=========     Host Frame:/usr/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x22495]
=========     Host Frame:./openAcc [0x2239]
=========
call to cuStreamSynchronize returned error 719: Launch failed (often invalid pointer dereference)

========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuMemFreeHost. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/nvidia/418.87.00/lib64/libcuda.so (cuMemFreeHost + 0x165) [0x25d8d5]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_free_device_buffers + 0x165) [0x6a64]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_release_buffer + 0x4b) [0x6b1f]
=========     Host Frame:/usr/lib64/libc.so.6 [0x39c29]
=========     Host Frame:/usr/lib64/libc.so.6 [0x39c77]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccg.so (__pgi_uacc_set_emu + 0x0) [0x17be6]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_error_handler + 0x276) [0xef4e]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_wait + 0x472) [0x17727]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccg.so (__pgi_uacc_computedone2 + 0x1a2) [0x9a37]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccg.so (__pgi_uacc_computedone + 0x2b) [0x9ce4]
=========     Host Frame:./openAcc [0x2c2e]
=========     Host Frame:./openAcc [0x2535]
=========     Host Frame:/usr/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x22495]
=========     Host Frame:./openAcc [0x2239]
=========
Failing in Thread:1
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuCtxSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/nvidia/418.87.00/lib64/libcuda.so (cuCtxSynchronize + 0x152) [0x259df2]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_error_handler + 0x258) [0xef30]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_free_device_buffers + 0x187) [0x6a86]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_release_buffer + 0x4b) [0x6b1f]
=========     Host Frame:/usr/lib64/libc.so.6 [0x39c29]
=========     Host Frame:/usr/lib64/libc.so.6 [0x39c77]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccg.so (__pgi_uacc_set_emu + 0x0) [0x17be6]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_error_handler + 0x276) [0xef4e]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccn.so (__pgi_uacc_cuda_wait + 0x472) [0x17727]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccg.so (__pgi_uacc_computedone2 + 0x1a2) [0x9a37]
=========     Host Frame:/usr/local/software/jureca/Stages/2019a/software/PGI/19.3-GCC-8.3.0/linux86-64-llvm/19.3/lib/libaccg.so (__pgi_uacc_computedone + 0x2b) [0x9ce4]
=========     Host Frame:./openAcc [0x2c2e]
=========     Host Frame:./openAcc [0x2535]
=========     Host Frame:/usr/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x22495]
=========     Host Frame:./openAcc [0x2239]
=========
call to cuMemFreeHost returned error 719: Launch failed (often invalid pointer dereference)

========= ERROR SUMMARY: 5 errors

Hi MarcelR,

Try changing the “present” clause to just use the “deriClass” as opposed to “deriClass.data1”.

#pragma acc parallel loop present(deriClass)

With this change, your code works correctly for me.

Hope this helps,
Mat