Using deviceptr() with structs

Hi,

Usually, I allocate memory on the device directly with cudaMalloc, and from time to time I would like to accelerate loops using OpenACC. For this purpose, I use the deviceptr clause, where I can provide my device pointers. However, if the device pointer is a member of a struct or class, it does not seem to work, even though I directly provide the pointer. E.g., in the example shown below, the first version works but the second one does not. Is there a solution to this?

Many thanks.
Regards,
Reto

#include <cstdio>
#include <iostream>
#include <vector>
#include <chrono>
#include <complex>

#include "accel.h"
#include "openacc.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <cusparse.h>
#include <cusolverDn.h>

void mallocOnDevice(void **devP,int size);
void mallocOnDevice(void **devP,int size)
{
  cudaError_t cudaStatus = cudaMalloc(devP,size);
  if (cudaStatus != cudaSuccess) {
    printf("Memory allocation on Device failed");
  }
}

template <typename T>
void freeOnDevice(T *devP);
template <typename T>
void freeOnDevice(T *devP)
{
  cudaError_t cudaStatus = cudaFree(devP);
  if (cudaStatus != cudaSuccess) {
    printf("Memory de-allocation on Device failed");
  }
}
template <typename T>
void setDataOnDevice(T *hostP,T *devP,int size);
template <typename T>
void setDataOnDevice(T *hostP,T *devP,int size)
{
  cudaError_t cudaStatus = cudaMemcpy(devP,hostP,\
                                      sizeof(T)*size,\
                                      cudaMemcpyHostToDevice);
  if (cudaStatus != cudaSuccess) {
    printf("Set data on Device failed");
  }
}

template<typename T>
// Owning wrapper around a device-resident matrix: `matrix` and the dimension
// scalars NR/NC live in GPU memory; NR_host/NC_host mirror the dimensions on
// the host. The destructor frees all three device allocations.
struct matrix_dev {
  T *matrix;
  int *NR,*NC;
  int NR_host,NC_host;

  // Allocate device storage for a rows x columns matrix and upload the
  // dimensions; the matrix contents are left uninitialized on the device.
  matrix_dev(int rows,int columns){
    int Nelem = rows*columns;
    NR_host = rows;
    NC_host = columns;
    mallocOnDevice((void**)&matrix,sizeof(T)*Nelem);
    mallocOnDevice((void**)&NR,sizeof(int)*1);
    mallocOnDevice((void**)&NC,sizeof(int)*1);
    setDataOnDevice(&NR_host,&NR[0],1);
    setDataOnDevice(&NC_host,&NC[0],1);
  }

  // Allocate as above and additionally upload rows*columns elements from the
  // host buffer `mat`. Delegates to the allocation-only constructor so the
  // allocation logic exists in one place (file already uses C++11).
  matrix_dev(int rows,int columns,T *mat)
    : matrix_dev(rows,columns){
    setDataOnDevice(mat,matrix,rows*columns);
  }

  // This type owns raw device pointers: a default (shallow) copy would make
  // two objects free the same allocations, so copying is forbidden.
  matrix_dev(const matrix_dev&) = delete;
  matrix_dev& operator=(const matrix_dev&) = delete;

  ~matrix_dev(){
    freeOnDevice(matrix);
    freeOnDevice(NR);
    freeOnDevice(NC);
  }
};


int main(int argc, char *argv[])
{

  double timeUsed;
  int size = 5e3;

  // ACC1
  double *dA;
  mallocOnDevice((void**)&dA,sizeof(double)*size*size);
  auto t1 = std::chrono::high_resolution_clock::now();
#pragma acc data deviceptr(dA)
    {
#pragma acc parallel loop independent
      for(int ii=0;ii<size*size;ii++){
        dA[ii] = 1.0;
      }
    }
  auto t2 = std::chrono::high_resolution_clock::now();
  timeUsed = (double) (std::chrono::duration_cast<      \
                       std::chrono::microseconds>(t2-t1).count()/1000.0);
  std::cout<<"Time for acc1: "<<timeUsed<<"ms"<<std::endl;
  freeOnDevice(dA);

 // ACC2
  matrix_dev<double> dB(size,size);
  auto t3 = std::chrono::high_resolution_clock::now();
#pragma acc data deviceptr(dB.matrix)
    {
#pragma acc parallel loop independent
      for(int ii=0;ii<size*size;ii++){
        dB.matrix[ii] = 1.0;
      }
    }

  auto t4 = std::chrono::high_resolution_clock::now();
  timeUsed = (double) (std::chrono::duration_cast<      \
                       std::chrono::microseconds>(t4-t3).count()/1000.0);
  std::cout<<"Time for acc2: "<<timeUsed<<"ms"<<std::endl;

  return 0;

}
  matrix_dev<double> dB(size,size);                   
  auto t3 = std::chrono::high_resolution_clock::now();
#pragma acc data deviceptr(dB.matrix)

This will fail because dB is created on the host-side stack. The
acc directive you’re using says to treat dB.matrix like it is on
the device already, but it’s not.

One alternative is to acc create the data necessary for
computation, and move back the results. Another is to replace
deviceptr with attach. The attach clause will do an implicit
copy of the entire matrix_dev structure and attaches the device
pointer to the copy of the struct on the device. Note that you’ll
see a (relatively) big cost to doing it this way.

Hi,

thanks for the fast reply. I am not sure I fully understand. dB.matrix points to a properly allocated region of device memory (allocated in the matrix_dev constructor via cudaMalloc). In my naive understanding, it should be enough to know where the allocated memory that is to be manipulated can be found on the device, because if I pass dB.matrix to CUDA library functions, e.g. cublasDgemm, it works fine. But it seems OpenACC does not simply take the pointer information; it needs the full struct to be available on the device?

Thanks,
Regards,
Reto

Hi Reto,

In the loop you’re accessing “matrix” through the parent “dB”. Since “dB” is a host struct, it needs to be copied to the device before it can be used. Hence, replacing:

#pragma acc data deviceptr(dB.matrix)

with:

#pragma acc data copyin(dB)

will fix the issue.

When doing a “copyin”, a shallow copy of the struct, including the device pointers, will be made. There is no need to declare “dB.matrix” as a “deviceptr”.

With this change, I can successfully run your program:

% pgc++ -ta=tesla -Minfo test.cpp -Mcuda
main:
     92, Generating Tesla code
         94, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    108, Generating copyin(dB) [if not already present]
         Generating Tesla code
        110, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
% a.out
Time for acc1: 0.789ms
Time for acc2: 34.249ms

Hope this helps,
Mat

Hi Mat,

many thanks for your help. Yes, it works also for me. However, it seems this second version is significantly slower than the first one. Do you know why?

Thanks,
Regards,
Reto

Looking at a quick profile, the kernel times are about the same. The difference is most likely the bit of overhead needed to allocate dB on the device and then copy it. Assuming that in a full application you will reuse dB, this overhead should get amortized.

Note that there’s also a bit of overhead being added to the first call to warm up the device. To fix this, I added a call to “acc_init” before the timer so the warm-up cost isn’t added.

% setenv PGI_ACC_TIME 1
% pgc++ -ta=tesla -fast test.cpp -Mcuda ; a.out
Time for acc1: 0.879ms
Time for acc2: 36.929ms

Accelerator Kernel Timing data
test.cpp
  main  NVIDIA  devicenum=0
    time(us): 497
    93: compute region reached 1 time
        93: kernel launched 1 time
            grid: [65535]  block: [128]
             device time(us): total=248 max=248 min=248 avg=248
            elapsed time(us): total=817 max=817 min=817 avg=817
    109: compute region reached 1 time
        109: kernel launched 1 time
            grid: [65535]  block: [128]
             device time(us): total=230 max=230 min=230 avg=230
            elapsed time(us): total=285 max=285 min=285 avg=285
    109: data region reached 2 times
        109: data copyin transfers: 1
             device time(us): total=19 max=19 min=19 avg=19

Ok, I see.

Thanks.

Regards,
Reto