Using deviceptr() with structs

Hi,

Usually, I allocate memory on the device directly with cudaMalloc, and from time to time I would like to accelerate loops using OpenACC. For this purpose, I use the deviceptr clause, where I can provide my device pointers. However, if the device pointer is a member of a struct or class, it does not seem to work, even though I directly provide the pointer. E.g., in the example shown below, the first version works but the second one does not. Is there a solution to this?

Many thanks.
Regards,
Reto

#include <cstdio>
#include <iostream>
#include <vector>
#include <chrono>
#include <complex>

#include "accel.h"
#include "openacc.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <cusparse.h>
#include <cusolverDn.h>

void mallocOnDevice(void **devP,int size);
void mallocOnDevice(void **devP,int size)
{
  cudaError_t cudaStatus = cudaMalloc(devP,size);
  if (cudaStatus != cudaSuccess) {
    printf("Memory allocation on Device failed");
  }
}

template <typename T>
void freeOnDevice(T *devP);
template <typename T>
void freeOnDevice(T *devP)
{
  cudaError_t cudaStatus = cudaFree(devP);
  if (cudaStatus != cudaSuccess) {
    printf("Memory de-allocation on Device failed");
  }
}
template <typename T>
void setDataOnDevice(T *hostP,T *devP,int size);
template <typename T>
void setDataOnDevice(T *hostP,T *devP,int size)
{
  cudaError_t cudaStatus = cudaMemcpy(devP,hostP,\
                                      sizeof(T)*size,\
                                      cudaMemcpyHostToDevice);
  if (cudaStatus != cudaSuccess) {
    printf("Set data on Device failed");
  }
}

template<typename T>
// Owning wrapper around a device-resident matrix: `matrix` and the dimension
// scalars NR/NC live in GPU memory; NR_host/NC_host mirror the dimensions on
// the host. The destructor frees all three device allocations.
struct matrix_dev {
  T *matrix;
  int *NR,*NC;
  int NR_host,NC_host;

  // Allocate device storage for a rows x columns matrix and upload the
  // dimensions; the matrix contents are left uninitialized on the device.
  matrix_dev(int rows,int columns){
    int Nelem = rows*columns;
    NR_host = rows;
    NC_host = columns;
    mallocOnDevice((void**)&matrix,sizeof(T)*Nelem);
    mallocOnDevice((void**)&NR,sizeof(int)*1);
    mallocOnDevice((void**)&NC,sizeof(int)*1);
    setDataOnDevice(&NR_host,&NR[0],1);
    setDataOnDevice(&NC_host,&NC[0],1);
  }

  // Allocate as above and additionally upload rows*columns elements from the
  // host buffer `mat`. Delegates to the allocation-only constructor so the
  // allocation logic exists in one place (file already uses C++11).
  matrix_dev(int rows,int columns,T *mat)
    : matrix_dev(rows,columns){
    setDataOnDevice(mat,matrix,rows*columns);
  }

  // This type owns raw device pointers: a default (shallow) copy would make
  // two objects free the same allocations, so copying is forbidden.
  matrix_dev(const matrix_dev&) = delete;
  matrix_dev& operator=(const matrix_dev&) = delete;

  ~matrix_dev(){
    freeOnDevice(matrix);
    freeOnDevice(NR);
    freeOnDevice(NC);
  }
};


int main(int argc, char *argv[])
{

  double timeUsed;
  int size = 5e3;

  // ACC1
  double *dA;
  mallocOnDevice((void**)&dA,sizeof(double)*size*size);
  auto t1 = std::chrono::high_resolution_clock::now();
#pragma acc data deviceptr(dA)
    {
#pragma acc parallel loop independent
      for(int ii=0;ii<size*size;ii++){
        dA[ii] = 1.0;
      }
    }
  auto t2 = std::chrono::high_resolution_clock::now();
  timeUsed = (double) (std::chrono::duration_cast<      \
                       std::chrono::microseconds>(t2-t1).count()/1000.0);
  std::cout<<"Time for acc1: "<<timeUsed<<"ms"<<std::endl;
  freeOnDevice(dA);

 // ACC2
  matrix_dev<double> dB(size,size);
  auto t3 = std::chrono::high_resolution_clock::now();
#pragma acc data deviceptr(dB.matrix)
    {
#pragma acc parallel loop independent
      for(int ii=0;ii<size*size;ii++){
        dB.matrix[ii] = 1.0;
      }
    }

  auto t4 = std::chrono::high_resolution_clock::now();
  timeUsed = (double) (std::chrono::duration_cast<      \
                       std::chrono::microseconds>(t4-t3).count()/1000.0);
  std::cout<<"Time for acc2: "<<timeUsed<<"ms"<<std::endl;

  return 0;

}
  matrix_dev<double> dB(size,size);                   
  auto t3 = std::chrono::high_resolution_clock::now();
#pragma acc data deviceptr(dB.matrix)

This will fail because dB is created on the host-side stack. The
acc directive you’re using says to treat dB.matrix like it is on
the device already, but it’s not.

One alternative is to acc create the data necessary for
computation, and move back the results. Another is to replace
deviceptr with attach. The attach clause will do an implicit
copy of the entire matrix_dev structure and attaches the device
pointer to the copy of the struct on the device. Note that you’ll
see a (relatively) big cost to doing it this way.

Hi,

thanks for the fast reply. I am not sure I fully understand. dB.matrix points to a properly allocated region of device memory (allocated in the matrix_dev constructor via cudaMalloc). In my naive understanding, it should be enough to know where the allocated memory that is to be manipulated can be found on the device, because if I pass dB.matrix to CUDA library functions, e.g. cublasDgemm, it works fine. But it seems OpenACC does not simply take the pointer information; it needs the full struct to be available on the device?

Thanks,
Regards,
Reto

Hi Reto,

In the loop you’re accessing “matrix” through the parent “dB”. Since “dB” is a host struct, it needs to be copied to the device before it can be used. Hence, replacing:

#pragma acc data deviceptr(dB.matrix)

with:

#pragma acc data copyin(dB)

will fix the issue.

When doing a “copyin”, a shallow copy of the struct, including the device pointers, will be made. There is no need to declare “dB.matrix” as a “deviceptr”.

With this change, I can successfully run your program:

% pgc++ -ta=tesla -Minfo test.cpp -Mcuda
main:
     92, Generating Tesla code
         94, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    108, Generating copyin(dB) [if not already present]
         Generating Tesla code
        110, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
% a.out
Time for acc1: 0.789ms
Time for acc2: 34.249ms

Hope this helps,
Mat

Hi Mat,

many thanks for your help. Yes, it works also for me. However, it seems this second version is significantly slower than the first one. Do you know why?

Thanks,
Regards,
Reto

Looking at a quick profile, the kernel times are about the same. The difference is most likely the bit of overhead needed to allocate dB on the device and then copy it. Assuming that in a full application you will reuse dB, this overhead should get amortized.

Note that there’s also a bit of overhead being added to the first call to warm up the device. To fix this, I added a call to “acc_init” before the timer so the warm-up cost isn’t added.

% setenv PGI_ACC_TIME 1
% pgc++ -ta=tesla -fast test.cpp -Mcuda ; a.out
Time for acc1: 0.879ms
Time for acc2: 36.929ms

Accelerator Kernel Timing data
test.cpp
  main  NVIDIA  devicenum=0
    time(us): 497
    93: compute region reached 1 time
        93: kernel launched 1 time
            grid: [65535]  block: [128]
             device time(us): total=248 max=248 min=248 avg=248
            elapsed time(us): total=817 max=817 min=817 avg=817
    109: compute region reached 1 time
        109: kernel launched 1 time
            grid: [65535]  block: [128]
             device time(us): total=230 max=230 min=230 avg=230
            elapsed time(us): total=285 max=285 min=285 avg=285
    109: data region reached 2 times
        109: data copyin transfers: 1
             device time(us): total=19 max=19 min=19 avg=19

Ok, I see.

Thanks.

Regards,
Reto