Hi,
Usually, I allocate memory on the device using the directly cudaMalloc,… and from time to time I would like to accelerate loops utilizing open acc. For this purpose, I use the deviceptr clause where I can provide my device pointers. However, if the device pointer is defined inside a struct or class it seems it does not work even though I directly provide the pointer. E.g. in the example shown below the first version works but not the second one. Is there a solution to that?
Many thanks.
Regards,
Reto
#include <cstdio>
#include <iostream>
#include <vector>
#include <chrono>
#include <complex>
#include "accel.h"
#include "openacc.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <cusparse.h>
#include <cusolverDn.h>
void mallocOnDevice(void **devP,int size);
void mallocOnDevice(void **devP,int size)
{
cudaError_t cudaStatus = cudaMalloc(devP,size);
if (cudaStatus != cudaSuccess) {
printf("Memory allocation on Device failed");
}
}
template <typename T>
void freeOnDevice(T *devP);
template <typename T>
void freeOnDevice(T *devP)
{
cudaError_t cudaStatus = cudaFree(devP);
if (cudaStatus != cudaSuccess) {
printf("Memory de-allocation on Device failed");
}
}
template <typename T>
void setDataOnDevice(T *hostP,T *devP,int size);
template <typename T>
void setDataOnDevice(T *hostP,T *devP,int size)
{
cudaError_t cudaStatus = cudaMemcpy(devP,hostP,\
sizeof(T)*size,\
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
printf("Set data on Device failed");
}
}
template<typename T>
struct matrix_dev {
T *matrix;
int *NR,*NC;
int NR_host,NC_host;
matrix_dev(int rows,int columns,T *mat){
int Nelem = rows*columns;
NR_host = rows;
NC_host = columns;
mallocOnDevice((void**)&matrix,sizeof(T)*Nelem);
mallocOnDevice((void**)&NR,sizeof(int)*1);
mallocOnDevice((void**)&NC,sizeof(int)*1);
setDataOnDevice(mat,matrix,Nelem);
setDataOnDevice(&NR_host,&NR[0],1);
setDataOnDevice(&NC_host,&NC[0],1);
}
matrix_dev(int rows,int columns){
int Nelem = rows*columns;
NR_host = rows;
NC_host = columns;
mallocOnDevice((void**)&matrix,sizeof(T)*Nelem);
mallocOnDevice((void**)&NR,sizeof(int)*1);
mallocOnDevice((void**)&NC,sizeof(int)*1);
setDataOnDevice(&NR_host,&NR[0],1);
setDataOnDevice(&NC_host,&NC[0],1);
}
~matrix_dev(){
freeOnDevice(matrix);
freeOnDevice(NR);
freeOnDevice(NC);
}
};
int main(int argc, char *argv[])
{
double timeUsed;
int size = 5e3;
// ACC1
double *dA;
mallocOnDevice((void**)&dA,sizeof(double)*size*size);
auto t1 = std::chrono::high_resolution_clock::now();
#pragma acc data deviceptr(dA)
{
#pragma acc parallel loop independent
for(int ii=0;ii<size*size;ii++){
dA[ii] = 1.0;
}
}
auto t2 = std::chrono::high_resolution_clock::now();
timeUsed = (double) (std::chrono::duration_cast< \
std::chrono::microseconds>(t2-t1).count()/1000.0);
std::cout<<"Time for acc1: "<<timeUsed<<"ms"<<std::endl;
freeOnDevice(dA);
// ACC2
matrix_dev<double> dB(size,size);
auto t3 = std::chrono::high_resolution_clock::now();
#pragma acc data deviceptr(dB.matrix)
{
#pragma acc parallel loop independent
for(int ii=0;ii<size*size;ii++){
dB.matrix[ii] = 1.0;
}
}
auto t4 = std::chrono::high_resolution_clock::now();
timeUsed = (double) (std::chrono::duration_cast< \
std::chrono::microseconds>(t4-t3).count()/1000.0);
std::cout<<"Time for acc2: "<<timeUsed<<"ms"<<std::endl;
return 0;
}