Hello,
I used this post Data Structures in C · fomics/EuroHack15 Wiki · GitHub as a reference to organize manual deep copy to GPU.
Unfortunately, I found out that it works with very large limitations.
It is matrix-matrix multiplication sample.
First of all, I created a class
class matrix
{
public:
int _n;
float * _mat;
matrix(){}
~matrix(){}
matrix(int n, float * mat)
{
_n=n;
_mat=mat;
}
matrix(int n)
{
_mat = (float*)malloc(n * n * sizeof(float));
_n=n;
}
void create(int n)
{
_mat = (float*)malloc(n * n * sizeof(float));
_n=n;
}
#pragma acc routine seq
float * get(){return &_mat[0];}
};
Then I allocate, init and copy objects to the GPU
int n = 1000;
matrix *a,*b,*c;
a=new matrix(n);
b=new matrix(n);
c=new matrix(n);
for (int i = 0; i < n * n; i++)
{
a->get()[i] = (float)rand() / RAND_MAX;
b->get()[i] = (float)rand() / RAND_MAX;
}
acc_copyin( a, sizeof(matrix));
acc_copyin( b, sizeof(matrix));
acc_copyin( c, sizeof(matrix));
acc_copyin( a->_mat, sizeof(float)*n*n);
acc_copyin( b->_mat, sizeof(float)*n*n);
acc_copyin( c->_mat, sizeof(float)*n*n);
Then I call kernels region:
#pragma acc kernels present (a->_mat[0:n*n],b->_mat[0:n*n],c->_mat[0:n*n])
{
#pragma acc loop independent
for (int i = 0; i < n; i++)
{
#pragma acc loop independent
for (int j = 0; j < n; j++)
{
float temp = 0.0f;
#pragma acc loop reduction(+:temp)
for (int k = 0; k < n; k++)
{
temp += a->_mat[i * n + k] * b->_mat[k * n + j];
}
c->_mat[i * n + j] = temp;
}
}
}
and copy data back
acc_copyout( a->get(), sizeof(float)*n*n);
acc_copyout( b->get(), sizeof(float)*n*n);
acc_copyout( c->get(), sizeof(float)*n*n);
In this case everything works fine.
But if replace the pointers to objects with objects, like here:
matrix a(n);
matrix b(n);
matrix c(n);
And replace all “->” calls with “.” changing copyin as follows:
acc_copyin( &a, sizeof(matrix));
acc_copyin( &b, sizeof(matrix));
acc_copyin( &c, sizeof(matrix));
acc_copyin( a._mat, sizeof(float)*n*n);
acc_copyin( b._mat, sizeof(float)*n*n);
acc_copyin( c._mat, sizeof(float)*n*n);
Then it crashes during the kernel execution with
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
call to cuMemFreeHost returned error 700: Illegal address during kernel execution
Moreover, I found out that inside of kernels region I’m unable to make statements like this:
float * temp1 = a->_mat;
temp += temp1[i * n + k] * b->_mat[k * n + j];
or this
float &temp1=c->_mat[i*n+j];
temp1 = temp;
or even call a simple function, returning pointers to a device variables:
#pragma acc routine seq
float * get(){return &_mat[0];}
...
temp += a->get()[i * n + k] * b->get()[k * n + j];
All of this works fine with cpu binary.
Compile options and PGI version:
pgc++ -fast matmul.cpp -o matmul.cpu -std=c++11
pgc++ -acc -Minfo=accel -fast -ta=tesla:cc35 matmul.cpp -o matmul.acc -std=c++11
pgc++ 17.10-0 64-bit target on x86-64 Linux -tp nehalem