Hello, I am trying to write a MEX function calculating the sinc() function of a matrix using C++ and OpenACC.
This is my first shoot at OpenACC, and I meet some problems since It seems to be hard for OpenACC to call the usual sin() function. How do I solve this? I have tried googling and searching this forum for advices without any particular luck, so I chose to post a new post.
The main problem of interest in this post is how I may use a sine function without generatig an error stating:
"Accelerator restriction: function/procedure calls are not supported
Loop not vectorized/parallelized: contains call "
Also I wonder if it is a big problem that the compiler is not given the sice of the matrix at compile time. A mystical message in this regard is:
" Loop without integer trip count will be executed in sequential mode
Accelerator region ignored"
Complete code:
//newSinc.cpp
#include "mex.h"
#include "matrix.h"
#include "math.h"
#include <complex>
using namespace std;
#define ompWORKFLOW guided //Open MP schedule() setting -affects all calls
#ifdef _OPENMP
#include "omp.h"
#else
#define omp_get_thread_num() 0
#endif
#ifdef _OPENACC
#include "openacc.h"
#endif
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]){
/*The C++ MEX function "newSinc(x)" takes a matrix as input argument and computes the sinc(x)
*value of any input element sinc(x.). */
//Validation of inputs:
if (nlhs!=1){
mexErrMsgIdAndTxt("Toolbox:complexSinc:nlhs","only one output is allowed");
};
if (nrhs!=1){
mexErrMsgIdAndTxt("Toolbox:complexSinc:nrhs","only one input matrix is allowed");
};
//Declaration of variables
const double pi = 3.1415926535897932384626433832795028841971693993751;
const mxArray * __restrict__ input=prhs[0];
complex<double> argument;
complex<double> I=complex<double>(0.0,1.0);
double * __restrict__ xInRe=NULL;
double * __restrict__ xInIm=NULL;
double * __restrict__ xOutRe=NULL;
double * __restrict__ xOutIm=NULL;
bool IsComplex=mxIsComplex(input);
//Extracting dimmensions from the input Matrix:
mwSize ndim;
const mwSize *dims;
dims=mxGetDimensions(input);
ndim=mxGetNumberOfDimensions(input);
const mwSize totalMatrixSize=mxGetNumberOfElements(input);
//Creating output Matrix:
mxClassID classid=mxDOUBLE_CLASS;
mxComplexity ComplexFlag;
if (IsComplex){
ComplexFlag=mxCOMPLEX;
}else{
ComplexFlag=mxREAL;
};
plhs[0]=mxCreateNumericArray(ndim, dims, classid,ComplexFlag);
//Directing pointers;
xInRe=mxGetPr(input);
xOutRe=mxGetPr(plhs[0]);
if (IsComplex){
xInIm=mxGetPi(input);
xOutIm=mxGetPi(plhs[0]);
};
//Performing calculations on every element in the Matrix:
#pragma omp parallel for private(argument) shared(pi,xInRe,xInIm,xOutRe,xOutIm) schedule(ompWORKFLOW) //openMP call
#pragma acc enter data copyin(pi, totalMatrixSize, xInRe[0:totalMatrixSize], xInIm[0:totalMatrixSize]) create(argument)
if (IsComplex){
#pragma acc parallel loop private(argument)
for (int i=0;i<totalMatrixSize;i++){
if((xInRe[i]==0)&&(xInIm[i]==0)){
xOutRe[i]=1;//if x=0, sinc(x)=1.
xOutIm[i]=0;
}else{
argument=(xInRe[i]+xInIm[i]*I)*pi;
argument=sin(argument)/argument;
xOutRe[i]=real(argument);
xOutIm[i]=imag(argument);
};
};
}else{
#pragma acc parallel loop
for (int i=0;i<totalMatrixSize;i++){
if(xInRe[i]==0){
xOutRe[i]=1;//if x=0, sinc(x)=1.
}else{
xOutRe[i]=sin(pi*xInRe[i])/(pi*xInRe[i]);
};
};
};
#pragma acc exit data copyout(xOutRe[0:totalMatrixSize], xOutIm[0:totalMatrixSize])
return;
};
-Minfo output is:
mex ACC_sinc_source.cpp
PGCC-S-0155-Accelerator region ignored; see -Minfo messages (/opt/pgi/linux86-64/14.3/include/CC/stl/_complex.h: 71)
mexFunction:
37, std::complex::complex(double, double) [with T1=double] inlined, size=3 (inline) file ACC_sinc_source.cpp (348)
38, std::complex::complex(double, double) [with T1=double] inlined, size=3 (inline) file ACC_sinc_source.cpp (348)
71, Generating enter data create(argument)
Generating enter data copyin(xInIm[:totalMatrixSize])
Generating enter data copyin(xInRe[:totalMatrixSize])
Generating enter data copyin(totalMatrixSize)
Generating enter data copyin(pi)
Loop without integer trip count will be executed in sequential mode
Accelerator region ignored
73, Accelerator restriction: size of the GPU copy of ‘xOutIm’ is unknown
Accelerator restriction: size of the GPU copy of ‘xOutRe’ is unknown
Accelerator restriction: size of the GPU copy of ‘xInIm’ is unknown
Accelerator restriction: size of the GPU copy of ‘xInRe’ is unknown
Accelerator restriction: function/procedure calls are not supported
Loop not vectorized/parallelized: contains call
78, std::complex std::operator *(const T1 &, const std::complex &) [with T1=double] inlined, size=4 (inline) file ACC_sinc_source.cpp (675)
676, std::complex::complex(double, double) [with T1=double] inlined, size=3, file ACC_sinc_source.cpp (348)
78, std::complex std::operator +(const T1 &, const std::complex &) [with T1=double] inlined, size=4 (inline) file ACC_sinc_source.cpp (655)
656, std::complex::complex(double, double) [with T1=double] inlined, size=3, file ACC_sinc_source.cpp (348)
78, std::complex std::operator (const std::complex &, const T1 &) [with T1=double] inlined, size=4 (inline) file ACC_sinc_source.cpp (680)
681, std::complex::complex(double, double) [with T1=double] inlined, size=3, file ACC_sinc_source.cpp (348)
78, std::complex::operator =(const std::complex &) [with T1=double] inlined, size=3 (inline) file ACC_sinc_source.cpp (439)
79, std::complex std::operator /(const std::complex &, const std::complex &) [with T1=double] inlined, size=6 (inline) file ACC_sinc_source.cpp (721)
722, std::complex::complex(double, double) [with T1=double] inlined, size=3, file ACC_sinc_source.cpp (348)
79, std::complex::operator =(const std::complex &) [with T1=double] inlined, size=3 (inline) file ACC_sinc_source.cpp (439)
79, Accelerator restriction: unsupported call to ‘sin__3stdFRCQ2_3std16complex__tm__2_d’
80, T1 std::real(const std::complex &) [with T1=double] inlined, size=2 (inline) file ACC_sinc_source.cpp (768)
81, T1 std::imag(const std::complex &) [with T1=double] inlined, size=2 (inline) file ACC_sinc_source.cpp (773)
84, Accelerator kernel generated
86, #pragma acc loop gang, vector(256) / blockIdx.x threadIdx.x */
84, Generating present_or_copyin(xInRe[:totalMatrixSize])
Generating present_or_copyin(xOutRe[:totalMatrixSize])
95, Generating exit data copyout(xOutIm[:totalMatrixSize])
Generating exit data copyout(xOutRe[:totalMatrixSize])
PGCC/x86 Linux 14.3-0: compilation completed with severe errorsmex: compile of ’ “ACC_sinc_source.cpp”’ failed.
My compile line is:
/opt/pgi/linux86-64/14.3/bin/pgcpp -c -I/usr/local/MATLAB/R2013a/extern/include -I/usr/local/MATLAB/R2013a/simulink/include -DMATLAB_MEX_FILE -Mnoframe -fPIC -largeArrayDims -acc -ta=nvidia -Mcuda=fastmath -Minfo=all -Minline -fast -silent -use_fast_math -DMX_COMPAT_32 -O3 -DNDEBUG “ACC_sinc_source.cpp”