Is it possible to call functions of a CUDA library inside an OpenACC-parallelized loop?
I have the following example code (file named test2d.c):
#include <iostream>
#include <npp.h>
#include <nppi.h>
#include <cuda_runtime.h>
#include <cstdlib>
// Re-declare two NPP entry points with `#pragma acc routine` so the OpenACC
// compiler treats them as callable from inside an accelerator compute region.
// For an extern prototype the device version of the function is not generated
// here; it has to be supplied by some object or library at device-link time.
// NOTE(review): NPP appears to ship only host-callable entry points (each call
// launches its own kernels), so no device symbol would exist for these two
// functions - that would explain the nvlink "Undefined reference" errors.
// Confirm against the NPP documentation.
#pragma acc routine
extern NppStatus nppiMulC_32f_C1R (const Npp32f *pSrc1, int nSrc1Step, const Npp32f nConstant, Npp32f *pDst, int nDstStep, NppiSize oSizeROI);
#pragma acc routine
extern NppStatus nppiAddC_32f_C1R (const Npp32f *pSrc1, int nSrc1Step, const Npp32f nConstant, Npp32f *pDst, int nDstStep, NppiSize oSizeROI);
int main(int argc, char **argv)
{
Npp32f *x, *y, *tmp;
int n = 10;
int stepX = 0;
int stepY = 0;
int stepTmp = 0;
NppiSize fullSize = {n, n};
NppiSize roiSize = {n, 1};
float *res =(float*) malloc(n*n*sizeof(float));
x = nppiMalloc_32f_C1(n, n, &stepX);
y = nppiMalloc_32f_C1(n, n, &stepY);
tmp = nppiMalloc_32f_C1(n, n, &stepTmp);
nppiSet_32f_C1R(1.0, x, stepX, fullSize);
#pragma acc data deviceptr(x, y, tmp)
{
#pragma acc parallel loop independent
{
for(int j=0; j<n; j++)
{
Npp32f *pSrc = &x[j*stepX];
Npp32f *pTmp = &tmp[j*stepTmp];
Npp32f *pDst = &y[j*stepTmp];
nppiMulC_32f_C1R(pSrc, stepX, 2.0, tmp, stepTmp, roiSize);
nppiAddC_32f_C1R(pTmp, stepTmp, 1.0, pDst, stepY, roiSize);
}
}
}
cudaMemcpy(res, y, n*n*sizeof(float), cudaMemcpyDeviceToHost);
for(int i = 0; i < n*n; i++)
{
std::cout << res[i] << std::endl;
}
nppiFree(x);
nppiFree(y);
nppiFree(tmp);
return 0;
}
These are the compilation steps:
pgc++ -c -acc -Minfo=accel -ta:tesla:cc35 -ICUDA_HOME/include/ -ICUDA_HOME/samples/common/inc/ test2d.c
pgc++ test2d.o -acc -Minfo=accel -ta:tesla:cc35 -ICUDA_HOME/include/ -ICUDA_HOME/samples/common/inc/ -o test -LCUDA_HOME/lib64/ -L/opt/tools/cuda-8.0/lib64/ -lnppc -lnppial -lnppidei -lnppisu -lcudart -Mcuda
It uses NVIDIA Performance Primitives (NPP) inside of an OpenACC-parallelized loop. If I remove the “parallel” directive, it compiles and runs fine with only the “data” directive. However, when the “parallel” directive is added, the NPP functions are no longer resolved during the build: compilation succeeds, but the link step fails with undefined references to them. I am using pgc++ version 18.4 and CUDA version 8. This is the build output, including the error:
pgc++ -c -acc -Minfo=accel -ta:tesla:cc35 -ICUDA_HOME/include/ -ICUDA_HOME/samples/common/inc/ test2d.c
main:
34, Accelerator kernel generated
Generating Tesla code
35, #pragma acc loop gang, vector(10) /* blockIdx.x threadIdx.x */
34, Generating implicit copy(roiSize)
pgc++ test2d.o -acc -Minfo=accel -ta:tesla:cc35 -ICUDA_HOME/include/ -ICUDA_HOME/samples/common/inc/ -o test -LCUDA_HOME/lib64/ -L/opt/tools/cuda-8.0/lib64/ -lnppc -lnppial -lnppidei -lnppisu -lcudart -Mcuda
nvlink error : Undefined reference to 'nppiMulC_32f_C1R' in 'test2d.o'
nvlink error : Undefined reference to 'nppiAddC_32f_C1R' in 'test2d.o'
pgacclnk: child process exit status 2: /opt/compilers/pgi/linux86-64/18.4/bin/pgnvd
make: *** [test] Error 2