I am attempting to leverage the nvc++ compiler to produce GPU-accelerated C++ Standard Library calls within a mex file for Matlab. To start with, I built a minimal toy example purely in C++ (no mex/Matlab) that has two modules: one that serves as a main driver, and one that holds the actual processing code. The main driver does memory allocation, then calls the processing code, then exits. Here it is:
test_standalone.cpp: (main driver)
#include <vector>
#include <iostream>
#include <string> // std::stof
size_t VEC_NUM_ELEM = 10;
void test_execute(float *array_in, float *array_out, const size_t vec_size, const float coeff); // processing code
int main(int argc, char **argv)
{
    if (argc != 2)
    {
        std::cout << "Try: " << argv[0] << " <coeff>" << std::endl;
        return -1;
    }

    const float coeff = std::stof(argv[1]);
    std::cout << "Coeff: " << coeff << std::endl;

    float *vec1_array = (float *)malloc(VEC_NUM_ELEM * sizeof(float));
    float *vec2_array = (float *)malloc(VEC_NUM_ELEM * sizeof(float));

    // Initialize the input vector
    for (unsigned i = 0; i < VEC_NUM_ELEM; i++)
    {
        vec1_array[i] = static_cast<float>(i);
    }

    test_execute(vec1_array, vec2_array, VEC_NUM_ELEM, coeff);

    return 0;
} // end main driver
test_execute.cpp: (“processing”/CPU/GPU code):
#include <algorithm> // std::transform
#include <execution> // std::execution::par_unseq
#include <iostream>
#include <cassert>
#include "cuda_runtime.h"
#define assertm(exp, msg) assert(((void)msg, exp))
void test_execute(float *array_in, float *array_out, const size_t vec_size, const float coeff)
{
    cudaPointerAttributes attrib_in;
    cudaError_t cerr = cudaPointerGetAttributes(&attrib_in, array_in);
    assertm(cerr == cudaSuccess, "cudaPointerGetAttributes() failed!");
    std::cout << "array_in memory type: " << attrib_in.type << std::endl;

    std::transform
    (
        std::execution::par_unseq,
        array_in,
        array_in + vec_size,
        array_out,
        [coeff](float a)
        {
            float ret = a + coeff;
            return ret;
        }
    );
} // end processing code test_execute.cpp
Here’s how I am building:
/test/nvhpc_2022_2211_Linux_x86_64_cuda_11.8/install_location/Linux_x86_64/22.11/compilers/bin/nvc++ -stdpar=gpu -c test_execute.cpp
/test/nvhpc_2022_2211_Linux_x86_64_cuda_11.8/install_location/Linux_x86_64/22.11/compilers/bin/nvc++ -stdpar=gpu test_execute.o test_standalone.cpp -o test_standalone.exe -L/test/nvhpc_2022_2211_Linux_x86_64_cuda_11.8/install_location/Linux_x86_64/22.11/compilers/lib -L/test/nvhpc_2022_2211_Linux_x86_64_cuda_11.8/install_location/Linux_x86_64/22.11/cuda/11.8/lib64 -lcudanvhpc -lcudart -lnvc -lnvhpcatm -lpgm -Wl,-rpath=/test/nvhpc_2022_2211_Linux_x86_64_cuda_11.8/install_location/Linux_x86_64/22.11/compilers/lib -Wl,-rpath=/test/nvhpc_2022_2211_Linux_x86_64_cuda_11.8/install_location/Linux_x86_64/22.11/cuda/11.8/lib64
When I run ./test_standalone.exe, I get a memory access error. I suspected this was a memory-type issue, since the CPU path (-stdpar=multicore) works fine, so I added the CUDA calls shown above to query the pointer attributes, and confirmed that the memory being passed into test_execute() is "host unregistered" memory. Next, I copy/pasted the test_execute() function definition into test_standalone.cpp, compiled it all as one file, and everything ran great (the same attribute check now reported managed memory). A rough sketch of that single-file build follows.
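For reference, here is roughly how I built the single-file version that works (one nvc++ invocation doing both compile and link; the combined file name is just illustrative for the copy/pasted source, and the exact flags may have differed slightly):

/test/nvhpc_2022_2211_Linux_x86_64_cuda_11.8/install_location/Linux_x86_64/22.11/compilers/bin/nvc++ -stdpar=gpu test_combined.cpp -o test_combined.exe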
I have two questions:

1. What is the correct way to separately compile modules with -stdpar=gpu functionality using nvc++? I'm familiar with separate compilation and linking from nvcc, but I don't think that should be necessary here, since the device code should be generated and inlined entirely within test_execute().
2. If the separate compilation described in #1 is possible, can I link with a host compiler like g++ (ultimately to use with mex)? I tried switching the linking step above to g++, and it compiled and linked, but when I ran ./test_standalone.exe I got "No CUDA device code available". Being restricted to nvc++ only is incredibly limiting, so I'm hoping there's a way to achieve this. The approximate g++ command I used is shown below.
Thank you!
NV HPC SDK: nvhpc_2022_2211_Linux_x86_64_cuda_11.8
RHEL 7