Hi,
I want to integrate cuLibrary API in my multi-GPU application, however I am not sure if I use it right. Here is a simple single-GPU example of the way I want to use it. Compiled & run with CUDA 12.2.1 and GCC 12.2.0. To run it on your machine, you may change the target architecture.
#include <algorithm>
#include <iostream>
#include <numeric>
#include <string>
#include <string_view>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>
// CUDA Driver API error checking macro
#define CUDA_CALL(call) \
do { \
CUresult result = (call); \
if (result != CUDA_SUCCESS) { \
const char* errName; \
const char* errMsg; \
cuGetErrorName(result, &errName); \
cuGetErrorString(result, &errMsg); \
reportErrorAndExit("CUDA", result, errName, errMsg, __LINE__); \
} \
} while (0)
// CUDA Runtime API error checking macro
#define CUDA_RT_CALL(call) \
do { \
cudaError_t result = (call); \
if (result != cudaSuccess) { \
reportErrorAndExit("CUDA RT", \
result, \
cudaGetErrorName(result), \
cudaGetErrorString(result), \
__LINE__); \
} \
} while (0)
// NVRTC error checking macro
#define NVRTC_CALL(call) \
do { \
nvrtcResult result = (call); \
if (result != NVRTC_SUCCESS) \
{ \
reportErrorAndExit("NVRTC", result, "", nvrtcGetErrorString(result), __LINE__); \
} \
} while (0)
// function for reporting CUDA related errors and terminating program
void reportErrorAndExit(const std::string_view& type,
const int errorNumber,
const std::string_view& errorName,
const std::string_view& errorMessage,
const int line)
{
std::cerr << type << " error #" << errorNumber << " " << errorName << " on line " << line << ": "
<< errorMessage << std::endl;
exit(1);
}
// Helper class for NVRTC compilation and library loading
class ScaleKernel
{
public:
ScaleKernel(float scale);
~ScaleKernel();
cudaKernel_t get();
private:
static const std::string_view sSourceCode;
CUlibrary mLibrary;
cudaKernel_t mKernel;
};
ScaleKernel::ScaleKernel(float scale)
{
nvrtcProgram prog;
// set nvrtc parameters
std::string scaleMacro = std::string("-DSCALE=") + std::to_string(scale);
std::vector<const char*> compileParams{"-arch=compute_75",
"-std=c++11",
"-rdc=true",
"-default-device",
"-dlto",
scaleMacro.c_str()};
// create nvrtc program from source code
NVRTC_CALL(nvrtcCreateProgram(&prog,
sSourceCode.data(),
"scaleKernelSourceCode.cu",
0,
nullptr,
nullptr));
// run compilation, save result
nvrtcResult compilationResult = nvrtcCompileProgram(prog,
compileParams.size(),
compileParams.data());
// obtain log size
std::size_t logSize;
NVRTC_CALL(nvrtcGetProgramLogSize(prog, &logSize));
// obtain log output
std::string log(logSize, '\0');
NVRTC_CALL(nvrtcGetProgramLog(prog, log.data()));
// print compilation output
if (log.length() > 1ul)
{
std::cout << "\n\t--- COMPILATION LOG START ---\n\n"
<< log
<< "\n\t--- COMPILATION LOG END ---\n\n";
}
// check compilation result, if we checked earlier we wouldn't get log
NVRTC_CALL(compilationResult);
// obtain PTX code size
std::size_t ptxSize;
NVRTC_CALL(nvrtcGetPTXSize(prog, &ptxSize));
// obtain PTX code
std::string ptx(ptxSize, '\0');
NVRTC_CALL(nvrtcGetPTX(prog, ptx.data()));
// destroy nvrtc program, no longer needed
NVRTC_CALL(nvrtcDestroyProgram(&prog));
// parameters for C API of cuLibraryLoadData
unsigned infoSize = 500u;
unsigned errorSize = 500u;
char info[infoSize] = {};
char error[errorSize] = {};
unsigned target = CU_TARGET_COMPUTE_75;
int verbose = 1;
std::vector<CUjit_option> jitOptions = {CU_JIT_INFO_LOG_BUFFER,
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
//CU_JIT_ERROR_LOG_BUFFER, // Segfault when uncommented
//CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, // Segfault when uncommented
//CU_JIT_TARGET, // Invalid argument when uncommented
CU_JIT_LOG_VERBOSE};
std::vector<void*> jitOptionValues = {&info,
&infoSize,
//&error, // Segfault when uncommented
//&errorSize, // Segfault when uncommented
//&target, // Invalid argument when uncommented
&verbose};
// load PTX code into library
CUDA_CALL(cuLibraryLoadData(&mLibrary,
ptx.data(),
jitOptions.data(),
jitOptionValues.data(),
jitOptions.size(),
nullptr,
nullptr,
0u));
// convert outputs
std::string_view infoView(info);
std::string_view errorView(error);
// print info and error messages
if (infoView.length() > 0ul)
{
std::cout << "\n\t--- CULIBRARY LOAD INFO START ---\n\n"
<< infoView
<< "\n\t--- CULIBRARY LOAD INFO END ---\n\n";
}
if (errorView.length() > 0ul)
{
std::cout << "\n\t--- CULIBRARY LOAD ERROR START ---\n\n"
<< errorView
<< "\n\t--- CULIBRARY LOAD ERROR END ---\n\n";
}
// obtain kernel from library, cudaKernel_t and CUkernel are interchangable
CUDA_CALL(cuLibraryGetKernel(&mKernel, mLibrary, "scaleKernel"));
}
// unload library
inline ScaleKernel::~ScaleKernel()
{
CUDA_CALL(cuLibraryUnload(mLibrary));
}
// obtain kernel
inline cudaKernel_t ScaleKernel::get()
{
return mKernel;
}
// scale kernel source code, parameter SCALE must be given as macro
const std::string_view ScaleKernel::sSourceCode = R"(
#ifndef SCALE
# error "Undefined SCALE"
#endif
constexpr float scale = SCALE;
extern "C"
__global__ void scaleKernel(float* data, const unsigned size)
{
unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size)
{
data[i] *= scale;
}
}
)";
// reference scale kernel
__global__ void scaleKernelRef(float* data, const float scale, const unsigned size)
{
unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size)
{
data[i] *= scale;
}
}
// print array of numbers
template<typename T>
void printArray(T* data, std::size_t size)
{
for (unsigned i = 0u; i < size; ++i)
{
std::cout << (i > 0u ? ", " : "") << data[i];
}
}
int main()
{
// init CUDA primary context
CUDA_RT_CALL(cudaInitDevice(0, 0u, 0u));
// define scale
float scale = 2.f;
// create scale kernel object with unchangeable scale
ScaleKernel scaleKernel(scale);
// size of test data
unsigned dataSize = 32u;
// tested and reference data pointers
float* data;
float* dataRef;
// allocate unified memory
CUDA_RT_CALL(cudaMallocManaged(&data, dataSize * sizeof(float)));
CUDA_RT_CALL(cudaMallocManaged(&dataRef, dataSize * sizeof(float)));
// set both to same values from 0 to dataSize
for (unsigned i = 0u; i < dataSize; ++i)
{
data[i] = dataRef[i] = static_cast<float>(i);
}
// print original values
std::cout << "Original:\n";
printArray(data, dataSize);
std::cout << std::endl;
// arrays of parameters for kernels
void* args[] = {&data, &dataSize};
void* argsRef[] = {&dataRef, &scale, &dataSize};
// launch both kernels and wait for them to finish
CUDA_RT_CALL(cudaLaunchKernel(scaleKernel.get(), 1, dataSize, args));
CUDA_RT_CALL(cudaLaunchKernel<void(float*, const float, const unsigned)>(scaleKernelRef, 1, dataSize, argsRef));
CUDA_RT_CALL(cudaDeviceSynchronize());
// print results of NVRTC kernel
std::cout << "Scaled NVRTC:\n";
printArray(data, dataSize);
std::cout << std::endl;
// print results of reference kernel
std::cout << "Scaled Ref:\n";
printArray(dataRef, dataSize);
std::cout << std::endl;
// free allocated memory
CUDA_RT_CALL(cudaFree(data));
CUDA_RT_CALL(cudaFree(dataRef));
}
Helper class ScaleKernel handles NVRTC compilation to PTX code and loads the PTX code as cuLibrary through cuLibraryLoadData function. Then scaleKernel kernel object is obtained from the library. In the main function a ScaleKernel object is created and tested against a reference implementation.
I use CUDA Runtime API wherever I can, however cuLibrary API has no RT equivalent. According to what I read there should be no compatibility issues between Driver and RT API in this example.
The problem is that the example fails to obtain the scaleKernel object reporting error #200 CUDA_ERROR_INVALID_IMAGE. I was also unable to get the error log from cuLibraryLoadData eventhough I pass it the right pointer. It ends up with segmentation fault. And I was not able to specify the target architecture, it ends up with error #1 CUDA_ERROR_INVALID_VALUE.
Did I make any mistakes?
Thank you very much.
David