Combining NVRTC and cuLibrary API

Hi,

I want to integrate cuLibrary API in my multi-GPU application, however I am not sure if I use it right. Here is a simple single-GPU example of the way I want to use it. Compiled & run with CUDA 12.2.1 and GCC 12.2.0. To run it on your machine, you may change the target architecture.

#include <algorithm>
#include <iostream>
#include <numeric>
#include <string>
#include <string_view>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>

// CUDA Driver API error checking macro
#define CUDA_CALL(call)                                                                \
  do {                                                                                 \
    CUresult result = (call);                                                          \
    if (result != CUDA_SUCCESS) {                                                      \
      const char* errName;                                                             \
      const char* errMsg;                                                              \
      cuGetErrorName(result, &errName);                                                \
      cuGetErrorString(result, &errMsg);                                               \
      reportErrorAndExit("CUDA", result, errName, errMsg, __LINE__);                   \
    }                                                                                  \
  } while (0)

// CUDA Runtime API error checking macro
#define CUDA_RT_CALL(call)                                                             \
  do {                                                                                 \
    cudaError_t result = (call);                                                       \
    if (result != cudaSuccess) {                                                       \
      reportErrorAndExit("CUDA RT",                                                    \
                         result,                                                       \
                         cudaGetErrorName(result),                                     \
                         cudaGetErrorString(result),                                   \
                         __LINE__);                                                    \
    }                                                                                  \
  } while (0)

// NVRTC error checking macro
#define NVRTC_CALL(call)                                                               \
  do {                                                                                 \
    nvrtcResult result = (call);                                                       \
    if (result != NVRTC_SUCCESS)                                                       \
    {                                                                                  \
      reportErrorAndExit("NVRTC", result, "", nvrtcGetErrorString(result), __LINE__);  \
    }                                                                                  \
  } while (0)

// function for reporting CUDA related errors and terminating program
void reportErrorAndExit(const std::string_view& type,
                        const int               errorNumber,
                        const std::string_view& errorName,
                        const std::string_view& errorMessage,
                        const int               line)
{
  std::cerr << type << " error #" << errorNumber << " " << errorName << " on line " << line << ": "
            << errorMessage << std::endl;
  exit(1);
}

// Helper class for NVRTC compilation and library loading
class ScaleKernel
{
  public:
    ScaleKernel(float scale);
    ~ScaleKernel();

    cudaKernel_t get();
  private:
    static const std::string_view sSourceCode;
    CUlibrary                     mLibrary;
    cudaKernel_t                  mKernel;
};

ScaleKernel::ScaleKernel(float scale)
{
  nvrtcProgram prog;

  // set nvrtc parameters
  std::string scaleMacro = std::string("-DSCALE=") + std::to_string(scale);
  std::vector<const char*> compileParams{"-arch=compute_75",
                                         "-std=c++11",
                                         "-rdc=true",
                                         "-default-device",
                                         "-dlto",
                                         scaleMacro.c_str()};
  
  // create nvrtc program from source code
  NVRTC_CALL(nvrtcCreateProgram(&prog,
                                sSourceCode.data(),
                                "scaleKernelSourceCode.cu",
                                0,
                                nullptr,
                                nullptr));

  // run compilation, save result
  nvrtcResult compilationResult = nvrtcCompileProgram(prog,
                                                      compileParams.size(),
                                                      compileParams.data());

  // obtain log size
  std::size_t logSize;
  NVRTC_CALL(nvrtcGetProgramLogSize(prog, &logSize));

  // obtain log output
  std::string log(logSize, '\0');
  NVRTC_CALL(nvrtcGetProgramLog(prog, log.data()));

  // print compilation output
  if (log.length() > 1ul)
  {    
    std::cout << "\n\t--- COMPILATION LOG START ---\n\n"
              << log
              << "\n\t--- COMPILATION LOG END ---\n\n";
  }
  
  // check compilation result, if we checked earlier we wouldn't get log
  NVRTC_CALL(compilationResult);

  // obtain PTX code size
  std::size_t ptxSize;
  NVRTC_CALL(nvrtcGetPTXSize(prog, &ptxSize));

  // obtain PTX code
  std::string ptx(ptxSize, '\0');
  NVRTC_CALL(nvrtcGetPTX(prog, ptx.data()));

  // destroy nvrtc program, no longer needed
  NVRTC_CALL(nvrtcDestroyProgram(&prog));

  // parameters for C API of cuLibraryLoadData
  unsigned infoSize  = 500u;
  unsigned errorSize = 500u;
  char     info[infoSize] = {};
  char     error[errorSize] = {};
  unsigned target = CU_TARGET_COMPUTE_75;
  int      verbose = 1;

  std::vector<CUjit_option> jitOptions = {CU_JIT_INFO_LOG_BUFFER,
                                          CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
                                          //CU_JIT_ERROR_LOG_BUFFER,             // Segfault when uncommented
                                          //CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,  // Segfault when uncommented
                                          //CU_JIT_TARGET,                       // Invalid argument when uncommented
                                          CU_JIT_LOG_VERBOSE};

  std::vector<void*> jitOptionValues = {&info,
                                        &infoSize,
                                        //&error,                                // Segfault when uncommented
                                        //&errorSize,                            // Segfault when uncommented
                                        //&target,                               // Invalid argument when uncommented
                                        &verbose};
  
  // load PTX code into library
  CUDA_CALL(cuLibraryLoadData(&mLibrary,
                              ptx.data(),
                              jitOptions.data(),
                              jitOptionValues.data(),
                              jitOptions.size(),
                              nullptr,
                              nullptr,
                              0u));

  // convert outputs
  std::string_view infoView(info);
  std::string_view errorView(error);

  // print info and error messages
  if (infoView.length() > 0ul)
  {
    std::cout << "\n\t--- CULIBRARY LOAD INFO START ---\n\n"
              << infoView
              << "\n\t--- CULIBRARY LOAD INFO END ---\n\n";
  }
  if (errorView.length() > 0ul)
  {
    std::cout << "\n\t--- CULIBRARY LOAD ERROR START ---\n\n"
              << errorView
              << "\n\t--- CULIBRARY LOAD ERROR END ---\n\n";
  }
  
  // obtain kernel from library, cudaKernel_t and CUkernel are interchangable
  CUDA_CALL(cuLibraryGetKernel(&mKernel, mLibrary, "scaleKernel"));
}

// unload library
inline ScaleKernel::~ScaleKernel()
{
  CUDA_CALL(cuLibraryUnload(mLibrary));
}

// obtain kernel
inline cudaKernel_t ScaleKernel::get()
{
  return mKernel;
}

// scale kernel source code, parameter SCALE must be given as macro
const std::string_view ScaleKernel::sSourceCode = R"(
#ifndef SCALE
# error "Undefined SCALE"
#endif

constexpr float scale = SCALE;

extern "C"
__global__ void scaleKernel(float* data, const unsigned size)
{
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i < size)
  {
    data[i] *= scale;
  }
}
)";

// reference scale kernel
__global__ void scaleKernelRef(float* data, const float scale, const unsigned size)
{
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i < size)
  {
    data[i] *= scale;
  }
}

// print array of numbers
template<typename T>
void printArray(T* data, std::size_t size)
{
  for (unsigned i = 0u; i < size; ++i)
  {
    std::cout << (i > 0u ? ", " : "") << data[i];
  }
}

int main()
{
  // init CUDA primary context
  CUDA_RT_CALL(cudaInitDevice(0, 0u, 0u));
  
  // define scale
  float scale = 2.f;

  // create scale kernel object with unchangeable scale
  ScaleKernel scaleKernel(scale);
  
  // size of test data
  unsigned dataSize = 32u;

  // tested and reference data pointers
  float* data;
  float* dataRef;

  // allocate unified memory
  CUDA_RT_CALL(cudaMallocManaged(&data,    dataSize * sizeof(float)));
  CUDA_RT_CALL(cudaMallocManaged(&dataRef, dataSize * sizeof(float)));
  
  // set both to same values from 0 to dataSize
  for (unsigned i = 0u; i < dataSize; ++i)
  {
    data[i] = dataRef[i] = static_cast<float>(i);
  }

  // print original values
  std::cout << "Original:\n";
  printArray(data, dataSize);
  std::cout << std::endl;  

  // arrays of parameters for kernels
  void* args[]    = {&data,            &dataSize};
  void* argsRef[] = {&dataRef, &scale, &dataSize};

  // launch both kernels and wait for them to finish
  CUDA_RT_CALL(cudaLaunchKernel(scaleKernel.get(), 1, dataSize, args));
  CUDA_RT_CALL(cudaLaunchKernel<void(float*, const float, const unsigned)>(scaleKernelRef, 1, dataSize, argsRef));
  CUDA_RT_CALL(cudaDeviceSynchronize());

  // print results of NVRTC kernel
  std::cout << "Scaled NVRTC:\n";
  printArray(data, dataSize);
  std::cout << std::endl;

  // print results of reference kernel
  std::cout << "Scaled Ref:\n";
  printArray(dataRef, dataSize);
  std::cout << std::endl;

  // free allocated memory
  CUDA_RT_CALL(cudaFree(data));
  CUDA_RT_CALL(cudaFree(dataRef));
}

Helper class ScaleKernel handles NVRTC compilation to PTX code and loads the PTX code as cuLibrary through cuLibraryLoadData function. Then scaleKernel kernel object is obtained from the library. In the main function a ScaleKernel object is created and tested against a reference implementation.

I use CUDA Runtime API wherever I can, however cuLibrary API has no RT equivalent. According to what I read there should be no compatibility issues between Driver and RT API in this example.

The problem is that the example fails to obtain the scaleKernel object, reporting error #200 CUDA_ERROR_INVALID_IMAGE. I was also unable to get the error log from cuLibraryLoadData even though I pass it the right pointer; it ends up with a segmentation fault. And I was not able to specify the target architecture; it ends up with error #1 CUDA_ERROR_INVALID_VALUE.

Did I make any mistakes?

Thank you very much.

David

Okay, I was able to figure it out myself. The main problem was using the “-dlto” option for NVRTC compilation. It was also necessary to use cuLaunchKernel instead of cudaLaunchKernel. The log and error output, however, still does not work.

Here is the working solution:

#include <algorithm>
#include <iostream>
#include <numeric>
#include <string>
#include <string_view>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>

// CUDA Driver API error checking macro
#define CUDA_CALL(call)                                                                \
  do {                                                                                 \
    CUresult result = (call);                                                          \
    if (result != CUDA_SUCCESS) {                                                      \
      const char* errName;                                                             \
      const char* errMsg;                                                              \
      cuGetErrorName(result, &errName);                                                \
      cuGetErrorString(result, &errMsg);                                               \
      reportErrorAndExit("CUDA", result, errName, errMsg, __LINE__);                   \
    }                                                                                  \
  } while (0)

// CUDA Runtime API error checking macro
#define CUDA_RT_CALL(call)                                                             \
  do {                                                                                 \
    cudaError_t result = (call);                                                       \
    if (result != cudaSuccess) {                                                       \
      reportErrorAndExit("CUDA RT",                                                    \
                         result,                                                       \
                         cudaGetErrorName(result),                                     \
                         cudaGetErrorString(result),                                   \
                         __LINE__);                                                    \
    }                                                                                  \
  } while (0)

// NVRTC error checking macro
#define NVRTC_CALL(call)                                                               \
  do {                                                                                 \
    nvrtcResult result = (call);                                                       \
    if (result != NVRTC_SUCCESS)                                                       \
    {                                                                                  \
      reportErrorAndExit("NVRTC", result, "", nvrtcGetErrorString(result), __LINE__);  \
    }                                                                                  \
  } while (0)

// function for reporting CUDA related errors and terminating program
void reportErrorAndExit(const std::string_view& type,
                        const int               errorNumber,
                        const std::string_view& errorName,
                        const std::string_view& errorMessage,
                        const int               line)
{
  std::cerr << type << " error #" << errorNumber << " " << errorName << " on line " << line << ": "
            << errorMessage << std::endl;
  exit(1);
}

// Helper class for NVRTC compilation and library loading
class ScaleKernel
{
  public:
    ScaleKernel(float scale);
    ~ScaleKernel();

    cudaKernel_t get();
  private:
    static const std::string_view sSourceCode;
    CUlibrary                     mLibrary;
    cudaKernel_t                  mKernel;
};

ScaleKernel::ScaleKernel(float scale)
{
  nvrtcProgram prog;

  // set nvrtc parameters
  std::string scaleMacro = std::string("-DSCALE=") + std::to_string(scale);
  std::vector<const char*> compileParams{"-arch=compute_75",
                                         "-std=c++11",
                                         "-rdc=true",
                                         "-default-device",
                                         scaleMacro.c_str()};
  
  // create nvrtc program from source code
  NVRTC_CALL(nvrtcCreateProgram(&prog,
                                sSourceCode.data(),
                                "scaleKernelSourceCode.cu",
                                0,
                                nullptr,
                                nullptr));

  // run compilation, save result
  nvrtcResult compilationResult = nvrtcCompileProgram(prog,
                                                      compileParams.size(),
                                                      compileParams.data());

  // obtain log size
  std::size_t logSize;
  NVRTC_CALL(nvrtcGetProgramLogSize(prog, &logSize));

  // obtain log output
  std::string log(logSize, '\0');
  NVRTC_CALL(nvrtcGetProgramLog(prog, log.data()));

  // print compilation output
  if (log.length() > 1ul)
  {    
    std::cout << "\n\t--- COMPILATION LOG START ---\n\n"
              << log
              << "\n\t--- COMPILATION LOG END ---\n\n";
  }
  
  // check compilation result, if we checked earlier we wouldn't get log
  NVRTC_CALL(compilationResult);

  // obtain PTX code size
  std::size_t ptxSize;
  NVRTC_CALL(nvrtcGetPTXSize(prog, &ptxSize));

  // obtain PTX code
  std::string ptx(ptxSize, '\0');
  NVRTC_CALL(nvrtcGetPTX(prog, ptx.data()));

  // destroy nvrtc program, no longer needed
  NVRTC_CALL(nvrtcDestroyProgram(&prog));

  // parameters for C API of cuLibraryLoadData
  unsigned infoSize  = 500u;
  unsigned errorSize = 500u;
  char     info[infoSize] = {};
  char     error[errorSize] = {};
  unsigned target = CU_TARGET_COMPUTE_75;
  int      verbose = 1;

  std::vector<CUjit_option> jitOptions = {//CU_JIT_INFO_LOG_BUFFER,
                                          //CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
                                          //CU_JIT_ERROR_LOG_BUFFER,             // Segfault when uncommented
                                          //CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,  // Segfault when uncommented
                                          //CU_JIT_TARGET,                       // Invalid argument when uncommented
                                          CU_JIT_LOG_VERBOSE};

  std::vector<void*> jitOptionValues = {//&info,
                                        //&infoSize,
                                        //&error,                                // Segfault when uncommented
                                        //&errorSize,                            // Segfault when uncommented
                                        //&target,                               // Invalid argument when uncommented
                                        &verbose};
  
  // load PTX code into library
  CUDA_CALL(cuLibraryLoadData(&mLibrary,
                              ptx.data(),
                              jitOptions.data(),
                              jitOptionValues.data(),
                              jitOptions.size(),
                              nullptr,
                              nullptr,
                              0u));

  // convert outputs
  std::string_view infoView(info);
  std::string_view errorView(error);

  // print info and error messages
  if (infoView.length() > 0ul)
  {
    std::cout << "\n\t--- CULIBRARY LOAD INFO START ---\n\n"
              << infoView
              << "\n\t--- CULIBRARY LOAD INFO END ---\n\n";
  }
  if (errorView.length() > 0ul)
  {
    std::cout << "\n\t--- CULIBRARY LOAD ERROR START ---\n\n"
              << errorView
              << "\n\t--- CULIBRARY LOAD ERROR END ---\n\n";
  }
  
  // obtain kernel from library, cudaKernel_t and CUkernel are interchangable
  CUDA_CALL(cuLibraryGetKernel(&mKernel, mLibrary, "scaleKernel"));
}

// unload library
inline ScaleKernel::~ScaleKernel()
{
  CUDA_CALL(cuLibraryUnload(mLibrary));
}

// obtain kernel
inline cudaKernel_t ScaleKernel::get()
{
  return mKernel;
}

// scale kernel source code, parameter SCALE must be given as macro
const std::string_view ScaleKernel::sSourceCode = R"(
#ifndef SCALE
# error "Undefined SCALE"
#endif

constexpr float scale = SCALE;

extern "C"
__global__ void scaleKernel(float* data, const unsigned size)
{
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i < size)
  {
    data[i] *= scale;
  }
}
)";

// reference scale kernel
__global__ void scaleKernelRef(float* data, const float scale, const unsigned size)
{
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i < size)
  {
    data[i] *= scale;
  }
}

// print array of numbers
template<typename T>
void printArray(T* data, std::size_t size)
{
  for (unsigned i = 0u; i < size; ++i)
  {
    std::cout << (i > 0u ? ", " : "") << data[i];
  }
}

int main()
{
  // init CUDA primary context
  CUDA_RT_CALL(cudaInitDevice(0, 0u, 0u));
  
  // define scale
  float scale = 8.f;

  // create scale kernel object with unchangeable scale
  ScaleKernel scaleKernel(scale);
  
  // size of test data
  unsigned dataSize = 32u;

  // tested and reference data pointers
  float* data;
  float* dataRef;

  // allocate unified memory
  CUDA_RT_CALL(cudaMallocManaged(&data,    dataSize * sizeof(float)));
  CUDA_RT_CALL(cudaMallocManaged(&dataRef, dataSize * sizeof(float)));
  
  // set both to same values from 0 to dataSize
  for (unsigned i = 0u; i < dataSize; ++i)
  {
    data[i] = dataRef[i] = static_cast<float>(i);
  }

  // print original values
  std::cout << "Original:\n";
  printArray(data, dataSize);
  std::cout << std::endl;  

  // arrays of parameters for kernels
  void* args[]    = {&data,            &dataSize};
  void* argsRef[] = {&dataRef, &scale, &dataSize};

  // launch both kernels and wait for them to finish
  CUDA_CALL(cuLaunchKernel(reinterpret_cast<CUfunction>(scaleKernel.get()), 1, 1, 1, dataSize, 1, 1, 0, 0, args, nullptr));
  CUDA_RT_CALL(cudaLaunchKernel<void(float*, const float, const unsigned)>(scaleKernelRef, 1, dataSize, argsRef));
  CUDA_RT_CALL(cudaDeviceSynchronize());

  // print results of NVRTC kernel
  std::cout << "Scaled NVRTC:\n";
  printArray(data, dataSize);
  std::cout << std::endl;

  // print results of reference kernel
  std::cout << "Scaled Ref:\n";
  printArray(dataRef, dataSize);
  std::cout << std::endl;

  // free allocated memory
  CUDA_RT_CALL(cudaFree(data));
  CUDA_RT_CALL(cudaFree(dataRef));
}

So the problem with the log and error output was that I passed the arguments in jitOptionValues the wrong way: buffer options take the pointer itself, while scalar options (sizes, target, verbosity) must be passed by value, cast to void*, rather than as pointers to the variables. This is the final version.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <string>
#include <string_view>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>

// CUDA Driver API error checking macro.
// cuGetErrorName/cuGetErrorString set their output pointer to NULL when the
// error code is not recognized; fall back to placeholder strings so we never
// build a std::string_view from a null pointer (undefined behavior).
#define CUDA_CALL(call)                                                                \
  do {                                                                                 \
    CUresult result = (call);                                                          \
    if (result != CUDA_SUCCESS) {                                                      \
      const char* errName = nullptr;                                                   \
      const char* errMsg  = nullptr;                                                   \
      cuGetErrorName(result, &errName);                                                \
      cuGetErrorString(result, &errMsg);                                               \
      reportErrorAndExit("CUDA",                                                       \
                         result,                                                       \
                         errName ? errName : "CUDA_ERROR_UNKNOWN",                     \
                         errMsg  ? errMsg  : "unrecognized error code",                \
                         __LINE__);                                                    \
    }                                                                                  \
  } while (0)

// CUDA Runtime API error checking macro; unlike the driver API,
// cudaGetErrorName/cudaGetErrorString always return a printable string,
// so their results can be forwarded directly
#define CUDA_RT_CALL(call)                                                             \
  do {                                                                                 \
    cudaError_t result = (call);                                                       \
    if (result != cudaSuccess) {                                                       \
      reportErrorAndExit("CUDA RT",                                                    \
                         result,                                                       \
                         cudaGetErrorName(result),                                     \
                         cudaGetErrorString(result),                                   \
                         __LINE__);                                                    \
    }                                                                                  \
  } while (0)

// NVRTC error checking macro; NVRTC has no separate error-name lookup,
// so only nvrtcGetErrorString is reported (name field left empty)
#define NVRTC_CALL(call)                                                               \
  do {                                                                                 \
    nvrtcResult result = (call);                                                       \
    if (result != NVRTC_SUCCESS)                                                       \
    {                                                                                  \
      reportErrorAndExit("NVRTC", result, "", nvrtcGetErrorString(result), __LINE__);  \
    }                                                                                  \
  } while (0)

// Print a single diagnostic line for a failed CUDA/NVRTC call to stderr,
// then terminate the whole process with a nonzero exit status.
void reportErrorAndExit(const std::string_view& type,
                        const int               errorNumber,
                        const std::string_view& errorName,
                        const std::string_view& errorMessage,
                        const int               line)
{
  std::cerr << type << " error #" << errorNumber << " " << errorName;
  std::cerr << " on line " << line << ": " << errorMessage << "\n";
  std::cerr.flush();
  exit(1);
}

// Helper class for NVRTC compilation and library loading.
// Owns a CUlibrary handle for its whole lifetime; the kernel handle returned
// by get() is only valid while this object is alive.
class ScaleKernel
{
  public:
    ScaleKernel(float scale);
    ~ScaleKernel();

    // non-copyable: a copy would unload the same CUlibrary twice in the
    // destructor (double-free of the driver resource)
    ScaleKernel(const ScaleKernel&)            = delete;
    ScaleKernel& operator=(const ScaleKernel&) = delete;

    cudaKernel_t get();
  private:
    static const std::string_view sSourceCode;  // CUDA source compiled by NVRTC
    CUlibrary                     mLibrary;     // loaded library (owned)
    cudaKernel_t                  mKernel;      // kernel extracted from mLibrary
};

// Compile the scale kernel with NVRTC (SCALE baked in as a macro) and load
// the resulting PTX as a CUlibrary, from which the kernel handle is obtained.
// Terminates the process via the error-checking macros on any failure.
ScaleKernel::ScaleKernel(float scale)
{
  nvrtcProgram prog;

  // set nvrtc parameters; the scale factor is passed as a preprocessor macro
  std::string scaleMacro = std::string("-DSCALE=") + std::to_string(scale);
  std::vector<const char*> compileParams{"-arch=compute_75",
                                         "-std=c++11",
                                         "-rdc=true",
                                         "-default-device",
                                         scaleMacro.c_str()};

  // create nvrtc program from source code
  NVRTC_CALL(nvrtcCreateProgram(&prog,
                                sSourceCode.data(),
                                "scaleKernelSourceCode.cu",
                                0,
                                nullptr,
                                nullptr));

  // run compilation, save result (checked only after the log is printed);
  // explicit cast: nvrtcCompileProgram takes int, size() yields std::size_t
  nvrtcResult compilationResult = nvrtcCompileProgram(prog,
                                                      static_cast<int>(compileParams.size()),
                                                      compileParams.data());

  // obtain log size
  std::size_t logSize;
  NVRTC_CALL(nvrtcGetProgramLogSize(prog, &logSize));

  // obtain log output
  std::string log(logSize, '\0');
  NVRTC_CALL(nvrtcGetProgramLog(prog, log.data()));

  // print compilation output (size 1 means the log is just the terminator)
  if (log.length() > 1ul)
  {
    std::cout << "\n\t--- COMPILATION LOG START ---\n\n"
              << log
              << "\n\t--- COMPILATION LOG END ---\n\n";
  }

  // check compilation result, if we checked earlier we wouldn't get log
  NVRTC_CALL(compilationResult);

  // obtain PTX code size
  std::size_t ptxSize;
  NVRTC_CALL(nvrtcGetPTXSize(prog, &ptxSize));

  // obtain PTX code
  std::string ptx(ptxSize, '\0');
  NVRTC_CALL(nvrtcGetPTX(prog, ptx.data()));

  // destroy nvrtc program, no longer needed
  NVRTC_CALL(nvrtcDestroyProgram(&prog));

  // parameters for the C API of cuLibraryLoadData; constexpr bounds avoid
  // the non-standard variable-length arrays the previous version relied on
  constexpr unsigned infoSize   = 1024u;
  constexpr unsigned errorSize  = 1024u;
  char               info[infoSize]   = {};
  char               error[errorSize] = {};
  constexpr unsigned target  = CU_TARGET_COMPUTE_75;
  constexpr int      verbose = 1;

  std::vector<CUjit_option> jitOptions = {CU_JIT_INFO_LOG_BUFFER,
                                          CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
                                          CU_JIT_ERROR_LOG_BUFFER,
                                          CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
                                          CU_JIT_TARGET,
                                          CU_JIT_LOG_VERBOSE};

  // buffer options take the buffer pointer itself; scalar options (sizes,
  // target, verbosity) are passed by value, cast to void* via uintptr_t
  std::vector<void*> jitOptionValues = {info,
                                        reinterpret_cast<void*>(static_cast<std::uintptr_t>(infoSize)),
                                        error,
                                        reinterpret_cast<void*>(static_cast<std::uintptr_t>(errorSize)),
                                        reinterpret_cast<void*>(static_cast<std::uintptr_t>(target)),
                                        reinterpret_cast<void*>(static_cast<std::uintptr_t>(verbose))};

  // load PTX code into library (JIT-compiled by the driver)
  CUDA_CALL(cuLibraryLoadData(&mLibrary,
                              ptx.data(),
                              jitOptions.data(),
                              jitOptionValues.data(),
                              static_cast<unsigned>(jitOptions.size()),
                              nullptr,
                              nullptr,
                              0u));

  // convert outputs; zero-initialization guarantees NUL termination
  std::string_view infoView(info);
  std::string_view errorView(error);

  // print info and error messages
  if (infoView.length() > 0ul)
  {
    std::cout << "\n\n\t--- CULIBRARY LOAD INFO START ---\n\n"
              << infoView
              << "\n\n\t--- CULIBRARY LOAD INFO END ---\n\n";
  }
  if (errorView.length() > 0ul)
  {
    std::cout << "\n\n\t--- CULIBRARY LOAD ERROR START ---\n\n"
              << errorView
              << "\n\n\t--- CULIBRARY LOAD ERROR END ---\n\n";
  }

  // obtain kernel from library; cudaKernel_t and CUkernel are interchangeable
  CUDA_CALL(cuLibraryGetKernel(&mKernel, mLibrary, "scaleKernel"));
}

// unload library when the owning object dies
// NOTE(review): CUDA_CALL terminates the process on failure; calling exit(1)
// from a destructor is questionable — confirm this is the intended behavior.
inline ScaleKernel::~ScaleKernel()
{
  CUDA_CALL(cuLibraryUnload(mLibrary));
}

// obtain kernel handle; only valid while the owning ScaleKernel (and thus
// its CUlibrary) is alive
inline cudaKernel_t ScaleKernel::get()
{
  return mKernel;
}

// Scale kernel source code, compiled at runtime by NVRTC. The scale factor is
// not a kernel parameter: it must be supplied as the SCALE preprocessor macro
// through a "-DSCALE=<value>" compile option (see the ScaleKernel constructor).
const std::string_view ScaleKernel::sSourceCode = R"(
#ifndef SCALE
# error "Undefined SCALE"
#endif

constexpr float scale = SCALE;

extern "C"
__global__ void scaleKernel(float* data, const unsigned size)
{
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i < size)
  {
    data[i] *= scale;
  }
}
)";

// Reference scale kernel: multiplies the first `size` elements of data by
// scale. Expects a 1-D launch; threads past the end of the array return early.
__global__ void scaleKernelRef(float* data, const float scale, const unsigned size)
{
  const unsigned idx = threadIdx.x + blockIdx.x * blockDim.x;

  if (idx >= size)
  {
    return;
  }

  data[idx] *= scale;
}

// Print `size` values of an array to stdout, separated by ", " (no trailing
// separator, no newline).
template<typename T>
void printArray(T* data, std::size_t size)
{
  // std::size_t index: the previous `unsigned` counter could never terminate
  // when size exceeds UINT_MAX (signedness/width mismatch with the bound)
  for (std::size_t i = 0u; i < size; ++i)
  {
    std::cout << (i > 0u ? ", " : "") << data[i];
  }
}

int main()
{
  // init CUDA primary context (also triggers driver API initialization,
  // which the later cuLibrary/cuLaunchKernel calls rely on)
  CUDA_RT_CALL(cudaInitDevice(0, 0u, 0u));

  // define scale, baked into the NVRTC-compiled kernel at compile time
  float scale = 8.f;

  // create scale kernel object with unchangeable scale
  ScaleKernel scaleKernel(scale);

  // size of test data
  unsigned dataSize = 32u;

  // launch configuration: ceil-div grid covers any dataSize (previously a
  // single block, which silently breaks past the per-block thread limit);
  // tail threads are discarded by the bounds check inside both kernels
  const unsigned blockSize = 256u;
  const unsigned gridSize  = (dataSize + blockSize - 1u) / blockSize;

  // tested and reference data pointers
  float* data;
  float* dataRef;

  // allocate unified memory
  CUDA_RT_CALL(cudaMallocManaged(&data,    dataSize * sizeof(float)));
  CUDA_RT_CALL(cudaMallocManaged(&dataRef, dataSize * sizeof(float)));

  // set both to same values from 0 to dataSize
  for (unsigned i = 0u; i < dataSize; ++i)
  {
    data[i] = dataRef[i] = static_cast<float>(i);
  }

  // print original values
  std::cout << "Original:\n";
  printArray(data, dataSize);
  std::cout << std::endl;

  // arrays of parameters for kernels
  void* args[]    = {&data,            &dataSize};
  void* argsRef[] = {&dataRef, &scale, &dataSize};

  // launch both kernels and wait for them to finish; the CUkernel obtained
  // from the library is launched through the driver API (cast to CUfunction,
  // which the driver accepts for kernels loaded via cuLibrary)
  CUDA_CALL(cuLaunchKernel(reinterpret_cast<CUfunction>(scaleKernel.get()),
                           gridSize, 1, 1,
                           blockSize, 1, 1,
                           0, 0, args, nullptr));
  CUDA_RT_CALL(cudaLaunchKernel<void(float*, const float, const unsigned)>(scaleKernelRef, gridSize, blockSize, argsRef));
  CUDA_RT_CALL(cudaDeviceSynchronize());

  // print results of NVRTC kernel
  std::cout << "Scaled NVRTC:\n";
  printArray(data, dataSize);
  std::cout << std::endl;

  // print results of reference kernel
  std::cout << "Scaled Ref:\n";
  printArray(dataRef, dataSize);
  std::cout << std::endl;

  // free allocated memory
  CUDA_RT_CALL(cudaFree(data));
  CUDA_RT_CALL(cudaFree(dataRef));
}