using cooperatives with nvrtc

Hi, all

When compiling .cu with nvcc, i can use both cg::thread_block and cg::grid_group, however,
when i using cooperatives with nvrtc i can only use cg::thread_block and can not use
cg::grid_group, it has
error: cuModuleLoadDataEx(&module, ptx, jitNumOptions, jitOptions, jitOptVals) failed with error CUDA_ERROR_INVALID_PTX
I don’t known how to solve this, I would be grateful for any advice!
And the code i used

const char *saxpy = "                                                    \n\
#include <cooperative_groups.h>                                          \n\
__device__ void sync() {                                                 \n\
  cooperative_groups::thread_block cta = cooperative_groups::this_thread_block(); \n\
  cooperative_groups::grid_group grid = cooperative_groups::this_grid(); \n\
extern \"C\" __global__                                                  \n\
void saxpy(float a, float *x, float *y, float *out, size_t n)            \n\
{                                                                        \n\
  sync();  \n\
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;                    \n\
  if (tid < n) {                                                         \n\
    out[tid] = a * x[tid] + y[tid];                                      \n\
  }                                                                      \n\

int main(int argc, char** argv)
  // Create an instance of nvrtcProgram with the SAXPY code string.
  nvrtcProgram prog;
    nvrtcCreateProgram(&prog,         // prog
                       saxpy,         // buffer
                       "",    // name
                       0,             // numHeaders
                       NULL,          // headers
                       NULL));        // includeNames
  const char *opts[] = {"--include-path=/usr/local/cuda-9.0/include/", "-rdc=true", "-arch=compute_61"};
  nvrtcResult compileResult = nvrtcCompileProgram(prog,  // prog
                                                  3,    // numOptions
                                                  opts); // options
  size_t logSize;
  NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
  char *log = new char[logSize];
  NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
  std::cout << log << '\n';
  delete[] log;
  if (compileResult != NVRTC_SUCCESS) {
  // Obtain PTX from the program.
 size_t ptxSize;
 NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
 char *ptx = new char[ptxSize];
 NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
 // Destroy the program.
 // Load the generated PTX and get a handle to the SAXPY kernel.
 CUdevice cuDevice;
 CUcontext context;
 CUmodule module;
 CUfunction kernel;
 CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
 CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
 const unsigned int jitNumOptions = 3;
 CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
 void **jitOptVals = new void *[jitNumOptions];
 int jitTarget = 61;
 jitOptions[0] = CU_JIT_TARGET;
 jitOptVals[0] = (void *)(size_t)jitTarget;
 int jitLogBufferSize = 1 << 20;
 jitOptVals[1] = (void *)(size_t)jitLogBufferSize;
 itOptions[2] = CU_JIT_INFO_LOG_BUFFER;
 char *jitLogBuffer = new char[jitLogBufferSize];
 jitOptVals[2] = jitLogBuffer;
 std::cout << jitLogBuffer << std::endl;
 printf("> PTX JIT log:\n%s\n", jitLogBuffer);

 // set up size of compilation log buffer
 CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, jitNumOptions, jitOptions, jitOptVals));

I confirm that this issue is still there in CUDA 10.1, say “CUDA_ERROR_INVALID_PTX” reported when calling cuModuleLoadDataEx.

In addition, as an alternative, using cuLinkAddData() to link either pre-compiled PTX or CUBIN will results in CUDA_ERROR_UNKNOWN in cuLinkComplete().

Please take a look at the dynamic parallelism example in the NVRTC doc:

Including cudadevrt.lib via using cuLinkAddFile can save it.

It may save all cases that use NVRTC with relocatable codes. Have not verified them yet.