Getting the device-side function pointer of a CUDA kernel (without the signature) using just the kernel's symbol name as a string, maybe?

Note: This post is long; to skip the buildup/context, jump directly to the section starting with “I was wondering…” near the end.


Buildup/Context:

For the code mentioned in the post, it is the __nv_cudaEntityRegisterCallback function injected by the nvcc compiler that registers the host-side kernel stub ((void (*)(double*, double*, double*, int))vecAdd) against the device-side symbol _Z6vecAddPdS_S_i, so that the GPU can resolve the device pointer through a lookup table during the cudaLaunchKernel API call, as mentioned here.
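As a minimal sketch of what that registration buys us (my own illustration, not from the original post): the very same cast of the host stub is what the cudaLaunchKernel runtime API accepts, since the stub address is the key in the registration table:

#include <cuda_runtime.h>

__global__ void vecAdd(double *a, double *b, double *c, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) c[id] = a[id] + b[id];
}

int main() {
    int n = 4;
    double *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, n * sizeof(double));
    cudaMalloc(&d_b, n * sizeof(double));
    cudaMalloc(&d_c, n * sizeof(double));
    void *args[] = { &d_a, &d_b, &d_c, &n };
    // (void *)vecAdd is the host-side stub address that
    // __cudaRegisterFunction associated with "_Z6vecAddPdS_S_i".
    cudaLaunchKernel((void *)vecAdd, dim3(1), dim3(n), args, 0, 0);
    cudaDeviceSynchronize();
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}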

Similarly, a __device__ function pointer definition that gets hold of the device pointer of the kernel vecAdd, such as:

typedef void (*fp)(double *, double *, double *, int);
__device__ fp kernelPtrvecAdd = vecAdd;

is also handled by the __nv_cudaEntityRegisterCallback function, which registers it as a variable rather than a function:

static void __nv_cudaEntityRegisterCallback(void **__T23) { 
    // Save the fat binary handle for managed runtime
    __nv_save_fatbinhandle_for_managed_rt(__T23);
    ...
    ...
    // Register the vecAdd function
    __cudaRegisterFunction(
        __T23, 
        (const char*)((void (*)(double*, double*, double*, int))vecAdd), 
        "_Z6vecAddPdS_S_i", 
        "_Z6vecAddPdS_S_i", 
        -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0
    );

    // Register the kernelPtrvecAdd variable
    __cudaRegisterVar(
        __T23, 
        (char*)&::kernelPtrvecAdd, 
        "kernelPtrvecAdd", 
        "kernelPtrvecAdd", 
        0, 8UL, 0, 0
    );
    ...
    ...
}

The above code snippet can be obtained by compiling the post.cu file here as:

$ nvcc -cuda post.cu -o post.cu.cpp.ii
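As an aside (my own sketch, not part of the post): since kernelPtrvecAdd is registered as an ordinary device variable, the host can read the device-side entry address back out of it with the runtime API:

// Assumes the vecAdd kernel and the kernelPtrvecAdd definition shown above.
void *devEntry = NULL;
cudaMemcpyFromSymbol(&devEntry, kernelPtrvecAdd, sizeof(void *));
printf("device-side entry of vecAdd: %p\n", devEntry);

The value read back is not the host stub; it is only meaningful on the device, e.g. for a dynamic-parallelism launch.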

But now consider the situation in the example given below, where I have the following setup:
vecAdd.cu

// vecAdd.cu
#include <cuda_runtime.h>
#include <stdio.h>

// CUDA kernel that adds two vectors, each thread handles one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) {
        c[id] = a[id] + b[id];
    }
}

Compiled as

$ nvcc -cubin -arch=sm_75 vecAdd.cu -o vecAdd.cubin

main.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>

#define CUDA_SAFECALL(call)                                                 \
    {                                                                       \
        call;                                                               \
        cudaError err = cudaGetLastError();                                 \
        if (cudaSuccess != err) {                                           \
            fprintf(                                                        \
                stderr,                                                     \
                "Cuda error in function '%s' file '%s' in line %i : %s.\n", \
                #call, __FILE__, __LINE__, cudaGetErrorString(err));        \
            fflush(stderr);                                                 \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    }

#define SAFECALL_DRV(call)                                                  \
    {                                                                       \
        CUresult err = call;                                                \
        if (err != CUDA_SUCCESS) {                                          \
            const char *errStr;                                             \
            cuGetErrorString(err, &errStr);                                 \
            fprintf(                                                        \
                stderr,                                                     \
                "CUDA Driver API error in function '%s' file '%s' in line %i : %s.\n", \
                #call, __FILE__, __LINE__, errStr);                         \
            fflush(stderr);                                                 \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    }
    
int main(int argc, char *argv[]) {
    int n = 100000000;  // Size of the vectors
    if (argc > 1) n = atoi(argv[1]);

    // Initialize CUDA Driver API
    cuInit(0);

    // Get a CUDA device and create a context
    CUdevice device;
    CUcontext context;
    cuDeviceGet(&device, 0);
    cuCtxCreate(&context, 0, device);

    // Load the module from vecAdd.cubin
    CUmodule module;
    SAFECALL_DRV(cuModuleLoad(&module, "vecAdd.cubin"));

    // Create a CUDA stream for asynchronous execution
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Host and device vectors
    double *h_a, *h_b, *h_c;
    double *d_a, *d_b, *d_c;
    size_t bytes = n * sizeof(double);

    // Allocate host memory
    h_a = (double *)malloc(bytes);
    h_b = (double *)malloc(bytes);
    h_c = (double *)malloc(bytes);

    // Initialize host vectors
    for (int i = 0; i < n; i++) {
        h_a[i] = sin(i) * sin(i);
        h_b[i] = cos(i) * cos(i);
        h_c[i] = 0;
    }

    CUfunction vecAddFunc;
    SAFECALL_DRV(cuModuleGetFunction(&vecAddFunc, module, "_Z6vecAddPdS_S_i"));
    printf("vecAdd: %p\n", vecAddFunc);

    // Allocate device memory
    cudaMallocAsync(&d_a, bytes, stream);
    cudaMallocAsync(&d_b, bytes, stream);
    cudaMallocAsync(&d_c, bytes, stream);

    // Copy data from host to device
    cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, stream);

    // Time the kernel execution
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    for (int i = 0; i < 10; i++) {
        cudaEventRecord(start, stream);
        int gridSize = (int)ceil((float)n / 1024);
        void *args[] = { &d_a, &d_b, &d_c, &n };

        SAFECALL_DRV(cuLaunchKernel(
                        vecAddFunc,      // Kernel function
                        gridSize, 1, 1,  // Grid dimensions
                        1024, 1, 1,      // Block dimensions
                        0,               // Shared memory
                        stream,          // Stream
                        args,            // Kernel arguments
                        NULL             // Extra (not used)
                    ));

        cudaStreamSynchronize(stream);
        cudaEventRecord(stop, stream);
        cudaEventSynchronize(stop);

        float time = 0;
        cudaEventElapsedTime(&time, start, stop);
        printf("Iteration %d: Time vecAdd: %f ms\n", i, time);
    }

    // Copy array back to host using async memory copy
    cudaMemcpyAsync(h_c, d_c, bytes, cudaMemcpyDeviceToHost, stream);
    
    // Release device memory using async memory deallocation
    cudaFreeAsync(d_a, stream);
    cudaFreeAsync(d_b, stream);
    cudaFreeAsync(d_c, stream);

    // Synchronize the stream to ensure everything is done
    cudaStreamSynchronize(stream);

    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for (int i = 0; i < n; i++) sum += h_c[i];
    printf("Final sum = %f; sum/n = %f (should be ~1)\n", sum, sum / n);

    // Clean up resources
    cudaStreamDestroy(stream);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(h_a);
    free(h_b);
    free(h_c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Destroy the CUDA context
    cuCtxDestroy(context);

    return 0;
}
$ nvcc main.cu -lcuda
$ ./a.out
vecAdd: 0x56400fc49640
Iteration 0: Time vecAdd: 6.092896 ms
...
Iteration 9: Time vecAdd: 6.029056 ms
Final sum = 100000000.000000; sum/n = 1.000000 (should be ~1)

In the above code, I load the cubin of the vecAdd kernel in main.cu and get hold of a host-side handle (as is evident from the address 0x56400fc49640) to the vecAdd kernel using cuModuleGetFunction, passing just the symbol name of the kernel as a string (_Z6vecAddPdS_S_i). The mangled C++-style name happens to carry the function signature in this particular situation, but that might not always be the case. Using this handle, I can launch the kernel with cuLaunchKernel.
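(Incidentally, the signature can be recovered from a C++-mangled name with a standard demangler:

$ c++filt _Z6vecAddPdS_S_i
vecAdd(double*, double*, double*, int)

but an extern "C" kernel would export an unmangled name carrying no signature information at all.)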

The output of nvcc -cuda main.cu -o main.cu.cpp.ii does not contain any __cudaRegisterFunction entry in its __nv_cudaEntityRegisterCallback, whereas the output of nvcc -cuda vecAdd.cu -o vecAdd.cu.cpp.ii does contain a registration entry for _Z6vecAddPdS_S_i. So I guess cuModuleLoad and cuModuleGetFunction are performing that resolution at run time in the main executable.
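One way to check what the cubin itself exports (my suggestion; this assumes a standard CUDA toolkit install) is to list its ELF symbols with cuobjdump:

$ cuobjdump -symbols vecAdd.cubin

The mangled name _Z6vecAddPdS_S_i should appear among the symbols, and those are the names cuModuleGetFunction can resolve.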


I was wondering,

  1. Is there any way by which, using just the string _Z6vecAddPdS_S_i, I could get hold of the device-side pointer of the kernel vecAdd, just as cuModuleGetFunction gets us the host-side handle of the vecAdd kernel?
  2. Or, if vecAdd.cu were compiled to vecAdd.o using nvcc -c vecAdd.cu -o vecAdd.o -rdc=true and main were compiled as nvcc main.cu vecAdd.o, defining a __device__ function pointer would require knowing the signature of vecAdd (for the external linkage) in main.cu:
typedef void (*fp)(double *, double *, double *, int);
extern __global__ void vecAdd(double *a, double *b, double *c, int n);
__device__ fp kernelPtr = vecAdd;

Is there a way out where I can get the device function pointer of vecAdd without knowledge of its signature (along the lines of CUDA dynamic parallelism; there is a discussion here, but unfortunately there both the parent and the child kernel live in the same module)?

[Crosspost]

I was thinking along the lines of CUDA dynamic parallelism, where the child kernel is loaded through a module in the main code, and there you have a parent kernel through which you launch that loaded kernel as a child. Loading through cuModuleGetFunction gives us a CUfunction, but that can't be used inside a kernel to launch a child kernel. I hope I have been able to explain my rationale. Is there any trick to get hold of the device-side handle of the kernel, or is there no way out at this point?

I can get a device-side pointer of a kernel (at run time; I don't know how to do it at compile time) and use that to dispatch. But that doesn't seem to be everything you're looking for.

# cat t290.cu
#include <cuda_device_runtime_api.h>
#include <cstdio>

__global__ void k1(){ printf("k1\n");}

__global__ void k2(){ printf("k2\n");}

__device__ void *f1 = (void *)k1;

__device__ void *f2 = (void *)k2;

__global__ void l(void *f){

  cudaLaunchDevice(f, NULL, dim3(1,1,1), dim3(1,1,1), 0, NULL);
}

int main(){

  void *hf1, *hf2;
  cudaMemcpyFromSymbol(&hf1, f1, sizeof(void *));
  cudaMemcpyFromSymbol(&hf2, f2, sizeof(void *));
  l<<<1,1>>>(hf1);
  cudaDeviceSynchronize();
  l<<<1,1>>>(hf2);
  cudaDeviceSynchronize();
}
# nvcc -o t290 t290.cu -rdc=true -lcudadevrt
# compute-sanitizer ./t290
========= COMPUTE-SANITIZER
k1
k2
========= ERROR SUMMARY: 0 errors
#

I guess you want k1 and f1 in a separate module?

I want k1 and k2 in a separate module.
The module might not have the f1 and f2 definitions.

In the importing module, I have the l (lowercase L) kernel, which you wrote.

This module somehow gets hold of the device-side pointers to k1 and k2 and passes them to l for dynamic launch.

This isn’t really an answer to your question, but it’s the closest I could get at the moment:

# cat kernel.cu
#include <cstdio>

extern "C"
__global__ void k1(){ printf("k1\n");}

extern "C"
__device__ void *f1 = (void *)k1;

# cat loader.cu
#include <cuda_device_runtime_api.h>
#include <cstdio>

extern "C" __global__ void l(void *f){

  cudaLaunchDevice(f, NULL, dim3(1,1,1), dim3(1,1,1), 0, NULL);
}

# cat main.cpp
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <iostream>
#include <string>

#define CUDA_SAFECALL(call)                                                 \
    {                                                                       \
        call;                                                               \
        cudaError err = cudaGetLastError();                                 \
        if (cudaSuccess != err) {                                           \
            fprintf(                                                        \
                stderr,                                                     \
                "Cuda error in function '%s' file '%s' in line %i : %s.\n", \
                #call, __FILE__, __LINE__, cudaGetErrorString(err));        \
            fflush(stderr);                                                 \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    }

#define SAFECALL_DRV(call)                                                  \
    {                                                                       \
        CUresult err = call;                                                \
        if (err != CUDA_SUCCESS) {                                          \
            const char *errStr;                                             \
            cuGetErrorString(err, &errStr);                                 \
            fprintf(                                                        \
                stderr,                                                     \
                "CUDA Driver API error in function '%s' file '%s' in line %i : %s.\n", \
                #call, __FILE__, __LINE__, errStr);                         \
            fflush(stderr);                                                 \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    }

int main(int argc, char *argv[]) {
    // Initialize CUDA Driver API
    cuInit(0);

    // Get a CUDA device and create a context
    CUdevice device;
    CUcontext context;
    cuDeviceGet(&device, 0);
    cuCtxCreate(&context, 0, device);
    CUlinkState lState;
    const int options_num = 5;
    CUjit_option options[options_num];
    void *optionVals[options_num];
    float walltime;
    char error_log[8192], info_log[8192];
    unsigned int logSize = 8192;
    void *cuOut;
    size_t outSize;

    // Setup linker options
    // Return walltime from JIT compilation
    options[0] = CU_JIT_WALL_TIME;
    optionVals[0] = (void *)&walltime;
    // Pass a buffer for info messages
    options[1] = CU_JIT_INFO_LOG_BUFFER;
    optionVals[1] = (void *)info_log;
    // Pass the size of the info buffer
    options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    optionVals[2] = (void *)(long)logSize;
    // Pass a buffer for error message
    options[3] = CU_JIT_ERROR_LOG_BUFFER;
    optionVals[3] = (void *)error_log;
    // Pass the size of the error buffer
    options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
    optionVals[4] = (void *)(long)logSize;

  // Create a pending linker invocation
    SAFECALL_DRV(cuLinkCreate(options_num, options, optionVals, &lState));

    SAFECALL_DRV(cuLinkAddFile(lState, CU_JIT_INPUT_PTX, "kernel.ptx", 0, NULL, NULL));
    SAFECALL_DRV(cuLinkAddFile(lState, CU_JIT_INPUT_PTX, "loader.ptx", 0, NULL, NULL));
    SAFECALL_DRV(cuLinkAddFile(lState, CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", 0, NULL, NULL));
    SAFECALL_DRV(cuLinkComplete(lState, &cuOut, &outSize));
    CUmodule mod1;
    SAFECALL_DRV(cuModuleLoadData(&mod1, cuOut));
    CUdeviceptr f1;
    SAFECALL_DRV(cuModuleGetGlobal(&f1, NULL, mod1, "f1"));
    CUfunction l;
    void *hf1;
    SAFECALL_DRV(cuMemcpyDtoH((void *)&hf1, f1, sizeof(void *)));
    SAFECALL_DRV(cuModuleGetFunction(&l, mod1, "l"));

    void *args[] = { &hf1 };

    SAFECALL_DRV(cuLaunchKernel(
        l,        // Kernel function
        1, 1, 1,  // Grid dimensions
        1, 1, 1,  // Block dimensions
        0,        // Shared memory
        NULL,     // Stream (default)
        args,     // Kernel arguments
        NULL      // Extra (not used)
    ));

    SAFECALL_DRV(cuCtxSynchronize());
    // Destroy the CUDA context
    cuCtxDestroy(context);
    return 0;
}
# nvcc -arch=sm_89 -ptx kernel.cu
# nvcc -arch=sm_89 -ptx loader.cu
# g++ -I/usr/local/cuda/include main.cpp -o test -L/usr/local/cuda/lib64  -lcuda
# compute-sanitizer ./test
========= COMPUTE-SANITIZER
k1
========= ERROR SUMMARY: 0 errors
#

FWIW, just trying to load two modules and have a kernel from one call a kernel from the other, without going through the link steps, doesn't seem to work according to my testing. And none of this addresses your question of how to get the device-side entry pointer for a kernel directly from the kernel's name.
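To fold the pattern that does work into a reusable shape (my own sketch; the "_devptr" naming convention is an assumption of this sketch, not a CUDA mechanism): if every exporting module defines a __device__ void * variable named "<kernel>_devptr" holding the kernel's address, the importing side needs nothing but strings:

#include <cuda.h>
#include <string>

// Given a loaded module and a kernel name, fetch the device-side entry
// address via the companion "<name>_devptr" variable the module defines.
CUresult getDeviceEntry(CUmodule mod, const std::string &kernelName, void **devEntry) {
    CUdeviceptr var;
    size_t size = 0;
    CUresult err = cuModuleGetGlobal(&var, &size, mod,
                                     (kernelName + "_devptr").c_str());
    if (err != CUDA_SUCCESS) return err;
    // The variable holds a device function pointer; copy its value to the host.
    return cuMemcpyDtoH(devEntry, var, sizeof(void *));
}

This still does not conjure the pointer from the kernel name alone; it just pushes the signature-free lookup into a convention that the exporting module has to follow.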