Note: If the post seems long, you can jump directly to the section starting with “I was wondering…” at the end and skip the buildup/context.
Buildup/Context:
For the code mentioned in the post, it is the __nv_cudaEntityRegisterCallback
function, injected by the nvcc compiler, that registers the host-side kernel stub ((void (*)(double*, double*, double*, int))vecAdd)
against the device-side symbol _Z6vecAddPdS_S_i,
so that the runtime can resolve the device code through a lookup table during the cudaLaunchKernel
API call (to get hold of the device pointer), as mentioned here.
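To make that concrete: a regular triple-chevron launch goes through this host stub, and the stub's address is what identifies the kernel to cudaLaunchKernel. A rough sketch of the lowering (gridSize and the d_* buffers are illustrative names, not taken from the generated code):
// What we write with the runtime API:
vecAdd<<<gridSize, 1024>>>(d_a, d_b, d_c, n);

// Roughly what it lowers to: the host stub's address is the key that
// the registration above maps to the device-side code.
void *args[] = { &d_a, &d_b, &d_c, &n };
cudaLaunchKernel((const void *)vecAdd, dim3(gridSize), dim3(1024), args, 0, NULL);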
Similarly, a __device__
function pointer definition that gets hold of the device-side pointer of the kernel vecAdd,
such as:
typedef void (*fp)(double *, double *, double *, int);
__device__ fp kernelPtrvecAdd = vecAdd;
is also handled by the __nv_cudaEntityRegisterCallback
function:
static void __nv_cudaEntityRegisterCallback(void **__T23) {
    // Save the fat binary handle for managed runtime
    __nv_save_fatbinhandle_for_managed_rt(__T23);
    ...
    ...
    // Register the vecAdd function
    __cudaRegisterFunction(
        __T23,
        (const char *)((void (*)(double *, double *, double *, int))vecAdd),
        "_Z6vecAddPdS_S_i",
        "_Z6vecAddPdS_S_i",
        -1, (uint3 *)0, (uint3 *)0, (dim3 *)0, (dim3 *)0, (int *)0
    );
    // Register the kernelPtrvecAdd variable
    __cudaRegisterVar(
        __T23,
        (char *)&::kernelPtrvecAdd,
        "kernelPtrvecAdd",
        "kernelPtrvecAdd",
        0, 8UL, 0, 0
    );
    ...
    ...
}
The above code snippet can be obtained by compiling the post.cu
file here as:
$ nvcc -cuda post.cu -o post.cu.cpp.ii
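As a side note, since __cudaRegisterVar registers kernelPtrvecAdd as a device symbol, the device-side address of vecAdd can already be read back on the host from within post.cu itself; a minimal sketch, using the fp typedef and variable from above:
// Copy the device-side function pointer stored in kernelPtrvecAdd
// back into host memory:
fp h_kernelPtr = NULL;
cudaMemcpyFromSymbol(&h_kernelPtr, kernelPtrvecAdd, sizeof(fp));
printf("device-side address of vecAdd: %p\n", (void *)h_kernelPtr);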
But now consider the situation in the example below, where I have the following setup:
vecAdd.cu
// vecAdd.cu
#include <cuda_runtime.h>
#include <stdio.h>

// CUDA kernel that adds two vectors; each thread handles one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) {
        c[id] = a[id] + b[id];
    }
}
Compiled as
$ nvcc -cubin -arch=sm_75 vecAdd.cu -o vecAdd.cubin
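To double-check which symbols the cubin exports (these are exactly what cuModuleGetFunction in main.cu below will look up), one can dump its symbol table, e.g.:
$ cuobjdump -symbols vecAdd.cubin
The mangled name _Z6vecAddPdS_S_i should appear among the function symbols.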
main.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>

#define CUDA_SAFECALL(call)                                                    \
    {                                                                          \
        call;                                                                  \
        cudaError err = cudaGetLastError();                                    \
        if (cudaSuccess != err) {                                              \
            fprintf(                                                           \
                stderr,                                                        \
                "Cuda error in function '%s' file '%s' in line %i : %s.\n",    \
                #call, __FILE__, __LINE__, cudaGetErrorString(err));           \
            fflush(stderr);                                                    \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }

#define SAFECALL_DRV(call)                                                     \
    {                                                                          \
        CUresult err = call;                                                   \
        if (err != CUDA_SUCCESS) {                                             \
            const char *errStr;                                                \
            cuGetErrorString(err, &errStr);                                    \
            fprintf(                                                           \
                stderr,                                                        \
                "CUDA Driver API error in function '%s' file '%s' in line %i : %s.\n", \
                #call, __FILE__, __LINE__, errStr);                            \
            fflush(stderr);                                                    \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }

int main(int argc, char *argv[]) {
    int n = 100000000;  // Size of the vectors
    if (argc > 1) n = atoi(argv[1]);

    // Initialize the CUDA Driver API
    cuInit(0);

    // Get a CUDA device and create a context
    CUdevice device;
    CUcontext context;
    cuDeviceGet(&device, 0);
    cuCtxCreate(&context, 0, device);
    cuDevicePrimaryCtxRetain(&context, device);

    // Load the module from vecAdd.cubin
    CUmodule module;
    SAFECALL_DRV(cuModuleLoad(&module, "vecAdd.cubin"));

    // Create a CUDA stream for asynchronous execution
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Host and device vectors
    double *h_a, *h_b, *h_c;
    double *d_a, *d_b, *d_c;
    size_t bytes = n * sizeof(double);

    // Allocate host memory
    h_a = (double *)malloc(bytes);
    h_b = (double *)malloc(bytes);
    h_c = (double *)malloc(bytes);

    // Initialize host vectors
    for (int i = 0; i < n; i++) {
        h_a[i] = sin(i) * sin(i);
        h_b[i] = cos(i) * cos(i);
        h_c[i] = 0;
    }

    // Look up the kernel in the loaded module by its mangled symbol name
    CUfunction vecAddFunc;
    SAFECALL_DRV(cuModuleGetFunction(&vecAddFunc, module, "_Z6vecAddPdS_S_i"));
    printf("vecAdd: %p\n", (void *)vecAddFunc);

    // Allocate device memory
    cudaMallocAsync(&d_a, bytes, stream);
    cudaMallocAsync(&d_b, bytes, stream);
    cudaMallocAsync(&d_c, bytes, stream);

    // Copy data from host to device
    cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, stream);

    // Time the kernel execution
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    for (int i = 0; i < 10; i++) {
        cudaEventRecord(start, stream);
        int gridSize = (int)ceil((float)n / 1024);
        void *args[] = {&d_a, &d_b, &d_c, &n};
        SAFECALL_DRV(cuLaunchKernel(
            vecAddFunc,      // Kernel function
            gridSize, 1, 1,  // Grid dimensions
            1024, 1, 1,      // Block dimensions
            0,               // Shared memory
            stream,          // Stream
            args,            // Kernel arguments
            NULL             // Extra (not used)
        ));
        cudaStreamSynchronize(stream);
        cudaEventRecord(stop, stream);
        cudaEventSynchronize(stop);
        float time = 0;
        cudaEventElapsedTime(&time, start, stop);
        printf("Iteration %d: Time vecAdd: %f ms\n", i, time);
    }

    // Copy array back to host using async memory copy
    cudaMemcpyAsync(h_c, d_c, bytes, cudaMemcpyDeviceToHost, stream);

    // Release device memory using async memory deallocation
    cudaFreeAsync(d_a, stream);
    cudaFreeAsync(d_b, stream);
    cudaFreeAsync(d_c, stream);

    // Synchronize the stream to ensure everything is done
    cudaStreamSynchronize(stream);

    // Sum up vector c and print result divided by n; this should equal 1 within error
    double sum = 0;
    for (int i = 0; i < n; i++) sum += h_c[i];
    printf("Final sum = %f; sum/n = %f (should be ~1)\n", sum, sum / n);

    // Clean up resources (device buffers were already released via cudaFreeAsync)
    cudaStreamDestroy(stream);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(h_a);
    free(h_b);
    free(h_c);

    // Destroy the CUDA context
    cuCtxDestroy(context);
    return 0;
}
$ nvcc main.cu -lcuda
$ ./a.out
vecAdd: 0x56400fc49640
Iteration 0: Time vecAdd: 6.092896 ms
...
Iteration 9: Time vecAdd: 6.029056 ms
Final sum = 100000000.000000; sum/n = 1.000000 (should be ~1)
In the above code, I load the cubin
of the vecAdd
kernel in the main.cu
file and then get hold of a host-side handle (as is evident from the host address 0x56400fc49640
) to the vecAdd
kernel using cuModuleGetFunction
, passing just the symbol name of the kernel function as a string (_Z6vecAddPdS_S_i
). (In this particular situation the mangled C++-style name happens to encode the function's signature, but that might not always be the case.) Using that handle, I can launch the kernel with cuLaunchKernel
.
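For reference, the signature encoded in that mangled name can be recovered with a demangler:
$ c++filt _Z6vecAddPdS_S_i
vecAdd(double*, double*, double*, int)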
The output file (main.cu.cpp.ii) of
$ nvcc -cuda main.cu -o main.cu.cpp.ii
does not contain any __cudaRegisterFunction line in its __nv_cudaEntityRegisterCallback
, but the output of
$ nvcc -cuda vecAdd.cu -o vecAdd.cu.cpp.ii
contains a registration entry for _Z6vecAddPdS_S_i
. So, I guess cuModuleLoad
and cuModuleGetFunction
are performing the equivalent registration in the main executable at run time.
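A quick way to see this difference is to grep the two generated files for the mangled name:
$ grep _Z6vecAddPdS_S_i main.cu.cpp.ii    # no matches
$ grep _Z6vecAddPdS_S_i vecAdd.cu.cpp.ii  # shows the __cudaRegisterFunction entry, among others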
I was wondering:
- Is there any way, using just the string _Z6vecAddPdS_S_i
, to get hold of the device-side pointer of the kernel vecAdd
, just as we get hold of the host-side handle of the vecAdd
kernel using cuModuleGetFunction
?
- Or, suppose vecAdd.cu
is compiled to vecAdd.o
using nvcc -c vecAdd.cu -o vecAdd.o -rdc=true
, and main.cu is compiled as nvcc main.cu vecAdd.o
. Defining a __device__
function pointer then requires us to know the signature of vecAdd
(for the external linkage) in main.cu
:

typedef void (*fp)(double *, double *, double *, int);
extern __global__ void vecAdd(double *a, double *b, double *c, int n);
__device__ fp kernelPtr = vecAdd;

Is there a way out, where I can get the device function pointer of vecAdd
without knowing its signature (along the lines of CUDA dynamic parallelism; there is a discussion here, but unfortunately there both the parent and the child kernel live in the same module)? One direction I considered is sketched below.
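For concreteness, the closest workaround I could sketch is the following, assuming vecAdd.cu itself (i.e., inside the module) defines the __device__ pointer variable, so that main.cu only needs the variable's name as a string and never sees the kernel's signature. I have not verified that initializing such a variable with the kernel's address is supported in a standalone cubin, so treat this as a sketch of the direction, not a confirmed solution:
// Hypothetically added to vecAdd.cu, next to the kernel itself:
//   typedef void (*fp)(double *, double *, double *, int);
//   __device__ fp kernelPtrvecAdd = vecAdd;

// In main.cu, after cuModuleLoad: fetch the variable purely by name.
CUdeviceptr varAddr;
size_t varSize;
SAFECALL_DRV(cuModuleGetGlobal(&varAddr, &varSize, module, "kernelPtrvecAdd"));

void *d_vecAddPtr = NULL;  // device-side function pointer of vecAdd
SAFECALL_DRV(cuMemcpyDtoH(&d_vecAddPtr, varAddr, varSize));
printf("device-side vecAdd: %p\n", d_vecAddPtr);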