Note: If the post seems long, you can jump directly to the section starting with “I was wondering…” at the end and skip the buildup/context.
Buildup/Context:
For the code mentioned in the post, it is the __nv_cudaEntityRegisterCallback function injected by the nvcc compiler that registers the host-side kernel stub ((void (*)(double*, double*, double*, int))vecAdd) against the device-side symbol _Z6vecAddPdS_S_i, so that during a cudaLaunchKernel API call the runtime can resolve the host stub to the GPU function through a lookup table (i.e., get hold of the device pointer), as mentioned here.
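For reference, this registered mapping is exactly what an ordinary runtime-API launch consumes: the <<<...>>> syntax is lowered to a cudaLaunchKernel call whose first argument is the address of the host-side stub. A rough sketch of the lowering (my approximation, not the exact code nvcc emits; gridSize, blockSize, the device buffers, and stream are assumed from the surrounding context):
// Approximate lowering of: vecAdd<<<gridSize, blockSize, 0, stream>>>(d_a, d_b, d_c, n);
void *kernelArgs[] = { &d_a, &d_b, &d_c, &n };
// (const void *)vecAdd is the host stub's address, i.e., the lookup key that
// the runtime maps to the device function registered under _Z6vecAddPdS_S_i.
cudaLaunchKernel((const void *)vecAdd, gridSize, blockSize, kernelArgs, 0, stream);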
Similarly, a __device__ function pointer definition that takes the device-side address of the kernel vecAdd, as in:
typedef void (*fp)(double *, double *, double *, int);
__device__ fp kernelPtrvecAdd = vecAdd;
is also handled by the __nv_cudaEntityRegisterCallback function.
static void __nv_cudaEntityRegisterCallback(void **__T23) {
    // Save the fat binary handle for managed runtime
    __nv_save_fatbinhandle_for_managed_rt(__T23);
    ...
    ...
    // Register the vecAdd function
    __cudaRegisterFunction(
        __T23,
        (const char*)((void (*)(double*, double*, double*, int))vecAdd),
        "_Z6vecAddPdS_S_i",
        "_Z6vecAddPdS_S_i",
        -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0
    );
    // Register the kernelPtrvecAdd variable
    __cudaRegisterVar(
        __T23,
        (char*)&::kernelPtrvecAdd,
        "kernelPtrvecAdd",
        "kernelPtrvecAdd",
        0, 8UL, 0, 0
    );
    ...
    ...
}
The above code snippet can be obtained by compiling the post.cu file here as:
$ nvcc -cuda post.cu -o post.cu.cpp.ii
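For completeness, the relevant part of such a single-file post.cu would look like the sketch below; the cudaMemcpyFromSymbol read-back in main is my own illustration of what the __cudaRegisterVar entry enables, not necessarily part of the original post.cu:
// post_sketch.cu -- hypothetical single-file reconstruction
#include <cuda_runtime.h>
#include <stdio.h>

__global__ void vecAdd(double *a, double *b, double *c, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) c[id] = a[id] + b[id];
}

typedef void (*fp)(double *, double *, double *, int);
__device__ fp kernelPtrvecAdd = vecAdd;  // registered via __cudaRegisterVar

int main() {
    // Because kernelPtrvecAdd was registered by name, the host can copy the
    // device-side address of vecAdd out of it:
    fp h_ptr = NULL;
    cudaMemcpyFromSymbol(&h_ptr, kernelPtrvecAdd, sizeof(fp));
    printf("device-side address of vecAdd: %p\n", (void *)h_ptr);
    // h_ptr is a *device* code address: it can be passed to and called from
    // device code, but must never be called directly on the host.
    return 0;
}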
But consider the situation in the example given below, where I have the following setup:
vecAdd.cu
// vecAdd.cu
#include <cuda_runtime.h>
#include <stdio.h>
// CUDA kernel that adds two vectors, each thread handles one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) {
        c[id] = a[id] + b[id];
    }
}
Compiled as
$ nvcc -cubin -arch=sm_75 vecAdd.cu -o vecAdd.cubin
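As a sanity check that this mangled symbol actually ends up in the cubin, the ELF symbol table can be dumped; to the best of my recollection, -symbols is the right cuobjdump flag, and the output should list _Z6vecAddPdS_S_i:
$ cuobjdump -symbols vecAdd.cubin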
main.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>
#define CUDA_SAFECALL(call) \
    { \
        call; \
        cudaError err = cudaGetLastError(); \
        if (cudaSuccess != err) { \
            fprintf( \
                stderr, \
                "Cuda error in function '%s' file '%s' in line %i : %s.\n", \
                #call, __FILE__, __LINE__, cudaGetErrorString(err)); \
            fflush(stderr); \
            exit(EXIT_FAILURE); \
        } \
    }
#define SAFECALL_DRV(call) \
    { \
        CUresult err = call; \
        if (err != CUDA_SUCCESS) { \
            const char *errStr; \
            cuGetErrorString(err, &errStr); \
            fprintf( \
                stderr, \
                "CUDA Driver API error in function '%s' file '%s' in line %i : %s.\n", \
                #call, __FILE__, __LINE__, errStr); \
            fflush(stderr); \
            exit(EXIT_FAILURE); \
        } \
    }
int main(int argc, char *argv[]) {
    int n = 100000000;  // Size of the vectors
    if (argc > 1) n = atoi(argv[1]);

    // Initialize the CUDA Driver API
    cuInit(0);

    // Get a CUDA device and create a context
    CUdevice device;
    CUcontext context;
    cuDeviceGet(&device, 0);
    cuCtxCreate(&context, 0, device);

    // Load the module from vecAdd.cubin
    CUmodule module;
    SAFECALL_DRV(cuModuleLoad(&module, "vecAdd.cubin"));

    // Create a CUDA stream for asynchronous execution
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Host and device vectors
    double *h_a, *h_b, *h_c;
    double *d_a, *d_b, *d_c;
    size_t bytes = n * sizeof(double);

    // Allocate host memory
    h_a = (double *)malloc(bytes);
    h_b = (double *)malloc(bytes);
    h_c = (double *)malloc(bytes);

    // Initialize host vectors
    for (int i = 0; i < n; i++) {
        h_a[i] = sin(i) * sin(i);
        h_b[i] = cos(i) * cos(i);
        h_c[i] = 0;
    }

    // Look up the kernel in the loaded module by its mangled name
    CUfunction vecAddFunc;
    SAFECALL_DRV(cuModuleGetFunction(&vecAddFunc, module, "_Z6vecAddPdS_S_i"));
    printf("vecAdd: %p\n", (void *)vecAddFunc);

    // Allocate device memory
    cudaMallocAsync(&d_a, bytes, stream);
    cudaMallocAsync(&d_b, bytes, stream);
    cudaMallocAsync(&d_c, bytes, stream);

    // Copy data from host to device
    cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, stream);

    // Time the kernel execution
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    for (int i = 0; i < 10; i++) {
        cudaEventRecord(start, stream);
        int gridSize = (int)ceil((float)n / 1024);
        void *args[] = { &d_a, &d_b, &d_c, &n };
        SAFECALL_DRV(cuLaunchKernel(
            vecAddFunc,      // Kernel function
            gridSize, 1, 1,  // Grid dimensions
            1024, 1, 1,      // Block dimensions
            0,               // Shared memory
            stream,          // Stream
            args,            // Kernel arguments
            NULL             // Extra (not used)
        ));
        cudaStreamSynchronize(stream);
        cudaEventRecord(stop, stream);
        cudaEventSynchronize(stop);
        float time = 0;
        cudaEventElapsedTime(&time, start, stop);
        printf("Iteration %d: Time vecAdd: %f ms\n", i, time);
    }

    // Copy array back to host using async memory copy
    cudaMemcpyAsync(h_c, d_c, bytes, cudaMemcpyDeviceToHost, stream);

    // Release device memory using async memory deallocation
    cudaFreeAsync(d_a, stream);
    cudaFreeAsync(d_b, stream);
    cudaFreeAsync(d_c, stream);

    // Synchronize the stream to ensure everything is done
    cudaStreamSynchronize(stream);

    // Sum up vector c and print the result divided by n; this should equal 1 within error
    double sum = 0;
    for (int i = 0; i < n; i++) sum += h_c[i];
    printf("Final sum = %f; sum/n = %f (should be ~1)\n", sum, sum / n);

    // Clean up resources
    cudaStreamDestroy(stream);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(h_a);
    free(h_b);
    free(h_c);

    // Destroy the CUDA context
    cuCtxDestroy(context);
    return 0;
}
$ nvcc main.cu -lcuda
$ ./a.out
vecAdd: 0x56400fc49640
Iteration 0: Time vecAdd: 6.092896 ms
...
Iteration 9: Time vecAdd: 6.029056 ms
Final sum = 100000000.000000; sum/n = 1.000000 (should be ~1)
In the above code, I load the cubin of the vecAdd kernel in main.cu and then get hold of a host-side handle/stub for the kernel (as is evident from the host address 0x56400fc49640) via cuModuleGetFunction, passing just the kernel's symbol name as a string (_Z6vecAddPdS_S_i). (The mangled C++-style name happens to encode the function signature in this particular situation, but that might not always be the case.) Using this handle, I can launch the kernel with cuLaunchKernel.
The output of nvcc -cuda main.cu -o main.cu.cpp.ii does not contain any registration call in its __nv_cudaEntityRegisterCallback, but the output of nvcc -cuda vecAdd.cu -o vecAdd.cu.cpp.ii does contain a register-function entry for _Z6vecAddPdS_S_i. So I guess cuModuleLoad and cuModuleGetFunction are installing that mapping in the main executable at run time.
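(This difference is easy to verify by grepping the two generated files for the registration call; grep -c just counts the matching lines, so the first command should print 0 and the second should print a nonzero count:)
$ nvcc -cuda main.cu -o main.cu.cpp.ii && grep -c __cudaRegisterFunction main.cu.cpp.ii
$ nvcc -cuda vecAdd.cu -o vecAdd.cu.cpp.ii && grep -c __cudaRegisterFunction vecAdd.cu.cpp.ii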
I was wondering,
- Is there any way by which, using just the string _Z6vecAddPdS_S_i, I could get hold of the device-side pointer of the kernel vecAdd, just as using cuModuleGetFunction we get hold of the host pointer of the vecAdd kernel?
- Or: if vecAdd.cu were compiled to vecAdd.o using nvcc -c vecAdd.cu -o vecAdd.o -rdc=true and main were compiled as nvcc main.cu vecAdd.o, then defining a __device__ function pointer requires us to know the signature of vecAdd (for the external linkage) in main.cu:
typedef void (*fp)(double *, double *, double *, int);
extern __global__ void vecAdd(double *a, double *b, double *c, int n);
__device__ fp kernelPtr = vecAdd;
Is there a way out where I can get the device function pointer of vecAdd without knowing its signature (along the lines of CUDA dynamic parallelism; here, there is a discussion, but unfortunately both the parent and the child kernel are in the same module)?
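For context, the closest workaround I am aware of is to push the problem into the module itself: define the __device__ pointer inside vecAdd.cu and read it back by name with cuModuleGetGlobal. A sketch of what I mean (reusing module and SAFECALL_DRV from main.cu above), though it requires modifying vecAdd.cu, which is exactly what I would like to avoid:
// Hypothetically added to vecAdd.cu before building the cubin:
//   typedef void (*fp)(double *, double *, double *, int);
//   __device__ fp kernelPtrvecAdd = vecAdd;
// Then main.cu can fetch the device-side pointer by name alone:
CUdeviceptr varAddr;
size_t varSize;
SAFECALL_DRV(cuModuleGetGlobal(&varAddr, &varSize, module, "kernelPtrvecAdd"));
void *devFnPtr = NULL;
SAFECALL_DRV(cuMemcpyDtoH(&devFnPtr, varAddr, sizeof(devFnPtr)));
printf("device-side pointer of vecAdd: %p\n", devFnPtr);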