Hi all,
I’m trying to work with a library that contains CUDA host and device code (halloc) using the CUDA driver API (from Julia). The problem is that the library has an init
function that sets up global device state, which seems to require being linked statically.
A simplified example library (lib.cu
):
__constant__ int initialized = 0;
extern "C" void init() {
void *addr;
cudaGetSymbolAddress(&addr, initialized);
int val = 1;
printf("Initialized (%p)\n", addr);
cudaMemcpy(addr, &val, sizeof(val), cudaMemcpyHostToDevice);
}
extern "C" __device__ void check() {
printf("Initialized (%p): %s\n", &initialized, initialized ? "yes" : "no");
}
Calling check
from a kernel should print yes
if the init
host function has first been called. This works when compiling this code to a static library using nvcc -lib -rdc=true -o lib.a lib.cu
, and then linking it statically with e.g. the following code:
extern "C" void init();
extern "C" __device__ void check();
extern "C" __global__ void kernel() { check(); }
int main() {
init();
kernel<<<1, 1>>>();
cudaDeviceSynchronize();
return 0;
}
It prints:
Initialized (0xb03d20000)
Initialized (0xb03d20000): yes
I’m trying to accomplish the same using the CUDA driver API without linking statically. Although I can compile lib.cu
to a dynamic library (nvcc -shared -Xcompiler -fPIC -rdc=true -o lib.so lib.cu
), there doesn’t seem to be a way to pass that file to the CUDA linker. Naively using both lib.a
(to pass to the CUDA linker) and lib.so
(to call the init
function) of course fails:
int main() {
cuCheck(cuInit(0));
CUdevice device;
cuCheck(cuDeviceGet(&device, 0));
CUcontext context;
cuCheck(cuDevicePrimaryCtxRetain(&context, device));
cuCheck(cuCtxPushCurrent(context));
CUlinkState linker;
cuCheck(cuLinkCreate(0, NULL, NULL, &linker));
cuCheck(cuLinkAddFile(linker, CU_JIT_INPUT_LIBRARY, "lib.a", 0, NULL, NULL));
cuCheck(cuLinkAddFile(linker, CU_JIT_INPUT_PTX, "kernel.ptx", 0, NULL, NULL));
void *image;
size_t image_size;
cuCheck(cuLinkComplete(linker, &image, &image_size));
cuCheck(cuLinkDestroy(linker));
void *handle = dlopen("./lib.so", RTLD_GLOBAL | RTLD_NOW);
if (!handle) {
perror(dlerror());
exit(1);
}
void (*init)(void) = dlsym(handle, "init");
init();
CUmodule module;
cuCheck(cuModuleLoadData(&module, image));
CUfunction kernel;
cuCheck(cuModuleGetFunction(&kernel, module, "kernel"));
cuCheck(cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, NULL));
cuCheck(cuStreamSynchronize(NULL));
return 0;
}
(kernel.ptx
is the same PTX as generated from the previous example, and just defines a kernel kernel
that calls the device function check
).
This fails, and prints the following:
Initialized (0xb03d20400)
Initialized (0xb03d20000): no
Is there a way to do what I’m trying to accomplish here? I would assume that some of the steps that nvcc
does when statically linking the working example above could be used to generate relocated host and device code for me to call and link, but I can’t seem to find out how (I would assume the nvlink
step that is shown by nvcc ... --dryrun
, but I’ve failed to extract a working invocation).