Hello,
I have a kernel that runs for quite a while and, while I’m running it, I want to load some PTX files in as modules with cuModuleLoad.
However, it seems as if cuModuleLoad likes to wait for the kernel to complete before loading in the PTX.
To get around this, I’ve found that I have to load all of my PTX at once before I launch any of my kernels to prevent it from stalling on cuModuleLoad. But I would rather load in the modules while my kernel is running.
Is there any way that I can do a cuModuleLoad while a kernel is running? If not, why is that the case?
Here’s a fun example that shows the issue. It appears to stall indefinitely on the cuModuleLoad call but runs to completion after commenting out the cuModuleLoad and the cuModuleUnload. The hoard of printfs does not appear to be causing the issue, as the problem remains if they are removed.
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <iostream>
#define CHECK(cmd) \
do {\
cudaError_t error = cmd;\
if (error != cudaSuccess) { \
std::cerr << "Encountered CUDA runtime error \"" << ::cudaGetErrorString(error) \
<< " (" << error << ")\" at line " \
<< __LINE__ << " in file " << __FILE__<< "\n";\
exit(-1);\
}\
} while(0)
#define CHECK_DRIVER(cmd) \
do {\
CUresult result = cmd;\
if (result != CUDA_SUCCESS) { \
std::cerr << "Encountered CUDA driver error " << result << " at line " \
<< __LINE__ << " in file " << __FILE__ << "\n";\
exit(-1);\
}\
} while(0)
//sing a song until we tell it to knock it off
__global__ void
sing_the_song(volatile int *keep_going)
{
unsigned int bottles_of_beer = 100;
while (keep_going[0])
{
printf("%u bottles of beer on the wall\n"
"%u bottles of beer\n"
"take one down, pass it around\n", bottles_of_beer, bottles_of_beer);
bottles_of_beer--;
}
}
int main()
{
//get a pair of zerocopy int pointers
volatile int *h_keep_going, *d_keep_going;
CHECK(cudaHostAlloc((void**) &h_keep_going, sizeof(int), cudaHostAllocMapped|cudaHostAllocWriteCombined));
CHECK(cudaHostGetDevicePointer( (void**) &d_keep_going, (void*) h_keep_going, 0 ));
//initialize keep_going to 1
h_keep_going[0] = true;
//create stream and launch kernel
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
sing_the_song<<<1,1,0,stream>>>(d_keep_going);
fprintf(stderr, "launched the kernel\n");
CUmodule *module = new CUmodule;
//try to load a module
fprintf(stderr, "loading in a module...\n");
CHECK_DRIVER(::cuModuleLoad(module, "some_ptx")); //it appears to stall here
fprintf(stderr, "module loaded!\n");
//tell the kernel to stop
h_keep_going[0] = false;
fprintf(stderr, "synchronizing stream...\n");
CHECK(cudaStreamSynchronize(stream));
fprintf(stderr, "all done!\n");
CHECK(cudaStreamDestroy(stream));
CHECK(cudaFreeHost((void*) h_keep_going));
CHECK_DRIVER(cuModuleUnload(*module));
delete module;
}
I compiled and ran it with :
nvcc -o cuModuleLoad_implicit_sync cuModuleLoad_implicit_sync.cu -Xcompiler --std=c++11 -lcuda -lcudart -lstdc++
./cuModuleLoad_implicit_sync >> /dev/null