Calling cuModuleLoad while another kernel is running

Hello,

I have a kernel that runs for quite a while and, while I’m running it, I want to load some PTX files in as modules with cuModuleLoad.

However, it seems as if cuModuleLoad likes to wait for the kernel to complete before loading in the PTX.

To get around this, I’ve found that I have to load all of my PTX at once before I launch any of my kernels to prevent it from stalling on cuModuleLoad. But I would rather load in the modules while my kernel is running.
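
For reference, here's roughly what the load-everything-up-front workaround looks like; the filenames and kernel name below are just placeholders:

//workaround sketch: load every module before launching anything
CUmodule mod_a, mod_b;
cuModuleLoad(&mod_a, "first.ptx");   //placeholder filename
cuModuleLoad(&mod_b, "second.ptx");  //placeholder filename

CUfunction fn;
cuModuleGetFunction(&fn, mod_a, "my_kernel"); //placeholder kernel name

//...only now launch the long-running kernel...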

Is there any way that I can do a cuModuleLoad while a kernel is running? If not, why is that the case?

Here’s a fun example that shows the issue. It appears to stall indefinitely on the cuModuleLoad call, but it runs to completion after commenting out the cuModuleLoad and the cuModuleUnload. The horde of printfs does not appear to be the cause; the problem remains if they are removed.

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

#include <cstdio>
#include <iostream>

#define CHECK(cmd) \
do {\
    cudaError_t error = cmd;\
    if (error != cudaSuccess) { \
      std::cerr << "Encountered CUDA runtime error \"" << ::cudaGetErrorString(error) \
           << " (" << error << ")\" at line " \
           << __LINE__ << " in file " << __FILE__ << "\n";\
      exit(-1);\
    }\
} while(0)

#define CHECK_DRIVER(cmd) \
do {\
    CUresult result = cmd;\
    if (result != CUDA_SUCCESS) { \
      std::cerr << "Encountered CUDA driver error " << result << " at line " \
           << __LINE__ << " in file " << __FILE__ << "\n";\
      exit(-1);\
    }\
} while(0)

//sing a song until we tell it to knock it off
__global__ void
sing_the_song(volatile int *keep_going)
{
  unsigned int bottles_of_beer = 100;
  while (keep_going[0])
  {
    printf("%u bottles of beer on the wall\n"
           "%u bottles of beer\n"
           "take one down, pass it around\n", bottles_of_beer, bottles_of_beer);
    bottles_of_beer--;
  }
}

int main()
{
  //get a pair of zerocopy int pointers
  volatile int *h_keep_going, *d_keep_going;
  CHECK(cudaHostAlloc((void**) &h_keep_going, sizeof(int), cudaHostAllocMapped|cudaHostAllocWriteCombined));
  CHECK(cudaHostGetDevicePointer( (void**) &d_keep_going, (void*) h_keep_going, 0 ));
  
  //initialize keep_going to 1
  h_keep_going[0] = true;
  
  //create stream and launch kernel
  cudaStream_t stream;
  CHECK(cudaStreamCreate(&stream));
  sing_the_song<<<1,1,0,stream>>>(d_keep_going);
  fprintf(stderr, "launched the kernel\n");
  
  CUmodule *module = new CUmodule;
  
  //try to load a module
  fprintf(stderr, "loading in a module...\n");
  CHECK_DRIVER(::cuModuleLoad(module, "some_ptx"));  //it appears to stall here
  fprintf(stderr, "module loaded!\n");
  
  //tell the kernel to stop
  h_keep_going[0] = false;
  
  fprintf(stderr, "synchronizing stream...\n");
  
  CHECK(cudaStreamSynchronize(stream));
  
  fprintf(stderr, "all done!\n");
  
  CHECK(cudaStreamDestroy(stream));
  CHECK(cudaFreeHost((void*) h_keep_going));
  CHECK_DRIVER(cuModuleUnload(*module));
  
  delete module;
}

I compiled and ran it with:

nvcc -o cuModuleLoad_implicit_sync cuModuleLoad_implicit_sync.cu -Xcompiler --std=c++11 -lcuda -lcudart -lstdc++

./cuModuleLoad_implicit_sync >> /dev/null
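
To reproduce this, "some_ptx" can be any valid PTX file; for example, a trivial kernel compiled with nvcc's -ptx flag:

//trivial.cu -- the contents don't matter, we just need a PTX file to load
__global__ void noop() { }

nvcc -ptx trivial.cu -o some_ptx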

It’s likely that cuModuleLoad affects the memory map of the GPU: device allocations presumably have to be made to store the various module resources. This seems evident from the API description:

CUDA Driver API :: CUDA Toolkit Documentation

As a general rule, changes to the memory map of the GPU will not proceed while the GPU is executing a kernel. You should see similar behavior if you attempt a cudaMalloc while the GPU is executing a kernel.
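
If you want to check whether a given call waits on the device, you can time it against a kernel of known length. Here is a minimal, self-contained sketch (the cycle count is a rough guess and scales with your GPU's clock rate; if your GPU drives a display, keep it short enough to stay under the watchdog timeout):

#include <cuda_runtime.h>

#include <chrono>
#include <cstdio>

//spin for roughly the requested number of GPU clock cycles
__global__ void spin_for(long long cycles)
{
  long long start = clock64();
  while (clock64() - start < cycles) { }
}

int main()
{
  cudaFree(0); //force context creation so the timed call isn't paying init costs

  spin_for<<<1,1>>>(2000000000LL); //roughly a second or two at typical clock rates

  auto t0 = std::chrono::steady_clock::now();
  int *p;
  cudaMalloc(&p, 1 << 20); //does this wait for the kernel to finish?
  auto t1 = std::chrono::steady_clock::now();

  printf("cudaMalloc blocked the host for %lld ms\n",
         (long long) std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count());

  cudaDeviceSynchronize();
  cudaFree(p);
}

If the elapsed time is close to the kernel's runtime, the call waited for the kernel; if it is near zero, it did not.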

Alright. Thank you for the help, Bob.

Bob,

You said that I should see similar behavior if I try to do a cudaMalloc while the GPU is running a kernel.

I modified the example I originally posted to do a massive cudaMalloc (2x10^8 ints) instead of a cuModuleLoad.

The example ran to completion without issue.

Here’s the modified example :

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

#include <cstdio>
#include <iostream>

#define CHECK(cmd) \
do {\
    cudaError_t error = cmd;\
    if (error != cudaSuccess) { \
      std::cerr << "Encountered CUDA runtime error \"" << ::cudaGetErrorString(error) \
           << " (" << error << ")\" at line " \
           << __LINE__ << " in file " << __FILE__ << "\n";\
      exit(-1);\
    }\
} while(0)

#define CHECK_DRIVER(cmd) \
do {\
    CUresult result = cmd;\
    if (result != CUDA_SUCCESS) { \
      std::cerr << "Encountered CUDA driver error " << result << " at line " \
           << __LINE__ << " in file " << __FILE__ << "\n";\
      exit(-1);\
    }\
} while(0)

//sing a song until we tell it to knock it off
__global__ void
sing_the_song(volatile int *keep_going)
{
  unsigned int bottles_of_beer = 100;
  while (keep_going[0])
  {
    printf("%u bottles of beer on the wall\n"
           "%u bottles of beer\n"
           "take one down, pass it around\n", bottles_of_beer, bottles_of_beer);
    bottles_of_beer--;
  }
}

int main()
{
  //get a pair of zerocopy int pointers
  volatile int *h_keep_going, *d_keep_going;
  CHECK(cudaHostAlloc((void**) &h_keep_going, sizeof(int), cudaHostAllocMapped|cudaHostAllocWriteCombined));
  CHECK(cudaHostGetDevicePointer( (void**) &d_keep_going, (void*) h_keep_going, 0 ));
  
  size_t num_elements = 200000000;
  size_t malloc_size = sizeof(int) * num_elements;
  
  int *d_cuda_buf, *h_cuda_buf;
  h_cuda_buf = (int*) malloc(malloc_size);
  
  //initialize keep_going to 1
  h_keep_going[0] = true;
  
  //create stream and launch kernel
  cudaStream_t stream;
  CHECK(cudaStreamCreate(&stream));
  sing_the_song<<<1,1,0,stream>>>(d_keep_going);
  fprintf(stderr, "launched the kernel\n");
  
  //try to do a cudaMalloc while kernel is running
  fprintf(stderr, "calling cudaMalloc...\n");
  CHECK(cudaMalloc(&d_cuda_buf, malloc_size)); //no stall here
  fprintf(stderr, "malloced!\n");
  
  //tell the kernel to stop
  h_keep_going[0] = false;
  fprintf(stderr, "kernel stopped\n");
  
  //do something with the cudaMalloced buffer
  CHECK(cudaMemset(d_cuda_buf, 15, malloc_size)); //set each byte to 0x0F (15)
  CHECK(cudaMemcpy(h_cuda_buf, d_cuda_buf, malloc_size, cudaMemcpyDeviceToHost));
  ssize_t sum = 0;
  for (size_t i = 0; i < num_elements; i++)
    sum += h_cuda_buf[i];
  fprintf(stderr, "sum obtained : %lld\n", sum);
  
  fprintf(stderr, "synchronizing stream...\n");
  
  CHECK(cudaStreamSynchronize(stream));
  
  fprintf(stderr, "all done!\n");
  
  CHECK(cudaStreamDestroy(stream));
  CHECK(cudaFreeHost((void*) h_keep_going));
  CHECK(cudaFree(d_cuda_buf));
  free(h_cuda_buf);
}

Why does this work but not the cuModuleLoad?

Sorry, I appear to be just completely wrong about that (I believe cudaMalloc synchronizes with device activity internally, but does not block the host thread). I’m not sure why that driver API call blocks, but some of the blocking calls I know about include things like cudaMemcpy. If cuModuleLoad were using something like cudaMemcpy under the hood to initialize resources on the GPU, that could explain the blocking.
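
As an aside: if what you need to do while the kernel runs is move data rather than load modules, the standard way to avoid a blocking copy is cudaMemcpyAsync from pinned host memory in a non-default stream. A rough sketch (error checking omitted for brevity):

#include <cuda_runtime.h>

int main()
{
  const size_t N = 1 << 20;
  int *h_buf, *d_buf;

  //pinned host memory, so the async copy can actually overlap device work
  cudaHostAlloc((void**) &h_buf, N * sizeof(int), cudaHostAllocDefault);
  cudaMalloc(&d_buf, N * sizeof(int));

  cudaStream_t copy_stream;
  cudaStreamCreate(&copy_stream);

  //issued into its own stream: does not block the host, and can run
  //concurrently with a kernel launched in a different stream
  cudaMemcpyAsync(d_buf, h_buf, N * sizeof(int), cudaMemcpyHostToDevice, copy_stream);

  //...other host work here...

  cudaStreamSynchronize(copy_stream); //wait for just the copy

  cudaStreamDestroy(copy_stream);
  cudaFree(d_buf);
  cudaFreeHost(h_buf);
}

That doesn't explain the cuModuleLoad behavior, but it's the usual tool when a copy, rather than a module load, is what's blocking.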