Docker IPC

I receive an “invalid device context” error (201) when opening CUDA IPC memory inside a docker container.

The following simplified code is meant to run in two or more docker containers.
It works perfectly under many conditions… until it decides not to work.

When run with no parameters, the developer can spin up multiple docker instances reading from the same IPC memory.
When ‘-c’ is specified on the command line, the docker container will create and populate the IPC memory.

Teasing apart the NVIDIA-provided simple IPC example to run across multiple docker containers proved to be too complex for this exercise and introduced too many variables. The following code has been tested and appears to work on the host with no issues.

compile the example code below and place it in a known volume

$ nvcc -std=c++11 simpleIPC.cu -o simpleIPC
$ sudo cp simpleIPC /opt/
$ sudo chmod 777 /opt/simpleIPC

enable user access to the IPC memory handle UID stored on the file system

$ sudo touch /opt/simpleIPC.uid
$ sudo chmod 777 /opt/simpleIPC.uid

demonstrate functioning IPC on the host (no docker)

$ /opt/simpleIPC

this instance will actively poll until you run the next command

$ /opt/simpleIPC -c

observe the reading instance dumping memory values equal to the source instance’s loop count

Kill the prior examples

Ctrl-C will cleanly deallocate resources in both applications

Now run the same executable inside the docker containers

$ sudo docker run -it --runtime=nvidia --rm --name simpleIPC_read_N -v /opt:/opt nvidia/cuda:10.1-devel /opt/simpleIPC

repeat this command for as many reader instances (N) as desired; only one is needed to reproduce the fault

This read instance will poll periodically until the file /opt/simpleIPC.uid is present and contains a valid 64-byte IPC memory handle.

now run the next container with the create flag and a unique docker container name

$ sudo docker run -it --runtime=nvidia --rm --name simpleIPC_read_create -v /opt:/opt nvidia/cuda:10.1-devel /opt/simpleIPC -c

observe “failed to CUDA open mem handle 201”
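
For reference, the numeric code can be translated into the runtime’s own wording with cudaGetErrorName()/cudaGetErrorString(); 201 is the “invalid device context” error mentioned at the top. A minimal, stand-alone sketch (not part of simpleIPC.cu; the file name errname.cu is just illustrative):

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// errname.cu: print the name and description of a CUDA runtime error code
// passed on the command line (defaults to 201, the code observed above).
int main(int argc, char *argv[])
{
  int code = (argc > 1) ? atoi(argv[1]) : 201;
  cudaError_t err = (cudaError_t)code;
  printf("%d = %s: %s\n", code, cudaGetErrorName(err), cudaGetErrorString(err));
  return 0;
}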

now for the really interesting part

kill the read instance and allow the create instance to continue running

re-run the reader, this time issuing a shell no-op ahead of the executable in the docker run command:

$ sudo docker run -it --runtime=nvidia --rm --name simpleIPC_read_N -v /opt:/opt nvidia/cuda:10.1-devel /bin/bash -c ":;/opt/simpleIPC"

It works! … huh?

I have run these same tests on multiple NVIDIA architectures and various but similar driver/lib/docker versions.

My current test rig is as follows:
K1100M, driver 418.87.00, CUDA 10.1 (also tested on a Tesla V100)
docker 18.06.1-ce

#include <iostream>

#include <signal.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>   // memset(), strerror()
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

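// device kernel: write `value` into each of the first `length` bytes of `out`, one thread per byte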
__global__
void myCudaMemset(unsigned char *out, unsigned char value, unsigned int length)
{
  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= length) return;
  out[tid] = value;
}

static int running = 1;
static bool create=false;
static int ipc_length = 10240;
static void *ipc_address = 0;
static char uidFileName[] = "/opt/simpleIPC.uid";
static int uidFd=-1;
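// 64-byte handle (CUDA_IPC_HANDLE_SIZE) exchanged through the uid file above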
static cudaIpcMemHandle_t ipc_handle;
static bool ipc_handle_valid = false;
static bool ipc_file_valid = false;
static unsigned char loop = 0;

// Catch any interrupt signal from the OS and disable the run state
static void terminate(int signum)
{
  std::cout << "caught signal " << signum << std::endl;
  running = 0;
}

void ProcessCommandLine(int argc, char *argv[])
{
  int opt;

  while ((opt = getopt(argc, argv, "c")) != -1)
  {
    switch (opt) {

      case 'c':
        create = true;
        break;

      default:
        fprintf(stderr, "Usage: %s\n", argv[0]);
        fprintf(stderr, "    -c                - create CUDA IPC (%d)\n", create);
        exit(1);
    }
  }
}

void CreateIPC()
{
  cudaError_t __err;

  if (uidFd == -1)
  {
    uidFd = open(uidFileName, O_RDWR | O_CREAT, 0666);  // O_CREAT requires an explicit mode
    if (uidFd == -1)
    {
      fprintf(stderr, "Failed to open %s for create\n", uidFileName);
      return;
    }
  }

  // allocate device memory 
  if (ipc_address == 0)
  {
    cudaMalloc((void **)&ipc_address, ipc_length);
    __err = cudaGetLastError();
    if (__err != cudaSuccess)
    {
      ipc_address = 0;
      fprintf(stderr, "failed to CUDA malloc %d bytes: %d\n", ipc_length, __err);
      return;
    }
  }

  if (ipc_handle_valid == false)
  {
    // set up CUDA IPC handle for external processes
    cudaIpcGetMemHandle(&ipc_handle, ipc_address);
  
    __err = cudaGetLastError();
    if (__err != cudaSuccess)
    {
      fprintf(stderr, "failed to CUDA get mem handle %d\n", __err);
      return;
    }
   
    ipc_handle_valid = true;

    unsigned char *cptr = (unsigned char *)&ipc_handle;
    for (int i=0; i<64; i++)
      fprintf(stderr, "%02x.", cptr[i]);
    fprintf(stderr, "\n");
  
  }

  // write IPC handle to file system for other async processes
  if (ipc_file_valid == false)
  {
    lseek(uidFd, 0, SEEK_SET);
    if (write(uidFd, (void *)&ipc_handle, 64) != 64)
    {
      fprintf(stderr, "failed to write UID to %s: %s\n", uidFileName, strerror(errno));
      return;
    }
    ipc_file_valid = true;
  }

  // populate memory with a pattern
  myCudaMemset<<<10, 1024>>>((unsigned char *)ipc_address, loop, ipc_length);
  loop++;
}

void ReadIPC()
{
  cudaError_t __err;

  if (uidFd == -1)
  {
    uidFd = open(uidFileName, O_RDONLY);
    if (uidFd == -1)
    {
      fprintf(stderr, "Failed to open %s for read\n", uidFileName);
      return;
    }
  }

  // read IPC handle from file system 
  if (ipc_handle_valid == false)
  {
    lseek(uidFd, 0, SEEK_SET);
    if (read(uidFd, (void *)&ipc_handle, 64) != 64)
    {
      fprintf(stderr, "failed to read UID from %s: %s\n", uidFileName, strerror(errno));
      return;
    }
    ipc_handle_valid = true;
  }

  if (ipc_address == 0)
  {
    cudaIpcOpenMemHandle((void **)&ipc_address, ipc_handle, cudaIpcMemLazyEnablePeerAccess);
    __err = cudaGetLastError();
    if (__err != cudaSuccess)
    {
      fprintf(stderr, "failed to CUDA open mem handle %d\n", __err);
      ipc_address = 0;
      return;
    }
  }

  // dump memory contents
  unsigned char host_copy[16];
  cudaMemcpy(host_copy, ipc_address, 16, cudaMemcpyDeviceToHost);
  for (int i=0; i< 16; i++)
  {
    fprintf(stderr, "%02d.", host_copy[i]);
  }
  fprintf(stderr, "\n");
}

void CloseIPC()
{
  if (uidFd != -1)
  {
    close(uidFd);
    uidFd = -1;
  }

  if (create)
  {
    if (ipc_address)
    {
      cudaFree(ipc_address);
      ipc_address = 0;
    }
  }
  else
  {
    if (ipc_address)
    {
      cudaIpcCloseMemHandle(ipc_address);
      ipc_address = 0;
    }
  }
}

/**
*
* Entry Point
*
**/

int main(int argc, char *argv[])
{

  ProcessCommandLine(argc, argv);

  // register interrupts for graceful shutdown
  struct sigaction action;
  memset(&action, 0, sizeof(struct sigaction));
  action.sa_handler = terminate;
  sigaction(SIGTERM, &action, NULL);
  sigaction(SIGINT, &action, NULL);

  while (running)
  {
    if (create)
      CreateIPC();
    else
      ReadIPC();
    sleep(1);
    std::cout << "loop " << std::to_string(loop) << std::endl;
  }

  CloseIPC();

  std::cout << "exiting main loop" << std::endl;

}

Sometimes when you receive an invalid device context error, it’s because the current process doesn’t have any active CUDA context. Try calling cudaSetDevice() in both CreateIPC() and ReadIPC().
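
A minimal sketch of that suggestion (assuming a single GPU at device 0; the OpenIpc() helper name is illustrative, not from the code above):

#include <cuda_runtime.h>
#include <stdio.h>

// Bind a device (and therefore a CUDA context) to the calling thread before
// the IPC call, then open the handle, mirroring the call in ReadIPC().
int OpenIpc(cudaIpcMemHandle_t handle, void **dev_ptr)
{
  cudaError_t err = cudaSetDevice(0);   // assumption: device 0 is the target GPU
  if (err != cudaSuccess)
  {
    fprintf(stderr, "cudaSetDevice failed: %s\n", cudaGetErrorString(err));
    return -1;
  }

  err = cudaIpcOpenMemHandle(dev_ptr, handle, cudaIpcMemLazyEnablePeerAccess);
  if (err != cudaSuccess)
  {
    fprintf(stderr, "cudaIpcOpenMemHandle failed: %s\n", cudaGetErrorString(err));
    return -1;
  }
  return 0;
}

The same cudaSetDevice() call would go at the top of CreateIPC() on the writer side.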

Thank you. cudaSetDevice produces the same behavior. I also ran in a container pulled from NVIDIA (nvcr.io/NVidia/cuda:10.1-base) with the same results.