Docker IPC

I receive an “invalid device context” error (201) when opening CUDA IPC memory inside a docker container.

The following simplified code is meant to run in two or more docker containers.
It works perfectly under many conditions… until it decides not to work.

When run with no parameters, the developer can spin up multiple docker instances reading from the same IPC memory.
When ‘-c’ is specified on the command line, the docker container will create and populate the IPC memory.

Teasing apart the NVIDIA-provided simple IPC example to run across multiple docker containers proved to be too complex for this exercise and introduced too many variables. The following code has been tested and appears to work on the host with no issues.

compile the example code below and place it in a known volume

$ nvcc -std=c++11 simpleIPC.cu -o simpleIPC
$ sudo cp simpleIPC /opt/
$ sudo chmod 777 /opt/simpleIPC

enable user access to the IPC memory handle UID stored on the file system

$ sudo touch /opt/simpleIPC.uid
$ sudo chmod 777 /opt/simpleIPC.uid

demonstrate functioning IPC on the host (no docker)

$ /opt/simpleIPC

this instance will actively poll until you run the next command

$ /opt/simpleIPC -c

observe the reading instance dumping memory values equal to the source instance’s loop count

Kill the prior examples

Ctrl-C will cleanly deallocate resources in both applications

Now run the same executable inside the docker containers

$ sudo docker run -it --runtime=nvidia --rm --name simpleIPC_read_N -v /opt:/opt nvidia/cuda:10.1-devel /opt/simpleIPC

repeat this command for as many reader instances (N) as desired; only one is needed to reproduce the fault

This read instance will poll periodically until the file /opt/simpleIPC.uid is present and contains a valid 64-byte IPC memory handle.

now run the next container with the create flag and a unique docker container name

$ sudo docker run -it --runtime=nvidia --rm --name simpleIPC_read_create -v /opt:/opt nvidia/cuda:10.1-devel /opt/simpleIPC -c

observe “failed to CUDA open mem handle 201”
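
For reference, the numeric code can be translated into the runtime’s own wording with cudaGetErrorName()/cudaGetErrorString(); 201 is the “invalid device context” error mentioned at the top. A minimal, stand-alone sketch (not part of simpleIPC.cu; the file name errname.cu is just illustrative):

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// errname.cu: print the name and description of a CUDA runtime error code
// passed on the command line (defaults to 201, the code observed above).
int main(int argc, char *argv[])
{
  int code = (argc > 1) ? atoi(argv[1]) : 201;
  cudaError_t err = (cudaError_t)code;
  printf("%d = %s: %s\n", code, cudaGetErrorName(err), cudaGetErrorString(err));
  return 0;
}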

now for the really interesting part

kill the read instance and allow the create instance to continue running

re-run the reader, this time issuing a shell no-op ahead of the executable in the docker run command:

$ sudo docker run -it --runtime=nvidia --rm --name simpleIPC_read_N -v /opt:/opt nvidia/cuda:10.1-devel /bin/bash -c ":;/opt/simpleIPC"

It works! … huh?

I have run these same tests on multiple NVIDIA architectures and various but similar driver/lib/docker versions.

My current test rig is as follows:
K1100M, driver 418.87.00, CUDA 10.1 (also tested on a Tesla V100)
docker 18.06.1-ce

#include <iostream>

#include <signal.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>   // memset(), strerror()
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

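// device kernel: write `value` into each of the first `length` bytes of `out`, one thread per byte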
__global__
void myCudaMemset(unsigned char *out, unsigned char value, unsigned int length)
{
  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= length) return;
  out[tid] = value;
}

static int running = 1;
static bool create=false;
static int ipc_length = 10240;
static void *ipc_address = 0;
static char uidFileName[] = "/opt/simpleIPC.uid";
static int uidFd=-1;
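// 64-byte handle (CUDA_IPC_HANDLE_SIZE) exchanged through the uid file above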
static cudaIpcMemHandle_t ipc_handle;
static bool ipc_handle_valid = false;
static bool ipc_file_valid = false;
static unsigned char loop = 0;

// Catch any interrupt signal from the OS and disable the run state
static void terminate(int signum)
{
  std::cout << "caught signal " << signum << std::endl;
  running = 0;
}

void ProcessCommandLine(int argc, char *argv[])
{
  int opt;

  while ((opt = getopt(argc, argv, "c")) != -1)
  {
    switch (opt) {

      case 'c':
        create = true;
        break;

      default:
        fprintf(stderr, "Usage: %s\n", argv[0]);
        fprintf(stderr, "    -c                - create CUDA IPC (%d)\n", create);
        exit(1);
    }
  }
}

void CreateIPC()
{
  cudaError_t __err;

  if (uidFd == -1)
  {
    uidFd = open(uidFileName, O_RDWR | O_CREAT, 0666);  // O_CREAT requires an explicit mode
    if (uidFd == -1)
    {
      fprintf(stderr, "Failed to open %s for create\n", uidFileName);
      return;
    }
  }

  // allocate device memory 
  if (ipc_address == 0)
  {
    cudaMalloc((void **)&ipc_address, ipc_length);
    __err = cudaGetLastError();
    if (__err != cudaSuccess)
    {
      ipc_address = 0;
      fprintf(stderr, "failed to CUDA malloc %d bytes: %d\n", ipc_length, __err);
      return;
    }
  }

  if (ipc_handle_valid == false)
  {
    // set up CUDA IPC handle for external processes
    cudaIpcGetMemHandle(&ipc_handle, ipc_address);
  
    __err = cudaGetLastError();
    if (__err != cudaSuccess)
    {
      fprintf(stderr, "failed to CUDA get mem handle %d\n", __err);
      return;
    }
   
    ipc_handle_valid = true;

    unsigned char *cptr = (unsigned char *)&ipc_handle;
    for (int i=0; i<64; i++)
      fprintf(stderr, "%02x.", cptr[i]);
    fprintf(stderr, "\n");
  
  }

  // write IPC handle to file system for other async processes
  if (ipc_file_valid == false)
  {
    lseek(uidFd, 0, SEEK_SET);
    if (write(uidFd, (void *)&ipc_handle, 64) != 64)
    {
      fprintf(stderr, "failed to write UID to %s: %s\n", uidFileName, strerror(errno));
      return;
    }
    ipc_file_valid = true;
  }

  // populate memory with a pattern
  myCudaMemset<<<10, 1024>>>((unsigned char *)ipc_address, loop, ipc_length);
  loop++;
}

void ReadIPC()
{
  cudaError_t __err;

  if (uidFd == -1)
  {
    uidFd = open(uidFileName, O_RDONLY);
    if (uidFd == -1)
    {
      fprintf(stderr, "Failed to open %s for read\n", uidFileName);
      return;
    }
  }

  // read IPC handle from file system 
  if (ipc_handle_valid == false)
  {
    lseek(uidFd, 0, SEEK_SET);
    if (read(uidFd, (void *)&ipc_handle, 64) != 64)
    {
      fprintf(stderr, "failed to read UID from %s: %s\n", uidFileName, strerror(errno));
      return;
    }
    ipc_handle_valid = true;
  }

  if (ipc_address == 0)
  {
    cudaIpcOpenMemHandle((void **)&ipc_address, ipc_handle, cudaIpcMemLazyEnablePeerAccess);
    __err = cudaGetLastError();
    if (__err != cudaSuccess)
    {
      fprintf(stderr, "failed to CUDA open mem handle %d\n", __err);
      ipc_address = 0;
      return;
    }
  }

  // dump memory contents
  unsigned char host_copy[16];
  cudaMemcpy(host_copy, ipc_address, 16, cudaMemcpyDeviceToHost);
  for (int i=0; i< 16; i++)
  {
    fprintf(stderr, "%02d.", host_copy[i]);
  }
  fprintf(stderr, "\n");
}

void CloseIPC()
{
  if (uidFd != -1)
  {
    close(uidFd);
    uidFd = -1;
  }

  if (create)
  {
    if (ipc_address)
    {
      cudaFree(ipc_address);
      ipc_address = 0;
    }
  }
  else
  {
    if (ipc_address)
    {
      cudaIpcCloseMemHandle(ipc_address);
      ipc_address = 0;
    }
  }
}

/**
*
* Entry Point
*
**/

int main(int argc, char *argv[])
{

  ProcessCommandLine(argc, argv);

  // register interrupts for graceful shutdown
  struct sigaction action;
  memset(&action, 0, sizeof(struct sigaction));
  action.sa_handler = terminate;
  sigaction(SIGTERM, &action, NULL);
  sigaction(SIGINT, &action, NULL);

  while (running)
  {
    if (create)
      CreateIPC();
    else
      ReadIPC();
    sleep(1);
    std::cout << "loop " << std::to_string(loop) << std::endl;
  }

  CloseIPC();

  std::cout << "exiting main loop" << std::endl;

}

Sometimes when you receive an invalid device context error, it’s because the current process doesn’t have any active CUDA context. Try calling cudaSetDevice() in both CreateIPC() and ReadIPC().
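
A minimal sketch of that suggestion (assuming a single GPU at device 0; the OpenIpc() helper name is illustrative, not from the code above):

#include <cuda_runtime.h>
#include <stdio.h>

// Bind a device (and therefore a CUDA context) to the calling thread before
// the IPC call, then open the handle, mirroring the call in ReadIPC().
int OpenIpc(cudaIpcMemHandle_t handle, void **dev_ptr)
{
  cudaError_t err = cudaSetDevice(0);   // assumption: device 0 is the target GPU
  if (err != cudaSuccess)
  {
    fprintf(stderr, "cudaSetDevice failed: %s\n", cudaGetErrorString(err));
    return -1;
  }

  err = cudaIpcOpenMemHandle(dev_ptr, handle, cudaIpcMemLazyEnablePeerAccess);
  if (err != cudaSuccess)
  {
    fprintf(stderr, "cudaIpcOpenMemHandle failed: %s\n", cudaGetErrorString(err));
    return -1;
  }
  return 0;
}

The same cudaSetDevice() call would go at the top of CreateIPC() on the writer side.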

Thank you. cudaSetDevice produces the same behavior. I also ran in a container pulled from NVIDIA (nvcr.io/NVidia/cuda:10.1-base) with the same results.