I receive “invalid device context” error 201 when obtaining CUDA IPC memory inside a docker container.
The following simplified code is meant to run in two or more docker containers.
It works perfectly under many conditions… until it decides not to work.
When running with no parameters the developer can spin up multiple docker instances reading from the same IPC memory.
When specifying ‘-c’ on the command line the docker container will create and populate the IPC memory.
Teasing apart the NVIDIA provided simple IPC example to run across multiple docker containers proved to be to complex for this exercise and introduced too many variables. The following code has been tested and appears to work on the host with no issues.
compile the example code below and place in a known volume
$ nvcc -std=c++11 simpleIPC.cu -o simpleIPC
$ sudo cp simpleIPC /opt/
$ sudo chmod 777 /opt/simpleIPC
enable user access to IPC memory handle UID stored on file system
$ sudo touch /opt/simpleIPC.uid
$ sudo chmod 777 /opt/simpleIPC.uid
demonstrate functioning IPC on host (no docker)
$ /opt/simpleIPC
will actively poll until you run the next command
$ /opt/simpleIPC -c
observe the reading instance dumping memory values
equivalent to the source instances loop count
Kill the prior examples
cntrl-C will cleanly deallocate resoucres in both applications
Now run the same executable inside the docker containers
$ sudo docker run -it --runtime=nvidia --rm --name simpleIPC_read_N -v /opt:/opt nvidia/cuda:10.1-devel /opt/simpleIPC
repeat this command for as many N instances as desired
only one is needed to reproduce the fault
This read instance will poll periodically until the
file /opt/simpleIPC.uid is present and contains a
valid 64 byte IPC memory handle
now run the next container with the create flag and a unique docker conmtainer name
$ sudo docker run -it --runtime=nvidia --rm --name simpleIPC_read_create -v /opt:/opt nvidia/cuda:10.1-devel /opt/simpleIPC -c
observe “failed to CUDA open mem handle 201”
now for the really interesting part
kill the read instance and allow the create instance to continue running
issue the shell noop in the docker run command:
$ sudo docker run -it --runtime=nvidia --rm --name simpleIPC_read_N -v /opt:/opt nvidia/cuda:10.1-devel /bin/bash -c “:;/opt/simpleIPC”
It works!,… huh?
I have run these same tests on multiple NVIDIA
architectures and various but similar
driver/lib/docker versions.
My current test rig is as follows:
K1100M 418.87.00 cuda 10.1, also tested on Tesla V100
docker 18.06.1-ce
#include <iostream>
#include <signal.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
__global__
void myCudaMemset(unsigned char *out, unsigned char value, unsigned int length)
{
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= length) return;
out[tid] = value;
}
static int running = 1;
static bool create=false;
static int ipc_length = 10240;
static void *ipc_address = 0;
static char uidFileName[] = "/opt/simpleIPC.uid";
static int uidFd=-1;
static cudaIpcMemHandle_t ipc_handle;
static bool ipc_handle_valid = false;
static bool ipc_file_valid = false;
static unsigned char loop = 0;
// Catch any interrupt signal from the OS and disable the run state
static void terminate(int signum)
{
std::cout << "caught signal " << signum << std::endl;
running = 0;
}
void ProcessCommandLine(int argc, char *argv[])
{
int opt;
while ((opt = getopt(argc, argv, "c")) != -1)
{
switch (opt) {
case 'c':
create = true;
break;
default:
fprintf(stderr, "Usage: %s\n", argv[0]);
fprintf(stderr, " -c - create CUDA IPC (%d)\n", create);
exit(1);
}
}
}
void CreateIPC()
{
cudaError_t __err;
if (uidFd == -1)
{
uidFd = open(uidFileName, O_RDWR | O_CREAT);
if (uidFd == -1)
{
fprintf(stderr, "Failed to open %s for create\n", uidFileName);
return;
}
}
// allocate device memory
if (ipc_address == 0)
{
cudaMalloc((void **)&ipc_address, ipc_length);
__err = cudaGetLastError();
if (__err != cudaSuccess)
{
ipc_address = 0;
fprintf(stderr, "failed to CUDA malloc %d bytes: %d\n", ipc_length, __err);
return;
}
}
if (ipc_handle_valid == false)
{
// set up CUDA IPC handle for external processes
cudaIpcGetMemHandle(&ipc_handle, ipc_address);
__err = cudaGetLastError();
if (__err != cudaSuccess)
{
fprintf(stderr, "failed to CUDA get mem handle %d\n", __err);
return;
}
ipc_handle_valid = true;
unsigned char *cptr = (unsigned char *)&ipc_handle;
for (int i=0; i<64; i++)
fprintf(stderr, "%02x.", cptr[i]);
fprintf(stderr, "\n");
}
// write IPC handle to file system for other async processes
if (ipc_file_valid == false)
{
lseek(uidFd, 0, SEEK_SET);
if (write(uidFd, (void *)&ipc_handle, 64) != 64)
{
fprintf(stderr, "failed to write UID to %s: %s\n", uidFileName, strerror(errno));
return;
}
ipc_file_valid = true;
}
// populate memory with a pattern
myCudaMemset<<<10, 1024>>>((unsigned char *)ipc_address, loop, ipc_length);
loop++;
}
void ReadIPC()
{
cudaError_t __err;
if (uidFd == -1)
{
uidFd = open(uidFileName, O_RDONLY);
if (uidFd == -1)
{
fprintf(stderr, "Failed to open %s for read\n", uidFileName);
return;
}
}
// read IPC handle from file system
if (ipc_handle_valid == false)
{
lseek(uidFd, 0, SEEK_SET);
if (read(uidFd, (void *)&ipc_handle, 64) != 64)
{
fprintf(stderr, "failed to read UID from %s: %s\n", uidFileName, strerror(errno));
return;
}
ipc_handle_valid = true;
}
if (ipc_address == 0)
{
cudaIpcOpenMemHandle((void **)&ipc_address, ipc_handle, cudaIpcMemLazyEnablePeerAccess);
__err = cudaGetLastError();
if (__err != cudaSuccess)
{
fprintf(stderr, "failed to CUDA open mem handle %d\n", __err);
ipc_address = 0;
return;
}
}
// dump memory contents
unsigned char host_copy[16];
cudaMemcpy(host_copy, ipc_address, 16, cudaMemcpyDeviceToHost);
for (int i=0; i< 16; i++)
{
fprintf(stderr, "%02d.", host_copy[i]);
}
fprintf(stderr, "\n");
}
void CloseIPC()
{
if (uidFd != -1)
{
close(uidFd);
uidFd = -1;
}
if (create)
{
if (ipc_address)
{
cudaFree(ipc_address);
ipc_address = 0;
}
}
else
{
if (ipc_address)
{
cudaIpcCloseMemHandle(ipc_address);
ipc_address = 0;
}
}
}
/**
*
* Entry Point
*
**/
int main(int argc, char *argv[])
{
ProcessCommandLine(argc, argv);
// register interrupts for graceful shutdown
struct sigaction action;
memset(&action, 0, sizeof(struct sigaction));
action.sa_handler = terminate;
sigaction(SIGTERM, &action, NULL);
sigaction(SIGINT, &action, NULL);
while (running)
{
if (create)
CreateIPC();
else
ReadIPC();
sleep(1);
std::cout << "loop " << std::to_string(loop) << std::endl;
}
CloseIPC();
std::cout << "exiting main loop" << std::endl;
}