GPU Inter-Process Communication (IPC) question

The CUDA IPC sample code demonstrates the use of mmap() to pass IPC handles between processes.

For amusement purposes, I tried implementing the FIFO (named pipe) method that I referenced above, to demonstrate a different approach. The following is the code for the two independent applications:

app1.cu:

// app 1, part of a 2-part IPC example
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#define DSIZE 1

// Abort-on-error helper for the CUDA runtime.
// Fetches the most recent runtime error with cudaGetLastError(); when it is
// anything other than cudaSuccess, prints `msg`, the CUDA error string and the
// file/line of the macro use to stderr and terminates the process with exit(1).
// Intended to be placed directly after the CUDA call being checked.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t status_ = cudaGetLastError(); \
        if (status_ == cudaSuccess) break; \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(status_), \
                __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } while (0)

// app 1 entry point: exports a device allocation to app 2 through a named pipe.
//
// Sequence:
//   1. (re)create the fifo "testfifo" in the current directory,
//   2. allocate DSIZE ints on the device and zero them,
//   3. obtain a CUDA IPC handle for that allocation,
//   4. open the fifo for writing and send the raw handle bytes,
//   5. sleep 2 seconds to give app 2 time to modify the data,
//   6. copy the data back and print "Success!" if the first int is non-zero,
//      "Fail!" otherwise.
//
// Returns 1 on any fifo failure and 0 otherwise; CUDA failures terminate the
// process with exit(1) through cudaCheckErrors.
int main(){
  unlink("testfifo");  // remove any debris; failing because it is absent is fine
  int ret = mkfifo("testfifo", 0600); // create fifo
  if (ret != 0) {printf("mkfifo error: %d\n",ret); return 1;}

  int *data = NULL;
  cudaMalloc(&data, DSIZE*sizeof(int));
  cudaCheckErrors("malloc fail");
  cudaMemset(data, 0, DSIZE*sizeof(int));
  cudaCheckErrors("memset fail");

  cudaIpcMemHandle_t my_handle;
  cudaIpcGetMemHandle(&my_handle, data);
  cudaCheckErrors("get IPC handle fail");

  printf("waiting for app2\n");
  FILE *fp = fopen("testfifo", "w");
  if (fp == NULL) {
    printf("fifo open fail \n");
    unlink("testfifo");  // do not leave the fifo behind on failure
    return 1;
  }

  // The handle is opaque binary data: send it as one raw block rather than
  // formatting it byte by byte, and treat a short write as fatal because a
  // truncated handle is useless to the receiver.
  size_t sent = fwrite(&my_handle, 1, sizeof(my_handle), fp);
  if (sent != sizeof(my_handle)) {
    printf("fifo write fail: sent %zu of %zu bytes\n", sent, sizeof(my_handle));
    fclose(fp);
    unlink("testfifo");
    return 1;
  }
  // stdio buffers the write, so the bytes may only reach the fifo here.
  if (fclose(fp) != 0) {
    printf("fifo close fail \n");
    unlink("testfifo");
    return 1;
  }

  sleep(2);  // wait for app 2 to modify data

  // Stack buffer instead of an unchecked, never-freed malloc; the copy is
  // checked so a failed transfer is reported instead of reading garbage.
  int result[DSIZE] = {0};
  cudaMemcpy(result, data, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("memcpy fail");
  if (!result[0]) printf("Fail!\n");
  else printf("Success!\n");

  unlink("testfifo");
  return 0;
}

app2.cu:

// app 2, part of a 2-part IPC example
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#define DSIZE 1

// Abort-on-error helper for the CUDA runtime.
// Fetches the most recent runtime error with cudaGetLastError(); when it is
// anything other than cudaSuccess, prints `msg`, the CUDA error string and the
// file/line of the macro use to stderr and terminates the process with exit(1).
// Intended to be placed directly after the CUDA call or kernel launch being
// checked.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t status_ = cudaGetLastError(); \
        if (status_ == cudaSuccess) break; \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(status_), \
                __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } while (0)

// Stores `val` into the int that `d` points to.
// There is no thread indexing: every thread that runs the kernel performs the
// same store. app 2 launches it as <<<1,1>>>, i.e. with a single thread.
__global__ void set_kernel(volatile int *d, int val){
  d[0] = val;
}

// app 2 entry point: imports app 1's device allocation and modifies it.
//
// Sequence:
//   1. open the fifo "testfifo" for reading and read one raw
//      cudaIpcMemHandle_t from it,
//   2. open the handle to obtain a device pointer in this process,
//   3. launch set_kernel to store 1 through that pointer and wait for it,
//   4. close the imported handle.
//
// Returns 1 if the fifo cannot be opened or a complete handle cannot be read,
// 0 on success; CUDA failures terminate the process with exit(1) through
// cudaCheckErrors.
int main(){
  FILE *fp = fopen("testfifo", "r");
  if (fp == NULL) {printf("fifo open fail \n"); return 1;}

  // The handle is opaque binary data: read it as one raw block. A short read
  // must be fatal, since opening a partially filled handle is meaningless.
  cudaIpcMemHandle_t my_handle;
  size_t got = fread(&my_handle, 1, sizeof(my_handle), fp);
  int hit_eof = feof(fp);
  fclose(fp);  // the fifo is no longer needed once the handle is received
  if (got != sizeof(my_handle)) {
    if (hit_eof) printf("received EOF\n");
    printf("fifo read fail: got %zu of %zu bytes\n", got, sizeof(my_handle));
    return 1;
  }

  int *data = NULL;
  cudaIpcOpenMemHandle((void **)&data, my_handle, cudaIpcMemLazyEnablePeerAccess);
  cudaCheckErrors("IPC handle fail");

  set_kernel<<<1,1>>>(data, 1);
  cudaCheckErrors("kernel launch fail");     // bad launch configuration etc.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel execution fail");  // errors raised while running

  // Release the imported mapping; the allocation itself belongs to app 1.
  cudaIpcCloseMemHandle(data);
  cudaCheckErrors("IPC close fail");
  return 0;
}

Run both programs from the same directory, since the fifo path "testfifo" is relative. If you run app1, it will start up and then wait for app2 to start. Then, when you start app2, app1 will send the IPC handle to app2. app2 will use that handle to modify some memory allocated by app1, and then exit. app1 waits (sleeps) for a short period, then checks whether app2 made the modification.

Just a proof of concept.