Thanks, yes there is proper synchronization and cudaGetDeviceCount() returned a sufficient number of devices, four.
It turns out that the call to open the IPC handle was still within the same process as where it was created, just in a different thread. In this case a call to cudaIpcOpenMemHandle returns cudaErrorInvalidDevice. This was confirmed with this example:
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <cuda_runtime.h>
void * run_method(void *arg);
static void handle_cuda_error(const char *pfx, cudaError_t err);
static void print_cuda_ipc_mem_handle(cudaIpcMemHandle_t *ipc_mem_handle);
typedef struct shared_context {
int device_id;
cudaIpcMemHandle_t cuipcmemhandle;
int *shared_gpu;
} shared_context_t;
int shmid;
int verbose = 0;
int use_ipc = 0;
int main(int argc, char **argv)
{
int *ptr_local_num;
int *ptr_local_num_gpu;
int local_device_id;
int rv;
char pfx[256];
const int ARRAY_SIZE = 1024*1024*512;
shared_context_t shctx;
pthread_t pthread;
// check verbose flag in arguments list
switch (argc) {
case 3: {
if (strcmp("ipc",argv[2]) == 0) {
use_ipc = 1;
fprintf(stdout,"Using IPC\n");
}
// no break
}
case 2: {
if (strcmp("v",argv[1]) == 0) {
verbose = 1;
fprintf(stdout,"Verbose output selected\n");
}
break;
}
}
// set the device
shctx.device_id = 0;
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaSetDevice(shctx.device_id));
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaGetDevice(&local_device_id));
fprintf(stdout,"main: local_device_id = %d\n",local_device_id);
// cuda-allocate host memory in parent process
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMallocHost(&ptr_local_num,sizeof(int)*ARRAY_SIZE));
// set the value in host memory
*ptr_local_num = 42;
*(ptr_local_num + ARRAY_SIZE - 1) = 42;
fprintf(stdout,"main: local_num = %d .. %d\n",*ptr_local_num,*(ptr_local_num + ARRAY_SIZE - 1));
// allocate device memory in parent process
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMalloc(&ptr_local_num_gpu,sizeof(int)*ARRAY_SIZE));
// copy set value in host memory to device memory
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMemcpy(ptr_local_num_gpu,ptr_local_num,sizeof(int)*ARRAY_SIZE,cudaMemcpyHostToDevice));
if (use_ipc) {
// get cuda-ipc memory handle
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaIpcGetMemHandle(&shctx.cuipcmemhandle,ptr_local_num_gpu));
print_cuda_ipc_mem_handle(&shctx.cuipcmemhandle);
} else {
shctx.shared_gpu = ptr_local_num_gpu;
}
// launch pthread
rv = pthread_create(&pthread, NULL, run_method, &shctx);
if (rv != 0) {
char error_strings[4][200] = { "insufficient resources to create another thread, or a system-imposed limit on the number of threads was encountered",
"invalid settings in attr",
"no permission to set the scheduling policy and parameters specified in attr",
"unknown" };
char *errstr;
switch(rv) {
case EAGAIN:
errstr = error_strings[0];
break;
case EINVAL:
errstr = error_strings[1];
break;
case EPERM:
errstr = error_strings[2];
break;
default:
errstr = error_strings[3];
break;
}
fprintf(stderr,"ERROR, pthread_create returned %d [%s]\n",rv,errstr);
exit(EXIT_FAILURE);
}
// and wait for pthread to finish
rv = pthread_join(pthread, NULL);
if (rv != 0) {
char error_strings[4][200] = { "a deadlock was detected, or thread specifies the calling thread",
"thread is not a joinable thread OR another thread is already waiting to join with this thread",
"no thread with the ID thread could be found",
"unknown" };
char *errstr;
switch(rv) {
case EDEADLK:
errstr = error_strings[0];
break;
case EINVAL:
errstr = error_strings[1];
break;
case ESRCH:
errstr = error_strings[2];
break;
default:
errstr = error_strings[3];
break;
}
fprintf(stderr,"ERROR, pthread_create returned %d [%s]\n",rv,errstr);
exit(EXIT_FAILURE);
}
fprintf(stdout,"main: pthread is done\n");
// ... and then it should be safe to free device memory
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaFree((void *)ptr_local_num_gpu));
return 0;
}
void *run_method(void *arg) {
int *ptr_local_num;
int *ptr_local_num_gpu;
int local_device_id;
char pfx[256];
const int ARRAY_SIZE = 1024*1024*512;
shared_context_t *shctx = (shared_context_t *)arg;
// cuda-allocate host memory in pthread process for async copy
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMallocHost(&ptr_local_num,sizeof(int)*ARRAY_SIZE));
// set some known (incorrect) value
*ptr_local_num = -1;
*(ptr_local_num + ARRAY_SIZE - 1) = -1;
fprintf(stdout,"pthread: local_num = %d .. %d (before copy)\n",*ptr_local_num,*(ptr_local_num + ARRAY_SIZE - 1));
// get the device id, and set
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaSetDevice(shctx->device_id));
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaGetDevice(&local_device_id));
fprintf(stdout,"pthread: local_device_id = %d\n",local_device_id);
if (use_ipc) {
// open the cuda-ipc memory handle
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaIpcOpenMemHandle((void **)&ptr_local_num_gpu,shctx->cuipcmemhandle, cudaIpcMemLazyEnablePeerAccess));
print_cuda_ipc_mem_handle(&shctx->cuipcmemhandle);
} else {
// use shared pointer directly
ptr_local_num_gpu = shctx->shared_gpu;
}
// now we can copy data back
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMemcpy(ptr_local_num,ptr_local_num_gpu,sizeof(int)*ARRAY_SIZE,cudaMemcpyDeviceToHost));
if (use_ipc) {
// close cuda-ipc memory handle
snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaIpcCloseMemHandle((void *)ptr_local_num_gpu));
} else {
// just reset local pointer
ptr_local_num_gpu = NULL;
}
// value should match the one in the parent process
fprintf(stdout,"pthread: local_num = %d .. %d (after copy)\n",*ptr_local_num,*(ptr_local_num + ARRAY_SIZE - 1));
return NULL;
}
static void handle_cuda_error(const char *pfx, cudaError_t err) {
// make sure the last error was popped
cudaGetLastError();
if (err == cudaSuccess && !verbose) {
return;
}
fprintf(stderr,"%scudaError is %d [%s]\n",pfx,(int)err,cudaGetErrorString(err));
}
static void print_cuda_ipc_mem_handle(cudaIpcMemHandle_t *ipc_mem_handle) {
const int N_BYTES = 64;
const int MSG_LEN = N_BYTES*2+1;
char msg[MSG_LEN];
int ii;
for (ii=0; ii<N_BYTES; ii++) {
int offset = ii==0 ? 0 : 1;
snprintf(msg+2*ii-offset,MSG_LEN-2*ii,"%02x",*((char *)ipc_mem_handle + ii));
}
fprintf(stdout,"%s:%s(%3d): ipc_mem_handle = [%s]\n",__FILE__,__FUNCTION__,__LINE__,msg);
}
Running the program and using IPC gives this error:
multithread.cu:run_method( 156):cudaError is 10 [invalid device ordinal]