'invalid device ordinal' (cudaErrorInvalidDevice)

ayoungzz9 · August 18, 2015, 10:42pm

I get cudaErrorInvalidDevice when I try to open an IPC memory handle, but I’m not sure why.

I’m using shared GPU memory between different processes. In one process I do:

cudaSetDevice(device_id);
cudaIpcGetMemHandle(&my_ipcmemhandle, (void *)buf_gpu);

And in the other process I do:

handle_cuda_error(cudaSetDevice(device_id));
handle_cuda_error(cudaIpcOpenMemHandle((void **)buf_gpu, my_ipcmemhandle, cudaIpcMemLazyEnablePeerAccess));

The device ID set before getting the IPC handle is the same as the device ID set before opening the IPC handle, and the device ID is valid. I use (host) shared memory to pass the IPC handle and device ID between the two processes. Furthermore, handle_cuda_error calls cudaGetLastError() so that I shouldn’t receive errors from previous calls.

When I treat the cuIpcMemHandle instance as a char array[64] and print to screen, I get the same result in both processes.

Am I missing something? Any explanation of the meaning of this error for this specific call (cudaIpcOpenMemHandle(…)) would be very useful.

Thanks,

little_jimmy · August 19, 2015, 5:19am

your device, and os?

ayoungzz9 · August 19, 2015, 3:06pm

4x GTX980 and CentOS 6.6

little_jimmy · August 20, 2015, 4:59am

i presume you have some method of synchronization between the 2 processes, such that the latter waits for the former, before reading the applicable data from shared memory

as a way of testing, have you tried reversing the primary/ secondary roles?
also, what does calling cudaGetDeviceCount() from the secondary process return?

ayoungzz9 · August 20, 2015, 2:12pm

Thanks, yes there is proper synchronization and cudaGetDeviceCount() returned a sufficient number of devices, four.

It turns out that the call to open the IPC handle was still within the same process as where it was created, just in a different thread. In this case a call to cudaIpcOpenMemHandle returns cudaErrorInvalidDevice. This was confirmed with this example:

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <cuda_runtime.h>

void * run_method(void *arg);
static void handle_cuda_error(const char *pfx, cudaError_t err);
static void print_cuda_ipc_mem_handle(cudaIpcMemHandle_t *ipc_mem_handle);

typedef struct shared_context {
	int device_id;
	cudaIpcMemHandle_t cuipcmemhandle;
	int *shared_gpu;
} shared_context_t;

int shmid;
int verbose = 0;
int use_ipc = 0;

int main(int argc, char **argv)
{
	int *ptr_local_num;
	int *ptr_local_num_gpu;
	int local_device_id;
	int rv;
	char pfx[256];
	const int ARRAY_SIZE = 1024*1024*512;
	shared_context_t shctx;
	pthread_t pthread;
	
	// check verbose flag in arguments list
	switch (argc) {
		case 3: {
			if (strcmp("ipc",argv[2]) == 0) {
				use_ipc = 1;
				fprintf(stdout,"Using IPC\n");
			}
			// no break
		}
		case 2: {
			if (strcmp("v",argv[1]) == 0) {
				verbose = 1;
				fprintf(stdout,"Verbose output selected\n");
			}
			break;
		}
	}
	
	// set the device
	shctx.device_id = 0;
	snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaSetDevice(shctx.device_id));
	snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaGetDevice(&local_device_id));
	fprintf(stdout,"main: local_device_id = %d\n",local_device_id);
	// cuda-allocate host memory in parent process
	snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMallocHost(&ptr_local_num,sizeof(int)*ARRAY_SIZE));
	// set the value in host memory
	*ptr_local_num = 42;
	*(ptr_local_num + ARRAY_SIZE - 1) = 42;
	fprintf(stdout,"main: local_num = %d .. %d\n",*ptr_local_num,*(ptr_local_num + ARRAY_SIZE - 1));
	// allocate device memory in parent process
	snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMalloc(&ptr_local_num_gpu,sizeof(int)*ARRAY_SIZE));
	// copy set value in host memory to device memory
	snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMemcpy(ptr_local_num_gpu,ptr_local_num,sizeof(int)*ARRAY_SIZE,cudaMemcpyHostToDevice));
	if (use_ipc) {
		// get cuda-ipc memory handle
		snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaIpcGetMemHandle(&shctx.cuipcmemhandle,ptr_local_num_gpu));
		print_cuda_ipc_mem_handle(&shctx.cuipcmemhandle);
	} else {
		shctx.shared_gpu = ptr_local_num_gpu;
	}
	// launch pthread
	rv = pthread_create(&pthread, NULL, run_method, &shctx);
	if (rv != 0) {		
		char error_strings[4][200] = { "insufficient resources to create another thread, or a system-imposed limit on the number of threads was encountered",
										"invalid settings in attr",
										"no permission to set the scheduling policy and parameters specified in attr",
										"unknown" };
		char *errstr;
		switch(rv) {
			case EAGAIN:
				errstr = error_strings[0];
				break;
			case EINVAL:
				errstr = error_strings[1];
				break;
			case EPERM:
				errstr = error_strings[2];
				break;
			default:
				errstr = error_strings[3];
				break;
		}
		fprintf(stderr,"ERROR, pthread_create returned %d [%s]\n",rv,errstr);
		exit(EXIT_FAILURE);
	}
	// and wait for pthread to finish
	rv = pthread_join(pthread, NULL);
	if (rv != 0) {		
		char error_strings[4][200] = { "a deadlock was detected, or thread specifies the calling thread",
										"thread is not a joinable thread OR another thread is already waiting to join with this thread",
										"no thread with the ID thread could be found",
										"unknown" };
		char *errstr;
		switch(rv) {
			case EDEADLK:
				errstr = error_strings[0];
				break;
			case EINVAL:
				errstr = error_strings[1];
				break;
			case ESRCH:
				errstr = error_strings[2];
				break;
			default:
				errstr = error_strings[3];
				break;
		}
		fprintf(stderr,"ERROR, pthread_create returned %d [%s]\n",rv,errstr);
		exit(EXIT_FAILURE);
	}
	fprintf(stdout,"main: pthread is done\n");
	// ... and then it should be safe to free device memory
	snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaFree((void *)ptr_local_num_gpu));
	
	
	return 0;
}

void *run_method(void *arg) {
	
	int *ptr_local_num;
	int *ptr_local_num_gpu;
	int local_device_id;
	char pfx[256];
	const int ARRAY_SIZE = 1024*1024*512;
	
	shared_context_t *shctx = (shared_context_t *)arg;
	
	// cuda-allocate host memory in pthread process for async copy
	snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMallocHost(&ptr_local_num,sizeof(int)*ARRAY_SIZE));
	// set some known (incorrect) value
	*ptr_local_num = -1;
	*(ptr_local_num + ARRAY_SIZE - 1) = -1;
	fprintf(stdout,"pthread: local_num = %d .. %d (before copy)\n",*ptr_local_num,*(ptr_local_num + ARRAY_SIZE - 1));
	// get the device id, and set
	snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaSetDevice(shctx->device_id));
	snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaGetDevice(&local_device_id));
	fprintf(stdout,"pthread: local_device_id = %d\n",local_device_id);
	if (use_ipc) {
		// open the cuda-ipc memory handle
		snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaIpcOpenMemHandle((void **)&ptr_local_num_gpu,shctx->cuipcmemhandle, cudaIpcMemLazyEnablePeerAccess));
		print_cuda_ipc_mem_handle(&shctx->cuipcmemhandle);
	} else {
		// use shared pointer directly
		ptr_local_num_gpu = shctx->shared_gpu;
	}
	// now we can copy data back
	snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMemcpy(ptr_local_num,ptr_local_num_gpu,sizeof(int)*ARRAY_SIZE,cudaMemcpyDeviceToHost));
	if (use_ipc) {
		// close cuda-ipc memory handle
		snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaIpcCloseMemHandle((void *)ptr_local_num_gpu));
	} else {
		// just reset local pointer
		ptr_local_num_gpu = NULL;
	}
	// value should match the one in the parent process
	fprintf(stdout,"pthread: local_num = %d .. %d (after copy)\n",*ptr_local_num,*(ptr_local_num + ARRAY_SIZE - 1));
	
	return NULL;
}

static void handle_cuda_error(const char *pfx, cudaError_t err) {
	// make sure the last error was popped
	cudaGetLastError();
	if (err == cudaSuccess && !verbose) {
		return;
	}
	fprintf(stderr,"%scudaError is %d [%s]\n",pfx,(int)err,cudaGetErrorString(err));
}

static void print_cuda_ipc_mem_handle(cudaIpcMemHandle_t *ipc_mem_handle) {
	const int N_BYTES = 64;
	const int MSG_LEN = N_BYTES*2+1;
	char msg[MSG_LEN];
	int ii;
	for (ii=0; ii<N_BYTES; ii++) {
			int offset = ii==0 ? 0 : 1;
			snprintf(msg+2*ii-offset,MSG_LEN-2*ii,"%02x",*((char *)ipc_mem_handle + ii));
	}
	fprintf(stdout,"%s:%s(%3d): ipc_mem_handle = [%s]\n",__FILE__,__FUNCTION__,__LINE__,msg);
}

Running the program and using IPC gives this error:

multithread.cu:run_method( 156):cudaError is 10 [invalid device ordinal]

little_jimmy · August 21, 2015, 4:56am

“Maps memory exported from another process with cudaIpcGetMemHandle into the current device address space. For contexts on different devices cudaIpcOpenMemHandle can attempt to enable peer access between the devices”

“Contexts that may open cudaIpcMemHandles are restricted in the following way. cudaIpcMemHandles from each device in a given process may only be opened by one context per device per other process.”

are you sure that you can call cudaIpcOpenMemHandle from another thread in the same process?
it seems redundant too; threads of the same process generally share the address space, and you can thus simply pass a pointer, rather than a memory handle

perhaps the api shorts an error code: wrong context…

ayoungzz9 · August 25, 2015, 1:46pm

Yes, I believe that is what caused the error; I thought I was calling from a different process, but it was just from a different thread in the same process.

Indeed, much simpler.

I agree, the cudaErrorInvalidDevice is not very indicative of the cause in this case, but even if the documentation just listed it as one of the error codes that can be expected that would help a great deal. In any case, now it is documented here at least ;)

Topic		Replies	Views
CUDA IPC get invalid device context General Topics and Other SDKs cuda	0	891	July 21, 2022
cuIpcOpenMemHandle return CUDA_ERROR_INVALID_CONTEXT CUDA Programming and Performance	2	1256	July 25, 2022
CUDA IPC - Virtual Memory API (cuMemImportFromShareableHandle CUDA_ERROR_INVALID_DEVICE ) - CUDA 11.3 CUDA Programming and Performance	6	2248	August 4, 2025
CUDA IPC Low Level VMM, cuMemImportFromShareableHandle returns invalid device ordinal CUDA Programming and Performance	1	88	September 5, 2024
CUDA init.createctx,ipcgetmemhandle act normal, but get CUDA_EEROR_INVALID_CONTEXT when use Ipcopenmemhandle General Topics and Other SDKs cuda	0	356	July 25, 2022
cuMemImportFromShareableHandle return CUDA_ERROR_INVALID_DEVICE CUDA Programming and Performance cuda	0	26	July 31, 2025
interprocess communication on single GPU ? CUDA Programming and Performance	1	2325	June 22, 2012
Docker IPC CUDA Programming and Performance	2	1566	December 11, 2019
Why exporting and importing CUDA IPC handles in the scope of the same Linux process is not supported? CUDA Programming and Performance cuda	7	953	May 10, 2023
Share GPU/host pinned memory between host processes CUDA Programming and Performance	5	4056	March 7, 2012

'invalid device ordinal' (cudaErrorInvalidDevice)

Related topics