Error: invalid device ordinal after migrating to CUDA 11

Please help. I'm unable to understand what I'm doing wrong. I've just migrated from CUDA 7.5 to CUDA 11 and I get the following error while initializing the card:

[Sat Sep 12 14:33:15 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$ vim tst.cu
[Sat Sep 12 14:33:45 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$ nvcc tst.cu -o tst
[Sat Sep 12 14:33:47 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$ ./tst
DEBUG tst.cu:gpu_configuration:54: Device count 1
CUDA call from file:tst.cu func:gpu_configuration 55: cudaErrorInvalidDevice:invalid device ordinal failed
Segmentation fault (core dumped)

The code to reproduce this error is:

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>

#include <cuda.h>
#include <driver_types.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

#define gpuErrChk(ans)		{gpuAssert((ans), __FILE__, __func__, __LINE__);}
#define CUDA_GLOBAL_CHECK   {gpuErrChk (cudaPeekAtLastError ()); gpuErrChk (cudaDeviceSynchronize ());}
#define Dbg(M, ...) fprintf(stderr, "DEBUG %s:%s:%d: " M "\n", __FILE__, \
	__func__, __LINE__, ##__VA_ARGS__)

typedef struct gpu_config {
	int	n_threads;	// maxThreads allowed per block
	int	dev_count;	// number of cuda devices

	size_t	shmem;		// sh_mem per block
	size_t	free_mem;	// free memory on the card
	size_t	tot_mem;	// total memory on the card
	struct	cudaDeviceProp	dev_prop;	// device properties
} gpu_config;

inline void gpuAssert (cudaError_t code, const char *file, const char *func, int line)
{
	bool abort = true;
	bool sig_raise = true;

	if (code != cudaSuccess) {
		fprintf(stderr,"CUDA call from file:%s func:%s %d: %s:%s failed\n", file, func, line, cudaGetErrorName(code), cudaGetErrorString(code));

		if (sig_raise) raise (SIGSEGV);		// force a core dump; this produces the "Segmentation fault (core dumped)" line in the output
		if (abort) exit (code);
	}
}

static __global__ void K_dummy (void);
static __global__ void K_dummy (void)
{
	unsigned int i = threadIdx.x;
	i += 2;		// any ole' code to warm the card up.
	i++;
}

static void gpu_configuration (gpu_config *gc);
static void gpu_configuration (gpu_config *gc)
{
	// assumption: a single device.  Needs to be modified for multiple devices.
	// use gpuDeviceInit from helper_cuda.h
	gpuErrChk (cudaDeviceReset ());		// reset device
	gpuErrChk (cudaGetDeviceCount (&gc -> dev_count));
	Dbg("Device count %d", gc -> dev_count);
	gpuErrChk (cudaSetDevice (gc -> dev_count));

	// gc -> dev_prop = malloc (sizeof (cudaDeviceProp) * gc -> dev_count);
	// for multiple devices only.
	gpuErrChk (cudaGetDeviceProperties (&(gc -> dev_prop), gc -> dev_count));
	gc -> n_threads = gc -> dev_prop.maxThreadsPerBlock;
	gc -> shmem = gc -> dev_prop.sharedMemPerBlock;

	gpuErrChk (cudaMemGetInfo (&(gc -> free_mem), &(gc -> tot_mem)));

	Dbg ("Dev prop name: %s, tot_mem: %u sharedMemPerBlock %u\nwarpSize %d maxThreadsPerBlock %d\nmaxthreads per mprocessor %d",
	gc -> dev_prop.name, (unsigned) gc -> dev_prop.totalGlobalMem,
	(unsigned) gc -> dev_prop.sharedMemPerBlock,
	gc -> dev_prop.warpSize, gc -> dev_prop.maxThreadsPerBlock,
	gc -> dev_prop.maxThreadsPerMultiProcessor);

	dim3 grid (1);
	dim3 block (32);
	K_dummy <<<grid,block>>> ();		// warm up the GPU
	CUDA_GLOBAL_CHECK;
}

int main (void)
{
    gpu_config gc;
    gpu_configuration (&gc);

    return 0;
}

More info:

[Sat Sep 12 14:53:15 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$ uname -a
Linux Desktop 5.4.0-47-generic #51~18.04.1-Ubuntu SMP Sat Sep 5 14:35:50 UTC 2020 x86_64 x86_64 GNU/Linux

This is not an issue with the card. I ran deviceQuery and got the following:

[Sat Sep 12 14:33:51 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$ ~/Documents/cuda/samples/bin/x86_64/linux/release/deviceQuery
/home/rinka/Documents/cuda/samples/bin/x86_64/linux/release/deviceQuery Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce GTX 780"
  CUDA Driver Version / Runtime Version          11.0 / 11.0
  CUDA Capability Major/Minor version number:    3.5
  Total amount of global memory:                 3017 MBytes (3163488256 bytes)
  (12) Multiprocessors, (192) CUDA Cores/MP:     2304 CUDA Cores
  GPU Max Clock rate:                            902 MHz (0.90 GHz)
  Memory Clock rate:                             3004 Mhz
  Memory Bus Width:                              384-bit
  L2 Cache Size:                                 1572864 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
  Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device supports Managed Memory:                Yes
  Device supports Compute Preemption:            No
  Supports Cooperative Kernel Launch:            No
  Supports MultiDevice Co-op Kernel Launch:      No
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 11.0, CUDA Runtime Version = 11.0, NumDevs = 1
Result = PASS
[Sat Sep 12 14:34:48 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$

BTW, I looked at deviceQuery.cu and the initialization part seems to be the same, except for the call to cudaDeviceReset () that I'm making. I do want to make this call to ensure that the device is clean at startup.

I cannot tell where in your code the error is thrown. This looks suspicious to me:

gpuErrChk (cudaSetDevice (gc -> dev_count));

Devices are enumerated starting at zero, so valid arguments for cudaSetDevice() would be in [0, dev_count).
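
For a single device, the intended pattern would look something like this (a minimal sketch reusing your gpuErrChk macro; dev_id is just an illustrative local variable):

	int dev_count = 0;
	int dev_id = 0;					// first (and only) device
	cudaDeviceProp prop;

	gpuErrChk (cudaGetDeviceCount (&dev_count));
	if (dev_count < 1) {
		fprintf (stderr, "no CUDA-capable device found\n");
		exit (EXIT_FAILURE);
	}
	gpuErrChk (cudaSetDevice (dev_id));		// any ordinal in [0, dev_count) is valid
	gpuErrChk (cudaGetDeviceProperties (&prop, dev_id));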

When you compiled the code, did you specify sm_35 as the target architecture? The CUDA 11 compiler uses sm_52 as the default target, and a kernel compiled only for that default cannot run on your GPU.

gc -> dev_count is 1.

I had also edited the Makefile to get rid of its warnings and set

SMS ?= 52 60 61 70 75 80

OK. Let me change these and come back to you.

If dev_count is 1, then the only valid device ordinal to pass to cudaSetDevice() is 0. Is this what your code passes as the function argument? From a cursory reading of the code it seems you may be passing 1, which would be out of range, i.e. invalid.
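
If in doubt, a quick way to see which ordinals exist is to enumerate the devices (sketch only):

	int n = 0;
	gpuErrChk (cudaGetDeviceCount (&n));
	for (int d = 0; d < n; d++) {
		cudaDeviceProp p;
		gpuErrChk (cudaGetDeviceProperties (&p, d));
		printf ("ordinal %d: %s (sm_%d%d)\n", d, p.name, p.major, p.minor);
	}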

Thank you for the inputs.
OK. I got past THAT error. Here's the command-line dump, with the nvcc invocation taken from my Makefile:

 [Sat Sep 12 13:50:47 IST 2020]
 rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
 $ /usr/local/cuda/bin/nvcc -ccbin g++ -I../../../common/include -I/usr/local/cuda/samples/common/inc/ -m64 -g -G -O0 --compiler-options -Wall -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -o tst tst.cu
 nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
 [Sat Sep 12 13:50:56 IST 2020]
 rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
 $ ./tst
 DEBUG tst.cu:gpu_configuration:55: Device count 0
 DEBUG tst.cu:gpu_configuration:70: Dev prop name: GeForce GTX 780, tot_mem: 3163488256 sharedMemPerBlock 49152
 warpSize 32 maxThreadsPerBlock 1024
 maxthreads per mprocessor 2048

It works fine. dev_count is now 0, cudaGetDeviceProperties () works, and so does the kernel launch.
The relevant part of the modified code (the full original file is above) is:

static void gpu_configuration (gpu_config *gc)
 {
     // assumption: a single device.  Needs to be modified for multiple devices.
     // use gpuDeviceInit from helper_cuda.h
     gpuErrChk (cudaDeviceReset ());     // reset device
     gpuErrChk (cudaGetDeviceCount (&gc -> dev_count));
     gc -> dev_count--;
     Dbg("Device count %d", gc -> dev_count);
     gpuErrChk (cudaSetDevice (gc -> dev_count));

     // gc -> dev_prop = malloc (sizeof (cudaDeviceProp) * gc -> dev_count);
     // for multiple devices only.
     gpuErrChk (cudaGetDeviceProperties (&(gc -> dev_prop), gc -> dev_count));
     gc -> n_threads = gc -> dev_prop.maxThreadsPerBlock;
     gc -> shmem = gc -> dev_prop.sharedMemPerBlock;

     gpuErrChk (cudaMemGetInfo (&(gc -> free_mem), &(gc -> tot_mem)));

     Dbg ("Dev prop name: %s, tot_mem: %u sharedMemPerBlock %u\nwarpSize %d maxThreadsPerBlock %d\nmaxthreads per mprocessor %d",
     gc -> dev_prop.name, (unsigned) gc -> dev_prop.totalGlobalMem,
     (unsigned) gc -> dev_prop.sharedMemPerBlock,
     gc -> dev_prop.warpSize, gc -> dev_prop.maxThreadsPerBlock,
     gc -> dev_prop.maxThreadsPerMultiProcessor);

     dim3 grid (1);
     dim3 block (32);
     K_dummy <<<grid, block>>> ();        // warm up the GPU
     CUDA_GLOBAL_CHECK;
 }
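
(A variant that keeps dev_count intact and tracks the selected ordinal in a separate local, sketched here for illustration with a dev_id variable, avoids the slightly odd "Device count 0" in the Dbg output:)

     gpuErrChk (cudaGetDeviceCount (&gc -> dev_count));
     Dbg ("Device count %d", gc -> dev_count);

     int dev_id = gc -> dev_count - 1;	// with one device this is 0; any value in [0, dev_count) is valid
     gpuErrChk (cudaSetDevice (dev_id));
     gpuErrChk (cudaGetDeviceProperties (&(gc -> dev_prop), dev_id));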

BUT NOW, if I remove -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 from the compilation (I'm trying to eliminate the deprecation warnings from the make), I get:

 [Sat Sep 12 13:51:02 IST 2020]
 rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
 $ /usr/local/cuda/bin/nvcc -ccbin g++ -I../../../common/include -I/usr/local/cuda/samples/common/inc/ -m64 -g -G -O0 --compiler-options -Wall -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -o tst tst.cu
 [Sat Sep 12 13:54:56 IST 2020]
 rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
 $ ./tst
 DEBUG tst.cu:gpu_configuration:55: Device count 0
 DEBUG tst.cu:gpu_configuration:70: Dev prop name: GeForce GTX 780, tot_mem: 3163488256 sharedMemPerBlock 49152
 warpSize 32 maxThreadsPerBlock 1024
 maxthreads per mprocessor 2048
 CUDA call from file:tst.cu func:gpu_configuration 75: cudaErrorNoKernelImageForDevice:no kernel image is available for execution on the device failed
 Segmentation fault (core dumped)
 [Sat Sep 12 13:55:00 IST 2020]
 rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
 $

Basically it is crashing at K_dummy. Why would this be?
To repeat, K_dummy () is:

 static __global__ void K_dummy (void)
 {
     unsigned int i = threadIdx.x;
     i += 2;     // any ole' code to warm the card up.
     i++;
 }

As I said, the default architecture target for the compiler in CUDA 11 is sm_52. Since your device is an sm_35 device, you need to specify that as (one) target architecture, independent of any additional architectures you might want to specify.

GPU instruction set architectures are not binary compatible: a kernel compiled for sm_52 (default) will not be able to run on an sm_35 device. Trying to do so will result in a launch error.

So put the -gencode arch=compute_35,code=sm_35 back onto the nvcc command line.
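
For example, targeting just the GTX 780, something like this should work (a sketch; -Wno-deprecated-gpu-targets merely silences the deprecation warning and does not change code generation):

 /usr/local/cuda/bin/nvcc -ccbin g++ -m64 -g -G -O0 --compiler-options -Wall \
     -gencode arch=compute_35,code=sm_35 \
     -Wno-deprecated-gpu-targets \
     -o tst tst.cu

If you are ever unsure which architecture a device needs, dev_prop.major and dev_prop.minor (already captured in your gpu_config struct via cudaGetDeviceProperties()) give the sm_XY value to compile for.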

Thank you.