Please help. I’m unable to understand what I’m doing wrong. I’ve just migrated to CUDA-11 from CUDA-7.5 and I get the following error while Initializing the Card
[Sat Sep 12 14:33:15 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$ vim tst.cu
[Sat Sep 12 14:33:45 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$ nvcc tst.cu -o tst
[Sat Sep 12 14:33:47 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$ ./tst
DEBUG tst.cu:gpu_configuration:54: Device count 1
CUDA call from file:tst.cu func:gpu_configuration 55: cudaErrorInvalidDevice:invalid device ordinal failed
Segmentation fault (core dumped)
the code to reproduce this error is:
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <cuda.h>
#include <driver_types.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#define gpuErrChk(ans) {gpuAssert((ans), __FILE__, __func__, __LINE__);}
#define CUDA_GLOBAL_CHECK {gpuErrChk (cudaPeekAtLastError ()); gpuErrChk (cudaDeviceSynchronize ());}
#define Dbg(M, ...) fprintf(stderr, "DEBUG %s:%s:%d: " M "\n", __FILE__, \
__func__, __LINE__, ##__VA_ARGS__)
typedef struct gpu_config {
int n_threads; // maxThreads allowed per block
int dev_count; // number of cuda devices
size_t shmem; // sh_mem per block
size_t free_mem; // free memory on the card
size_t tot_mem; // total memory on the card
struct cudaDeviceProp dev_prop; // device properties
} gpu_config;
inline void gpuAssert (cudaError_t code, const char *file, const char *func, int line)
{
bool abort = true;
bool sig_raise = true;
if (code != cudaSuccess) {
fprintf(stderr,"CUDA call from file:%s func:%s %d: %s:%s failed\n", file, func, line, cudaGetErrorName(code), cudaGetErrorString(code));
if (sig_raise) raise (SIGSEGV);
if (abort) exit (code);
}
}
static __global__ void K_dummy (void);
static __global__ void K_dummy (void)
{
unsigned int i = threadIdx.x;
i += 2; // any ole' code to warm the card up.
i++;
}
static void gpu_configuration (gpu_config *gc);
static void gpu_configuration (gpu_config *gc)
{
// assumption: a single device. Needs to be modified for multiple devices.
// use gpuDeviceInit from helper_cuda.h
gpuErrChk (cudaDeviceReset ()); // reset device
gpuErrChk (cudaGetDeviceCount (&gc -> dev_count));
Dbg("Device count %d", gc -> dev_count);
gpuErrChk (cudaSetDevice (gc -> dev_count));
// gc -> dev_prop = malloc (sizeof (cudaDeviceProp) * gc -> dev_count);
// for multiple devices only.
gpuErrChk (cudaGetDeviceProperties (&(gc -> dev_prop), gc -> dev_count));
gc -> n_threads = gc -> dev_prop.maxThreadsPerBlock;
gc -> shmem = gc -> dev_prop.sharedMemPerBlock;
gpuErrChk (cudaMemGetInfo (&(gc -> free_mem), &(gc -> tot_mem)));
Dbg ("Dev prop name: %s, tot_mem: %u sharedMemPerBlock %u\nwarpSize %d maxThreadsPerBlock %d\nmaxthreads per mprocessor %d",
gc -> dev_prop.name, (unsigned) gc -> dev_prop.totalGlobalMem,
(unsigned) gc -> dev_prop.sharedMemPerBlock,
gc -> dev_prop.warpSize, gc -> dev_prop.maxThreadsPerBlock,
gc -> dev_prop.maxThreadsPerMultiProcessor);
dim3 grid (1);
dim3 block (32);
K_dummy <<<grid,block>>> (); // warm up the GPU
CUDA_GLOBAL_CHECK;
}
int main (void)
{
gpu_config gc;
gpu_configuration (&gc);
return 0;
}
More info:
[Sat Sep 12 14:53:15 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$ uname -a
Linux Desktop 5.4.0-47-generic #51~18.04.1-Ubuntu SMP Sat Sep 5 14:35:50 UTC 2020 x86_64 x86_64 GNU/Linux
This is not an issue of the card. I run deviceQuery and get the following:
[Sat Sep 12 14:33:51 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$ ~/Documents/cuda/samples/bin/x86_64/linux/release/deviceQuery
/home/rinka/Documents/cuda/samples/bin/x86_64/linux/release/deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GTX 780"
CUDA Driver Version / Runtime Version 11.0 / 11.0
CUDA Capability Major/Minor version number: 3.5
Total amount of global memory: 3017 MBytes (3163488256 bytes)
(12) Multiprocessors, (192) CUDA Cores/MP: 2304 CUDA Cores
GPU Max Clock rate: 902 MHz (0.90 GHz)
Memory Clock rate: 3004 Mhz
Memory Bus Width: 384-bit
L2 Cache Size: 1572864 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Device supports Managed Memory: Yes
Device supports Compute Preemption: No
Supports Cooperative Kernel Launch: No
Supports MultiDevice Co-op Kernel Launch: No
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 11.0, CUDA Runtime Version = 11.0, NumDevs = 1
Result = PASS
[Sat Sep 12 14:34:48 IST 2020]
rinka@Desktop-New:~/Documents/cuda/dev/code/tst_code/error-26-9
$
BTW, I looked at deviceQuery.cu and the initialization part seems to be the same except for the call cudaDeviceReset () that I’m making. I do want to make this call to ensure that the device is clean at startup.