OK, let me quote the docs first and point out that both cudaGetDeviceCount and cudaGetDeviceProperties are available in the cudart library (which you can package with your application in binary form if you want). No NVIDIA driver needs to be installed for this to work. I’ve tested it on windows, linux, and mac w/o problems.
You have to read the docs carefully, though, as there are many quirks that don’t make a lot of intuitive sense (like: why does cudaGetDeviceCount return 1 when there are 0 devices on the system!)
4.4.2.3 cudaError_t cudaGetDeviceCount (int *count)
Returns in count the number of devices with compute capability greater or equal to 1.0 that are available for
execution. If there is no such device, cudaGetDeviceCount() returns 1 and device 0 only supports device
emulation mode. Since this device will be able to emulate all hardware features, this device will report major and
minor compute capability versions of 9999.
So to determine if you have a “real” gpu, you need to run cudaGetDeviceCount and then filter out the GPUs that report compute capability 9999 (the docs are horribly wrong to state that the magical “device” can support all hardware features). It gets even more fun if you want your code to also work in emulation mode. Additionally, in recent versions of CUDA you can compare the cuda driver version against the runtime version to make sure that they are compatible.
For those who are curious, I wrapped this all up into a beast of a GPU selection function for hoomd. It also prioritizes faster and non-kernel-timeout GPUs for selection first. Here is the code:
//! Entry used when ranking GPUs by desirability
struct gpu_elem
    {
    //! Construct an entry from a priority score and a device id
    gpu_elem(float prio = 0.0f, int id = 0) : priority(prio), gpu_id(id)
        {
        }

    float priority; //!< Priority score computed for this GPU
    int gpu_id;     //!< CUDA device id of this GPU
    };
//! Ordering for gpu_elem: higher priority sorts first, ties broken by lower gpu_id
/*! \param a first element in the comparison
    \param b second element in the comparison
    \return true if \a a should be placed before \a b in the sorted order
*/
bool operator<(const gpu_elem& a, const gpu_elem& b)
    {
    // unequal priorities: the larger priority comes first (descending sort)
    if (a.priority != b.priority)
        return a.priority > b.priority;

    // equal priorities: fall back to ascending device id for a deterministic order
    return a.gpu_id < b.gpu_id;
    }
/*! \param ignore_display If set to true, try to ignore GPUs attached to the display

    Each GPU that CUDA reports to exist is scrutinized to determine if it is actually capable of running HOOMD.
    When one is found to be lacking, it is marked as unavailable and a short notice is printed as to why.
    Available GPUs are then sorted so that faster devices (and devices without a kernel run-time limit)
    are listed first in m_gpu_list.

    \post m_gpu_list, m_gpu_available and m_system_compute_exclusive are all filled out
*/
void ExecutionConfiguration::scanGPUs(bool ignore_display)
    {
#if CUDART_VERSION >= 2020
    // check the CUDA driver version
    int driverVersion = 0;
    cudaDriverGetVersion(&driverVersion);

#ifndef _DEVICEEMU
    // device emulation mode doesn't need a driver

    // first handle the situation where no driver is installed (or it is a CUDA 2.1 or earlier driver)
    if (driverVersion == 0)
        {
        cout << endl << "***Warning! NVIDIA driver not installed or is too old, ignoring any GPUs in the system."
            << endl << endl;
        return;
        }

    // next, check to see if the driver is capable of running the version of CUDART that HOOMD was compiled against
    if (driverVersion < CUDART_VERSION)
        {
        // CUDA encodes versions as major*1000 + minor*10
        int driver_major = driverVersion / 1000;
        int driver_minor = (driverVersion - driver_major * 1000) / 10;
        int cudart_major = CUDART_VERSION / 1000;
        int cudart_minor = (CUDART_VERSION - cudart_major * 1000) / 10;
        cout << endl << "***Warning! The NVIDIA driver only supports CUDA versions up to " << driver_major << "."
            << driver_minor << ", but HOOMD was built against CUDA " << cudart_major << "." << cudart_minor << endl;
        cout << "    Ignoring any GPUs in the system." << endl;
        return;
        }
#endif
#endif

    // determine the number of GPUs that CUDA thinks there is
    int dev_count;
    cudaError_t error = cudaGetDeviceCount(&dev_count);
    if (error != cudaSuccess)
        {
        cerr << endl << "***Error! Error calling cudaGetDeviceCount()." << endl << endl;
        throw runtime_error("Error initializing execution configuration");
        }

    // initialize variables
    int n_exclusive_gpus = 0;
    m_gpu_available.resize(dev_count);

    // loop through each GPU and check its properties
    for (int dev = 0; dev < dev_count; dev++)
        {
        // get the device properties
        cudaDeviceProp dev_prop;
        cudaError_t error = cudaGetDeviceProperties(&dev_prop, dev);
        if (error != cudaSuccess)
            {
            cerr << endl << "***Error! Error calling cudaGetDeviceProperties()." << endl << endl;
            throw runtime_error("Error initializing execution configuration");
            }

        // start by assuming that the device is available, it will be excluded later if it is not
        m_gpu_available[dev] = true;

        // if this is not a device emulation build: exclude the device emulation device
        // (cudart reports it with compute capability 9999.9999)
#ifndef _DEVICEEMU
        if (dev_prop.major == 9999 && dev_prop.minor == 9999)
            {
            m_gpu_available[dev] = false;
            cout << "Notice: GPU id " << dev << " is not available for computation because "
                << "it is an emulated device" << endl;
            }
#endif

        // exclude a GPU if its compute version is not high enough
        int compoundComputeVer = dev_prop.minor + dev_prop.major * 10;
        if (m_gpu_available[dev] && compoundComputeVer < CUDA_ARCH)
            {
            m_gpu_available[dev] = false;
            cout << "Notice: GPU id " << dev << " is not available for computation because "
                << "its compute capability is not high enough" << endl;

            int min_major = CUDA_ARCH/10;
            int min_minor = CUDA_ARCH - min_major*10;
            cout << "    This build of hoomd was compiled for a minimum capability of " << min_major << "."
                << min_minor << " but the GPU is only " << dev_prop.major << "." << dev_prop.minor << endl;
            }

#if CUDART_VERSION > 2010
        // ignore the display gpu if that was requested
        // (kernelExecTimeoutEnabled is set on devices driving a display)
        if (m_gpu_available[dev] && ignore_display && dev_prop.kernelExecTimeoutEnabled)
            {
            m_gpu_available[dev] = false;
            cout << "Notice: GPU id " << dev << " is not available for computation because "
                << "it appears to be attached to a display" << endl;
            }
#else
        if (ignore_display)
            {
            cout << endl << "***Warning! --ignore-display-gpu is ineffective because this build of HOOMD was compiled"
                << " against a CUDA version older than 2.1" << endl << endl;
            }
#endif

#if CUDART_VERSION >= 2020
        // exclude a gpu if it is compute-prohibited
        if (m_gpu_available[dev] && dev_prop.computeMode == cudaComputeModeProhibited)
            {
            m_gpu_available[dev] = false;
            cout << "Notice: GPU id " << dev << " is not available for computation because "
                << "it is set in the compute-prohibited mode" << endl;
            }

        // count the number of compute-exclusive gpus
        if (m_gpu_available[dev] && dev_prop.computeMode == cudaComputeModeExclusive)
            n_exclusive_gpus++;
#endif
        }

    // build a priority entry for every available GPU
    std::vector<gpu_elem> gpu_priorities;
    for (int dev = 0; dev < dev_count; dev++)
        {
        if (m_gpu_available[dev])
            {
            cudaDeviceProp dev_prop;
            cudaError_t error = cudaGetDeviceProperties(&dev_prop, dev);
            if (error != cudaSuccess)
                {
                cerr << endl << "***Error! Error calling cudaGetDeviceProperties()." << endl << endl;
                throw runtime_error("Error initializing execution configuration");
                }

            // calculate a simple priority: multiprocessors * clock = speed, then subtract a bit if the device is
            // attached to a display
            float priority = float(dev_prop.clockRate * dev_prop.multiProcessorCount) / float(1e7);
#if CUDART_VERSION > 2010
            if (dev_prop.kernelExecTimeoutEnabled)
                priority -= 0.1f;
#endif
            gpu_priorities.push_back(gpu_elem(priority, dev));
            }
        }

    // sort the GPUs based on priority (operator< sorts higher priorities first)
    sort(gpu_priorities.begin(), gpu_priorities.end());

    // add the prioritized GPUs to the list
    for (unsigned int i = 0; i < gpu_priorities.size(); i++)
        m_gpu_list.push_back(gpu_priorities[i].gpu_id);

    // the system is fully compute-exclusive if all capable GPUs are compute-exclusive
    m_system_compute_exclusive = (n_exclusive_gpus == getNumCapableGPUs());
    }
/*! \param gpu_id ID of the GPU to check for availability
    \pre scanGPUs() has been called
    \return The availability status of GPU \a gpu_id as determined by scanGPUs()
*/
bool ExecutionConfiguration::isGPUAvailable(int gpu_id)
    {
    // -1 is the "automatic selection" sentinel and is always considered available
    if (gpu_id == -1)
        return true;

    // any other id must lie inside the scanned range to be available
    bool in_range = (gpu_id >= 0) && ((unsigned int)gpu_id < m_gpu_available.size());
    return in_range && m_gpu_available[gpu_id];
    }
/*! \pre scanGPUs() has been called
    \return The number of GPUs that scanGPUs() determined to be available
*/
int ExecutionConfiguration::getNumCapableGPUs()
    {
    // tally every device flagged as available by scanGPUs()
    int n_capable = 0;
    unsigned int idx = 0;
    while (idx < m_gpu_available.size())
        {
        if (m_gpu_available[idx])
            ++n_capable;
        ++idx;
        }
    return n_capable;
    }
Hopefully that code makes some sense out of context. The generated list m_gpu_list is suitable for passing into cudaSetValidDevices(), where you can then allow cudart to automatically select one of them (compute-exclusive systems), in the priority order determined by the above code. Or you could just pick one of the valid devices and cudaSetDevice().
If getNumCapableGPUs() returns 0, then the code can take the CPU path and must not call any functions that would attempt to auto-initialize a GPU (like cudaMalloc) or you will get an invalid device error from CUDA.