In the following code, the call to nvmlDeviceGetUtilizationRates fails with NVML_ERROR_UNKNOWN (error code 999), even though another NVML query (nvmlDeviceGetFanSpeed) on the same device handle succeeds:
// initialize
nvmlReturn_t rval = nvmlInit();
if( rval != NVML_SUCCESS )
printf("nvmlInit: %s\n", nvmlErrorString(rval));
char szNvmlVersion[NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE];
rval = nvmlSystemGetNVMLVersion( szNvmlVersion, sizeof szNvmlVersion );
if( rval != NVML_SUCCESS )
printf("nvmlSystemGetNVMLVersion: %s\n", nvmlErrorString(rval));
char szDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
rval = nvmlSystemGetDriverVersion( szDriverVersion, sizeof szDriverVersion );
if( rval != NVML_SUCCESS )
printf("nvmlSystemGetDriverVersion: %s\n", nvmlErrorString(rval));
printf( "Initialized NVML v%s (NVidia GPU driver v%s)\n", szNvmlVersion, szDriverVersion );
// build a list of NVML device handles
unsigned int deviceCount = 0;
rval = nvmlDeviceGetCount( &deviceCount );
if( rval != NVML_SUCCESS )
printf("nvmlDeviceGetCount: %s\n", nvmlErrorString(rval));
nvmlDevice_t* hGPU = reinterpret_cast<nvmlDevice_t*>(malloc(deviceCount*(sizeof nvmlDevice_t)));
for( unsigned int i=0; i<deviceCount; ++i )
{
rval = nvmlDeviceGetHandleByIndex( i, hGPU+i );
if( rval != NVML_SUCCESS )
printf("nvmlDeviceGetHandleByIndex: %s\n", nvmlErrorString(rval));
}
printf( "Found %d GPU handle(s)", deviceCount );
for( unsigned int n=0; n<deviceCount; ++n ) printf( " 0x%016llx", reinterpret_cast<unsigned long long>(hGPU[n]) );
printf( "\n");
// this works
unsigned int pct;
rval = nvmlDeviceGetFanSpeed( hGPU[0], &pct );
if( rval != NVML_SUCCESS )
printf("nvmlDeviceGetFanSpeed: %s\n", nvmlErrorString(rval));
printf( "Fan speed (%%): %u\n", pct );
// this fails
nvmlUtilization_t nvmlUtilization = { 0 };
rval = nvmlDeviceGetUtilizationRates( hGPU[0], &nvmlUtilization );
if( rval != NVML_SUCCESS )
printf("nvmlDeviceGetUtilizationRates: %s\n", nvmlErrorString(rval));
printf( "Utilization: GPU=%u gmem=%u", nvmlUtilization.gpu, nvmlUtilization.memory );
// shutdown
rval = nvmlShutdown();
if( rval != NVML_SUCCESS )
printf("nvmlShutdown: %s\n", nvmlErrorString(rval));
// discard the list of GPU handles
free( hGPU );
Output from the above:
Initialized NVML v7.353.90 (NVidia GPU driver v353.90)
Found 3 GPU handle(s) 0x000007fef2ce9a98 0x000007fef2cea308 0x000007fef2ceab78
Fan speed (%): 41
nvmlDeviceGetUtilizationRates: Unknown Error
Utilization: GPU=0 gmem=0
Details:
- three NVIDIA K20c devices
- Windows Server 2008 R2
- Visual Studio 2013
- CUDA 7.5
- Graphics driver 353.90
- NVML (from CUDA toolkit 7.5, downloaded October 9, 2015)
- 64-bit build (32-bit build not tested)
A workaround or fix would be great!