nvmlDeviceGetPowerUsage() SIGSEGV when invoked under a thread

I wrote a monitoring daemon that samples data from the NVML library; for every GPU card I spawn a new thread.

Everything was fine until driver version 535.104.05, when the daemon started segfaulting.

After thorough investigation, I verified that if nvmlDeviceGetPowerUsage() is invoked from the main thread, everything is fine; if it is invoked from a spawned thread, it segfaults.

Calling other monitoring functions from within a thread, like nvmlDeviceGetMemoryInfo(), works fine.

Here is an example of the stack trace:
#0 0x00007fffea684511 in ?? () from /lib64/libnvidia-ml.so
#1 0x00007fffea685a98 in ?? () from /lib64/libnvidia-ml.so
#2 0x00007fffea626b3f in nvmlDeviceGetPowerUsage () from /lib64/libnvidia-ml.so
#3 0x000000000043dc3d in ?? ()
#4 0x000000000043d733 in ?? ()
#5 0x000000000043c2a3 in ?? ()
#6 0x0000000000426f7e in ?? ()
#7 0x00007ffff7bb61ca in start_thread () from /lib64/libpthread.so.0
#8 0x00007ffff729ce73 in clone () from /lib64/libc.so.6

Hi,

Thanks for bringing this to our attention. Do you have a reproducer that we could use to aid us in our investigation?

Hi Gupadhyaya!!

Sorry, it was my fault.

It was actually a stack size problem within the thread; increasing the thread's stack size solved the problem.

This was the sample script I used:

#!/usr/bin/env micropython
import ffi
import sys
import time as clock
import uctypes as ctypes
import _thread as thread

# Size in bytes of the buffer handed to nvmlDeviceGetName.
MAXSTRINGLEN = 256

# uctypes layout with a single 64-bit unsigned field 'value' at offset 0;
# used both for device handles and for integer out-parameters.
pointer_t = {'value': ctypes.UINT64 | 0}

# Bind the NVML entry points used below. Each binding is
# func(return_code, symbol_name, argument_codes); the codes are presumably
# 'i' = int, 'p' = pointer, 's' = C string -- confirm against the ffi docs.
libml = ffi.open('libnvidia-ml.so')

# Library lifecycle and error reporting.
nvmlInit = libml.func('i', 'nvmlInit', '')
nvmlShutdown = libml.func('i', 'nvmlShutdown', '')
nvmlErrorString = libml.func('s', 'nvmlErrorString', 'i')

# Device enumeration.
nvmlDeviceGetCount = libml.func('i', 'nvmlDeviceGetCount', 'p')
nvmlDeviceGetHandleByIndex = libml.func('i', 'nvmlDeviceGetHandleByIndex', 'ip')

# Per-device queries.
nvmlDeviceGetName = libml.func('i', 'nvmlDeviceGetName', 'ppi')
nvmlDeviceGetPowerUsage = libml.func('i', 'nvmlDeviceGetPowerUsage', 'pp')

def new(sdesc):
    """Allocate zero-filled storage for the uctypes layout `sdesc`.

    Returns a uctypes struct in native layout that views a freshly created
    bytearray sized to hold `sdesc`.
    """
    storage = bytearray(ctypes.sizeof(sdesc))
    address = ctypes.addressof(storage)
    return ctypes.struct(address, sdesc, ctypes.NATIVE)

def thread_start(device):
    """Thread entry point: query and print the power usage of one GPU.

    device -- struct created by new(pointer_t); its .value is passed to
              NVML as the device handle.

    On failure the NVML error text is printed and no power value is shown.
    """
    # nvmlDeviceGetPowerUsage writes a C `unsigned int`, so give it a 32-bit
    # slot rather than reading the result back through a 64-bit field.
    power = new({'value': ctypes.UINT32 | 0})

    res = nvmlDeviceGetPowerUsage(device.value, power)
    if res != 0:
        print("nvmlDeviceGetPowerUsage:", nvmlErrorString(res))
        # The output slot was not filled in; do not print a bogus reading.
        return

    print(power.value)

def _check(res, label, exit_code):
    """If `res` is non-zero, print `label` with the NVML error text and exit."""
    if res != 0:
        print(label, nvmlErrorString(res))
        sys.exit(exit_code)


_check(nvmlInit(), "nvmlInit:", 6)

# Ask NVML how many devices are present.
numDevices = new(pointer_t)
_check(nvmlDeviceGetCount(numDevices), "nvmlDeviceGetCount:", 7)

for i in range(numDevices.value):
    # Resolve the handle for device i.
    device = new(pointer_t)
    _check(nvmlDeviceGetHandleByIndex(i, device),
           "nvmlDeviceGetHandleByIndex:", 8)

    # Fetch the device name into a fixed-size buffer and strip the NUL padding.
    devname = bytearray(MAXSTRINGLEN)
    _check(nvmlDeviceGetName(device.value, ctypes.addressof(devname), len(devname)),
           "nvmlDeviceGetName:", 9)

    print("GPU(%d): %s" % (i, devname.decode().rstrip('\0')))

    # Request a 1 MiB stack before starting the worker thread for this device.
    thread.stack_size(1048576)
    thread.start_new_thread(thread_start, (device,))

    # Give the worker a second to run before moving on.
    clock.sleep(1)

_check(nvmlShutdown(), "nvmlShutdown:", 10)

sys.exit(0)

Which could be translated to this in C:

/**
 *   Originally sourced from https://github.com/johnelse/nvml-experiments/tree/master/basic_dlopen
 */
#include <stdio.h>
#include <stdlib.h>

#include <dlfcn.h>

#include <nvml.h>

#include <pthread.h>
#include <unistd.h>
/* Forward declaration: main() calls getPower() before its definition. */
unsigned int getPower(int i, nvmlDevice_t device, nvmlReturn_t (*func)(nvmlDevice_t, unsigned int*), char* (*err)(nvmlReturn_t));

/**
 * - Load the NVML library via dlopen.
 * - Look up some symbols:
 *   - nvmlErrorString
 *   - nvmlInit
 *   - nvmlShutdown
 *   - nvmlSystemGetDriverVersion
 * - Using these symbols:
 *   - Initialise the library.
 *   - Get the driver version.
 *   - Shut down the library.
 * - Finally, unload the library with dlclose.
 */

int main(int argv, char** argc) {
    void *nvml_handle;
    int err;
    nvmlReturn_t nvml_err;
    char *error;
    unsigned int i;
    unsigned int device_count;
    nvmlDevice_t device;
    nvmlPciInfo_t pci_info;

    char* (*_nvmlErrorString)(nvmlReturn_t);
    nvmlReturn_t (*_nvmlInit)(void);
    nvmlReturn_t (*_nvmlShutdown)(void);
    nvmlReturn_t (*_nvmlSystemGetDriverVersion)(char*, unsigned int);
    nvmlReturn_t (*_nvmlDeviceGetCount)(unsigned int*);
    nvmlReturn_t (*_nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t*);
    nvmlReturn_t (*_nvmlDeviceGetPciInfo)(nvmlDevice_t, nvmlPciInfo_t*);
    nvmlReturn_t (*_nvmlDeviceGetPowerUsage)(nvmlDevice_t, unsigned int*);

    char driver_version[NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE];

    // Open a handle to the shared library.
    nvml_handle = dlopen("libnvidia-ml.so", RTLD_LAZY);
    if (!nvml_handle) {
        fprintf(stderr, "Failed to load library: %s\n", dlerror());
        exit(1);
    }
    printf("Library opened\n");

    // Load some symbols.
    _nvmlErrorString = dlsym(nvml_handle, "nvmlErrorString");
    if((error = dlerror()) != NULL) {
        fprintf(stderr, "%s\n", error);
        exit(1);
    }
    printf("Found nvmlErrorString\n");

    _nvmlInit = dlsym(nvml_handle, "nvmlInit");
    if((error = dlerror()) != NULL) {
        fprintf(stderr, "%s\n", error);
        exit(1);
    }
    printf("Found nvmlInit\n");

    _nvmlShutdown = dlsym(nvml_handle, "nvmlShutdown");
    if((error = dlerror()) != NULL) {
        fprintf(stderr, "%s\n", error);
        exit(1);
    }
    printf("Found nvmlShutdown\n");

    _nvmlSystemGetDriverVersion =
        dlsym(nvml_handle, "nvmlSystemGetDriverVersion");
    if((error = dlerror()) != NULL) {
        fprintf(stderr, "%s\n", error);
        exit(1);
    }
    printf("Found nvmlSystemGetDriverVersion\n");

    _nvmlDeviceGetCount =
        dlsym(nvml_handle, "nvmlDeviceGetCount");
    if((error = dlerror()) != NULL) {
        fprintf(stderr, "%s\n", error);
        exit(1);
    }
    printf("Found nvmlDeviceGetCount\n");

    _nvmlDeviceGetHandleByIndex =
        dlsym(nvml_handle, "nvmlDeviceGetHandleByIndex");
    if((error = dlerror()) != NULL) {
        fprintf(stderr, "%s\n", error);
        exit(1);
    }
    printf("Found nvmlDeviceGetHandleByIndex\n");

    _nvmlDeviceGetPciInfo =
        dlsym(nvml_handle, "nvmlDeviceGetPciInfo");
    if((error = dlerror()) != NULL) {
        fprintf(stderr, "%s\n", error);
        exit(1);
    }
    printf("Found nvmlDeviceGetPciInfo\n");

    _nvmlDeviceGetPowerUsage =
        dlsym(nvml_handle, "nvmlDeviceGetPowerUsage");
    if((error = dlerror()) != NULL) {
        fprintf(stderr, "%s\n", error);
        exit(1);
    }
    printf("Found nvmlDeviceGetPowerUsage\n");

    // Initialise the library.
    nvml_err = _nvmlInit();
    if (NVML_SUCCESS != nvml_err) {
        fprintf(stderr, "init failed: %s\n", _nvmlErrorString(nvml_err));
        exit(1);
    }
    printf("Initialised NVML\n");

    // Query the driver version.
    nvml_err = _nvmlSystemGetDriverVersion(driver_version,
        NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE);
    if (NVML_SUCCESS != nvml_err) {
        fprintf(stderr, "couldn't get driver version: %s\n",
                _nvmlErrorString(nvml_err));
        exit(1);
    }
    printf("Got driver version: %s\n", driver_version);

    // Query the number of installed devices.
    nvml_err = _nvmlDeviceGetCount(&device_count);
    if (NVML_SUCCESS != nvml_err) {
        fprintf(stderr, "couldn't get device count: %s\n",
                _nvmlErrorString(nvml_err));
        exit(1);
    }

    // Attempt to query each device's PCI info.
    for (i = 0; i < device_count; i++) {
        nvml_err = _nvmlDeviceGetHandleByIndex(i, &device);
        if (NVML_SUCCESS != nvml_err) {
            fprintf(stderr, "couldn't get handle for device %d: %s\n",
                    i, _nvmlErrorString(nvml_err));
        }
        else {
            nvml_err = _nvmlDeviceGetPciInfo(device, &pci_info);
            if (NVML_SUCCESS != nvml_err) {
                fprintf(stderr, "couldn't get PCI info for device %d: %s\n",
                        i, _nvmlErrorString(nvml_err));
            }
            else {
                unsigned int power = getPower(i, device, _nvmlDeviceGetPowerUsage, _nvmlErrorString);
                printf("%s\n", "-------------------");
                printf("PCI info for device %d:\n", i);
                printf("busId = %s\n", pci_info.busId);
                printf("pciDeviceId = %x\n", pci_info.pciDeviceId);
                printf("pciSubSystemId = %x\n", pci_info.pciSubSystemId);
                printf("Power usage = %d\n", power);
            }
        }
    }

    // Shutdown the library.
    nvml_err = _nvmlShutdown();
    if (NVML_SUCCESS != nvml_err) {
        fprintf(stderr, "shutdown failed: %s\n", _nvmlErrorString(nvml_err));
        exit(1);
    }
    printf("Shutdown NVML\n");

    // Close the handle.
    err = dlclose(nvml_handle);
    if (err) {
        fprintf(stderr, "Failed to close library: %s\n", dlerror());
        exit(1);
    }
    printf("Library closed\n");

    printf("Everything seems to be working OK :-)\n");

    return 0;
}

void *thread_start(void * param)
{
    nvmlReturn_t (*func)(nvmlDevice_t, unsigned int*) = ((void**)param)[2];
    char* (*err)(nvmlReturn_t) = ((void**)param)[3];
    nvmlDevice_t* device = ((void**)param)[0];
    unsigned int* power = ((void**)param)[1];
    int* i = ((void**)param)[5];

    nvmlReturn_t nvml_err = func(*device, power);
    if (NVML_SUCCESS != nvml_err) {
       fprintf(stderr, "couldn't get power info for device %d: %s\n",
                    *i, err(nvml_err));
    }
    return NULL;
}

/**
 * Query the power usage of `device` from a freshly created thread.
 *
 * @param i       device index, used in error messages
 * @param device  device handle passed to `func`
 * @param func    power query function (nvmlDeviceGetPowerUsage-style)
 * @param err     converts an nvmlReturn_t into a printable string
 * @return the value `func` stored, or 0 if the worker thread could not be
 *         set up or `func` stored nothing.
 *
 * The worker receives pointers to this function's locals, so this function
 * joins the thread before returning.
 */
unsigned int getPower(int i, nvmlDevice_t device, nvmlReturn_t (*func)(nvmlDevice_t, unsigned int*), char* (*err)(nvmlReturn_t))
{
    pthread_t thread;
    unsigned int power = 0;
    int rc;

    void **params = malloc(5 * sizeof *params);
    if (params == NULL) {
        fprintf(stderr, "couldn't allocate thread parameters for device %d\n", i);
        return 0;
    }
    params[0] = &device;
    params[1] = &power;
    params[2] = func;
    params[3] = err;
    params[4] = &i;

    rc = pthread_create(&thread, NULL, thread_start, params);
    if (rc != 0) {
        fprintf(stderr, "couldn't create thread for device %d (error %d)\n", i, rc);
        free(params);
        return 0;
    }

    /* Wait for the worker to finish: it writes into `power` and reads
       `device` and `i`, all of which stop existing once we return. */
    rc = pthread_join(thread, NULL);
    if (rc != 0) {
        fprintf(stderr, "couldn't join thread for device %d (error %d)\n", i, rc);
    }

    free(params);
    return power;
}

No problem, glad you could diagnose and fix the issue.