On Windows, getting list of graphics or compute processes can return invalid argument depending on launch

Hello?

Bump.

Not fixed in 551.68.

Hi @BlueGoliath There is an internal team tracking this issue. If you could provide a minimal reproducer in C/C++, that would accelerate the process.

I'm not sure how to make a minimal C/C++ example. I tried this:

#include <cstdio>
#include <cstdlib>
#include <string>

#include <windows.h>

#include "nvml.h"

using namespace std;

// GPU handle read by doUpdate() on every worker thread; main() assigns it
// before any of the threads are created.
nvmlDevice_t device;

// Heap-allocated output slots, one per clock domain. Each polling loop passes
// its own slot to doUpdate(), so no two loops write to the same location.
unsigned int* graphicsValue = static_cast<unsigned int*>(std::malloc(sizeof(unsigned int)));
unsigned int* smValue       = static_cast<unsigned int*>(std::malloc(sizeof(unsigned int)));
unsigned int* memoryValue   = static_cast<unsigned int*>(std::malloc(sizeof(unsigned int)));
unsigned int* videoValue    = static_cast<unsigned int*>(std::malloc(sizeof(unsigned int)));

/**
 * Queries the maximum clock of one clock domain on the global `device` and
 * reports anomalies on stdout.
 *
 * Prints "FAIL" when the NVML call does not return NVML_SUCCESS, and "BUG"
 * when it returns NVML_SUCCESS but stores 0 in *valuePointer. Prints nothing
 * otherwise. Output is flushed immediately after each message.
 *
 * @param clockType    clock domain to query
 * @param valuePointer location that receives the reported clock value
 */
void doUpdate(nvmlClockType_t clockType, unsigned int* valuePointer)
{
    const nvmlReturn_t status = nvmlDeviceGetMaxClockInfo(device, clockType, valuePointer);

    // Outright failure of the call: report it and stop here.
    if(status != NVML_SUCCESS)
    {
        printf("FAIL");
        fflush(stdout);
        return;
    }

    // The call claims success; a zero clock is the anomaly being hunted.
    if(*valuePointer == 0)
    {
        printf("BUG");
        fflush(stdout);
    }
}

unsigned int loopGraphicsUpdate(void* ptr)
{
    while(true)
    {
        doUpdate(NVML_CLOCK_GRAPHICS, graphicsValue);
        Sleep(1);
    }
    
    return 0;
}

unsigned int loopSMUpdate(void* ptr)
{
    while(true)
    {
        doUpdate(NVML_CLOCK_SM, smValue);
        Sleep(1);
    }
    
    return 0;
}

unsigned int loopMemoryUpdate(void* ptr)
{
    while(true)
    {
        doUpdate(NVML_CLOCK_MEM, memoryValue);
        Sleep(1);
    }
    
    return 0;
}

unsigned int loopVideoUpdate(void* ptr)
{
    while(true)
    {
        doUpdate(NVML_CLOCK_VIDEO, videoValue);
        Sleep(1);
    }
    
    return 0;
}

int main(int argc, char** argv) {
    
    nvmlReturn_t returnValue = nvmlInit();
    
    nvmlDevice_t* devicePointer = (nvmlDevice_t*)malloc(sizeof(nvmlDevice_t*));
    
    returnValue = nvmlDeviceGetHandleByIndex_v2(0, devicePointer);
    
    device = *devicePointer;
    
    HANDLE graphicsThread = CreateThread( 
            NULL,
            0,
            loopGraphicsUpdate,
            NULL, 
            0,
            NULL);

    HANDLE smThread = CreateThread( 
            NULL,
            0,
            loopSMUpdate,
            NULL, 
            0,
            NULL);
    
    HANDLE memoryThread = CreateThread( 
            NULL,
            0,
            loopMemoryUpdate,
            NULL, 
            0,
            NULL);
    
    HANDLE videoThread = CreateThread( 
            NULL,
            0,
            loopVideoUpdate,
            NULL, 
            0,
            NULL);

    Sleep(600000000);
    
    return 0;
}

But it's not the same. The actual call to nvmlDeviceGetMaxClockInfo is made on an arbitrary thread of a ScheduledExecutorService. By default, the number of threads in the pool is equal to the number of logical processors.

For a more complete picture, here is the class file in full:

package com.bluegoliath.envious.nvml.local.attributes.clocks;

import java.util.List;
import java.util.Optional;
import com.bluegoliath.bindings.nvml.enums.nvmlClockType_t;
import com.bluegoliath.bindings.nvml.enums.nvmlReturn_t;
import com.bluegoliath.bindings.nvml.nvml_h;
import com.bluegoliath.envious.base.enums.Unit;
import com.bluegoliath.envious.nvml.local.internal.NVMLLocalGPUInternal;
import com.bluegoliath.crosspoint.values.NativeInteger;
import com.bluegoliath.envious.base.abstracts.internal.NVNumberAttributeGenericBase;
import com.bluegoliath.envious.nvml.local.internal.NVMLContextInternal;
import com.bluegoliath.oroc.interfaces.EnumStringProvider;

/**
 * Number attribute named "Clock Max" (unit: megahertz) that reads the maximum
 * clock of a single NVML clock domain for a local GPU through
 * {@code nvmlDeviceGetMaxClockInfo}.
 *
 * <p>The native call writes its result into a native integer that is allocated
 * once, in the constructor, from the GPU's accounting allocator and reused by
 * every {@link #update()}.
 */
public class NVMLGPUClockMaxAttribute extends NVNumberAttributeGenericBase<Integer, nvmlReturn_t, NVMLLocalGPUInternal>
{
    /** Native output slot passed to {@code nvmlDeviceGetMaxClockInfo} on every update. */
    private final NativeInteger valuePointer;
    
    /** Clock domain this attribute queries. */
    private final nvmlClockType_t type;
    
    /** Text returned by {@link #getDescription()}. */
    private final String description;
    
    /** Converts {@link #type} to the string returned for contextual index 1. */
    private final EnumStringProvider<nvmlClockType_t> provider;
    
    /**
     * Creates the attribute and allocates its native output slot.
     *
     * @param gpu         GPU the attribute belongs to; also supplies the allocator for the native slot
     * @param type        clock domain to query
     * @param description text returned by {@link #getDescription()}
     * @param provider    string provider used to render {@code type}
     */
    public NVMLGPUClockMaxAttribute(NVMLLocalGPUInternal gpu, nvmlClockType_t type, String description, EnumStringProvider<nvmlClockType_t> provider)
    {
        super(gpu, "Clock Max", Unit.MEGAHERTZ);
        
        this.valuePointer = gpu.getAccountingAllocator().newNativeValue(NativeInteger.METADATA);
        
        this.type = type;
        this.description = description;
        this.provider = provider;
    }
    
    /**
     * Calls {@code nvmlDeviceGetMaxClockInfo} for this attribute's clock domain and
     * hands the return code, the value currently held in the native slot and the
     * elapsed time in milliseconds to {@code finishUpdate}.
     *
     * @return the NVML return code, or {@code null} if the native call threw
     */
    @Override
    public nvmlReturn_t update()
    {
        // System.nanoTime() is monotonic, so the measured duration cannot be skewed
        // (or go negative) when the wall clock is adjusted during the call.
        long startNanos = System.nanoTime();
        nvmlReturn_t returnValue = null;
        
        try
        {
            returnValue = nvml_h.INSTANCE.nvmlDeviceGetMaxClockInfo(
                    super.getNVDevice().get().getNativePointer(),
                    this.type,
                    this.valuePointer);
        }
        catch (Throwable ex)
        {
            // NOTE(review): on failure returnValue stays null and the native slot keeps
            // whatever it held before; confirm finishUpdate and callers tolerate that.
            ex.printStackTrace();
        }
        
        super.finishUpdate(returnValue, this.valuePointer.get(), (System.nanoTime() - startNanos) / 1_000_000L);
        
        return returnValue;
    }
    
    /**
     * @return {@code NVML_SUCCESS}, the code that denotes a successful update
     */
    @Override
    public nvmlReturn_t getSuccessReturnValue()
    {
        return nvmlReturn_t.NVML_SUCCESS;
    }
    
    /**
     * @param value NVML return code to render
     * @return the string produced by {@code NVMLContextInternal.toString} for {@code value}
     */
    @Override
    public String getReturnString(nvmlReturn_t value)
    {
        return NVMLContextInternal.toString(value);
    }
    
    /**
     * @return the description supplied at construction
     */
    @Override
    public Optional<String> getDescription()
    {
        return Optional.of(this.description);
    }
    
    /**
     * Returns a context string identifying what this attribute refers to.
     *
     * @param index 0 for the device's string form, 1 for the clock domain's string form
     * @return the requested string, or an empty {@code Optional} for any other index
     */
    @Override
    public Optional<String> getContextualString(int index)
    {
        switch(index)
        {
            case 0:
                return Optional.of(this.getNVDevice().get().toString());
            case 1:
                return Optional.of(this.provider.getResultString(this.type));
            default:
                return Optional.empty();
        }
    }
    
    /**
     * @return the NVML return codes this attribute declares for its update call
     */
    @Override
    public List<nvmlReturn_t> getReturnValues()
    {
        return List.of(
                nvmlReturn_t.NVML_SUCCESS,
                nvmlReturn_t.NVML_ERROR_UNINITALIZED,
                nvmlReturn_t.NVML_ERROR_INVALID_ARGUMENT,
                nvmlReturn_t.NVML_ERROR_NOT_SUPPORTED,
                nvmlReturn_t.NVML_ERROR_GPU_IS_LOST,
                nvmlReturn_t.NVML_ERROR_UNKNOWN);
    }
}

getAccountingAllocator just makes calls to the platform's malloc stdlib function and keeps track of allocations.

I don't really know what else to provide. Like I said, this max clock issue and the process function issue DO NOT happen on Linux:

Also re: pcie function issues since it seems sorta related, you can see a bug report here:

I fixed this by single-threading all PCIe calls. PCIe gen/width/speed are updated in this class by the mentioned executor:

package com.bluegoliath.envious.fx.platform.internal;

import com.bluegoliath.bindings.nvml.enums.nvmlReturn_t;
import com.bluegoliath.oroc.interfaces.NumberReadable;

/**
 * Runnable that refreshes the three PCIe readings (generation, width, speed)
 * one after another on whichever thread invokes {@link #run()}, so the three
 * updates of a single run never overlap each other.
 */
public class PCIeHotfix implements Runnable
{
    /** PCIe generation reading. */
    private final NumberReadable<Integer, nvmlReturn_t> generation;
    /** PCIe link width reading. */
    private final NumberReadable<Integer, nvmlReturn_t> linkWidth;
    /** PCIe link speed reading. */
    private final NumberReadable<Integer, nvmlReturn_t> linkSpeed;
    
    /**
     * @param gen   reading updated first
     * @param width reading updated second
     * @param speed reading updated third
     */
    public PCIeHotfix(NumberReadable<Integer, nvmlReturn_t> gen, NumberReadable<Integer, nvmlReturn_t> width, NumberReadable<Integer, nvmlReturn_t> speed)
    {
        generation = gen;
        linkWidth = width;
        linkSpeed = speed;
    }
    
    /**
     * Updates generation, then width, then speed; the return codes are discarded.
     */
    @Override
    public void run()
    {
        generation.update();
        linkWidth.update();
        linkSpeed.update();
    }
}

Anything?

Newest driver still has issues.

Still not fixed with newest driver.

552.22 not fixed.

Lol. So erm could you explain why you expect complete randos to do stuff when Nvidia team is trying their hardest to solve the issues? Seems you don't have voltage control by the way, nice one.

@pdinkar , any way to set a max voltage / modify the frequency at each voltage? I see no way in nvml currently to do so. Having a maximum voltage control would allow us to undervolt efficiently under Linux as well.

Hello @BlueGoliath , We have tried to reproduce the issue with your reproducer code on multiple different boards, running in a multi-threaded environment for an extended period of time. We could not see the issue with the NVML API calls. Regardless, we are modifying NVML to return an error on 0 max clock values.

@faz Nvidia currently does not offer voltage control.

1 Like

Thanks for looking into it, I guess. I've just disabled multi-threading by default since on Linux I'm getting the opposite problem where functions share some kind of lock. It never used to be like that on Linux a few years ago. Maybe this is a bug in the JDK.