cudaExternalSemaphore_t with D3D11 always causes memory leak

I want to repeatedly create and delete an ID3D11Buffer, using CUDA External Resource Interoperability, inside a loop, because the number of vertices differs on each iteration.

So I modified NVIDIA's official simpleD3D11 sample as follows.
(For clarity of explanation, the number of vertices does not actually change in this version.)

HRESULT Render()
{
    /*static */uint64_t key = 0;

    ID3D11Buffer* l_VertexBuffer;
    IDXGIKeyedMutex* l_pKeyedMutex11;
    Vertex* d_VertexBufPtr = NULL;
    cudaExternalMemory_t extMemory;
    cudaExternalSemaphore_t extSemaphore;

    HRESULT hr = S_OK;

    D3D11_BUFFER_DESC bufferDesc;
    bufferDesc.Usage = D3D11_USAGE_DEFAULT;
    bufferDesc.ByteWidth = sizeof(Vertex) * g_WindowWidth * g_WindowHeight;
    bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
    bufferDesc.CPUAccessFlags = 0;
    bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX;

    hr = g_pd3dDevice->CreateBuffer(&bufferDesc, NULL, &l_VertexBuffer);
    AssertOrQuit(SUCCEEDED(hr));

    hr = l_VertexBuffer->QueryInterface(__uuidof(IDXGIKeyedMutex), (void**)&l_pKeyedMutex11);

    HANDLE sharedHandle;

    IDXGIResource1* pResource;
    l_VertexBuffer->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    hr = pResource->GetSharedHandle(&sharedHandle);
    if (!SUCCEEDED(hr))
    {
      std::cout << "Failed GetSharedHandle hr= " << hr << std::endl;
    }
    // Import the D3D11 Vertex Buffer into CUDA
    d_VertexBufPtr = cudaImportVertexBuffer(sharedHandle, extMemory, g_WindowWidth, g_WindowHeight);
    pResource->Release();

    l_pKeyedMutex11->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    pResource->GetSharedHandle(&sharedHandle);
    // Import the D3D11 Keyed Mutex into CUDA
    cudaImportKeyedMutex(sharedHandle, extSemaphore);
    pResource->Release();

    // Launch cuda kernel to generate sinewave in vertex buffer
    RunSineWaveKernel(extSemaphore, key, INFINITE, g_WindowWidth, g_WindowWidth, d_VertexBufPtr, cuda_stream);

    // Draw the scene using them
    DrawScene(key, l_VertexBuffer, l_pKeyedMutex11);

    checkCudaErrors(cudaFree(d_VertexBufPtr));
    checkCudaErrors(cudaDestroyExternalMemory(extMemory));
    checkCudaErrors(cudaDestroyExternalSemaphore(extSemaphore));

    l_pKeyedMutex11->Release();

    if (l_VertexBuffer)
    {
      l_VertexBuffer->Release();
    }
    return hr;
}

The whole runnable source code is here.
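
For reference, the two helper functions are just thin wrappers around the CUDA external-resource import calls. The sketch below is modeled on the official simpleD3D11 sample; the exact bodies are in the linked source, so treat it as illustrative only.

Vertex* cudaImportVertexBuffer(HANDLE handle, cudaExternalMemory_t& extMemory, int width, int height)
{
    // Wrap the shared (KMT) handle of the D3D11 buffer as CUDA external memory.
    cudaExternalMemoryHandleDesc memDesc = {};
    memDesc.type = cudaExternalMemoryHandleTypeD3D11ResourceKmt;
    memDesc.handle.win32.handle = handle;
    memDesc.size = sizeof(Vertex) * width * height;
    memDesc.flags = cudaExternalMemoryDedicated;
    checkCudaErrors(cudaImportExternalMemory(&extMemory, &memDesc));

    // Map the imported memory to a device pointer the kernel can write to.
    cudaExternalMemoryBufferDesc bufDesc = {};
    bufDesc.offset = 0;
    bufDesc.size = sizeof(Vertex) * width * height;
    Vertex* ptr = NULL;
    checkCudaErrors(cudaExternalMemoryGetMappedBuffer((void**)&ptr, extMemory, &bufDesc));
    return ptr;
}

void cudaImportKeyedMutex(HANDLE handle, cudaExternalSemaphore_t& extSemaphore)
{
    // Wrap the keyed mutex of the same shared resource as a CUDA external semaphore.
    cudaExternalSemaphoreHandleDesc semDesc = {};
    semDesc.type = cudaExternalSemaphoreHandleTypeKeyedMutexKmt;
    semDesc.handle.win32.handle = handle;
    semDesc.flags = 0;
    checkCudaErrors(cudaImportExternalSemaphore(&extSemaphore, &semDesc));
}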

This code causes a huge memory leak: none of the ID3D11Buffer objects are actually released.

If I comment out all of the CUDA External Resource Interoperability-related calls, as follows, there is no memory leak.

HRESULT Render()
{
    /*static */uint64_t key = 0;

    ID3D11Buffer* l_VertexBuffer;
    IDXGIKeyedMutex* l_pKeyedMutex11;
    Vertex* d_VertexBufPtr = NULL;
    cudaExternalMemory_t extMemory;
    cudaExternalSemaphore_t extSemaphore;

    HRESULT hr = S_OK;

    D3D11_BUFFER_DESC bufferDesc;
    bufferDesc.Usage = D3D11_USAGE_DEFAULT;
    bufferDesc.ByteWidth = sizeof(Vertex) * g_WindowWidth * g_WindowHeight;
    bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
    bufferDesc.CPUAccessFlags = 0;
    bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX;

    hr = g_pd3dDevice->CreateBuffer(&bufferDesc, NULL, &l_VertexBuffer);
    AssertOrQuit(SUCCEEDED(hr));

    hr = l_VertexBuffer->QueryInterface(__uuidof(IDXGIKeyedMutex), (void**)&l_pKeyedMutex11);

    HANDLE sharedHandle;

    IDXGIResource1* pResource;
    l_VertexBuffer->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    hr = pResource->GetSharedHandle(&sharedHandle);
    if (!SUCCEEDED(hr))
    {
      std::cout << "Failed GetSharedHandle hr= " << hr << std::endl;
    }
    // Import the D3D11 Vertex Buffer into CUDA
//    d_VertexBufPtr = cudaImportVertexBuffer(sharedHandle, extMemory, g_WindowWidth, g_WindowHeight);
    pResource->Release();

    l_pKeyedMutex11->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    pResource->GetSharedHandle(&sharedHandle);
    // Import the D3D11 Keyed Mutex into CUDA
//    cudaImportKeyedMutex(sharedHandle, extSemaphore);
    pResource->Release();

    // Launch cuda kernel to generate sinewave in vertex buffer
//    RunSineWaveKernel(extSemaphore, key, INFINITE, g_WindowWidth, g_WindowWidth, d_VertexBufPtr, cuda_stream);

    // Draw the scene using them
    DrawScene(key, l_VertexBuffer, l_pKeyedMutex11);

//    checkCudaErrors(cudaFree(d_VertexBufPtr));
//    checkCudaErrors(cudaDestroyExternalMemory(extMemory));
//    checkCudaErrors(cudaDestroyExternalSemaphore(extSemaphore));

    l_pKeyedMutex11->Release();

    if (l_VertexBuffer)
    {
      l_VertexBuffer->Release();
    }
    return hr;
}

The whole runnable source code is here.

And if I re-enable only the cudaExternalSemaphore_t-related calls, as follows, the huge memory leak comes back.

HRESULT Render()
{
    /*static */uint64_t key = 0;

    ID3D11Buffer* l_VertexBuffer;
    IDXGIKeyedMutex* l_pKeyedMutex11;
    Vertex* d_VertexBufPtr = NULL;
    cudaExternalMemory_t extMemory;
    cudaExternalSemaphore_t extSemaphore;

    HRESULT hr = S_OK;

    D3D11_BUFFER_DESC bufferDesc;
    bufferDesc.Usage = D3D11_USAGE_DEFAULT;
    bufferDesc.ByteWidth = sizeof(Vertex) * g_WindowWidth * g_WindowHeight;
    bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
    bufferDesc.CPUAccessFlags = 0;
    bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX;

    hr = g_pd3dDevice->CreateBuffer(&bufferDesc, NULL, &l_VertexBuffer);
    AssertOrQuit(SUCCEEDED(hr));

    hr = l_VertexBuffer->QueryInterface(__uuidof(IDXGIKeyedMutex), (void**)&l_pKeyedMutex11);

    HANDLE sharedHandle;

    IDXGIResource1* pResource;
    l_VertexBuffer->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    hr = pResource->GetSharedHandle(&sharedHandle);
    if (!SUCCEEDED(hr))
    {
      std::cout << "Failed GetSharedHandle hr= " << hr << std::endl;
    }
    // Import the D3D11 Vertex Buffer into CUDA
//    d_VertexBufPtr = cudaImportVertexBuffer(sharedHandle, extMemory, g_WindowWidth, g_WindowHeight);
    pResource->Release();

    l_pKeyedMutex11->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    pResource->GetSharedHandle(&sharedHandle);
    // Import the D3D11 Keyed Mutex into CUDA
    cudaImportKeyedMutex(sharedHandle, extSemaphore);
    pResource->Release();

    // Launch cuda kernel to generate sinewave in vertex buffer
//    RunSineWaveKernel(extSemaphore, key, INFINITE, g_WindowWidth, g_WindowWidth, d_VertexBufPtr, cuda_stream);

    // Draw the scene using them
    DrawScene(key, l_VertexBuffer, l_pKeyedMutex11);

//    checkCudaErrors(cudaFree(d_VertexBufPtr));
//    checkCudaErrors(cudaDestroyExternalMemory(extMemory));
    checkCudaErrors(cudaDestroyExternalSemaphore(extSemaphore));

    l_pKeyedMutex11->Release();

    if (l_VertexBuffer)
    {
      l_VertexBuffer->Release();
    }
    return hr;
}

The whole runnable source code is here.

So I suspect that cudaImportExternalSemaphore and/or cudaDestroyExternalSemaphore do not handle the IDXGIKeyedMutex correctly.

I tried calling l_pKeyedMutex11->Release() twice, which caused a runtime error.
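
A less destructive way to look for a lingering reference is to check the count that Release() returns; per the COM documentation that value is only meant for testing, but it does show whether something (for example, the CUDA import) still holds the buffer. A sketch:

// Diagnostic only: IUnknown::Release returns the resulting reference count.
ULONG remaining = l_VertexBuffer->Release();
std::cout << "ID3D11Buffer refcount after my Release(): " << remaining << std::endl;
// A non-zero value here means something else still holds a reference to the buffer.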

In my case there is no memory leak with the old CUDA Graphics Interoperability API. However, since an official slide says the new CUDA External Resource Interoperability is much faster than the old one, I'd like to use the new API.
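
For comparison, the old-API path I fall back to looks roughly like this (a sketch of the classic cudaGraphicsD3D11RegisterResource flow, with the vertex buffer created as a plain D3D11_BIND_VERTEX_BUFFER resource and no keyed mutex; it is not copied verbatim from my code):

// Requires #include <cuda_d3d11_interop.h>
cudaGraphicsResource_t cudaResource = NULL;
checkCudaErrors(cudaGraphicsD3D11RegisterResource(&cudaResource, l_VertexBuffer,
                                                  cudaGraphicsRegisterFlagsNone));

checkCudaErrors(cudaGraphicsMapResources(1, &cudaResource, cuda_stream));
Vertex* d_ptr = NULL;
size_t numBytes = 0;
checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void**)&d_ptr, &numBytes, cudaResource));

// ... launch the sinewave kernel on d_ptr ...

checkCudaErrors(cudaGraphicsUnmapResources(1, &cudaResource, cuda_stream));
checkCudaErrors(cudaGraphicsUnregisterResource(cudaResource));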

By the way, I suspect that NVIDIA's official simpleD3D11 sample forgets to call g_pKeyedMutex11->Release() in CleanUp(), which causes another memory leak.
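
If so, the fix would be to release the mutex along with the other globals, something like:

void CleanUp()
{
    // ... release the other global D3D11/DXGI objects as the sample already does ...
    if (g_pKeyedMutex11)
    {
        g_pKeyedMutex11->Release();   // this call appears to be missing in the sample
        g_pKeyedMutex11 = NULL;
    }
}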

Any comments would be very welcome.
Thank you.

Unfortunately no one commented on this, so I wrote a bug report and submitted it on July 26.
The official NVIDIA team then investigated it, and it turned out to be a real memory-leak bug in CUDA.
There is no release note mentioning the fix, but the bug appears to have been fixed in GeForce Game Ready Driver 496.13, released on October 12.
