cudaExternalSemaphore_t with D3D11 always causes memory leak

I want to repeatedly create and delete an ID3D11Buffer, using CUDA External Resource Interoperability, inside a loop, because the number of vertices differs on each iteration.

So I modified NVIDIA's official simpleD3D11 sample as follows.
(For clarity of explanation, the number of vertices does not actually change in this version.)

HRESULT Render()
{
    /*static */uint64_t key = 0;

    ID3D11Buffer* l_VertexBuffer;
    IDXGIKeyedMutex* l_pKeyedMutex11;
    Vertex* d_VertexBufPtr = NULL;
    cudaExternalMemory_t extMemory;
    cudaExternalSemaphore_t extSemaphore;

    HRESULT hr = S_OK;

    D3D11_BUFFER_DESC bufferDesc;
    bufferDesc.Usage = D3D11_USAGE_DEFAULT;
    bufferDesc.ByteWidth = sizeof(Vertex) * g_WindowWidth * g_WindowHeight;
    bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
    bufferDesc.CPUAccessFlags = 0;
    bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX;

    hr = g_pd3dDevice->CreateBuffer(&bufferDesc, NULL, &l_VertexBuffer);
    AssertOrQuit(SUCCEEDED(hr));

    hr = l_VertexBuffer->QueryInterface(__uuidof(IDXGIKeyedMutex), (void**)&l_pKeyedMutex11);

    HANDLE sharedHandle;

    IDXGIResource1* pResource;
    l_VertexBuffer->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    hr = pResource->GetSharedHandle(&sharedHandle);
    if (!SUCCEEDED(hr))
    {
      std::cout << "Failed GetSharedHandle hr= " << hr << std::endl;
    }
    // Import the D3D11 Vertex Buffer into CUDA
    d_VertexBufPtr = cudaImportVertexBuffer(sharedHandle, extMemory, g_WindowWidth, g_WindowHeight);
    pResource->Release();

    l_pKeyedMutex11->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    pResource->GetSharedHandle(&sharedHandle);
    // Import the D3D11 Keyed Mutex into CUDA
    cudaImportKeyedMutex(sharedHandle, extSemaphore);
    pResource->Release();

    // Launch cuda kernel to generate sinewave in vertex buffer
    RunSineWaveKernel(extSemaphore, key, INFINITE, g_WindowWidth, g_WindowWidth, d_VertexBufPtr, cuda_stream);

    // Draw the scene using them
    DrawScene(key, l_VertexBuffer, l_pKeyedMutex11);

    checkCudaErrors(cudaFree(d_VertexBufPtr));
    checkCudaErrors(cudaDestroyExternalMemory(extMemory));
    checkCudaErrors(cudaDestroyExternalSemaphore(extSemaphore));

    l_pKeyedMutex11->Release();

    if (l_VertexBuffer)
    {
      l_VertexBuffer->Release();
    }
    return hr;
}

The whole runnable source code is here.
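
For reference, the two helper functions are just thin wrappers around the CUDA external-resource import calls. The sketch below is modeled on the official simpleD3D11 sample; the exact bodies are in the linked source, so treat it as illustrative only.

Vertex* cudaImportVertexBuffer(HANDLE handle, cudaExternalMemory_t& extMemory, int width, int height)
{
    // Wrap the shared (KMT) handle of the D3D11 buffer as CUDA external memory.
    cudaExternalMemoryHandleDesc memDesc = {};
    memDesc.type = cudaExternalMemoryHandleTypeD3D11ResourceKmt;
    memDesc.handle.win32.handle = handle;
    memDesc.size = sizeof(Vertex) * width * height;
    memDesc.flags = cudaExternalMemoryDedicated;
    checkCudaErrors(cudaImportExternalMemory(&extMemory, &memDesc));

    // Map the imported memory to a device pointer the kernel can write to.
    cudaExternalMemoryBufferDesc bufDesc = {};
    bufDesc.offset = 0;
    bufDesc.size = sizeof(Vertex) * width * height;
    Vertex* ptr = NULL;
    checkCudaErrors(cudaExternalMemoryGetMappedBuffer((void**)&ptr, extMemory, &bufDesc));
    return ptr;
}

void cudaImportKeyedMutex(HANDLE handle, cudaExternalSemaphore_t& extSemaphore)
{
    // Wrap the keyed mutex of the same shared resource as a CUDA external semaphore.
    cudaExternalSemaphoreHandleDesc semDesc = {};
    semDesc.type = cudaExternalSemaphoreHandleTypeKeyedMutexKmt;
    semDesc.handle.win32.handle = handle;
    semDesc.flags = 0;
    checkCudaErrors(cudaImportExternalSemaphore(&extSemaphore, &semDesc));
}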

This code causes a huge memory leak: none of the ID3D11Buffer objects are actually released.

If I comment out all of the CUDA External Resource Interoperability-related calls, as follows, there is no memory leak.

HRESULT Render()
{
    /*static */uint64_t key = 0;

    ID3D11Buffer* l_VertexBuffer;
    IDXGIKeyedMutex* l_pKeyedMutex11;
    Vertex* d_VertexBufPtr = NULL;
    cudaExternalMemory_t extMemory;
    cudaExternalSemaphore_t extSemaphore;

    HRESULT hr = S_OK;

    D3D11_BUFFER_DESC bufferDesc;
    bufferDesc.Usage = D3D11_USAGE_DEFAULT;
    bufferDesc.ByteWidth = sizeof(Vertex) * g_WindowWidth * g_WindowHeight;
    bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
    bufferDesc.CPUAccessFlags = 0;
    bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX;

    hr = g_pd3dDevice->CreateBuffer(&bufferDesc, NULL, &l_VertexBuffer);
    AssertOrQuit(SUCCEEDED(hr));

    hr = l_VertexBuffer->QueryInterface(__uuidof(IDXGIKeyedMutex), (void**)&l_pKeyedMutex11);

    HANDLE sharedHandle;

    IDXGIResource1* pResource;
    l_VertexBuffer->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    hr = pResource->GetSharedHandle(&sharedHandle);
    if (!SUCCEEDED(hr))
    {
      std::cout << "Failed GetSharedHandle hr= " << hr << std::endl;
    }
    // Import the D3D11 Vertex Buffer into CUDA
//    d_VertexBufPtr = cudaImportVertexBuffer(sharedHandle, extMemory, g_WindowWidth, g_WindowHeight);
    pResource->Release();

    l_pKeyedMutex11->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    pResource->GetSharedHandle(&sharedHandle);
    // Import the D3D11 Keyed Mutex into CUDA
//    cudaImportKeyedMutex(sharedHandle, extSemaphore);
    pResource->Release();

    // Launch cuda kernel to generate sinewave in vertex buffer
//    RunSineWaveKernel(extSemaphore, key, INFINITE, g_WindowWidth, g_WindowWidth, d_VertexBufPtr, cuda_stream);

    // Draw the scene using them
    DrawScene(key, l_VertexBuffer, l_pKeyedMutex11);

//    checkCudaErrors(cudaFree(d_VertexBufPtr));
//    checkCudaErrors(cudaDestroyExternalMemory(extMemory));
//    checkCudaErrors(cudaDestroyExternalSemaphore(extSemaphore));

    l_pKeyedMutex11->Release();

    if (l_VertexBuffer)
    {
      l_VertexBuffer->Release();
    }
    return hr;
}

The whole runnable source code is here.

And if I re-enable only the cudaExternalSemaphore_t-related calls, as follows, the huge memory leak comes back.

HRESULT Render()
{
    /*static */uint64_t key = 0;

    ID3D11Buffer* l_VertexBuffer;
    IDXGIKeyedMutex* l_pKeyedMutex11;
    Vertex* d_VertexBufPtr = NULL;
    cudaExternalMemory_t extMemory;
    cudaExternalSemaphore_t extSemaphore;

    HRESULT hr = S_OK;

    D3D11_BUFFER_DESC bufferDesc;
    bufferDesc.Usage = D3D11_USAGE_DEFAULT;
    bufferDesc.ByteWidth = sizeof(Vertex) * g_WindowWidth * g_WindowHeight;
    bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
    bufferDesc.CPUAccessFlags = 0;
    bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX;

    hr = g_pd3dDevice->CreateBuffer(&bufferDesc, NULL, &l_VertexBuffer);
    AssertOrQuit(SUCCEEDED(hr));

    hr = l_VertexBuffer->QueryInterface(__uuidof(IDXGIKeyedMutex), (void**)&l_pKeyedMutex11);

    HANDLE sharedHandle;

    IDXGIResource1* pResource;
    l_VertexBuffer->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    hr = pResource->GetSharedHandle(&sharedHandle);
    if (!SUCCEEDED(hr))
    {
      std::cout << "Failed GetSharedHandle hr= " << hr << std::endl;
    }
    // Import the D3D11 Vertex Buffer into CUDA
//    d_VertexBufPtr = cudaImportVertexBuffer(sharedHandle, extMemory, g_WindowWidth, g_WindowHeight);
    pResource->Release();

    l_pKeyedMutex11->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource);
    pResource->GetSharedHandle(&sharedHandle);
    // Import the D3D11 Keyed Mutex into CUDA
    cudaImportKeyedMutex(sharedHandle, extSemaphore);
    pResource->Release();

    // Launch cuda kernel to generate sinewave in vertex buffer
//    RunSineWaveKernel(extSemaphore, key, INFINITE, g_WindowWidth, g_WindowWidth, d_VertexBufPtr, cuda_stream);

    // Draw the scene using them
    DrawScene(key, l_VertexBuffer, l_pKeyedMutex11);

//    checkCudaErrors(cudaFree(d_VertexBufPtr));
//    checkCudaErrors(cudaDestroyExternalMemory(extMemory));
    checkCudaErrors(cudaDestroyExternalSemaphore(extSemaphore));

    l_pKeyedMutex11->Release();

    if (l_VertexBuffer)
    {
      l_VertexBuffer->Release();
    }
    return hr;
}

The whole runnable source code is here.

So I suspect that cudaImportExternalSemaphore and/or cudaDestroyExternalSemaphore do not handle the IDXGIKeyedMutex correctly.

I tried calling l_pKeyedMutex11->Release() twice, which caused a runtime error.
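
A less destructive way to look for a lingering reference is to check the count that Release() returns; per the COM documentation that value is only meant for testing, but it does show whether something (for example, the CUDA import) still holds the buffer. A sketch:

// Diagnostic only: IUnknown::Release returns the resulting reference count.
ULONG remaining = l_VertexBuffer->Release();
std::cout << "ID3D11Buffer refcount after my Release(): " << remaining << std::endl;
// A non-zero value here means something else still holds a reference to the buffer.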

In my case there is no memory leak with the old CUDA Graphics Interoperability API. However, since an official slide says the new CUDA External Resource Interoperability is much faster than the old one, I'd like to use the new API.
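
For comparison, the old-API path I fall back to looks roughly like this (a sketch of the classic cudaGraphicsD3D11RegisterResource flow, with the vertex buffer created as a plain D3D11_BIND_VERTEX_BUFFER resource and no keyed mutex; it is not copied verbatim from my code):

// Requires #include <cuda_d3d11_interop.h>
cudaGraphicsResource_t cudaResource = NULL;
checkCudaErrors(cudaGraphicsD3D11RegisterResource(&cudaResource, l_VertexBuffer,
                                                  cudaGraphicsRegisterFlagsNone));

checkCudaErrors(cudaGraphicsMapResources(1, &cudaResource, cuda_stream));
Vertex* d_ptr = NULL;
size_t numBytes = 0;
checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void**)&d_ptr, &numBytes, cudaResource));

// ... launch the sinewave kernel on d_ptr ...

checkCudaErrors(cudaGraphicsUnmapResources(1, &cudaResource, cuda_stream));
checkCudaErrors(cudaGraphicsUnregisterResource(cudaResource));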

By the way, I suspect that NVIDIA's official simpleD3D11 sample forgets to call g_pKeyedMutex11->Release() in CleanUp(), which causes another memory leak.
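
If so, the fix would be to release the mutex along with the other globals, something like:

void CleanUp()
{
    // ... release the other global D3D11/DXGI objects as the sample already does ...
    if (g_pKeyedMutex11)
    {
        g_pKeyedMutex11->Release();   // this call appears to be missing in the sample
        g_pKeyedMutex11 = NULL;
    }
}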

Any comments would be very welcome.
Thank you.

Unfortunately no one commented on this, so I wrote a bug report and submitted it on July 26.
The official NVIDIA team then investigated it, and it turned out to be a real memory-leak bug in CUDA.
There is no release note mentioning the fix, but the bug appears to have been fixed in GeForce Game Ready Driver 496.13, released on October 12.
