CUDA Vulkan VkImage Interop

I’ve been working on CUDA interop for our engine. So far I’ve got DX12 working fine, producing correct images with a very simple kernel that just outputs UV as color:

// 32-bit RGBA pixel viewed three ways over the same 4 bytes: as one packed
// uint32_t, as a CUDA uchar4, or as four named 8-bit channels.
// NOTE(review): reading a member other than the one last written is
// type-punning — accepted by CUDA/C compilers, but formally UB in ISO C++.
// The r,g,b,a byte order matches little-endian RGBA8 surface formats —
// TODO confirm against the actual pixel_format used for the VkImage.
union RGBA32 {
    uint32_t d;   // whole texel as one packed word (what gets surf2Dwrite'n)
    uchar4 v;     // CUDA vector-type view of the same 4 bytes
    struct {
        uint8_t r, g, b, a;
    } c;
};


// Debug kernel: fills a 2D surface with a UV gradient
// (r = x & 0xFF, g = y & 0xFF, scaled to [0,1]; b = 0; a = 1).
// Each thread writes a 2x2 pixel quad, so the launch grid only needs to
// cover (nWidth/2) x (nHeight/2) threads.
// surf2Dwrite takes the x coordinate in BYTES (hence * sizeof(Rgb)) and
// the y coordinate in rows; Rgb only contributes its size here.
// NOTE(review): the guard below drops the entire quad when any of its
// pixels would be out of bounds, so the last row/column of odd-sized
// images is never written.
// rgbaFloatToInt is defined elsewhere — presumably packs a [0,1] float4
// into a 32-bit RGBA value; confirm its byte order matches the surface.
template<class Rgb>
__global__ static void Kernel(cudaSurfaceObject_t surface, int nWidth, int nHeight) {
    // (x, y) is the top-left pixel of this thread's 2x2 quad.
    int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
    int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
    if (x + 1 >= nWidth || y + 1 >= nHeight) {
        return;
    }

    float4 rgba{};
    rgba.x = (x & 0xFF) / 255.0f;
    rgba.y = (y & 0xFF) / 255.0f;
    rgba.z = 0.0f;
    rgba.w = 1.0f;
    int color = rgbaFloatToInt(rgba);
    // Same color for all four pixels of the quad.
    surf2Dwrite(color, surface, x * sizeof(Rgb), y);
    surf2Dwrite(color, surface, x * sizeof(Rgb), y + 1);
    surf2Dwrite(color, surface, (x + 1) * sizeof(Rgb), y);
    surf2Dwrite(color, surface, (x + 1) * sizeof(Rgb), y + 1);
}

// Launches Kernel<RGBA32> on the given stream. Each thread paints a 2x2
// pixel quad, so the grid covers (nWidth/2) x (nHeight/2) threads using
// 16x8 thread blocks. Integer division truncates, so dimensions that are
// not multiples of 32 (x) / 16 (y) leave edge pixels unwritten.
void launch_kernel(cudaStream_t stream, cudaSurfaceObject_t surface, int nWidth, int nHeight) {
    const dim3 blockDim(16, 8);
    const dim3 gridDim(nWidth / (2 * blockDim.x), nHeight / (2 * blockDim.y));
    Kernel<RGBA32><<<gridDim, blockDim, 0, stream>>>(surface, nWidth, nHeight);
}

When I use the Vulkan backend with CUDA interop I get the first image. When I use the DX12 backend I get the second image.


I’ve been following this sample for Vulkan interop, and this one for DX12 interop.

This is how I allocate new image with Shared flag (This is not the whole process, but the relevant parts):

// Describe a plain 2D, single-mip, single-layer sampled image.
VkImageCreateInfo image_info{};
image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
image_info.imageType = VK_IMAGE_TYPE_2D;
image_info.extent.width = static_cast<uint32_t>(width);
image_info.extent.height = static_cast<uint32_t>(height);
image_info.extent.depth = 1;
image_info.mipLevels = 1;
image_info.arrayLayers = 1;
image_info.format = pixel_format;
image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
image_info.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
image_info.samples = VK_SAMPLE_COUNT_1_BIT;

// Must outlive vkCreateImage since image_info.pNext points at it below.
VkExternalMemoryImageCreateInfo vkExternalMemImageCreateInfo = {};
vkExternalMemImageCreateInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
if (is_shared)
{
    // NOTE(review): these two assignments repeat the values already set
    // above — both lines are redundant.
    image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
    image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;

    // Advertise that this image's memory will be exported as an opaque
    // Win32 NT handle; must match the handle type used at allocation,
    // export, and CUDA import time.
    vkExternalMemImageCreateInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
    image_info.pNext = &vkExternalMemImageCreateInfo;
}

// ...
// ...
// Memory allocation

// Allocate and bind device memory for the image, optionally exportable.
// NOTE(review): there is no VkMemoryDedicatedAllocateInfo(KHR) anywhere in
// this pNext chain, i.e. this is NOT a dedicated allocation. If the CUDA
// import side sets cudaExternalMemoryDedicated, the driver may assume a
// different (dedicated-image) memory layout than Vulkan used — the two
// sides must agree on this. Verify against the CUDA import code.
VkMemoryAllocateInfo allocInfo{};
allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
allocInfo.allocationSize = mem_requirements.size;

VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR{ VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR };

#ifdef _WIN32
    WindowsSecurityAttributes win_security_attributes;
    VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR{ VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR };
#endif
if (usage_shared)
{
    #ifdef _WIN32
        vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &win_security_attributes;
        // NOTE(review): DXGI_SHARED_RESOURCE_* access bits are defined for
        // D3D shared resources; with an OPAQUE_WIN32 (non-D3D) handle their
        // applicability is questionable — confirm against the Vulkan
        // VK_KHR_external_memory_win32 spec / the NVIDIA interop samples.
        vulkanExportMemoryWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE;

        vulkanExportMemoryAllocateInfoKHR.pNext = &vulkanExportMemoryWin32HandleInfoKHR;
        vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
    #else
        // POSIX platforms export the allocation as an opaque file descriptor.
        vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
    #endif

    allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR;
}

// Shared/exported memory must be device-local for the external API to see
// the same physical allocation; integrated GPUs may otherwise skip the flag.
VkMemoryPropertyFlags flags = 0;
if (getGPUType() == GPU_TYPE_DISCRETE || usage_render || usage_shared)
    flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;

allocInfo.memoryTypeIndex = findMemoryType(mem_requirements.memoryTypeBits, flags);

// NOTE(review): both VkResult return values are ignored here — a failed
// allocation or bind would go unnoticed until CUDA import fails later.
vkAllocateMemory(vk_device, &allocInfo, nullptr, &retval->allocation.memory);
vkBindImageMemory(vk_device, retval->image, retval->allocation.memory, 0);
retval->size = allocInfo.allocationSize;

Export vulkan memory as NT shared handle

// Export the allocation as an opaque Win32 NT handle for CUDA to import.
// The handle type must match what was declared at image creation and at
// vkAllocateMemory (VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT).
// NOTE(review): the returned NT handle is owned by the caller and must be
// CloseHandle()'d after cudaImportExternalMemory, or it leaks.
VkMemoryGetWin32HandleInfoKHR desc{};
desc.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR;
desc.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
desc.memory = allocation.memory;


// NOTE(review): VkResult ignored — check it before using the handle.
vkGetMemoryWin32HandleKHR(getDevice(), &desc, &external_memory_shared_handle);

Importing vulkan memory to cuda

// Import the exported Vulkan allocation into CUDA as external memory.
// The opaque-Win32 type mirrors VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT.
cudaExternalMemoryHandleDesc cuExtmemHandleDesc{};
cuExtmemHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
cuExtmemHandleDesc.handle.win32.handle = external_memory->getWin32Handle();
cuExtmemHandleDesc.size = texture->getVideoMemorySize();
// NOTE(review): cudaExternalMemoryDedicated asserts the Vulkan allocation
// was made with VkMemoryDedicatedAllocateInfo(KHR). The allocation code
// shown above never chains that struct, so this flag tells the CUDA driver
// to assume a dedicated-image memory layout Vulkan did not actually use —
// a classic cause of striped/garbled surf2D output. Either drop this flag
// or make the Vulkan allocation dedicated; the two sides must match.
cuExtmemHandleDesc.flags = cudaExternalMemoryDedicated;

// NOTE(review): error-check helper is `cuCheck` here but `ck` below —
// presumably aliases of the same macro; unify the naming.
cuCheck(cudaImportExternalMemory(&external_memory, &cuExtmemHandleDesc));

// Create CUDA Surface Interop: view the imported memory as a mipmapped
// array, take level 0, and wrap it in a surface object for surf2Dwrite.
{
    cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
    // Depth 0 (not 1) is deliberate: 0 means a true 2D array; a depth of 1
    // would describe a width x height x 1 3D texture, which may use a
    // different memory layout and break surf2D accesses.
    cuExtmemMipDesc.extent = make_cudaExtent(texture->getWidth(), texture->getHeight(), 0);
    // 8-bit x 4 unsigned channels — must match the VkImage pixel_format
    // (e.g. VK_FORMAT_R8G8B8A8_UNORM); TODO confirm.
    cuExtmemMipDesc.formatDesc = { 8, 8, 8, 8, cudaChannelFormatKindUnsigned };
    cuExtmemMipDesc.numLevels = 1;
    cuExtmemMipDesc.flags = cudaArraySurfaceLoadStore | cudaArrayColorAttachment;

    cudaMipmappedArray_t cuMipArray{};
    ck(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, external_memory, &cuExtmemMipDesc));

    // Level 0 is the only mip (numLevels == 1).
    cudaArray_t cuArray{};
    ck(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));

    cudaResourceDesc cuResDesc{};
    cuResDesc.resType = cudaResourceTypeArray;
    cuResDesc.res.array.array = cuArray;
    // NOTE(review): surface, mip array, and external memory all need matching
    // teardown (cudaDestroySurfaceObject / cudaFreeMipmappedArray /
    // cudaDestroyExternalMemory) — not shown here; verify it exists.
    ck(cudaCreateSurfaceObject(&surface, &cuResDesc));
}

I am currently stuck on the Vulkan interop and not sure what I should check or look for next.
I have already triple-checked the way I import/export/allocate memory.
If anyone can at least give me some directions on what to check, and whether I should revisit semaphores or barriers once more, I would appreciate it.

P.S. Making the image with VK_IMAGE_TILING_LINEAR instead of VK_IMAGE_TILING_OPTIMAL and writing to it directly without using surf2Dwrite worked perfectly, for some reason…

P.P.S. I tried adding the extra complexity of transferring queue ownership to VK_QUEUE_FAMILY_EXTERNAL and back, but it had no effect on the output.

P.P.P.S. Also, I already transition my texture into VK_IMAGE_LAYOUT_GENERAL right after I create it.

P.P.P.P.S. If I use VK_IMAGE_TILING_LINEAR and map the VkImage as a buffer like so:

// Alternative mapping: expose the whole imported allocation as a flat
// device buffer instead of a CUDA array. Raw-pointer writes bypass any
// driver-internal tiling, so this is only meaningful when the image was
// created with VK_IMAGE_TILING_LINEAR.
cudaExternalMemoryBufferDesc buffer_desc{};
buffer_desc.size = cuExtmemHandleDesc.size;  // whole allocation; offset defaults to 0

ck(cudaExternalMemoryGetMappedBuffer(&devptr, external_memory, &buffer_desc));

And then write raw bytes in the kernel, it works as expected. But I am not really satisfied with this, because it would require two versions of each kernel and two separate ways to map data externally for each of the APIs.

Also want to point out that I have modified Vulkan → CUDA sample the way my engine’s backend issues vulkan and cuda calls and it works fine. So I am not sure what to look for, since I already checked allocations, import/export memory and semas and neither CUDA nor Vulkan can tell me what’s wrong I am doing here. I consider starting from the ground up so I can check everything again…

No disrespect to any of the followers here, but you might get a better response posting this in these forums.

Hi @all500234765! Alexey Panteleev and I ran into similar issues while working on an API interop. There were two hard-to-find things we ran into that changed how memory layouts work.

  1. Probably the most likely one you’re running into – in Vulkan, if the image wasn’t created with a dedicated allocation (i.e. using VK_KHR_dedicated_allocation’s VkMemoryDedicatedAllocateInfoKHR in the VkMemoryAllocateInfo’s pNext chain), then the CUDA external memory handle must not use the cudaExternalMemoryDedicated flag. I’d try removing the cuExtmemHandleDesc.flags = cudaExternalMemoryDedicated; line in the CUDA–Vulkan path and see if that fixes things.

The artifacts from this one usually look like glitchy vertical stripes, which sort of match what’s going on here:

The underlying reason there’s this requirement is because if the driver knows that an image uses a dedicated allocation, then it knows there’s only one image in an allocation and that that image has offset 0, which allows it to do different optimizations, including a different image layout in memory. (Thanks to Vivek Kini for this info).

  2. (Including this one for completeness; the code sample above avoids it, but it might be useful to someone else who’s reading this, since I ran into it.) The two APIs must agree on the depth of the image – in particular, one must be careful to use a depth of 0 (instead of 1) for a 2D CUDA image. If it’s 1, then that’s a 3D width x height x 1 texture, which may use a different layout (and will produce incorrect results if accessed using surf2D()).

The artifacts for this one usually have some “holes” in a periodic pattern at some resolutions:

Hope this helps!