CUDA Vulkan VkImage Interop

I’ve been working on CUDA interop for our engine. So far I’ve got DX12 working fine, producing correct images with a very simple kernel that just outputs UV as color:

// 32-bit RGBA pixel viewed three ways over the same 4 bytes: as one packed
// uint32_t, as a CUDA uchar4, or as four named 8-bit channels.
// NOTE(review): reading a member other than the one last written is
// type-punning — accepted by CUDA/C compilers, but formally UB in ISO C++.
// The r,g,b,a byte order matches little-endian RGBA8 surface formats —
// TODO confirm against the actual pixel_format used for the VkImage.
union RGBA32 {
    uint32_t d;   // whole texel as one packed word (what gets surf2Dwrite'n)
    uchar4 v;     // CUDA vector-type view of the same 4 bytes
    struct {
        uint8_t r, g, b, a;
    } c;
};


// Debug kernel: fills a 2D surface with a UV gradient
// (r = x & 0xFF, g = y & 0xFF, scaled to [0,1]; b = 0; a = 1).
// Each thread writes a 2x2 pixel quad, so the launch grid only needs to
// cover (nWidth/2) x (nHeight/2) threads.
// surf2Dwrite takes the x coordinate in BYTES (hence * sizeof(Rgb)) and
// the y coordinate in rows; Rgb only contributes its size here.
// NOTE(review): the guard below drops the entire quad when any of its
// pixels would be out of bounds, so the last row/column of odd-sized
// images is never written.
// rgbaFloatToInt is defined elsewhere — presumably packs a [0,1] float4
// into a 32-bit RGBA value; confirm its byte order matches the surface.
template<class Rgb>
__global__ static void Kernel(cudaSurfaceObject_t surface, int nWidth, int nHeight) {
    // (x, y) is the top-left pixel of this thread's 2x2 quad.
    int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
    int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
    if (x + 1 >= nWidth || y + 1 >= nHeight) {
        return;
    }

    float4 rgba{};
    rgba.x = (x & 0xFF) / 255.0f;
    rgba.y = (y & 0xFF) / 255.0f;
    rgba.z = 0.0f;
    rgba.w = 1.0f;
    int color = rgbaFloatToInt(rgba);
    // Same color for all four pixels of the quad.
    surf2Dwrite(color, surface, x * sizeof(Rgb), y);
    surf2Dwrite(color, surface, x * sizeof(Rgb), y + 1);
    surf2Dwrite(color, surface, (x + 1) * sizeof(Rgb), y);
    surf2Dwrite(color, surface, (x + 1) * sizeof(Rgb), y + 1);
}

// Launches Kernel<RGBA32> on the given stream. Each thread paints a 2x2
// pixel quad, so the grid covers (nWidth/2) x (nHeight/2) threads using
// 16x8 thread blocks. Integer division truncates, so dimensions that are
// not multiples of 32 (x) / 16 (y) leave edge pixels unwritten.
void launch_kernel(cudaStream_t stream, cudaSurfaceObject_t surface, int nWidth, int nHeight) {
    const dim3 blockDim(16, 8);
    const dim3 gridDim(nWidth / (2 * blockDim.x), nHeight / (2 * blockDim.y));
    Kernel<RGBA32><<<gridDim, blockDim, 0, stream>>>(surface, nWidth, nHeight);
}

When I use the Vulkan backend with CUDA interop I get the first image. When I use the DX12 backend I get the second image.


I’ve been following this sample for Vulkan interop, and this one for DX12 interop.

This is how I allocate new image with Shared flag (This is not the whole process, but the relevant parts):

// Describe a plain 2D, single-mip, single-layer sampled image.
VkImageCreateInfo image_info{};
image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
image_info.imageType = VK_IMAGE_TYPE_2D;
image_info.extent.width = static_cast<uint32_t>(width);
image_info.extent.height = static_cast<uint32_t>(height);
image_info.extent.depth = 1;
image_info.mipLevels = 1;
image_info.arrayLayers = 1;
image_info.format = pixel_format;
image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
image_info.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
image_info.samples = VK_SAMPLE_COUNT_1_BIT;

// Must outlive vkCreateImage since image_info.pNext points at it below.
VkExternalMemoryImageCreateInfo vkExternalMemImageCreateInfo = {};
vkExternalMemImageCreateInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
if (is_shared)
{
    // NOTE(review): these two assignments repeat the values already set
    // above — both lines are redundant.
    image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
    image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;

    // Advertise that this image's memory will be exported as an opaque
    // Win32 NT handle; must match the handle type used at allocation,
    // export, and CUDA import time.
    vkExternalMemImageCreateInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
    image_info.pNext = &vkExternalMemImageCreateInfo;
}

// ...
// ...
// Memory allocation

// Allocate and bind device memory for the image, optionally exportable.
// NOTE(review): there is no VkMemoryDedicatedAllocateInfo(KHR) anywhere in
// this pNext chain, i.e. this is NOT a dedicated allocation. If the CUDA
// import side sets cudaExternalMemoryDedicated, the driver may assume a
// different (dedicated-image) memory layout than Vulkan used — the two
// sides must agree on this. Verify against the CUDA import code.
VkMemoryAllocateInfo allocInfo{};
allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
allocInfo.allocationSize = mem_requirements.size;

VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR{ VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR };

#ifdef _WIN32
    WindowsSecurityAttributes win_security_attributes;
    VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR{ VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR };
#endif
if (usage_shared)
{
    #ifdef _WIN32
        vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &win_security_attributes;
        // NOTE(review): DXGI_SHARED_RESOURCE_* access bits are defined for
        // D3D shared resources; with an OPAQUE_WIN32 (non-D3D) handle their
        // applicability is questionable — confirm against the Vulkan
        // VK_KHR_external_memory_win32 spec / the NVIDIA interop samples.
        vulkanExportMemoryWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE;

        vulkanExportMemoryAllocateInfoKHR.pNext = &vulkanExportMemoryWin32HandleInfoKHR;
        vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
    #else
        // POSIX platforms export the allocation as an opaque file descriptor.
        vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
    #endif

    allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR;
}

// Shared/exported memory must be device-local for the external API to see
// the same physical allocation; integrated GPUs may otherwise skip the flag.
VkMemoryPropertyFlags flags = 0;
if (getGPUType() == GPU_TYPE_DISCRETE || usage_render || usage_shared)
    flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;

allocInfo.memoryTypeIndex = findMemoryType(mem_requirements.memoryTypeBits, flags);

// NOTE(review): both VkResult return values are ignored here — a failed
// allocation or bind would go unnoticed until CUDA import fails later.
vkAllocateMemory(vk_device, &allocInfo, nullptr, &retval->allocation.memory);
vkBindImageMemory(vk_device, retval->image, retval->allocation.memory, 0);
retval->size = allocInfo.allocationSize;

Export vulkan memory as NT shared handle

// Export the allocation as an opaque Win32 NT handle for CUDA to import.
// The handle type must match what was declared at image creation and at
// vkAllocateMemory (VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT).
// NOTE(review): the returned NT handle is owned by the caller and must be
// CloseHandle()'d after cudaImportExternalMemory, or it leaks.
VkMemoryGetWin32HandleInfoKHR desc{};
desc.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR;
desc.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
desc.memory = allocation.memory;


// NOTE(review): VkResult ignored — check it before using the handle.
vkGetMemoryWin32HandleKHR(getDevice(), &desc, &external_memory_shared_handle);

Importing vulkan memory to cuda

// Import the exported Vulkan allocation into CUDA as external memory.
// The opaque-Win32 type mirrors VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT.
cudaExternalMemoryHandleDesc cuExtmemHandleDesc{};
cuExtmemHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
cuExtmemHandleDesc.handle.win32.handle = external_memory->getWin32Handle();
cuExtmemHandleDesc.size = texture->getVideoMemorySize();
// NOTE(review): cudaExternalMemoryDedicated asserts the Vulkan allocation
// was made with VkMemoryDedicatedAllocateInfo(KHR). The allocation code
// shown above never chains that struct, so this flag tells the CUDA driver
// to assume a dedicated-image memory layout Vulkan did not actually use —
// a classic cause of striped/garbled surf2D output. Either drop this flag
// or make the Vulkan allocation dedicated; the two sides must match.
cuExtmemHandleDesc.flags = cudaExternalMemoryDedicated;

// NOTE(review): error-check helper is `cuCheck` here but `ck` below —
// presumably aliases of the same macro; unify the naming.
cuCheck(cudaImportExternalMemory(&external_memory, &cuExtmemHandleDesc));

// Create CUDA Surface Interop: view the imported memory as a mipmapped
// array, take level 0, and wrap it in a surface object for surf2Dwrite.
{
    cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
    // Depth 0 (not 1) is deliberate: 0 means a true 2D array; a depth of 1
    // would describe a width x height x 1 3D texture, which may use a
    // different memory layout and break surf2D accesses.
    cuExtmemMipDesc.extent = make_cudaExtent(texture->getWidth(), texture->getHeight(), 0);
    // 8-bit x 4 unsigned channels — must match the VkImage pixel_format
    // (e.g. VK_FORMAT_R8G8B8A8_UNORM); TODO confirm.
    cuExtmemMipDesc.formatDesc = { 8, 8, 8, 8, cudaChannelFormatKindUnsigned };
    cuExtmemMipDesc.numLevels = 1;
    cuExtmemMipDesc.flags = cudaArraySurfaceLoadStore | cudaArrayColorAttachment;

    cudaMipmappedArray_t cuMipArray{};
    ck(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, external_memory, &cuExtmemMipDesc));

    // Level 0 is the only mip (numLevels == 1).
    cudaArray_t cuArray{};
    ck(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));

    cudaResourceDesc cuResDesc{};
    cuResDesc.resType = cudaResourceTypeArray;
    cuResDesc.res.array.array = cuArray;
    // NOTE(review): surface, mip array, and external memory all need matching
    // teardown (cudaDestroySurfaceObject / cudaFreeMipmappedArray /
    // cudaDestroyExternalMemory) — not shown here; verify it exists.
    ck(cudaCreateSurfaceObject(&surface, &cuResDesc));
}

I am currently stuck on the Vulkan interop and not sure what I should check or look for next.
I have already triple-checked the way I import/export/allocate memory.
If anyone can at least give me some directions on what to check, and whether I should revisit semaphores or barriers once more, I would appreciate it.

P.S. Making the image with VK_IMAGE_TILING_LINEAR instead of VK_IMAGE_TILING_OPTIMAL and writing to it directly without using surf2Dwrite worked perfectly, for some reason…

P.P.S. I tried adding the extra complexity of transferring queue ownership to VK_QUEUE_FAMILY_EXTERNAL and back, but it had no effect on the output.

P.P.P.S. Also, I already transition my texture into VK_IMAGE_LAYOUT_GENERAL right after I create it.

P.P.P.P.S. If I use VK_IMAGE_TILING_LINEAR and map the VkImage as a buffer like so:

// Alternative mapping: expose the whole imported allocation as a flat
// device buffer instead of a CUDA array. Raw-pointer writes bypass any
// driver-internal tiling, so this is only meaningful when the image was
// created with VK_IMAGE_TILING_LINEAR.
cudaExternalMemoryBufferDesc buffer_desc{};
buffer_desc.size = cuExtmemHandleDesc.size;  // whole allocation; offset defaults to 0

ck(cudaExternalMemoryGetMappedBuffer(&devptr, external_memory, &buffer_desc));

And then write raw bytes in the kernel, it works as expected. But I am not really satisfied with this, because it would require two versions of each kernel and two separate ways to map data externally for each of the APIs.

Also want to point out that I have modified Vulkan → CUDA sample the way my engine’s backend issues vulkan and cuda calls and it works fine. So I am not sure what to look for, since I already checked allocations, import/export memory and semas and neither CUDA nor Vulkan can tell me what’s wrong I am doing here. I consider starting from the ground up so I can check everything again…

No disrespect to any of the followers here, but you might get a better response posting this in these forums.

Hi @all500234765! Alexey Panteleev and I ran into similar issues while working on an API interop. There were two hard-to-find things we ran into that changed how memory layouts work.

  1. Probably the most likely one you’re running into – in Vulkan, if the image wasn’t created with a dedicated allocation (i.e. using VK_KHR_dedicated_allocation’s VkMemoryDedicatedAllocateInfoKHR in the VkMemoryAllocateInfo’s pNext chain), then the CUDA external memory handle must not use the cudaExternalMemoryDedicated flag. I’d try removing the cuExtmemHandleDesc.flags = cudaExternalMemoryDedicated; line in the CUDA–Vulkan path and see if that fixes things.

The artifacts from this one usually look like glitchy vertical stripes, which sort of match what’s going on here:

The underlying reason there’s this requirement is because if the driver knows that an image uses a dedicated allocation, then it knows there’s only one image in an allocation and that that image has offset 0, which allows it to do different optimizations, including a different image layout in memory. (Thanks to Vivek Kini for this info).

  2. (Including this one for completeness; the code sample above avoids it, but it might be useful to someone else who’s reading this, since I ran into it.) The two APIs must agree on the depth of the image – in particular, one must be careful to use a depth of 0 (instead of 1) for a 2D CUDA image. If it’s 1, then that’s a 3D width x height x 1 texture, which may use a different layout (and will produce incorrect results if accessed using surf2D()).

The artifacts for this one usually have some “holes” in a periodic pattern at some resolutions:

Hope this helps!