CUDA-Vulkan buffer interop fails when buffer size <= 448KB

Fx5 · November 25, 2021, 3:20am

I am trying to access a Vulkan buffer with CUDA using the VK_KHR_EXTERNAL_MEMORY_EXTENSION. I can operate on the buffer without any issue as long as the buffer is greater than 448KB in size but when it is smaller, device to host cudaMemcpy cannot see data written with Vulkan, and Vulkan rendering does not see data uploaded through cudaMemcpy.
I am thinking there may be some cache that needs flushing but I could not find documentation anywhere on this topic.

Can anyone give me some insights on this issue?

Here are my system details, and I have a minimal code sample to reproduce the issue.

OS: tested on Ubuntu 20.04 and 18.04
Nvidia Driver: tested on 460.80, 470.63.01, 495.44
CUDA: 11.0
Vulkan SDK: 1.2.189.0
Compiler: g++ 10.3.0

Compile with g++ test.cpp -lvulkan -ldl -lcudart with appropriate include path and link path.

#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#define VULKAN_HPP_STORAGE_SHARED
#define VULKAN_HPP_STORAGE_SHARED_EXPORT

#define checkCudaErrors(call)                                                  \
  do {                                                                         \
    cudaError_t err = call;                                                    \
    if (err != cudaSuccess) {                                                  \
      fprintf(stderr, "CUDA error at %s %d: %s\n", __FILE__, __LINE__,         \
              cudaGetErrorString(err));                                        \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

#include <cuda_runtime.h>
#include <vulkan/vulkan.hpp>
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE

int main() {
  vk::ApplicationInfo appInfo("Vulkan", VK_MAKE_VERSION(0, 0, 1), "No Engine",
                              VK_MAKE_VERSION(0, 0, 1), VK_API_VERSION_1_1);
  std::vector<const char *> instanceExtensions = {
      VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME,
      VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME};

  vk::DynamicLoader dl;
  PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr =
      dl.getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
  VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);

  vk::InstanceCreateInfo createInfo({}, &appInfo, 0, nullptr,
                                    instanceExtensions.size(),
                                    instanceExtensions.data());
  vk::UniqueInstance instance = vk::createInstanceUnique(createInfo);
  VULKAN_HPP_DEFAULT_DISPATCHER.init(instance.get());

  float queuePriority = 0.0f;

  // pick physical device
  vk::PhysicalDevice physicalDevice;
  uint32_t familyIndex = ~0u;
  for (auto device : instance->enumeratePhysicalDevices()) {
    std::vector<vk::QueueFamilyProperties> queueFamilyProperties =
        device.getQueueFamilyProperties();
    for (uint32_t i = 0; i < queueFamilyProperties.size(); ++i) {
      if (queueFamilyProperties[i].queueFlags & vk::QueueFlagBits::eGraphics) {
        physicalDevice = device;
        familyIndex = i;
        break;
      }
    }
    if (familyIndex != ~0u) {
      break;
    }
  }

  vk::DeviceQueueCreateInfo deviceQueueCreateInfo({}, familyIndex, 1,
                                                  &queuePriority);
  std::vector<const char *> deviceExtensions{};
  deviceExtensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
  deviceExtensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME);
  deviceExtensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME);
  deviceExtensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME);
  vk::DeviceCreateInfo deviceInfo({}, deviceQueueCreateInfo, {},
                                  deviceExtensions);
  vk::UniqueDevice device = physicalDevice.createDeviceUnique(deviceInfo);
  VULKAN_HPP_DEFAULT_DISPATCHER.init(device.get());

  vk::UniqueCommandPool pool =
      device->createCommandPoolUnique(vk::CommandPoolCreateInfo(
          vk::CommandPoolCreateFlagBits::eResetCommandBuffer, familyIndex));

  size_t bufferSize = 448 * 1024;
  // adding 1 and it works
  // size_t bufferSize = 448 * 1024 + 1;

  // staging buffer
  vk::UniqueBuffer stagingBuffer = device->createBufferUnique(
      vk::BufferCreateInfo({}, bufferSize,
                           vk::BufferUsageFlagBits::eTransferDst |
                               vk::BufferUsageFlagBits::eTransferSrc));
  auto requirements = device->getBufferMemoryRequirements(stagingBuffer.get());

  uint32_t memoryType = ~0u;
  auto memProps = physicalDevice.getMemoryProperties();
  for (memoryType = 0; memoryType < memProps.memoryTypeCount; memoryType++) {
    if ((requirements.memoryTypeBits & (1 << memoryType)) &&
        (memProps.memoryTypes[memoryType].propertyFlags &
         (vk::MemoryPropertyFlagBits::eHostVisible |
          vk::MemoryPropertyFlagBits::eHostCoherent))) {
      break;
    }
  }
  auto stagingBufferMemory = device->allocateMemoryUnique(
      vk::MemoryAllocateInfo(requirements.size, memoryType));

  device->bindBufferMemory(stagingBuffer.get(), stagingBufferMemory.get(), 0);

  // create buffer for cuda interop
  vk::UniqueBuffer buffer = device->createBufferUnique(
      vk::BufferCreateInfo({}, bufferSize,
                           vk::BufferUsageFlagBits::eTransferDst |
                               vk::BufferUsageFlagBits::eTransferSrc));
  requirements = device->getBufferMemoryRequirements(buffer.get());

  memoryType = ~0u;
  memProps = physicalDevice.getMemoryProperties();
  for (memoryType = 0; memoryType < memProps.memoryTypeCount; memoryType++) {
    if ((requirements.memoryTypeBits & (1 << memoryType)) &&
        (memProps.memoryTypes[memoryType].propertyFlags &
         (vk::MemoryPropertyFlagBits::eDeviceLocal))) {
      break;
    }
  }
  auto memory = device->allocateMemoryUnique(
      vk::MemoryAllocateInfo(requirements.size, memoryType));

  device->bindBufferMemory(buffer.get(), memory.get(), 0);

  uint32_t deviceId = 0;
  checkCudaErrors(cudaSetDevice(deviceId));

  cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
  externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd;
  externalMemoryHandleDesc.size = requirements.size;

  int fd = -1;
  vk::MemoryGetFdInfoKHR vkMemoryGetFdInfoKHR;
  vkMemoryGetFdInfoKHR.setPNext(nullptr);
  vkMemoryGetFdInfoKHR.setMemory(memory.get());
  vkMemoryGetFdInfoKHR.setHandleType(
      vk::ExternalMemoryHandleTypeFlagBits::eOpaqueFd);

  fd = device->getMemoryFdKHR(vkMemoryGetFdInfoKHR);
  externalMemoryHandleDesc.handle.fd = fd;

  cudaExternalMemory_t cudaMem;
  void *cudaPtr;
  checkCudaErrors(
      cudaImportExternalMemory(&cudaMem, &externalMemoryHandleDesc));

  cudaExternalMemoryBufferDesc externalMemBufferDesc = {};
  externalMemBufferDesc.offset = 0;
  externalMemBufferDesc.size = requirements.size;
  externalMemBufferDesc.flags = 0;
  checkCudaErrors(cudaExternalMemoryGetMappedBuffer(&cudaPtr, cudaMem,
                                                    &externalMemBufferDesc));

  // upload with staging buffer
  void *mem = device->mapMemory(stagingBufferMemory.get(), 0, bufferSize);
  int data[4]{1, 2, 3, 4};
  memcpy(mem, data, 16);
  device->unmapMemory(stagingBufferMemory.get());
  vk::UniqueCommandBuffer cb =
      std::move(device
                    ->allocateCommandBuffersUnique(
                        {pool.get(), vk::CommandBufferLevel::ePrimary, 1})
                    .front());
  cb->begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
  cb->copyBuffer(stagingBuffer.get(), buffer.get(), vk::BufferCopy(0, 0, 16));
  cb->end();
  vk::UniqueFence fence = device->createFenceUnique({});
  device->getQueue(familyIndex, 0)
      .submit(vk::SubmitInfo(0, nullptr, nullptr, 1, &cb.get()), fence.get());
  auto result = device->waitForFences(fence.get(), VK_TRUE, UINT64_MAX);
  device->waitIdle();

  // download using cuda
  int data2[4]{9, 9, 9, 9};
  checkCudaErrors(cudaMemcpy(data2, cudaPtr, 16, cudaMemcpyDeviceToHost););

  printf("%d %d %d %d\n", data2[0], data2[1], data2[2], data2[3]);  // expecting to see 1 2 3 4

  return 0;
}

Fx5 · November 30, 2021, 4:24am

I switched to use VulkanMemoryAllocator and the problem seems gone (the concern now is that I do not have access to the memory block allocation size required by CUDA but passing it (offset+buffer size) seems fine).
However, it is not a solution since this allocator clearly allocates more than 448KB at a time, so I still cannot tell whether I did something wrong in memory allocation, or there was a driver issue.
I would still like to know what exactly happened. I post this information just in case somebody would have the same issue and no one replies to me in the future.

Topic		Replies	Views
Question about exporting CUDA memory into Vulkan CUDA Programming and Performance vulkan	0	561	December 25, 2023
Vulkan memory import size truncated on Windows Vulkan driver , gaming , windows-driver	3	353	April 30, 2025
CUDA->Vulkan interop shows (uninitialized memory?) artifacts depending on the value written by CUDA CUDA Programming and Performance cuda , vulkan	3	120	January 2, 2026
CUDA Vulkan VkImage Interop CUDA Programming and Performance cuda	3	2263	January 27, 2024
Problem with allocating big buffer CUDA Programming and Performance	8	9013	November 30, 2010
CUDA-Vulkan image interop broken on Windows CUDA Programming and Performance cuda , vulkan	3	209	February 2, 2026
vkCmdFillBuffer bugged on ada Vulkan	10	1451	January 10, 2024
MMAPI samples: v4l2cuda buffer size question for zero copy Jetson AGX Xavier mmapi	3	475	June 8, 2022
[Jetson] Sharing Vulkan buffer data to NVidia video encoder (V4L2) Jetson TX2 encoder , vulkan , jetson	3	1113	April 6, 2022
Editing Vulkan Image in CUDA - external memory and interop problem CUDA Programming and Performance vulkan	0	987	April 12, 2021

CUDA-Vulkan buffer interop fails when buffer size <= 448KB

Related topics