CUDA-Vulkan buffer interop fails when buffer size <= 448KB

I am trying to access a Vulkan buffer with CUDA using the VK_KHR_EXTERNAL_MEMORY_EXTENSION. I can operate on the buffer without any issue as long as the buffer is greater than 448KB in size but when it is smaller, device to host cudaMemcpy cannot see data written with Vulkan, and Vulkan rendering does not see data uploaded through cudaMemcpy.
I am thinking there may be some cache that needs flushing but I could not find documentation anywhere on this topic.

Can anyone give me some insights on this issue?

Here are my system details, and I have a minimal code sample to reproduce the issue.

OS: tested on Ubuntu 20.04 and 18.04
Nvidia Driver: tested on 460.80, 470.63.01, 495.44
CUDA: 11.0
Vulkan SDK: 1.2.189.0
Compiler: g++ 10.3.0

Compile with g++ test.cpp -lvulkan -ldl -lcudart with appropriate include path and link path.

#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#define VULKAN_HPP_STORAGE_SHARED
#define VULKAN_HPP_STORAGE_SHARED_EXPORT

#define checkCudaErrors(call)                                                  \
  do {                                                                         \
    cudaError_t err = call;                                                    \
    if (err != cudaSuccess) {                                                  \
      fprintf(stderr, "CUDA error at %s %d: %s\n", __FILE__, __LINE__,         \
              cudaGetErrorString(err));                                        \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

#include <cuda_runtime.h>
#include <vulkan/vulkan.hpp>
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE

int main() {
  vk::ApplicationInfo appInfo("Vulkan", VK_MAKE_VERSION(0, 0, 1), "No Engine",
                              VK_MAKE_VERSION(0, 0, 1), VK_API_VERSION_1_1);
  std::vector<const char *> instanceExtensions = {
      VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME,
      VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME};

  vk::DynamicLoader dl;
  PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr =
      dl.getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
  VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);

  vk::InstanceCreateInfo createInfo({}, &appInfo, 0, nullptr,
                                    instanceExtensions.size(),
                                    instanceExtensions.data());
  vk::UniqueInstance instance = vk::createInstanceUnique(createInfo);
  VULKAN_HPP_DEFAULT_DISPATCHER.init(instance.get());

  float queuePriority = 0.0f;

  // pick physical device
  vk::PhysicalDevice physicalDevice;
  uint32_t familyIndex = ~0u;
  for (auto device : instance->enumeratePhysicalDevices()) {
    std::vector<vk::QueueFamilyProperties> queueFamilyProperties =
        device.getQueueFamilyProperties();
    for (uint32_t i = 0; i < queueFamilyProperties.size(); ++i) {
      if (queueFamilyProperties[i].queueFlags & vk::QueueFlagBits::eGraphics) {
        physicalDevice = device;
        familyIndex = i;
        break;
      }
    }
    if (familyIndex != ~0u) {
      break;
    }
  }

  vk::DeviceQueueCreateInfo deviceQueueCreateInfo({}, familyIndex, 1,
                                                  &queuePriority);
  std::vector<const char *> deviceExtensions{};
  deviceExtensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
  deviceExtensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME);
  deviceExtensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME);
  deviceExtensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME);
  vk::DeviceCreateInfo deviceInfo({}, deviceQueueCreateInfo, {},
                                  deviceExtensions);
  vk::UniqueDevice device = physicalDevice.createDeviceUnique(deviceInfo);
  VULKAN_HPP_DEFAULT_DISPATCHER.init(device.get());

  vk::UniqueCommandPool pool =
      device->createCommandPoolUnique(vk::CommandPoolCreateInfo(
          vk::CommandPoolCreateFlagBits::eResetCommandBuffer, familyIndex));

  size_t bufferSize = 448 * 1024;
  // adding 1 and it works
  // size_t bufferSize = 448 * 1024 + 1;

  // staging buffer
  vk::UniqueBuffer stagingBuffer = device->createBufferUnique(
      vk::BufferCreateInfo({}, bufferSize,
                           vk::BufferUsageFlagBits::eTransferDst |
                               vk::BufferUsageFlagBits::eTransferSrc));
  auto requirements = device->getBufferMemoryRequirements(stagingBuffer.get());

  uint32_t memoryType = ~0u;
  auto memProps = physicalDevice.getMemoryProperties();
  for (memoryType = 0; memoryType < memProps.memoryTypeCount; memoryType++) {
    if ((requirements.memoryTypeBits & (1 << memoryType)) &&
        (memProps.memoryTypes[memoryType].propertyFlags &
         (vk::MemoryPropertyFlagBits::eHostVisible |
          vk::MemoryPropertyFlagBits::eHostCoherent))) {
      break;
    }
  }
  auto stagingBufferMemory = device->allocateMemoryUnique(
      vk::MemoryAllocateInfo(requirements.size, memoryType));

  device->bindBufferMemory(stagingBuffer.get(), stagingBufferMemory.get(), 0);

  // create buffer for cuda interop
  vk::UniqueBuffer buffer = device->createBufferUnique(
      vk::BufferCreateInfo({}, bufferSize,
                           vk::BufferUsageFlagBits::eTransferDst |
                               vk::BufferUsageFlagBits::eTransferSrc));
  requirements = device->getBufferMemoryRequirements(buffer.get());

  memoryType = ~0u;
  memProps = physicalDevice.getMemoryProperties();
  for (memoryType = 0; memoryType < memProps.memoryTypeCount; memoryType++) {
    if ((requirements.memoryTypeBits & (1 << memoryType)) &&
        (memProps.memoryTypes[memoryType].propertyFlags &
         (vk::MemoryPropertyFlagBits::eDeviceLocal))) {
      break;
    }
  }
  auto memory = device->allocateMemoryUnique(
      vk::MemoryAllocateInfo(requirements.size, memoryType));

  device->bindBufferMemory(buffer.get(), memory.get(), 0);

  uint32_t deviceId = 0;
  checkCudaErrors(cudaSetDevice(deviceId));

  cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
  externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd;
  externalMemoryHandleDesc.size = requirements.size;

  int fd = -1;
  vk::MemoryGetFdInfoKHR vkMemoryGetFdInfoKHR;
  vkMemoryGetFdInfoKHR.setPNext(nullptr);
  vkMemoryGetFdInfoKHR.setMemory(memory.get());
  vkMemoryGetFdInfoKHR.setHandleType(
      vk::ExternalMemoryHandleTypeFlagBits::eOpaqueFd);

  fd = device->getMemoryFdKHR(vkMemoryGetFdInfoKHR);
  externalMemoryHandleDesc.handle.fd = fd;

  cudaExternalMemory_t cudaMem;
  void *cudaPtr;
  checkCudaErrors(
      cudaImportExternalMemory(&cudaMem, &externalMemoryHandleDesc));

  cudaExternalMemoryBufferDesc externalMemBufferDesc = {};
  externalMemBufferDesc.offset = 0;
  externalMemBufferDesc.size = requirements.size;
  externalMemBufferDesc.flags = 0;
  checkCudaErrors(cudaExternalMemoryGetMappedBuffer(&cudaPtr, cudaMem,
                                                    &externalMemBufferDesc));

  // upload with staging buffer
  void *mem = device->mapMemory(stagingBufferMemory.get(), 0, bufferSize);
  int data[4]{1, 2, 3, 4};
  memcpy(mem, data, 16);
  device->unmapMemory(stagingBufferMemory.get());
  vk::UniqueCommandBuffer cb =
      std::move(device
                    ->allocateCommandBuffersUnique(
                        {pool.get(), vk::CommandBufferLevel::ePrimary, 1})
                    .front());
  cb->begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
  cb->copyBuffer(stagingBuffer.get(), buffer.get(), vk::BufferCopy(0, 0, 16));
  cb->end();
  vk::UniqueFence fence = device->createFenceUnique({});
  device->getQueue(familyIndex, 0)
      .submit(vk::SubmitInfo(0, nullptr, nullptr, 1, &cb.get()), fence.get());
  auto result = device->waitForFences(fence.get(), VK_TRUE, UINT64_MAX);
  device->waitIdle();

  // download using cuda
  int data2[4]{9, 9, 9, 9};
  checkCudaErrors(cudaMemcpy(data2, cudaPtr, 16, cudaMemcpyDeviceToHost););

  printf("%d %d %d %d\n", data2[0], data2[1], data2[2], data2[3]);  // expecting to see 1 2 3 4

  return 0;
}

I switched to use VulkanMemoryAllocator and the problem seems gone (the concern now is that I do not have access to the memory block allocation size required by CUDA but passing it (offset+buffer size) seems fine).
However, it is not a solution since this allocator clearly allocates more than 448KB at a time, so I still cannot tell whether I did something wrong in memory allocation, or there was a driver issue.
I would still like to know what exactly happened. I post this information just in case somebody would have the same issue and no one replies to me in the future.