I’ve been working on cuda interops for our engine. So far I’ve got DX12 to work fine and produce correct images with very simple kernel that just outputs uv as color:
union RGBA32 {
uint32_t d;
uchar4 v;
struct {
uint8_t r, g, b, a;
} c;
};
template<class Rgb>
__global__ static void Kernel(cudaSurfaceObject_t surface, int nWidth, int nHeight) {
int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
if (x + 1 >= nWidth || y + 1 >= nHeight) {
return;
}
float4 rgba{};
rgba.x = (x & 0xFF) / 255.0f;
rgba.y = (y & 0xFF) / 255.0f;
rgba.z = 0.0f;
rgba.w = 1.0f;
int color = rgbaFloatToInt(rgba);
surf2Dwrite(color, surface, x * sizeof(Rgb), y);
surf2Dwrite(color, surface, x * sizeof(Rgb), y + 1);
surf2Dwrite(color, surface, (x + 1) * sizeof(Rgb), y);
surf2Dwrite(color, surface, (x + 1) * sizeof(Rgb), y + 1);
}
void launch_kernel(cudaStream_t stream, cudaSurfaceObject_t surface, int nWidth, int nHeight) {
Kernel<RGBA32><<<dim3(nWidth / (16 * 2), nHeight / (8 * 2)), dim3(16, 8), 0, stream>>>(surface, nWidth, nHeight);
}
When I use Vulkan backend with Cuda interop I get first image. When I use DX12 backend I get 2nd image.
I’ve been following this sample for vulkan interop. And this one for DX12 interop.
This is how I allocate new image with Shared flag (This is not the whole process, but the relevant parts):
VkImageCreateInfo image_info{};
image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
image_info.imageType = VK_IMAGE_TYPE_2D;
image_info.extent.width = static_cast<uint32_t>(width);
image_info.extent.height = static_cast<uint32_t>(height);
image_info.extent.depth = 1;
image_info.mipLevels = 1;
image_info.arrayLayers = 1;
image_info.format = pixel_format;
image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
image_info.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
image_info.samples = VK_SAMPLE_COUNT_1_BIT;
VkExternalMemoryImageCreateInfo vkExternalMemImageCreateInfo = {};
vkExternalMemImageCreateInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
if (is_shared)
{
image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
vkExternalMemImageCreateInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
image_info.pNext = &vkExternalMemImageCreateInfo;
}
// ...
// ...
// Memory allocation
VkMemoryAllocateInfo allocInfo{};
allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
allocInfo.allocationSize = mem_requirements.size;
VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR{ VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR };
#ifdef _WIN32
WindowsSecurityAttributes win_security_attributes;
VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR{ VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR };
#endif
if (usage_shared)
{
#ifdef _WIN32
vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &win_security_attributes;
vulkanExportMemoryWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE;
vulkanExportMemoryAllocateInfoKHR.pNext = &vulkanExportMemoryWin32HandleInfoKHR;
vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
#else
vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
#endif
allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR;
}
VkMemoryPropertyFlags flags = 0;
if (getGPUType() == GPU_TYPE_DISCRETE || usage_render || usage_shared)
flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
allocInfo.memoryTypeIndex = findMemoryType(mem_requirements.memoryTypeBits, flags);
vkAllocateMemory(vk_device, &allocInfo, nullptr, &retval->allocation.memory);
vkBindImageMemory(vk_device, retval->image, retval->allocation.memory, 0);
retval->size = allocInfo.allocationSize;
Export vulkan memory as NT shared handle
VkMemoryGetWin32HandleInfoKHR desc{};
desc.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR;
desc.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
desc.memory = allocation.memory;
vkGetMemoryWin32HandleKHR(getDevice(), &desc, &external_memory_shared_handle);
Importing vulkan memory to cuda
cudaExternalMemoryHandleDesc cuExtmemHandleDesc{};
cuExtmemHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
cuExtmemHandleDesc.handle.win32.handle = external_memory->getWin32Handle();
cuExtmemHandleDesc.size = texture->getVideoMemorySize();
cuExtmemHandleDesc.flags = cudaExternalMemoryDedicated;
cuCheck(cudaImportExternalMemory(&external_memory, &cuExtmemHandleDesc));
// Create CUDA Surface Interop
{
cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
cuExtmemMipDesc.extent = make_cudaExtent(texture->getWidth(), texture->getHeight(), 0);
cuExtmemMipDesc.formatDesc = { 8, 8, 8, 8, cudaChannelFormatKindUnsigned };
cuExtmemMipDesc.numLevels = 1;
cuExtmemMipDesc.flags = cudaArraySurfaceLoadStore | cudaArrayColorAttachment;
cudaMipmappedArray_t cuMipArray{};
ck(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, external_memory, &cuExtmemMipDesc));
cudaArray_t cuArray{};
ck(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));
cudaResourceDesc cuResDesc{};
cuResDesc.resType = cudaResourceTypeArray;
cuResDesc.res.array.array = cuArray;
ck(cudaCreateSurfaceObject(&surface, &cuResDesc));
}
I am currently stuck with vulkan interop and not sure what I should check and try to look for.
I already tripple checked the way I import/export/allocate memory and not sure what I should try next.
If anyone can at least me a some directions what I should check for and If I should consider checking semaphores or barriers once more.
p.s. making image with VK_IMAGE_TILING_LINEAR instead of VK_IMAGE_TILING_OPTIMAL and writing to it directly without using surf2Dwrite worked perfectly for some reason…
p.s.s. I tried adding this complexity with transfering queue ownerships to VK_QUEUE_FAMILY_EXTERNAL and back but it had no effect on the output
p.s.s.s. Also, I already transition my texture into VK_IMAGE_LAYOUT_GENERAL right after I create it
p.s.s.s.s. If i use VK_TILING_LINEAR and map VkImage as buffer like so:
cudaExternalMemoryBufferDesc buffer_desc{};
buffer_desc.size = cuExtmemHandleDesc.size;
ck(cudaExternalMemoryGetMappedBuffer(&devptr, external_memory, &buffer_desc));
And then write as raw bytes in kernel then it works as expected. But I am not really satisfied with this because it will require 2 versions of kernels and 2 separate ways to map data externally for each of the APIs.




