Proper way to import D3D12 texture into cuda surface

I’m trying to import a D3D12 texture into a cuda surface so I can modify it (or actually display results of cuda compute in D3D12).

So far so good, but I encounter a problem where the displayed texture is not showing all the columns. I checked that the cuda kernel is really going through all pixels.

I believe there is something wrong going in the memory mapping, but I can’t see what.

TextureChannels = 4;
		TextureWidth = m_width;
		TextureHeight = m_height;
		const auto textureSurface = TextureWidth * TextureHeight;
		const auto texturePixels = textureSurface * TextureChannels;
		const auto textureSizeBytes = sizeof(float)* texturePixels;

		D3D12_RESOURCE_DESC texDesc{};
		texDesc.MipLevels = 1;
		texDesc.Format = TextureChannels == 4 ? DXGI_FORMAT_R32G32B32A32_FLOAT : DXGI_FORMAT_R32G32B32_FLOAT;
		texDesc.Width = TextureWidth;
		texDesc.Height = TextureHeight;
		texDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
		texDesc.DepthOrArraySize = 1;
		texDesc.SampleDesc.Count = 1;
		texDesc.SampleDesc.Quality = 0;
		texDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;

		ThrowIfFailed(m_device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), D3D12_HEAP_FLAG_SHARED,
			&texDesc, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE, nullptr, IID_PPV_ARGS(&TextureArray)));
		NAME_D3D12_OBJECT(TextureArray);

			HANDLE sharedHandle{};
			WindowsSecurityAttributes secAttr{};
			LPCWSTR name{};
			ThrowIfFailed(m_device->CreateSharedHandle(TextureArray.Get(), &secAttr, GENERIC_ALL, name, &sharedHandle));
			const auto texAllocInfo = m_device->GetResourceAllocationInfo(m_nodeMask, 1, &texDesc);

			cudaExternalMemoryHandleDesc cuExtmemHandleDesc{};
			cuExtmemHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Heap;
			cuExtmemHandleDesc.handle.win32.handle = sharedHandle;
			cuExtmemHandleDesc.size = texAllocInfo.SizeInBytes;
			cuExtmemHandleDesc.flags = cudaExternalMemoryDedicated;
			CheckCudaErrors(cudaImportExternalMemory(&m_externalMemory, &cuExtmemHandleDesc));

			cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
			cuExtmemMipDesc.extent = make_cudaExtent(texDesc.Width, texDesc.Height, 0);
			cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
			cuExtmemMipDesc.numLevels = 1;
			
			cudaMipmappedArray_t cuMipArray{};
			CheckCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, m_externalMemory, &cuExtmemMipDesc));

			cudaArray_t cuArray{};
			CheckCudaErrors(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));

			cudaResourceDesc cuResDesc{};
			cuResDesc.resType = cudaResourceTypeArray;
			cuResDesc.res.array.array = cuArray;
			checkCudaErrors(cudaCreateSurfaceObject(&cuSurface, &cuResDesc));

and the cuda part:

int iDivUp(int a, int b) { return a % b != 0 ? a / b + 1 : a / b; }

__global__ void UpdateSurface(cudaSurfaceObject_t surface, unsigned int width, unsigned int height, float time)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
	unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
	if (y >= height | x >= width) return;
	
	auto xVar = (float)x / (float)width;
	auto yVar = (float)y / (float)height;
	auto costx = __cosf(time) * 0.5f + xVar;
	auto cost = cosf(time) * 0.5f + 0.5f;
	auto costy = cosf(time) * 0.5f + yVar;
	auto costxx = (cosf(time) * 0.5f + 0.5f) * width;
	auto costyy = (cosf(time) * 0.5f + 0.5f) * height;
	
	float4 pixel{};
	if (y == 0)
		pixel = make_float4(1, 0, 0, 1);
	else if (y == height - 1)
		pixel = make_float4(1, 0, 1, 1);
	else if (x%5 == 0)
	{
		if(x>width/2)
			pixel = make_float4(0.1, 0.5, costx * 1, 1);
		else
			pixel = make_float4(costx * 1, 0.1, 0.2, 1);
	}
	else
		pixel = make_float4(costx * 0.2, costx * 0.4, costx * 0.6, 1);
	surf2Dwrite(pixel, surface, x, y);
}

void RunKernel(size_t textureW, size_t textureH, cudaSurfaceObject_t surface, cudaStream_t streamToRun, float animTime)
{
	auto unit = 16;
	dim3 threads(unit, unit);
	dim3 grid(iDivUp(textureW, unit), iDivUp(textureH, unit));
	UpdateSurface <<<grid, threads, 0, streamToRun >>>(surface, textureW, textureH, animTime);
	getLastCudaError("TextureKernel execution failed.\n");
}

The full code: https://github.com/mprevot/CudaD3D12Update
The result I get so far: https://1drv.ms/u/s!AlUmbfQiLoTZhib2HK5-PkhZxNH2?e=3CAanP