How to use a Direct3D 2D texture as both a buffer for a render view as well as a CUDA resource?

Hi all. Neither CUDA, nor Direct3D expert here.

I am trying to figure out how to process a 3D scene using CUDA. My goal is to draw the scene’s contents, process those contents with CUDA (actually later on with libtorch by converting the CUDA resource to a tensor) and render the result on the screen.

I have tried to follow and adapt the CUDA sample on this topic (removed most of the helper functions, added ImGUI, which is drawn at the very end of my render pass, removed all but the texture 2D kernel and converted the project to CMake). For a base Direct3D setup I have used this tutorial.

From what I understand about interop between CUDA and Direct3D 11 it allows me to map a Direct3D 2D texture as a CUDA graphics resource, then convert this resource to a CUDA array and process that array in some way (in the case of the example it just draws a multi-color hatch pattern). The result can then be rendered on the screen.

Initially I was mapping my resource and running the CUDA kernels on it too early (before my render frame method that actually draws and presents things on the screen). That resulted in

Once I put the CUDA related content regarding the mapping, use and unmapping for the buffer after the draw call of my cube and right before the draw call for ImGUI, I managed to get the following result:

So far this is great since I have nailed down the exact location where I need to inject the CUDA processing. Now the problem lies in the contents of the buffer. The original kernel is as follows:

__global__ void cuda_kernel_texture_2d(unsigned char *surface, int width,
                                       int height, size_t pitch, float t) {
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  float *pixel;

  // in the case where, due to quantization into grids, we have
  // more threads than pixels, skip the threads which don't
  // correspond to valid pixels
  if (x >= width || y >= height) return;

  // get a pointer to the pixel at (x,y)
  pixel = (float *)(surface + y * pitch) + 4 * x;

  // populate it
  float value_x = 0.5f + 0.5f * cos(t + 10.0f * ((2.0f * x) / width - 1.0f));
  float value_y = 0.5f + 0.5f * cos(t + 10.0f * ((2.0f * y) / height - 1.0f));
  pixel[0] = 0.5 * pixel[0] + 0.5 * pow(value_x, 3.0f);  // red
  pixel[1] = 0.5 * pixel[1] + 0.5 * pow(value_y, 3.0f);  // green
  pixel[2] = 0.5f + 0.5f * cos(t);                                     // blue
  pixel[3] = 1;                                                                   // alpha
}

This is the code that creates the multi-color hatch pattern. What I would like to do is not overwrite the buffer with this but rather:

  1. Read the pixel data of the rendered 3D scene (in my case a rotating 3D cube with a blue background)
  2. Modify the pixel data (e.g. convert to black and white using Otsu thresholding, swap color channels and so on - basic operations for now)
  3. Draw the modified pixel data (underneath the ImGUI part of course)

My guess was that, if I were to not do anything to pixel (a single 4-floats array that contains the RGBA value of the pixel that will be rendered), I would at least get the untouched scene’s pixel data. However, that is no the case and I get a black scene:

What I am expecting (with an “empty kernel” that doesn’t do anything) is to get

I made sure that I am using cudaGraphicsRegisterFlagsNone since I will be both writing and reading from the resource. So previous content should be preserved in my case.


main.cpp

#include <windows.h>
#include <windowsx.h>
#include <d3d11.h>
#include <DirectXMath.h>
#include <iostream>
#include <stdexcept>    
#include <dxgi.h>
#include <cuda_runtime_api.h>
#include <cuda_d3d11_interop.h>
#include <helper_cuda.h>
#include "load_sample.h"

#include <imgui.h>
#include <imgui_impl_win32.h>
#include <imgui_impl_dx11.h>

#define WIN_WIDTH 800
#define WIN_HEIGHT 600

IDXGIAdapter* adapter = nullptr;
IDXGISwapChain* swapchain = nullptr;
ID3D11Device* device = nullptr;
ID3D11DeviceContext* device_context = nullptr;

ID3D11RenderTargetView* rtv_cuda = nullptr;
ID3D11DepthStencilView* buffer_depth = nullptr;

ID3D11Texture2D* texture_rgb;
ID3D11Texture2D* texture_depth;

cudaGraphicsResource* cuda_interop_resource = nullptr;
size_t texture_rgb_cuda_pitch = 0;
void* texture_rgb_cuda_linear_mem = nullptr;
bool cuda_enable = true;
int cuda_device_count = 0;

extern "C"
{
    cudaError_t cuda_texture_2d(void* surface, size_t width, size_t height, size_t pitch, float t);
}


ID3D11SamplerState* sampler_state = nullptr;
ID3D11RasterizerState* raster_state = nullptr;
ID3D11DepthStencilState* stencil_state = nullptr;
char devname[256];

ID3D11PixelShader* sample_pixel_shader = nullptr;
ID3D11VertexShader* sample_vertex_shader = nullptr;
ID3D11InputLayout* sample_vertex_layout = nullptr;
ID3D11Buffer* sample_vertex_buffer = nullptr;
ID3D11Buffer* sample_index_buffer = nullptr;
ID3D11Buffer* sample_constant_buffer = nullptr;

DirectX::XMMATRIX       matrix_world;
DirectX::XMMATRIX       matrix_view;
DirectX::XMMATRIX       matrix_projection;

bool getAdapter(IDXGIAdapter** adapter, char* devname); 
void InitCuda();
cudaError_t RunCudaKernels();
void InitD3D(HWND hWnd);
void RenderFrame(void);
void Clean(void);

static long long frame_counter;

extern LRESULT ImGui_ImplWin32_WndProcHandler(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam);

LRESULT CALLBACK WindowProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam);

int WINAPI WinMain(HINSTANCE hInstance,
    HINSTANCE hPrevInstance,
    LPSTR lpCmdLine,
    int nCmdShow)
{
    HWND hWnd;
    HRESULT hr;
    WNDCLASSEX wc;
    std::string window_title = "CUDA Torch Direct3D Interop Demo";

    ZeroMemory(&wc, sizeof(WNDCLASSEX));

    wc.cbSize = sizeof(WNDCLASSEX);
    wc.style = CS_HREDRAW | CS_VREDRAW;
    wc.lpfnWndProc = WindowProc;
    wc.hInstance = hInstance;
    wc.hCursor = LoadCursor(NULL, IDC_ARROW);
    wc.hbrBackground = (HBRUSH)COLOR_WINDOW;
    wc.lpszClassName = "WindowClass";

    RegisterClassEx(&wc);

    RECT wr = { 0, 0, WIN_WIDTH, WIN_HEIGHT };
    auto wrStyle = (WS_OVERLAPPED | WS_MINIMIZEBOX | WS_SYSMENU | WS_CAPTION | WS_MINIMIZEBOX);

    hWnd = CreateWindowEx(NULL,
        "WindowClass",
        window_title.c_str(),
        wrStyle,
        300,
        300,
        wr.right - wr.left,
        wr.bottom - wr.top,
        NULL,
        NULL,
        hInstance,
        NULL);

    ShowWindow(hWnd, nCmdShow);

    try
    {
        InitCuda();
        window_title += std::string(" (") + std::string(devname) + std::string(")");
        SetWindowText(hWnd, window_title.c_str());
    }
    catch (std::exception& e)
    {
        // ...
        return 2;
    }

    try
    {
        InitD3D(hWnd);
    }
    catch (std::exception& e)
    {
        // ...
        return 1;
    }

    // Register the render buffer as a CUDA resource
    cudaError_t cr = cudaGraphicsD3D11RegisterResource(&cuda_interop_resource, texture_rgb, cudaGraphicsRegisterFlags::cudaGraphicsRegisterFlagsNone);
    if (cr != cudaSuccess)
    {
        // ...
    }
    // The registered resource can only be mapped as a texture. In order to write to it, a CUDA array is required
    RECT rect;
    int width = 0;
    int height = 0;
    if (GetWindowRect(hWnd, &rect))
    {
        width = rect.right - rect.left;
        height = rect.bottom - rect.top;
    }
    cr = cudaMallocPitch(&texture_rgb_cuda_linear_mem, &texture_rgb_cuda_pitch, width * sizeof(float) * 4, height);
    if (cr != cudaSuccess)
    {
        // ...
    }
    cr = cudaMemset(texture_rgb_cuda_linear_mem, 1, texture_rgb_cuda_pitch * height);
    if (cr != cudaSuccess)
    {
        // ...
    }

    IMGUI_CHECKVERSION();
    ImGui::CreateContext();
    ImGuiIO& io = ImGui::GetIO();
    ImGui_ImplWin32_Init(hWnd);
    ImGui_ImplDX11_Init(device, device_context);
    ImGui::StyleColorsDark();

    MSG msg;

    // Create cube sample
    hr = create_cube(device, device_context,
        &sample_vertex_layout, &sample_vertex_buffer, &sample_vertex_shader,
        &sample_index_buffer, &sample_constant_buffer, &sample_pixel_shader
    );

    if (FAILED(hr))
    {
        // ...
    }

    while (TRUE)
    {
        if (PeekMessage(&msg, NULL, 0, 0, PM_REMOVE))
        {
            TranslateMessage(&msg);
            DispatchMessage(&msg);

            if (msg.message == WM_QUIT)
                break;
        }

        // Render the frame and process with CUDA
        try
        {
            RenderFrame();
        }
        catch (std::exception& e)
        {
            // ...
        }
    }

    Clean();

    return msg.wParam;
}

LRESULT CALLBACK WindowProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
{
    if (ImGui_ImplWin32_WndProcHandler(hWnd, message, wParam, lParam))
    {
        return true;
    }

    switch (message)
    {
    case WM_DESTROY:
    {
        PostQuitMessage(0);
        return EXIT_SUCCESS;
    } break;
    case WM_KEYDOWN:
    {
        unsigned char keycode = static_cast<unsigned char>(wParam);
        //...
    }
    break;
    }

    return DefWindowProc(hWnd, message, wParam, lParam);
}

void InitD3D(HWND hWnd)
{
    RECT rect;
    int width = 0;
    int height = 0;
    if (GetWindowRect(hWnd, &rect))
    {
        width = rect.right - rect.left;
        height = rect.bottom - rect.top;
    }

    if (!width || !height)
    {
        // ...
        exit(1);
    }

    HRESULT hr = 0;

    DXGI_SWAP_CHAIN_DESC swapchainDesc;

    ZeroMemory(&swapchainDesc, sizeof(DXGI_SWAP_CHAIN_DESC));

    swapchainDesc.BufferCount = 1;
    swapchainDesc.BufferDesc.Width = width;
    swapchainDesc.BufferDesc.Height = height;
    swapchainDesc.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;     // use 32-bit color DXGI_FORMAT_R8G8B8A8_UNORM DXGI_FORMAT_R32G32B32_FLOAT
    swapchainDesc.BufferDesc.RefreshRate.Numerator = 60;
    swapchainDesc.BufferDesc.RefreshRate.Denominator = 1;
    swapchainDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
    swapchainDesc.OutputWindow = hWnd;
    swapchainDesc.SampleDesc.Count = 1;
    swapchainDesc.SampleDesc.Quality = 0;
    swapchainDesc.Windowed = TRUE;

    D3D_FEATURE_LEVEL featureLevels[] =
    {
        D3D_FEATURE_LEVEL_11_1,
        D3D_FEATURE_LEVEL_11_0,
        D3D_FEATURE_LEVEL_10_1,
        D3D_FEATURE_LEVEL_10_0
    };
    UINT numFeatureLevels = ARRAYSIZE(featureLevels);

    char devname_dx[128];
    bool device_ok = getAdapter(&adapter, devname_dx);

    DXGI_ADAPTER_DESC adapterDesc;
    adapter->GetDesc(&adapterDesc); //devid = 9436, subsysid = 177803304

    hr = D3D11CreateDeviceAndSwapChain(
        adapter,
        D3D_DRIVER_TYPE_UNKNOWN,
        NULL,
        NULL,
        featureLevels,
        numFeatureLevels,
        D3D11_SDK_VERSION,
        &swapchainDesc,
        &swapchain,
        &device,
        NULL,
        &device_context);
    if (FAILED(hr))
    {
        // ...
    }

    adapter->Release();
    device->GetImmediateContext(&device_context);

    hr = swapchain->GetBuffer(0, __uuidof(ID3D11Texture2D), (LPVOID*)&texture_rgb); 
    if (FAILED(hr))
    {
        // ...
    }

    hr = device->CreateRenderTargetView(texture_rgb, nullptr, &rtv); //backbuffer    texture_rgb, &renderDesc, &backbuffer
    if (FAILED(hr))
    {
        // ...
    }
    device->CreateRenderTargetView(texture_rgb, nullptr, &rtv_cuda);
    texture_rgb->Release();

    D3D11_TEXTURE2D_DESC depthStencilDesc;
    ZeroMemory(&depthStencilDesc, sizeof(D3D11_TEXTURE2D_DESC));
    depthStencilDesc.Width = width;
    depthStencilDesc.Height = height;
    depthStencilDesc.MipLevels = 1;
    depthStencilDesc.ArraySize = 1;
    depthStencilDesc.Format = DXGI_FORMAT_D24_UNORM_S8_UINT;
    depthStencilDesc.SampleDesc.Count = swapchainDesc.SampleDesc.Count;
    depthStencilDesc.SampleDesc.Quality = swapchainDesc.SampleDesc.Quality;
    depthStencilDesc.Usage = D3D11_USAGE::D3D11_USAGE_DEFAULT;
    depthStencilDesc.BindFlags = D3D11_BIND_FLAG::D3D11_BIND_DEPTH_STENCIL;
    depthStencilDesc.CPUAccessFlags = 0;
    depthStencilDesc.MiscFlags = 0;
    hr = device->CreateTexture2D(&depthStencilDesc, nullptr, &texture_depth);
    if (FAILED(hr))
    {
        // ...
    }
    hr = device->CreateDepthStencilView(texture_depth, nullptr, &buffer_depth);

    device_context->OMSetRenderTargets(1, &rtv_cuda, buffer_depth);

    D3D11_VIEWPORT viewport;
    ZeroMemory(&viewport, sizeof(D3D11_VIEWPORT));

    viewport.TopLeftX = 0;
    viewport.TopLeftY = 0;
    viewport.Width = WIN_WIDTH;
    viewport.Height = WIN_HEIGHT;
    viewport.MinDepth = 0.0f;
    viewport.MaxDepth = 1.0f;

    device_context->RSSetViewports(1, &viewport);

    matrix_world = DirectX::XMMatrixIdentity();
    DirectX::XMVECTOR Eye = DirectX::XMVectorSet(0.0f, 4.0f, -5.0f, 0.0f);  //0 1 -5 0
    DirectX::XMVECTOR At = DirectX::XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
    DirectX::XMVECTOR Up = DirectX::XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
    matrix_view = DirectX::XMMatrixLookAtLH(Eye, At, Up);
    matrix_projection = DirectX::XMMatrixPerspectiveFovLH(DirectX::XM_PIDIV2, viewport.Width / (FLOAT)viewport.Height, 0.01f, 100.0f);

    {
        D3D11_SAMPLER_DESC samplerStateDesc;
        samplerStateDesc.Filter = D3D11_FILTER_MIN_MAG_MIP_LINEAR;
        samplerStateDesc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
        samplerStateDesc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
        samplerStateDesc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
        samplerStateDesc.MinLOD = 0;
        samplerStateDesc.MaxLOD = 8;
        samplerStateDesc.MipLODBias = 0;
        samplerStateDesc.MaxAnisotropy = 1;

        hr = device->CreateSamplerState(&samplerStateDesc, &sampler_state);
        if (FAILED(hr))
        {
            // ...
        }
        device_context->PSSetSamplers(0, 1, &sampler_state);
    }

    {
        D3D11_RASTERIZER_DESC rasterizerStateDesc;
        rasterizerStateDesc.FillMode = D3D11_FILL_SOLID;    //D3D11_FILL_SOLID      D3D11_FILL_WIREFRAME
        rasterizerStateDesc.CullMode = D3D11_CULL_BACK;
        rasterizerStateDesc.FrontCounterClockwise = false;
        rasterizerStateDesc.DepthBias = false;
        rasterizerStateDesc.DepthBiasClamp = 0;
        rasterizerStateDesc.SlopeScaledDepthBias = 0;
        rasterizerStateDesc.DepthClipEnable = false;
        rasterizerStateDesc.ScissorEnable = false;
        rasterizerStateDesc.MultisampleEnable = false;
        rasterizerStateDesc.AntialiasedLineEnable = false;

        hr = device->CreateRasterizerState(&rasterizerStateDesc, &raster_state);
        if (FAILED(hr))
        {
            // ...
        }
        device_context->RSSetState(raster_state);
    }

    {
        D3D11_DEPTH_STENCIL_DESC stencilDesc;
        ZeroMemory(&stencilDesc, sizeof(stencilDesc));

        stencilDesc.DepthEnable = true;
        stencilDesc.DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL;
        stencilDesc.DepthFunc = D3D11_COMPARISON_LESS_EQUAL;

        hr = device->CreateDepthStencilState(&stencilDesc, &stencil_state);
        if (FAILED(hr))
        {
            // ...
        }
        device_context->OMSetDepthStencilState(stencil_state, 0);
    }
}


// Process a frame with CUDA and render it
void RenderFrame(void)
{
    HRESULT hr = 0;

    float color[4] = { 0.0f, 0.2f, 0.4f, 1.0f };

    device_context->ClearRenderTargetView(rtv_cuda, color); //CUDA result is cleared, we want to avoid that
    device_context->ClearDepthStencilView(buffer_depth, D3D11_CLEAR_DEPTH | D3D11_CLEAR_STENCIL, 1.0f, 0);

    static float t = 0.0f;
    static ULONGLONG timeStart = 0;
    ULONGLONG timeCur = GetTickCount64();
    if (timeStart == 0)
        timeStart = timeCur;
    t = (timeCur - timeStart) / 1000.0f;

    static float translationOffset[3] = { 0.0f, 0.0f, 0.0f };
    matrix_world = DirectX::XMMatrixRotationY(t) * DirectX::XMMatrixTranslation(translationOffset[0], translationOffset[1], translationOffset[2]);

    ConstantBuffer cb;
    cb.mWorld = DirectX::XMMatrixTranspose(matrix_world);
    cb.mView = DirectX::XMMatrixTranspose(matrix_view);
    cb.mProjection = DirectX::XMMatrixTranspose(matrix_projection);
    device_context->UpdateSubresource(sample_constant_buffer, 0, nullptr, &cb, 0, 0);

    device_context->VSSetShader(sample_vertex_shader, nullptr, 0);
    device_context->VSSetConstantBuffers(0, 1, &sample_constant_buffer);
    device_context->PSSetShader(sample_pixel_shader, nullptr, 0);
    device_context->DrawIndexed(36, 0, 0);        // 36 vertices needed for 12 triangles in a triangle list

    if (cuda_enable)
    {
        // Map resource
        cudaStream_t stream = 0;
        const int cuda_mapped_resources_count = 1;
        cudaGraphicsResource* cuda_mapped_resources[cuda_mapped_resources_count] = { cuda_interop_resource };
        cudaError_t cr = cudaGraphicsMapResources(cuda_mapped_resources_count, cuda_mapped_resources, stream);
        if (cr != cudaSuccess)
        {
            // ...
        }

        // Run CUDA kernels
        cr = RunCudaKernels();
        if (cr != cudaSuccess)
        {
            // ...
        }

        cr = cudaDeviceSynchronize();
        if (cr != cudaSuccess)
        {
            // ...
        }

        cr = cudaGraphicsUnmapResources(cuda_mapped_resources_count, cuda_mapped_resources, stream);
        if (cr != cudaSuccess)
        {
            // ...
        }
    }

    // Draw ImGUI
    ImGui_ImplDX11_NewFrame();
    ImGui_ImplWin32_NewFrame();

    ImGui::NewFrame();
    ImGui::Begin("Controls / Debug Information");
    ImGui::Text((cuda_device_count ? std::string("Found " + std::to_string(cuda_device_count) + " CUDA " + (cuda_device_count == 1 ? "device" : "devices")).c_str() : "No CUDA devices found!"));
    ImGui::Text(devname);
    ImGui::DragFloat3("Trans X/Y/Z", translationOffset, 0.1f, -5.0f, 5.0f);
    ImGui::Checkbox(std::string((cuda_enable ? "Enabled" : "Disabled") + std::string(" CUDA")).c_str(), &cuda_enable);
    ImGui::End();
    ImGui::Render();
    ImGui_ImplDX11_RenderDrawData(ImGui::GetDrawData());

    hr = swapchain->Present(0, 0);
    if (FAILED(hr))
    {
        std::string msg = std::string("[RenderFrame] Failed to swap back with front buffer: ") + std::to_string(hr);
        throw std::exception(msg.c_str());
    }
}

void Clean(void)
{
    // TODO Clean all allocated resources from DirectX and CUDA
}


bool getAdapter(IDXGIAdapter** adapter, char* devname)
{
    HRESULT hr = S_OK;
    cudaError_t cr = cudaSuccess;
    UINT adapter_idx = 0;
    IDXGIFactory* pFactory;

    hr = CreateDXGIFactory(__uuidof(IDXGIFactory), (void**)(&pFactory));

    if (FAILED(hr))
    {
        std::string msg = std::string("[getAdapter] Unable to create DXGI factory: ") + std::to_string(hr);
        throw std::exception(msg.c_str());
        return false;
    }

    for (; !(*adapter); ++adapter_idx) {
        // Get a candidate DXGI adapter
        IDXGIAdapter* pAdapter = NULL;
        hr = pFactory->EnumAdapters(adapter_idx, &pAdapter);

        if (FAILED(hr))
        {
            break;  // no compatible adapters found
        }

        // Query to see if there exists a corresponding compute device
        int cuDevice;
        cr = cudaD3D11GetDevice(&cuDevice, pAdapter);

        if (cr == cudaSuccess) {
            // If so, mark it as the one against which to create our d3d10 device
            (*adapter) = pAdapter;
            (*adapter)->AddRef();
            ++adapter_idx;
            break;
        }

        pAdapter->Release();
    }

    std::cout << "Found " << (int)adapter_idx << " D3D11 adapater(s)" << std::endl;

    pFactory->Release();

    if (!adapter) {
        std::cerr << "Unable to find a D3D11 adapater with compute capability" << std::endl;
        return false;
    }

    DXGI_ADAPTER_DESC adapterDesc;
    (*adapter)->GetDesc(&adapterDesc);
    std::cout << "Found a D3D11 adapater with compute capability" << std::endl;

    return true;
}


// This function initializes and prepares CUDA for use
void InitCuda()
{
    int nGraphicsGPU = 0;
    bool bFoundGraphics = false;

    // This function call returns 0 if there are no CUDA capable devices.
    cudaError_t cr = cudaGetDeviceCount(&cuda_device_count);

    if (cr != cudaSuccess)
    {
        std::string msg = std::string("[InitCuda] Failed retrieve number of CUDA devices: ") + std::string(cudaGetErrorString(cr));
        throw std::exception(msg.c_str());
    }

    if (cuda_device_count == 0) {
        cuda_enable = false;
    }
    else{
        std::cout << "Found " << cuda_device_count << " CUDA capable device(s)" << std::endl;
    }

    // Get CUDA device properties
    cudaDeviceProp deviceProp;

    for (int dev = 0; dev < cuda_device_count; ++dev) {
        cr = cudaGetDeviceProperties(&deviceProp, dev);
        strcpy_s(devname, deviceProp.name);
        std::cout << "CUDA GPU " << dev << " \"" << devname << "\" : [BusID " << deviceProp.pciBusID << " | DevID " << deviceProp.pciDeviceID << "]" << std::endl;

        if (cr != cudaSuccess)
        {
            std::string msg = std::string("[InitCuda] Failed to retrieve properties of CUDA device: ") + std::string(cudaGetErrorString(cr));
            throw std::exception(msg.c_str());
        }

        break;
    }
}

cudaError_t RunCudaKernels()
{
    static float t = 0.0f;
    cudaError_t cr = cudaSuccess;

    // populate the 2d texture
    cudaArray* cuArray;
    cr = cudaGraphicsSubResourceGetMappedArray(&cuArray, cuda_interop_resource, 0, 0);
    if (cr != cudaSuccess)
        return cr;

    // kick off the kernel and send the staging buffer cudaLinearMemory as an argument to allow the kernel to write to it
    cr = cuda_texture_2d(texture_rgb_cuda_linear_mem, WIN_WIDTH, WIN_HEIGHT, texture_rgb_cuda_pitch, t);
    if (cr != cudaSuccess)
        return cr;

    // then we want to copy cudaLinearMemory to the D3D texture, via its mapped form : cudaArray
    cr = cudaMemcpy2DToArray(
        cuArray, // dst array
        0, 0,    // offset
        texture_rgb_cuda_linear_mem, texture_rgb_cuda_pitch,        // src
        WIN_WIDTH * 4 * sizeof(UINT8), WIN_HEIGHT, // extent        // sizeof(...) is based on DXGI_FORMAT of texture, e.g. DXGI_FORMAT_R8G8B8A8_UNORM is uint8, while DXGI_FORMAT_R32G32B32A32_FLOAT
        cudaMemcpyDeviceToDevice); // kind
    if (cr != cudaSuccess)
        return cr;

    t += 0.1f;

    return cr;
}

UPDATE:

I have also tried using two textures - one for the RTV and one for the CUDA processing with ID3D11DeviceContext::CopyResource() used to copy back and forth between the two right at the beginning and end of the body inside if (cuda_enable) {}. The result is the same. I am thinking that I am misunderstanding something here. My CUDA texture is defined as

D3D11_TEXTURE2D_DESC texture_rgb_cudaDesc;
	ZeroMemory(&texture_rgb_cudaDesc, sizeof(D3D11_TEXTURE2D_DESC));
	texture_rgb_cudaDesc.Width = width;
	texture_rgb_cudaDesc.Height = height;
	texture_rgb_cudaDesc.MipLevels = 1;
	texture_rgb_cudaDesc.ArraySize = 1;
	texture_rgb_cudaDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;	//DXGI_FORMAT_R32G32B32A32_FLOAT
	texture_rgb_cudaDesc.SampleDesc.Count = 1;
	texture_rgb_cudaDesc.Usage = D3D11_USAGE::D3D11_USAGE_DEFAULT;
	texture_rgb_cudaDesc.BindFlags = D3D11_BIND_FLAG::D3D11_BIND_RENDER_TARGET; // D3D11_BIND_UNORDERED_ACCESS
	hr = device->CreateTexture2D(&texture_rgb_cudaDesc, nullptr, &texture_rgb_cuda);
	if (FAILED(hr))
	{
		std::string msg = std::string("[InitD3D] Failed to create texture (for CUDA): ") + std::to_string(hr);
		throw std::exception(msg.c_str());
	}