I wrote C++ code utilizing CUDA on an NVIDIA 1660 Ti Max Q (laptop). I compiled the code, and it performed the task I wanted precisely how I designed it to. However, I recently switched to another laptop, which has an NVIDIA RTX 5070 (laptop) as its GPU. I wanted to use the RTX 5070 for better performance than my 6-year-old 1660 Ti. However, when I transferred the exact same code to the laptop with the RTX 5070 GPU, I got these errors for the first time in my console output:
[CUDA ERROR] Failed to copy mapped array to 2D Linear Array…
[NVIDIA ERROR] Kernel Launch Error: invalid argument
At this line of code:
cudaError_t err = cudaMemcpy2DFromArray(d_linearSource, dstPitch, mappedArray, 0, 0, frameWidth * sizeof(uchar4),
frameHeight, cudaMemcpyDeviceToDevice);
if (err != cudaSuccess) {
PRINT(L"[CUDA ERROR] Failed to copy mapped array to 2D Linear Array…");
return false;
}
I have done some “digging in” but I have failed to locate a conclusive problem. The only thing I know of that could be causing this is the difference in architecture: the 1660 Ti (Turing) versus the RTX 5070 (Blackwell). Other than that, everything is the same: same CUDA version (12.8), same exact code, valid variables passed into cudaMemcpy2DFromArray, proper registration of the D3D11 resource, successful mapping of the resource, and successful retrieval of the mapped array. Any further insight or knowledge into what might be causing these troubles would be greatly appreciated.
The first thing you would want to do is to isolate the failing call into a minimal, self-contained, standalone reproducer that demonstrates the issue. This would allow others (including NVIDIA, should you want to file a bug against the CUDA runtime eventually) to reproduce the issue. Of particular interest are the actual arguments passed to cudaMemcpy2DFromArray(), as the call presumably fails only for particular values of these arguments. Knowing the formal arguments alone is not really helpful, other than that we can see that they look plausible. We would also want to examine the exact value of err returned as status by cudaMemcpy2DFromArray().
As I have a vague recollection that some APIs related to CUDA arrays were deprecated or even removed over the years, I searched for a deprecation notice for cudaMemcpy2DFromArray() but found none, suggesting that CUDA programmers should expect this API to work.
Interesting. I tried your suggestion of writing a minimal test program for cudaMemcpy2DFromArray(), and to my surprise it succeeded. Here is my test program:
#include <iostream>
#include <d3d11.h>
#include <winrt/base.h>
#include <cuda_runtime.h>
#include <cuda_d3d11_interop.h>
#include "device_launch_parameters.h"
#pragma comment(lib, "d3d11.lib")
int main() {
constexpr int frameWidth = 512;
constexpr int frameHeight = 512;
//Create D3D11 device and context
winrt::com_ptr<ID3D11Device> device;
winrt::com_ptr<ID3D11DeviceContext> context;
D3D_FEATURE_LEVEL featureLevel;
HRESULT hr = D3D11CreateDevice(
nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0,
D3D11_SDK_VERSION, device.put(), &featureLevel, context.put()
);
if (FAILED(hr)) {
std::cerr << "Failed to create D3D11 device!" << std::endl;
return -1;
}
//Create D3D11 texture
D3D11_TEXTURE2D_DESC textureDesc = {};
textureDesc.Width = frameWidth;
textureDesc.Height = frameHeight;
textureDesc.MipLevels = 1;
textureDesc.ArraySize = 1;
textureDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
textureDesc.SampleDesc.Count = 1;
textureDesc.Usage = D3D11_USAGE_DEFAULT;
textureDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
winrt::com_ptr<ID3D11Texture2D> texture;
hr = device->CreateTexture2D(&textureDesc, nullptr, texture.put());
if (FAILED(hr)) {
std::cerr << "Failed to create D3D11 texture!" << std::endl;
return -1;
}
std::cout << "Successfully created D3D11 Texture2D!" << std::endl;
cudaGraphicsResource_t cudaResource = nullptr;
cudaError_t e = cudaGraphicsD3D11RegisterResource(
&cudaResource, texture.get(), cudaGraphicsRegisterFlagsNone
);
if (e != cudaSuccess) {
std::cerr << "Failed to register D3D11 texture with CUDA: " << cudaGetErrorString(e) << std::endl;
return -1;
}
e = cudaGraphicsMapResources(1, &cudaResource, 0);
if (e != cudaSuccess) {
std::cerr << "Failed to map CUDA graphics resource: " << cudaGetErrorString(e) << std::endl;
return -1;
}
cudaArray_t mappedArray = nullptr;
e = cudaGraphicsSubResourceGetMappedArray(&mappedArray, cudaResource, 0, 0);
if (e != cudaSuccess) {
std::cerr << "Failed to get mapped CUDA array: " << cudaGetErrorString(e) << std::endl;
cudaGraphicsUnmapResources(1, &cudaResource, 0);
return -1;
}
size_t dstPitch = frameWidth * sizeof(uchar4); // Pitch in bytes
uchar4* d_linearSource = nullptr;
size_t linearSize = frameWidth * frameHeight * sizeof(uchar4);
e = cudaMalloc(&d_linearSource, linearSize);
if (e != cudaSuccess) {
std::cerr << "Failed to allocate linearSource buffer: " << cudaGetErrorString(e) << std::endl;
cudaGraphicsUnmapResources(1, &cudaResource, 0);
return -1;
}
// Perform cudaMemcpy2DFromArray for debugging
e = cudaMemcpy2DFromArray(
d_linearSource, // dst pointer
dstPitch, // dst pitch (in bytes)
mappedArray, // src cudaArray
0, 0, // srcXInBytes, srcY
frameWidth * sizeof(uchar4), // width of the row to copy (in bytes)
frameHeight, // height (number of rows)
cudaMemcpyDeviceToDevice
);
if (e != cudaSuccess) {
std::cerr << "cudaMemcpy2DFromArray failed: " << cudaGetErrorString(e) << std::endl;
}
else {
std::cout << "cudaMemcpy2DFromArray succeeded!" << std::endl;
}
cudaFree(d_linearSource);
cudaGraphicsUnmapResources(1, &cudaResource, 0);
cudaGraphicsUnregisterResource(cudaResource);
std::cout << "Process Completed!" << std::endl;
return 0;
}
Now I have to figure out what is going wrong in my other source file.