I am attempting to integrate some CUDA operations into a GStreamer-based application, and work with the NVMM buffers between pipelines(using the application API).
I have scoured the Deepstream CUDA examples, and the Jetson Multimedia API samples, as well as these forums and the wide web™.
My application starts with a simple test pipeline:
videotestsrc do-timestamp=true is-live=true pattern=24 horizontal-speed=6 ! video/x-raw,width=1280,height=720,framerate=15/1 ! nvvidconv ! video/x-raw(memory:NVMM),format=NV12 ! queue leaky=2 ! appsink name=appsink drop=true max-buffers=1 sync=false async=false
As you can see this generates a test video, with such as such dimensions and framerate, copies it to HW memory space, in NV12 format, and pushes the buffer to the appsink, whereupon it is intercepted using a Pad Probe, and moved to the following functions:
void TestClass::cleanup(GstBuffer* buffer = nullptr, GstMapInfo* info = nullptr,
EGLImageKHR gl_image = nullptr, CUgraphicsResource pResource = nullptr,
int* devicePtr = nullptr)
{
if (devicePtr) {
cudaFree(devicePtr);
}
if (pResource) {
auto ret = cuGraphicsUnregisterResource(pResource);
if (ret != CUDA_SUCCESS) {
m_logger->error("WARNING: could not unregister resource during cleanup");
}
}
if (gl_image) NvDestroyEGLImage(m_egl_display, gl_image);
if (buffer && info) gst_buffer_unmap(buffer, info);
}
bool TestClass::frameToCUDAMemory(GstBuffer* buffer, CUgraphicsResource& pResource,
EGLImageKHR& gl_image, CUeglFrame* cudaFrame)
{
// Verify we have a buffer to handle
auto inmem = gst_buffer_peek_memory(buffer, 0);
if (!inmem) {
m_logger->error("No memory block to peek into");
cleanup();
return false;
}
// Map the GST Buffer
GstMapInfo info;
if (!gst_buffer_map(buffer, &info, GST_MAP_READ)) {
m_logger->error("Could not get NV memory block");
cleanup();
return false;
}
// Get the file descriptor from the buffer, one way or the other
int buff_fd;
if (!g_strcmp0(inmem->allocator->mem_type, GST_NVSTREAM_MEMORY_TYPE) && info.size == sizeof(NvBufSurface)) {
auto surf = (NvBufSurface*)info.data;
buff_fd = int(surf->surfaceList[0].bufferDesc);
if (NvBufSurfaceMapEglImage(surf, -1) == -1) {
cleanup(buffer, &info);
return false;
}
gl_image = surf->surfaceList[0].mappedAddr.eglImage;
} else {
auto retn = ExtractFdFromNvBuffer(info.data, &buff_fd);
if (retn) {
m_logger->error("Error extracting fd");
cleanup(buffer, &info);
return false;
}
// Wrap the fd with EGLImage
gl_image = NvEGLImageFromFd(m_egl_display, buff_fd);
if (!gl_image) {
m_logger->error("Could not create image from fd");
cleanup(buffer, &info);
return false;
}
}
// Ensure context exists
cudaFree(nullptr);
// Register EGL Image with resource
auto ret = cuGraphicsEGLRegisterImage(&pResource, gl_image, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
if (ret != CUDA_SUCCESS) {
m_logger->error("Error registering image to CUDA resource");
cleanup(buffer, &info, gl_image);
return false;
}
// Map EGL frame to accessible resource
ret = cuGraphicsResourceGetMappedEglFrame(cudaFrame, pResource, 0, 0);
if (ret != CUDA_SUCCESS) {
m_logger->error("Could not get mapped EGL frame");
cleanup(buffer, &info, gl_image, pResource);
return false;
}
// Wait for sync between host and device
ret = cuCtxSynchronize();
if (ret != CUDA_SUCCESS) {
m_logger->error("Failed to synchronize context");
cleanup(buffer, &info, gl_image, pResource);
return false;
}
// Unmap the buffer
gst_buffer_unmap(buffer, &info);
return true;
}
void TestClass::handleFrame(GstBuffer* buffer) {
CUgraphicsResource pResource = nullptr;
EGLImageKHR gl_image = nullptr;
CUeglFrame cudaFrame;
if (!frameToCUDAMemory(buffer, pResource, gl_image, &cudaFrame)) {
return;
}
cudaPointerAttributes attr;
auto err = cudaPointerGetAttributes(&attr, (void*)(cudaFrame.frame.pArray[0]));
/* handleEGLImage or any other algorithm you want */
}
Every check I have in these functions passes, no errors whatsover.
However:
-
the cudaFrame->frameType shows as “ARRAY”, and not as “PITCH”, Somewhere along the docs I’ve read that ARRAY is the default way to save buffers in GPU memory, which seems fine, however none of the examples provided by NVidia ever use ARRAY frames, only PITCH, in fact, in most sample applications there is an explicit check for if the frame type is pitch, at which point the code executes, and if it isn’t - nothing happens.
-
the attr object, which should contain the params for that memory section, simply show “UNREGISTERED”, and the err gives “invalid parameter”, since the first param is pretty straightforward, I’m guessing the memory address is the invalid param, but shouldn’t the cuGraphicsResourceGetMappedEglFrame function have failed if it didn’t provide any pointers to the actual data?
The rest of the CUeglFrame contains the correct data, resolution and everything, just the data seems to not exist.
When attempting to run some CUDA operations on that image(even the HandleEGLImage functions supplied with the samples) CUDA essentially crashes, with every future request returning INVALID_LAUNCH.
running my function with cuda-memcheck gives the following:
========= Invalid __global__ write of size 1
========= at 0x000001f0 in /home/user/src/pipeline/CUDATest.cu:38:testCUDAFunc(int*, unsigned int, unsigned int)
========= by thread (0,519,0) in block (0,0,0)
========= Address 0x7f7077c0e0 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 (cuLaunchKernel + 0x218) [0x1f3970]
========= Host Frame:/usr/local/cuda-10.2/lib64/libcudart.so.10.2 [0x102ac]
The EGL display is initiated so:
m_egl_display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
if (m_egl_display == EGL_NO_DISPLAY) {
throw std::runtime_error("Could not get EGL display");
}
if (!eglInitialize(m_egl_display, nullptr, nullptr)) {
throw std::runtime_error("Could not initialize EGL display");
}
All of which seems to point to the simple fact - all of the pointers provided by the CUeglFrame->frame->pArray are pointing to non-existing addresses, not reachable by neither the host, or device.
Surely there’s a way to manually handle frames the way nvivainfer does, without inserting that element into the pipeline.
Am I missing something basic?
Much appreciated