Creating a working nveglstreamsrc->omxh264enc gstreamer-pipeline

Hello,

I would like to encode data rendered with CUDA as H.264 as quickly as possible. It seems that the proper way to do this is to create a GStreamer pipeline with an nveglstreamsrc element connected to an omxh264enc element. I implemented it, but it doesn’t work as expected. Specifically, the cuEGLStreamProducerPresentFrame method always returns immediately and no data is written. Presumably I’m missing something, but as I’m not able to find any documentation about the proper usage of the EGL methods, I’m asking whether you could tell me what I have to do.

This is the complete source-code of my test-application:

#include <gst/gst.h>

#include <iostream>
#include <stdexcept>
#include <chrono>

#include <cuda.h>
#include <cudaEGL.h>

/**
 * Terminate the program when a CUDA runtime API call reports an error.
 *
 * On failure the runtime's description of the error is printed to stdout,
 * the line is terminated and flushed, and the process exits with status 1.
 * A successful result is a no-op.
 *
 * @param code Result value returned by a CUDA runtime API call.
 */
static void checkCuda(cudaError_t code) {
   // Guard clause: nothing to report for a successful call.
   if (code == cudaSuccess) {
      return;
   }

   printf("cuda-error: %s", cudaGetErrorString(code));
   std::cout << std::endl;
   exit(1);
}
/**
 * Terminate the program when a CUDA driver API call reports an error.
 *
 * On failure a description of the error is printed to stdout, the line is
 * terminated and flushed, and the process exits with status 1. A successful
 * result is a no-op.
 *
 * @param code Result value returned by a CUDA driver API call.
 */
static void checkCu(CUresult code) {
   if (code == CUDA_SUCCESS) {
      return;
   }

   const char *str = NULL;
   // cuGetErrorString() can fail (and leave the string NULL) for a code it
   // does not recognise; handing NULL to "%s" would be undefined behaviour,
   // so fall back to printing the numeric code instead.
   if (cuGetErrorString(code, &str) != CUDA_SUCCESS || str == NULL) {
      printf("cuda error-description: unknown error code %d", static_cast<int>(code));
   } else {
      printf("cuda error-description: %s", str);
   }
   std::cout << std::endl;
   exit(1);
}

// EGL stream handle; assigned by EGLStreamInit() and used in main() for the
// CUDA stream connection and for the nveglstreamsrc "eglstream" property.
static EGLStreamKHR eglStream;
// EGL display handle; assigned by EGLStreamInit() and passed in main() to the
// nveglstreamsrc "display" property.
static EGLDisplay eglDisplay;

// X-macro listing the EGL stream extension entry points used by this program.
// Each entry is a (function-pointer type, function name) pair; the caller
// supplies a macro T that is applied to every pair (see EXTLST_DECL and
// EXTLST_ENTRY below).
#define EXTENSION_LIST(T) \
    T( PFNEGLCREATESTREAMKHRPROC,          eglCreateStreamKHR ) \
    T( PFNEGLDESTROYSTREAMKHRPROC,         eglDestroyStreamKHR ) \
    T( PFNEGLQUERYSTREAMKHRPROC,           eglQueryStreamKHR ) \
    T( PFNEGLQUERYSTREAMU64KHRPROC,        eglQueryStreamu64KHR ) \
    T( PFNEGLQUERYSTREAMTIMEKHRPROC,       eglQueryStreamTimeKHR ) \
    T( PFNEGLSTREAMATTRIBKHRPROC,          eglStreamAttribKHR ) \
    T( PFNEGLSTREAMCONSUMERACQUIREKHRPROC, eglStreamConsumerAcquireKHR ) \
    T( PFNEGLSTREAMCONSUMERRELEASEKHRPROC, eglStreamConsumerReleaseKHR ) \
    T( PFNEGLSTREAMCONSUMERGLTEXTUREEXTERNALKHRPROC, \
                                    eglStreamConsumerGLTextureExternalKHR ) \
    T( PFNEGLGETSTREAMFILEDESCRIPTORKHRPROC, eglGetStreamFileDescriptorKHR) \
    T( PFNEGLCREATESTREAMFROMFILEDESCRIPTORKHRPROC, eglCreateStreamFromFileDescriptorKHR)

// See bug 200161837: the locally declared EGL function pointers must be
// renamed so that they start with my_. Without this renaming, 64-bit builds
// of the application crash.
// The defines below redirect every use of the plain extension name in this
// file to the corresponding my_-prefixed function pointer.
#define eglCreateStreamKHR                                      my_eglCreateStreamKHR
#define eglDestroyStreamKHR                                     my_eglDestroyStreamKHR
#define eglQueryStreamKHR                                       my_eglQueryStreamKHR
#define eglQueryStreamu64KHR                                    my_eglQueryStreamu64KHR
#define eglQueryStreamTimeKHR                                   my_eglQueryStreamTimeKHR
#define eglStreamAttribKHR                                      my_eglStreamAttribKHR
#define eglStreamConsumerAcquireKHR                             my_eglStreamConsumerAcquireKHR
#define eglStreamConsumerReleaseKHR                             my_eglStreamConsumerReleaseKHR
#define eglStreamConsumerGLTextureExternalKHR                   my_eglStreamConsumerGLTextureExternalKHR
#define eglGetStreamFileDescriptorKHR                           my_eglGetStreamFileDescriptorKHR
#define eglCreateStreamFromFileDescriptorKHR                    my_eglCreateStreamFromFileDescriptorKHR

// Expanders for EXTENSION_LIST:
//   EXTLST_DECL   defines the my_<name> function pointer, initialised to NULL.
//   EXTLST_EXTERN declares the same pointer as extern (not used in the code
//                 visible here).
//   EXTLST_ENTRY  produces one { address of pointer, "name" } table row.
#define EXTLST_DECL(tx, x)  tx my_ ## x = NULL;
#define EXTLST_EXTERN(tx, x) extern tx my_ ## x;
#define EXTLST_ENTRY(tx, x) { (extlst_fnptr_t *)&my_ ## x, #x },

// Define one my_<name> function pointer per extension entry point.
EXTENSION_LIST(EXTLST_DECL)
// Generic function-pointer type used to store any of the extension pointers.
typedef void (*extlst_fnptr_t)(void);
// Lookup table pairing each function pointer's address with the name that is
// passed to eglGetProcAddress() in eglSetupExtensions().
static struct {
    extlst_fnptr_t *fnptr;
    char const *name;
} extensionList[] = { EXTENSION_LIST(EXTLST_ENTRY) };

static int eglSetupExtensions(void) {
    for (int i = 0; i < (sizeof(extensionList) / sizeof(*extensionList)); i++) {
        *extensionList[i].fnptr = eglGetProcAddress(extensionList[i].name);
        if (*extensionList[i].fnptr == NULL) {
            printf("Couldn't get address of %s()\n", extensionList[i].name);
            return 0;
        }
    }
    return 1;
}

static int EGLStreamInit() {
    static const EGLint streamAttrMailboxMode[] = { EGL_SUPPORT_REUSE_NV, EGL_FALSE, EGL_NONE };
    EGLBoolean eglStatus;
    eglDisplay = eglGetDisplay(EGL_DEFAULT_DISPLAY);
    if (eglDisplay == EGL_NO_DISPLAY) {
        printf("eglDisplayHandle failed \n");
        return 0;
    } else {
        printf("eglDisplay Handle created (0x%p)\n", eglDisplay);
    }

    eglStatus = eglInitialize(eglDisplay, 0, 0);
    if (!eglStatus) {
        printf("EGL failed to initialize.\n");
        return 0;
    }

    eglStream = eglCreateStreamKHR(eglDisplay, streamAttrMailboxMode);
    if (eglStream == EGL_NO_STREAM_KHR) {
        printf("EGLStreamInit: Couldn't create eglStream.\n");
        return 0;
    }

    // Set stream attribute
    if (!eglStreamAttribKHR(eglDisplay, eglStream, EGL_CONSUMER_LATENCY_USEC_KHR, 16000)) {
        printf("Consumer: eglStreamAttribKHR EGL_CONSUMER_LATENCY_USEC_KHR failed\n");
        return 0;
    }
    if (!eglStreamAttribKHR(eglDisplay, eglStream, EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR, 16000)) {
        printf("Consumer: eglStreamAttribKHR EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR failed\n");
        return 0;
    }

    printf("EGLStream initialized\n");
    return 1;
}

// Frame dimensions used for the CUDA frames, the producer connection and the
// GStreamer caps.
constexpr int WIDTH = 1280;
constexpr int HEIGHT = 720;
// CUDA<->EGL stream connection handle; set up in main() and used by myIdle()
// to present frames.
static CUeglStreamConnection conn = NULL;

/**
 * GLib idle callback: present one constant-grey YUV 4:2:0 frame to the EGL
 * stream and print the achieved call rate about once per second.
 *
 * The three plane buffers are allocated once, on the first call, and reused
 * for every frame.
 *
 * @param user_data Unused.
 * @returns G_SOURCE_CONTINUE, so the callback stays installed.
 */
static gboolean myIdle(gpointer user_data) {
	(void)user_data;

	// Byte sizes of the Y, U and V planes of a WIDTH x HEIGHT 4:2:0 frame.
	static const size_t planeBytes[3] = {
		static_cast<size_t>(WIDTH) * HEIGHT,
		static_cast<size_t>(WIDTH) * HEIGHT / 4,
		static_cast<size_t>(WIDTH) * HEIGHT / 4,
	};

	// Every plane gets its own allocation: pointing U and V at offsets inside
	// one shared allocation was reported not to work with the EGL stream
	// consumer.
	static uint8_t *planes[3] = { NULL, NULL, NULL };
	if (!planes[0]) {
		for (int i = 0; i < 3; ++i) {
			checkCuda(cudaMallocHost(&planes[i], planeBytes[i]));
			checkCuda(cudaMemset(planes[i], 128, planeBytes[i]));
		}
	}

	CUeglFrame eglFrame{};
	for (int i = 0; i < 3; ++i) {
		eglFrame.frame.pPitch[i] = planes[i];
	}
	eglFrame.width = WIDTH;
	eglFrame.height = HEIGHT;
	eglFrame.depth = 1;
	// Pitch is left at zero so that the per-plane pitches are determined
	// automatically; a fixed pitch of WIDTH was reported not to work.
	eglFrame.pitch = 0;
	eglFrame.frameType = CU_EGL_FRAME_TYPE_PITCH;
	eglFrame.planeCount = 3;
	eglFrame.numChannels = 1;
	eglFrame.eglColorFormat = CU_EGL_COLOR_FORMAT_YUV420_PLANAR;
	eglFrame.cuFormat = CU_AD_FORMAT_UNSIGNED_INT8;
	// NOTE(review): the same buffers are presented on every call and
	// cuEGLStreamProducerReturnFrame is never called -- confirm against the
	// CUDA EGL interop documentation whether a frame must be returned before
	// it is presented again.
	checkCu(cuEGLStreamProducerPresentFrame(&conn, eglFrame, NULL));

	static int counter = 0;
	++counter;

	using namespace std::chrono;
	using theclk = high_resolution_clock;
	// Start the measurement window at the first call; starting at the clock's
	// epoch would make the first report cover an arbitrary, huge interval.
	static theclk::time_point lastTime = theclk::now();
	const auto curTime = theclk::now();
	const auto timeDiff = curTime - lastTime;
	if (timeDiff >= seconds(1)) {
		lastTime = curTime;
		printf("%.1f fps", counter/duration_cast<duration<double>>(timeDiff).count());
		std::cout << std::endl;
		counter = 0;
	}

	return G_SOURCE_CONTINUE;
}

int main() {
	gst_init(NULL, NULL);
	if (!eglSetupExtensions()) throw std::runtime_error("eglSetupExtensions");
	if (!EGLStreamInit()) throw std::runtime_error("EGLStreamInit");


	checkCu(cuInit(0));
	CUdevice device;
	checkCu(cuDeviceGet(&device, 0));
	CUcontext ctx = NULL;
	cuCtxCreate(&ctx, 0, device);
	checkCu(cuEGLStreamConsumerConnect(&conn, eglStream)); // without this the producer-connect call fails
	checkCu(cuEGLStreamProducerConnect(&conn, eglStream, WIDTH, HEIGHT));


	GstElement *pipeline = gst_pipeline_new("video_pipeline");
	if (!pipeline) throw std::runtime_error("no pipeline");

	GstElement *eglSrc = gst_element_factory_make("nveglstreamsrc", NULL);
	if (!eglSrc) throw std::runtime_error("no eglSrc");
	if (!gst_bin_add(GST_BIN(pipeline), eglSrc)) throw std::runtime_error("could not add eglSrc");
	g_object_set(G_OBJECT(eglSrc), "display", eglDisplay, NULL);
	g_object_set(G_OBJECT(eglSrc), "eglstream", eglStream, NULL);

	GstElement *queue = gst_element_factory_make("queue", NULL);
	if (!queue) throw std::runtime_error("no queue");
	if (!gst_bin_add(GST_BIN(pipeline), queue)) throw std::runtime_error("could not add queue");

	GstElement *h264Enc = gst_element_factory_make("omxh264enc", NULL);
	if (!h264Enc) throw std::runtime_error("no omxh264enc");
	if (!gst_bin_add(GST_BIN(pipeline), h264Enc)) throw std::runtime_error("could not add omxh264enc");
	g_object_set(G_OBJECT(h264Enc), "bitrate", 30000000, NULL);

	GstElement *videoMuxer = gst_element_factory_make("qtmux", NULL);
	if (!videoMuxer) throw std::runtime_error("no qtmux");
	if (!gst_bin_add(GST_BIN(pipeline), videoMuxer)) throw std::runtime_error("could not add qtmux");

	GstElement *fileSink = gst_element_factory_make("filesink", NULL);
	if (!fileSink) throw std::runtime_error("no filesink");
	if (!gst_bin_add(GST_BIN(pipeline), fileSink)) throw std::runtime_error("could not add filesink");
	g_object_set(G_OBJECT(fileSink), "location", "test.mp4", NULL);

	GstCaps *caps = gst_caps_new_simple("video/x-raw",
									   "format", G_TYPE_STRING, "I420", // I420/NV12
									   "width", G_TYPE_INT, WIDTH,
									   "height", G_TYPE_INT, HEIGHT,
									   "framerate", GST_TYPE_FRACTION, 30, 1,
									   NULL);
	if (!caps) throw std::runtime_error("no caps");
	GstCapsFeatures *features = gst_caps_features_new("memory:NVMM", NULL);
	if (!features) throw std::runtime_error("no features");
	gst_caps_set_features(caps, 0, features);
	if (!gst_element_link_filtered(eglSrc, queue, caps)) throw std::runtime_error("could not link eglSrc to queue");
	gst_caps_unref(caps);

	if (!gst_element_link(queue, h264Enc)) throw std::runtime_error("could not link queue to h264Enc");
	if (!gst_element_link_pads(h264Enc, "src", videoMuxer, "video_%u")) throw std::runtime_error("could not link h264Enc to videoMuxer");
	if (!gst_element_link(videoMuxer, fileSink)) throw std::runtime_error("could not link videoMuxer to fileSink");

	gst_element_set_state(pipeline, GST_STATE_PLAYING);

	GMainLoop *gMainLoop = g_main_loop_new(NULL, FALSE);
	g_idle_add(myIdle, NULL);
	g_main_loop_run(gMainLoop);

	return 0;
}

Please refer to following posts:
https://devtalk.nvidia.com/default/topic/1001636/jetson-tx1/tearing-in-gstreamer-captured-screen-video-with-opengl-full-screen-mode/post/5137203/#5137203
https://devtalk.nvidia.com/default/topic/1023481/jetson-tx2/frames-returned-from-nveglstreamsrc-via-egl-stream-out-of-order/post/5209249/#5209249

Thank you, I got the pipeline now to do something. The main problem was that the producer-connection has to be made after the pipeline is created and not before.
Also important to note is that memory has to be allocated individually for each of the Y-, U- and V-planes. Just allocating one chunk of memory and using offsets for U and V does not work. Apparently the EGL source doesn’t use the exact memory addresses specified, but tracks back to the starting address of each allocated memory chunk and then uses that.
The pitch-value has to be set to zero. The system then automatically determines the pitch-values for the individual planes.

I have one more question: How can I set the presentation-timestamp for each frame manually? Neither the cuEGLStreamProducerPresentFrame-method nor the CUeglFrame-struct provides a timestamp-parameter/field.

Hi mrjazz2,
Just as you can see, there is no interface to configure timestamps.

We suggest you try NvBuffer APIs in tegra_multimedia_api. There are APIs defined in tegra_multimedia_api\include\nvbuf_utils.h

You can create a buffer:

/**
 * Use this method to allocate HW buffer.
 * @param[out] dmabuf_fd Returns `dmabuf_fd` of hardware buffer.
 * @param[in] input_params Input parameters for hardware buffer creation.
 *
 * @returns 0 for success, -1 for failure
 */
int NvBufferCreateEx (int *dmabuf_fd, NvBufferCreateParams *input_params);

And get EGLImage:

/**
* This method must be used for getting `EGLImage` from `dmabuf-fd`.
*
* @param[in] display `EGLDisplay` object used during the creation of `EGLImage`.
* @param[in] dmabuf_fd `DMABUF FD` of buffer from which `EGLImage` to be created.
*
* @returns `EGLImageKHR` for success, `NULL` for failure
*/
EGLImageKHR NvEGLImageFromFd (EGLDisplay display, int dmabuf_fd);

This lets you access the buffer via CUDA.

We have NvVideoEncoder class for h264 encoding.