I’m observing that the nvp-models lead to anomalous performance behavior when encoding h264-video with the hardware encoder. When enabling for example the Max-N mode and only running cuda-kernels (without h264-encoding) I get as expected the best performance and with Max-Q the worst. But when I also encode h264 at the same time, then Max-Q has much higher overall performance than Max-N. I tested all nvp-models with a test-application with and without h264-encoding and this is the result (fps = frames per second):
nvp-model fps (w/o enc.) fps (with enc.)
0 86.3 24.3
1 56.8 41.4
2 74.5 20.5
3 74.7 48.8
4 74.6 21.9
Do you have any explanation for this?
This is the complete source-code of my test-application. It renders a 10-second mandelbrot-animation with 4K-resolution and encodes it into a file. You can disable the encoding by commenting line 18.
#include <iostream>
#include <chrono>
#include <stdexcept>
#include <cuda.h>
#include <glib.h>
#include <gst/gst.h>
#include <gst/app/gstappsrc.h>
using FloatType = float;
constexpr int MB_WIDTH = 3840;
constexpr int MB_HEIGHT = 2160;
constexpr FloatType MB_ASPECT = (FloatType)MB_WIDTH/MB_HEIGHT;
constexpr int MB_BLOCK_WIDTH = 32;
constexpr int MB_BLOCK_HEIGHT = 32;
constexpr int MAX_ITER = 64;
constexpr int FRAME_RATE = 30;
#define ENCODE_TO_FILE
class GstCapture {
public:
GstCapture() {
GError *err = NULL;
pipeline = gst_parse_launch("appsrc name=mysource ! omxh264enc name=myenc iframeinterval=60 profile=2 bitrate=30000000 control-rate=2"
" ! qtmux ! filesink location=/home/nvidia/Documents/mandelbrot.mp4", &err);
if (err) throw std::runtime_error("parse-error");
appsrc = (GstAppSrc*)gst_bin_get_by_name(GST_BIN(pipeline), "mysource");
if (!appsrc) throw std::runtime_error("no appsrc");
g_object_set (G_OBJECT (appsrc),
"caps", gst_caps_new_simple ("video/x-raw",
"format", G_TYPE_STRING, "I420",
"width", G_TYPE_INT, MB_WIDTH,
"height", G_TYPE_INT, MB_HEIGHT,
"framerate", GST_TYPE_FRACTION, 0, 1,
NULL),
"stream-type", GST_APP_STREAM_TYPE_STREAM,
"format", GST_FORMAT_TIME,
NULL);
gst_element_set_state (pipeline, GST_STATE_PLAYING);
}
~GstCapture() {
gst_app_src_end_of_stream(appsrc);
GstBus *bus = gst_pipeline_get_bus(GST_PIPELINE (pipeline));
gst_message_unref(gst_bus_timed_pop_filtered(bus, GST_CLOCK_TIME_NONE, GST_MESSAGE_EOS));
gst_object_unref(bus);
gst_object_unref(appsrc);
gst_element_set_state(pipeline, GST_STATE_NULL);
}
struct Buffer {
Buffer() {
cudaMallocHost(&data, BUF_SIZE);
cudaMemset(data, 128, BUF_SIZE);
}
~Buffer() {cudaFree(data);}
uint32_t *data;
bool used = false;
};
Buffer *getFreeBuffer() {
for (int i = 0; i < BUF_COUNT; ++i) if (!buffers[i].used) return &buffers[i];
return NULL;
}
void commitBuffer(Buffer *buffer) {
#ifndef ENCODE_TO_FILE
return;
#endif
buffer->used = true;
GstBuffer *gstBuf = gst_buffer_new_wrapped_full((GstMemoryFlags)0, (gpointer)buffer->data, BUF_SIZE, 0, BUF_SIZE, (void*)buffer, cbDestroyBuf);
GST_BUFFER_PTS (gstBuf) = timestamp;
GST_BUFFER_DURATION (gstBuf) = gst_util_uint64_scale_int (1, GST_SECOND, FRAME_RATE);
timestamp += GST_BUFFER_DURATION (gstBuf);
GstFlowReturn ret = gst_app_src_push_buffer(appsrc, gstBuf);
if (ret != GST_FLOW_OK) throw std::runtime_error("push-error");
}
private:
static constexpr int BUF_COUNT = 8;
static constexpr int BUF_SIZE = MB_WIDTH*MB_HEIGHT*3/2;
Buffer buffers[BUF_COUNT];
GstElement *pipeline = NULL;
GstAppSrc *appsrc = NULL;
GstClockTime timestamp = 0;
static void cbDestroyBuf(gpointer data) {
Buffer *buf = (Buffer*)data;
buf->used = false;
}
};
__global__ void mandelbrotK(uint32_t *pixelsOut, FloatType tx, FloatType ty, FloatType s) {
const int xQuad = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
const FloatType cIm = -s/MB_ASPECT*((FloatType)y/MB_HEIGHT-FloatType(0.5)) + ty;
uint32_t quad;
uint8_t *Y = (uint8_t*)&quad;
for (int i = 0; i < 4; ++i) {
const int x = 4*xQuad+i;
const FloatType cRe = s*((FloatType)x/MB_WIDTH-FloatType(0.5)) + tx;
FloatType zRe = 0;
FloatType zIm = 0;
int it;
for (it = 0; it < MAX_ITER && zRe*zRe+zIm*zIm < 2*2; ++it) {
const FloatType temp = zRe*zRe - zIm*zIm + cRe;
zIm = 2*zRe*zIm + cIm;
zRe = temp;
}
Y[i] = it==MAX_ITER?0:it*(256/MAX_ITER);
}
pixelsOut[xQuad + y*(MB_WIDTH/4)] = quad;
}
static GMainLoop *mainLoop;
static gboolean myIdleFunc(gpointer user_data) {
using namespace std;
using namespace chrono;
using theclock = high_resolution_clock;
static GstCapture *gstCapture = NULL;
if (!gstCapture) gstCapture = new GstCapture;
GstCapture::Buffer *buffer = gstCapture->getFreeBuffer();
if (!buffer) return G_SOURCE_CONTINUE;
static double scaleFactor = 2;
scaleFactor *= 0.999;
static theclock::time_point lastTime;
static theclock::time_point startTime;
static int lastFrameCount = 0;
static int totalFrameCount = 0;
const auto tStartRender = theclock::now();
mandelbrotK<<<dim3(MB_WIDTH/MB_BLOCK_WIDTH/4, MB_HEIGHT/MB_BLOCK_HEIGHT), dim3(MB_BLOCK_WIDTH, MB_BLOCK_HEIGHT)>>>(
buffer->data, -0.7436, 0.131, scaleFactor
);
const auto tStartSync = theclock::now();
cudaDeviceSynchronize();
const auto tCommit = theclock::now();
gstCapture->commitBuffer(buffer);
const auto tDone = theclock::now();
const auto curTime = tDone;
if (!lastTime.time_since_epoch().count()) {
lastTime = curTime;
startTime = curTime;
}
++lastFrameCount;
++totalFrameCount;
const double timeDiffLast = duration_cast<duration<double>>(curTime-lastTime).count();
if (timeDiffLast > 1) {
const double timeDiffStart = duration_cast<duration<double>>(curTime-startTime).count();
const double kernelDuration = duration_cast<duration<double, ratio<1, 1000>>>(tStartSync-tStartRender).count();
const double syncDuration = duration_cast<duration<double, ratio<1, 1000>>>(tCommit-tStartSync).count();
const double commitDuration = duration_cast<duration<double, ratio<1, 1000>>>(tDone-tCommit).count();
const double renderDuration = duration_cast<duration<double, ratio<1, 1000>>>(tDone-tStartRender).count();
printf("%.0fs, %.1f fps (kernel: %.3fms + sync: %.3fms + commit: %.3fms = %.3fms)",
timeDiffStart, lastFrameCount/timeDiffLast,
kernelDuration, syncDuration, commitDuration, renderDuration);
cout << endl;
lastTime = curTime;
lastFrameCount = 0;
}
constexpr int MAX_FRAME_COUNT = 60*5;
if (totalFrameCount == MAX_FRAME_COUNT) {
const double timeDiffStart = duration_cast<duration<double>>(curTime-startTime).count();
printf("Result after %d frames: Rendering needed %.2f seconds with an avg. frame rate of %.2f fps",
MAX_FRAME_COUNT, timeDiffStart, totalFrameCount/timeDiffStart);
cout << endl;
delete gstCapture;
gstCapture = NULL;
g_main_loop_quit(mainLoop);
return G_SOURCE_REMOVE;
}
return G_SOURCE_CONTINUE;
}
int main(int argc, char *argv[]) {
mainLoop = g_main_loop_new(NULL, FALSE);
gst_init(NULL, NULL);
g_idle_add(myIdleFunc, NULL);
g_main_loop_run(mainLoop);
return 0;
}