I am trying to perform Gaussian filtering on video input using OpenCV and CUDA framework. I have encountered an issue of frames flickering when I try to send the filtered frames to V4L2 capable output device. The frames seem perfect without flicker when I view it using “imshow” OpenCV API. On debugging, I found the issue emerges while using cudaMemcpy() to copy input frame to device for processing. I am sure about copying exact frame with appropriate frame size to the GPU. On copying both input and output frames seem flickering.
Can you provide more details on your software and hardware stack?
I have a code where I try to perform gaussian filtering on NVIDIA GPU using CUDA toolkit 9.0 with OpenCV 3.4.6 and V4L2 framework for video capture and transmit. The video captured with 1080(i50) resolution. Frames are captured in YUV(4:2:2) format, converted to RGB using OpenCV cvtColor API.On trying to copy the frame to GPU, frames start flickering with green frames coming in between. Is it because of the global references to the device and host variables?
#include "main.h"
#include <signal.h>
#define QUEUE_SZ 8
#define OUT_FIFO_START 2
#define DEFAULT_FRAME_Y 1080
#define DEFAULT_FRAME_X 1920
#define COUNTOF(x) (sizeof(x)/sizeof(x[0]))
struct v4l2_buffer v4l2In;
enum{
V4L2_PIX_FMT_v210 = 0x30313276
};
cv::Mat frameIn(DEFAULT_FRAME_Y, DEFAULT_FRAME_X, CV_8UC3);
cv::Mat frameOut;
cv::Mat frameRGB(DEFAULT_FRAME_Y, DEFAULT_FRAME_X,CV_8UC3);
cv::Mat frameYCbCr(DEFAULT_FRAME_Y, DEFAULT_FRAME_X,CV_8UC3);
cv::Mat frameYUV(DEFAULT_FRAME_Y, DEFAULT_FRAME_X, CV_8UC2);
uchar4 *d_frameIn;
uchar4 *d_frameOut;
unsigned char *buffer;
static int high =0;
// Filter vars, these will be used to apply a weighting matrix to the image
float *_h_filter;
int filterWidth;
int i = 0;
int stencilSize = 3;
int restart = 1;
// Channels and frame initializers
uchar4 *h_inputFrame;
uchar4 *d_inputFrame;
uchar4 *h_outputFrame;
uchar4 *d_outputFrame;
unsigned char *d_redBlurred;
unsigned char *d_greenBlurred;
unsigned char *d_blueBlurred;
// Helper methods for frame
size_t numRows() { return DEFAULT_FRAME_Y; }
size_t numCols() { return DEFAULT_FRAME_X; }
size_t numPixels() { return (DEFAULT_FRAME_X * DEFAULT_FRAME_Y); }
volatile int stop_everything = 0;
int in_h = 0; //input device file descriptor
int out_h = 0;//Output device file descriptor
int main()
{
int ret =-1;
unsigned u;
struct v4l2_format fmtIn; //current format
struct v4l2_streamparm parm; //current format
struct v4l2_buffer v4l2In;
int source = -1;
unsigned isV210On = 0;
...
...
/* I query the input and output device capabilities, allocate array of input buffers (8 buffers) for input and output video frames*/
void* bufsIn[QUEUE_SZ];
memset(&bufsIn, 0, sizeof(bufsIn));
void* bufsOut[QUEUE_SZ];
memset(&bufsOut, 0, sizeof(bufsOut));
...
...
...
/*Buffers are then requested and mapped to store frames at both input and output ends */
for (u = 0; u < QUEUE_SZ; u++) {
memset(&v4l2In, 0, sizeof(v4l2In));
v4l2In.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
v4l2In.memory = V4L2_MEMORY_MMAP;
v4l2In.index = u;
v4l2In.length = frmSz;
{
ret = ioctl(in_h, VIDIOC_QUERYBUF, &v4l2In);
if (ret) {
fprintf(stdout, "%d in query buf error %s\n", u, strerror(errno));
goto terminate;
}
if (MAP_FAILED == (bufsIn[u] = mmap(0, frmSz, PROT_READ|PROT_WRITE, MAP_SHARED,
in_h, v4l2In.m.offset))) {
fprintf(stdout, "%d mmap error\n", u);
goto terminate;
}
fprintf(stdout, "%d buf in memmap: %p %d\n", u, bufsIn[u], frmSz);
if (out_h)
{
struct v4l2_buffer v4l2Out;
memset(&v4l2Out, 0, sizeof(v4l2Out));
v4l2Out.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
v4l2Out.memory = V4L2_MEMORY_MMAP;
v4l2Out.index = u;
v4l2Out.length = frmSz;
ret = ioctl(out_h, VIDIOC_QUERYBUF, &v4l2Out);
if (ret)
{
fprintf(stdout, "%d out query buf error %s\n", u, strerror(errno));
goto terminate;
}
if (MAP_FAILED == (bufsOut[u] = mmap(NULL, frmSz, PROT_READ|PROT_WRITE, MAP_SHARED,
out_h, v4l2Out.m.offset))) {
fprintf(stdout, "%d out mmap error %s\n", u, strerror(errno));
goto terminate;
}
fprintf(stdout, "%d buf out mmap: %p %d\n", u, bufsOut[u], frmSz);
}
}
ret = ioctl(in_h, VIDIOC_QBUF, &v4l2In);
if (ret)
{
fprintf(stdout, "%d in queue error %s\n", u, strerror(errno));
goto terminate;
}
in_n++;
}
//----------------------------------------------------------------------------------------------
{ //Start Capture stream
enum v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
if (ioctl (in_h, VIDIOC_STREAMON, &type)) {
fprintf(stdout, "in stream on error:%s\n", strerror(errno));
terminate();
}
}
unsigned uOut, isOutStart, uIn, uNoActIn, uNoActOut, uOutDrop;
uOut = isOutStart = uIn = uNoActIn = uNoActOut = uOutDrop = stop_everything = 0;
while(!stop_everything)
{
/* Place where the captured frames are dequeued */
int toQueueInBuf = 0;
v4l2In.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
v4l2In.memory = V4L2_MEMORY_MMAP;
ret = ioctl(in_h, VIDIOC_DQBUF, &v4l2In);
if (ret) {
if (EAGAIN != errno) {
fprintf(stdout, "%d (%d/%d) in dequeue error %s %d\n", uIn, in_n, out_n,
strerror(errno), v4l2In.index);
goto terminate;
}
{ goto step2;}
}
uNoActIn = 0; //no activity counter on capture channel restart
in_n--; //input FIFO indicator update
uIn++; //counter of input frames update
toQueueInBuf = 1;
{
assert(v4l2In.index < QUEUE_SZ);
cv::Mat mYUV(DEFAULT_FRAME_Y, DEFAULT_FRAME_X,CV_8UC2,bufsIn[v4l2In.index]);
cvtColor(mYUV, frameIn, CV_YUV2BGR_YUYV);
imshow("Source", frameIn);
// I/O Pointers
beginStream(&h_inputFrame, &h_outputFrame, &d_inputFrame, &d_outputFrame, &d_blueBlurred, &d_greenBlurred, &d_redBlurred, &_h_filter, &filterWidth, frameIn, true);
...
...
...
}
}
}
When I try to call “beginStream” function,the frame “frameIn” starts to flicker/flash when the control reaches to “cudaMemcpy()” API in the function.
void beginStream(
uchar4 **h_inputFrame, // Pointer to host input frame
uchar4 **h_outputFrame, // Pointer to host output frame
uchar4 **d_inputFrame, // Pointer to device input frame
uchar4 **d_outputFrame, // Pointer to device output frame
unsigned char **d_redBlurred, // Device red channel blur
unsigned char **d_greenBlurred, // Device green channel blur
unsigned char **d_blueBlurred, // Device blue channel blur
float **h_filter, int *filterWidth, // The width we want our filter to be
cv::Mat src, // The source frame we just captured
const bool runningGPU // Running the GPU method, so allocate mem on device and host
)
{
// Check we are okay
cudaFree(0);
// Move source data into the input frame, ensuring RGBA format
cv::cvtColor(src, frameIn, CV_BGR2RGBA);
// Allocate memory for the output frame
frameOut.create(frameIn.rows, frameIn.cols, CV_8UC4);
// Allocate host variables, casting the frameIn and frameOut vars to uchar4 elements, these will
// later be processed by the kernel
*h_inputFrame = (uchar4 *)frameIn.ptr<unsigned char>(0);
*h_outputFrame = (uchar4 *)frameOut.ptr<unsigned char>(0);
// The image has been created, now we can find out how many pixels we are going to be working with
const size_t numPixels = numRows() * numCols();
// Allocate memory on the device for I/O
cudaMalloc(d_inputFrame, sizeof(uchar4) * numPixels);
cudaMalloc(d_outputFrame, sizeof(uchar4) * numPixels);
cudaMemset(*d_outputFrame, 0, numPixels * sizeof(uchar4));
// Copy the input frame array to the CPU for processing
cudaMemcpy(*d_inputFrame, *h_inputFrame, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice);
/* Frames start flickering at this point of time */
// Set the global references of the current working image
d_frameIn = *d_inputFrame;
d_frameOut = *d_outputFrame;
// Create blur kernel
const int stencil = stencilSize;
const float sigma = 2.f;
*filterWidth = stencil;
// Fill the filter for convulution
*h_filter = new float[stencil * stencil];
_h_filter = *h_filter;
float filterSum = 0.f;
// Create the weightings for the filter
for (int r = -stencil/2; r <= stencil/2; ++r) {
for (int c = -stencil/2; c <= stencil/2; ++c)
{
float filterValue = expf( -(float)(c * c + r * r) / (2.f * sigma * sigma));
(*h_filter)[(r + stencil/2) * stencil + c + stencil/2] = filterValue;
filterSum += filterValue;
}
}
float normalise = 1.f / filterSum;
for (int r = -stencil/2; r <= stencil/2; ++r) {
for (int c = -stencil/2; c <= stencil/2; ++c) {
(*h_filter)[(r + stencil/2) * stencil + c + stencil/2] *= normalise;
}
}
if(runningGPU)
{
// Alloacate memory for the channels
cudaMalloc(d_redBlurred, sizeof(unsigned char) * numPixels);
cudaMalloc(d_greenBlurred, sizeof(unsigned char) * numPixels);
cudaMalloc(d_blueBlurred, sizeof(unsigned char) * numPixels);
cudaMemset(*d_redBlurred, 0, sizeof(unsigned char) * numPixels);
cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels);
cudaMemset(*d_blueBlurred, 0, sizeof(unsigned char) * numPixels);
}
}