Frames flicker on copying from host to device using cudaMemcpy()

I am trying to perform Gaussian filtering on video input using OpenCV and CUDA framework. I have encountered an issue of frames flickering when I try to send the filtered frames to V4L2 capable output device. The frames seem perfect without flicker when I view it using “imshow” OpenCV API. On debugging, I found the issue emerges while using cudaMemcpy() to copy input frame to device for processing. I am sure about copying exact frame with appropriate frame size to the GPU. On copying both input and output frames seem flickering.

Can you provide more details on your software and hardware stack?

I have a code where I try to perform gaussian filtering on NVIDIA GPU using CUDA toolkit 9.0 with OpenCV 3.4.6 and V4L2 framework for video capture and transmit. The video captured with 1080(i50) resolution. Frames are captured in YUV(4:2:2) format, converted to RGB using OpenCV cvtColor API.On trying to copy the frame to GPU, frames start flickering with green frames coming in between. Is it because of the global references to the device and host variables?

#include "main.h"
#include <signal.h>

#define QUEUE_SZ    8
#define OUT_FIFO_START  2
#define DEFAULT_FRAME_Y 1080 
#define DEFAULT_FRAME_X 1920
#define COUNTOF(x) (sizeof(x)/sizeof(x[0]))
struct v4l2_buffer v4l2In;

     V4L2_PIX_FMT_v210 = 0x30313276

cv::Mat frameOut;

uchar4 *d_frameIn;
uchar4 *d_frameOut;
unsigned char *buffer;
static int high =0;
// Filter vars, these will be used to apply a weighting matrix to the image
float *_h_filter;
int   filterWidth; 
int i = 0;
int stencilSize = 3;
int restart = 1;

// Channels and frame initializers
uchar4 *h_inputFrame;
uchar4 *d_inputFrame;
uchar4 *h_outputFrame;
uchar4 *d_outputFrame;
unsigned char *d_redBlurred;
unsigned char *d_greenBlurred; 
unsigned char *d_blueBlurred;

// Helper methods for frame
size_t numRows() { return DEFAULT_FRAME_Y; }
size_t numCols() { return DEFAULT_FRAME_X; }
size_t numPixels() { return (DEFAULT_FRAME_X * DEFAULT_FRAME_Y); }    

volatile int stop_everything = 0;                                  
int in_h = 0; //input device file descriptor                       
int out_h = 0;//Output device file descriptor
int main() 
    int ret =-1;
    unsigned u;
    struct v4l2_format fmtIn; //current format
    struct v4l2_streamparm parm; //current format
    struct v4l2_buffer v4l2In;
    int source = -1;
    unsigned isV210On = 0;
    /* I query the input and output device capabilities, allocate   array of input buffers (8 buffers) for input and output video frames*/
    void* bufsIn[QUEUE_SZ]; 
    memset(&bufsIn, 0, sizeof(bufsIn));
    void* bufsOut[QUEUE_SZ]; 
    memset(&bufsOut, 0, sizeof(bufsOut));
    /*Buffers are then requested and mapped to store frames at both input and output ends */
    for (u = 0; u < QUEUE_SZ; u++) {    
    memset(&v4l2In, 0, sizeof(v4l2In));         
    v4l2In.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
    v4l2In.memory = V4L2_MEMORY_MMAP;
    v4l2In.index = u;
    v4l2In.length = frmSz;
        ret = ioctl(in_h, VIDIOC_QUERYBUF, &v4l2In);
        if (ret) {
            fprintf(stdout, "%d in query buf error %s\n", u, strerror(errno));
            goto terminate;
        if (MAP_FAILED == (bufsIn[u] = mmap(0, frmSz, PROT_READ|PROT_WRITE, MAP_SHARED, 
            in_h, v4l2In.m.offset))) {
            fprintf(stdout, "%d mmap error\n", u);
            goto terminate;
        fprintf(stdout, "%d buf in memmap: %p %d\n", u, bufsIn[u], frmSz);

        if (out_h) 
            struct v4l2_buffer v4l2Out;     
            memset(&v4l2Out, 0, sizeof(v4l2Out));           
            v4l2Out.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;          
            v4l2Out.memory = V4L2_MEMORY_MMAP;
            v4l2Out.index = u;
            v4l2Out.length = frmSz;

            ret = ioctl(out_h, VIDIOC_QUERYBUF, &v4l2Out);
            if (ret) 
                fprintf(stdout, "%d out query buf error %s\n", u, strerror(errno));
                goto terminate;
            if (MAP_FAILED == (bufsOut[u] = mmap(NULL, frmSz, PROT_READ|PROT_WRITE, MAP_SHARED,
                out_h, v4l2Out.m.offset))) {
                fprintf(stdout, "%d out mmap error %s\n", u, strerror(errno));
                goto terminate;
                fprintf(stdout, "%d buf out mmap: %p %d\n", u, bufsOut[u], frmSz);          

ret = ioctl(in_h, VIDIOC_QBUF, &v4l2In);
    if (ret) 
        fprintf(stdout, "%d in queue error %s\n", u, strerror(errno));  
        goto terminate;
{   //Start Capture stream
    enum v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
    if (ioctl (in_h, VIDIOC_STREAMON, &type)) {
        fprintf(stdout, "in stream on error:%s\n", strerror(errno));

    unsigned uOut, isOutStart, uIn, uNoActIn, uNoActOut, uOutDrop;
    uOut = isOutStart = uIn = uNoActIn = uNoActOut = uOutDrop = stop_everything = 0;
    /* Place where the captured frames are dequeued */
        int toQueueInBuf = 0;
        v4l2In.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
        v4l2In.memory = V4L2_MEMORY_MMAP;   
        ret = ioctl(in_h, VIDIOC_DQBUF, &v4l2In);
        if (ret) {          
            if (EAGAIN != errno) {
                fprintf(stdout, "%d (%d/%d) in dequeue error %s %d\n", uIn, in_n, out_n, 
                strerror(errno), v4l2In.index);
                goto terminate;
            {   goto step2;}

    uNoActIn = 0; //no activity counter on capture channel restart
    in_n--;  //input FIFO indicator update
    uIn++; //counter of input frames update
    toQueueInBuf = 1;
        assert(v4l2In.index < QUEUE_SZ);
        cv::Mat mYUV(DEFAULT_FRAME_Y, DEFAULT_FRAME_X,CV_8UC2,bufsIn[v4l2In.index]);
        cvtColor(mYUV, frameIn, CV_YUV2BGR_YUYV);
        imshow("Source", frameIn);

        // I/O Pointers
        beginStream(&h_inputFrame, &h_outputFrame, &d_inputFrame, &d_outputFrame, &d_blueBlurred, &d_greenBlurred, &d_redBlurred, &_h_filter, &filterWidth, frameIn, true);

When I try to call “beginStream” function,the frame “frameIn” starts to flicker/flash when the control reaches to “cudaMemcpy()” API in the function.

void beginStream(
    uchar4 **h_inputFrame,                      // Pointer to host input frame
    uchar4 **h_outputFrame,                     // Pointer to host output frame
    uchar4 **d_inputFrame,                      // Pointer to device input frame
    uchar4 **d_outputFrame,                     // Pointer to device output frame
    unsigned char **d_redBlurred,               // Device red channel blur
    unsigned char **d_greenBlurred,             // Device green channel blur 
    unsigned char **d_blueBlurred,              // Device blue channel blur
    float **h_filter, int *filterWidth,         // The width we want our filter to be
    cv::Mat src,                                // The source frame we just captured
    const bool runningGPU                       // Running the GPU method, so allocate mem on device and host
    // Check we are okay

    // Move source data into the input frame, ensuring RGBA format
    cv::cvtColor(src, frameIn, CV_BGR2RGBA); 

    // Allocate memory for the output frame
    frameOut.create(frameIn.rows, frameIn.cols, CV_8UC4);

    // Allocate host variables, casting the frameIn and frameOut vars to uchar4 elements, these will
    // later be processed by the kernel
    *h_inputFrame = (uchar4 *)frameIn.ptr<unsigned char>(0);
    *h_outputFrame = (uchar4 *)frameOut.ptr<unsigned char>(0);

    // The image has been created, now we can find out how many pixels we are going to be working with
    const size_t numPixels = numRows() * numCols();

    // Allocate memory on the device for I/O
    cudaMalloc(d_inputFrame, sizeof(uchar4) * numPixels); 
    cudaMalloc(d_outputFrame, sizeof(uchar4) * numPixels);
    cudaMemset(*d_outputFrame, 0, numPixels * sizeof(uchar4));

// Copy the input frame array to the CPU for processing
    cudaMemcpy(*d_inputFrame, *h_inputFrame, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice);
/* Frames start flickering at this point of time */

    // Set the global references of the current working image
    d_frameIn  = *d_inputFrame;
    d_frameOut = *d_outputFrame;

    // Create blur kernel
    const int stencil = stencilSize;
    const float sigma = 2.f;

    *filterWidth = stencil;

    // Fill the filter for convulution
    *h_filter = new float[stencil * stencil];
    _h_filter = *h_filter;

    float filterSum = 0.f;

    // Create the weightings for the filter
    for (int r = -stencil/2; r <= stencil/2; ++r) {
        for (int c = -stencil/2; c <= stencil/2; ++c) 
            float filterValue = expf( -(float)(c * c + r * r) / (2.f * sigma * sigma));
            (*h_filter)[(r + stencil/2) * stencil + c + stencil/2] = filterValue;
            filterSum += filterValue;

    float normalise = 1.f / filterSum;

    for (int r = -stencil/2; r <= stencil/2; ++r) {
        for (int c = -stencil/2; c <= stencil/2; ++c) {
            (*h_filter)[(r + stencil/2) * stencil + c + stencil/2] *= normalise;

        // Alloacate memory for the channels
        cudaMalloc(d_redBlurred, sizeof(unsigned char) * numPixels);
        cudaMalloc(d_greenBlurred, sizeof(unsigned char) * numPixels);
        cudaMalloc(d_blueBlurred, sizeof(unsigned char) * numPixels);
        cudaMemset(*d_redBlurred, 0, sizeof(unsigned char) * numPixels);
        cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels);
        cudaMemset(*d_blueBlurred, 0, sizeof(unsigned char) * numPixels);