Reading cudaEGLFrame from host code (CUDA Programming) on TX1

Hi Folks,

I am using the sample cuda code “cudaHistogram” as a reference to implement a frame difference kernel. As a starting step just like “cudaHistogram” I am trying to access the data of live camera capture from within the kernel and assigning to a fixed value ( say 255). Now, I need to access the modified data (if it has successfully modified) from the host code which has triggered the parallel GPU run.

My Host code is shown below :

static bool execute()
{
    // Create the CameraProvider object
    UniqueObj<CameraProvider> cameraProvider(CameraProvider::create());
    ICameraProvider *iCameraProvider = interface_cast<ICameraProvider>(cameraProvider);
    if (!iCameraProvider)
        ORIGINATE_ERROR("Failed to create CameraProvider");

    // Get the camera devices.
    std::vector<CameraDevice*> cameraDevices;
    iCameraProvider->getCameraDevices(&cameraDevices);
    if (cameraDevices.size() == 0)
        ORIGINATE_ERROR("No cameras available");

    // Create the capture session using the first device.
    UniqueObj<CaptureSession> captureSession(
        iCameraProvider->createCaptureSession(cameraDevices[0]));
    ICaptureSession *iCaptureSession = interface_cast<ICaptureSession>(captureSession);
    if (!iCaptureSession)
        ORIGINATE_ERROR("Failed to create CaptureSession");

    printf("Creating output stream\n");
    UniqueObj<OutputStreamSettings> streamSettings(iCaptureSession->createOutputStreamSettings());
    IOutputStreamSettings *iStreamSettings = interface_cast<IOutputStreamSettings>(streamSettings);
    if (iStreamSettings)
    {
        iStreamSettings->setPixelFormat(PIXEL_FMT_YCbCr_420_888);
        iStreamSettings->setResolution(STREAM_SIZE);
    }
    UniqueObj<OutputStream> outputStream(iCaptureSession->createOutputStream(streamSettings.get()));
    IStream *iStream = interface_cast<IStream>(outputStream);
    if (!iStream)
        ORIGINATE_ERROR("Failed to create OutputStream");

    // Initialize and connect CUDA as the EGLStream consumer.
    PROPAGATE_ERROR(initCUDA(&g_cudaContext));
    CUresult cuResult;
    CUeglStreamConnection cudaConnection;
    printf("Connecting CUDA to OutputStream as an EGLStream consumer\n");
    cuResult = cuEGLStreamConsumerConnect(&cudaConnection, iStream->getEGLStream());
    if (cuResult != CUDA_SUCCESS)
    {
        ORIGINATE_ERROR("Unable to connect CUDA to EGLStream as a consumer (CUresult %s)",
            getCudaErrorString(cuResult));
    }

    // Create capture request and enable output stream.
    UniqueObj<Request> request(iCaptureSession->createRequest());
    IRequest *iRequest = interface_cast<IRequest>(request);
    if (!iRequest)
        ORIGINATE_ERROR("Failed to create Request");
    iRequest->enableOutputStream(outputStream.get());

    // Submit some captures and calculate the histogram with CUDA
   // UniquePointer<unsigned int> histogramData(new unsigned int[HISTOGRAM_BINS]);
   // if (!histogramData)
    //    ORIGINATE_ERROR("Failed to allocate histogram");
    for (unsigned int frame = 0; frame < FRAME_COUNT; ++frame)
    {
        /*
         * For simplicity this example submits a capture then waits for an output.
         * This pattern will not provide the best possible performance as the camera
         * stack runs in a pipeline, it is best to keep submitting as many captures as
         * possible prior to waiting for the result.
         */
        printf("Submitting a capture request\n");
        {
            Argus::Status status;
            const uint64_t ONE_SECOND = 1000000000;
            uint32_t result = iCaptureSession->capture(request.get(), ONE_SECOND, &status);
            if (result == 0)
                ORIGINATE_ERROR("Failed to submit capture request (status %x)", status);
        }

        printf("Acquiring an image from the EGLStream\n");
        CUgraphicsResource cudaResource = 0;
        CUstream cudaStream = 0;
        cuResult = cuEGLStreamConsumerAcquireFrame(&cudaConnection, &cudaResource, &cudaStream, -1);
        if (cuResult != CUDA_SUCCESS)
        {
            ORIGINATE_ERROR("Unable to acquire an image frame from the EGLStream with CUDA as a "
                "consumer (CUresult %s).", getCudaErrorString(cuResult));
        }

        // Get the CUDA EGL frame.
        CUeglFrame cudaEGLFrame;
        cuResult = cuGraphicsResourceGetMappedEglFrame(&cudaEGLFrame, cudaResource, 0, 0);
        if (cuResult != CUDA_SUCCESS)
        {
            ORIGINATE_ERROR("Unable to get the CUDA EGL frame (CUresult %s).",
                getCudaErrorString(cuResult));
        }

        // Print the information contained in the CUDA EGL frame structure.
        PROPAGATE_ERROR(printCUDAEGLFrame(cudaEGLFrame));

        if ((cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_PLANAR) &&
            (cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR) &&
            (cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_PLANAR) &&
            (cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR))
        {
            ORIGINATE_ERROR("Only YUV color formats are supported");
        }
        if (cudaEGLFrame.cuFormat != CU_AD_FORMAT_UNSIGNED_INT8)
            ORIGINATE_ERROR("Only 8-bit unsigned int formats are supported");

        // Create a surface from the luminance plane
        CUDA_RESOURCE_DESC cudaResourceDesc;
        memset(&cudaResourceDesc, 0, sizeof(cudaResourceDesc));
        cudaResourceDesc.resType = CU_RESOURCE_TYPE_ARRAY;
        cudaResourceDesc.res.array.hArray = cudaEGLFrame.frame.pArray[0];
        CUsurfObject cudaSurfObj = 0;
        cuResult = cuSurfObjectCreate(&cudaSurfObj, &cudaResourceDesc);
        if (cuResult != CUDA_SUCCESS)
        {
            ORIGINATE_ERROR("Unable to create the surface object (CUresult %s)",
                getCudaErrorString(cuResult));
        }

  
        float time = diff(cudaSurfObj, cudaEGLFrame.width, cudaEGLFrame.height);
        printf("Finished after %f ms.\n", time);


        cuResult = cuSurfObjectDestroy(cudaSurfObj);
        if (cuResult != CUDA_SUCCESS)
        {
            ORIGINATE_ERROR("Unable to destroy the surface object (CUresult %s)",
                getCudaErrorString(cuResult));
        }

        cuResult = cuEGLStreamConsumerReleaseFrame(&cudaConnection, cudaResource, &cudaStream);
        if (cuResult != CUDA_SUCCESS)
        {
            ORIGINATE_ERROR("Unable to release the last frame acquired from the EGLStream "
                "(CUresult %s).", getCudaErrorString(cuResult));
        }
    }

    printf("Cleaning up\n");

    // Disconnect the Argus producer from the stream.
    /// @todo: This is a WAR for a bug in cuEGLStreamConsumerDisconnect (see bug 200239336).
    outputStream.reset();

    cuResult = cuEGLStreamConsumerDisconnect(&cudaConnection);
    if (cuResult != CUDA_SUCCESS)
    {
        ORIGINATE_ERROR("Unable to disconnect CUDA as a consumer from EGLStream (CUresult %s)",
            getCudaErrorString(cuResult));
    }

    PROPAGATE_ERROR(cleanupCUDA(&g_cudaContext));

    printf("Done\n");

    return true;
}

My CUDA code is shown below :

#include "absdiff.h"



__global__ void diff_atomics(
    CUsurfObject surface,
    unsigned int width,
    unsigned int height)
{
    // global position and size
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int nx = blockDim.x * gridDim.x;
    int ny = blockDim.y * gridDim.y;

    // threads in workgroup
    int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1
    int nt = blockDim.x * blockDim.y; // total threads in workgroup

    // group index in 0..ngroups-1
    int g = blockIdx.x + blockIdx.y * gridDim.x;


    // process pixels (updates our group's partial histogram in smem)
    for (int col = x; col < width; col += nx)
    {
        for (int row = y; row < height; row += ny)
        {
            uchar1 data;
            surf2Dread(&data, surface, col, row);
		
	    data.x = 255;
	    

        }
    }

    __syncthreads();

}



float run_smem_atomics(
    CUsurfObject surface,
    unsigned int width,
    unsigned int height)
{
    dim3 block(32, 4);
    dim3 grid(16, 16);
 
    cudaEvent_t start;
    cudaEvent_t stop;

    cudaEventCreate(&stop);
    cudaEventCreate(&start);

    cudaEventRecord(start, 0);

    diff_atomics<<<grid, block>>>(
        surface,
        width,
        height
        );
 
    cudaEventRecord(stop, 0);

    cudaEventSynchronize(stop);
    float elapsed_millis;
    cudaEventElapsedTime(&elapsed_millis, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

 
    return elapsed_millis;
}


float diff(CUsurfObject surface ,  unsigned int width, unsigned int height)
{
    return run_smem_atomics(surface , width, height);
}

where absdiff.h is shown below:

#ifndef ABSDIFF_H
#define ABSDIFF_H

#include <cuda.h>

extern float diff(CUsurfObject surface ,  unsigned int width, unsigned int height);

#endif

Kindly help me out on how to access the modified data using cudaEGLFrame from host code (if its possible).

Thanks

Hi,

Sorry for that I’m not very clear about your question.
Could you share more information with us?

From your code, you launched kernel via

float time = diff(cudaSurfObj, cudaEGLFrame.width, cudaEGLFrame.height);

And overwrite the data to 255 with the kernel

So, does your problem is how to copy the overwrite data back to host? (CPU operation)
Or you want to launch another kernel code, and it also uses the overwrite data? (GPU operation)

Thanks.

Hi AastaLLL

Thanks for your help.

I am looking for a way to read the output of CUDA kernel from host (CPU). Such that we can see effect of what kernel did. In my experimental case it would convert all pixels of input image to white pixels (value 255).

Thanks,

Hi,

It’s not easy to read cuEGLFrame directly since you will need to consider the different data arrangement type.
If you just want to checking the results, an alternative is:

  1. Create a normal cuda array
  2. Launch kernel to copy data from cuEGLFrame to cuda array
  3. Copy CUDA array back to host via cudamemcpy()

For example:

/*
 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * Based on CUB histogram code: https://github.com/NVlabs/cub/tree/master/experimental/histogram
 */

#include "histogram.h"
#include <iostream>

// First-pass histogram kernel (binning into privatized counters)
template <
    int         NUM_PARTS,
    int         NUM_BINS>
__global__ void histogram_smem_atomics(
    CUsurfObject surface,
    unsigned int width,
    unsigned int height,
    unsigned int *out)
{
    // global position and size
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int nx = blockDim.x * gridDim.x;
    int ny = blockDim.y * gridDim.y;

    // threads in workgroup
    int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1
    int nt = blockDim.x * blockDim.y; // total threads in workgroup

    // group index in 0..ngroups-1
    int g = blockIdx.x + blockIdx.y * gridDim.x;

    // initialize smem
    __shared__ unsigned int smem[NUM_BINS];
    for (int i = t; i < NUM_BINS; i += nt)
        smem[i] = 0;

    // process pixels (updates our group's partial histogram in smem)
    for (int col = x; col < width; col += nx)
    {
        for (int row = y; row < height; row += ny)
        {
            uchar1 data;
            surf2Dread(&data, surface, col, row);

            atomicAdd(&smem[((unsigned int)data.x) % NUM_BINS], 1);
        }
    }

    __syncthreads();

    // move to our workgroup's slice of output
    out += g * NUM_PARTS;

    // store local output to global
    for (int i = t; i < NUM_BINS; i += nt)
    {
        out[i] = smem[i];
    }
}

__global__ void checkData(
    CUsurfObject surface,
    unsigned int width,
    unsigned int height,
    int *res)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x<width && y < height)
    {
        uchar1 data;
        surf2Dread(&data, surface, x, y);
        res[y*width+x] = data.x;    
    }
}

// Second pass histogram kernel (accumulation)
template <
    int         NUM_PARTS,
    int         NUM_BINS>
__global__ void histogram_smem_accum(
    const unsigned int *in,
    int n,
    unsigned int *out)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i > NUM_BINS)
        return; // out of range

    unsigned int total = 0;
    for (int j = 0; j < n; j++)
        total += in[i + NUM_PARTS * j];

    out[i] = total;
}

template <
    int         NUM_BINS>
float run_smem_atomics(
    CUsurfObject surface,
    unsigned int width,
    unsigned int height,
    unsigned int *h_hist)
{
    enum
    {
        NUM_PARTS = 1024
    };

    dim3 block(32, 4);
    dim3 grid(16, 16);
    int total_blocks = grid.x * grid.y;

    // allocate device histogram
    unsigned int *d_hist;
    cudaMalloc(&d_hist, NUM_BINS * sizeof(unsigned int));
    // allocate partial histogram
    unsigned int *d_part_hist;
    cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));

    dim3 block2(128);
    dim3 grid2((NUM_BINS + block.x - 1) / block.x);

    cudaEvent_t start;
    cudaEvent_t stop;

    cudaEventCreate(&stop);
    cudaEventCreate(&start);

    cudaEventRecord(start, 0);

    histogram_smem_atomics<NUM_PARTS, NUM_BINS><<<grid, block>>>(
        surface,
        width,
        height,
        d_part_hist);

    histogram_smem_accum<NUM_PARTS, NUM_BINS><<<grid2, block2>>>(
        d_part_hist,
        total_blocks,
        d_hist);

    cudaEventRecord(stop, 0);

    cudaEventSynchronize(stop);
    float elapsed_millis;
    cudaEventElapsedTime(&elapsed_millis, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    int *h_res, *d_res;
    h_res = (int*)malloc(width*height*sizeof(int));
    cudaMalloc(&d_res, width*height*sizeof(int));
    checkData<<<(width/block.x+1,height/block.y+1), block>>>(surface, width, height, d_res);
    cudaMemcpy(h_res, d_res, width*height*sizeof(int), cudaMemcpyDeviceToHost);
    for(int i = 0; i < 10; i++) std::cout << "results["<<i<<"]: "<<h_res[i]<<std::endl;

    cudaMemcpy(h_hist, d_hist, NUM_BINS * sizeof(unsigned int), cudaMemcpyDeviceToHost);

    cudaFree(d_part_hist);
    cudaFree(d_hist);

    return elapsed_millis;
}

float histogram(CUsurfObject surface, unsigned int width, unsigned int height,
    unsigned int *histogram)
{
    return run_smem_atomics<HISTOGRAM_BINS>(surface, width, height, histogram);
}