Hi Folks,
I am using the sample cuda code “cudaHistogram” as a reference to implement a frame difference kernel. As a starting step just like “cudaHistogram” I am trying to access the data of live camera capture from within the kernel and assigning to a fixed value ( say 255). Now, I need to access the modified data (if it has successfully modified) from the host code which has triggered the parallel GPU run.
My Host code is shown below :
static bool execute()
{
// Create the CameraProvider object
UniqueObj<CameraProvider> cameraProvider(CameraProvider::create());
ICameraProvider *iCameraProvider = interface_cast<ICameraProvider>(cameraProvider);
if (!iCameraProvider)
ORIGINATE_ERROR("Failed to create CameraProvider");
// Get the camera devices.
std::vector<CameraDevice*> cameraDevices;
iCameraProvider->getCameraDevices(&cameraDevices);
if (cameraDevices.size() == 0)
ORIGINATE_ERROR("No cameras available");
// Create the capture session using the first device.
UniqueObj<CaptureSession> captureSession(
iCameraProvider->createCaptureSession(cameraDevices[0]));
ICaptureSession *iCaptureSession = interface_cast<ICaptureSession>(captureSession);
if (!iCaptureSession)
ORIGINATE_ERROR("Failed to create CaptureSession");
printf("Creating output stream\n");
UniqueObj<OutputStreamSettings> streamSettings(iCaptureSession->createOutputStreamSettings());
IOutputStreamSettings *iStreamSettings = interface_cast<IOutputStreamSettings>(streamSettings);
if (iStreamSettings)
{
iStreamSettings->setPixelFormat(PIXEL_FMT_YCbCr_420_888);
iStreamSettings->setResolution(STREAM_SIZE);
}
UniqueObj<OutputStream> outputStream(iCaptureSession->createOutputStream(streamSettings.get()));
IStream *iStream = interface_cast<IStream>(outputStream);
if (!iStream)
ORIGINATE_ERROR("Failed to create OutputStream");
// Initialize and connect CUDA as the EGLStream consumer.
PROPAGATE_ERROR(initCUDA(&g_cudaContext));
CUresult cuResult;
CUeglStreamConnection cudaConnection;
printf("Connecting CUDA to OutputStream as an EGLStream consumer\n");
cuResult = cuEGLStreamConsumerConnect(&cudaConnection, iStream->getEGLStream());
if (cuResult != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to connect CUDA to EGLStream as a consumer (CUresult %s)",
getCudaErrorString(cuResult));
}
// Create capture request and enable output stream.
UniqueObj<Request> request(iCaptureSession->createRequest());
IRequest *iRequest = interface_cast<IRequest>(request);
if (!iRequest)
ORIGINATE_ERROR("Failed to create Request");
iRequest->enableOutputStream(outputStream.get());
// Submit some captures and calculate the histogram with CUDA
// UniquePointer<unsigned int> histogramData(new unsigned int[HISTOGRAM_BINS]);
// if (!histogramData)
// ORIGINATE_ERROR("Failed to allocate histogram");
for (unsigned int frame = 0; frame < FRAME_COUNT; ++frame)
{
/*
* For simplicity this example submits a capture then waits for an output.
* This pattern will not provide the best possible performance as the camera
* stack runs in a pipeline, it is best to keep submitting as many captures as
* possible prior to waiting for the result.
*/
printf("Submitting a capture request\n");
{
Argus::Status status;
const uint64_t ONE_SECOND = 1000000000;
uint32_t result = iCaptureSession->capture(request.get(), ONE_SECOND, &status);
if (result == 0)
ORIGINATE_ERROR("Failed to submit capture request (status %x)", status);
}
printf("Acquiring an image from the EGLStream\n");
CUgraphicsResource cudaResource = 0;
CUstream cudaStream = 0;
cuResult = cuEGLStreamConsumerAcquireFrame(&cudaConnection, &cudaResource, &cudaStream, -1);
if (cuResult != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to acquire an image frame from the EGLStream with CUDA as a "
"consumer (CUresult %s).", getCudaErrorString(cuResult));
}
// Get the CUDA EGL frame.
CUeglFrame cudaEGLFrame;
cuResult = cuGraphicsResourceGetMappedEglFrame(&cudaEGLFrame, cudaResource, 0, 0);
if (cuResult != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to get the CUDA EGL frame (CUresult %s).",
getCudaErrorString(cuResult));
}
// Print the information contained in the CUDA EGL frame structure.
PROPAGATE_ERROR(printCUDAEGLFrame(cudaEGLFrame));
if ((cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_PLANAR) &&
(cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR) &&
(cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_PLANAR) &&
(cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR))
{
ORIGINATE_ERROR("Only YUV color formats are supported");
}
if (cudaEGLFrame.cuFormat != CU_AD_FORMAT_UNSIGNED_INT8)
ORIGINATE_ERROR("Only 8-bit unsigned int formats are supported");
// Create a surface from the luminance plane
CUDA_RESOURCE_DESC cudaResourceDesc;
memset(&cudaResourceDesc, 0, sizeof(cudaResourceDesc));
cudaResourceDesc.resType = CU_RESOURCE_TYPE_ARRAY;
cudaResourceDesc.res.array.hArray = cudaEGLFrame.frame.pArray[0];
CUsurfObject cudaSurfObj = 0;
cuResult = cuSurfObjectCreate(&cudaSurfObj, &cudaResourceDesc);
if (cuResult != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to create the surface object (CUresult %s)",
getCudaErrorString(cuResult));
}
float time = diff(cudaSurfObj, cudaEGLFrame.width, cudaEGLFrame.height);
printf("Finished after %f ms.\n", time);
cuResult = cuSurfObjectDestroy(cudaSurfObj);
if (cuResult != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to destroy the surface object (CUresult %s)",
getCudaErrorString(cuResult));
}
cuResult = cuEGLStreamConsumerReleaseFrame(&cudaConnection, cudaResource, &cudaStream);
if (cuResult != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to release the last frame acquired from the EGLStream "
"(CUresult %s).", getCudaErrorString(cuResult));
}
}
printf("Cleaning up\n");
// Disconnect the Argus producer from the stream.
/// @todo: This is a WAR for a bug in cuEGLStreamConsumerDisconnect (see bug 200239336).
outputStream.reset();
cuResult = cuEGLStreamConsumerDisconnect(&cudaConnection);
if (cuResult != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to disconnect CUDA as a consumer from EGLStream (CUresult %s)",
getCudaErrorString(cuResult));
}
PROPAGATE_ERROR(cleanupCUDA(&g_cudaContext));
printf("Done\n");
return true;
}
My CUDA code is shown below :
#include "absdiff.h"
__global__ void diff_atomics(
CUsurfObject surface,
unsigned int width,
unsigned int height)
{
// global position and size
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int nx = blockDim.x * gridDim.x;
int ny = blockDim.y * gridDim.y;
// threads in workgroup
int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1
int nt = blockDim.x * blockDim.y; // total threads in workgroup
// group index in 0..ngroups-1
int g = blockIdx.x + blockIdx.y * gridDim.x;
// process pixels (updates our group's partial histogram in smem)
for (int col = x; col < width; col += nx)
{
for (int row = y; row < height; row += ny)
{
uchar1 data;
surf2Dread(&data, surface, col, row);
data.x = 255;
}
}
__syncthreads();
}
float run_smem_atomics(
CUsurfObject surface,
unsigned int width,
unsigned int height)
{
dim3 block(32, 4);
dim3 grid(16, 16);
cudaEvent_t start;
cudaEvent_t stop;
cudaEventCreate(&stop);
cudaEventCreate(&start);
cudaEventRecord(start, 0);
diff_atomics<<<grid, block>>>(
surface,
width,
height
);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsed_millis;
cudaEventElapsedTime(&elapsed_millis, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return elapsed_millis;
}
float diff(CUsurfObject surface , unsigned int width, unsigned int height)
{
return run_smem_atomics(surface , width, height);
}
where absdiff.h is shown below:
#ifndef ABSDIFF_H
#define ABSDIFF_H
#include <cuda.h>
extern float diff(CUsurfObject surface , unsigned int width, unsigned int height);
#endif
Kindly help me out on how to access the modified data using cudaEGLFrame from host code (if its possible).
Thanks