I am writing a stitching application that needs 6 camera inputs processed with CUDA.
Currently I am able to get the 6 buffers on the CUDA side, but I am seeing frame drops on some of the camera threads and would like to know whether the implementation is optimal. I am using 6 Raspberry Pi CSI cameras on a TX1. When I compute the FPS I get the output below, and I would like to know whether the drops are caused by the way I assign CPU cores to the threads.
FPS = 33.485615 Cam Index = 5
FPS = 33.461558 Cam Index = 4
FPS = 26.813825 Cam Index = 2
FPS = 26.443207 Cam Index = 1
FPS = 26.282621 Cam Index = 0
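For reference, the core assignment I use for the consumer threads is isolated in the minimal sketch below. worker() is just a stand-in for the real consumer function and NUM_THREADS simply mirrors NUM_CAMERAS; the CPU_ZERO / CPU_SET / pthread_attr_setaffinity_np lines are the same as in the full source further down.

#include <pthread.h>
#include <sched.h> // cpu_set_t, CPU_ZERO, CPU_SET (GNU extensions)
#include <stdio.h>

#define NUM_THREADS 6

// Stand-in for the real per-camera consumer thread.
static void *worker(void *arg)
{
    int index = *((int *)arg);
    printf("thread %d started\n", index);
    return NULL;
}

int main()
{
    pthread_t threads[NUM_THREADS];
    int num[NUM_THREADS] = {0, 1, 2, 3, 4, 5};
    int rc[NUM_THREADS];
    pthread_attr_t attr;
    cpu_set_t cpus;
    pthread_attr_init(&attr);
    for (int i = 0; i < NUM_THREADS; i++)
    {
        // Same mapping as in the full source: thread i is pinned to core (i % 4) + 1.
        CPU_ZERO(&cpus);
        CPU_SET((i % 4) + 1, &cpus);
        pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpus);
        rc[i] = pthread_create(&threads[i], &attr, worker, (void *)&num[i]);
        if (rc[i] != 0)
            fprintf(stderr, "pthread_create(%d) failed: %d\n", i, rc[i]);
    }
    for (int i = 0; i < NUM_THREADS; i++)
    {
        if (rc[i] == 0)
            pthread_join(threads[i], NULL);
    }
    pthread_attr_destroy(&attr);
    return 0;
}

The full source is below: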
/*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <Argus/Argus.h>
#include <unistd.h>
#include "Error.h"
#include "UniquePointer.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudaEGL.h>
#include "CUDAHelper.h"
#include <pthread.h>
#include <sched.h> // cpu_set_t, CPU_ZERO, CPU_SET
#include <sys/time.h>
#define PRODUCER_PRINT(...) printf("PRODUCER: " __VA_ARGS__)
#define CONSUMER_PRINT(...) printf("CONSUMER: " __VA_ARGS__)
using namespace Argus;
#define NUM_CAMERAS 6
#define SENSOR_MODE 2
#define PREVIEW_WIDTH 1640
#define PREVIEW_HEIGHT 1232
#define gpuErrchk(ans) \
{ \
gpuAssert((ans), __FILE__, __LINE__); \
}
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort)
exit(code);
}
}
// Constants
static const Size STREAM_SIZE (PREVIEW_WIDTH, PREVIEW_HEIGHT);
static const uint32_t FRAME_COUNT = 200;
// Capture duration in seconds
#define CAPTURE_TIME 10
namespace ArgusSamples
{
typedef struct CudaConsumerHandle
{
int m_numCameras;
CUcontext g_cudaContext = 0;
IStream **outputStreams;
CUresult *cuResult;
CUeglStreamConnection *cudaConnection;
CUgraphicsResource *cudaResource;
CUstream *cudaStream;
CUeglFrame *cudaEGLFrame;
uint8_t **cuda_buffers;
cudaArray **cuda_arrays;
int m_width;
int m_height;
pthread_mutex_t *lock;
} CudaConsumerHandle;
CudaConsumerHandle cudaConsumer;
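// Allocates per-camera state, connects CUDA as the EGLStream consumer for every
// output stream, and allocates one device buffer per camera.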
bool InitCudaConsumer(CudaConsumerHandle *p, IStream ** OutputStreams, int num_cameras, int width, int height)
{
p->m_numCameras = num_cameras;
p->m_width = width;
p->m_height = height;
p->outputStreams = (IStream **)malloc(num_cameras * sizeof(IStream *));
for(int i = 0 ; i < num_cameras; i++)
{
p->outputStreams[i] = OutputStreams[i];
}
p->lock = (pthread_mutex_t *)malloc(num_cameras * sizeof(pthread_mutex_t));
p->cudaConnection = (CUeglStreamConnection *)malloc(num_cameras * sizeof(CUeglStreamConnection));
p->cudaResource = (CUgraphicsResource *)malloc(num_cameras * sizeof(CUgraphicsResource));
p->cudaStream = (CUstream *)malloc(num_cameras * sizeof(CUstream));
p->cuResult = (CUresult *)malloc(num_cameras * sizeof(CUresult));
p->cudaEGLFrame = (CUeglFrame *)malloc(num_cameras * sizeof(CUeglFrame));
for(int i = 0; i < num_cameras ; i++)
pthread_mutex_init(&p->lock[i], NULL);
PROPAGATE_ERROR(initCUDA(&p->g_cudaContext));
printf("Connecting CUDA to OutputStream as an EGLStream consumer\n");
for(int i = 0 ; i < p->m_numCameras ; i++)
{
p->cuResult[i] = cuEGLStreamConsumerConnect(&p->cudaConnection[i], p->outputStreams[i]->getEGLStream());
if (p->cuResult[i] != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to connect CUDA to EGLStream as a consumer (CUresult %s)",
getCudaErrorString(p->cuResult[i]));
}
}
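// One contiguous device buffer per camera, sized for YUV420 (1.5 bytes per pixel).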
p->cuda_buffers = (uint8_t **)malloc(num_cameras * sizeof(uint8_t *));
p->cuda_arrays = (cudaArray **)malloc(num_cameras * sizeof(cudaArray *));
for(int i = 0 ; i < p->m_numCameras ; i++)
{
cudaMalloc((void **)&p->cuda_buffers[i], p->m_width * p->m_height * 1.5 * sizeof(uint8_t));
}
return true;
}
bool CudaConsumerDisconnect(CudaConsumerHandle *p)
{
printf("Cleaning up cuda consumer\n");
// Disconnect the Argus producer from the stream.
/// @todo: This is a WAR for a bug in cuEGLStreamConsumerDisconnect (see bug 200239336).
for(int i = 0 ; i < p->m_numCameras ; i++)
{
//outputStream[i].reset();
p->cuResult[i] = cuEGLStreamConsumerDisconnect(&p->cudaConnection[i]);
if (p->cuResult[i] != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to disconnect CUDA as a consumer from EGLStream (CUresult %s)",
getCudaErrorString(p->cuResult[i]));
}
}
PROPAGATE_ERROR(cleanupCUDA(&p->g_cudaContext));
return true;
}
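// Per-camera consumer thread: acquires FRAME_COUNT frames from its EGLStream,
// maps each one as a CUeglFrame, and prints the average FPS at the end.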
void *CudaConsumerThreadRun(void *index)
{
int camera_index = *((int *)index);
CudaConsumerHandle *p = (CudaConsumerHandle *)(&cudaConsumer);
CONSUMER_PRINT("Waiting for Argus producer to connect to output stream.\n");
printf("Camera Index = %d ", camera_index);
fflush(stdout);
p->outputStreams[camera_index]->waitUntilConnected();
fflush(stdout);
struct timeval tv1, tv2;
double time = 0.0;
gettimeofday(&tv1, NULL);
for(unsigned int frame = 0; frame < FRAME_COUNT; ++frame)
{
/*
* The producer keeps this stream fed through the repeating capture request
* submitted in execute(), so each iteration simply acquires the next frame
* from the EGLStream, maps it as a CUeglFrame, checks its format, and
* releases it back to the stream.
*/
//printf("Acquiring an image from the EGLStream\n");
fflush(stdout);
p->cuResult[camera_index] = cuEGLStreamConsumerAcquireFrame(&p->cudaConnection[camera_index], &p->cudaResource[camera_index], &p->cudaStream[camera_index], -1);
if (p->cuResult[camera_index] != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to acquire an image frame from the EGLStream with CUDA as a "
"consumer (CUresult %s).", getCudaErrorString(p->cuResult[camera_index]));
}
// Get the CUDA EGL frame.
p->cuResult[camera_index] = cuGraphicsResourceGetMappedEglFrame(&p->cudaEGLFrame[camera_index], p->cudaResource[camera_index], 0, 0);
if (p->cuResult[camera_index] != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to get the CUDA EGL frame (CUresult %s).",
getCudaErrorString(p->cuResult[camera_index]));
}
// Print the information contained in the CUDA EGL frame structure.
//PROPAGATE_ERROR(printCUDAEGLFrame(p->cudaEGLFrame[camera_index]));
if ((p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_PLANAR) &&
(p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR) &&
(p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_PLANAR) &&
(p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR))
{
ORIGINATE_ERROR("Only YUV color formats are supported");
}
if (p->cudaEGLFrame[camera_index].cuFormat != CU_AD_FORMAT_UNSIGNED_INT8)
ORIGINATE_ERROR("Only 8-bit unsigned int formats are supported");
//pthread_mutex_lock(&p->lock[camera_index]);
cudaArray *arr_y = (struct cudaArray *)p->cudaEGLFrame[camera_index].frame.pArray[0];
cudaArray *arr_u = (struct cudaArray *)p->cudaEGLFrame[camera_index].frame.pArray[1];
cudaArray *arr_v = (struct cudaArray *)p->cudaEGLFrame[camera_index].frame.pArray[2];
//pthread_mutex_unlock(&p->lock[camera_index]);
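// Plane offsets inside the contiguous YUV420 buffer: U follows Y, V follows U.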
int u_offset = p->m_width * p->m_height;
int v_offset = p->m_width * p->m_height * 1.25;
//gpuErrchk(cudaMemcpyFromArray(p->cuda_buffers[camera_index], arr_y, 0, 0, p->m_width * p->m_height , cudaMemcpyDeviceToDevice));
//gpuErrchk(cudaMemcpyFromArray(p->cuda_buffers[camera_index] + u_offset, arr_u, 0, 0, p->m_width * p->m_height *0.25 , cudaMemcpyDeviceToDevice));
//gpuErrchk(cudaMemcpyFromArray(p->cuda_buffers[camera_index] + v_offset, arr_v, 0, 0, p->m_width * p->m_height * 0.25 , cudaMemcpyDeviceToDevice));
p->cuResult[camera_index] = cuEGLStreamConsumerReleaseFrame(&p->cudaConnection[camera_index], p->cudaResource[camera_index], &p->cudaStream[camera_index]);
if (p->cuResult[camera_index] != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to release the last frame acquired from the EGLStream "
"(CUresult %s).", getCudaErrorString(p->cuResult[camera_index]));
}
}
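// Elapsed wall-clock time in seconds for FRAME_COUNT frames.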
gettimeofday(&tv2, NULL);
time = (tv2.tv_sec - tv1.tv_sec)*1000 + (tv2.tv_usec - tv1.tv_usec)/(1000.0);
time/=1000.0;
printf("FPS = %f Cam Index = %d \n", FRAME_COUNT/time, camera_index);
fflush(stdout);
return NULL;
}
bool execute()
{
// Create the CameraProvider object
UniqueObj<CameraProvider> cameraProvider(CameraProvider::create());
ICameraProvider *iCameraProvider = interface_cast<ICameraProvider>(cameraProvider);
if (!iCameraProvider)
ORIGINATE_ERROR("Failed to create CameraProvider");
// Get the camera devices.
std::vector<CameraDevice*> cameraDevices;
iCameraProvider->getCameraDevices(&cameraDevices);
if (cameraDevices.size() < NUM_CAMERAS)
ORIGINATE_ERROR("Not enough cameras available");
UniqueObj<CaptureSession>captureSession[NUM_CAMERAS];
ICaptureSession *iCaptureSession[NUM_CAMERAS];
for(int i = 0 ; i < NUM_CAMERAS ; i++)
{
captureSession[i] = UniqueObj<CaptureSession>(iCameraProvider->createCaptureSession(cameraDevices[i]));
iCaptureSession[i] = interface_cast<ICaptureSession>(captureSession[i]);
if (!iCaptureSession[i])
ORIGINATE_ERROR("Failed to create CaptureSession");
}
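// Query sensor modes from the first camera; mode index 1 is selected below
// (the SENSOR_MODE define above is not used here).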
CameraDevice *cameraDevice = cameraDevices[0];
ICameraProperties *iCameraProperties = interface_cast<ICameraProperties>(cameraDevice);
std::vector<SensorMode*> sensorModes;
iCameraProperties->getSensorModes(&sensorModes);
if (sensorModes.size() == 0)
ORIGINATE_ERROR("Failed to get sensor modes");
SensorMode *sensorMode = sensorModes[1];
printf("Creating output streams for cuda\n");
UniqueObj<OutputStreamSettings> streamSettings[NUM_CAMERAS];
IOutputStreamSettings *iStreamSettings[NUM_CAMERAS];
for(int i = 0 ; i < NUM_CAMERAS ; i++)
{
streamSettings[i] = UniqueObj<OutputStreamSettings>(iCaptureSession[i]->createOutputStreamSettings());
iStreamSettings[i] = interface_cast<IOutputStreamSettings>(streamSettings[i]);
if (iStreamSettings[i])
{
iStreamSettings[i]->setPixelFormat(PIXEL_FMT_YCbCr_420_888);
iStreamSettings[i]->setResolution(STREAM_SIZE);
}
}
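// Create one EGL output stream per capture session; CUDA connects to these as the consumer.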
UniqueObj<OutputStream> outputStream[NUM_CAMERAS];
IStream *iStream[NUM_CAMERAS];
for(int i = 0 ; i < NUM_CAMERAS ; i++)
{
outputStream[i] = UniqueObj<OutputStream>(iCaptureSession[i]->createOutputStream(streamSettings[i].get()));
iStream[i] = interface_cast<IStream>(outputStream[i]);
if (!iStream[i])
ORIGINATE_ERROR("Failed to create OutputStream");
}
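// Connect CUDA as the EGLStream consumer for all streams and prepare the
// thread attribute used for CPU pinning.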
pthread_t ConsumerThread[NUM_CAMERAS];
InitCudaConsumer(&cudaConsumer, iStream, NUM_CAMERAS, PREVIEW_WIDTH, PREVIEW_HEIGHT);
pthread_attr_t attr_;
cpu_set_t cpus_;
pthread_attr_init(&attr_);
// Create a capture request per session, set its sensor mode, and enable its output stream.
UniqueObj<Request> request[NUM_CAMERAS];
IRequest *iRequest[NUM_CAMERAS];
for(int i = 0 ; i < NUM_CAMERAS ; i++)
{
request[i] = UniqueObj<Request>(iCaptureSession[i]->createRequest());
iRequest[i] = interface_cast<IRequest>(request[i]);
if (!iRequest[i])
ORIGINATE_ERROR("Failed to create Request");
ISourceSettings *sourceSettings = interface_cast<ISourceSettings>(iRequest[i]->getSourceSettings());
sourceSettings->setSensorMode(sensorMode);
iRequest[i]->enableOutputStream(outputStream[i].get());
}
// Start a repeating capture request on every session.
for(int i = 0 ; i < NUM_CAMERAS ; i++)
{
if (iCaptureSession[i]->repeat(request[i].get()) != STATUS_OK)
ORIGINATE_ERROR("Failed to start repeat capture request for preview");
}
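// Launch one consumer thread per camera, pinning thread i to core (i % 4) + 1.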
int num[] = {0, 1, 2, 3, 4, 5};
for(int i = 0 ; i < NUM_CAMERAS ; i++)
{
CPU_ZERO(&cpus_);
CPU_SET( (i%4) + 1, &cpus_);
pthread_attr_setaffinity_np(&attr_, sizeof(cpu_set_t), &cpus_);
pthread_create(&ConsumerThread[i], &attr_, CudaConsumerThreadRun, (void *)(&num[i]));
}
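// Let the cameras stream for CAPTURE_TIME seconds while the consumer threads run.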
usleep(CAPTURE_TIME*1000000);
for(int i = 0 ; i < NUM_CAMERAS ; i++)
{
iCaptureSession[i]->stopRepeat();
iCaptureSession[i]->waitForIdle();
}
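// Disconnect the streams so any pending acquire returns, then wait for the consumer threads to exit.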
for(int i = 0 ; i < NUM_CAMERAS ; i++)
{
iStream[i]->disconnect();
pthread_join(ConsumerThread[i], NULL);
}
for(int i = 0 ; i < NUM_CAMERAS ; i++)
outputStream[i].reset();
CudaConsumerDisconnect(&cudaConsumer);
printf("Done\n");
return true;
}
} // namespace ArgusSamples
int main(int argc, const char *argv[])
{
if (!ArgusSamples::execute())
return EXIT_FAILURE;
return EXIT_SUCCESS;
}