CUDA consumption from ARGUS API 6 Cameras

I am writing a stitching application that requires 6 camera inputs to be processed with CUDA.
Currently I am able to get all 6 buffers on the CUDA side. However, I am experiencing frame drops on some camera threads, and I would like to know whether my implementation is optimal. I am using 6 Raspberry Pi CSI cameras on a TX1. I get the following output when I compute the FPS, and I would like to know if the drops are due to the way I assign CPU cores to the threads.

FPS = 33.485615 Cam Index = 5
FPS = 33.461558 Cam Index = 4
FPS = 26.813825 Cam Index = 2
FPS = 26.443207 Cam Index = 1
FPS = 26.282621 Cam Index = 0

/*
 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <Argus/Argus.h>
#include <unistd.h>
#include "Error.h"
#include "UniquePointer.h"

#include <cuda.h>
#include <cuda_runtime.h>
#include <cudaEGL.h>

#include "CUDAHelper.h"
#include <pthread.h>
#include <sys/time.h>

#define PRODUCER_PRINT(...) printf("PRODUCER: " __VA_ARGS__)
#define CONSUMER_PRINT(...) printf("CONSUMER: " __VA_ARGS__)
using namespace Argus;
#define NUM_CAMERAS 6
#define SENSOR_MODE 2
#define PREVIEW_WIDTH 1640
#define PREVIEW_HEIGHT 1232

// gpuErrchk: wrap any CUDA runtime call to report failures with source location.
// Wrapped in do/while(0) so the macro expands safely inside unbraced if/else.
#define gpuErrchk(ans)                            \
    do                                            \
    {                                             \
        gpuAssert((ans), __FILE__, __LINE__);     \
    } while (0)

/**
 * Print a CUDA runtime error with its source location, and optionally abort.
 *
 * @param code  Result of a CUDA runtime API call.
 * @param file  Source file of the call site (supplied by gpuErrchk).
 * @param line  Source line of the call site (supplied by gpuErrchk).
 * @param abort When true (default), exit the process with the error code.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        // BUG FIX: the original printed this identical message twice in a row.
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort)
            exit(code);
    }
}



// Constants
static const Size     STREAM_SIZE (PREVIEW_WIDTH, PREVIEW_HEIGHT);
static const uint32_t FRAME_COUNT = 200;

// Global variables

#define CAPTURE_TIME 10

namespace ArgusSamples
{
	
// Shared state for all CUDA EGLStream consumer threads.  One slot per camera
// in every array; all arrays are allocated in InitCudaConsumer() and indexed
// by camera index from the per-camera threads.
typedef struct CudaConsumerHandle
{
	int m_numCameras;            // number of cameras / array length
	
	CUcontext g_cudaContext = 0; // single CUDA context shared by all threads
	
	IStream **outputStreams;     // Argus output streams (one per camera, not owned)
	CUresult *cuResult;          // last CUDA driver-API result per camera
    CUeglStreamConnection *cudaConnection; // EGLStream consumer connections
    CUgraphicsResource *cudaResource;      // currently-acquired frame resource per camera
    CUstream *cudaStream;                  // CUDA stream used for acquire/release per camera
    CUeglFrame *cudaEGLFrame;              // mapped EGL frame descriptor per camera
	
	
	uint8_t **cuda_buffers;      // device YUV420 staging buffers (w*h*3/2 bytes each)
	cudaArray **cuda_arrays;     // allocated but unused in this sample
	
	int m_width;                 // stream width in pixels
	int m_height;                // stream height in pixels
	
	pthread_mutex_t *lock;       // per-camera mutexes (currently unused; copies are commented out)
	
		
		
}CudaConsumerHandle;
	
// Single global instance shared between execute() and the consumer threads.
CudaConsumerHandle cudaConsumer;

	
/**
 * Allocate and initialize the shared consumer state: copy the stream pointers,
 * create the CUDA context, connect CUDA as the EGLStream consumer for every
 * camera, and allocate one device-side YUV420 staging buffer per camera.
 *
 * @param OutputStreams Array of num_cameras Argus IStream pointers (borrowed).
 * @param num_cameras   Number of cameras.
 * @param width,height  Stream resolution; staging buffers are w*h*3/2 bytes.
 * @return true on success; propagates/originates an error otherwise.
 */
bool InitCudaConsumer(CudaConsumerHandle *p, IStream ** OutputStreams, int num_cameras, int width, int height)
{
	p->m_numCameras = num_cameras;
	p->m_width = width;
	p->m_height = height;

	p->outputStreams = (IStream **)malloc(num_cameras * sizeof(IStream *));
	for(int i = 0 ; i < num_cameras; i++)
		p->outputStreams[i] = OutputStreams[i];

	p->lock = (pthread_mutex_t *)malloc(num_cameras * sizeof(pthread_mutex_t));
	p->cudaConnection = (CUeglStreamConnection *)malloc(num_cameras * sizeof(CUeglStreamConnection));
	p->cudaResource = (CUgraphicsResource *)malloc(num_cameras * sizeof(CUgraphicsResource));
	p->cudaStream = (CUstream *)malloc(num_cameras * sizeof(CUstream));
	p->cuResult = (CUresult *)malloc(num_cameras * sizeof(CUresult));
	p->cudaEGLFrame = (CUeglFrame *)malloc(num_cameras * sizeof(CUeglFrame));

	for(int i = 0; i < num_cameras ; i++)
		pthread_mutex_init(&p->lock[i], NULL);

	PROPAGATE_ERROR(initCUDA(&p->g_cudaContext));

	printf("Connecting CUDA to OutputStream as an EGLStream consumer\n");

	for(int i = 0 ; i < p->m_numCameras ; i++)
	{
		p->cuResult[i] = cuEGLStreamConsumerConnect(&p->cudaConnection[i], p->outputStreams[i]->getEGLStream());
		if (p->cuResult[i] != CUDA_SUCCESS)
		{
			ORIGINATE_ERROR("Unable to connect CUDA to EGLStream as a consumer (CUresult %s)",
				getCudaErrorString(p->cuResult[i]));
		}
	}

	p->cuda_buffers = (uint8_t **)malloc(num_cameras * sizeof(uint8_t *));
	p->cuda_arrays = (cudaArray **)malloc(num_cameras * sizeof(cudaArray *));

	// One YUV420 buffer per camera: Y plane (w*h) + U + V (w*h/4 each).
	// Use integer arithmetic (3/2) instead of the original double multiply (1.5).
	size_t bufferBytes = (size_t)p->m_width * p->m_height * 3 / 2;
	for(int i = 0 ; i < p->m_numCameras ; i++)
	{
		// BUG FIX: the original ignored the cudaMalloc return value.
		gpuErrchk(cudaMalloc((void **)&p->cuda_buffers[i], bufferBytes));
	}

	// BUG FIX: the original bool function fell off the end without returning
	// a value, which is undefined behavior in C++.
	return true;
}

/**
 * Disconnect CUDA from every per-camera EGLStream, release the device staging
 * buffers, tear down the CUDA context, and free all state allocated by
 * InitCudaConsumer().
 *
 * @return true on success; propagates/originates an error otherwise.
 */
bool CudaConsumerDisconnect(CudaConsumerHandle *p)
{
    printf("Cleaning up cuda consumer\n");

    // Disconnect the Argus producer from the stream.
    /// @todo: This is a WAR for a bug in cuEGLStreamConsumerDisconnect (see bug 200239336).
    for(int i = 0 ; i < p->m_numCameras ; i++)
    {
		p->cuResult[i] = cuEGLStreamConsumerDisconnect(&p->cudaConnection[i]);
		if (p->cuResult[i] != CUDA_SUCCESS)
		{
			ORIGINATE_ERROR("Unable to disconnect CUDA as a consumer from EGLStream (CUresult %s)",
				getCudaErrorString(p->cuResult[i]));
		}
	}

    // BUG FIX: free the per-camera device buffers while the CUDA context is
    // still alive; the original leaked them.
    for(int i = 0 ; i < p->m_numCameras ; i++)
        cudaFree(p->cuda_buffers[i]);

    PROPAGATE_ERROR(cleanupCUDA(&p->g_cudaContext));

    // BUG FIX: destroy the mutexes and release every array allocated in
    // InitCudaConsumer(); the original leaked all of them.
    for(int i = 0 ; i < p->m_numCameras ; i++)
        pthread_mutex_destroy(&p->lock[i]);

    free(p->lock);
    free(p->cudaConnection);
    free(p->cudaResource);
    free(p->cudaStream);
    free(p->cuResult);
    free(p->cudaEGLFrame);
    free(p->cuda_buffers);
    free(p->cuda_arrays);
    free(p->outputStreams);

    // BUG FIX: the original bool function fell off the end without returning.
    return true;
}

void *CudaConsumerThreadRun(void *index)
{
	int camera_index = *((int *)index);
	CudaConsumerHandle *p = (CudaConsumerHandle *)(&cudaConsumer);
	
	CONSUMER_PRINT("Waiting for Argus producer to connect to output stream.\n");
		printf("Camera Index = %d ", camera_index);
fflush(stdout);
		
	p->outputStreams[camera_index]->waitUntilConnected();

	fflush(stdout);
	
	struct timeval tv1, tv2;
	double time = 0.0;
	 
	gettimeofday(&tv1, NULL);
	for(unsigned int frame = 0; frame < FRAME_COUNT; ++frame)
    {
        /*
         * For simplicity this example submits a capture then waits for an output.
         * This pattern will not provide the best possible performance as the camera
         * stack runs in a pipeline, it is best to keep submitting as many captures as
         * possible prior to waiting for the result.
         */
      
		
     
		//printf("Acquiring an image from the EGLStream\n");
		fflush(stdout);

        
   
		p->cuResult[camera_index] = cuEGLStreamConsumerAcquireFrame(&p->cudaConnection[camera_index], &p->cudaResource[camera_index], &p->cudaStream[camera_index], -1);
		if (p->cuResult[camera_index] != CUDA_SUCCESS)
		{
				ORIGINATE_ERROR("Unable to acquire an image frame from the EGLStream with CUDA as a "
					"consumer (CUresult %s).", getCudaErrorString(p->cuResult[camera_index]));
		}
		
		
        // Get the CUDA EGL frame.
      
        
      
			p->cuResult[camera_index] = cuGraphicsResourceGetMappedEglFrame(&p->cudaEGLFrame[camera_index], p->cudaResource[camera_index], 0, 0);
			if (p->cuResult[camera_index] != CUDA_SUCCESS)
			{
				ORIGINATE_ERROR("Unable to get the CUDA EGL frame (CUresult %s).",
					getCudaErrorString(p->cuResult[camera_index]));
			}

			// Print the information contained in the CUDA EGL frame structure.
			//PROPAGATE_ERROR(printCUDAEGLFrame(p->cudaEGLFrame[i]));

			if ((p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_PLANAR) &&
				(p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR) &&
				(p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_PLANAR) &&
				(p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR))
			{
				ORIGINATE_ERROR("Only YUV color formats are supported");
			}
			if (p->cudaEGLFrame[camera_index].cuFormat != CU_AD_FORMAT_UNSIGNED_INT8)
				ORIGINATE_ERROR("Only 8-bit unsigned int formats are supported");
		
		
		
			//pthread_mutex_lock(&p->lock[camera_index]);
			
			cudaArray *arr_y =  (struct cudaArray *)p->cudaEGLFrame[camera_index].frame.pArray[0];
			cudaArray *arr_u =  (struct cudaArray *)p->cudaEGLFrame[camera_index].frame.pArray[1];
			cudaArray *arr_v =  (struct cudaArray *)p->cudaEGLFrame[camera_index].frame.pArray[2];
			
			//pthread_mutex_unlock(&p->lock[camera_index]);
			
			
			int u_offset = p->m_width * p->m_height;
			int v_offset = p->m_width * p->m_height * 1.25;
			
	
			
			//gpuErrchk(cudaMemcpyFromArray(p->cuda_buffers[camera_index], arr_y, 0, 0, p->m_width * p->m_height , cudaMemcpyDeviceToDevice));
			//gpuErrchk(cudaMemcpyFromArray(p->cuda_buffers[camera_index] + u_offset, arr_u, 0, 0, p->m_width * p->m_height *0.25 , cudaMemcpyDeviceToDevice));
			//gpuErrchk(cudaMemcpyFromArray(p->cuda_buffers[camera_index] + v_offset, arr_v, 0, 0, p->m_width * p->m_height * 0.25 , cudaMemcpyDeviceToDevice));
		
		
			
		
		
		
		
		
		p->cuResult[camera_index] = cuEGLStreamConsumerReleaseFrame(&p->cudaConnection[camera_index], p->cudaResource[camera_index], &p->cudaStream[camera_index]);
		if (p->cuResult[camera_index] != CUDA_SUCCESS)
		{
			ORIGINATE_ERROR("Unable to release the last frame acquired from the EGLStream "
					"(CUresult %s).", getCudaErrorString(p->cuResult[camera_index]));
		}
		
    }
    gettimeofday(&tv2, NULL);
    time = (tv2.tv_sec - tv1.tv_sec)*1000 + (tv2.tv_usec - tv1.tv_usec)/(1000.0);
    time/=1000.0;
    
    printf("FPS = %f Cam Index = %d \n", FRAME_COUNT/time, camera_index);
    fflush(stdout);
	
	

}
   

 bool execute()
{
	
	// Create the CameraProvider object
    UniqueObj<CameraProvider> cameraProvider(CameraProvider::create());
    ICameraProvider *iCameraProvider = interface_cast<ICameraProvider>(cameraProvider);
    if (!iCameraProvider)
        ORIGINATE_ERROR("Failed to create CameraProvider");

	
    // Get the camera devices.
    std::vector<CameraDevice*> cameraDevices;
    iCameraProvider->getCameraDevices(&cameraDevices);
    if (cameraDevices.size() < NUM_CAMERAS)
        ORIGINATE_ERROR("Not enough cameras available");

	UniqueObj<CaptureSession>captureSession[NUM_CAMERAS];
	ICaptureSession *iCaptureSession[NUM_CAMERAS];
	
	for(int i = 0 ; i < NUM_CAMERAS ; i++)
	{
   
       captureSession[i] = UniqueObj<CaptureSession>(iCameraProvider->createCaptureSession(cameraDevices[i]));
       
	   iCaptureSession[i] = interface_cast<ICaptureSession>(captureSession[i]);
		if (!iCaptureSession[i])
        ORIGINATE_ERROR("Failed to create CaptureSession");
	}

	CameraDevice *cameraDevice = cameraDevices[0];
	ICameraProperties *iCameraProperties = interface_cast<ICameraProperties>(cameraDevice);
	std::vector<SensorMode*> sensorModes;
	iCameraProperties->getSensorModes(&sensorModes);
	if (sensorModes.size() == 0)
		ORIGINATE_ERROR("Failed to get sensor modes");
	
	SensorMode *sensorMode = sensorModes[1];
	
	
    printf("Creating output streams for cuda\n");
    
		
    UniqueObj<OutputStreamSettings> streamSettings[NUM_CAMERAS];
    IOutputStreamSettings *iStreamSettings[NUM_CAMERAS];
    
    for(int i = 0 ; i < NUM_CAMERAS ; i++)
    {
		streamSettings[i] = UniqueObj<OutputStreamSettings>(iCaptureSession[i]->createOutputStreamSettings());
		iStreamSettings[i]	= interface_cast<IOutputStreamSettings>(streamSettings[i]);
		if (iStreamSettings[i])
		{
			iStreamSettings[i]->setPixelFormat(PIXEL_FMT_YCbCr_420_888);
			iStreamSettings[i]->setResolution(STREAM_SIZE);
		}
	}
    UniqueObj<OutputStream> outputStream[NUM_CAMERAS];
    IStream *iStream[NUM_CAMERAS];
 
    for(int i = 0 ; i < NUM_CAMERAS ; i++)
    {
		outputStream[i] = UniqueObj<OutputStream>(iCaptureSession[i]->createOutputStream(streamSettings[i].get()));
		iStream[i] = interface_cast<IStream>(outputStream[i]);
        if (!iStream[i])
         ORIGINATE_ERROR("Failed to create OutputStream");
    }
	
	
	  
    pthread_t ConsumerThread[NUM_CAMERAS];
	InitCudaConsumer(&cudaConsumer, iStream, NUM_CAMERAS, PREVIEW_WIDTH, PREVIEW_HEIGHT);
	pthread_attr_t attr_;
    cpu_set_t cpus_;
    pthread_attr_init(&attr_);




 
    
    

 


    // Initialize and connect CUDA as the EGLStream consumer.
   


 
    // Create capture request and enable output stream.
    
    UniqueObj<Request> request[NUM_CAMERAS];
    IRequest *iRequest[NUM_CAMERAS];
    
    for(int i = 0 ; i < NUM_CAMERAS ; i++)
    {
    
		request[i] = UniqueObj<Request>(iCaptureSession[i]->createRequest());
		iRequest[i] = interface_cast<IRequest>(request[i]);
		if (!iRequest[i])
			ORIGINATE_ERROR("Failed to create Request");
		ISourceSettings *sourceSettings  = interface_cast<ISourceSettings>(iRequest[i]->getSourceSettings());

		sourceSettings->setSensorMode(sensorMode);
		
		iRequest[i]->enableOutputStream(outputStream[i].get());
	}
    
  
     // Submit some captures and calculate the histogram with CUDA
     
   
     
     for(int i = 0 ; i < NUM_CAMERAS ; i++)
     {
      if (iCaptureSession[i]->repeat(request[i].get()) != STATUS_OK)
        ORIGINATE_ERROR("Failed to start repeat capture request for preview");
     }
   
	
	
    
		int num[] = {0, 1, 2, 3, 4, 5};
		for(int i = 0 ; i < NUM_CAMERAS ; i++)
		{
			CPU_ZERO(&cpus_);
			CPU_SET( (i%4) + 1, &cpus_);
			pthread_attr_setaffinity_np(&attr_, sizeof(cpu_set_t), &cpus_);
			pthread_create(&ConsumerThread[i], &attr_, CudaConsumerThreadRun, (void *)(&num[i]));
		}
	

 
	
		
		


        usleep(CAPTURE_TIME*1000000);
     
		for(int i = 0 ; i  < NUM_CAMERAS ; i++)
		{
			iCaptureSession[i]->stopRepeat();
			iCaptureSession[i]->waitForIdle();
		}
		
		for(int i = 0 ; i < NUM_CAMERAS ; i++)
		{
			iStream[i]->disconnect();	
			pthread_join(ConsumerThread[i], NULL);
		}
		 
     
	

  
        
		for(int i = 0 ; i < NUM_CAMERAS ; i++)
			outputStream[i].reset();
	
		CudaConsumerDisconnect(&cudaConsumer);
  
       
    


		printf("Done\n");

    return true;
}

}; // namespace ArgusSamples

int main(int argc, const char *argv[])
{
    if (!ArgusSamples::execute())
        return EXIT_FAILURE;

    return EXIT_SUCCESS;
}

Sorry, a small correction to the code: the `i%4` should be `i%3`, and the corresponding output is

FPS = 33.688619 Cam Index = 3
FPS = 33.763799 Cam Index = 5
FPS = 33.645542 Cam Index = 4
FPS = 27.048300 Cam Index = 2
FPS = 26.792976 Cam Index = 1
FPS = 26.291328 Cam Index = 0

@tejas95
Could you verify 6-camera capture with the argus_camera app? We have verified that it can reach 30 fps with 6 cameras.
You can launch multiple sensors from the argus_camera UI on the Ubuntu desktop.

Yes, I am able to get 30 fps with a multi-session setup of 6 cameras from the UI. However, I am not sure how my implementation differs from the multi-session source code behind the UI.

@tejas95
I think you need to narrow down your pipeline — for example, skip some of the CUDA processing steps to see which stage causes the frame drops.

Hello tejas95,

Can you explain how to use indices 1 and 2 to access the U and V data?
I tried the cudaHistogram sample but cannot access pArray[1]. There are only 2 planes in my cudaEGLFrame, so I believe it is a YUV420 semi-planar case.

cudaArray *arr_u =  (struct cudaArray *)p->cudaEGLFrame[camera_index].frame.pArray[1];
		cudaArray *arr_v =  (struct cudaArray *)p->cudaEGLFrame[camera_index].frame.pArray[2];