Parallel Image Capture with Processing

Hello,

I would like to implement a parallel processing aspect to my program that captures images and performs a series of FFT’s and miscellaneous calculations on the GPU.

I would like to begin processing as soon as the image is copied over to the GPU, while the next image is being captured in parallel with the processing. The structure I want to implement looks something like (I apologize if this is confusing):

capture image1 --> copy image1 to GPU --> capture image2 -->_______copy to image2 to GPU -->
_____________________________________Process image1 --------->

I have tested this version, but it is still slower than anticipated. Is this being performed correctly?

I am posting in this cuda forum because there may be a more effective way of accomplishing this using streams rather than creating a thread. I am new with all types of parallel programming, so any type of advice, resources,etc is welcome.

I have removed a large portion of code dealing with cuda error checking, and the details of each cuda kernel. This program is functional, the main area of concern is the capture ‘while’ loop.

#include <cufft.h>
#include <FlyCapture2.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_device_runtime_api.h>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <chrono>
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <thread>
#include "rt_nonfinite.h"
#include "get_peaks.h"
#include "main.h"
#include "get_peaks_terminate.h"
#include "get_peaks_initialize.h"

#define height 2048
#define width 2448
#define size 5013504

dim3 threadsPerBlock(32,32);
dim3 numBlocks(77,64);

using namespace FlyCapture2;


__global__ void datatransfer(cufftComplex *f2, float *f)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;

if (x>=0 && x<width && y>=0 && y< height) {
        //Do something
        ...
}
}

__global__ void magnitude_kernel(cufftComplex *out, cufftComplex *in2)
{

	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
	int y = (blockIdx.y * blockDim.y) + threadIdx.y;

	if (x>=0 && x<width && y>=0 && y< height) {

	//Do something
        ...
	}
}

__global__ void swap_quadrants(cufftComplex *old_img,cufftComplex *new_img)
{

	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
	int y = (blockIdx.y * blockDim.y) + threadIdx.y;

if (x>=0 && x<width/2 && y>=0 && y< height/2)   {

	//Do something
        ...
	}
}

__global__ void transferpeakdata(float *row, cufftComplex *image)
{
	int x = (blockIdx.x * blockDim.x) + threadIdx.x;

        //Do something
        ...
	
}

void algorithm(float *d_a,
		cufftComplex *a2,
		cufftComplex *a3,
		float *h_1d,
		float *d_1d,
		double *doub_1d,
		double *peaks,
		double *loc,
		int *loc_sz,
		int *peaks_sz,
		cufftHandle plan)
{

	//Conversion from float to float2
	datatransfer<<<numBlocks,threadsPerBlock>>>(a2,d_a);


	//First FFT
	cufftExecC2C(plan,(cufftComplex *)a2,(cufftComplex *)a2, CUFFT_FORWARD)

	//Perform absolute value
	magnitude_kernel<<<numBlocks,threadsPerBlock>>>(a2,a3);

	//Swap quadrants 
	swap_quadrants<<<numBlocks,threadsPerBlock>>>(a3,a2);

	//Perform 2nd FFT
	cufftExecC2C(plan,(cufftComplex *)a2,(cufftComplex *)a2, CUFFT_FORWARD)
	
	//Perform 2nd absolute value
	magnitude_kernel<<<numBlocks,threadsPerBlock>>>(a2,a3);

	//Swap quadrants 
	swap_quadrants<<<numBlocks,threadsPerBlock>>>(a3,a2);

	//Transfer Middle Two rows to a 1D array
	transferpeakdata<<<3,1024>>>(d_1d,a2);

	//Copy 1D array to host
	cudaMemcpy(h_1d,d_1d,sizeof(float)*width,cudaMemcpyDeviceToHost);
	

	//Better Peak method
	int glob_max_ind;
	int max_ind[2];
	double max_num;
	int distance;
	
        get_peaks(doub_1d,peaks,peaks_sz,loc,loc_sz);

	
	std::cout<<"Global max index: "<<max_ind[0]<<" Second max index: "<<max_ind[1]<<std::endl;
	std::cout<<"Number of peaks: "<<peaks_sz[1]<<std::endl;
	distance = abs(max_ind[0] - max_ind[1]);
	std::cout<<"Distance: "<<distance<<std::endl;

}


int main() 
{
	using std::chrono::duration_cast;
	using std::chrono::nanoseconds;
	typedef std::chrono::high_resolution_clock clock;

	//Cuda variables
	cufftHandle plan;
	cufftComplex *h_array2,*array2,*h_array3,*array3;
	float *h_1darray,*dev_1darray,*dev_array;
	double double_array[2448],peaks_array[2448],loc_data[2448];
	int peaks_size[2],loc_size[2];
	float *array = new float;

	//Allocate memory
	h_array2 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
	h_array3 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
	h_1darray = (float*)malloc(width*sizeof(float));

	cudaMalloc((void **) &dev_array,sizeof(float)*size);
	cudaMalloc((void **) &array2, sizeof(cufftComplex)*size);
	cudaMalloc((void **) &array3, sizeof(cufftComplex)*size);
	cudaMalloc((void **) &dev_1darray, sizeof(float)*width);

	cudaError_t err = cudaGetLastError();
		if (err != cudaSuccess) 
			{
			fprintf(stderr, "Cuda Error: Failed to allocate\n");
			fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
			}

	//Copy Variable to GPU
	if (cudaMemcpy(array2,h_array2,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
		{
		std::cout<<"Failed to copy to array2"<<std::endl;
		}

	if (cudaMemcpy(array3,h_array3,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
		{
		std::cout<<"Failed to copy to array3"<<std::endl;
		}
	if (cudaMemcpy(dev_1darray,h_1darray,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
		{
		std::cout<<"Failed to copy to dev_1darray"<<std::endl;
		}


	//Plan 2D FFT
	if (cufftPlan2d(&plan,height,width,CUFFT_C2C) != CUFFT_SUCCESS) 
		{
		std::cout<<"Failed to make 1st FFT plan"<<std::endl;
		}

	
        Camera camera;
        // Connect the camera
    	camera.Connect( 0 );
    	
	// Start capture
	camera.StartCapture();
	
	//Initialize loop 
	cv::Mat image = cv::Mat::ones(height,width,CV_32FC1);
	cudaMemcpy(dev_array,array,sizeof(float)*size,cudaMemcpyHostToDevice);

	//Capture Loop
	int ii =0;
	auto start = clock::now();
	char key = 0;
	while (key != 'q') {
		

		//autofocus algorithm
		std::thread t1(algorithm,dev_array,
				array2,
				array3,
				h_1darray,
				dev_1darray,
				double_array,
				peaks_array,
				loc_data,
				loc_size,
				peaks_size,
				plan);


		// Get the image
                Image rawImage;
                error = camera.RetrieveBuffer( &rawImage );
                if ( error != PGRERROR_OK )
                {
                        std::cout << "capture error" << std::endl;
                        continue;
                }

                // convert to rgb
            	Image rgbImage;
        	rawImage.Convert( FlyCapture2::PIXEL_FORMAT_MONO8, &rgbImage );

                // convert to OpenCV Mat
                unsigned int rowBytes = (double)rgbImage.GetReceivedDataSize()/(double)rgbImage.GetRows();
                image = cv::Mat(rgbImage.GetRows(), rgbImage.GetCols(), CV_8UC1, rgbImage.GetData(),rowBytes);
		imshow("image",image);

		//Convert
		image.convertTo(image,CV_32FC1);
		
		array = (float*)image.data;
		
		//synchronize
		t1.join();		
		
		//Copy Image Array to GPU
		cudaMemcpy(dev_array,array,sizeof(float)*size,cudaMemcpyHostToDevice);
		err = cudaGetLastError();
			if (err != cudaSuccess) 
				{
				fprintf(stderr, "Cuda Error: Failed to copy to dev_array\n");
				fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
				}
 
		

		key = cv::waitKey(3);
		ii = ii+1;
		
	}

	//Finish timing the loop
	cudaDeviceSynchronize();
	auto end = clock::now();
	    std::cout <<"Loop Time: "<< (duration_cast<nanoseconds>(end-start).count())/(ii*1000000)<<std::endl;

	cudaFree(array2);
	cudaFree(array3);
	cudaFree(dev_1darray);
	cudaFree(dev_array);

return 0;
}

You seem to be striving for concurrency - which makes sense.

Have you tried using the profiler to see whether your intended diagram is realized in actual execution? This is a very useful and typical use for the visual profiler. It should be possible to run it on Jetson, but I don’t have a lot of experience with running it on Jetson.

Thanks for the tip Bob this is a good idea. I just posted a question in the TX2 forum about remote access if you’d like to take a look.